def getYoutubeDurations(dataset, bq_table_input, api_key, outputfilename, schema, force_recompute):
    '''
    Add youtube durations to Video Axis file using youtube id's and then write out
    to specified local path to prep for google storage / bigquery upload
    '''
    fp = openfile(outputfilename, 'w')
    linecnt = 0
    for row_dict in bq_table_input:
        linecnt += 1
        verified_row = OrderedDict()

        # Initial pass-through of keys in current row
        for key in row_dict:
            # Only include keys defined in schema
            if key in schema.keys():
                verified_row[key] = row_dict[key]

        # Recompute Video Length durations
        if force_recompute:
            verified_row[VIDEO_LENGTH] = findVideoLength(dataset=dataset,
                                                         youtube_id=verified_row[VIDEO_ID],
                                                         api_key=api_key)

        # Ensure schema type
        check_schema(linecnt, verified_row, the_ds=schema, coerce=True)

        try:
            fp.write(json.dumps(verified_row) + '\n')
        except Exception as err:
            print "Failed to write line %s!  Error=%s, data=%s" % (linecnt, str(err), dataset)
    fp.close()
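#----------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original source).
# A minimal, hypothetical driver for getYoutubeDurations: the toy schema and
# row below are stand-ins, the API key is a placeholder, and the call still
# needs its module context (openfile, findVideoLength, check_schema, and the
# VIDEO_ID / VIDEO_LENGTH constants) to actually run.

toy_schema = {'video_id': {'type': 'STRING'},       # assumed field names
              'name': {'type': 'STRING'},
              'video_length': {'type': 'INTEGER'}}
toy_rows = [{'video_id': 'abc123XYZ_w', 'name': 'Lecture 1 intro'}]

getYoutubeDurations(dataset='MITx__6_00x__2013_Spring',
                    bq_table_input=toy_rows,
                    api_key='YOUR_YOUTUBE_API_KEY',    # hypothetical placeholder
                    outputfilename='video_axis_with_durations.json',
                    schema=toy_schema,
                    force_recompute=True)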
def do_rephrase(data, do_schema_check=True, linecnt=0):
    if '_id' in data:
        data['mongoid'] = data['_id']['$oid']
        data.pop('_id')

    if 'parent_id' in data:
        data['parent_id'] = data['parent_id']['$oid']

    def fix_date(dstr):
        if dstr:
            try:
                dtime = int(dstr)
                if dtime:
                    try:
                        dt = datetime.datetime.utcfromtimestamp(dtime / 1000.0)
                    except Exception as err:
                        print "oops, failed to convert in utcfromtimestamp dtime=%s, dstr=%s" % (dtime, dstr)
                        raise
                    return str(dt)
            except Exception as err:
                try:
                    dt = date_parse(dstr[:16])
                    return str(dt)
                except Exception as err:
                    return dstr
        return None

    def do_fix_date(field, rec):
        if field in rec:
            rec[field] = fix_date(rec[field]['$date'])

    do_fix_date('time', data.get('endorsement', {}) or {})
    if 'updated_at' in data:
        data['updated_at'] = fix_date(data['updated_at']['$date'])
    if 'created_at' in data:
        data['created_at'] = fix_date(data['created_at']['$date'])
    if 'last_activity_at' in data:
        data['last_activity_at'] = fix_date(data['last_activity_at']['$date'])

    if 'comment_thread_id' in data:
        data['comment_thread_id'] = data['comment_thread_id']['$oid']

    # drop empty or "null" endorsements
    if ('endorsement' in data) and ((data['endorsement'] == 'null') or (not data['endorsement'])):
        data.pop('endorsement')

    if 'parent_ids' in data:
        data['parent_ids'] = ' '.join([x['$oid'] for x in data['parent_ids']])

    def mkstring(key, rec):
        if key in rec:
            rec[key] = str(rec[key])

    mkstring('historical_abuse_flaggers', data)
    mkstring('abuse_flaggers', data)
    mkstring('at_position_list', data)
    mkstring('tags_array', data)
    mkstring('up', data.get('votes', {}))
    mkstring('down', data.get('votes', {}))

    # check for any funny keys, recursively
    funny_key_sections = []

    def check_for_funny_keys(entry, name='toplevel'):
        # use items() (not iteritems) so keys can be safely renamed while iterating
        for key, val in entry.items():
            if key.startswith('i4x-') or key.startswith('xblock.'):
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s\n" % (name, entry))
                funny_key_sections.append(name)
                return True
            if len(key) > 25:
                sys.stderr.write("[rephrase] suspicious key at %s in entry: %s\n" % (name, entry))
            if key[0] in '0123456789':
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s\n" % (name, entry))
                funny_key_sections.append(name)
                return True
            if '-' in key or '.' in key:
                # bad key name!  rename it, changing "-" to "_" and "." to "__"
                newkey = key.replace('-', '_').replace('.', '__')
                sys.stderr.write("[rephrase] oops, bad keyname at %s in entry: %s newkey=%s\n" % (name, entry, newkey))
                entry[newkey] = val
                entry.pop(key)
                key = newkey
            if type(val) == dict:
                ret = check_for_funny_keys(val, name + '/' + key)
                if ret is True:
                    sys.stderr.write("    coercing section %s to become a string\n" % (name + "/" + key))
                    entry[key] = json.dumps(val)
        return False

    check_for_funny_keys(data)

    if 'context' in data:
        data.pop('context')    # 25aug15: remove key

    try:
        check_schema(linecnt, data, the_ds=SCHEMA_DICT, coerce=True)
    except Exception as err:
        sys.stderr.write('[%d] oops, err=%s, failed in check_schema %s\n'
                         % (linecnt, str(err), json.dumps(data, indent=4)))
        sys.stderr.write(traceback.format_exc())
        return
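#----------------------------------------------------------------------------
# Illustrative example (added; not part of the original source).  A sketch of
# what this forum-post rephraser does to one mongo-export record; running it
# for real also requires the module's SCHEMA_DICT and check_schema.

sample = {
    '_id': {'$oid': '50f5dd48d5ec3a1300000026'},
    'comment_thread_id': {'$oid': '50f5dd48d5ec3a1300000001'},
    'created_at': {'$date': 1358290248000},
    'abuse_flaggers': [],
    'votes': {'up': [u'1234'], 'down': []},
}
do_rephrase(sample)
# afterwards (roughly):
#   sample['mongoid']           == '50f5dd48d5ec3a1300000026'   ($oid unwrapped)
#   sample['comment_thread_id'] == '50f5dd48d5ec3a1300000001'
#   sample['created_at']        == '2013-01-15 22:50:48'        (UTC string)
#   sample['abuse_flaggers']    == '[]'                         (stringified)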
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)
    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath
    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['user_info_combo'])

    uic = defaultdict(dict)    # dict with key = user_id, and val = dict to be written out as JSON line

    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (key not in src):
                src[key] = None
            if src[key] == 'NULL':
                continue
            if key == 'course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id,
                # e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':', 1)[1].replace('+', '/')
            dest[prefix + key] = src[key]

    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode == 'r' and not os.path.exists(fn):
            newfn = convert_sql(fn)    # try converting from *.sql file, if that exists
            if not newfn:
                return None            # failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)

    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()

    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion
        of tab separated values to comma separated values.  Returns False otherwise.
        '''
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
        uic[uid]['edxinstructordash_Grade'] = None
        uic[uid]['edxinstructordash_Grade_timestamp'] = None
    print "  %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = ['name', 'language', 'location', 'meta', 'courseware', 'gender',
                  'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
                  'allow_certificate', 'country', 'city']
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print "  %d profiles loaded from profiles.csv" % nprofiles

    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = ['course_id', 'created', 'is_active', 'mode']
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print "  %d enrollments loaded from enrollment.csv" % nenrollments

    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()

        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if uid not in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print "  %d additional users loaded from %s/users.json.gz" % (nadded, mongodir)

        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = ['name', 'language', 'location', 'meta', 'courseware', 'gender',
                  'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
                  'allow_certificate', 'country', 'city']
        nadd_profiles = 0

        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata, uic[uid], fields, prefix="profile_", skip_empty=True)
                fix_unicode(uic[uid], ['profile_name', 'profile_mailing_address', 'profile_goals',
                                       'profile_location', 'profile_language'])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print "  %d additional profiles loaded from %s/profiles.json.gz" % (nadd_profiles, mongodir)

        # if datedir is specified, then do not add entries from mongodb where the
        # enrollment happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = ['course_id', 'created', 'is_active', 'mode']
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (uic[uid].get('y1_anomalous') == 1):
                    # remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata, uic[uid], fields, prefix="enrollment_", skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print "  %d additional enrollments loaded from %s/enrollment.json.gz" % (nadd_enrollment, mongodir)

        print "  from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (
            nadded - n_removed_after_cutoff, nadded, nadd_profiles, nadd_enrollment,
            n_removed_after_cutoff, cutoff)
        sys.stdout.flush()

    # see if instructor grade reports are present for this course; if so, merge in that data
    edxinstructordash = cdir.dirname() / 'from_edxinstructordash'
    if edxinstructordash.exists():
        print "--> %s exists, merging in grade report data from_edxinstructordash" % edxinstructordash
        sys.stdout.flush()
        grade_report_fn = (edxinstructordash / 'grade_report.csv')
        fp = openfile(grade_report_fn, add_dir=False)
        if fp is None:
            print "--> Skipping grade_report.csv, file does not exist in dir from_edxinstructordash"
        else:
            nadded = 0
            for line in csv.DictReader(fp):
                uid = int(line['Student ID'])
                fields = ['Grade', 'Grade_timestamp']
                # other columns available: 'course_id', 'Student ID', 'Email', 'Username',
                # 'Enrollment Track', 'Verification Status', 'Certificate Eligible',
                # 'Certificate Delivered', 'Certificate Type'
                copy_elements(line, uic[uid], fields, prefix="edxinstructordash_")
                nadded += 1
            fp.close()
            print "  %d grades loaded from %s/grade_report.csv" % (nadded, edxinstructordash)
            sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = ['download_url', 'grade', 'course_id', 'key', 'distinction', 'status',
                      'verify_uuid', 'download_uuid', 'name', 'created_date', 'modified_date',
                      'error_reason', 'mode']
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid

    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if ('username' not in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()

    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if ('enrollment_course_id' not in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")

    # sort by userid
    uidset = uic.keys()
    uidset.sort()

    # write out result, checking schema along the way
    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'), fieldnames=fieldnames)
    ocsv.writeheader()

    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id' not in data):
            print "Oops!  missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id', data.get('certificate_course_id', ''))
        if not row_course_id == course_id:
            print "Oops!  course_id=%s in user_info_combo line: inconsistent with expected=%s" % (row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise

    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
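#----------------------------------------------------------------------------
# Illustrative example (added; not part of the original source).  The
# copy_elements helper above normalizes "opaque key" course ids; this
# standalone snippet shows the same transform on a hypothetical value.

cid = 'course-v1:MITx+6.00.2x_3+1T2015'
if cid.startswith('course-v1:'):
    cid = cid.split(':', 1)[1].replace('+', '/')
assert cid == 'MITx/6.00.2x_3/1T2015'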
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to bigquery

    cid = course_id
    caset_in = list of course axis data in dict format (deep-copied before modification)
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    '''
    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json'
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/', '__')))
    fp = open(cafn, 'w')

    linecnt = 0
    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not type(data) == dict:
            try:
                ca['data'] = json.loads(data)    # make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])    # datetime to string
        if ca['due'] is not None:
            ca['due'] = str(ca['due'])        # datetime to string
        if (ca['data'] is None) or (ca['data'] == ''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca) + '\n')
        except Exception as err:
            print "Failed to save!  Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
    gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)    # create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "=" * 100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "=" * 100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]    # truncate: stay under the 16384 maximum description length
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "    Done - inserted %s records into course_axis" % len(caset)
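#----------------------------------------------------------------------------
# Illustrative example (added; not part of the original source).  A sketch of
# one course-axis record as do_save expects it: course_id/start/due/data are
# the fields the code above touches directly, while 'module_id' and
# 'category' are assumed names here; the authoritative field list lives in
# schemas/schema_course_axis.json.

sample_axis_row = {
    'course_id': 'MITx/6.00.2x_3/1T2015',
    'module_id': 'MITx/6.00.2x_3/problem/ps03',     # assumed field / format
    'category': 'problem',                          # assumed field
    'start': datetime.datetime(2015, 4, 7, 14, 0),  # coerced to str by do_save
    'due': datetime.datetime(2015, 4, 21, 23, 30),  # coerced to str by do_save
    'data': '{"weight": 1.0}',                      # re-parsed to a dict by do_save
}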
def do_rephrase(data, do_schema_check=True, linecnt=0):
    """
    Modify the provided data dictionary in place to rephrase certain
    pieces of data for easy loading to BigQuery

    :TODO: Move the inner functions outside this function.

    :type: dict
    :param data: A tracking log record from the edX nightly data files
    :type: bool
    :param do_schema_check: Whether or not the provided record should be
        checked against the target schema
    :type: int
    :param linecnt: Some line count value

    :rtype: None
    :return: Nothing is returned since the data parameter is modified in place
    """
    # add course_id?
    if 'course_id' not in data:
        cid = data.get('context', {}).get('course_id', '')
        if cid:
            data['course_id'] = cid

    # add module_id?
    if 'module_id' not in data:
        add_module_id(data)

    if 'event' not in data:
        data['event'] = ""

    # ensure event is dict when possible
    if 'event_js' not in data:
        event = data.get('event')
        try:
            if not isinstance(event, dict):
                event = json.loads(event)
            event_js = True
        except Exception as err:
            # note - do not erase event even if it can't be loaded as JSON:
            # see how it becomes JSONified below
            event_js = False
        data['event'] = event
        data['event_js'] = event_js

    #----------------------------------------
    # "event" is used for too many kinds of data, with colliding types for fields.
    # thus, in general, we can't store it as a fixed schema field.
    #
    # so we turn "event" into a JSON string.
    #
    # for specific kinds of events ("event_type"), we keep a parsed version of
    # "event", e.g. for problem_* event types.
    #
    # store that parsed version as "event_struct"

    event = None
    if 'event' in data:
        event = data['event']
        data['event'] = json.dumps(data['event'])

    # now the real rephrasing
    event_type = data.get('event_type', '')

    #----------------------------------------
    # types to keep

    KNOWN_TYPES = ['play_video', 'seq_goto', 'seq_next', 'seq_prev',
                   'seek_video', 'load_video',
                   'save_problem_success', 'save_problem_fail',
                   'reset_problem_success', 'reset_problem_fail',
                   'show_answer',
                   'edx.course.enrollment.activated',
                   'edx.course.enrollment.deactivated',
                   'edx.course.enrollment.mode_changed',
                   'edx.course.enrollment.upgrade.succeeded',
                   'speed_change_video',
                   'problem_check', 'problem_save', 'problem_reset']

    if isinstance(event, dict):
        outs = ('video_embedded', 'harvardx.button', 'harvardx.')
        out_conds = not any(k in event_type for k in outs)
        in_conds = 'problem_' in event_type or event_type in KNOWN_TYPES
        if in_conds and out_conds:
            data['event_struct'] = event
        else:
            # default to always including GET and POST when available
            data['event_struct'] = {
                'GET': json.dumps(event.get('GET')),
                'POST': json.dumps(event.get('POST')),
                'query': event.get('query'),
            }
    else:
        if 'event_struct' in data:
            data.pop('event_struct')

    #----------------------------------------
    # special cases

    if '_id' in data:
        data['mongoid'] = data['_id']['$oid']
        data.pop('_id')

    if isinstance(event, dict) and 'POST' in event:
        event['POST'] = json.dumps(event['POST'])

    if isinstance(event, dict) and 'GET' in event:
        event['GET'] = json.dumps(event['GET'])

    if event_type in ['problem_check', 'problem_save', 'problem_reset'] and data['event_source'] == 'browser':
        if isinstance(event, (str, unicode)):
            event = {'data': json.dumps(event)}

    if isinstance(event, (str, unicode)):
        # if event and data['event_js']:
        #     sys.stderr.write('unexpected STRING event: ' + json.dumps(data, indent=4) + '\n')
        event = {'data': json.dumps(event)}

    if isinstance(event, list):
        event = {'data': json.dumps(event)}

    def make_str(key):
        if event is not None and 'state' in event:
            state = event['state']
            if key in state:
                state[key] = json.dumps(state[key])

    make_str('input_state')
    make_str('correct_map')
    make_str('student_answers')

    def make_str0(key):
        ev = event or {}
        if key in ev:
            ev[key] = json.dumps(ev[key])

    make_str0('correct_map')
    make_str0('answers')
    make_str0('submission')
    make_str0('old_state')
    make_str0('new_state')
    make_str0('permutation')
    make_str0('options_selected')
    make_str0('corrections')

    def make_str2(key):
        context = data.get('context', {})
        if key in context:
            context[key] = json.dumps(context[key])

    make_str2('course_user_tags')

    def move_unknown_fields_from_context_to_context_agent(keys):
        # needed to handle new fields from mobile client
        context = data.get('context', {})
        agent = {'oldagent': context.get('agent', "")}
        for key in keys:
            if '.' in key:
                (prefix, subkey) = key.split('.', 1)
                if prefix in context:
                    subcontext = context[prefix]
                    if subkey in subcontext:
                        agent[key] = subcontext[subkey]
                        subcontext.pop(subkey)
            else:
                if key in context:
                    agent[key] = context[key]
                    context.pop(key)
        context['agent'] = json.dumps(agent)

    # 31-Jan-15: handle new "module.usage_key" field in context, e.g.:
    #
    #    "module": {
    #        "display_name": "Radiation Exposure",
    #        "usage_key": "i4x://MITx/6.00.1x_5/problem/ps03:ps03-Radiation-Exposure"
    #    },
    # 28-May-16: context.asides
    mobile_api_context_fields = ['application', 'client', 'received_at', 'component',
                                 "open_in_browser_url",
                                 "module.usage_key",
                                 "module.original_usage_version",
                                 "module.original_usage_key",
                                 "asides"]
    move_unknown_fields_from_context_to_context_agent(mobile_api_context_fields)

    #----------------------------------------
    # new fields which are not in schema get moved as JSON strings to the
    # pre-existing "mongoid" field, which is unused except in very old records.
    # do this, for example, for the "referer" and "accept_language" fields

    def move_fields_to_mongoid(field_paths):
        '''
        field_paths is a list of lists which gives the recursive dict keys
        to traverse to get to the field to move.  Move that field, with the
        path intact, into the mongoid field.
        '''
        mongoid = data.get('mongoid')
        if not isinstance(mongoid, dict):
            mongoid = {'old_mongoid': mongoid}

        def move_field_value(ddict, vdict, fp):
            '''recursively traverse dict to get and move value for specified field path'''
            key = fp[0]
            if len(fp) == 1:    # base case
                if key in ddict:
                    fval = ddict.get(key)
                    vdict[key] = fval    # copy to new values dict
                    ddict.pop(key)       # remove key from current path within data dict
                    return fval
                return None
            if key not in vdict:
                vdict[key] = {}
            return move_field_value(ddict.get(key, {}), vdict[key], fp[1:])

        vdict = mongoid
        for field_path in field_paths:
            move_field_value(data, vdict, field_path)
        data['mongoid'] = json.dumps(vdict)

    # 16-Mar-15: remove event_struct.requested_skip_interval
    move_fields_to_mongoid([['referer'],
                            ['accept_language'],
                            ['event_struct', 'requested_skip_interval'],
                            ['event_struct', 'submitted_answer'],
                            ['event_struct', 'num_attempts'],
                            ['event_struct', 'task_id'],                      # 05oct15
                            ['event_struct', 'content'],                      # 11jan16
                            ['nonInteraction'],                               # 24aug15
                            ['label'],                                        # 24aug15
                            ['event_struct', 'widget_placement'],             # 08may16
                            ['event_struct', 'tab_count'],                    # 08may16
                            ['event_struct', 'current_tab'],                  # 08may16
                            ['event_struct', 'target_tab'],                   # 08may16
                            ['event_struct', 'state', 'has_saved_answers'],   # 06dec2016
                            ['context', 'label'],                             # 24aug15
                            ['roles'],                                        # 06sep2017 rp
                            ['environment'],                                  # 06sep2017 rp
                            ['minion_id'],                                    # 06sep2017 rp
                            ['event_struct', 'duration'],                     # 22nov2017 ic
                            ['event_struct', 'play_medium']])

    #----------------------------------------
    # general checks

    def fix_dash(key):
        ev = event or {}
        if key in ev:
            newkey = key.replace('-', '_')
            ev[newkey] = ev[key]
            ev.pop(key)

    fix_dash('child-id')

    def check_empty(data, *keys):
        # print "--> keys=%s, data=%s" % (str(keys), data)
        key = keys[0]
        if isinstance(data, dict) and key in data:
            if len(keys) == 1:
                if data[key] in ["", u'']:
                    # print "---> popped %s" % key
                    data.pop(key)
            else:
                check_empty(data[key], *keys[1:])

    check_empty(data, 'context', "user_id")

    data.pop('event_js')    # leftover from mongo import script

    #-----------------------------------------
    # check for null values in speed_change_video
    # Error encountered parsing LAC data from Oct. 2013
    # Requires that we also be able to convert the value to a float

    def string_is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    if data.get('event_type') == 'speed_change_video':
        if 'event_struct' in data and 'new_speed' in data['event_struct']:
            # First check if string is float
            if string_is_float(data['event_struct']['new_speed']):
                # Second check if value is null
                if isnan(float(data['event_struct']['new_speed'])):
                    data['event_struct'].pop('new_speed')

    # check for any funny keys, recursively
    funny_key_sections = []

    def check_for_funny_keys(entry, name='toplevel'):
        # use items() (not iteritems) so keys can be safely renamed while iterating
        for key, val in entry.items():
            if key.startswith('i4x-') or key.startswith('xblock.'):
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s\n" % (name, entry))
                funny_key_sections.append(name)
                return True
            if len(key) > 25:
                sys.stderr.write("[rephrase] suspicious key at %s in entry: %s\n" % (name, entry))
            if key[0].isdigit():
                sys.stderr.write("[rephrase] oops, funny key at %s in entry: %s\n" % (name, entry))
                funny_key_sections.append(name)
                return True
            if '-' in key or '.' in key:
                # bad key name!  rename it, changing "-" to "_" and "." to "__"
                newkey = key.replace('-', '_').replace('.', '__')
                sys.stderr.write("[rephrase] oops, bad keyname at %s in entry: %s newkey=%s\n" % (name, entry, newkey))
                entry[newkey] = val
                entry.pop(key)
                key = newkey
            if isinstance(val, dict):
                ret = check_for_funny_keys(val, name + '/' + key)
                if ret is True:
                    sys.stderr.write("    coercing section %s to become a string\n" % (name + "/" + key))
                    entry[key] = json.dumps(val)
        return False

    check_for_funny_keys(data)

    try:
        check_schema(linecnt, data, coerce=True)
    except Exception as err:
        sys.stderr.write('[%d] oops, err=%s, failed in check_schema %s\n'
                         % (linecnt, str(err), json.dumps(data, indent=4)))
        sys.stderr.write(traceback.format_exc())
        return
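#----------------------------------------------------------------------------
# Illustrative example (added; not part of the original source).  A sketch of
# this rephraser on one hypothetical tracking-log event; running it for real
# also requires the module's add_module_id and check_schema.

sample = {
    'event_type': 'play_video',
    'event_source': 'browser',
    'event': '{"code": "abc123XYZ_w", "currentTime": 0}',
    'context': {'course_id': 'MITx/6.00x/2013_Spring', 'user_id': ''},
}
do_rephrase(sample)
# afterwards (roughly):
#   sample['course_id']     == 'MITx/6.00x/2013_Spring'  (lifted out of context)
#   sample['event']         is the event re-serialized as a JSON string
#   sample['event_struct']  is the parsed dict (play_video is a KNOWN_TYPE)
#   sample['context']       no longer has 'user_id' (the empty string is
#                           dropped so it cannot break schema coercion)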