def process_course(course_id, basedir, datedir, use_dataset_latest, verbose=False):
    sdir = load_course_sql.find_course_sql_dir(course_id,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=use_dataset_latest,
                                               )
    edx2course_axis.DATADIR = sdir
    edx2course_axis.VERBOSE_WARNINGS = verbose

    fn = sdir / 'course.xml.tar.gz'
    if not os.path.exists(fn):
        fn = sdir / 'course-prod-analytics.xml.tar.gz'
    if not os.path.exists(fn):
        fn = sdir / 'course-prod-edge-analytics.xml.tar.gz'
    if not os.path.exists(fn):
        print "---> oops, cannot generate course axis for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        sys.stdout.flush()
        return

    # TODO: only create new axis if the table is missing, or the course axis is not already created
    try:
        edx2course_axis.process_xml_tar_gz_file(fn,
                                                use_dataset_latest=use_dataset_latest,
                                                force_course_id=course_id)
    except Exception as err:
        print err
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage, create the BigQuery table,
    then insert the data into the table.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(cid,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest),
                                               )

    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table]

    if not os.path.exists(csvfn):
        print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn
        return

    if not subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
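# For reference, a minimal sketch of what cleanup_rows_from_grade_persistent (called above) is
# assumed to do: rewrite the csv.gz so that literal "NULL" strings in a timestamp-like column
# become empty, since BigQuery's CSV loader rejects "NULL" in TIMESTAMP fields.  The default
# field name here is an assumption; the move-back-over-the-original step is inferred from the
# fact that the upload above reads csvfn after cleanup.
def cleanup_rows_from_grade_persistent_sketch(csvfn, tempfn, field_to_fix="passed_timestamp"):
    import csv
    import gzip
    import shutil
    with gzip.open(csvfn, 'r') as infp, gzip.open(tempfn, 'w') as outfp:
        cdr = csv.DictReader(infp)
        cdw = csv.DictWriter(outfp, fieldnames=cdr.fieldnames)
        cdw.writeheader()
        for row in cdr:
            if row.get(field_to_fix) == "NULL":
                row[field_to_fix] = ""      # blank out unparseable NULL timestamps
            cdw.writerow(row)
    shutil.move(tempfn, csvfn)              # replace the original with the cleaned file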
def get_stats_module_usage(course_id,
                           basedir="X-Year-2-data-sql",
                           datedir="2013-09-21",
                           use_dataset_latest=False,
                           ):
    '''
    Get data from the stats_module_usage table, if it doesn't already exist as a local file.
    Compute it if necessary.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    sql = """   SELECT
                    module_type, module_id, count(*) as ncount
                FROM [{dataset}.studentmodule]
                group by module_id, module_type
                order by module_id
          """.format(dataset=dataset)

    table = 'stats_module_usage'
    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    csvfn = course_dir / (table + ".csv")

    data = {}
    if csvfn.exists():
        # read file into data structure
        for k in list(csv.DictReader(open(csvfn))):
            midfrag = tuple(k['module_id'].split('/')[-2:])
            data[midfrag] = k
    else:
        # download if it is already computed, or recompute if needed
        bqdat = bqutil.get_bq_table(dataset, table, sql=sql)
        if bqdat is None:
            bqdat = {'data': []}

        fields = ["module_type", "module_id", "ncount"]
        fp = open(csvfn, 'w')
        cdw = csv.DictWriter(fp, fieldnames=fields)
        cdw.writeheader()
        for k in bqdat['data']:
            midfrag = tuple(k['module_id'].split('/')[-2:])
            data[midfrag] = k
            try:
                k['module_id'] = k['module_id'].encode('utf8')
                cdw.writerow(k)
            except Exception as err:
                print "Error writing row %s, err=%s" % (k, str(err))
        fp.close()

    print "[analyze_content] got %d lines of studentmodule usage data" % len(data)
    return data
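# Usage sketch (hypothetical course id): the returned dict is keyed by the last two components
# of each module_id, e.g. ('problem', 'ps1_q1'), and each value carries the module_type,
# module_id, and ncount columns of stats_module_usage.
def _example_get_stats_module_usage():
    usage = get_stats_module_usage("MITx/6.002x/2013_Spring")
    for (mtype, url_name), row in usage.items():
        if int(row['ncount']) < 20:
            print "low-usage module %s/%s: ncount=%s" % (mtype, url_name, row['ncount'])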
def do_user_part_csv(course_id, basedir=None, datedir=None, use_dataset_latest=False, verbose=False, pin_date=None):
    sdir = find_course_sql_dir(course_id,
                               basedir=basedir,
                               datedir=datedir,
                               use_dataset_latest=(use_dataset_latest and not pin_date),
                               )
    # upload to google storage
    dfn = sdir / "user_api_usercoursetag.csv.gz"

    if not os.path.exists(dfn):
        print("[load_user_part] Missing %s, skipping" % dfn)
        return

    # reformat True / False to 1/0 for "value" field
    if verbose:
        print("[load_user_part] extracting user partition data from %s" % dfn)
        sys.stdout.flush()

    cdr = csv.DictReader(gzip.GzipFile(dfn))
    fields = cdr.fieldnames
    if verbose:
        print("fieldnames = %s" % fields)
    fixed_data = []
    bmap = {'true': 1, 'false': 0}
    for row in cdr:
        vstr = row['value'].lower()
        row['value'] = bmap.get(vstr, vstr)
        fixed_data.append(row)

    ofnb = 'user_partitions.csv.gz'
    odfn = sdir / ofnb
    with gzip.GzipFile(odfn, 'w') as ofp:
        cdw = csv.DictWriter(ofp, fieldnames=fields)
        cdw.writeheader()
        cdw.writerows(fixed_data)
    if verbose:
        print("[load_user_part] Wrote %d rows of user partition data to %s" % (len(fixed_data), odfn))
        sys.stdout.flush()

    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(odfn, gsdir / ofnb, verbose=False)

    mypath = os.path.dirname(os.path.realpath(__file__))
    schema = json.loads(open('%s/schemas/schema_user_partitions.json' % mypath).read())['user_partitions']

    # import into BigQuery
    table = "user_partitions"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
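# A quick illustration of the normalization above: user_api_usercoursetag stores booleans in
# the "value" column as "True"/"False" strings; bmap rewrites those to 1/0, and any other tag
# value is passed through lowercased.
def _example_user_partition_value_fixup():
    bmap = {'true': 1, 'false': 0}
    for raw in ['True', 'FALSE', 'group_2']:
        print("%s -> %s" % (raw, bmap.get(raw.lower(), raw.lower())))
    # True -> 1, FALSE -> 0, group_2 -> group_2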
def extractResearchData(self, course_id, tablename, the_dataset=None, rdp=None, rdp_format='csv',
                        output_bucket=None, basedir='', datedir='', do_gzip=True):
    '''
    Get research data output into tables and archive onto server
    '''
    # Archive location
    if course_id is not None:
        # Individual Course Research Data Products
        self.gsp = gsutil.gs_path_from_course_id(course_id=course_id, gsbucket=output_bucket, use_dataset_latest=True)
        gsfilename = "%s/%s" % (self.gsp, RESEARCH_DATA_PRODUCTS[rdp])
    else:
        print "ERROR! Must specify course_id's. Aborting."
        return

    try:
        # Copy to Google Storage
        msg = "[researchData]: Copying Research Data table %s to %s" % (tablename, gsfilename)
        print msg

        #gsfilename = "%s/%s-*.csv.gz" % ( self.gsp, tablename ) # temp
        gsfilename = "%s/%s.csv.gz" % (self.gsp, tablename)  # temp
        ret = bqutil.extract_table_to_gs(the_dataset, tablename, gsfilename,
                                         format=rdp_format,
                                         do_gzip=True,
                                         wait=True)
        msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(gsfilename)
        print msg
        sys.stdout.flush()

    except Exception as err:
        print str(err)
        if 'BQ Error creating table' in str(err):
            msg = "[researchData]: Retrying... by sharding."
            print msg
            sys.stdout.flush()
            gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename)
            print gsfilename
            sys.stdout.flush()
            ret = bqutil.extract_table_to_gs(the_dataset, tablename, gsfilename,
                                             format=rdp_format,
                                             do_gzip=True,
                                             wait=True)
            msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link(gsfilename)
            print msg
            sys.stdout.flush()

    # Copy from Google Storage to Secure Data Warehouse for archiving
    archiveLocation = find_course_sql_dir(course_id=course_id,
                                          basedir=basedir,
                                          datedir=datedir,
                                          use_dataset_latest=True)
    #time.sleep( CFG.TIME_TO_WAIT_30s ) # delay needed to allow for GS to upload file fully (This should be size dependent, and may not be enough time)
    msg = "[researchData]: Archiving Research Data table %s from %s to %s" % (tablename, gsfilename, archiveLocation)
    print msg
    sys.stdout.flush()
    gsutil.upload_file_to_gs(src=gsfilename, dst=archiveLocation, verbose=True)

    pass
def process_course(course_id, basedir, datedir, use_dataset_latest, use_local_files,
                   verbose=False, pin_date=None, stop_on_error=True):
    if pin_date:
        datedir = pin_date
    sdir = load_course_sql.find_course_sql_dir(course_id,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )
    edx2course_axis.DATADIR = sdir
    edx2course_axis.VERBOSE_WARNINGS = verbose

    fn_to_try = getattr(edx2bigquery_config, 'COURSE_FILES_PREFIX_NAMES', [])
    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    if not os.path.exists(fn):
        print "---> oops, cannot generate course axis for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        sys.stdout.flush()
        return

    # TODO: only create new axis if the table is missing, or the course axis is not already created
    try:
        edx2course_axis.process_xml_tar_gz_file(fn,
                                                use_local_files=use_local_files,
                                                use_dataset_latest=use_dataset_latest,
                                                force_course_id=course_id,
                                                )
    except Exception as err:
        print err
        traceback.print_exc()
        if stop_on_error:
            raise
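# The candidate filenames above come from the optional COURSE_FILES_PREFIX_NAMES setting in
# edx2bigquery_config.  A plausible value, mirroring the hard-coded fn_to_try lists used in
# the other process_course variants in this section (the exact names are deployment-specific,
# so treat this as an illustrative assumption):
COURSE_FILES_PREFIX_NAMES_EXAMPLE = [
    'course.xml.tar.gz',
    'course-prod-analytics.xml.tar.gz',
    'course-prod-edge-analytics.xml.tar.gz',
    'course-prod-edx-replica.xml.tar.gz',
]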
def make_gp_table(course_id, basedir=None, datedir=None, use_dataset_latest=False, verbose=False, pin_date=None):
    if pin_date:
        datedir = pin_date

    sdir = load_course_sql.find_course_sql_dir(course_id,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )

    fn_to_try = ['course.xml.tar.gz',
                 'course-prod-analytics.xml.tar.gz',
                 'course-prod-edge-analytics.xml.tar.gz',
                 'course-prod-edx-replica.xml.tar.gz',
                 ]

    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    if not os.path.exists(fn):
        msg = "---> oops, cannot get course content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        raise Exception(msg)

    gpstr, gpfn = read_grading_policy_from_tar_file(fn)
    fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn)

    ofnb = 'grading_policy.csv'
    ofn = sdir / ofnb
    ofp = open(ofn, 'w')
    cdw = csv.DictWriter(ofp, fieldnames=fields)
    cdw.writeheader()
    cdw.writerows(gptab)
    ofp.close()

    # upload to google storage
    gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest))
    gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False)

    # import into BigQuery
    table = "grading_policy"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
def process_course(course_id, basedir, datedir, use_dataset_latest, verbose=False, pin_date=None):
    if pin_date:
        datedir = pin_date
    sdir = load_course_sql.find_course_sql_dir(course_id,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )
    edx2course_axis.DATADIR = sdir
    edx2course_axis.VERBOSE_WARNINGS = verbose

    fn_to_try = ['course.xml.tar.gz',
                 'course-prod-analytics.xml.tar.gz',
                 'course-prod-edge-analytics.xml.tar.gz',
                 'course-prod-edx-replica.xml.tar.gz',
                 ]

    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    if not os.path.exists(fn):
        print "---> oops, cannot generate course axis for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        sys.stdout.flush()
        return

    # TODO: only create new axis if the table is missing, or the course axis is not already created
    try:
        edx2course_axis.process_xml_tar_gz_file(fn,
                                                use_dataset_latest=use_dataset_latest,
                                                force_course_id=course_id)
    except Exception as err:
        print err
        traceback.print_exc()
def process_courses(clist, basedir, listings):
    data = {y['course_id']: y for y in [json.loads(x) for x in open(listings)]}
    for course_id in clist:
        wrap = data[course_id].get('Course Wrap')
        try:
            (month, day, year) = map(int, wrap.split('/'))
        except Exception as err:
            print "Cannot parse date %s for course %s" % (wrap, course_id)
            continue
        wrap_dn = "%04d-%02d-%02d" % (year, month, day)
        datedir = ""
        use_dataset_latest = True
        course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest, verbose=False)
        sql_dir = course_dir.dirname()
        dirs_available = list(glob.glob(sql_dir / '20*' / "course*.tar.gz"))
        dirs_available.sort()
        # print "course %s, wrap %s, sql_dir=%s, dirs_available=%s" % (course_id, wrap_dn, sql_dir, dirs_available)
        chosen_dir = None
        just_chose = False
        for dn in dirs_available:
            ddate = dn.rsplit('/', 2)[1]
            if ddate < wrap_dn:
                chosen_dir = dn
                just_chose = True
            elif just_chose:
                chosen_dir = dn  # shift to the date just after wrap
                just_chose = False
        if not chosen_dir and dirs_available:
            chosen_dir = dirs_available[0]
        # print "course %s, wrap %s, suggested pin dir=%s" % (course_id, wrap_dn, chosen_dir)
        if chosen_dir:
            print "    '%s': '%s'," % (course_id, chosen_dir.rsplit('/', 2)[1])
        else:
            print "--> course %s, wrap %s, suggested pin dir NOT FOUND" % (course_id, wrap_dn)
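# A worked illustration of the pin-date selection above, with hypothetical dump dates: the loop
# keeps the last dated dump strictly before the wrap date, then shifts forward one entry to the
# first dump after wrap (presumably because each SQL dump is cumulative, so the first post-wrap
# dump covers the whole course).
def _example_choose_pin_dir():
    wrap_dn = "2014-06-01"
    dirs_available = ['sql/2014-05-11/course.tar.gz',
                      'sql/2014-05-25/course.tar.gz',
                      'sql/2014-06-08/course.tar.gz',
                      'sql/2014-07-06/course.tar.gz']
    chosen_dir = None
    just_chose = False
    for dn in dirs_available:
        ddate = dn.rsplit('/', 2)[1]        # extract the YYYY-MM-DD directory component
        if ddate < wrap_dn:
            chosen_dir = dn
            just_chose = True
        elif just_chose:
            chosen_dir = dn                 # shift to the date just after wrap
            just_chose = False
    print chosen_dir                        # -> sql/2014-06-08/course.tar.gz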
def process_course(course_id, basedir, datedir, use_dataset_latest, verbose=False, pin_date=None, stop_on_error=True):
    if pin_date:
        datedir = pin_date
    sdir = load_course_sql.find_course_sql_dir(course_id,
                                               basedir=basedir,
                                               datedir=datedir,
                                               use_dataset_latest=(use_dataset_latest and not pin_date),
                                               )
    edx2course_axis.DATADIR = sdir
    edx2course_axis.VERBOSE_WARNINGS = verbose

    fn_to_try = ['course.xml.tar.gz',
                 'course-prod-analytics.xml.tar.gz',
                 'course-prod-edge-analytics.xml.tar.gz',
                 'course-prod-edx-replica.xml.tar.gz',
                 ]

    for fntt in fn_to_try:
        fn = sdir / fntt
        if os.path.exists(fn):
            break
    if not os.path.exists(fn):
        print "---> oops, cannot generate course axis for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn)
        sys.stdout.flush()
        return

    # TODO: only create new axis if the table is missing, or the course axis is not already created
    try:
        edx2course_axis.process_xml_tar_gz_file(fn,
                                                use_dataset_latest=use_dataset_latest,
                                                force_course_id=course_id)
    except Exception as err:
        print err
        traceback.print_exc()
        if stop_on_error:
            raise
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False):
    basedir = path(basedir or '')
    course_dir = course_id.replace('/', '__')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    cdir = lfp
    print "Processing %s from files in %s" % (course_id, cdir)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath

    the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['user_info_combo'])

    uic = defaultdict(dict)  # dict with key = user_id, and val = dict to be written out as JSON line

    def copy_elements(src, dest, fields, prefix="", skip_empty=False):
        for key in fields:
            if skip_empty and (not key in src):
                src[key] = None
            if src[key] == 'NULL':
                continue
            if key == 'course_id' and src[key].startswith('course-v1:'):
                # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015
                src[key] = src[key].split(':', 1)[1].replace('+', '/')
            dest[prefix + key] = src[key]

    def openfile(fn_in, mode='r', add_dir=True):
        if add_dir:
            fn = cdir / fn_in
        else:
            fn = fn_in
        if (not os.path.exists(fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if mode == 'r' and not os.path.exists(fn):
            newfn = convert_sql(fn)  # try converting from *.sql file, if that exists
            if not newfn:
                return None  # failure, no file found, return None
            fn = newfn
        if fn.endswith('.gz'):
            return gzip.GzipFile(fn, mode)
        return open(fn, mode)

    def tsv2csv(fn_in, fn_out):
        import csv
        fp = openfile(fn_out, 'w', add_dir=False)
        csvfp = csv.writer(fp)
        for line in openfile(fn_in, add_dir=False):
            csvfp.writerow(line[:-1].split('\t'))
        fp.close()

    def convert_sql(fnroot):
        '''
        Returns filename if suitable file exists or was created by conversion of tab separated
        values to comma separated values.  Returns False otherwise.
        '''
        if fnroot.endswith('.gz'):
            fnroot = fnroot[:-3]
        if fnroot.endswith('.csv'):
            fnroot = fnroot[:-4]
        if os.path.exists(fnroot + ".csv"):
            return fnroot + ".csv"
        if os.path.exists(fnroot + ".csv.gz"):
            return fnroot + ".csv.gz"
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)
            return outfn
        return False

    nusers = 0
    fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
    for line in csv.DictReader(openfile('users.csv')):
        uid = int(line['id'])
        copy_elements(line, uic[uid], fields)
        uic[uid]['user_id'] = uid
        nusers += 1
        uic[uid]['y1_anomalous'] = None
    print " %d users loaded from users.csv" % nusers

    fp = openfile('profiles.csv')
    if fp is None:
        print "--> Skipping profiles.csv, file does not exist"
    else:
        nprofiles = 0
        fields = ['name', 'language', 'location', 'meta', 'courseware', 'gender',
                  'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
                  'allow_certificate', 'country', 'city']
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="profile_")
            nprofiles += 1
        print " %d profiles loaded from profiles.csv" % nprofiles

    fp = openfile('enrollment.csv')
    if fp is None:
        print "--> Skipping enrollment.csv, file does not exist"
    else:
        nenrollments = 0
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            copy_elements(line, uic[uid], fields, prefix="enrollment_")
            nenrollments += 1
        print " %d enrollments loaded from enrollment.csv" % nenrollments

    # see if from_mongodb files are present for this course; if so, merge in that data
    mongodir = cdir.dirname() / 'from_mongodb'
    if mongodir.exists():
        print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir
        sys.stdout.flush()
        fp = gzip.GzipFile(mongodir / "users.json.gz")
        fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined']
        nadded = 0
        for line in fp:
            pdata = json.loads(line)
            uid = int(pdata['_id'])
            if not uid in uic:
                copy_elements(pdata, uic[uid], fields, skip_empty=True)
                uic[uid]['user_id'] = uid
                nadded += 1
        fp.close()
        print " %d additional users loaded from %s/users.json.gz" % (nadded, mongodir)

        fp = gzip.GzipFile(mongodir / "profiles.json.gz")
        fields = ['name', 'language', 'location', 'meta', 'courseware', 'gender',
                  'mailing_address', 'year_of_birth', 'level_of_education', 'goals',
                  'allow_certificate', 'country', 'city']
        nadd_profiles = 0

        def fix_unicode(elem, fields):
            for k in fields:
                if (k in elem) and elem[k]:
                    elem[k] = elem[k].encode('utf8')

        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('profile_name', None):
                copy_elements(pdata, uic[uid], fields, prefix="profile_", skip_empty=True)
                fix_unicode(uic[uid], ['profile_name', 'profile_mailing_address', 'profile_goals',
                                       'profile_location', 'profile_language'])
                uic[uid]['y1_anomalous'] = 1
                nadd_profiles += 1
        fp.close()
        print " %d additional profiles loaded from %s/profiles.json.gz" % (nadd_profiles, mongodir)

        # if datedir is specified, then do not add entries from mongodb where the enrollment
        # happened after the datedir cutoff
        cutoff = None
        if datedir:
            cutoff = "%s 00:00:00" % datedir

        fp = gzip.GzipFile(mongodir / "enrollment.json.gz")
        fields = ['course_id', 'created', 'is_active', 'mode', ]
        nadd_enrollment = 0
        n_removed_after_cutoff = 0
        for line in fp:
            pdata = json.loads(line.decode('utf8'))
            uid = int(pdata['user_id'])
            if not uic[uid].get('enrollment_course_id', None):
                if cutoff and (pdata['created'] > cutoff) and (uic[uid].get('y1_anomalous') == 1):
                    # remove if enrolled after datedir cutoff
                    uic.pop(uid)
                    n_removed_after_cutoff += 1
                else:
                    copy_elements(pdata, uic[uid], fields, prefix="enrollment_", skip_empty=True)
                    nadd_enrollment += 1
        fp.close()
        print " %d additional enrollments loaded from %s/enrollment.json.gz" % (nadd_enrollment, mongodir)

        print " from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % (
            nadded - n_removed_after_cutoff, nadded, nadd_profiles, nadd_enrollment, n_removed_after_cutoff, cutoff)
        sys.stdout.flush()

    fp = openfile('certificates.csv')
    if fp is None:
        print "--> Skipping certificates.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['user_id'])
            fields = ['download_url', 'grade', 'course_id', 'key', 'distinction', 'status',
                      'verify_uuid', 'download_uuid', 'name', 'created_date', 'modified_date',
                      'error_reason', 'mode', ]
            copy_elements(line, uic[uid], fields, prefix="certificate_")
            if 'user_id' not in uic[uid]:
                uic[uid]['user_id'] = uid

    # sanity check for entries with user_id but missing username
    nmissing_uname = 0
    for uid, entry in uic.iteritems():
        if (not 'username' in entry) or (not entry['username']):
            nmissing_uname += 1
            if nmissing_uname < 10:
                print "missing username: %s" % entry
    print "--> %d entries missing username" % nmissing_uname
    sys.stdout.flush()

    # sanity check for entries missing course_id
    nmissing_cid = 0
    for uid, entry in uic.iteritems():
        if (not 'enrollment_course_id' in entry) or (not entry['enrollment_course_id']):
            nmissing_cid += 1
            entry['enrollment_course_id'] = course_id
    print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % (nmissing_cid, course_id)
    sys.stdout.flush()

    fp = openfile('user_id_map.csv')
    if fp is None:
        print "--> Skipping user_id_map.csv, file does not exist"
    else:
        for line in csv.DictReader(fp):
            uid = int(line['id'])
            fields = ['hash_id']
            copy_elements(line, uic[uid], fields, prefix="id_map_")

    # sort by userid
    uidset = uic.keys()
    uidset.sort()

    # write out result, checking schema along the way
    fieldnames = the_dict_schema.keys()
    ofp = openfile('user_info_combo.json.gz', 'w')
    ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'), fieldnames=fieldnames)
    ocsv.writeheader()

    for uid in uidset:
        data = uic[uid]
        check_schema(uid, data, the_ds=the_dict_schema, coerce=True)
        if ('enrollment_course_id' not in data) and ('certificate_course_id' not in data):
            print "Oops! missing course_id in user_info_combo line: inconsistent SQL?"
            print "data = %s" % data
            print "Suppressing this row"
            continue
        row_course_id = data.get('enrollment_course_id', data.get('certificate_course_id', ''))
        if not row_course_id == course_id:
            print "Oops! course_id=%s in user_info_combo line: inconsistent with expected=%s" % (row_course_id, course_id)
            print "data = %s" % data
            print "Suppressing this row"
            continue
        ofp.write(json.dumps(data) + '\n')
        try:
            ocsv.writerow(data)
        except Exception as err:
            print "failed to write data=%s" % data
            raise

    print "Done with make_user_info_combo for %s" % course_id
    sys.stdout.flush()
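# The copy_elements helper above normalizes "opaque key" course ids to the older transparent
# form; a standalone illustration of that one transformation, using the example from the
# comment in copy_elements:
def _example_course_id_normalization():
    cid = 'course-v1:MITx+6.00.2x_3+1T2015'
    if cid.startswith('course-v1:'):
        cid = cid.split(':', 1)[1].replace('+', '/')
    print cid  # -> MITx/6.00.2x_3/1T2015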
def fix_missing_grades(course_id,
                       getGrades=None,
                       download_only=None,
                       course_id_type='opaque',  # Assume newer courses with format 'course-v1:HarvardX+course+term'
                       org_list=None,  # org list is passed in order to properly grab data from edX instructor dashboard
                       basedir=None,
                       datedir=None,
                       use_dataset_latest=False):
    lfp = find_course_sql_dir(course_id=course_id, basedir=basedir)  # , org_list=org_list)
    print lfp
    gradereport_dir = lfp + '/from_edxinstructordash'
    print gradereport_dir
    if not gradereport_dir.exists():
        os.mkdir(gradereport_dir)

    # Download grade report for single course
    def do_grade_report_export(ofn_dir, ofn, course_id):
        # Properly convert to opaque, if needed
        # Use Case: course_list contains transparent course ids,
        # but need to pass opaque course id for edX instructor dashboard
        # course_id_type can be set using --course-id-type opaque, for example, on the command line
        # Important to properly pass the original course id, as it exists in the edX platform
        course_id = parseCourseIdField(course=course_id, org_list=org_list, course_id_type=course_id_type)
        print 'Processing course %s' % course_id
        sys.stdout.flush()

        try:
            # Make Grade Report Request
            getGrades.make_grade_report_request(course_id)

            # Check current set of grade reports
            grade_reports_dict = getGrades.get_grade_reports(course_id, ofn_dir)

            if download_only:
                print 'Download grade report only specified'
                sys.stdout.flush()
                getGrades.get_latest_grade_report(grade_reports_dict, ofn, ofn_dir)
            else:
                # Keep checking until the Grade Report is Ready
                grade_report_ready = False
                while not grade_report_ready:
                    current_time = datetime.datetime.now()
                    print "Checking grade report status at %s" % current_time
                    sys.stdout.flush()
                    time.sleep(TIME_TO_WAIT)
                    if grade_report_download_ready(course_id):
                        grade_reports_dict = getGrades.get_grade_reports(course_id, ofn_dir)
                        getGrades.get_latest_grade_report(grade_reports_dict, ofn, ofn_dir)
                        grade_report_ready = True

        except Exception as err:
            print str(err)
            grade_report_downloaded = False
            # Try one last time
            try:
                # If failure, then just try grabbing the latest grade report, just in case
                grade_reports_dict = getGrades.get_grade_reports(course_id, ofn_dir)
                getGrades.get_latest_grade_report(grade_reports_dict, ofn, ofn_dir)
                grade_report_downloaded = True
            except Exception as err:
                print 'Failure on second attempt %s' % str(err)
                sys.stdout.flush()
                raise
            if grade_report_downloaded:
                print 'Success on second attempt'
                sys.stdout.flush()
            else:
                print 'Failure on second attempt %s' % str(err)
                sys.stdout.flush()
                raise

    def grade_report_download_ready(course_id):
        getGrades.set_course_id(course_id)
        if getGrades.list_instructor_tasks()['tasks']:
            return False
        else:
            return True

    # Do single Course
    do_grade_report_export(gradereport_dir, "grade_report.csv", course_id)
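# Hedged usage sketch: getGrades is assumed to be an edX instructor-dashboard client exposing
# make_grade_report_request / get_grade_reports / get_latest_grade_report / set_course_id /
# list_instructor_tasks, i.e. exactly the methods called above; the client and the course id
# used here are hypothetical.
def _example_fix_missing_grades(getGrades):
    fix_missing_grades('HarvardX/SPU27x/2T2016',
                       getGrades=getGrades,
                       download_only=True,  # just fetch the latest already-generated report
                       course_id_type='opaque',
                       basedir='X-Year-2-data-sql')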
def analyze_course_content(course_id,
                           listings_file=None,
                           basedir="X-Year-2-data-sql",
                           datedir="2013-09-21",
                           use_dataset_latest=False,
                           do_upload=False,
                           courses=None,
                           verbose=True,
                           ):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out the staff and un-shown content.
    Do the exclusion based on the count of each module appearing in the studentmodule table,
    via stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course
    report dataset as the "stats_course_content" table.  Also generate a "course_summary_stats"
    table, stored in the course_report_ORG or course_report_latest dataset.  The
    course_summary_stats table combines data from many reports, including stats_course_content,
    the medians report, the listings file, broad_stats_by_course, and
    time_on_task_stats_by_course.
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split('/', 1)[0]  # extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops! Failed to load schema file for %s. Error: %s" % (tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False,
                                      format='csv', skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses])
        sql = "select * from {course_tables}".format(course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(dataset, table, sql=sql,
                                                   newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                                   )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)

        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max([x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset, latest_person_course)

        sql = """
                SELECT pc.course_id as course_id,
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id,
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id

                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'})

        # rates for registrants before and during course

        sql = """
                SELECT *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id,
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date)
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date)
                              and (pc.start_time < cminfo.wrap_date)
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (

                # --------------------
                #  get course launch and wrap dates from course_metainfo

                SELECT AA.course_id as course_id,
                       AA.wrap_date as wrap_date,
                       AA.launch_date as launch_date,
                       BB.ewrap_date as ewrap_date,
                FROM (
                  # inner get course launch and wrap dates from course_metainfo
                  SELECT A.course_id as course_id,
                    A.wrap_date as wrap_date,
                    B.launch_date as launch_date,
                  from
                  (
                   SELECT course_id,
                        TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                   FROM (
                    SELECT course_id,
                      regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                      regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                      regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                    FROM [{dataset}.course_metainfo]
                    where key='listings_Course Wrap'
                   )
                  ) as A
                  left outer join
                  (
                   SELECT course_id,
                        TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                   FROM (
                    SELECT course_id,
                      regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                      regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                      regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                    FROM [{dataset}.course_metainfo]
                    where key='listings_Course Launch'
                   )
                  ) as B
                  on A.course_id = B.course_id
                  # end inner course_metainfo subquery
                ) as AA
                left outer join
                (
                   SELECT course_id,
                        TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                   FROM (
                    SELECT course_id,
                      regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                      regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                      regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                    FROM [{dataset}.course_metainfo]
                    where key='listings_Empirical Course Wrap'
                   )
                ) as BB
                on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------

                ) as cminfo
                on pc.course_id = cminfo.course_id

                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql,
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(entry['n_male_viewed'])
                                                                     + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci['certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {'msbc_viewed': "viewed_median_stats_by_course",
                       'msbc_explored': 'explored_median_stats_by_course',
                       'msbc_certified': 'certified_median_stats_by_course',
                       'msbc_verified': 'verified_median_stats_by_course',
                       }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output
        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats
        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        def make_key(key):
            key = key.strip()
            key = key.replace(' ', '_').replace("'", "_").replace('/', '_').replace('(', '').replace(')', '').replace('-', '_').replace(',', '')
            return key

        listings_keys = map(make_key, ["Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun",
                                       "Course Number", "Short Title", "Andrew's Short Titles", "Title",
                                       "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id",
                                       "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade",
                                       '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)"
                                       ])
        listings_keys.reverse()

        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = ['discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput',
                          'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential',
                          'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img',
                          'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric',
                          'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category',
                          'option', 'lti', 'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt', 'comment',
                          'annotationinput', 'image', 'options', 'comment_prompt', 'conditional', 'answer',
                          'poll_question', 'section', 'wrapper', 'map', 'area', 'customtag', 'transcript',
                          'split_test', 'word_cloud', 'openended', 'openendedparam', 'answer_display', 'code',
                          'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 'textannotation',
                          'imageannotation', 'videosequence', 'feedbackprompt', 'assessments', 'openassessment',
                          'assessment', 'explanation', 'criterion']

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith('listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print " course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [json.loads(x) for x in open(listings_file)]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read())
        schema_dict = {x['name']: x for x in the_schema}

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in list(entry):  # iterate over a snapshot, since keys may be popped
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])  # coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False)

        return

    print "-" * 60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    for k in csv.DictReader(open(lfn)):
        if k['course_id'] == course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)

    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (xbfn, course_id)
        return

    print "[analyze_content] For %s using %s" % (course_id, xbfn)

    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()

    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = ['html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body',
              'span', 'strong', 'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike',
              'i', 's', 'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki', 'course', 'font',
              'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 'head', 'source', 'dt', 'hr', 'u', 'style',
              'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title']

    def walk_tree(elem):
        if type(elem.tag) == str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    print " -> excluding %s (%s), ncount=%s" % (k.get('display_name', '<no_display_name>').encode('utf8'),
                                                                midfrag,
                                                                mudata.get(midfrag, {}).get('ncount'))
                continue
            walk_tree(k)

    walk_tree(xml)
    print counts

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = ['openassessment', 'optionresponse', 'multiplechoiceresponse',
                 # 'discussion',
                 'choiceresponse', 'word_cloud',
                 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti']
    quant_axis = ['formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse',
                  'coderesponse', 'imageresponse']

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count

    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k

    ccd[course_id] = {'course_id': course_id,
                      'nweeks': nweeks,
                      'nqual_per_week': nqual_per_week,
                      'nquant_per_week': nquant_per_week,
                      'total_assessments_per_week': total_per_week,
                      }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = ['course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks']
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update({('listings_%s' % key): value for key, value in data.items()})  # from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })  # from content counts
    for key in sorted(counts):
        # store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] = str(value)  # from content counts

    cmfields.update({('nexcluded_sub_20_%s' % key): str(value) for key, value in nexcluded.items()})  # from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id'] == course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print " overriding key=%s with value=%s" % (ovent['key'], ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')
    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})
    fp.close()

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table)

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
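# The override hook above expects CMINFO_OVERRIDES to be a CSV of course_id,key,value rows,
# matching the DictReader fields consumed in the loop.  A hypothetical example file that pins
# one listing-derived field for a single course:
#
#   course_id,key,value
#   MITx/6.002x/2013_Spring,listings_Course Wrap,06/15/2013
#
# Any key present here replaces the automatically computed cmfields entry of the same name.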
def analyze_course_content( course_id, listings_file=None, basedir="X-Year-2-data-sql", datedir="2013-09-21", use_dataset_latest=False, do_upload=False, courses=None, verbose=True, pin_date=None, ): ''' Compute course_content table, which quantifies: - number of chapter, sequential, vertical modules - number of video modules - number of problem, *openended, mentoring modules - number of dicussion, annotatable, word_cloud modules Do this using the course "xbundle" file, produced when the course axis is computed. Include only modules which had nontrivial use, to rule out the staff and un-shown content. Do the exclusion based on count of module appearing in the studentmodule table, based on stats_module_usage for each course. Also, from the course listings file, compute the number of weeks the course was open. If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset as the "stats_course_content" table. Also generate a "course_summary_stats" table, stored in the course_report_ORG or course_report_latest dataset. The course_summary_stats table combines data from many reports,, including stats_course_content, the medians report, the listings file, broad_stats_by_course, and time_on_task_stats_by_course. ''' if do_upload: if use_dataset_latest: org = "latest" else: org = courses[0].split( '/', 1)[0] # extract org from first course_id in courses crname = 'course_report_%s' % org gspath = gsutil.gs_path_from_course_id(crname) gsfnp = gspath / CCDATA gsutil.upload_file_to_gs(CCDATA, gsfnp) tableid = "stats_course_content" dataset = crname mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath try: the_schema = json.loads(open(SCHEMA_FILE).read())[tableid] except Exception as err: print "Oops! Failed to load schema file for %s. Error: %s" % ( tableid, str(err)) raise if 0: bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1) table = 'course_metainfo' course_tables = ',\n'.join([ ('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses ]) sql = "select * from {course_tables}".format( course_tables=course_tables) print "--> Creating %s.%s using %s" % (dataset, table, sql) if 1: metainfo_dataset = bqutil.get_bq_table( dataset, table, sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), ) # bqutil.create_bq_table(dataset, table, sql, overwrite=True) #----------------------------------------------------------------------------- # make course_summary_stats table # # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo. # Also use (and create if necessary) the nregistered_by_wrap table. 
# get the broad_stats_by_course data bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course') table_list = bqutil.get_list_of_table_ids(dataset) latest_person_course = max( [x for x in table_list if x.startswith('person_course_')]) print "Latest person_course table in %s is %s" % (dataset, latest_person_course) sql = """ SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, FROM [{dataset}.{person_course}] as pc left join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' )) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id """.format(dataset=dataset, person_course=latest_person_course) nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'}) # rates for registrants before and during course sql = """ SELECT *, ncertified / nregistered * 100 as pct_certified_of_reg, ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch, ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course, ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap, ncertified / nviewed * 100 as pct_certified_of_viewed, ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap, ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap, FROM ( # ------------------------ # get aggregate data SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.certified then 1 else 0 end) ncertified, sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap, sum(case when pc.viewed then 1 else 0 end) nviewed, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap, sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap, sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch, sum(case when pc.start_time < cminfo.launch_date and pc.certified then 1 else 0 end) ncertified_and_registered_before_launch, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) and pc.certified then 1 else 0 end) ncertified_and_registered_during_course, FROM [{dataset}.{person_course}] as pc left join ( # -------------------- # get course launch and wrap dates from course_metainfo SELECT AA.course_id as course_id, AA.wrap_date as wrap_date, AA.launch_date as launch_date, BB.ewrap_date as ewrap_date, FROM ( # inner get course launch and wrap dates from course_metainfo 
SELECT A.course_id as course_id, A.wrap_date as wrap_date, B.launch_date as launch_date, from ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' ) ) as A left outer join ( SELECT course_id, TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month, regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day, regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year, FROM [{dataset}.course_metainfo] where key='listings_Course Launch' ) ) as B on A.course_id = B.course_id # end inner course_metainfo subquery ) as AA left outer join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Empirical Course Wrap' ) ) as BB on AA.course_id = BB.course_id # end course_metainfo subquery # -------------------- ) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id # ---- end get aggregate data ) order by course_id """.format(dataset=dataset, person_course=latest_person_course) print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration' sys.stdout.flush() cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, newer_than=datetime.datetime( 2015, 1, 16, 3, 0), key={'name': 'course_id'}) # start assembling course_summary_stats c_sum_stats = defaultdict(OrderedDict) for entry in bsbc['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] cmci.update(entry) cnbw = nr_by_wrap['data_by_key'][course_id] nbw = int(cnbw['nregistered_by_wrap']) cmci['nbw_wrap_date'] = cnbw['wrap_date'] cmci['nregistered_by_wrap'] = nbw cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct'] cmci['frac_female'] = float(entry['n_female_viewed']) / (float( entry['n_male_viewed']) + float(entry['n_female_viewed'])) ncert = float(cmci['certified_sum']) if ncert: cmci[ 'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0 else: cmci['certified_of_nregistered_by_wrap_pct'] = None cbr = cert_by_reg['data_by_key'][course_id] for field, value in cbr.items(): cmci['cbr_%s' % field] = value # add medians for viewed, explored, and certified msbc_tables = { 'msbc_viewed': "viewed_median_stats_by_course", 'msbc_explored': 'explored_median_stats_by_course', 'msbc_certified': 'certified_median_stats_by_course', 'msbc_verified': 'verified_median_stats_by_course', } for prefix, mtab in msbc_tables.items(): print "--> Merging median stats data from %s" % mtab sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, mtab) for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): cmci['%s_%s' % (prefix, field)] = value # add time on task data tot_table = "time_on_task_stats_by_course" prefix = "ToT" print "--> Merging time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = 
{'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add serial time on task data tot_table = "time_on_task_serial_stats_by_course" prefix = "SToT" print "--> Merging serial time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add show_answer stats tot_table = "show_answer_stats_by_course" prefix = "SAS" print "--> Merging show_answer stats data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # setup list of keys, for CSV output css_keys = c_sum_stats.values()[0].keys() # retrieve course_metainfo table, pivot, add that to summary_stats print "--> Merging course_metainfo from %s" % table sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, table) listings_keys = map(make_key, [ "Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", "Course Number", "Short Title", "Andrew's Short Titles", "Title", "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id", "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade", '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)" ]) listings_keys.reverse() for lk in listings_keys: css_keys.insert(1, "listings_%s" % lk) COUNTS_TO_KEEP = [ 'discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 'option', 'lti', 'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 'options', 'comment_prompt', 'conditional', 'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 'customtag', 'transcript', 'split_test', 'word_cloud', 'openended', 'openendedparam', 'answer_display', 'code', 'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 'textannotation', 'imageannotation', 'videosequence', 'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion' ] for entry in bqdat['data']: thekey = make_key(entry['key']) # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP: # continue if thekey.startswith( 'listings_') and thekey[9:] not in listings_keys: # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id']) continue c_sum_stats[entry['course_id']][thekey] = entry['value'] #if 'certifies' in thekey: # print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value']) if thekey not in css_keys: css_keys.append(thekey) # compute forum_posts_per_week for course_id, entry in c_sum_stats.items(): nfps = entry.get('nforum_posts_sum', 0) if nfps: fppw = int(nfps) / 
float(entry['nweeks']) entry['nforum_posts_per_week'] = fppw print " course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry.get('total_assessments_per_week'), fppw) else: entry['nforum_posts_per_week'] = None css_keys.append('nforum_posts_per_week') # read in listings file and merge that in also if listings_file: if listings_file.endswith('.csv'): listings = csv.DictReader(open(listings_file)) else: listings = [json.loads(x) for x in open(listings_file)] for entry in listings: course_id = entry['course_id'] if course_id not in c_sum_stats: continue cmci = c_sum_stats[course_id] for field, value in entry.items(): lkey = "listings_%s" % make_key(field) if not (lkey in cmci) or (not cmci[lkey]): cmci[lkey] = value print "Storing these fields: %s" % css_keys # get schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read()) schema_dict = {x['name']: x for x in the_schema} # write out CSV css_table = "course_summary_stats" ofn = "%s__%s.csv" % (dataset, css_table) ofn2 = "%s__%s.json" % (dataset, css_table) print "Writing data to %s and %s" % (ofn, ofn2) ofp = open(ofn, 'w') ofp2 = open(ofn2, 'w') dw = csv.DictWriter(ofp, fieldnames=css_keys) dw.writeheader() for cid, entry in c_sum_stats.items(): for ek in list(entry): # iterate over a copy of the keys, since we pop below if ek not in schema_dict: entry.pop(ek) # entry[ek] = str(entry[ek]) # coerce to be string ofp2.write(json.dumps(entry) + "\n") for key in css_keys: if key not in entry: entry[key] = None dw.writerow(entry) ofp.close() ofp2.close() # upload to bigquery # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ] if 1: gsfnp = gspath / dataset / (css_table + ".json") gsutil.upload_file_to_gs(ofn2, gsfnp) # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False, # format='csv', skiprows=1) bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False) return print "-" * 60 + " %s" % course_id # get nweeks from listings lfn = path(listings_file) if not lfn.exists(): print "[analyze_content] course listings file %s doesn't exist!" % lfn return data = None if listings_file.endswith('.json'): data_feed = map(json.loads, open(lfn)) else: data_feed = csv.DictReader(open(lfn)) for k in data_feed: if not 'course_id' in k: print "Strange course listings row, no course_id in %s" % k raise Exception("Missing course_id") if k['course_id'] == course_id: data = k break if not data: print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn) return def date_parse(field): (m, d, y) = map(int, data[field].split('/')) return datetime.datetime(y, m, d) launch = date_parse('Course Launch') wrap = date_parse('Course Wrap') ndays = (wrap - launch).days nweeks = ndays / 7.0 print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays) if pin_date: datedir = pin_date course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest and not pin_date) cfn = gsutil.path_from_course_id(course_id) xbfn = course_dir / ("xbundle_%s.xml" % cfn) if not xbfn.exists(): print "[analyze_content] cannot find xbundle file %s for %s!" % (xbfn, course_id) if use_dataset_latest: # try looking in earlier directories for xbundle file import glob spath = course_dir / ("../*/xbundle_%s.xml" % cfn) files = list(glob.glob(spath)) if files: xbfn = path(files[-1]) if not xbfn.exists(): print " --> also cannot find any %s ; aborting!"
% spath else: print " --> Found and using instead: %s " % xbfn if not xbfn.exists(): raise Exception("[analyze_content] missing xbundle file %s" % xbfn) # if there is an xbundle*.fixed file, use that instead of the normal one if os.path.exists(str(xbfn) + ".fixed"): xbfn = path(str(xbfn) + ".fixed") print "[analyze_content] For %s using %s" % (course_id, xbfn) # get module usage data mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest) xml = etree.parse(open(xbfn)).getroot() counts = defaultdict(int) nexcluded = defaultdict(int) IGNORE = [ 'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title' ] problem_stats = defaultdict(int) def does_problem_have_random_script(problem): ''' return 1 if problem has a script with "random." in it else return 0 ''' for elem in problem.findall('.//script'): if elem.text and ('random.' in elem.text): return 1 return 0 # walk through xbundle def walk_tree(elem, policy=None): ''' Walk XML tree recursively. elem = current element policy = dict of attributes for children to inherit, with fields like due, graded, showanswer ''' policy = policy or {} if type(elem.tag) == str and (elem.tag.lower() not in IGNORE): counts[elem.tag.lower()] += 1 if elem.tag in [ "sequential", "problem", "problemset", "course", "chapter" ]: # very old courses may use inheritance from course & chapter keys = ["due", "graded", "format", "showanswer", "start"] for k in keys: # copy inheritable attributes, if they are specified val = elem.get(k) if val: policy[k] = val if elem.tag == "problem": # accumulate statistics about problems: how many have show_answer = [past_due, closed] ? have random. in script? problem_stats['n_capa_problems'] += 1 if policy.get('showanswer'): problem_stats["n_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118 # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed. 
problem_stats[ 'n_random_script'] += does_problem_have_random_script(elem) if policy.get('graded') == 'true' or policy.get( 'graded') == 'True': problem_stats['n_capa_problems_graded'] += 1 problem_stats[ 'n_graded_random_script'] += does_problem_have_random_script( elem) if policy.get('showanswer'): problem_stats["n_graded_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_graded_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) for k in elem: midfrag = (k.tag, k.get('url_name_orig', None)) if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20: nexcluded[k.tag] += 1 if verbose: try: print " -> excluding %s (%s), ncount=%s" % ( k.get('display_name', '<no_display_name>').encode('utf8'), midfrag, mudata.get(midfrag, {}).get('ncount')) except Exception as err: print " -> excluding ", k continue walk_tree(k, policy.copy()) walk_tree(xml) print "--> Count of individual element tags throughout XML: ", counts print "--> problem_stats:", json.dumps(problem_stats, indent=4) # combine some into "qual_axis" and others into "quant_axis" qual_axis = [ 'openassessment', 'optionresponse', 'multiplechoiceresponse', # 'discussion', 'choiceresponse', 'word_cloud', 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti' ] quant_axis = [ 'formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse', 'imageresponse' ] nqual = 0 nquant = 0 for tag, count in counts.items(): if tag in qual_axis: nqual += count if tag in quant_axis: nquant += count print "nqual=%d, nquant=%d" % (nqual, nquant) nqual_per_week = nqual / nweeks nquant_per_week = nquant / nweeks total_per_week = nqual_per_week + nquant_per_week print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % ( nqual_per_week, nquant_per_week, total_per_week) # save this overall data in CCDATA lock_file(CCDATA) ccdfn = path(CCDATA) ccd = {} if ccdfn.exists(): for k in csv.DictReader(open(ccdfn)): ccd[k['course_id']] = k ccd[course_id] = { 'course_id': course_id, 'nweeks': nweeks, 'nqual_per_week': nqual_per_week, 'nquant_per_week': nquant_per_week, 'total_assessments_per_week': total_per_week, } # fields = ccd[ccd.keys()[0]].keys() fields = [ 'course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks' ] cfp = open(ccdfn, 'w') dw = csv.DictWriter(cfp, fieldnames=fields) dw.writeheader() for cid, entry in ccd.items(): dw.writerow(entry) cfp.close() lock_file(CCDATA, release=True) # store data in course_metainfo table, which has one (course_id, key, value) on each line # keys include nweeks, nqual, nquant, count_* for module types * cmfields = OrderedDict() cmfields['course_id'] = course_id cmfields['course_length_days'] = str(ndays) cmfields.update( {make_key('listings_%s' % key): value for key, value in data.items()}) # from course listings cmfields.update(ccd[course_id].copy()) # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() }) # from content counts cmfields['filename_xbundle'] = xbfn cmfields['filename_listings'] = lfn for key in sorted( counts ): # store counts in sorted order, so that the later generated CSV file can have a predictable structure value = counts[key] cmfields['count_%s' % key] = str(value) # from content counts for key in sorted(problem_stats): # store problem stats value = problem_stats[key] cmfields['problem_stat_%s' % key] = str(value) cmfields.update({('nexcluded_sub_20_%s' % key): str(value) for key, value in nexcluded.items() }) # 
from content counts course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) csvfn = course_dir / CMINFO # manual overriding of the automatically computed fields can be done by storing course_id,key,value data # in the CMINFO_OVERRIDES file csvfn_overrides = course_dir / CMINFO_OVERRIDES if csvfn_overrides.exists(): print "--> Loading manual override information from %s" % csvfn_overrides for ovent in csv.DictReader(open(csvfn_overrides)): if not ovent['course_id'] == course_id: print "===> ERROR! override file has entry with wrong course_id: %s" % ovent continue print " overriding key=%s with value=%s" % (ovent['key'], ovent['value']) cmfields[ovent['key']] = ovent['value'] print "--> Course metainfo writing to %s" % csvfn fp = open(csvfn, 'w') cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value']) cdw.writeheader() for k, v in cmfields.items(): cdw.writerow({'course_id': course_id, 'key': k, 'value': v}) fp.close() # build and output course_listings_and_metainfo dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) mypath = os.path.dirname(os.path.realpath(__file__)) clm_table = "course_listing_and_metainfo" clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table) clm_schema = json.loads(open(clm_schema_file).read()) clm = {} for finfo in clm_schema: field = finfo['name'] clm[field] = cmfields.get(field) clm_fnb = clm_table + ".json" clm_fn = course_dir / clm_fnb open(clm_fn, 'w').write(json.dumps(clm)) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / clm_fnb print "--> Course listing + metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, clm_table) sys.stdout.flush() gsutil.upload_file_to_gs(clm_fn, gsfnp) bqutil.load_data_to_table(dataset, clm_table, gsfnp, clm_schema, wait=True, verbose=False) # output course_metainfo table = 'course_metainfo' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / CMINFO print "--> Course metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, table) sys.stdout.flush() gsutil.upload_file_to_gs(csvfn, gsfnp) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())[table] bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
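# Illustrative sketch (not called by the pipeline): course_metainfo is stored as
# (course_id, key, value) triples, one per CSV row, as written to course_dir / CMINFO
# above, so a downstream consumer can pivot it back into one flat record per course.
# The default filename here is hypothetical.
def _example_pivot_course_metainfo(csvfn="course_metainfo.csv"):
    import csv
    meta = {}
    for row in csv.DictReader(open(csvfn)):
        meta[row['key']] = row['value']  # later rows for the same key overwrite earlier ones
    return meta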
def rephrase_forum_json_for_course( course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir=None, do_gs_copy=False, use_dataset_latest=False, ): print "Loading SQL for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() fn = 'forum.mongo' gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest) def openfile(fn, mode='r'): if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')): fn += ".gz" if fn.endswith('.gz'): return gzip.GzipFile(lfp / fn, mode) return open(lfp / fn, mode) fp = openfile(fn) ofn = lfp / "forum-rephrased.json.gz" ofncsv = "forum.csv.gz" # To match table name in BQ ofncsv_lfp = lfp / ofncsv dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) if os.path.exists(ofn) and os.path.exists(ofncsv_lfp): tables = bqutil.get_list_of_table_ids(dataset) if not 'forum' in tables: print "Already done? But no forums table loaded into dataset %s. Redoing." % dataset else: print "Already done %s -> %s (skipping)" % (fn, ofn) print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp) sys.stdout.flush() return print "Processing %s -> writing to %s and %s (%s)" % ( fn, ofn, ofncsv, datetime.datetime.now()) sys.stdout.flush() # Setup CSV header ocsv = csv.DictWriter(openfile(ofncsv, 'w'), fieldnames=SCHEMA_DICT.keys(), quoting=csv.QUOTE_NONNUMERIC) ocsv.writeheader() cnt = 0 ofp = gzip.GzipFile('tmp.json.gz', 'w') data = OrderedDict() for line in fp: cnt += 1 # Write JSON row newline = do_rephrase_line(line, linecnt=cnt) ofp.write(newline) try: # Write CSV row data = json.loads(newline) ocsv.writerow(data) except Exception as err: print "Error writing CSV output row %s=%s" % (cnt, data) raise ofp.close() print "...done (%s)" % datetime.datetime.now() if cnt == 0: print "...but cnt=0 entries found, skipping forum loading" sys.stdout.flush() return print "...copying to GCS" sys.stdout.flush() # do upload twice, because GCS file metadata doesn't always make it to BigQuery right away? gsfn = gsdir + '/' + "forum-rephrased.json.gz" cmd = 'gsutil cp tmp.json.gz %s' % (gsfn) os.system(cmd) os.system(cmd) table = 'forum' bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True) msg = "Original data from %s" % (lfp / fn) bqutil.add_description_to_table(dataset, table, msg, append=True) os.system('mv tmp.json.gz "%s"' % (ofn)) print "...done (%s)" % datetime.datetime.now() sys.stdout.flush()
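# Illustrative sketch: the rephrased forum dump written above (tmp.json.gz, later
# moved to forum-rephrased.json.gz) is gzipped JSON lines, one forum document per
# line, so it can be validated by streaming without loading it whole. A minimal
# check, assuming only that each line is a standalone JSON object:
def _example_validate_forum_jsonl(fn="forum-rephrased.json.gz"):
    import gzip, json
    cnt = 0
    for line in gzip.GzipFile(fn):
        json.loads(line)  # raises ValueError on a malformed row
        cnt += 1
    return cnt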
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False): basedir = path(basedir or '') course_dir = course_id.replace('/', '__') lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) cdir = lfp print "Processing %s from files in %s" % (course_id, cdir) sys.stdout.flush() mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_user_info_combo.json' % mypath the_dict_schema = schema2dict( json.loads(open(SCHEMA_FILE).read())['user_info_combo']) uic = defaultdict( dict ) # dict with key = user_id, and val = dict to be written out as JSON line def copy_elements(src, dest, fields, prefix="", skip_empty=False): for key in fields: if skip_empty and (not key in src): src[key] = None if src[key] == 'NULL': continue if key == 'course_id' and src[key].startswith('course-v1:'): # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015 src[key] = src[key].split(':', 1)[1].replace('+', '/') dest[prefix + key] = src[key] def openfile(fn_in, mode='r', add_dir=True): if add_dir: fn = cdir / fn_in else: fn = fn_in if (not os.path.exists(fn)) and (not fn.endswith('.gz')): fn += ".gz" if mode == 'r' and not os.path.exists(fn): newfn = convert_sql( fn) # try converting from *.sql file, if that exists if not newfn: return None # failure, no file found, return None fn = newfn if fn.endswith('.gz'): return gzip.GzipFile(fn, mode) return open(fn, mode) def tsv2csv(fn_in, fn_out): import csv fp = openfile(fn_out, 'w', add_dir=False) csvfp = csv.writer(fp) for line in openfile(fn_in, add_dir=False): csvfp.writerow(line[:-1].split('\t')) fp.close() def convert_sql(fnroot): ''' Returns filename if suitable file exists or was created by conversion of tab separated values to comma separated values. Returns False otherwise. 
''' if fnroot.endswith('.gz'): fnroot = fnroot[:-3] if fnroot.endswith('.csv'): fnroot = fnroot[:-4] if os.path.exists(fnroot + ".csv"): return fnroot + ".csv" if os.path.exists(fnroot + ".csv.gz"): return fnroot + ".csv.gz" if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"): infn = fnroot + '.sql' outfn = fnroot + '.csv.gz' print "--> Converting %s to %s" % (infn, outfn) tsv2csv(infn, outfn) return outfn return False nusers = 0 fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined'] for line in csv.DictReader(openfile('users.csv')): uid = int(line['id']) copy_elements(line, uic[uid], fields) uic[uid]['user_id'] = uid nusers += 1 uic[uid]['y1_anomalous'] = None uic[uid]['edxinstructordash_Grade'] = None uic[uid]['edxinstructordash_Grade_timestamp'] = None print " %d users loaded from users.csv" % nusers fp = openfile('profiles.csv') if fp is None: print "--> Skipping profiles.csv, file does not exist" else: nprofiles = 0 fields = [ 'name', 'language', 'location', 'meta', 'courseware', 'gender', 'mailing_address', 'year_of_birth', 'level_of_education', 'goals', 'allow_certificate', 'country', 'city' ] for line in csv.DictReader(fp): uid = int(line['user_id']) copy_elements(line, uic[uid], fields, prefix="profile_") nprofiles += 1 print " %d profiles loaded from profiles.csv" % nprofiles fp = openfile('enrollment.csv') if fp is None: print "--> Skipping enrollment.csv, file does not exist" else: nenrollments = 0 fields = [ 'course_id', 'created', 'is_active', 'mode', ] for line in csv.DictReader(fp): uid = int(line['user_id']) copy_elements(line, uic[uid], fields, prefix="enrollment_") nenrollments += 1 print " %d enrollments loaded from enrollment.csv" % nenrollments # see if from_mongodb files are present for this course; if so, merge in that data mongodir = cdir.dirname() / 'from_mongodb' if mongodir.exists(): print "--> %s exists, merging in users, profile, and enrollment data from mongodb" % mongodir sys.stdout.flush() fp = gzip.GzipFile(mongodir / "users.json.gz") fields = ['username', 'email', 'is_staff', 'last_login', 'date_joined'] nadded = 0 for line in fp: pdata = json.loads(line) uid = int(pdata['_id']) if not uid in uic: copy_elements(pdata, uic[uid], fields, skip_empty=True) uic[uid]['user_id'] = uid nadded += 1 fp.close() print " %d additional users loaded from %s/users.json.gz" % (nadded, mongodir) fp = gzip.GzipFile(mongodir / "profiles.json.gz") fields = [ 'name', 'language', 'location', 'meta', 'courseware', 'gender', 'mailing_address', 'year_of_birth', 'level_of_education', 'goals', 'allow_certificate', 'country', 'city' ] nadd_profiles = 0 def fix_unicode(elem, fields): for k in fields: if (k in elem) and elem[k]: elem[k] = elem[k].encode('utf8') for line in fp: pdata = json.loads(line.decode('utf8')) uid = int(pdata['user_id']) if not uic[uid].get('profile_name', None): copy_elements(pdata, uic[uid], fields, prefix="profile_", skip_empty=True) fix_unicode(uic[uid], [ 'profile_name', 'profile_mailing_address', 'profile_goals', 'profile_location', 'profile_language' ]) uic[uid]['y1_anomalous'] = 1 nadd_profiles += 1 fp.close() print " %d additional profiles loaded from %s/profiles.json.gz" % ( nadd_profiles, mongodir) # if datedir is specified, then do not add entries from mongodb where the enrollment happened after the datedir cutoff cutoff = None if datedir: cutoff = "%s 00:00:00" % datedir fp = gzip.GzipFile(mongodir / "enrollment.json.gz") fields = [ 'course_id', 'created', 'is_active', 'mode', ] nadd_enrollment = 0
n_removed_after_cutoff = 0 for line in fp: pdata = json.loads(line.decode('utf8')) uid = int(pdata['user_id']) if not uic[uid].get('enrollment_course_id', None): if cutoff and (pdata['created'] > cutoff) and ( uic[uid].get('y1_anomalous') == 1): # remove if enrolled after datedir cutoff uic.pop(uid) n_removed_after_cutoff += 1 else: copy_elements(pdata, uic[uid], fields, prefix="enrollment_", skip_empty=True) nadd_enrollment += 1 fp.close() print " %d additional enrollments loaded from %s/enrollment.json.gz" % ( nadd_enrollment, mongodir) print " from mongodb files, added %s (of %s) new users (%s profiles, %s enrollments, %s after cutoff %s)" % ( nadded - n_removed_after_cutoff, nadded, nadd_profiles, nadd_enrollment, n_removed_after_cutoff, cutoff) sys.stdout.flush() # See if instructor grade reports are present for this course; if so, merge in that data edxinstructordash = cdir.dirname() / 'from_edxinstructordash' if edxinstructordash.exists(): print "--> %s exists, merging in users, profile, and enrollment data from_edxinstructordash" % edxinstructordash sys.stdout.flush() grade_report_fn = (edxinstructordash / 'grade_report.csv') fp = openfile(grade_report_fn, add_dir=False) if fp is None: print "--> Skipping grade_report.csv, file does not exist in dir from_edxinstructordash" else: nadded = 0 for line in csv.DictReader(fp): uid = int(line['Student ID']) fields = ['Grade', 'Grade_timestamp'] #['course_id','Student ID','Email','Username','Grade' ] #'Enrollment Track',' Verification Status','Certificate Eligible','Certificate Delivered','Certificate Type' ] copy_elements(line, uic[uid], fields, prefix="edxinstructordash_") nadded += 1 fp.close() print " %d grades loaded from %s/grade_report.csv" % ( nadded, edxinstructordash) sys.stdout.flush() fp = openfile('certificates.csv') if fp is None: print "--> Skipping certificates.csv, file does not exist" else: for line in csv.DictReader(fp): uid = int(line['user_id']) fields = [ 'download_url', 'grade', 'course_id', 'key', 'distinction', 'status', 'verify_uuid', 'download_uuid', 'name', 'created_date', 'modified_date', 'error_reason', 'mode', ] copy_elements(line, uic[uid], fields, prefix="certificate_") if 'user_id' not in uic[uid]: uic[uid]['user_id'] = uid # sanity check for entries with user_id but missing username nmissing_uname = 0 for uid, entry in uic.iteritems(): if (not 'username' in entry) or (not entry['username']): nmissing_uname += 1 if nmissing_uname < 10: print "missing username: %s" % entry print "--> %d entries missing username" % nmissing_uname sys.stdout.flush() # sanity check for entries missing course_id nmissing_cid = 0 for uid, entry in uic.iteritems(): if (not 'enrollment_course_id' in entry) or (not entry['enrollment_course_id']): nmissing_cid += 1 entry['enrollment_course_id'] = course_id print "--> %d entries missing enrollment_course_id (all fixed by setting to %s)" % ( nmissing_cid, course_id) sys.stdout.flush() fp = openfile('user_id_map.csv') if fp is None: print "--> Skipping user_id_map.csv, file does not exist" else: for line in csv.DictReader(fp): uid = int(line['id']) fields = ['hash_id'] copy_elements(line, uic[uid], fields, prefix="id_map_") # sort by userid uidset = uic.keys() uidset.sort() # write out result, checking schema along the way fieldnames = the_dict_schema.keys() ofp = openfile('user_info_combo.json.gz', 'w') ocsv = csv.DictWriter(openfile('user_info_combo.csv.gz', 'w'), fieldnames=fieldnames) ocsv.writeheader() for uid in uidset: data = uic[uid] check_schema(uid, data,
the_ds=the_dict_schema, coerce=True) if ('enrollment_course_id' not in data) and ('certificate_course_id' not in data): print "Oops! missing course_id in user_info_combo line: inconsistent SQL?" print "data = %s" % data print "Suppressing this row" continue row_course_id = data.get('enrollment_course_id', data.get('certificate_course_id', '')) if not row_course_id == course_id: print "Oops! course_id=%s in user_info_combo line: inconsistent with expected=%s" % ( row_course_id, course_id) print "data = %s" % data print "Suppressing this row" continue ofp.write(json.dumps(data) + '\n') try: ocsv.writerow(data) except Exception as err: print "failed to write data=%s" % data raise print "Done with make_user_info_combo for %s" % course_id sys.stdout.flush()
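# Illustrative sketch: user_info_combo.json.gz written above holds one JSON record
# per user, each carrying a user_id field, so it can be re-indexed in a single pass.
# A minimal reader, assuming only the gzipped JSON-lines layout produced above:
def _example_index_user_info_combo(fn="user_info_combo.json.gz"):
    import gzip, json
    by_uid = {}
    for line in gzip.GzipFile(fn):
        rec = json.loads(line)
        by_uid[rec['user_id']] = rec
    return by_uid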
def process_file(course_id, basedir=None, datedir=None, use_dataset_latest=False): basedir = path(basedir or '') course_dir = course_id.replace('/','__') lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) cdir = lfp print "Processing %s role data in %s" % (course_id, cdir) sys.stdout.flush() mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_roles.json' % mypath the_dict_schema = schema2dict(json.loads(open(SCHEMA_FILE).read())['staff']) role_data = defaultdict(dict) # dict with key = user_id, and val = dict to be written out as JSON line def convertLongToWide( longData, unique_id, column_header, values, columns ): import numpy as np print " Currently %d roles assigned... Dropping duplicates" % len(longData) sys.stdout.flush() ld2 = longData.drop_duplicates(subset=[unique_id, column_header])#, keep='last') print " %d unique roles assigned" % len(ld2) print " Processing role data with these known roles: %s" % columns sys.stdout.flush() wideData = ld2.pivot( index=unique_id, columns=column_header, values=values ) wideData = pd.DataFrame( wideData , index=wideData.index ).reset_index() # Ensuring all specified columns exist cols = wideData.columns.tolist() for c in columns: if c not in cols: wideData[ c ] = pd.Series([np.nan]) return wideData def createUniqueId( longData ): import numpy as np if pd.notnull(longData['user_id']) and pd.notnull(longData['course_id']): uid = str(longData['user_id']) + str('__') + str(longData['course_id']) else: uid = 'NULL' return uid def splitUniqueId( wideData ): import numpy as np wideData['user_id'] = pd.Series([np.nan]) wideData['course_id'] = pd.Series([np.nan]) wideData[['user_id', 'course_id']] = wideData['uid'].apply( lambda x: pd.Series(x.split('__') ) ) wideData['course_id'] = wideData['course_id'].apply( forceTransparentCourseId ) return wideData def forceTransparentCourseId( course_id ): #if key=='course_id' and src[key].startswith('course-v1:'): if 'course-v1' in course_id: # special handling for mangled "opaque keys" version of course_id, e.g. course-v1:MITx+6.00.2x_3+1T2015 #src[key] = src[key].split(':',1)[1].replace('+','/') return course_id.split(':',1)[1].replace('+','/') else: return course_id def openfile(fn_in, mode='r', add_dir=True, pandas_df=False): if pandas_df: return pd.read_csv( gzip.GzipFile( cdir / fn_in ), sep=',' ) else: if add_dir: fn = cdir / fn_in else: fn = fn_in if fn.endswith('.gz'): return gzip.GzipFile(fn, mode) return open(fn, mode) def createRoleVar( wideData, all_roles ): if wideData[ all_roles ].notnull().values.any(): return str("Staff") else: return str("Student") def cleanRoles( longData, unique_id, column_header, values, columns, allfields ): longData[ values ] = int(1) longData['uid'] = longData.apply(createUniqueId, axis=1 ) wideData = convertLongToWide( longData, unique_id=unique_id, column_header=column_header, values=values, columns=columns ) wideData = splitUniqueId( wideData )[ allfields ] return wideData def copy_elements(src, dest, fields, prefix="", skip_empty=False): for key in fields: if skip_empty and (not key in src): src[key] = None if src[key]=='NULL': continue if key=='course_id' and src[key].startswith('course-v1:'): # special handling for mangled "opaque keys" version of course_id, e.g.
course-v1:MITx+6.00.2x_3+1T2015 src[key] = src[key].split(':',1)[1].replace('+','/') # Ensure integer for floats, or null if pd.isnull(src[key]): copyKey = None elif type(src[key]) == float: copyKey = int(float(src[key])) else: copyKey = src[key] dest[prefix + key] = copyKey #roledata = pd.read_csv( cdir / ROLE_COURSE_ACCESS, sep=',') #rolediscdata = pd.read_csv( cdir / ROLE_FORUM_ACCESS, sep=',') roledata = openfile( fn_in=ROLE_COURSE_ACCESS, mode='r', pandas_df=True) rolediscdata = openfile( fn_in=ROLE_FORUM_ACCESS, mode='r', pandas_df=True) # Process Course Access Roles known_roles_course = OrderedDict([ ('beta_testers', 'roles_isBetaTester'), ('instructor', 'roles_isInstructor'), ('staff', 'roles_isStaff'), ('ccx_coach', 'roles_isCCX'), ('finance_admin', 'roles_isFinance'), ('library_user', 'roles_isLibrary'), ('sales_admin', 'roles_isSales') ]) base_fields = ['user_id', 'course_id'] fields = base_fields + known_roles_course.keys() print " Cleaning %s" % ROLE_COURSE_ACCESS wide_roledata = cleanRoles( roledata, 'uid', 'role', 'value', known_roles_course.keys(), fields ) # Process Forum Discussion Roles known_roles_disc = OrderedDict([ ('Administrator','forumRoles_isAdmin'), ('CommunityTA','forumRoles_isCommunityTA'), ('Moderator', 'forumRoles_isModerator'), ('Student', 'forumRoles_isStudent') ]) base_fields = ['user_id', 'course_id'] extra_fields = ['roles'] # To be used to create custom mapping based on existing course and discussion forum roles fields = base_fields + known_roles_disc.keys() print " Cleaning %s" % ROLE_FORUM_ACCESS wide_rolediscdata = cleanRoles( rolediscdata, 'uid', 'name', 'value', known_roles_disc.keys(), fields ) # Compile fields_all = base_fields + known_roles_course.keys() + known_roles_disc.keys() + extra_fields rename_dict = dict(known_roles_course, **known_roles_disc) wideData = pd.merge( wide_roledata, wide_rolediscdata, how='outer', on=["user_id", "course_id"], suffixes=['', '_disc'] ) # Create Roles var all_roles = known_roles_disc.keys() + known_roles_course.keys() all_roles.remove('Student') wideData['roles'] = wideData.apply( createRoleVar, args=[all_roles], axis=1 ) # Rename columns wideData = wideData[ fields_all ] wideData.rename( columns=rename_dict, inplace=True ) finalFields = wideData.columns.tolist() # Write out fieldnames = the_dict_schema.keys() ofp = openfile('roles.json.gz', 'w') ocsv = csv.DictWriter(openfile('roles.csv', 'w'), fieldnames=finalFields) ocsv.writeheader() wideData_dict = wideData.to_dict(orient='records') for line in wideData_dict: uid = int(line['user_id']) copy_elements(line, role_data[uid], finalFields, skip_empty=False ) role_data[uid]['user_id'] = uid data = role_data[uid] ofp.write(json.dumps(data) + '\n') try: ocsv.writerow(data) except Exception as err: print "failed to write data=%s" % data raise print "Done with make_roles for %s" % course_id sys.stdout.flush()
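# Illustrative sketch of the long-to-wide pivot performed by convertLongToWide
# above: one row per (uid, role) pair becomes one column per role, with NaN where
# a role is absent. The sample values below are made up.
def _example_long_to_wide_pivot():
    import pandas as pd
    long_df = pd.DataFrame({'uid': ['1__A/B/x', '1__A/B/x', '2__A/B/x'],
                            'role': ['staff', 'instructor', 'staff'],
                            'value': [1, 1, 1]})
    wide = long_df.pivot(index='uid', columns='role', values='value')
    return wide.reset_index()  # columns: uid, instructor, staff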
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest, use_latest_sql_dir): ''' Create Video stats for Videos Viewed and Videos Watched. First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched ''' assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable" # Get Course Dir path basedir = path(basedir or '') course_dir = course_id.replace('/','__') lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest or use_latest_sql_dir) # get schema mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/%s' % ( mypath, SCHEMA_VIDEO_AXIS ) the_schema = json.loads(open(SCHEMA_FILE).read())[ SCHEMA_VIDEO_AXIS_NAME ] the_dict_schema = schema2dict(the_schema) # Create initial video axis videoAxisExists = False dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) va_date = None try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS ) assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS ) videoAxisExists = True va_date = tinfo['lastModifiedTime'] # datetime except (AssertionError, Exception) as err: print "%s --> Attempting to process %s table" % ( str(err), TABLE_VIDEO_AXIS ) sys.stdout.flush() # get course axis time ca_date = None try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS ) ca_date = tinfo['lastModifiedTime'] # datetime except (AssertionError, Exception) as err: pass if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date): force_recompute = True print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date) sys.stdout.flush() if not videoAxisExists or force_recompute: force_recompute = True createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest) # Get video lengths va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS) assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table" va_bqdata = va['data'] fileoutput = lfp / FILENAME_VIDEO_AXIS getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute ) # upload and import video axis gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS gsutil.upload_file_to_gs(fileoutput, gsfn) table = TABLE_VIDEO_AXIS bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True) else: print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS ) # Lastly, create video stats createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest ) createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest ) # also create person_course_video_watched createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
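# Illustrative sketch of the recompute rule applied above: the derived table
# (video_axis) is considered stale whenever its source (course_axis) has a newer
# lastModifiedTime, or when the derived table does not exist at all. The argument
# names here are hypothetical stand-ins for the two lastModifiedTime datetimes.
def _example_needs_recompute(source_date, derived_date):
    if derived_date is None:
        return True  # derived table missing: always compute
    return (source_date is not None) and (source_date > derived_date)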
def mongo_dump_user_info_files(course_id, basedir=None, datedir=None, dbname=None, use_dataset_latest=False): ''' In early SQL tables from edX, enrollment records were deleted when a learner un-enrolled. This would then cause that learner's records to no longer appear in the SQL data dump to researchers. This further caused problems because there would be learners who received certificates (and appeared in the certificates tables) but weren't in the auth_user or auth_userprofile or enrollment tables. One way around this was to incrementally load all the user, userprofile, etc. tables from every weekly dump, so that if a learner ever appeared as being registered in a course, the learner would stay that way. This workaround didn't completely solve the problem, however, for early courses, and thus additional data had to be manually requested from edX. These issues were all resolved for courses after ~Spring 2014, when un-enrollment was changed such that it did not cause deletion of the enrollment record, but rather, just a change of the "active" flag within the enrollment record. Here, to work around the problem, for a given course_id, we generate users.csv, profiles.csv, enrollment.csv, certificates.csv, from collections stored in mongodb, for a specified course. These mongodb collections were curated and produced for the Hx and MITx Year 1 reports. Uses mongoexport. ''' basedir = path(basedir or '') lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) mongodir = lfp.dirname() / 'from_mongodb' print "[mongo_dump_user_info_files] processing %s, output directory = %s" % ( course_id, mongodir) if not mongodir.exists(): os.mkdir(mongodir) def do_mongo_export(collection, ofn, ckey='course_id'): query = '{"%s": "%s"}' % (ckey, course_id) cmd = "mongoexport -d %s -c %s -q '%s' | gzip -9 > %s" % ( dbname, collection, query, ofn) print "--> %s" % cmd sys.stdout.flush() os.system(cmd) # make users with javascript join js = """conn = new Mongo(); db = conn.getDB('%s'); var cursor = db.student_courseenrollment.find({'course_id': '%s'}); while (cursor.hasNext()) { var doc = cursor.next(); udoc = db.auth_user.find({_id: doc.user_id})[0]; print(JSON.stringify(udoc)); } var course = '%s'; var cursor = db.certificates_generatedcertificate.find({'course_id': course}); while (cursor.hasNext()) { var doc = cursor.next(); usc = db.student_courseenrollment.find({'course_id': course, 'user_id': doc.user_id }); if (usc.length()==0){ udoc = db.auth_user.find({_id: doc.user_id})[0]; db.auth_userprofile.update({'user_id' : doc.user_id}, {\$addToSet : {courses: course }}); print(JSON.stringify(udoc)); } } """ % (dbname, course_id, course_id) ofn = mongodir / "users.json.gz" cmd = 'echo "%s" | mongo --quiet | tail -n +3 | gzip -9 > %s' % ( js.replace('\n', ''), ofn) print "--> %s" % cmd sys.stdout.flush() os.system(cmd) # profiles and enrollment and certificates do_mongo_export("auth_userprofile", mongodir / "profiles.json.gz", 'courses') do_mongo_export("student_courseenrollment", mongodir / "enrollment.json.gz") do_mongo_export("certificates_generatedcertificate", mongodir / "certificates.json.gz")
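# Illustrative example of the mongoexport invocation built by do_mongo_export
# above, shown with hypothetical database and course values to make the query
# quoting explicit:
#
#   mongoexport -d harvardxdb -c student_courseenrollment \
#       -q '{"course_id": "HarvardX/PH207x/2012_Fall"}' | gzip -9 > enrollment.json.gz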
def fix_missing_grades( course_id, getGrades=None, download_only=None, course_id_type='opaque', # Assume newer courses with format 'course-v1:HarvardX+course+term' org_list=None, # org list is passed in order to properly grab data from edX instructor dashboard basedir=None, datedir=None, use_dataset_latest=False): lfp = find_course_sql_dir(course_id=course_id, basedir=basedir)# , org_list=org_list) print lfp gradereport_dir = lfp + '/from_edxinstructordash' print gradereport_dir if not gradereport_dir.exists(): os.mkdir(gradereport_dir) # Download grade report for single course def do_grade_report_export( ofn_dir, ofn, course_id): # Properly convert to opaque, if needed # Use Case: course_list contains transparent course ids, # but need to pass opaque course id for edX instructor dashboard # course_id_type can be set using --course-id-type opaque, for example, on the command line # Important to properly pass the original course id, as it exists in the edX platform course_id = parseCourseIdField( course=course_id, org_list=org_list, course_id_type=course_id_type ) print 'Processing course %s' % course_id sys.stdout.flush() try: # Make Grade Report Request getGrades.make_grade_report_request( course_id ) # Check current set of grade reports grade_reports_dict = getGrades.get_grade_reports( course_id, ofn_dir ) if download_only: print 'Download grade report only specified' sys.stdout.flush() getGrades.get_latest_grade_report( grade_reports_dict, ofn, ofn_dir ) else: # Keep checking until the Grade Report is Ready grade_report_ready = False while not grade_report_ready: current_time = datetime.datetime.now() print "Checking grade report status at %s" % current_time sys.stdout.flush() time.sleep( TIME_TO_WAIT ) if grade_report_download_ready( course_id ): grade_reports_dict = getGrades.get_grade_reports( course_id, ofn_dir ) getGrades.get_latest_grade_report( grade_reports_dict, ofn, ofn_dir ) grade_report_ready = True except Exception as err: print str(err) grade_report_downloaded = False # Try one last time try: # If failure, then just try grabbing the latest grade report, just in case grade_reports_dict = getGrades.get_grade_reports( course_id, ofn_dir ) getGrades.get_latest_grade_report( grade_reports_dict, ofn, ofn_dir ) grade_report_downloaded = True except Exception as err: print 'Failure on second attempt %s' % str(err) sys.stdout.flush() raise if grade_report_downloaded: print 'Success on second attempt' sys.stdout.flush() def grade_report_download_ready( course_id ): getGrades.set_course_id( course_id) if getGrades.list_instructor_tasks()['tasks']: return False else: return True # Do single Course do_grade_report_export( gradereport_dir, "grade_report.csv", course_id)
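# Illustrative sketch of the wait-and-poll loop used above: sleep a fixed interval,
# then re-check until the instructor-dashboard report is ready. check_ready is a
# hypothetical callable standing in for grade_report_download_ready, and the bound
# on attempts is an assumption (the loop above polls indefinitely).
def _example_poll_until_ready(check_ready, wait_seconds=60, max_tries=30):
    import time
    for _ in range(max_tries):
        if check_ready():
            return True
        time.sleep(wait_seconds)
    return False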
def extractResearchData(self, course_id, tablename, the_dataset=None, rdp=None, rdp_format='csv', output_bucket=None, basedir='', datedir='', do_gzip=True): ''' Get research data output into tables and archive onto server ''' # Archive location if course_id is not None: # Individual Course Research Data Products self.gsp = gsutil.gs_path_from_course_id(course_id=course_id, gsbucket=output_bucket, use_dataset_latest=True) gsfilename = "%s/%s" % (self.gsp, RESEARCH_DATA_PRODUCTS[rdp]) else: print "ERROR! Must specify course_id's. Aborting." return try: # Copy to Google Storage msg = "[researchData]: Copying Research Data table %s to %s" % ( tablename, gsfilename) print msg #gsfilename = "%s/%s-*.csv.gz" % ( self.gsp, tablename ) # temp gsfilename = "%s/%s.csv.gz" % (self.gsp, tablename) # temp ret = bqutil.extract_table_to_gs(the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True) msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link( gsfilename) print msg sys.stdout.flush() except Exception as err: print str(err) if ('BQ Error creating table' in str(err)): msg = "[researchData]: Retrying... by sharding." print msg sys.stdout.flush() gsfilename = "%s/%s-*.csv.gz" % (self.gsp, tablename) print gsfilename sys.stdout.flush() ret = bqutil.extract_table_to_gs(the_dataset, tablename, gsfilename, format=rdp_format, do_gzip=True, wait=True) msg = "[researchData]: CSV download link: %s" % gsutil.gs_download_link( gsfilename) print msg sys.stdout.flush() # Copy from Google Storage to Secure Data Warehouse for archiving archiveLocation = find_course_sql_dir(course_id=course_id, basedir=basedir, datedir=datedir, use_dataset_latest=True) #time.sleep( CFG.TIME_TO_WAIT_30s ) # delay needed to allow for GS to upload file fully (This should be size dependent, and may not be enough time) msg = "[researchData]: Archiving Research Data table %s from %s to %s" % ( tablename, gsfilename, archiveLocation) print msg sys.stdout.flush() gsutil.upload_file_to_gs(src=gsfilename, dst=archiveLocation, verbose=True) pass
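# Illustrative sketch of the fallback used above: a single-file extract of a large
# BigQuery table can fail, in which case a wildcard ("-*") filename lets BigQuery
# shard the export across multiple files. extract is a hypothetical callable
# standing in for bqutil.extract_table_to_gs.
def _example_extract_with_shard_fallback(extract, single_fn, sharded_fn):
    try:
        return extract(single_fn)
    except Exception as err:
        if 'BQ Error creating table' in str(err):
            return extract(sharded_fn)  # retry, letting BigQuery shard the output
        raise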