def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage, create the BigQuery
    table, then insert the data into the table.

    If the csv.gz file is not present in the course SQL directory, a message
    is printed and nothing is uploaded.  Otherwise the file is first passed
    through cleanup_rows_from_grade_persistent (with
    field_to_fix="first_attempted" for the subsection file).

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(
        cid,
        basedir=basedir,
        datedir=datedir,
        use_dataset_latest=use_dataset_latest,
    )
    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    # The schema file is keyed by table name; read it through a context
    # manager so the handle is closed deterministically.
    with open('%s/schemas/schema_%s.json' % (mypath, table)) as schema_fp:
        the_schema = json.load(schema_fp)[table]

    if not os.path.exists(csvfn):
        print("[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn)
        return

    if subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    # skiprows=1: the first row of the csv is not loaded as data
    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """Upload grades_persistent csv.gz to Google Storage, create the BigQuery
    table, then insert the data into the table

    The local file is expected at <basedir>/<cid with "/" -> "__">/<datedir>/.
    Only the course-grade file (subsection=False) is passed through
    remove_nulls_from_grade_persistent before upload.

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    # Both files live in the same per-course, per-date directory.
    local_dir = '%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir)
    csvfn = '%s/%s' % (local_dir, csv_name)
    tempfn = '%s/%s' % (local_dir, temp_name)

    mypath = os.path.dirname(os.path.realpath(__file__))
    # Context manager closes the schema file instead of leaking the handle.
    with open('%s/schemas/schema_%s.json' % (mypath, table)) as schema_fp:
        the_schema = json.load(schema_fp)[table]

    if not subsection:
        remove_nulls_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
def do_course_listings(course_listings_fn):
    '''
    Upload the course listings file to Google Storage and load it into the
    BigQuery table courses.listings, waiting for the load to finish.

    course_listings_fn = file to upload; it is loaded as csv with the first
                         row skipped, using the 'course_listings' schema from
                         schemas/schema_course_listings.json
    '''
    dataset = 'courses'
    table = 'listings'
    bqutil.create_dataset_if_nonexistent(dataset)

    mypath = os.path.dirname(os.path.realpath(__file__))

    gsfn = gsutil.gs_path_from_course_id('courses') / 'listings.csv'
    gsutil.upload_file_to_gs(course_listings_fn, gsfn)

    # Context manager closes the schema file instead of leaking the handle.
    with open('%s/schemas/schema_course_listings.json' % mypath) as schema_fp:
        schema = json.load(schema_fp)['course_listings']

    bqutil.load_data_to_table(dataset, table, gsfn, schema, wait=True, format='csv', skiprows=1)
def write_geoip_table(self):
    '''
    Write out the geoipdat table if nchanged > 0

    Every value of self.geoipdat is written as one JSON line to a temporary
    file, which is then renamed to self.gipfn while holding lock_file on
    self.gipfn.  The file is uploaded to Google Storage and loaded into
    BigQuery table self.gipdataset.self.giptable.  Rename and BigQuery
    failures are printed and do not raise; a record which cannot be
    serialized is reported and the exception re-raised.
    '''
    if not self.nchanged:
        return

    ofn = 'tmp_geoip_%08d.json' % random.uniform(0, 100000000)
    print("--> new entries added to geoipdat, writing to %s" % (ofn))
    sys.stdout.flush()

    # The context manager closes the temp file even when a record fails to
    # serialize and the exception propagates.
    with codecs.open(ofn, 'w', encoding='utf8') as ofp:
        for _key, val in self.geoipdat.iteritems():
            try:
                ofp.write(json.dumps(val) + '\n')
            except Exception as err:
                print("Error! %s" % err)
                sys.stdout.write(repr(val))
                raise

    lock_file(self.gipfn)
    try:
        print("--> renaming %s to %s" % (ofn, self.gipfn))
        sys.stdout.flush()
        os.rename(ofn, self.gipfn)
    except Exception as err:
        print("Error %s in renaming gipfn" % str(err))
    finally:
        # Release the lock on every exit path, not only the ones Exception covers.
        lock_file(self.gipfn, release=True)

    mypath = os.path.dirname(os.path.realpath(__file__))
    with open('%s/schemas/schema_extra_geoip.json' % mypath) as schema_fp:
        the_schema = json.load(schema_fp)['extra_geoip']

    gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
    print("--> Uploading %s to %s" % (self.gipfn, gsp))
    sys.stdout.flush()
    gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')

    print("--> Importing %s to %s" % (gsp, self.giptable))
    sys.stdout.flush()
    try:
        bqutil.create_dataset_if_nonexistent(self.gipdataset)
    except Exception as err:
        # Report the dataset that failed to be created (not the storage path).
        print("--> Warning: failed to create %s, err=%s" % (self.gipdataset, err))
    try:
        bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema)
    except Exception as err:
        print("---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (
            gsp, self.gipdataset, self.giptable, err))
        print("---> Continuing anyway")
        sys.stdout.flush()
def write_geoip_table(self):
    '''
    Write out the geoipdat table if nchanged > 0

    Steps, when there are changes:
      1. dump each value of self.geoipdat as a JSON line into a temp file
      2. rename the temp file to self.gipfn (guarded by lock_file)
      3. upload self.gipfn to Google Storage
      4. load it into BigQuery table self.gipdataset.self.giptable

    Errors in steps 2 and 4 are printed and swallowed; a serialization error
    in step 1 is printed and re-raised.
    '''
    if not self.nchanged:
        return

    ofn = 'tmp_geoip_%08d.json' % random.uniform(0, 100000000)
    print("--> new entries added to geoipdat, writing to %s" % (ofn))
    sys.stdout.flush()

    # "with" guarantees the temp file is closed when json.dumps / write fails.
    with codecs.open(ofn, 'w', encoding='utf8') as ofp:
        for _key, val in self.geoipdat.iteritems():
            try:
                ofp.write(json.dumps(val) + '\n')
            except Exception as err:
                print("Error! %s" % err)
                sys.stdout.write(repr(val))
                raise

    lock_file(self.gipfn)
    try:
        print("--> renaming %s to %s" % (ofn, self.gipfn))
        sys.stdout.flush()
        os.rename(ofn, self.gipfn)
    except Exception as err:
        print("Error %s in renaming gipfn" % str(err))
    finally:
        # the lock must be released however the rename attempt ends
        lock_file(self.gipfn, release=True)

    mypath = os.path.dirname(os.path.realpath(__file__))
    with open('%s/schemas/schema_extra_geoip.json' % mypath) as schema_fp:
        the_schema = json.load(schema_fp)['extra_geoip']

    gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn
    print("--> Uploading %s to %s" % (self.gipfn, gsp))
    sys.stdout.flush()
    gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json')

    print("--> Importing %s to %s" % (gsp, self.giptable))
    sys.stdout.flush()
    try:
        bqutil.create_dataset_if_nonexistent(self.gipdataset)
    except Exception as err:
        # name the dataset in the warning; the storage path is not what failed
        print("--> Warning: failed to create %s, err=%s" % (self.gipdataset, err))
    try:
        bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema)
    except Exception as err:
        print("---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (
            gsp, self.gipdataset, self.giptable, err))
        print("---> Continuing anyway")
        sys.stdout.flush()
def load_sql_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir="2014-09-21", do_gs_copy=False, use_dataset_latest=False): ''' Load SQL files into google cloud storage then import into BigQuery. Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "." If use_dataset_latest then "_latest" is appended to the dataset name. Thus, the latest SQL dataset can always be put in a consistently named dataset. ''' print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() # convert studentmodule if necessary fn_sm = lfp / 'studentmodule.csv.gz' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.csv' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.sql.gz' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.sql' if not fn_sm.exists(): print "Error! 
Missing studentmodule.[sql,csv][.gz]" if fn_sm.exists(): # have .sql or .sql.gz version: convert to .csv newfn = lfp / 'studentmodule.csv.gz' print "--> Converting %s to %s" % (fn_sm, newfn) tsv2csv(fn_sm, newfn) fn_sm = newfn if fn_sm.exists(): # rephrase studentmodule if it's using opaque keys fline = '' smfp = openfile(fn_sm) fline = smfp.readline() # skip first line - it's a header fline = smfp.readline() if 'block-v1:' in fline or 'course-v1' in fline: rephrase_studentmodule_opaque_keys(fn_sm) def convert_sql(fnroot): if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot + ".csv.gz"): return if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"): infn = fnroot + '.sql' outfn = fnroot + '.csv.gz' print "--> Converting %s to %s" % (infn, outfn) tsv2csv(infn, outfn) # convert sql files if necesssary fnset = ['users', 'certificates', 'enrollment', "profiles", 'user_id_map', 'rolecourse', 'roleforum'] for fn in fnset: convert_sql(lfp / fn) local_files = glob.glob(lfp / '*') # if using latest date directory, also look for course_image.jpg one level up if use_dataset_latest: print lfp.dirname() ci_files = glob.glob(lfp.dirname() / 'course_image.jpg') if ci_files: local_files += list(ci_files) print "--> local course_image file: %s" % ci_files gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest) local = pytz.timezone ("America/New_York") if do_gs_copy: try: fnset = get_gs_file_list(gsdir) except Exception as err: fnset = [] def copy_if_newer(fn, fnset, options='-z csv,json'): statbuf = os.stat(fn) mt = datetime.datetime.fromtimestamp(statbuf.st_mtime) # do some date checking to upload files which have changed, and are newer than that on google cloud storage local_dt = local.localize(mt, is_dst=None) utc_dt = local_dt.astimezone (pytz.utc) fnb = os.path.basename(fn) if fnb in fnset and fnset[fnb]['date'] > utc_dt: print "...%s already copied, skipping" % fn sys.stdout.flush() return elif fnb 
in fnset: print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt) gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True) for fn in local_files: fnb = os.path.basename(fn) if fnb=='course_image.jpg': copy_if_newer(fn, fnset, options='-a public-read') if not (fnb.endswith('.csv') or fnb.endswith('.json') or fnb.endswith('.csv.gz') or fnb.endswith('.json.gz') or fnb.endswith('.mongo.gz')): print "...unknown file type %s, skipping" % fn sys.stdout.flush() continue copy_if_newer(fn, fnset) # load into bigquery dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) mypath = os.path.dirname(os.path.realpath(__file__)) # load user_info_combo uicfn = lfp / 'user_info_combo.json.gz' if uicfn.exists(): uic_schema = json.loads(open('%s/schemas/schema_user_info_combo.json' % mypath).read())['user_info_combo'] bqutil.load_data_to_table(dataset, 'user_info_combo', gsdir / "user_info_combo.json.gz", uic_schema, wait=False) else: print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn # load studentmodule if fn_sm.exists(): schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read()) cwsm_schema = schemas['courseware_studentmodule'] bqutil.load_data_to_table(dataset, 'studentmodule', gsdir / fn_sm.basename(), cwsm_schema, format='csv', wait=False, skiprows=1) else: print "--> Not loading studentmodule: file %s not found" % fn_sm
def rephrase_forum_json_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir=None, do_gs_copy=False, use_dataset_latest=False, ): print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() fn = 'forum.mongo' gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest) def openfile(fn, mode='r'): if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')): fn += ".gz" if fn.endswith('.gz'): return gzip.GzipFile(lfp / fn, mode) return open(lfp / fn, mode) fp = openfile(fn) ofn = lfp / "forum-rephrased.json.gz" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) if os.path.exists(ofn): tables = bqutil.get_list_of_table_ids(dataset) if not 'forum' in tables: print "Already done? But no forums table loaded into datasaet %s. Redoing." % dataset else: print "Already done %s -> %s (skipping)" % (fn, ofn) sys.stdout.flush() return print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now()) sys.stdout.flush() cnt = 0 ofp = gzip.GzipFile('tmp.json.gz', 'w') for line in fp: cnt += 1 newline = do_rephrase_line(line, linecnt=cnt) ofp.write(newline) ofp.close() print "...done (%s)" % datetime.datetime.now() if cnt==0: print "...but cnt=0 entries found, skipping forum loading" sys.stdout.flush() return print "...copying to gsc" sys.stdout.flush() # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away? 
gsfn = gsdir + '/' + "forum-rephrased.json.gz" cmd = 'gsutil cp tmp.json.gz %s' % (gsfn) os.system(cmd) os.system(cmd) table = 'forum' bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True) msg = "Original data from %s" % (lfp / fn) bqutil.add_description_to_table(dataset, table, msg, append=True) os.system('mv tmp.json.gz "%s"' % (ofn)) print "...done (%s)" % datetime.datetime.now() sys.stdout.flush()
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False, check_dates=True): ''' Load daily tracking logs for course from google storage into BigQuery. If wait=True then waits for loading jobs to be completed. It's desirable to wait if subsequent jobs which need these tables (like person_day) are to be run immediately afterwards. ''' print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() gsroot = gsutil.path_from_course_id(course_id) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA = json.loads( open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log'] gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot) fnset = gsutil.get_gs_file_list(gsdir) dataset = bqutil.course_id2dataset(gsroot, dtype="logs") # create this dataset if necessary bqutil.create_dataset_if_nonexistent(dataset) tables = bqutil.get_list_of_table_ids(dataset) tables = [x for x in tables if x.startswith('track')] if verbose: print "-" * 77 print "current tables loaded:", json.dumps(tables, indent=4) print "files to load: ", json.dumps(fnset.keys(), indent=4) print "-" * 77 sys.stdout.flush() for fn, fninfo in fnset.iteritems(): if int(fninfo['size']) <= 45: print "Zero size file %s, skipping" % fn continue m = re.search('(\d\d\d\d-\d\d-\d\d)', fn) if not m: continue date = m.group(1) tablename = "tracklog_%s" % date.replace( '-', '') # YYYYMMDD for compatibility with table wildcards # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True) file_date = fninfo['date'].replace(tzinfo=None) if tablename in tables: skip = True if check_dates: table_date = bqutil.get_bq_table_last_modified_datetime( dataset, tablename) if not (table_date > file_date): print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % ( tablename, fn, file_date, table_date) skip = False if skip: if verbose: print "Already have table %s, skipping file 
%s" % ( tablename, fn) sys.stdout.flush() continue #if date < '2014-07-27': # continue print "Loading %s into table %s " % (fn, tablename) if verbose: print "start [%s]" % datetime.datetime.now() sys.stdout.flush() gsfn = fninfo['name'] ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000) if verbose: print "-" * 77 print "done with %s [%s]" % (course_id, datetime.datetime.now()) print "=" * 77 sys.stdout.flush()
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False):
    """
    Upload grades_persistent csv.gz to Google Storage, create the BigQuery
    table, then insert the data into the table.

    If the csv.gz file is not present in the course SQL directory, a message
    is printed and nothing is uploaded.  Otherwise the file is first passed
    through cleanup_rows_from_grade_persistent (with
    field_to_fix="first_attempted" for the subsection file).

    :param cid: the course id
    :param basedir: the base directory path
    :param datedir: the date directory name (represented as YYYY-MM-DD)
    :param use_dataset_latest: should the most recent dataset be used?
    :param subsection: should grades_persistentsubsection be uploaded?
    :type cid: str
    :type basedir: str
    :type datedir: str
    :type use_dataset_latest: bool
    :type subsection: bool
    """
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))

    if subsection:
        csv_name = "grades_persistentsubsectiongrade.csv.gz"
        temp_name = "grades_persistentsubsectiongrade_temp.csv.gz"
        table = "grades_persistent_subsection"
    else:
        csv_name = "grades_persistentcoursegrade.csv.gz"
        temp_name = "grades_persistentcoursegrade_temp.csv.gz"
        table = "grades_persistent"

    sdir = load_course_sql.find_course_sql_dir(
        cid,
        basedir=basedir,
        datedir=datedir,
        use_dataset_latest=use_dataset_latest,
    )
    csvfn = sdir / csv_name
    tempfn = sdir / temp_name

    mypath = os.path.dirname(os.path.realpath(__file__))
    # The schema file is keyed by table name; read it through a context
    # manager so the handle is closed deterministically.
    with open('%s/schemas/schema_%s.json' % (mypath, table)) as schema_fp:
        the_schema = json.load(schema_fp)[table]

    if not os.path.exists(csvfn):
        print("[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn)
        return

    if subsection:
        cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted")
    else:
        cleanup_rows_from_grade_persistent(csvfn, tempfn)

    gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True)

    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)  # create dataset if not already existent

    # skiprows=1: the first row of the csv is not loaded as data
    bqutil.load_data_to_table(dataset,
                              table,
                              gsdir / csv_name,
                              the_schema,
                              format="csv",
                              skiprows=1)
def old_process_course(course_id, force_recompute=False): ''' DEPRACATED - instead of creating one table per day, because there is so little total data, create one enrollday_all table (see other function below). make enrollday2_* tables for specified course_id ''' SQL = """ SELECT "{course_id}" as course_id, time, event_struct.user_id as user_id, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "honor") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "honor") then -1 else 0 end) as diff_enrollment_honor, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "verified") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "verified") then -1 else 0 end) as diff_enrollment_verified, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "audit") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "audit") then -1 else 0 end) as diff_enrollment_audit, FROM [{dataset}.{table_id}] where (event_type = "edx.course.enrollment.activated") or (event_type = "edx.course.enrollment.deactivated") order by time; """ course_dir = course_id.replace('/','__') dataset = bqutil.course_id2dataset(course_id) log_dataset = bqutil.course_id2dataset(course_id, dtype="logs") pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday") print "Processing course %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() log_tables = bqutil.get_tables(log_dataset) try: bqutil.create_dataset_if_nonexistent(pcd_dataset) except Exception as err: print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err)) pcday_tables_info = bqutil.get_tables(pcd_dataset) pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])] # print "pcday_tables = ", pcday_tables log_table_list = log_tables['tables'] log_table_list.sort() for table in log_table_list: tr = 
table['tableReference'] table_id = tr['tableId'] if not table_id.startswith('tracklog'): continue date = table_id[9:] table_out = 'enrollday2_%s' % date if (table_out in pcday_tables) and not force_recompute: print "%s...already done, skipping" % table_id sys.stdout.flush() continue if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0: print "...zero size table %s, skipping" % table_id sys.stdout.flush() continue print ("Creating %s " % table_out), the_sql = SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id) sys.stdout.flush() bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False) print "Done with course %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
def load_local_sql_files_to_bigquery(course_id, verbose, base_dir, date_dir):
    """
    Loads the MySQL files from edx-analytics-exporter into Google Big Query.

    Run the waldofy command before attempting to upload MySQL data into Google Big Query.

    The tables to load come from the SQL_SCHEME_FILE_NAMES setting of
    edx2bigquery_config; the process exits if that setting is missing or
    empty.  A table is skipped (with a message) when no scheme can be found
    for it or when its SQL file does not exist.

    Args:
        course_id: course_id string.
        verbose: Whether or not the function logging should be verbose.
        base_dir: passed to find_course_sql_dir to locate the SQL files.
        date_dir: passed to find_course_sql_dir to locate the SQL files.
    """
    STUDENTMODULE_TABLE_NAME = 'studentmodule'
    USER_INFO_COMBO_TABLE_NAME = 'user_info_combo'
    # Key under which each supported table's scheme is stored in its scheme file.
    scheme_keys = {
        STUDENTMODULE_TABLE_NAME: 'courseware_studentmodule',
        USER_INFO_COMBO_TABLE_NAME: 'user_info_combo',
    }

    dataset_name = bqutil.course_id2dataset(course_id)
    sql_scheme_file_names = getattr(edx2bigquery_config, 'SQL_SCHEME_FILE_NAMES', None)
    sql_files_abs_path = os.path.abspath('{}'.format(
        find_course_sql_dir(course_id, base_dir, date_dir)))

    if not sql_scheme_file_names:
        print('Missing SQL_SCHEME_FILE_NAMES setting, unable to load scheme files.')
        exit()

    for table_name, scheme_data in sql_scheme_file_names.items():
        dict_schema = get_schema_from_file(scheme_data.get('scheme_file_name', ''))

        # Resolve the scheme afresh for every table.  A table name that is
        # not supported yields no scheme and is skipped, rather than raising
        # on an unbound variable or reusing the previous table's scheme.
        scheme_key = scheme_keys.get(table_name)
        scheme = dict_schema.get(scheme_key, {}) if scheme_key is not None else None

        if not scheme:
            print('Unable to load Google Big Query scheme for: {}'.format(table_name))
            continue

        bqutil.create_dataset_if_nonexistent(dataset_name)
        file_name = '{}/{}{}'.format(
            sql_files_abs_path,
            table_name,
            DEFAULT_SQL_FILE_EXTENSION,
        )

        if not os.path.isfile(file_name):
            print('File does not exists: {}, unable to load SQL file to Google Big Query.'.format(file_name))
            continue

        if verbose:
            print('Uploading: {} to the table: {}'.format(file_name, table_name))

        bqutil.upload_local_data_to_big_query(
            dataset_id=dataset_name,
            table_id=table_name,
            schema=scheme,
            course_id=course_id,
            file_name=file_name,
            source_format=DEFAULT_CSV_SOURCE_FORMAT_NAME,
        )
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False, check_dates=True): ''' Load daily tracking logs for course from google storage into BigQuery. If wait=True then waits for loading jobs to be completed. It's desirable to wait if subsequent jobs which need these tables (like person_day) are to be run immediately afterwards. ''' print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() gsroot = gsutil.path_from_course_id(course_id) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log'] gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot) fnset = gsutil.get_gs_file_list(gsdir) dataset = bqutil.course_id2dataset(gsroot, dtype="logs") # create this dataset if necessary bqutil.create_dataset_if_nonexistent(dataset) tables = bqutil.get_list_of_table_ids(dataset) tables = [x for x in tables if x.startswith('track')] if verbose: print "-"*77 print "current tables loaded:", json.dumps(tables, indent=4) print "files to load: ", json.dumps(fnset.keys(), indent=4) print "-"*77 sys.stdout.flush() for fn, fninfo in fnset.iteritems(): if int(fninfo['size'])<=45: print "Zero size file %s, skipping" % fn continue m = re.search('(\d\d\d\d-\d\d-\d\d)', fn) if not m: continue date = m.group(1) tablename = "tracklog_%s" % date.replace('-','') # YYYYMMDD for compatibility with table wildcards # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True) file_date = fninfo['date'].replace(tzinfo=None) if tablename in tables: skip = True if check_dates: table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename) if not (table_date > file_date): print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date) skip = False if skip: if verbose: print "Already have table %s, skipping file %s" % 
(tablename, fn) sys.stdout.flush() continue #if date < '2014-07-27': # continue print "Loading %s into table %s " % (fn, tablename) if verbose: print "start [%s]" % datetime.datetime.now() sys.stdout.flush() gsfn = fninfo['name'] ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000) if verbose: print "-" * 77 print "done with %s [%s]" % (course_id, datetime.datetime.now()) print "=" * 77 sys.stdout.flush()
def CreateForumEvents(course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False,
                      end_date=None):
    '''
    Create forum events table, based on tracking logs.  Extracts all forum-related events, including
    forum post reads, into the date-time ordered table.  Repeated calls to this procedure will append
    new events to the table.  If no new events are found, the existing table is left unchanged.

    The query itself is executed by process_tracking_logs.run_query_on_tracking_logs, which
    receives force_recompute, use_dataset_latest, end_date and skip_last_day unchanged.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    # NOTE(review): "logs" is not used below; kept in case the call matters -- confirm and remove.
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_FORUM_EVENTS

    # event_type for forums may be like:
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed
    #  /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline
    #  /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members
    #  /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership
    #      \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"}
    #
    # module_id will be like:
    #  "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a"

    the_sql = """
              SELECT time,
                     username,
                     '{course_id}' as course_id,
                     (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote"  # does this happen?
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)')
                           when event_type = "edx.forum.thread.created" then "created_thread"
                           when event_type = "edx.forum.response.created" then "created_response"
                           when event_type = "edx.forum.comment.created" then "created_comment"
                           when event_type = "edx.forum.searched" then "searched"
                           else event_type end) as forum_action,
                     (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)') # For old-school courses with transparent course ids
                           else (case when module_id is null # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null
                                      then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)') # read
                                                 else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)') # upvote, pinned, upvoted, unvoted, deleted, followed
                                                            else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end) # comment
                                            end)
                                 end)
                      end) as thread_id,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id,
                     event_struct.query as search_query,   # unavailable before June 1, 2015
                     event_struct.GET as event_GET,        # unavailable before June 1, 2015
              FROM {DATASETS}
              WHERE (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                     or event_type contains "/discussion/forum"
                     or event_type contains "/discussion/threads"
                     or event_type contains "/discussion/comments"
                     or event_type contains "list-forum-"
                     or event_type contains "list_forum_"
                     or event_type contains "add-forum-"
                     or event_type contains "add_forum_"
                     or event_type contains "remove-forum-"
                     or event_type contains "remove_forum_"
                     or event_type contains "update_forum_"
                    )
                    AND username is not null
                    AND event is not null
                    and time > TIMESTAMP("{last_date}")
                    {hash_limit}
              order by time
              """

    # Find out whether the table already exists.  This used to rely on an
    # assert, which is removed when python runs with -O; an explicit check
    # prints the same messages in every mode.
    problem = None
    try:
        bqutil.create_dataset_if_nonexistent(dataset)
        if bqutil.get_bq_table_info(dataset, table) is None:
            problem = "[make_forum_analysis] Creating %s.%s table for %s" % (dataset, table, course_id)
    except Exception as err:
        problem = str(err)

    if problem is None:
        print("[make_forum_analysis] Appending latest data to %s.%s table for %s" % (dataset, table, course_id))
        sys.stdout.flush()
    else:
        print(problem)
        sys.stdout.flush()
        print(" --> Missing %s.%s?  Attempting to create..." % (dataset, table))
        sys.stdout.flush()

    print("=== Processing Forum Events for %s (start %s)" % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    def gdf(row):
        '''Return the datetime of a result row; row['time'] is parsed as a float timestamp.'''
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day)

    print("Done with Forum Events for %s (end %s)" % (course_id, datetime.datetime.now()))
    print("=" * 77)
    sys.stdout.flush()
def process_course_time_on_asset(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                 just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                 skip_totals=False, start_date=None, config_parameter_overrides=None):
    '''
    Create the time_on_asset_daily table, containing module_id, username, date,
    and time on asset stats. This table has separate rows for each date,
    and is ordered in time. To update it, a new day's logs are processed, then the
    results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.
    If it already exists, then run a query on it to see what dates have already been done.
    Then do all tracking logs except those which have already been done. Append the
    results to the existing table.

    Compute totals and store in time_on_asset_totals, by summing over all dates,
    grouped by module_id.

    How are time_on_asset numbers computed?

    See discussion in make_time_on_task.py

    The time_on_asset_daily table has these columns:

    - date: gives day for the data
    - username
    - module_id
    - time_umid5: total time on module (by module_id) in seconds, with a 5-minute timeout
    - time_umid30: total time on module (by module_id) in seconds, with a 30-minute timeout

    Parameters:

    - course_id: course id string, substituted into the query as the course_id column
    - force_recompute, use_dataset_latest, end_date, start_date, limit_query_size,
      table_max_size_mb: passed through to process_tracking_logs.run_query_on_tracking_logs
    - just_do_totals: if True, skip the daily table and only (re)compute the totals
    - skip_totals: if True, do not compute the totals after updating the daily table
    - config_parameter_overrides: accepted for interface compatibility; not used here

    Returns whatever process_time_on_asset_totals returns, or None when skip_totals is set.
    '''

    if just_do_totals:
        # Use the time_on_asset totals here, the same routine the normal path
        # calls below (this branch previously called the time_on_task variant).
        return process_time_on_asset_totals(course_id, force_recompute=force_recompute,
                                            use_dataset_latest=use_dataset_latest)

    # The {course_id}, {DATASETS} and {last_date} placeholders are left for the
    # tracking-log query runner to fill in; literal braces are doubled ("}}").
    SQL = """
            SELECT
                "{course_id}" as course_id,
                date(time) as date,
                username,
                module_id,
                # time_umid5 = total time on module (by module_id) in seconds
                # time_mid5 has 5 minute timeout, time_mid30 has 30 min timeout
                SUM( case when dt_umid < 5*60 then dt_umid end ) as time_umid5,
                SUM( case when dt_umid < 30*60 then dt_umid end ) as time_umid30,
            FROM (
                SELECT time,
                    username,
                    module_id,
                    (time - last_time)/1.0E6 as dt,         # dt is in seconds
                    (time - last_time_umid)/1.0E6 as dt_umid,   # dt for (user, module_id) in seconds
                    last_time_umid,
                FROM
                (
                    SELECT time,
                        username,
                        last_username,
                        module_id,
                        USEC_TO_TIMESTAMP(last_time) as last_time,
                        USEC_TO_TIMESTAMP(last_time_umid) as last_time_umid,
                    FROM (
                        SELECT time,
                            username,
                            module_id,
                            lag(time, 1) over (partition by username order by time) last_time,
                            lag(username, 1) over (partition by username order by time) last_username,
                            lag(time, 1) over (partition by username, module_id order by time) last_time_umid,
                        FROM
                            (SELECT time,
                                username,
                                (case when REGEXP_MATCH(module_id, r'.*\"\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\}}$')
                                      when REGEXP_MATCH(module_id, r'.*\"\]\}}\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\]\}}\}}$')
                                      else module_id end) as module_id,    # fix some errors in module_id names
                            FROM {DATASETS}
                            )
                        WHERE
                            module_id is not null
                            AND username is not null
                            AND username != ""
                            and time > TIMESTAMP("{last_date}")
                        )
                    )
                )
            WHERE module_id is not null
                  AND NOT module_id CONTAINS '"'
            GROUP BY date, module_id, username
            ORDER BY date, module_id, username
          """

    table = 'time_on_asset_daily'

    # NOTE(review): this dataset name is computed without use_dataset_latest,
    # while the query runner below does receive use_dataset_latest -- confirm
    # that the dataset created here is the one the runner writes into.
    dataset_name = bqutil.course_id2dataset(course_id)
    bqutil.create_dataset_if_nonexistent(dataset_name)

    def gdf(row):
        # rows of the daily table carry their day as a 'YYYY-MM-DD' string
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id,
                                                     force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015, 3, 15),  # schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_asset_totals(course_id, force_recompute=force_recompute,
                                            use_dataset_latest=use_dataset_latest)

    return
def ExtractProblemEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None): dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) table = TABLE_PROBLEM_EVENTS the_sql = """ SELECT context.user_id as user_id, time, event_source, REGEXP_EXTRACT( (CASE when module_id is not null then module_id when event_type contains "/xblock/i4x:;_" then REPLACE(REGEXP_EXTRACT(event_type, r"i4x:;_;_(.*)/handler/xmodule"),";_", "/") else REPLACE(event_struct.problem, "i4x://", "") end), "[^/]+/problem/([^/]+)") as problem_url, (CASE when event_type contains "/xblock/i4x:;_" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)") when event_type contains "type@problem+block" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)") else event_type end) as event_type, event_struct.attempts as attempts, event_struct.success as success, event_struct.grade as grade, FROM {DATASETS} WHERE ( REGEXP_MATCH(event_type, r'problem_\w+') OR event_type = "showanswer" ) AND context.user_id is not null and time > TIMESTAMP("{last_date}") {hash_limit} order by user_id, time """ try: bqutil.create_dataset_if_nonexistent(dataset) tinfo = bqutil.get_bq_table_info(dataset, table ) assert tinfo is not None, "[make_problem_events] Creating %s.%s table for %s" % (dataset, table, course_id) print "[make_problem_events] Appending latest data to %s.%s table for %s" % (dataset, table, course_id) sys.stdout.flush() except (AssertionError, Exception) as err: print str(err) sys.stdout.flush() print " --> Missing %s.%s? Attempting to create..." 
% ( dataset, table ) sys.stdout.flush() pass print "=== Processing Forum Events for %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() def gdf(row): return datetime.datetime.utcfromtimestamp(float(row['time'])) process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, get_date_function=gdf, has_hash_limit=True, end_date=end_date, skip_last_day=skip_last_day ) print "Done with Problem Events for %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
def load_sql_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir="2014-09-21", do_gs_copy=False, use_dataset_latest=False): ''' Load SQL files into google cloud storage then import into BigQuery. Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "." If use_dataset_latest then "_latest" is appended to the dataset name. Thus, the latest SQL dataset can always be put in a consistently named dataset. ''' print "Loading SQL for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() # convert studentmodule if necessary fn_sm = lfp / 'studentmodule.csv.gz' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.csv' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.sql.gz' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.sql' if not fn_sm.exists(): print "Error! 
Missing studentmodule.[sql,csv][.gz]" if fn_sm.exists(): # have .sql or .sql.gz version: convert to .csv newfn = lfp / 'studentmodule.csv.gz' print "--> Converting %s to %s" % (fn_sm, newfn) tsv2csv(fn_sm, newfn) fn_sm = newfn if fn_sm.exists(): # rephrase studentmodule if it's using opaque keys fline = '' smfp = openfile(fn_sm) fline = smfp.readline() # skip first line - it's a header fline = smfp.readline() if 'block-v1:' in fline or 'course-v1' in fline: rephrase_studentmodule_opaque_keys(fn_sm) def convert_sql(fnroot): if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot + ".csv.gz"): return if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"): infn = fnroot + '.sql' outfn = fnroot + '.csv.gz' print "--> Converting %s to %s" % (infn, outfn) tsv2csv(infn, outfn) # convert sql files if necesssary fnset = [ 'users', 'certificates', 'enrollment', "profiles", 'user_id_map', 'rolecourse', 'roleforum' ] for fn in fnset: convert_sql(lfp / fn) local_files = glob.glob(lfp / '*') # if using latest date directory, also look for course_image.jpg one level up if use_dataset_latest: print lfp.dirname() ci_files = glob.glob(lfp.dirname() / 'course_image.jpg') if ci_files: local_files += list(ci_files) print "--> local course_image file: %s" % ci_files gsdir = gsutil.gs_path_from_course_id( course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest) local = pytz.timezone("America/New_York") if do_gs_copy: try: fnset = get_gs_file_list(gsdir) except Exception as err: fnset = [] def copy_if_newer(fn, fnset, options='-z csv,json'): statbuf = os.stat(fn) mt = datetime.datetime.fromtimestamp(statbuf.st_mtime) # do some date checking to upload files which have changed, and are newer than that on google cloud storage local_dt = local.localize(mt, is_dst=None) utc_dt = local_dt.astimezone(pytz.utc) fnb = os.path.basename(fn) if fnb in fnset and fnset[fnb]['date'] > utc_dt: print "...%s already copied, skipping" % fn sys.stdout.flush() return elif fnb 
in fnset: print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % ( fn, fnset[fnb]['date'], mt) gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True) for fn in local_files: fnb = os.path.basename(fn) if fnb == 'course_image.jpg': copy_if_newer(fn, fnset, options='-a public-read') if not (fnb.endswith('.csv') or fnb.endswith('.json') or fnb.endswith('.csv.gz') or fnb.endswith('.json.gz') or fnb.endswith('.mongo.gz')): print "...unknown file type %s, skipping" % fn sys.stdout.flush() continue copy_if_newer(fn, fnset) # load into bigquery dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) mypath = os.path.dirname(os.path.realpath(__file__)) # load user_info_combo uicfn = lfp / 'user_info_combo.json.gz' if uicfn.exists(): uic_schema = json.loads( open('%s/schemas/schema_user_info_combo.json' % mypath).read())['user_info_combo'] bqutil.load_data_to_table(dataset, 'user_info_combo', gsdir / "user_info_combo.json.gz", uic_schema, wait=False) else: print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn # load studentmodule if fn_sm.exists(): schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read()) cwsm_schema = schemas['courseware_studentmodule'] bqutil.load_data_to_table(dataset, 'studentmodule', gsdir / fn_sm.basename(), cwsm_schema, format='csv', wait=False, skiprows=1) else: print "--> Not loading studentmodule: file %s not found" % fn_sm
def rephrase_forum_json_for_course( course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir=None, do_gs_copy=False, use_dataset_latest=False, ): print "Loading SQL for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() fn = 'forum.mongo' gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest) def openfile(fn, mode='r'): if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')): fn += ".gz" if fn.endswith('.gz'): return gzip.GzipFile(lfp / fn, mode) return open(lfp / fn, mode) fp = openfile(fn) ofn = lfp / "forum-rephrased.json.gz" ofncsv = "forum.csv.gz" # To match table name in BQ ofncsv_lfp = lfp / ofncsv dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) if os.path.exists(ofn) and os.path.exists(ofncsv_lfp): tables = bqutil.get_list_of_table_ids(dataset) if not 'forum' in tables: print "Already done? But no forums table loaded into datasaet %s. Redoing." 
% dataset else: print "Already done %s -> %s (skipping)" % (fn, ofn) print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp) sys.stdout.flush() return print "Processing %s -> writing to %s and %s (%s)" % ( fn, ofn, ofncsv, datetime.datetime.now()) sys.stdout.flush() # Setup CSV header ocsv = csv.DictWriter(openfile(ofncsv, 'w'), fieldnames=SCHEMA_DICT.keys(), quoting=csv.QUOTE_NONNUMERIC) ocsv.writeheader() cnt = 0 ofp = gzip.GzipFile('tmp.json.gz', 'w') data = OrderedDict() for line in fp: cnt += 1 # Write JSON row newline = do_rephrase_line(line, linecnt=cnt) ofp.write(newline) try: #Write CSV row data = json.loads(newline) ocsv.writerow(data) except Exception as err: print "Error writing CSV output row %s=%s" % (cnt, data) raise ofp.close() print "...done (%s)" % datetime.datetime.now() if cnt == 0: print "...but cnt=0 entries found, skipping forum loading" sys.stdout.flush() return print "...copying to gsc" sys.stdout.flush() # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away? gsfn = gsdir + '/' + "forum-rephrased.json.gz" cmd = 'gsutil cp tmp.json.gz %s' % (gsfn) os.system(cmd) os.system(cmd) table = 'forum' bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True) msg = "Original data from %s" % (lfp / fn) bqutil.add_description_to_table(dataset, table, msg, append=True) os.system('mv tmp.json.gz "%s"' % (ofn)) print "...done (%s)" % datetime.datetime.now() sys.stdout.flush()
def __init__(self, course_id_set, output_project_id=None, nskip=0, output_dataset_id=None, output_bucket=None, use_dataset_latest=False, only_step=None, end_date=None, ): ''' Compute course report tables, based on combination of all person_course and other individual course tables. only_step: specify a single course report step to be executed; runs all reports, if None ''' if only_step and ',' in only_step: only_step = only_step.split(',') self.only_step = only_step self.end_date = end_date; if not course_id_set: print "ERROR! Must specify list of course_id's for report. Aborting." return org = course_id_set[0].split('/',1)[0] # extract org from first course_id self.org = org self.output_project_id = output_project_id crname = ('course_report_%s' % org) if use_dataset_latest: crname = 'course_report_latest' self.dataset = output_dataset_id or crname self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket) self.course_id_set = course_id_set course_datasets = [ bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set] # check to see which datasets have person_course tables datasets_with_pc = [] self.all_pc_tables = OrderedDict() self.all_pcday_ip_counts_tables = OrderedDict() self.all_uic_tables = OrderedDict() self.all_tott_tables = OrderedDict() for cd in course_datasets: try: table = bqutil.get_bq_table_info(cd, 'person_course') except Exception as err: print "[make-course_report_tables] Err: %s" % str(err) table = None if table is not None: self.all_pc_tables[cd] = table datasets_with_pc.append(cd) try: table = bqutil.get_bq_table_info(cd, 'pcday_ip_counts') except Exception as err: table = None if table is not None: self.all_pcday_ip_counts_tables[cd] = table try: table = bqutil.get_bq_table_info(cd, 'user_info_combo') except Exception as err: table = None if table is not None: self.all_uic_tables[cd] = table try: table = bqutil.get_bq_table_info(cd, 'time_on_task_totals') except Exception as err: print 
"[make-course_report_tables] Err: %s" % str(err) table = None if table is not None: self.all_tott_tables[cd] = table pc_tables = ',\n'.join(['[%s.person_course]' % x for x in datasets_with_pc]) pcday_ip_counts_tables = ',\n'.join(['[%s.pcday_ip_counts]' % x for x in self.all_pcday_ip_counts_tables]) uic_tables = ',\n'.join(['[%s.user_info_combo]' % x for x in self.all_uic_tables]) tott_tables = ',\n'.join(['[%s.time_on_task_totals]' % x for x in self.all_tott_tables]) print "%d time_on_task tables: %s" % (len(self.all_tott_tables), tott_tables) sys.stdout.flush() # find latest combined person_course table cpc_tables = [ x for x in bqutil.get_list_of_table_ids(self.dataset) if x.startswith("person_course_") ] if cpc_tables: the_cpc_table = "[%s.%s]" % (self.dataset, max(cpc_tables)) else: the_cpc_table = None print "[make_course_report_tables] ==> Using %s as the latest combined person_course table" % the_cpc_table self.parameters = {'dataset': self.dataset, 'pc_tables': pc_tables, 'uic_tables': uic_tables, 'tott_tables': tott_tables, 'pcday_ip_counts_tables': pcday_ip_counts_tables, 'combined_person_course': the_cpc_table, } print "[make_course_report_tables] ==> Using these datasets (with person_course tables): %s" % datasets_with_pc self.course_datasets = course_datasets print "="*100 print "Generating course report tables -> dataset=%s, project=%s" % (self.dataset, self.output_project_id) sys.stdout.flush() bqutil.create_dataset_if_nonexistent(self.dataset, project_id=output_project_id) self.nskip = nskip if 1: self.combine_show_answer_stats_by_course() self.make_totals_by_course() self.make_medians_by_course() self.make_table_of_email_addresses() self.make_global_modal_ip_table() self.make_enrollment_by_day() self.make_time_on_task_stats_by_course() self.make_total_populations_by_course() self.make_table_of_n_courses_registered() self.make_geographic_distributions() # self.count_tracking_log_events() self.make_overall_totals() print "="*100 print "Done with 
course report tables" sys.stdout.flush()
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False): ''' Save course axis data to bigquery cid = course_id caset = list of course axis data in dict format xbundle = XML bundle of course (everything except static files) datadir = directory where output files should be written log_msg = list of messages about processing errors and issues ''' # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis'] dict_schema = schema2dict(the_schema) caset = copy.deepcopy(caset_in) datadir = path(datadir) cafn = datadir / 'course_axis.json' xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/','__'))) fp = open(cafn, 'w') linecnt = 0 for ca in caset: linecnt += 1 ca['course_id'] = cid data = ca['data'] if data and not type(data)==dict: try: ca['data'] = json.loads(data) # make it native, for mongo except Exception as err: print "failed to create json for %s, error=%s" % (data, err) if ca['start'] is not None: ca['start'] = str(ca['start']) # datetime to string if ca['due'] is not None: ca['due'] = str(ca['due']) # datetime to string if (ca['data'] is None) or (ca['data']==''): ca.pop('data') check_schema(linecnt, ca, the_ds=dict_schema, coerce=True) try: # db.course_axis.insert(ca) fp.write(json.dumps(ca)+'\n') except Exception as err: print "Failed to save! 
Error=%s, data=%s" % (err, ca) fp.close() # upload axis.json file and course xbundle gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest)) if 1: gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False) gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False) # import into BigQuery dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) # create dataset if not already existent table = "course_axis" bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema) msg = "="*100 + '\n' msg += "Course axis for %s\n" % (cid) msg += "="*100 + '\n' msg += '\n'.join(log_msg) msg = msg[:16184] # max message length 16384 bqutil.add_description_to_table(dataset, table, msg, append=True) print " Done - inserted %s records into course_axis" % len(caset)
def rephrase_forum_json_for_course( course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir=None, do_gs_copy=False, use_dataset_latest=False, ): print "Loading SQL for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() fn = 'forum.mongo' gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest) def openfile(fn, mode='r'): if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')): fn += ".gz" if fn.endswith('.gz'): return gzip.GzipFile(lfp / fn, mode) return open(lfp / fn, mode) fp = openfile(fn) ofn = lfp / "forum-rephrased.json.gz" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) if os.path.exists(ofn): tables = bqutil.get_list_of_table_ids(dataset) if not 'forum' in tables: print "Already done? But no forums table loaded into datasaet %s. Redoing." % dataset else: print "Already done %s -> %s (skipping)" % (fn, ofn) sys.stdout.flush() return print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now()) sys.stdout.flush() cnt = 0 ofp = gzip.GzipFile('tmp.json.gz', 'w') for line in fp: cnt += 1 newline = do_rephrase_line(line, linecnt=cnt) ofp.write(newline) ofp.close() print "...done (%s)" % datetime.datetime.now() if cnt == 0: print "...but cnt=0 entries found, skipping forum loading" sys.stdout.flush() return print "...copying to gsc" sys.stdout.flush() # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away? 
gsfn = gsdir + '/' + "forum-rephrased.json.gz" cmd = 'gsutil cp tmp.json.gz %s' % (gsfn) os.system(cmd) os.system(cmd) table = 'forum' bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True) msg = "Original data from %s" % (lfp / fn) bqutil.add_description_to_table(dataset, table, msg, append=True) os.system('mv tmp.json.gz "%s"' % (ofn)) print "...done (%s)" % datetime.datetime.now() sys.stdout.flush()
def old_process_course(course_id, force_recompute=False): ''' DEPRACATED - instead of creating one table per day, because there is so little total data, create one enrollday_all table (see other function below). make enrollday2_* tables for specified course_id ''' SQL = """ SELECT "{course_id}" as course_id, time, event_struct.user_id as user_id, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "honor") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "honor") then -1 else 0 end) as diff_enrollment_honor, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "verified") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "verified") then -1 else 0 end) as diff_enrollment_verified, (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "audit") then 1 when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "audit") then -1 else 0 end) as diff_enrollment_audit, FROM [{dataset}.{table_id}] where (event_type = "edx.course.enrollment.activated") or (event_type = "edx.course.enrollment.deactivated") order by time; """ course_dir = course_id.replace('/', '__') dataset = bqutil.course_id2dataset(course_id) log_dataset = bqutil.course_id2dataset(course_id, dtype="logs") pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday") print "Processing course %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() log_tables = bqutil.get_tables(log_dataset) try: bqutil.create_dataset_if_nonexistent(pcd_dataset) except Exception as err: print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err)) pcday_tables_info = bqutil.get_tables(pcd_dataset) pcday_tables = [ x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', []) ] # print "pcday_tables = ", pcday_tables log_table_list = log_tables['tables'] log_table_list.sort() for table in log_table_list: tr 
= table['tableReference'] table_id = tr['tableId'] if not table_id.startswith('tracklog'): continue date = table_id[9:] table_out = 'enrollday2_%s' % date if (table_out in pcday_tables) and not force_recompute: print "%s...already done, skipping" % table_id sys.stdout.flush() continue if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0: print "...zero size table %s, skipping" % table_id sys.stdout.flush() continue print("Creating %s " % table_out), the_sql = SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id) sys.stdout.flush() bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False) print "Done with course %s (end %s)" % (course_id, datetime.datetime.now()) print "=" * 77 sys.stdout.flush()
def obsolete_process_course(course_id, force_recompute=False, check_dates=True): ''' make person_course_day tables for specified course_id. This version produces one table for each day. It is inefficient when there are many days with very small daily tracking log tables. ''' PCDAY_SQL = """ select username, "{course_id}" as course_id, sum(bevent) as nevents, sum(bprogress) as nprogcheck, sum(bshow_answer) as nshow_answer, sum(bvideo) as nvideo, sum(bproblem_check) as nproblem_check, sum(bforum) as nforum, sum(bshow_transcript) as ntranscript, sum(bseq_goto) as nseq_goto, sum(bseek_video) as nseek_video, sum(bpause_video) as npause_video, MAX(time) as last_event, AVG( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as avg_dt, STDDEV( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as sdv_dt, MAX( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as max_dt, COUNT( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as n_dt, SUM( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as sum_dt from (SELECT username, case when event_type = "play_video" then 1 else 0 end as bvideo, case when event_type = "problem_check" then 1 else 0 end as bproblem_check, case when username != "" then 1 else 0 end as bevent, case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum, case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress, case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer, case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript, case when event_type = 
'seq_goto' then 1 else 0 end as bseq_goto, case when event_type = 'seek_video' then 1 else 0 end as bseek_video, case when event_type = 'pause_video' then 1 else 0 end as bpause_video, # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll, # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll time, lag(time, 1) over (partition by username order by time) last_time FROM [{dataset}.{table_id}] WHERE NOT event_type contains "/xblock/" AND username != "" ) group by course_id, username order by sdv_dt desc """ course_dir = course_id.replace('/','__') dataset = bqutil.course_id2dataset(course_id) log_dataset = bqutil.course_id2dataset(course_id, dtype="logs") pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday") print "Processing course %s (start %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() log_tables = bqutil.get_tables(log_dataset) try: bqutil.create_dataset_if_nonexistent(pcd_dataset) except Exception as err: print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err)) pcday_tables_info = bqutil.get_tables(pcd_dataset) pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])] print "pcday_tables = ", pcday_tables log_table_list = log_tables['tables'] log_table_list.sort() for table in log_table_list: tr = table['tableReference'] table_id = tr['tableId'] if not table_id.startswith('tracklog'): continue date = table_id[9:] table_out = 'pcday_%s' % date if (table_out in pcday_tables) and not force_recompute: skip = True if check_dates: table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out) log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id) if log_table_date > table_out_date: skip = False print "%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (table_out, table_out_date, log_table_date) if skip: print "%s...already done, skipping" % 
table_out sys.stdout.flush() continue if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0: print "...zero size table %s, skipping" % table_id sys.stdout.flush() continue print ("Creating %s " % table_out), the_sql = PCDAY_SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id) sys.stdout.flush() bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False) print "Done with course %s (end %s)" % (course_id, datetime.datetime.now()) print "="*77 sys.stdout.flush()
def load_local_logs_to_biqquery(course_id, start_date, end_date, verbose):
    """
    Loads the local tracking logs into Google BigQuery.
    First, it will try to create the dataset if not exist.

    Files whose name does not match TRACKING_LOG_REGEX_DATE_PATTERN, or whose
    date falls outside [start_date, end_date], are skipped; a message is logged
    for them only when verbose is set.

    Args:
        course_id: Course id string.
        start_date: Start date string to process the tracking logs.
        end_date: End date string to process the tracking logs.
        verbose: Whether or not the function logging should be verbose.
    """
    dataset_name = bqutil.course_id2dataset(course_id, dtype="logs")
    date_pattern = getattr(edx2bigquery_config, 'TRACKING_LOG_REGEX_DATE_PATTERN', '')
    # loop-invariant: format used to derive the table name from the file date
    table_date_format = getattr(edx2bigquery_config, 'TRACKING_LOG_DATE_FORMAT', '%Y-%m-%d')
    tracking_start_date, tracking_end_date = get_start_and_end_date(start_date, end_date)
    schema = get_tracking_log_schema()

    bqutil.create_dataset_if_nonexistent(dataset_name)

    for file_name in local_util.get_tracking_log_file_list(course_id):
        if not file_name:
            continue

        file_match = re.findall(date_pattern, file_name)

        if not file_match:
            # Skip regardless of verbosity: previously a non-matching file was
            # only skipped when verbose, and otherwise crashed on file_match[-1].
            if verbose:
                logging(
                    'The file name: {} does not have the string date at the end of the name.'
                    .format(file_name))
            continue

        # the last match in the name is taken as the file's date
        file_date = dateutil.parser.parse(file_match[-1])

        if not (tracking_start_date <= file_date <= tracking_end_date):
            if verbose:
                logging(
                    'The file with name: {} has a date before or after of the start_date and end_date provided.'
                    .format(file_name))
            continue

        table_name = 'tracklog_{}'.format(file_date.strftime(table_date_format))

        if verbose:
            logging('Uploading: {} to the table: {}'.format(file_name, table_name))

        bqutil.upload_local_data_to_big_query(
            dataset_id=dataset_name,
            table_id=table_name,
            schema=schema,
            course_id=course_id,
            file_name=file_name,
            source_format=DEFAULT_JSON_SOURCE_FORMAT_NAME,
        )

        if verbose:
            logging(
                'The file with name: {} has been succesufully uploaded to Big Query.'
                .format(file_name))