def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False): """ Upload grades_persistent csv.gz to Google Storage, create the BigQuery table, then insert the data into the table. :param cid: the course id :param basedir: the base directory path :param datedir: the date directory name (represented as YYYY-MM-DD) :param use_dataset_latest: should the most recent dataset be used? :param subsection: should grades_persistentsubsection be uploaded? :type cid: str :type basedir: str :type datedir: str :type use_dataset_latest: bool :type subsection: bool """ gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest)) if subsection: csv_name = "grades_persistentsubsectiongrade.csv.gz" temp_name = "grades_persistentsubsectiongrade_temp.csv.gz" table = "grades_persistent_subsection" else: csv_name = "grades_persistentcoursegrade.csv.gz" temp_name = "grades_persistentcoursegrade_temp.csv.gz" table = "grades_persistent" sdir = load_course_sql.find_course_sql_dir(cid, basedir=basedir, datedir=datedir, use_dataset_latest=(use_dataset_latest), ) csvfn = sdir / csv_name tempfn = sdir / temp_name mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads(open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table] if not os.path.exists(csvfn): print "[edx2bigquery] make_grades_persistent: missing file %s, skipping" % csvfn return if not subsection: cleanup_rows_from_grade_persistent(csvfn, tempfn) else: cleanup_rows_from_grade_persistent(csvfn, tempfn, field_to_fix="first_attempted") gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True) dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) # create dataset if not already existent bqutil.load_data_to_table(dataset, table, gsdir / csv_name, the_schema, format="csv", skiprows=1)
def upload_grades_persistent_data(cid, basedir, datedir, use_dataset_latest=False, subsection=False): """Upload grades_persistent csv.gz to Google Storage, create the BigQuery table, then insert the data into the table :param cid: the course id :param basedir: the base directory path :param datedir: the date directory name (represented as YYYY-MM-DD) :param use_dataset_latest: should the most recent dataset be used? :param subsection: should grades_persistentsubsection be uploaded? :type cid: str :type basedir: str :type datedir: str :type use_dataset_latest: bool :type subsection: bool """ gsdir = path( gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest)) if subsection: csv_name = "grades_persistentsubsectiongrade.csv.gz" temp_name = "grades_persistentsubsectiongrade_temp.csv.gz" table = "grades_persistent_subsection" else: csv_name = "grades_persistentcoursegrade.csv.gz" temp_name = "grades_persistentcoursegrade_temp.csv.gz" table = "grades_persistent" csvfn = '%s/%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir, csv_name) tempfn = '%s/%s/%s/%s' % (basedir, cid.replace('/', '__'), datedir, temp_name) mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads( open('%s/schemas/schema_%s.json' % (mypath, table)).read())[table] if not subsection: remove_nulls_from_grade_persistent(csvfn, tempfn) gsutil.upload_file_to_gs(csvfn, gsdir, options="-z csv", verbose=True) dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent( dataset) # create dataset if not already existent bqutil.load_data_to_table(dataset, table, gsdir / csv_name, the_schema, format="csv", skiprows=1)
def write_geoip_table(self): ''' Write out the geoipdat table if nchanged > 0 ''' if not self.nchanged: return ofn = 'tmp_geoip_%08d.json' % random.uniform(0, 100000000) print "--> new entries added to geoipdat, writing to %s" % (ofn) sys.stdout.flush() ofp = codecs.open(ofn, 'w', encoding='utf8') for key, val in self.geoipdat.iteritems(): try: ofp.write(json.dumps(val) + '\n') except Exception as err: print "Error! %s" % err sys.stdout.write(repr(val)) raise ofp.close() lock_file(self.gipfn) try: print "--> renaming %s to %s" % (ofn, self.gipfn) sys.stdout.flush() os.rename(ofn, self.gipfn) except Exception as err: print "Error %s in renaming gipfn" % str(err) lock_file(self.gipfn, release=True) mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads( open('%s/schemas/schema_extra_geoip.json' % mypath).read())['extra_geoip'] gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn print "--> Uploading %s to %s" % (self.gipfn, gsp) sys.stdout.flush() gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json') print "--> Importing %s to %s" % (gsp, self.giptable) sys.stdout.flush() try: bqutil.create_dataset_if_nonexistent(self.gipdataset) except Exception as err: print "--> Warning: failed to create %s, err=%s" % (gsp, err) try: bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema) except Exception as err: print "---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % ( gsp, self.gipdataset, self.giptable, err) print "---> Continuing anyway" sys.stdout.flush()
def do_course_listings(course_listings_fn): dataset = 'courses' table = 'listings' bqutil.create_dataset_if_nonexistent(dataset) mypath = os.path.dirname(os.path.realpath(__file__)) gsfn = gsutil.gs_path_from_course_id('courses') / 'listings.csv' gsutil.upload_file_to_gs(course_listings_fn, gsfn) schema = json.loads(open('%s/schemas/schema_course_listings.json' % mypath).read())['course_listings'] bqutil.load_data_to_table(dataset, table, gsfn, schema, wait=True, format='csv', skiprows=1)
def do_user_part_csv(course_id, basedir=None, datedir=None, use_dataset_latest=False, verbose=False, pin_date=None): sdir = find_course_sql_dir(course_id, basedir=basedir, datedir=datedir, use_dataset_latest=(use_dataset_latest and not pin_date), ) # upload to google storage dfn = sdir / "user_api_usercoursetag.csv.gz" if not os.path.exists(dfn): print("[load_user_part] Missing %s, skipping" % dfn) return # reformat True / False to 1/0 for "value" field if verbose: print("[load_user_part] extracting user partition data from %s" % dfn) sys.stdout.flush() cdr = csv.DictReader(gzip.GzipFile(dfn)) fields = cdr.fieldnames if verbose: print("fieldnames = %s" % fields) fixed_data = [] bmap = {'true': 1, 'false': 0} for row in cdr: vstr = row['value'].lower() row['value'] = bmap.get(vstr, vstr) fixed_data.append(row) ofnb = 'user_partitions.csv.gz' odfn = sdir / ofnb with gzip.GzipFile(odfn, 'w') as ofp: cdw = csv.DictWriter(ofp, fieldnames=fields) cdw.writeheader() cdw.writerows(fixed_data) if verbose: print("[load_user_part] Wrote %d rows of user partition data to %s" % (len(fixed_data), odfn)) sys.stdout.flush() gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)) gsutil.upload_file_to_gs(odfn, gsdir / ofnb, verbose=False) mypath = os.path.dirname(os.path.realpath(__file__)) schema = json.loads(open('%s/schemas/schema_user_partitions.json' % mypath).read())['user_partitions'] # import into BigQuery table = "user_partitions" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
def write_geoip_table(self): ''' Write out the geoipdat table if nchanged > 0 ''' if not self.nchanged: return ofn = 'tmp_geoip_%08d.json' % random.uniform(0,100000000) print "--> new entries added to geoipdat, writing to %s" % (ofn) sys.stdout.flush() ofp = codecs.open(ofn, 'w', encoding='utf8') for key, val in self.geoipdat.iteritems(): try: ofp.write(json.dumps(val)+'\n') except Exception as err: print "Error! %s" % err sys.stdout.write(repr(val)) raise ofp.close() lock_file(self.gipfn) try: print "--> renaming %s to %s" % (ofn, self.gipfn) sys.stdout.flush() os.rename(ofn, self.gipfn) except Exception as err: print "Error %s in renaming gipfn" % str(err) lock_file(self.gipfn, release=True) mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads(open('%s/schemas/schema_extra_geoip.json' % mypath).read())['extra_geoip'] gsp = gsutil.gs_path_from_course_id(self.gipdataset) / self.gipfn print "--> Uploading %s to %s" % (self.gipfn, gsp) sys.stdout.flush() gsutil.upload_file_to_gs(self.gipfn, gsp, '-z json') print "--> Importing %s to %s" % (gsp, self.giptable) sys.stdout.flush() try: bqutil.create_dataset_if_nonexistent(self.gipdataset) except Exception as err: print "--> Warning: failed to create %s, err=%s" % (gsp, err) try: bqutil.load_data_to_table(self.gipdataset, self.giptable, gsp, the_schema) except Exception as err: print "---> ERROR: failed to load %s into BigQuery %s.%s, err=%s" % (gsp, self.gipdataset, self.giptable, err) print "---> Continuing anyway" sys.stdout.flush()
def make_gp_table(course_id, basedir=None, datedir=None, use_dataset_latest=False, verbose=False, pin_date=None): if pin_date: datedir = pin_date sdir = load_course_sql.find_course_sql_dir(course_id, basedir=basedir, datedir=datedir, use_dataset_latest=(use_dataset_latest and not pin_date), ) fn_to_try = ['course.xml.tar.gz', 'course-prod-analytics.xml.tar.gz', 'course-prod-edge-analytics.xml.tar.gz', 'course-prod-edx-replica.xml.tar.gz', ] for fntt in fn_to_try: fn = sdir / fntt if os.path.exists(fn): break if not os.path.exists(fn): msg = "---> oops, cannot get couese content (with grading policy file) for %s, file %s (or 'course.xml.tar.gz' or 'course-prod-edge-analytics.xml.tar.gz') missing!" % (course_id, fn) raise Exception(msg) gpstr, gpfn = read_grading_policy_from_tar_file(fn) fields, gptab, schema = load_grading_policy(gpstr, verbose=verbose, gpfn=gpfn) ofnb = 'grading_policy.csv' ofn = sdir / ofnb ofp = open(ofn, 'w') cdw = csv.DictWriter(ofp, fieldnames=fields) cdw.writeheader() cdw.writerows(gptab) ofp.close() # upload to google storage gsdir = path(gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)) gsutil.upload_file_to_gs(ofn, gsdir / ofnb, verbose=False) # import into BigQuery table = "grading_policy" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.load_data_to_table(dataset, table, gsdir / ofnb, schema, format='csv', skiprows=1)
def do_combine( course_id_set, project_id, outdir="DATA", nskip=0, output_project_id=None, output_dataset_id=None, output_bucket=None, use_dataset_latest=False, extract_subset_tables=True, ): ''' Combine individual person_course tables (from the set of specified course_id's) to create one single large person_course table. Do this by downloading each file, checking to make sure they all have the same fields, concatenating, and uploading back to bigquery. This is cheaper than doing a select *, and also uncovers person_course files which have the wrong schema (and it works around BQ's limitation on large result sizes). The result is stored in the course_report_latest dataset (if use_dataset_latest), else in course_report_ORG, where ORG is the configured organization name. If extract_subset_tables is True, then the subset of those who viewed (ie "participants"), and the subset of those who enrolled for IDV, are extracted and saved as person_course_viewed, and person_course_idv. (those are created using a select *, for efficiency, despite the cost). ''' print "=" * 77 print "Concatenating person course datasets from the following courses:" print course_id_set print "-" * 77 outdir = path(outdir) if not outdir.exists(): os.mkdir(outdir) ofnset = [] cnt = 0 for course_id in course_id_set: gb = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__'))) ofnset.append(ofn) if (nskip > 0) and ofn.exists(): print "%s already exists, not downloading" % ofn sys.stdout.flush() continue if ofn.exists(): fnset = gsutil.get_gs_file_list(gb) local_dt = gsutil.get_local_file_mtime_in_utc(ofn) fnb = 'person_course.csv.gz' if not fnb in fnset: print "%s/%s missing! skipping %s" % (gb, fnb, course_id) continue if (fnb in fnset) and (local_dt >= fnset[fnb]['date']): print "%s already exists with date %s (gs file date %s), not re-downloading" % ( ofn, local_dt, fnset[fnb]['date']) sys.stdout.flush() continue else: print "%s already exists but has date %s (gs file date %s), so re-downloading" % ( ofn, local_dt, fnset[fnb]['date']) sys.stdout.flush() cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn) print "Retrieving %s via %s" % (course_id, cmd) sys.stdout.flush() os.system(cmd) cnt += 1 #if cnt>2: # break org = course_id_set[0].split('/', 1)[0] ofn = "person_course_%s_%s.csv" % ( org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')) print "=" * 77 print "Combining CSV files to produce %s" % ofn sys.stdout.flush() if (nskip > 1) and os.path.exists(ofn): print "%s already exists, not downloading" % ofn else: first = 1 header = None for zfn in ofnset: if first: cmd = "zcat %s > %s" % (zfn, ofn) header = os.popen("zcat %s | head -1" % zfn).read().strip() firstfn = zfn else: cmd = "zcat %s | tail -n +2 >> %s" % ( zfn, ofn ) # first row is header; don't keep when concatenating print cmd first = 0 new_header = os.popen("zcat %s | head -1" % zfn).read().strip() if not header == new_header: print "==> Warning! 
header mismatch for %s vs %s" % (zfn, firstfn) print " %s has: %s" % (firstfn, header) print " but %s has: %s" % (zfn, new_header) sys.stdout.flush() os.system(cmd) gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket) print "=" * 77 print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb sys.stdout.flush() cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb) print cmd os.system(cmd) gsfn = gb + '/' + ofn print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link( gsfn) # import into BigQuery crname = ('course_report_%s' % org) if use_dataset_latest: crname = 'course_report_latest' dataset = output_dataset_id or crname table = ofn[:-4].replace('-', '_') print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table) sys.stdout.flush() mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())['person_course'] bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id) msg = '' msg += "Combined person-course dataset, with data from:\n" msg += str(course_id_set) msg += "\n\n" msg += "=" * 100 + "\n" msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn) bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id) # copy the new table (which has a specific date in its name) to a generically named "person_course_latest" # so that future SQL queries can simply use this as the latest person course table print "-> Copying %s to %s.person_course_latest" % (table, dataset) bqutil.copy_bq_table(dataset, table, "person_course_latest") if extract_subset_tables: do_extract_subset_person_course_tables(dataset, table) print "Done" sys.stdout.flush()
def analyze_course_content( course_id, listings_file=None, basedir="X-Year-2-data-sql", datedir="2013-09-21", use_dataset_latest=False, do_upload=False, courses=None, verbose=True, pin_date=None, ): ''' Compute course_content table, which quantifies: - number of chapter, sequential, vertical modules - number of video modules - number of problem, *openended, mentoring modules - number of dicussion, annotatable, word_cloud modules Do this using the course "xbundle" file, produced when the course axis is computed. Include only modules which had nontrivial use, to rule out the staff and un-shown content. Do the exclusion based on count of module appearing in the studentmodule table, based on stats_module_usage for each course. Also, from the course listings file, compute the number of weeks the course was open. If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset as the "stats_course_content" table. Also generate a "course_summary_stats" table, stored in the course_report_ORG or course_report_latest dataset. The course_summary_stats table combines data from many reports,, including stats_course_content, the medians report, the listings file, broad_stats_by_course, and time_on_task_stats_by_course. ''' if do_upload: if use_dataset_latest: org = "latest" else: org = courses[0].split( '/', 1)[0] # extract org from first course_id in courses crname = 'course_report_%s' % org gspath = gsutil.gs_path_from_course_id(crname) gsfnp = gspath / CCDATA gsutil.upload_file_to_gs(CCDATA, gsfnp) tableid = "stats_course_content" dataset = crname mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath try: the_schema = json.loads(open(SCHEMA_FILE).read())[tableid] except Exception as err: print "Oops! Failed to load schema file for %s. Error: %s" % ( tableid, str(err)) raise if 0: bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1) table = 'course_metainfo' course_tables = ',\n'.join([ ('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses ]) sql = "select * from {course_tables}".format( course_tables=course_tables) print "--> Creating %s.%s using %s" % (dataset, table, sql) if 1: metainfo_dataset = bqutil.get_bq_table( dataset, table, sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), ) # bqutil.create_bq_table(dataset, table, sql, overwrite=True) #----------------------------------------------------------------------------- # make course_summary_stats table # # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo. # Also use (and create if necessary) the nregistered_by_wrap table. 
# get the broad_stats_by_course data bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course') table_list = bqutil.get_list_of_table_ids(dataset) latest_person_course = max( [x for x in table_list if x.startswith('person_course_')]) print "Latest person_course table in %s is %s" % (dataset, latest_person_course) sql = """ SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, FROM [{dataset}.{person_course}] as pc left join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' )) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id """.format(dataset=dataset, person_course=latest_person_course) nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'}) # rates for registrants before and during course sql = """ SELECT *, ncertified / nregistered * 100 as pct_certified_of_reg, ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch, ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course, ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap, ncertified / nviewed * 100 as pct_certified_of_viewed, ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap, ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap, FROM ( # ------------------------ # get aggregate data SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.certified then 1 else 0 end) ncertified, sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap, sum(case when pc.viewed then 1 else 0 end) nviewed, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap, sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap, sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch, sum(case when pc.start_time < cminfo.launch_date and pc.certified then 1 else 0 end) ncertified_and_registered_before_launch, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) and pc.certified then 1 else 0 end) ncertified_and_registered_during_course, FROM [{dataset}.{person_course}] as pc left join ( # -------------------- # get course launch and wrap dates from course_metainfo SELECT AA.course_id as course_id, AA.wrap_date as wrap_date, AA.launch_date as launch_date, BB.ewrap_date as ewrap_date, FROM ( # inner get course launch and wrap dates from course_metainfo 
SELECT A.course_id as course_id, A.wrap_date as wrap_date, B.launch_date as launch_date, from ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' ) ) as A left outer join ( SELECT course_id, TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month, regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day, regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year, FROM [{dataset}.course_metainfo] where key='listings_Course Launch' ) ) as B on A.course_id = B.course_id # end inner course_metainfo subquery ) as AA left outer join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Empirical Course Wrap' ) ) as BB on AA.course_id = BB.course_id # end course_metainfo subquery # -------------------- ) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id # ---- end get aggregate data ) order by course_id """.format(dataset=dataset, person_course=latest_person_course) print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration' sys.stdout.flush() cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, newer_than=datetime.datetime( 2015, 1, 16, 3, 0), key={'name': 'course_id'}) # start assembling course_summary_stats c_sum_stats = defaultdict(OrderedDict) for entry in bsbc['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] cmci.update(entry) cnbw = nr_by_wrap['data_by_key'][course_id] nbw = int(cnbw['nregistered_by_wrap']) cmci['nbw_wrap_date'] = cnbw['wrap_date'] cmci['nregistered_by_wrap'] = nbw cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct'] cmci['frac_female'] = float(entry['n_female_viewed']) / (float( entry['n_male_viewed']) + float(entry['n_female_viewed'])) ncert = float(cmci['certified_sum']) if ncert: cmci[ 'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0 else: cmci['certified_of_nregistered_by_wrap_pct'] = None cbr = cert_by_reg['data_by_key'][course_id] for field, value in cbr.items(): cmci['cbr_%s' % field] = value # add medians for viewed, explored, and certified msbc_tables = { 'msbc_viewed': "viewed_median_stats_by_course", 'msbc_explored': 'explored_median_stats_by_course', 'msbc_certified': 'certified_median_stats_by_course', 'msbc_verified': 'verified_median_stats_by_course', } for prefix, mtab in msbc_tables.items(): print "--> Merging median stats data from %s" % mtab sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, mtab) for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): cmci['%s_%s' % (prefix, field)] = value # add time on task data tot_table = "time_on_task_stats_by_course" prefix = "ToT" print "--> Merging time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = 
{'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add serial time on task data tot_table = "time_on_task_serial_stats_by_course" prefix = "SToT" print "--> Merging serial time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # add show_answer stats tot_table = "show_answer_stats_by_course" prefix = "SAS" print "--> Merging show_answer stats data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field == 'course_id': continue cmci['%s_%s' % (prefix, field)] = value # setup list of keys, for CSV output css_keys = c_sum_stats.values()[0].keys() # retrieve course_metainfo table, pivot, add that to summary_stats print "--> Merging course_metainfo from %s" % table sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, table) listings_keys = map(make_key, [ "Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", "Course Number", "Short Title", "Andrew's Short Titles", "Title", "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id", "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade", '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)" ]) listings_keys.reverse() for lk in listings_keys: css_keys.insert(1, "listings_%s" % lk) COUNTS_TO_KEEP = [ 'discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 'option', 'lti', 'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 'options', 'comment_prompt', 'conditional', 'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 'customtag', 'transcript', 'split_test', 'word_cloud', 'openended', 'openendedparam', 'answer_display', 'code', 'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 'textannotation', 'imageannotation', 'videosequence', 'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion' ] for entry in bqdat['data']: thekey = make_key(entry['key']) # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP: # continue if thekey.startswith( 'listings_') and thekey[9:] not in listings_keys: # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id']) continue c_sum_stats[entry['course_id']][thekey] = entry['value'] #if 'certifies' in thekey: # print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value']) if thekey not in css_keys: css_keys.append(thekey) # compute forum_posts_per_week for course_id, entry in c_sum_stats.items(): nfps = entry.get('nforum_posts_sum', 0) if nfps: fppw = int(nfps) / 
float(entry['nweeks']) entry['nforum_posts_per_week'] = fppw print " course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % ( course_id, entry['total_assessments_per_week'], fppw) else: entry['nforum_posts_per_week'] = None css_keys.append('nforum_posts_per_week') # read in listings file and merge that in also if listings_file: if listings_file.endswith('.csv'): listings = csv.DictReader(open(listings_file)) else: listings = [json.loads(x) for x in open(listings_file)] for entry in listings: course_id = entry['course_id'] if course_id not in c_sum_stats: continue cmci = c_sum_stats[course_id] for field, value in entry.items(): lkey = "listings_%s" % make_key(field) if not (lkey in cmci) or (not cmci[lkey]): cmci[lkey] = value print "Storing these fields: %s" % css_keys # get schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads( open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read()) schema_dict = {x['name']: x for x in the_schema} # write out CSV css_table = "course_summary_stats" ofn = "%s__%s.csv" % (dataset, css_table) ofn2 = "%s__%s.json" % (dataset, css_table) print "Writing data to %s and %s" % (ofn, ofn2) ofp = open(ofn, 'w') ofp2 = open(ofn2, 'w') dw = csv.DictWriter(ofp, fieldnames=css_keys) dw.writeheader() for cid, entry in c_sum_stats.items(): for ek in entry: if ek not in schema_dict: entry.pop(ek) # entry[ek] = str(entry[ek]) # coerce to be string ofp2.write(json.dumps(entry) + "\n") for key in css_keys: if key not in entry: entry[key] = None dw.writerow(entry) ofp.close() ofp2.close() # upload to bigquery # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ] if 1: gsfnp = gspath / dataset / (css_table + ".json") gsutil.upload_file_to_gs(ofn2, gsfnp) # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False, # format='csv', skiprows=1) bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False) return print "-" * 60 + " %s" % course_id # get nweeks from listings lfn = path(listings_file) if not lfn.exists(): print "[analyze_content] course listings file %s doesn't exist!" % lfn return data = None if listings_file.endswith('.json'): data_feed = map(json.loads, open(lfn)) else: data_feed = csv.DictReader(open(lfn)) for k in data_feed: if not 'course_id' in k: print "Strange course listings row, no course_id in %s" % k raise Exception("Missing course_id") if k['course_id'] == course_id: data = k break if not data: print "[analyze_content] no entry for %s found in course listings file %s!" % ( course_id, lfn) return def date_parse(field): (m, d, y) = map(int, data[field].split('/')) return datetime.datetime(y, m, d) launch = date_parse('Course Launch') wrap = date_parse('Course Wrap') ndays = (wrap - launch).days nweeks = ndays / 7.0 print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays) if pin_date: datedir = pin_date course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest and not pin_date) cfn = gsutil.path_from_course_id(course_id) xbfn = course_dir / ("xbundle_%s.xml" % cfn) if not xbfn.exists(): print "[analyze_content] cannot find xbundle file %s for %s!" % ( xbfn, course_id) if use_dataset_latest: # try looking in earlier directories for xbundle file import glob spath = course_dir / ("../*/xbundle_%s.xml" % cfn) files = list(glob.glob(spath)) if files: xbfn = path(files[-1]) if not xbfn.exists(): print " --> also cannot find any %s ; aborting!" 
% spath else: print " --> Found and using instead: %s " % xbfn if not xbfn.exists(): raise Exception("[analyze_content] missing xbundle file %s" % xbfn) # if there is an xbundle*.fixed file, use that instead of the normal one if os.path.exists(str(xbfn) + ".fixed"): xbfn = path(str(xbfn) + ".fixed") print "[analyze_content] For %s using %s" % (course_id, xbfn) # get module usage data mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest) xml = etree.parse(open(xbfn)).getroot() counts = defaultdict(int) nexcluded = defaultdict(int) IGNORE = [ 'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title' ] problem_stats = defaultdict(int) def does_problem_have_random_script(problem): ''' return 1 if problem has a script with "random." in it else return 0 ''' for elem in problem.findall('.//script'): if elem.text and ('random.' in elem.text): return 1 return 0 # walk through xbundle def walk_tree(elem, policy=None): ''' Walk XML tree recursively. elem = current element policy = dict of attributes for children to inherit, with fields like due, graded, showanswer ''' policy = policy or {} if type(elem.tag) == str and (elem.tag.lower() not in IGNORE): counts[elem.tag.lower()] += 1 if elem.tag in [ "sequential", "problem", "problemset", "course", "chapter" ]: # very old courses may use inheritance from course & chapter keys = ["due", "graded", "format", "showanswer", "start"] for k in keys: # copy inheritable attributes, if they are specified val = elem.get(k) if val: policy[k] = val if elem.tag == "problem": # accumulate statistics about problems: how many have show_answer = [past_due, closed] ? have random. in script? problem_stats['n_capa_problems'] += 1 if policy.get('showanswer'): problem_stats["n_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118 # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed. 
problem_stats[ 'n_random_script'] += does_problem_have_random_script(elem) if policy.get('graded') == 'true' or policy.get( 'graded') == 'True': problem_stats['n_capa_problems_graded'] += 1 problem_stats[ 'n_graded_random_script'] += does_problem_have_random_script( elem) if policy.get('showanswer'): problem_stats["n_graded_showanswer_%s" % policy.get('showanswer')] += 1 else: problem_stats[ 'n_graded_shownanswer_finished'] += 1 # DEFAULT showanswer = finished (make sure this remains true) for k in elem: midfrag = (k.tag, k.get('url_name_orig', None)) if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20: nexcluded[k.tag] += 1 if verbose: try: print " -> excluding %s (%s), ncount=%s" % ( k.get('display_name', '<no_display_name>').encode('utf8'), midfrag, mudata.get(midfrag, {}).get('ncount')) except Exception as err: print " -> excluding ", k continue walk_tree(k, policy.copy()) walk_tree(xml) print "--> Count of individual element tags throughout XML: ", counts print "--> problem_stats:", json.dumps(problem_stats, indent=4) # combine some into "qual_axis" and others into "quant_axis" qual_axis = [ 'openassessment', 'optionresponse', 'multiplechoiceresponse', # 'discussion', 'choiceresponse', 'word_cloud', 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti' ] quant_axis = [ 'formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse', 'imageresponse' ] nqual = 0 nquant = 0 for tag, count in counts.items(): if tag in qual_axis: nqual += count if tag in quant_axis: nquant += count print "nqual=%d, nquant=%d" % (nqual, nquant) nqual_per_week = nqual / nweeks nquant_per_week = nquant / nweeks total_per_week = nqual_per_week + nquant_per_week print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % ( nqual_per_week, nquant_per_week, total_per_week) # save this overall data in CCDATA lock_file(CCDATA) ccdfn = path(CCDATA) ccd = {} if ccdfn.exists(): for k in csv.DictReader(open(ccdfn)): ccd[k['course_id']] = k ccd[course_id] = { 'course_id': course_id, 'nweeks': nweeks, 'nqual_per_week': nqual_per_week, 'nquant_per_week': nquant_per_week, 'total_assessments_per_week': total_per_week, } # fields = ccd[ccd.keys()[0]].keys() fields = [ 'course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks' ] cfp = open(ccdfn, 'w') dw = csv.DictWriter(cfp, fieldnames=fields) dw.writeheader() for cid, entry in ccd.items(): dw.writerow(entry) cfp.close() lock_file(CCDATA, release=True) # store data in course_metainfo table, which has one (course_id, key, value) on each line # keys include nweeks, nqual, nquant, count_* for module types * cmfields = OrderedDict() cmfields['course_id'] = course_id cmfields['course_length_days'] = str(ndays) cmfields.update( {make_key('listings_%s' % key): value for key, value in data.items()}) # from course listings cmfields.update(ccd[course_id].copy()) # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() }) # from content counts cmfields['filename_xbundle'] = xbfn cmfields['filename_listings'] = lfn for key in sorted( counts ): # store counts in sorted order, so that the later generated CSV file can have a predictable structure value = counts[key] cmfields['count_%s' % key] = str(value) # from content counts for key in sorted(problem_stats): # store problem stats value = problem_stats[key] cmfields['problem_stat_%s' % key] = str(value) cmfields.update({('nexcluded_sub_20_%s' % key): str(value) for key, value in nexcluded.items() }) # 
from content counts course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) csvfn = course_dir / CMINFO # manual overriding of the automatically computed fields can be done by storing course_id,key,value data # in the CMINFO_OVERRIDES file csvfn_overrides = course_dir / CMINFO_OVERRIDES if csvfn_overrides.exists(): print "--> Loading manual override information from %s" % csvfn_overrides for ovent in csv.DictReader(open(csvfn_overrides)): if not ovent['course_id'] == course_id: print "===> ERROR! override file has entry with wrong course_id: %s" % ovent continue print " overriding key=%s with value=%s" % (ovent['key'], ovent['value']) cmfields[ovent['key']] = ovent['value'] print "--> Course metainfo writing to %s" % csvfn fp = open(csvfn, 'w') cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value']) cdw.writeheader() for k, v in cmfields.items(): cdw.writerow({'course_id': course_id, 'key': k, 'value': v}) fp.close() # build and output course_listings_and_metainfo dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) mypath = os.path.dirname(os.path.realpath(__file__)) clm_table = "course_listing_and_metainfo" clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table) clm_schema = json.loads(open(clm_schema_file).read()) clm = {} for finfo in clm_schema: field = finfo['name'] clm[field] = cmfields.get(field) clm_fnb = clm_table + ".json" clm_fn = course_dir / clm_fnb open(clm_fn, 'w').write(json.dumps(clm)) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / clm_fnb print "--> Course listing + metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, clm_table) sys.stdout.flush() gsutil.upload_file_to_gs(clm_fn, gsfnp) bqutil.load_data_to_table(dataset, clm_table, gsfnp, clm_schema, wait=True, verbose=False) # output course_metainfo table = 'course_metainfo' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) gsfnp = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / CMINFO print "--> Course metainfo uploading to %s then to %s.%s" % ( gsfnp, dataset, table) sys.stdout.flush() gsutil.upload_file_to_gs(csvfn, gsfnp) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())[table] bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
def load_sql_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir="2014-09-21", do_gs_copy=False, use_dataset_latest=False): ''' Load SQL files into google cloud storage then import into BigQuery. Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "." If use_dataset_latest then "_latest" is appended to the dataset name. Thus, the latest SQL dataset can always be put in a consistently named dataset. ''' print "Loading SQL for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() # convert studentmodule if necessary fn_sm = lfp / 'studentmodule.csv.gz' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.csv' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.sql.gz' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.sql' if not fn_sm.exists(): print "Error! Missing studentmodule.[sql,csv][.gz]" if fn_sm.exists(): # have .sql or .sql.gz version: convert to .csv newfn = lfp / 'studentmodule.csv.gz' print "--> Converting %s to %s" % (fn_sm, newfn) tsv2csv(fn_sm, newfn) fn_sm = newfn if fn_sm.exists(): # rephrase studentmodule if it's using opaque keys fline = '' smfp = openfile(fn_sm) fline = smfp.readline() # skip first line - it's a header fline = smfp.readline() if 'block-v1:' in fline or 'course-v1' in fline: rephrase_studentmodule_opaque_keys(fn_sm) def convert_sql(fnroot): if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot + ".csv.gz"): return if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"): infn = fnroot + '.sql' outfn = fnroot + '.csv.gz' print "--> Converting %s to %s" % (infn, outfn) tsv2csv(infn, outfn) # convert sql files if necesssary fnset = [ 'users', 'certificates', 'enrollment', "profiles", 'user_id_map', 'rolecourse', 'roleforum' ] for fn in fnset: convert_sql(lfp / fn) local_files = glob.glob(lfp / '*') # if using latest date directory, also look for course_image.jpg one level up if use_dataset_latest: print lfp.dirname() ci_files = glob.glob(lfp.dirname() / 'course_image.jpg') if ci_files: local_files += list(ci_files) print "--> local course_image file: %s" % ci_files gsdir = gsutil.gs_path_from_course_id( course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest) local = pytz.timezone("America/New_York") if do_gs_copy: try: fnset = get_gs_file_list(gsdir) except Exception as err: fnset = [] def copy_if_newer(fn, fnset, options='-z csv,json'): statbuf = os.stat(fn) mt = datetime.datetime.fromtimestamp(statbuf.st_mtime) # do some date checking to upload files which have changed, and are newer than that on google cloud storage local_dt = local.localize(mt, is_dst=None) utc_dt = local_dt.astimezone(pytz.utc) fnb = os.path.basename(fn) if fnb in fnset and fnset[fnb]['date'] > utc_dt: print "...%s already copied, skipping" % fn sys.stdout.flush() return elif fnb in fnset: print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % ( fn, fnset[fnb]['date'], mt) gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True) for fn in local_files: fnb = os.path.basename(fn) if fnb == 'course_image.jpg': copy_if_newer(fn, fnset, options='-a public-read') if not (fnb.endswith('.csv') or fnb.endswith('.json') or fnb.endswith('.csv.gz') or fnb.endswith('.json.gz') or fnb.endswith('.mongo.gz')): print "...unknown file type %s, skipping" % fn 
sys.stdout.flush() continue copy_if_newer(fn, fnset) # load into bigquery dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) mypath = os.path.dirname(os.path.realpath(__file__)) # load user_info_combo uicfn = lfp / 'user_info_combo.json.gz' if uicfn.exists(): uic_schema = json.loads( open('%s/schemas/schema_user_info_combo.json' % mypath).read())['user_info_combo'] bqutil.load_data_to_table(dataset, 'user_info_combo', gsdir / "user_info_combo.json.gz", uic_schema, wait=False) else: print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn # load studentmodule if fn_sm.exists(): schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read()) cwsm_schema = schemas['courseware_studentmodule'] bqutil.load_data_to_table(dataset, 'studentmodule', gsdir / fn_sm.basename(), cwsm_schema, format='csv', wait=False, skiprows=1) else: print "--> Not loading studentmodule: file %s not found" % fn_sm
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False, check_dates=True): ''' Load daily tracking logs for course from google storage into BigQuery. If wait=True then waits for loading jobs to be completed. It's desirable to wait if subsequent jobs which need these tables (like person_day) are to be run immediately afterwards. ''' print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() gsroot = gsutil.path_from_course_id(course_id) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log'] gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot) fnset = gsutil.get_gs_file_list(gsdir) dataset = bqutil.course_id2dataset(gsroot, dtype="logs") # create this dataset if necessary bqutil.create_dataset_if_nonexistent(dataset) tables = bqutil.get_list_of_table_ids(dataset) tables = [x for x in tables if x.startswith('track')] if verbose: print "-"*77 print "current tables loaded:", json.dumps(tables, indent=4) print "files to load: ", json.dumps(fnset.keys(), indent=4) print "-"*77 sys.stdout.flush() for fn, fninfo in fnset.iteritems(): if int(fninfo['size'])<=45: print "Zero size file %s, skipping" % fn continue m = re.search('(\d\d\d\d-\d\d-\d\d)', fn) if not m: continue date = m.group(1) tablename = "tracklog_%s" % date.replace('-','') # YYYYMMDD for compatibility with table wildcards # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True) file_date = fninfo['date'].replace(tzinfo=None) if tablename in tables: skip = True if check_dates: table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename) if not (table_date > file_date): print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date) skip = False if skip: if verbose: print "Already have table %s, skipping file %s" % (tablename, fn) sys.stdout.flush() continue #if date < '2014-07-27': # continue print "Loading %s into table %s " % (fn, tablename) if verbose: print "start [%s]" % datetime.datetime.now() sys.stdout.flush() gsfn = fninfo['name'] ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000) if verbose: print "-" * 77 print "done with %s [%s]" % (course_id, datetime.datetime.now()) print "=" * 77 sys.stdout.flush()
def load_sql_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir="2014-09-21", do_gs_copy=False, use_dataset_latest=False): ''' Load SQL files into google cloud storage then import into BigQuery. Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "." If use_dataset_latest then "_latest" is appended to the dataset name. Thus, the latest SQL dataset can always be put in a consistently named dataset. ''' print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() # convert studentmodule if necessary fn_sm = lfp / 'studentmodule.csv.gz' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.csv' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.sql.gz' if not fn_sm.exists(): fn_sm = lfp / 'studentmodule.sql' if not fn_sm.exists(): print "Error! Missing studentmodule.[sql,csv][.gz]" if fn_sm.exists(): # have .sql or .sql.gz version: convert to .csv newfn = lfp / 'studentmodule.csv.gz' print "--> Converting %s to %s" % (fn_sm, newfn) tsv2csv(fn_sm, newfn) fn_sm = newfn if fn_sm.exists(): # rephrase studentmodule if it's using opaque keys fline = '' smfp = openfile(fn_sm) fline = smfp.readline() # skip first line - it's a header fline = smfp.readline() if 'block-v1:' in fline or 'course-v1' in fline: rephrase_studentmodule_opaque_keys(fn_sm) def convert_sql(fnroot): if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot + ".csv.gz"): return if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"): infn = fnroot + '.sql' outfn = fnroot + '.csv.gz' print "--> Converting %s to %s" % (infn, outfn) tsv2csv(infn, outfn) # convert sql files if necesssary fnset = ['users', 'certificates', 'enrollment', "profiles", 'user_id_map', 'rolecourse', 'roleforum'] for fn in fnset: convert_sql(lfp / fn) local_files = glob.glob(lfp / '*') # if using latest date directory, also look for course_image.jpg one level up if use_dataset_latest: print lfp.dirname() ci_files = glob.glob(lfp.dirname() / 'course_image.jpg') if ci_files: local_files += list(ci_files) print "--> local course_image file: %s" % ci_files gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest) local = pytz.timezone ("America/New_York") if do_gs_copy: try: fnset = get_gs_file_list(gsdir) except Exception as err: fnset = [] def copy_if_newer(fn, fnset, options='-z csv,json'): statbuf = os.stat(fn) mt = datetime.datetime.fromtimestamp(statbuf.st_mtime) # do some date checking to upload files which have changed, and are newer than that on google cloud storage local_dt = local.localize(mt, is_dst=None) utc_dt = local_dt.astimezone (pytz.utc) fnb = os.path.basename(fn) if fnb in fnset and fnset[fnb]['date'] > utc_dt: print "...%s already copied, skipping" % fn sys.stdout.flush() return elif fnb in fnset: print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt) gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True) for fn in local_files: fnb = os.path.basename(fn) if fnb=='course_image.jpg': copy_if_newer(fn, fnset, options='-a public-read') if not (fnb.endswith('.csv') or fnb.endswith('.json') or fnb.endswith('.csv.gz') or fnb.endswith('.json.gz') or fnb.endswith('.mongo.gz')): print "...unknown file type %s, skipping" % fn 
sys.stdout.flush() continue copy_if_newer(fn, fnset) # load into bigquery dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) mypath = os.path.dirname(os.path.realpath(__file__)) # load user_info_combo uicfn = lfp / 'user_info_combo.json.gz' if uicfn.exists(): uic_schema = json.loads(open('%s/schemas/schema_user_info_combo.json' % mypath).read())['user_info_combo'] bqutil.load_data_to_table(dataset, 'user_info_combo', gsdir / "user_info_combo.json.gz", uic_schema, wait=False) else: print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn # load studentmodule if fn_sm.exists(): schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read()) cwsm_schema = schemas['courseware_studentmodule'] bqutil.load_data_to_table(dataset, 'studentmodule', gsdir / fn_sm.basename(), cwsm_schema, format='csv', wait=False, skiprows=1) else: print "--> Not loading studentmodule: file %s not found" % fn_sm
def do_combine(course_id_set, project_id, outdir="DATA", nskip=0, output_project_id=None, output_dataset_id=None, output_bucket=None, use_dataset_latest=False, extract_subset_tables=True, ): ''' Combine individual person_course tables (from the set of specified course_id's) to create one single large person_course table. Do this by downloading each file, checking to make sure they all have the same fields, concatenating, and uploading back to bigquery. This is cheaper than doing a select *, and also uncovers person_course files which have the wrong schema (and it works around BQ's limitation on large result sizes). The result is stored in the course_report_latest dataset (if use_dataset_latest), else in course_report_ORG, where ORG is the configured organization name. If extract_subset_tables is True, then the subset of those who viewed (ie "participants"), and the subset of those who enrolled for IDV, are extracted and saved as person_course_viewed, and person_course_idv. (those are created using a select *, for efficiency, despite the cost). ''' print "="*77 print "Concatenating person course datasets from the following courses:" print course_id_set print "-"*77 outdir = path(outdir) if not outdir.exists(): os.mkdir(outdir) ofnset = [] cnt = 0 for course_id in course_id_set: gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__'))) ofnset.append(ofn) if (nskip>0) and ofn.exists(): print "%s already exists, not downloading" % ofn sys.stdout.flush() continue if ofn.exists(): fnset = gsutil.get_gs_file_list(gb) local_dt = gsutil.get_local_file_mtime_in_utc(ofn) fnb = 'person_course.csv.gz' if not fnb in fnset: print "%s/%s missing! skipping %s" % (gb, fnb, course_id) continue if (fnb in fnset) and (local_dt >= fnset[fnb]['date']): print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date']) sys.stdout.flush() continue else: print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date']) sys.stdout.flush() cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn) print "Retrieving %s via %s" % (course_id, cmd) sys.stdout.flush() os.system(cmd) cnt += 1 #if cnt>2: # break org = course_id_set[0].split('/',1)[0] ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')) print "="*77 print "Combining CSV files to produce %s" % ofn sys.stdout.flush() if (nskip>1) and os.path.exists(ofn): print "%s already exists, not downloading" % ofn else: first = 1 header = None for zfn in ofnset: if first: cmd = "zcat %s > %s" % (zfn, ofn) header = os.popen("zcat %s | head -1" % zfn).read().strip() firstfn = zfn else: cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn) # first row is header; don't keep when concatenating print cmd first = 0 new_header = os.popen("zcat %s | head -1" % zfn).read().strip() if not header == new_header: print "==> Warning! 
header mismatch for %s vs %s" % (zfn, firstfn) print " %s has: %s" % (firstfn, header) print " but %s has: %s" % (zfn, new_header) sys.stdout.flush() os.system(cmd) gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket) print "="*77 print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb sys.stdout.flush() cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb) print cmd os.system(cmd) gsfn = gb + '/' + ofn print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn) # import into BigQuery crname = ('course_report_%s' % org) if use_dataset_latest: crname = 'course_report_latest' dataset = output_dataset_id or crname table = ofn[:-4].replace('-','_') print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table) sys.stdout.flush() mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())['person_course'] bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id) msg = '' msg += "Combined person-course dataset, with data from:\n" msg += str(course_id_set) msg += "\n\n" msg += "="*100 + "\n" msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn) bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id) # copy the new table (which has a specific date in its name) to a generically named "person_course_latest" # so that future SQL queries can simply use this as the latest person course table print "-> Copying %s to %s.person_course_latest" % (table, dataset) bqutil.copy_bq_table(dataset, table, "person_course_latest") if extract_subset_tables: do_extract_subset_person_course_tables(dataset, table) print "Done" sys.stdout.flush()
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest): ''' Create Video stats for Videos Viewed and Videos Watched. First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched ''' assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable" # Get Course Dir path basedir = path(basedir or '') course_dir = course_id.replace('/', '__') lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) # get schema mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS) the_schema = json.loads(open(SCHEMA_FILE).read())[SCHEMA_VIDEO_AXIS_NAME] the_dict_schema = schema2dict(the_schema) # Create initial video axis videoAxisExists = False dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) va_date = None try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS) assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS) videoAxisExists = True va_date = tinfo['lastModifiedTime'] # datetime except (AssertionError, Exception) as err: print "%s --> Attempting to process %s table" % (str(err), TABLE_VIDEO_AXIS) sys.stdout.flush() # get course axis time ca_date = None try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS) ca_date = tinfo['lastModifiedTime'] # datetime except (AssertionError, Exception) as err: pass if videoAxisExists and (not force_recompute) and ca_date and va_date and ( ca_date > va_date): force_recompute = True print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % ( va_date, ca_date) sys.stdout.flush() if not videoAxisExists or force_recompute: force_recompute = True createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest) # Get video lengths va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS) assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table" va_bqdata = va['data'] fileoutput = lfp / FILENAME_VIDEO_AXIS getYoutubeDurations(dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute) # upload and import video axis gsfn = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS gsutil.upload_file_to_gs(fileoutput, gsfn) table = TABLE_VIDEO_AXIS bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True) else: print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS) # Lastly, create video stats createVideoStats_day(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest) createVideoStats(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)
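
# Illustrative sketch (not part of the pipeline above): the recompute decision in
# make_video_stats hinges on comparing the lastModifiedTime of video_axis against
# course_axis -- when the course axis is newer, the video axis is treated as stale.
# A minimal, hypothetical version of that comparison:
def _example_video_axis_is_stale(video_axis_date, course_axis_date, force_recompute=False):
    '''Return True when the video axis should be rebuilt (hypothetical helper).'''
    if force_recompute:
        return True
    if video_axis_date is None:          # video axis has never been built
        return True
    if course_axis_date is None:         # nothing to compare against; keep the existing axis
        return False
    return course_axis_date > video_axis_date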
def analyze_course_content(course_id, listings_file=None, basedir="X-Year-2-data-sql", datedir="2013-09-21", use_dataset_latest=False, do_upload=False, courses=None, verbose=True, ): ''' Compute course_content table, which quantifies: - number of chapter, sequential, vertical modules - number of video modules - number of problem, *openended, mentoring modules - number of dicussion, annotatable, word_cloud modules Do this using the course "xbundle" file, produced when the course axis is computed. Include only modules which had nontrivial use, to rule out the staff and un-shown content. Do the exclusion based on count of module appearing in the studentmodule table, based on stats_module_usage for each course. Also, from the course listings file, compute the number of weeks the course was open. If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset as the "stats_course_content" table. Also generate a "course_summary_stats" table, stored in the course_report_ORG or course_report_latest dataset. The course_summary_stats table combines data from many reports,, including stats_course_content, the medians report, the listings file, broad_stats_by_course, and time_on_task_stats_by_course. ''' if do_upload: if use_dataset_latest: org = "latest" else: org = courses[0].split('/',1)[0] # extract org from first course_id in courses crname = 'course_report_%s' % org gspath = gsutil.gs_path_from_course_id(crname) gsfnp = gspath / CCDATA gsutil.upload_file_to_gs(CCDATA, gsfnp) tableid = "stats_course_content" dataset = crname mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath try: the_schema = json.loads(open(SCHEMA_FILE).read())[tableid] except Exception as err: print "Oops! Failed to load schema file for %s. Error: %s" % (tableid, str(err)) raise if 0: bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1) table = 'course_metainfo' course_tables = ',\n'.join([('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses]) sql = "select * from {course_tables}".format(course_tables=course_tables) print "--> Creating %s.%s using %s" % (dataset, table, sql) if 1: metainfo_dataset = bqutil.get_bq_table(dataset, table, sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), ) # bqutil.create_bq_table(dataset, table, sql, overwrite=True) #----------------------------------------------------------------------------- # make course_summary_stats table # # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo. # Also use (and create if necessary) the nregistered_by_wrap table. 
# get the broad_stats_by_course data bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course') table_list = bqutil.get_list_of_table_ids(dataset) latest_person_course = max([ x for x in table_list if x.startswith('person_course_')]) print "Latest person_course table in %s is %s" % (dataset, latest_person_course) sql = """ SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, FROM [{dataset}.{person_course}] as pc left join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' )) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id """.format(dataset=dataset, person_course=latest_person_course) nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'}) # rates for registrants before and during course sql = """ SELECT *, ncertified / nregistered * 100 as pct_certified_of_reg, ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch, ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course, ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap, ncertified / nviewed * 100 as pct_certified_of_viewed, ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap, ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap, FROM ( # ------------------------ # get aggregate data SELECT pc.course_id as course_id, cminfo.wrap_date as wrap_date, count(*) as nregistered, sum(case when pc.certified then 1 else 0 end) ncertified, sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap, sum(case when pc.viewed then 1 else 0 end) nviewed, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap, sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct, sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap, sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap, sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch, sum(case when pc.start_time < cminfo.launch_date and pc.certified then 1 else 0 end) ncertified_and_registered_before_launch, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course, sum(case when (pc.start_time >= cminfo.launch_date) and (pc.start_time < cminfo.wrap_date) and pc.certified then 1 else 0 end) ncertified_and_registered_during_course, FROM [{dataset}.{person_course}] as pc left join ( # -------------------- # get course launch and wrap dates from course_metainfo SELECT AA.course_id as course_id, AA.wrap_date as wrap_date, AA.launch_date as launch_date, BB.ewrap_date as ewrap_date, FROM ( # inner get course launch and wrap dates from course_metainfo 
SELECT A.course_id as course_id, A.wrap_date as wrap_date, B.launch_date as launch_date, from ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Course Wrap' ) ) as A left outer join ( SELECT course_id, TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month, regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day, regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year, FROM [{dataset}.course_metainfo] where key='listings_Course Launch' ) ) as B on A.course_id = B.course_id # end inner course_metainfo subquery ) as AA left outer join ( SELECT course_id, TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date, FROM ( SELECT course_id, regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month, regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day, regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year, FROM [{dataset}.course_metainfo] where key='listings_Empirical Course Wrap' ) ) as BB on AA.course_id = BB.course_id # end course_metainfo subquery # -------------------- ) as cminfo on pc.course_id = cminfo.course_id group by course_id, wrap_date order by course_id # ---- end get aggregate data ) order by course_id """.format(dataset=dataset, person_course=latest_person_course) print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration' sys.stdout.flush() cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, newer_than=datetime.datetime(2015, 1, 16, 3, 0), key={'name': 'course_id'}) # start assembling course_summary_stats c_sum_stats = defaultdict(OrderedDict) for entry in bsbc['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] cmci.update(entry) cnbw = nr_by_wrap['data_by_key'][course_id] nbw = int(cnbw['nregistered_by_wrap']) cmci['nbw_wrap_date'] = cnbw['wrap_date'] cmci['nregistered_by_wrap'] = nbw cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct'] cmci['frac_female'] = float(entry['n_female_viewed']) / (float(entry['n_male_viewed']) + float(entry['n_female_viewed'])) ncert = float(cmci['certified_sum']) if ncert: cmci['certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0 else: cmci['certified_of_nregistered_by_wrap_pct'] = None cbr = cert_by_reg['data_by_key'][course_id] for field, value in cbr.items(): cmci['cbr_%s' % field] = value # add medians for viewed, explored, and certified msbc_tables = {'msbc_viewed': "viewed_median_stats_by_course", 'msbc_explored': 'explored_median_stats_by_course', 'msbc_certified': 'certified_median_stats_by_course', 'msbc_verified': 'verified_median_stats_by_course', } for prefix, mtab in msbc_tables.items(): print "--> Merging median stats data from %s" % mtab sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, mtab) for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): cmci['%s_%s' % (prefix, field)] = value # add time on task data tot_table = "time_on_task_stats_by_course" prefix = "ToT" print "--> Merging time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = 
{'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field=='course_id': continue cmci['%s_%s' % (prefix, field)] = value # add serial time on task data tot_table = "time_on_task_serial_stats_by_course" prefix = "SToT" print "--> Merging serial time on task data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field=='course_id': continue cmci['%s_%s' % (prefix, field)] = value # add show_answer stats tot_table = "show_answer_stats_by_course" prefix = "SAS" print "--> Merging show_answer stats data from %s" % tot_table sys.stdout.flush() try: bqdat = bqutil.get_table_data(dataset, tot_table) except Exception as err: bqdat = {'data': {}} for entry in bqdat['data']: course_id = entry['course_id'] cmci = c_sum_stats[course_id] for field, value in entry.items(): if field=='course_id': continue cmci['%s_%s' % (prefix, field)] = value # setup list of keys, for CSV output css_keys = c_sum_stats.values()[0].keys() # retrieve course_metainfo table, pivot, add that to summary_stats print "--> Merging course_metainfo from %s" % table sys.stdout.flush() bqdat = bqutil.get_table_data(dataset, table) def make_key(key): key = key.strip() key = key.replace(' ', '_').replace("'", "_").replace('/', '_').replace('(','').replace(')','').replace('-', '_').replace(',', '') return key listings_keys = map(make_key, ["Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", "Course Number", "Short Title", "Andrew's Short Titles", "Title", "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id", "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade", '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)" ]) listings_keys.reverse() for lk in listings_keys: css_keys.insert(1, "listings_%s" % lk) COUNTS_TO_KEEP = ['discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 'option', 'lti', 'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 'options', 'comment_prompt', 'conditional', 'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 'customtag', 'transcript', 'split_test', 'word_cloud', 'openended', 'openendedparam', 'answer_display', 'code', 'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 'textannotation', 'imageannotation', 'videosequence', 'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion'] for entry in bqdat['data']: thekey = make_key(entry['key']) # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP: # continue if thekey.startswith('listings_') and thekey[9:] not in listings_keys: # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id']) continue c_sum_stats[entry['course_id']][thekey] = entry['value'] #if 'certifies' in thekey: # print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value']) if thekey not in css_keys: 
css_keys.append(thekey) # compute forum_posts_per_week for course_id, entry in c_sum_stats.items(): nfps = entry.get('nforum_posts_sum', 0) if nfps: fppw = int(nfps) / float(entry['nweeks']) entry['nforum_posts_per_week'] = fppw print " course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry['total_assessments_per_week'], fppw) else: entry['nforum_posts_per_week'] = None css_keys.append('nforum_posts_per_week') # read in listings file and merge that in also if listings_file: if listings_file.endswith('.csv'): listings = csv.DictReader(open(listings_file)) else: listings = [ json.loads(x) for x in open(listings_file) ] for entry in listings: course_id = entry['course_id'] if course_id not in c_sum_stats: continue cmci = c_sum_stats[course_id] for field, value in entry.items(): lkey = "listings_%s" % make_key(field) if not (lkey in cmci) or (not cmci[lkey]): cmci[lkey] = value print "Storing these fields: %s" % css_keys # get schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read()) schema_dict = { x['name'] : x for x in the_schema } # write out CSV css_table = "course_summary_stats" ofn = "%s__%s.csv" % (dataset, css_table) ofn2 = "%s__%s.json" % (dataset, css_table) print "Writing data to %s and %s" % (ofn, ofn2) ofp = open(ofn, 'w') ofp2 = open(ofn2, 'w') dw = csv.DictWriter(ofp, fieldnames=css_keys) dw.writeheader() for cid, entry in c_sum_stats.items(): for ek in entry: if ek not in schema_dict: entry.pop(ek) # entry[ek] = str(entry[ek]) # coerce to be string ofp2.write(json.dumps(entry) + "\n") for key in css_keys: if key not in entry: entry[key] = None dw.writerow(entry) ofp.close() ofp2.close() # upload to bigquery # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ] if 1: gsfnp = gspath / dataset / (css_table + ".json") gsutil.upload_file_to_gs(ofn2, gsfnp) # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False, # format='csv', skiprows=1) bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False) return print "-"*60 + " %s" % course_id # get nweeks from listings lfn = path(listings_file) if not lfn.exists(): print "[analyze_content] course listings file %s doesn't exist!" % lfn return data = None for k in csv.DictReader(open(lfn)): if k['course_id']==course_id: data = k break if not data: print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn) return def date_parse(field): (m, d, y) = map(int, data[field].split('/')) return datetime.datetime(y, m, d) launch = date_parse('Course Launch') wrap = date_parse('Course Wrap') ndays = (wrap - launch).days nweeks = ndays / 7.0 print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays) course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) cfn = gsutil.path_from_course_id(course_id) xbfn = course_dir / ("xbundle_%s.xml" % cfn) if not xbfn.exists(): print "[analyze_content] cannot find xbundle file %s for %s!" 
% (xbfn, course_id) return print "[analyze_content] For %s using %s" % (course_id, xbfn) # get module usage data mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest) xml = etree.parse(open(xbfn)).getroot() counts = defaultdict(int) nexcluded = defaultdict(int) IGNORE = ['html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title'] def walk_tree(elem): if type(elem.tag)==str and (elem.tag.lower() not in IGNORE): counts[elem.tag.lower()] += 1 for k in elem: midfrag = (k.tag, k.get('url_name_orig', None)) if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20: nexcluded[k.tag] += 1 if verbose: print " -> excluding %s (%s), ncount=%s" % (k.get('display_name', '<no_display_name>').encode('utf8'), midfrag, mudata.get(midfrag, {}).get('ncount')) continue walk_tree(k) walk_tree(xml) print counts # combine some into "qual_axis" and others into "quant_axis" qual_axis = ['openassessment', 'optionresponse', 'multiplechoiceresponse', # 'discussion', 'choiceresponse', 'word_cloud', 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti'] quant_axis = ['formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse', 'imageresponse'] nqual = 0 nquant = 0 for tag, count in counts.items(): if tag in qual_axis: nqual += count if tag in quant_axis: nquant += count print "nqual=%d, nquant=%d" % (nqual, nquant) nqual_per_week = nqual / nweeks nquant_per_week = nquant / nweeks total_per_week = nqual_per_week + nquant_per_week print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (nqual_per_week, nquant_per_week, total_per_week) # save this overall data in CCDATA lock_file(CCDATA) ccdfn = path(CCDATA) ccd = {} if ccdfn.exists(): for k in csv.DictReader(open(ccdfn)): ccd[k['course_id']] = k ccd[course_id] = {'course_id': course_id, 'nweeks': nweeks, 'nqual_per_week': nqual_per_week, 'nquant_per_week': nquant_per_week, 'total_assessments_per_week' : total_per_week, } # fields = ccd[ccd.keys()[0]].keys() fields = ['course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks'] cfp = open(ccdfn, 'w') dw = csv.DictWriter(cfp, fieldnames=fields) dw.writeheader() for cid, entry in ccd.items(): dw.writerow(entry) cfp.close() lock_file(CCDATA, release=True) # store data in course_metainfo table, which has one (course_id, key, value) on each line # keys include nweeks, nqual, nquant, count_* for module types * cmfields = OrderedDict() cmfields['course_id'] = course_id cmfields['course_length_days'] = str(ndays) cmfields.update({ ('listings_%s' % key) : value for key, value in data.items() }) # from course listings cmfields.update(ccd[course_id].copy()) # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() }) # from content counts for key in sorted(counts): # store counts in sorted order, so that the later generated CSV file can have a predictable structure value = counts[key] cmfields['count_%s' % key] = str(value) # from content counts cmfields.update({ ('nexcluded_sub_20_%s' % key) : str(value) for key, value in nexcluded.items() }) # from content counts 
course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest) csvfn = course_dir / CMINFO # manual overriding of the automatically computed fields can be done by storing course_id,key,value data # in the CMINFO_OVERRIDES file csvfn_overrides = course_dir / CMINFO_OVERRIDES if csvfn_overrides.exists(): print "--> Loading manual override information from %s" % csvfn_overrides for ovent in csv.DictReader(open(csvfn_overrides)): if not ovent['course_id']==course_id: print "===> ERROR! override file has entry with wrong course_id: %s" % ovent continue print " overriding key=%s with value=%s" % (ovent['key'], ovent['value']) cmfields[ovent['key']] = ovent['value'] print "--> Course metainfo writing to %s" % csvfn fp = open(csvfn, 'w') cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value']) cdw.writeheader() for k, v in cmfields.items(): cdw.writerow({'course_id': course_id, 'key': k, 'value': v}) fp.close() table = 'course_metainfo' dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table) gsutil.upload_file_to_gs(csvfn, gsfnp) mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())[table] bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
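
# Illustrative sketch (not part of the pipeline above): course_metainfo stores one
# (course_id, key, value) row per line, so heterogeneous per-course facts (listings
# fields, module counts, nweeks, ...) all fit a single narrow schema.  Writing such a
# long-format CSV from a flat dict, as analyze_course_content does, amounts to:
def _example_write_metainfo_csv(course_id, fields, output_file):
    '''Write a {key: value} dict as (course_id, key, value) CSV rows (hypothetical helper).'''
    import csv
    cdw = csv.DictWriter(output_file, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    for key, value in fields.items():
        cdw.writerow({'course_id': course_id, 'key': key, 'value': value})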
def rephrase_forum_json_for_course( course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir=None, do_gs_copy=False, use_dataset_latest=False, ): print "Loading SQL for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() fn = 'forum.mongo' gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest) def openfile(fn, mode='r'): if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')): fn += ".gz" if fn.endswith('.gz'): return gzip.GzipFile(lfp / fn, mode) return open(lfp / fn, mode) fp = openfile(fn) ofn = lfp / "forum-rephrased.json.gz" ofncsv = "forum.csv.gz" # To match table name in BQ ofncsv_lfp = lfp / ofncsv dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) if os.path.exists(ofn) and os.path.exists(ofncsv_lfp): tables = bqutil.get_list_of_table_ids(dataset) if not 'forum' in tables: print "Already done? But no forums table loaded into datasaet %s. Redoing." % dataset else: print "Already done %s -> %s (skipping)" % (fn, ofn) print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp) sys.stdout.flush() return print "Processing %s -> writing to %s and %s (%s)" % ( fn, ofn, ofncsv, datetime.datetime.now()) sys.stdout.flush() # Setup CSV header ocsv = csv.DictWriter(openfile(ofncsv, 'w'), fieldnames=SCHEMA_DICT.keys(), quoting=csv.QUOTE_NONNUMERIC) ocsv.writeheader() cnt = 0 ofp = gzip.GzipFile('tmp.json.gz', 'w') data = OrderedDict() for line in fp: cnt += 1 # Write JSON row newline = do_rephrase_line(line, linecnt=cnt) ofp.write(newline) try: #Write CSV row data = json.loads(newline) ocsv.writerow(data) except Exception as err: print "Error writing CSV output row %s=%s" % (cnt, data) raise ofp.close() print "...done (%s)" % datetime.datetime.now() if cnt == 0: print "...but cnt=0 entries found, skipping forum loading" sys.stdout.flush() return print "...copying to gsc" sys.stdout.flush() # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away? gsfn = gsdir + '/' + "forum-rephrased.json.gz" cmd = 'gsutil cp tmp.json.gz %s' % (gsfn) os.system(cmd) os.system(cmd) table = 'forum' bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True) msg = "Original data from %s" % (lfp / fn) bqutil.add_description_to_table(dataset, table, msg, append=True) os.system('mv tmp.json.gz "%s"' % (ofn)) print "...done (%s)" % datetime.datetime.now() sys.stdout.flush()
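
# Illustrative sketch (not part of the pipeline above): the loop in the function above
# turns each rephrased JSON line into one CSV row, keeping only the columns named in
# the forum schema.  A minimal version of that pattern, with the field names passed in
# as an argument (hypothetical helper):
def _example_json_lines_to_csv(json_lines, fieldnames, output_file):
    '''Write dicts parsed from JSON lines as CSV rows restricted to fieldnames.'''
    import csv, json
    writer = csv.DictWriter(output_file, fieldnames=fieldnames,
                            quoting=csv.QUOTE_NONNUMERIC, extrasaction='ignore')
    writer.writeheader()
    for line in json_lines:
        writer.writerow(json.loads(line))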
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to bigquery

    cid = course_id
    caset = list of course axis data in dict format
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    '''
    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json'
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/', '__')))
    fp = open(cafn, 'w')

    linecnt = 0

    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not type(data) == dict:
            try:
                ca['data'] = json.loads(data)       # make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])          # datetime to string
        if ca['due'] is not None:
            ca['due'] = str(ca['due'])              # datetime to string
        if (ca['data'] is None) or (ca['data'] == ''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca) + '\n')
        except Exception as err:
            print "Failed to save! Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)   # create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "="*100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "="*100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]                               # max message length 16384
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "   Done - inserted %s records into course_axis" % len(caset)
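
# Illustrative sketch (not part of the pipeline above): BigQuery ingests the course
# axis as newline-delimited JSON, so values json cannot serialize natively (datetimes
# in particular) are converted to strings first, as do_save does.  A stand-alone
# version of that step (hypothetical helper):
def _example_write_ndjson(rows, output_file):
    '''Write a list of dicts as newline-delimited JSON, stringifying datetimes.'''
    import datetime, json
    for row in rows:
        clean = dict((k, str(v) if isinstance(v, datetime.datetime) else v)
                     for k, v in row.items())
        output_file.write(json.dumps(clean) + '\n')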
def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest, use_latest_sql_dir): ''' Create Video stats for Videos Viewed and Videos Watched. First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched ''' assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable" # Get Course Dir path basedir = path(basedir or '') course_dir = course_id.replace('/','__') lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest or use_latest_sql_dir) # get schema mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/%s' % ( mypath, SCHEMA_VIDEO_AXIS ) the_schema = json.loads(open(SCHEMA_FILE).read())[ SCHEMA_VIDEO_AXIS_NAME ] the_dict_schema = schema2dict(the_schema) # Create initial video axis videoAxisExists = False dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) va_date = None try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS ) assert tinfo is not None, "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS ) videoAxisExists = True va_date = tinfo['lastModifiedTime'] # datetime except (AssertionError, Exception) as err: print "%s --> Attempting to process %s table" % ( str(err), TABLE_VIDEO_AXIS ) sys.stdout.flush() # get course axis time ca_date = None try: tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS ) ca_date = tinfo['lastModifiedTime'] # datetime except (AssertionError, Exception) as err: pass if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date): force_recompute = True print "video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date) sys.stdout.flush() if not videoAxisExists or force_recompute: force_recompute = True createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest) # Get video lengths va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS) assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table" va_bqdata = va['data'] fileoutput = lfp / FILENAME_VIDEO_AXIS getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute ) # upload and import video axis gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS gsutil.upload_file_to_gs(fileoutput, gsfn) table = TABLE_VIDEO_AXIS bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True) else: print "[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS ) # Lastly, create video stats createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest ) createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest ) # also create person_course_video_watched createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
def rephrase_forum_json_for_course( course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir=None, do_gs_copy=False, use_dataset_latest=False, ): print "Loading SQL for course %s into BigQuery (start: %s)" % ( course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() fn = 'forum.mongo' gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest) def openfile(fn, mode='r'): if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')): fn += ".gz" if fn.endswith('.gz'): return gzip.GzipFile(lfp / fn, mode) return open(lfp / fn, mode) fp = openfile(fn) ofn = lfp / "forum-rephrased.json.gz" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) if os.path.exists(ofn): tables = bqutil.get_list_of_table_ids(dataset) if not 'forum' in tables: print "Already done? But no forums table loaded into datasaet %s. Redoing." % dataset else: print "Already done %s -> %s (skipping)" % (fn, ofn) sys.stdout.flush() return print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now()) sys.stdout.flush() cnt = 0 ofp = gzip.GzipFile('tmp.json.gz', 'w') for line in fp: cnt += 1 newline = do_rephrase_line(line, linecnt=cnt) ofp.write(newline) ofp.close() print "...done (%s)" % datetime.datetime.now() if cnt == 0: print "...but cnt=0 entries found, skipping forum loading" sys.stdout.flush() return print "...copying to gsc" sys.stdout.flush() # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away? gsfn = gsdir + '/' + "forum-rephrased.json.gz" cmd = 'gsutil cp tmp.json.gz %s' % (gsfn) os.system(cmd) os.system(cmd) table = 'forum' bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True) msg = "Original data from %s" % (lfp / fn) bqutil.add_description_to_table(dataset, table, msg, append=True) os.system('mv tmp.json.gz "%s"' % (ofn)) print "...done (%s)" % datetime.datetime.now() sys.stdout.flush()
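
# Illustrative sketch (not part of the pipeline above): the nested openfile() helper
# in the function above transparently falls back to a ".gz" variant of a file and
# opens it with gzip when needed.  The same idea as a stand-alone function:
def _example_open_maybe_gzipped(filename, mode='r'):
    '''Open filename, trying filename + ".gz" (via gzip) if the plain file is absent.'''
    import gzip, os
    if (not os.path.exists(filename)) and (not filename.endswith('.gz')):
        filename += '.gz'
    if filename.endswith('.gz'):
        return gzip.GzipFile(filename, mode)
    return open(filename, mode)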
def do_combine( course_id_set, project_id, outdir="DATA", nskip=0, output_project_id=None, output_dataset_id=None, output_bucket=None, use_dataset_latest=False, ): print "=" * 77 print "Concatenating person course datasets from the following courses:" print course_id_set print "-" * 77 outdir = path(outdir) if not outdir.exists(): os.mkdir(outdir) ofnset = [] cnt = 0 for course_id in course_id_set: gb = gsutil.gs_path_from_course_id( course_id, use_dataset_latest=use_dataset_latest) ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__'))) ofnset.append(ofn) if (nskip > 0) and ofn.exists(): print "%s already exists, not downloading" % ofn sys.stdout.flush() continue if ofn.exists(): fnset = gsutil.get_gs_file_list(gb) local_dt = gsutil.get_local_file_mtime_in_utc(ofn) fnb = 'person_course.csv.gz' if not fnb in fnset: print "%s/%s missing! skipping %s" % (gb, fnb, course_id) continue if (fnb in fnset) and (local_dt >= fnset[fnb]['date']): print "%s already exists with date %s (gs file date %s), not re-downloading" % ( ofn, local_dt, fnset[fnb]['date']) sys.stdout.flush() continue else: print "%s already exists but has date %s (gs file date %s), so re-downloading" % ( ofn, local_dt, fnset[fnb]['date']) sys.stdout.flush() cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn) print "Retrieving %s via %s" % (course_id, cmd) sys.stdout.flush() os.system(cmd) cnt += 1 #if cnt>2: # break org = course_id_set[0].split('/', 1)[0] ofn = "person_course_%s_%s.csv" % ( org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S')) print "=" * 77 print "Combining CSV files to produce %s" % ofn sys.stdout.flush() if (nskip > 1) and os.path.exists(ofn): print "%s already exists, not downloading" % ofn else: first = 1 for zfn in ofnset: if first: cmd = "zcat %s > %s" % (zfn, ofn) else: cmd = "zcat %s | tail -n +2 >> %s" % ( zfn, ofn ) # first row is header; don't keep when concatenating print cmd first = 0 os.system(cmd) gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket) print "=" * 77 print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb sys.stdout.flush() cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb) print cmd os.system(cmd) gsfn = gb + '/' + ofn print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link( gsfn) # import into BigQuery crname = ('course_report_%s' % org) if use_dataset_latest: crname = 'course_report_latest' dataset = output_dataset_id or crname table = ofn[:-4].replace('-', '_') print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table) sys.stdout.flush() mypath = os.path.dirname(os.path.realpath(__file__)) SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath the_schema = json.loads(open(SCHEMA_FILE).read())['person_course'] bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id) msg = '' msg += "Combined person-course dataset, with data from:\n" msg += str(course_id_set) msg += "\n\n" msg += "=" * 100 + "\n" msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn) bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id) print "Done" sys.stdout.flush()
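
# Illustrative sketch (not part of the pipeline above): do_combine relies on two naming
# conventions -- '/' in a course_id becomes '__' in local filenames, and the combined
# CSV's name (minus ".csv", with '-' replaced by '_') becomes the BigQuery table name.
# A hypothetical helper showing the table-name derivation:
def _example_combined_table_name(org, now):
    '''Derive the combined person_course table name from an org and a datetime (sketch).'''
    ofn = "person_course_%s_%s.csv" % (org, now.strftime('%Y-%m-%d-%H%M%S'))
    return ofn[:-4].replace('-', '_')    # e.g. person_course_MITx_2015_01_16_030000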
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.

    If wait=True then waits for loading jobs to be completed.  It's desirable to wait
    if subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''
    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)

    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")

    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]

    if verbose:
        print "-" * 77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-" * 77
        sys.stdout.flush()

    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size']) <= 45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace('-', '')   # YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)

        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date)
                    skip = False
            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (tablename, fn)
                    sys.stdout.flush()
                continue

        # if date < '2014-07-27':
        #     continue

        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000)

    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
        print "=" * 77
        sys.stdout.flush()
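
# Illustrative sketch (not part of the pipeline above): daily log files are matched to
# per-day tables by extracting the YYYY-MM-DD date from the filename and naming the
# table tracklog_YYYYMMDD, so that BigQuery table wildcards can address date ranges.
# The same derivation as a stand-alone function:
def _example_tracklog_table_name(log_filename):
    '''Return the tracklog_YYYYMMDD table name for a daily log file, or None (sketch).'''
    import re
    m = re.search(r'(\d\d\d\d-\d\d-\d\d)', log_filename)
    if not m:
        return None
    return "tracklog_%s" % m.group(1).replace('-', '')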