# -----------------------------------------------------------------------------
# Module-level imports assumed by the functions below.
import datetime
import glob
import json
import os
import re
import sys

import pytz
from path import path            # path.py

import bqutil                    # project-local BigQuery helpers
import gsutil                    # project-local google cloud storage helpers

# Also used below and assumed to be provided elsewhere in this package:
# DBNAME, daterange, d2dt, gs_path_from_course_id, path_from_course_id,
# get_gs_file_list, upload_file_to_gs, find_course_sql_dir, tsv2csv, openfile,
# rephrase_studentmodule_opaque_keys, do_extract_subset_person_course_tables.
# -----------------------------------------------------------------------------


def extract_logs_mongo2gs(course_id, start="2012-09-01", end="2014-09-24", verbose=False,
                          dbname=DBNAME,
                          collection='tracking_log',
                          tracking_logs_directory="TRACKING_LOGS",
                          ):
    '''
    Extract daily tracking logs for course_id from mongodb (via mongoexport),
    rephrase them, and upload to google cloud storage as one gzipped JSON file per day.
    '''
    print "extracting logs for course %s" % course_id

    # list of dates to dump
    dates = daterange(d2dt(start), d2dt(end))

    if verbose:
        print "Dates to dump:", [x['dstr'] for x in dates]

    # what files are already on gs?
    gspath = "%s/DAILY" % gs_path_from_course_id(course_id)
    gsfiles = get_gs_file_list(gspath)

    DIR = tracking_logs_directory
    if not os.path.exists(DIR):
        os.mkdir(DIR)
    DIR += '/' + path_from_course_id(course_id)
    if not os.path.exists(DIR):
        os.mkdir(DIR)

    filebuf = []    # recently written local files; keep at most 20, delete the oldest as we go

    for k in range(len(dates) - 1):
        d = dates[k]
        ofn = '%s/tracklog-%s.json.gz' % (DIR, d['dstr'])
        start = d['start']
        end = d['end']

        ofnb = os.path.basename(ofn)

        if ofnb in gsfiles:
            print "Already have %s, skipping" % ofnb
            sys.stdout.flush()
            continue

        # dump tracking log of a certain date using mongoexport, if needed
        if not os.path.exists(ofn):
            # db.tracking_log.find({'course_id': "HarvardX/ER22x/2013_Spring",
            #        'time': { '$gte': "2013-08-01T00:00:00.000000", '$lt': "2013-08-02T00:00:00.000000" }}).count()
            query = '{"course_id": "%s", "time": {"$gte": "%s", "$lt": "%s" }}' % (course_id, start, end)
            cmd = "mongoexport -d %s -c %s -q '%s'| edx2bigquery rephrase_logs | gzip -9 > %s" % (dbname, collection, query, ofn)
            os.system(cmd)

        upload_file_to_gs(ofn, gspath + '/' + ofnb)

        filebuf.append(ofn)

        if len(filebuf) > 20:
            ffn = filebuf.pop(0)
            os.unlink(ffn)
            print "...Deleted %s" % ffn
            sys.stdout.flush()
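
# extract_logs_mongo2gs relies on the daterange and d2dt helpers defined elsewhere
# in this package.  The versions below are hypothetical minimal sketches, written
# only to document the interface assumed above: d2dt parses "YYYY-MM-DD", and
# daterange yields one dict per day with a 'dstr' label plus 'start' and 'end'
# timestamps in the "%Y-%m-%dT%H:%M:%S.%f" form used in the mongo query comment.
def d2dt_sketch(datestr):
    # hypothetical stand-in for d2dt
    return datetime.datetime.strptime(datestr, '%Y-%m-%d')


def daterange_sketch(start, end):
    # hypothetical stand-in for daterange: one entry per day from start to end
    days = []
    d = start
    while d <= end:
        days.append({'dstr': d.strftime('%Y-%m-%d'),
                     'start': d.strftime('%Y-%m-%dT%H:%M:%S.%f'),
                     'end': (d + datetime.timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S.%f')})
        d += datetime.timedelta(days=1)
    return days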
def process_dir(course_id, gspath='gs://x-data', logs_directory="TRACKING_LOGS", verbose=True):

    cdir = path(logs_directory) / gsutil.path_from_course_id(course_id)

    print "=" * 77
    print "Transferring tracking logs for %s from directory %s (start %s)" % (course_id, cdir, datetime.datetime.now())
    print "=" * 77

    if not os.path.exists(cdir):
        print "Oops! non-existent course tracking logs directory %s" % cdir
        return

    sys.stdout.flush()
    cdir = path(cdir)
    gp = path(gspath + "/" + cdir.basename()) / 'DAILY'
    filelist = gsutil.get_gs_file_list(gp)
    local_files = glob.glob(cdir / 'tracklog*.gz')
    local_files.sort()

    for fn in local_files:
        fnp = path(fn)
        fnb = fnp.basename()
        statbuf = os.stat(fn)
        mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)

        # do some date checking to upload log files which have changed, and are newer than that on google cloud storage
        local = pytz.timezone("America/New_York")
        local_dt = local.localize(mt, is_dst=None)
        utc_dt = local_dt.astimezone(pytz.utc)

        if fnb in filelist and filelist[fnb]['date'] > utc_dt:
            if verbose:
                print "%s already exists, skipping" % fn
            continue
        elif fnb in filelist:
            print "%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, filelist[fnb]['date'], mt)

        cmd = 'gsutil cp %s %s' % (fn, gp + '/')
        print cmd
        sys.stdout.flush()
        os.system(cmd)

    print "done with %s (%s)" % (cdir, datetime.datetime.now())
    print "-" * 77
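
# process_dir (and the loaders below) index the result of gsutil.get_gs_file_list
# by file basename and read 'name', 'size', and 'date' from each entry, comparing
# 'date' against a timezone-aware UTC datetime.  The function below is a
# hypothetical sketch, shown only to document that assumed return shape; it is
# not the package's actual implementation.
def get_gs_file_list_sketch(gspath):
    # Assumed shape: {basename: {'name': full gs:// url,
    #                            'size': size in bytes (string),
    #                            'date': timezone-aware UTC datetime}}
    fnset = {}
    for line in os.popen("gsutil ls -l %s" % gspath):
        fields = line.strip().split()
        if len(fields) != 3:          # skip the trailing "TOTAL: ..." summary line
            continue
        size, isodate, name = fields
        dt = datetime.datetime.strptime(isodate.rstrip('Z').split('.')[0], '%Y-%m-%dT%H:%M:%S')
        fnset[os.path.basename(name)] = {'name': name,
                                         'size': size,
                                         'date': pytz.utc.localize(dt)}
    return fnset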
def do_combine(course_id_set, project_id,
               outdir="DATA",
               nskip=0,
               output_project_id=None,
               output_dataset_id=None,
               output_bucket=None,
               use_dataset_latest=False,
               extract_subset_tables=True,
               ):
    '''
    Combine individual person_course tables (from the set of specified course_id's) to create
    one single large person_course table.  Do this by downloading each file, checking to make
    sure they all have the same fields, concatenating, and uploading back to bigquery.  This is
    cheaper than doing a select *, and also uncovers person_course files which have the wrong
    schema (and it works around BQ's limitation on large result sizes).  The result is stored in
    the course_report_latest dataset (if use_dataset_latest), else in course_report_ORG, where
    ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (ie "participants"),
    and the subset of those who enrolled for IDV, are extracted and saved as
    person_course_viewed and person_course_idv.  (Those are created using a select *, for
    efficiency, despite the cost.)
    '''
    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing! skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1

    org = course_id_set[0].split('/', 1)[0]

    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not recombining" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)   # first row is header; don't keep when concatenating
            print cmd
            first = 0
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning! header mismatch for %s vs %s" % (zfn, firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname

    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)

    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
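
# do_combine calls do_extract_subset_person_course_tables, which is defined
# elsewhere in this package.  Per the docstring above, it materializes two
# SELECT * subsets of the combined table: person_course_viewed (participants)
# and person_course_idv (IDV enrollees).  The function below is a hypothetical
# sketch only: it assumes the person_course schema marks participants with a
# boolean "viewed" field and IDV enrollees with a non-null "verified_enroll_time",
# and that a bqutil.create_bq_table(dataset, tablename, sql, overwrite=True)
# helper runs a query and stores the result as a table; verify those assumptions
# against the real schema and bqutil before relying on this.
def do_extract_subset_person_course_tables_sketch(the_dataset, pc_table):
    subsets = [('person_course_viewed', 'viewed'),                           # assumed field
               ('person_course_idv', 'verified_enroll_time is not null')]    # assumed field
    for tablename, condition in subsets:
        the_sql = "SELECT * FROM [%s.%s] WHERE %s" % (the_dataset, pc_table, condition)
        print "--> Creating %s.%s" % (the_dataset, tablename)
        sys.stdout.flush()
        bqutil.create_bq_table(the_dataset, tablename, the_sql, overwrite=True)   # assumed signature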
def load_sql_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir="2014-09-21",
                        do_gs_copy=False,
                        use_dataset_latest=False):
    '''
    Load SQL files into google cloud storage, then import into BigQuery.

    Datasets are typically named by course_id, with "__" replacing "/", and "_" replacing "."

    If use_dataset_latest then "_latest" is appended to the dataset name.
    Thus, the latest SQL dataset can always be put in a consistently named dataset.
    '''
    print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    # convert studentmodule if necessary
    fn_sm = lfp / 'studentmodule.csv.gz'
    if not fn_sm.exists():
        fn_sm = lfp / 'studentmodule.csv'
        if not fn_sm.exists():
            fn_sm = lfp / 'studentmodule.sql.gz'
            if not fn_sm.exists():
                fn_sm = lfp / 'studentmodule.sql'
                if not fn_sm.exists():
                    print "Error! Missing studentmodule.[sql,csv][.gz]"
            if fn_sm.exists():    # have .sql or .sql.gz version: convert to .csv
                newfn = lfp / 'studentmodule.csv.gz'
                print "--> Converting %s to %s" % (fn_sm, newfn)
                tsv2csv(fn_sm, newfn)
                fn_sm = newfn

    if fn_sm.exists():
        # rephrase studentmodule if it's using opaque keys
        fline = ''
        smfp = openfile(fn_sm)
        fline = smfp.readline()    # skip first line - it's a header
        fline = smfp.readline()
        if 'block-v1:' in fline or 'course-v1' in fline:
            rephrase_studentmodule_opaque_keys(fn_sm)

    def convert_sql(fnroot):
        if os.path.exists(fnroot + ".csv") or os.path.exists(fnroot + ".csv.gz"):
            return
        if os.path.exists(fnroot + ".sql") or os.path.exists(fnroot + ".sql.gz"):
            infn = fnroot + '.sql'
            outfn = fnroot + '.csv.gz'
            print "--> Converting %s to %s" % (infn, outfn)
            tsv2csv(infn, outfn)

    # convert sql files if necessary
    fnset = ['users', 'certificates', 'enrollment', 'profiles', 'user_id_map', 'rolecourse', 'roleforum']
    for fn in fnset:
        convert_sql(lfp / fn)

    local_files = glob.glob(lfp / '*')

    # if using latest date directory, also look for course_image.jpg one level up
    if use_dataset_latest:
        print lfp.dirname()
        ci_files = glob.glob(lfp.dirname() / 'course_image.jpg')
        if ci_files:
            local_files += list(ci_files)
            print "--> local course_image file: %s" % ci_files

    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket=gsbucket, use_dataset_latest=use_dataset_latest)

    local = pytz.timezone("America/New_York")

    if do_gs_copy:
        try:
            fnset = get_gs_file_list(gsdir)
        except Exception as err:
            fnset = []

        def copy_if_newer(fn, fnset, options='-z csv,json'):
            statbuf = os.stat(fn)
            mt = datetime.datetime.fromtimestamp(statbuf.st_mtime)

            # do some date checking to upload files which have changed, and are newer than that on google cloud storage
            local_dt = local.localize(mt, is_dst=None)
            utc_dt = local_dt.astimezone(pytz.utc)

            fnb = os.path.basename(fn)
            if fnb in fnset and fnset[fnb]['date'] > utc_dt:
                print "...%s already copied, skipping" % fn
                sys.stdout.flush()
                return
            elif fnb in fnset:
                print "...%s already exists, but has date=%s and mtime=%s, re-uploading" % (fn, fnset[fnb]['date'], mt)

            gsutil.upload_file_to_gs(fn, gsdir / fnb, options=options, verbose=True)

        for fn in local_files:
            fnb = os.path.basename(fn)
            if fnb == 'course_image.jpg':
                copy_if_newer(fn, fnset, options='-a public-read')
            if not (fnb.endswith('.csv') or fnb.endswith('.json') or fnb.endswith('.csv.gz')
                    or fnb.endswith('.json.gz') or fnb.endswith('.mongo.gz')):
                print "...unknown file type %s, skipping" % fn
                sys.stdout.flush()
                continue
            copy_if_newer(fn, fnset)

    # load into bigquery
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    mypath = os.path.dirname(os.path.realpath(__file__))

    # load user_info_combo
    uicfn = lfp / 'user_info_combo.json.gz'
    if uicfn.exists():
        uic_schema = json.loads(open('%s/schemas/schema_user_info_combo.json' % mypath).read())['user_info_combo']
        bqutil.load_data_to_table(dataset, 'user_info_combo', gsdir / "user_info_combo.json.gz", uic_schema, wait=False)
    else:
        print "--> File %s does not exist, not loading user_info_combo into BigQuery" % uicfn

    # load studentmodule
    if fn_sm.exists():
        schemas = json.loads(open('%s/schemas/schemas.json' % mypath).read())
        cwsm_schema = schemas['courseware_studentmodule']
        bqutil.load_data_to_table(dataset, 'studentmodule', gsdir / fn_sm.basename(), cwsm_schema,
                                  format='csv', wait=False, skiprows=1)
    else:
        print "--> Not loading studentmodule: file %s not found" % fn_sm
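
# load_sql_for_course converts the tab-separated .sql dumps to gzipped CSV via
# tsv2csv, which is defined elsewhere in this package.  The function below is a
# hypothetical minimal sketch of that conversion, assuming plain tab-separated
# text in (gzipped when the filename ends in .gz) and gzipped CSV out; the real
# helper may also handle escaping of embedded tabs and newlines.
import csv
import gzip


def tsv2csv_sketch(infn, outfn):
    # open input/output, honoring .gz extensions
    fin = gzip.open(infn, 'rb') if str(infn).endswith('.gz') else open(infn, 'rb')
    fout = gzip.open(outfn, 'wb') if str(outfn).endswith('.gz') else open(outfn, 'wb')
    writer = csv.writer(fout, dialect='excel', quoting=csv.QUOTE_MINIMAL)
    for line in fin:
        # each tab-separated row becomes one properly quoted CSV row
        writer.writerow(line.rstrip('\n').split('\t'))
    fin.close()
    fout.close()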
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.

    If wait=True then waits for loading jobs to be completed.  It's desirable to wait
    if subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''
    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)

    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")

    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]

    if verbose:
        print "-" * 77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-" * 77
        sys.stdout.flush()

    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size']) <= 45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace('-', '')    # YYYYMMDD for compatibility with table wildcards

        file_date = fninfo['date'].replace(tzinfo=None)

        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date)
                    skip = False
            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (tablename, fn)
                sys.stdout.flush()
                continue

        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000)

    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
        print "=" * 77
        sys.stdout.flush()
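
# The tracklog_YYYYMMDD naming above is chosen "for compatibility with table
# wildcards": in BigQuery legacy SQL, the daily tables can then be queried as a
# group with TABLE_DATE_RANGE.  The query below is illustrative only; the
# dataset name and date range are placeholders, not values produced by this code.
example_wildcard_sql = """
SELECT username, count(*) AS n_events
FROM (TABLE_DATE_RANGE([HarvardX__ER22x__2013_Spring_logs.tracklog_],
                       TIMESTAMP('2013-08-01'), TIMESTAMP('2013-08-07')))
GROUP BY username
ORDER BY n_events DESC
"""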