def do_table(self, the_sql, tablename, the_dataset=None, sql_for_description=None, check_skip=True): if check_skip: if self.skip_or_do_step(tablename) < 0: return # skip step if the_dataset is None: the_dataset = self.dataset print("Computing %s in BigQuery" % tablename) sys.stdout.flush() try: ret = bqutil.create_bq_table(the_dataset, tablename, the_sql, overwrite=True, output_project_id=self.output_project_id, sql_for_description=sql_for_description or the_sql, ) except Exception as err: print "ERROR! Failed on SQL=" print the_sql raise gsfn = "%s/%s.csv" % (self.gsbucket, tablename) bqutil.extract_table_to_gs(the_dataset, tablename, gsfn, format='csv', do_gzip=False, wait=False) msg = "CSV download link: %s" % gsutil.gs_download_link(gsfn) print msg bqutil.add_description_to_table(the_dataset, tablename, msg, append=True, project_id=self.output_project_id)
def do_combine(
        course_id_set,
        project_id,
        outdir="DATA",
        nskip=0,
        output_project_id=None,
        output_dataset_id=None,
        output_bucket=None,
        use_dataset_latest=False,
):
    '''
    Combine individual person_course CSV files (one per course in course_id_set)
    into a single large person_course table.

    Each course's person_course.csv.gz is downloaded from google cloud storage
    (unless a sufficiently fresh local copy already exists), the files are
    concatenated with shell zcat/tail (dropping repeated header rows), and the
    combined CSV is uploaded back to google storage and loaded into BigQuery
    in the course_report_<ORG> (or course_report_latest) dataset.

    nskip > 0 skips re-downloading per-course files that already exist locally;
    nskip > 1 additionally skips re-building the combined CSV.
    '''
    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)
        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue
        if ofn.exists():
            # local copy exists: compare its mtime against the gs file date
            # to decide whether a re-download is needed
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing! skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    # organization name comes from the first course_id, e.g. "MITx/6.002x/..." -> "MITx"
    org = course_id_set[0].split('/', 1)[0]

    # combined output file is timestamped so each run produces a distinct table name
    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname

    table = ofn[:-4].replace('-', '_')  # strip ".csv"; BQ table names disallow '-'

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)
    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)

    print "Done"
    sys.stdout.flush()
def rephrase_forum_json_for_course(
        course_id,
        gsbucket="gs://x-data",
        basedir="X-Year-2-data-sql",
        datedir=None,
        do_gs_copy=False,
        use_dataset_latest=False,
):
    '''
    Rephrase a course's forum.mongo dump into JSON acceptable to BigQuery,
    upload it to google cloud storage, and load it into the course's dataset
    as the "forum" table.  Skips all work when the rephrased local file exists
    and the "forum" table is already loaded.
    '''
    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest)

    def openfile(fn, mode='r'):
        # fall back to the gzipped version when the plain file is absent
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):
        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done? But no forums table loaded into datasaet %s. Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    # NOTE(review): writes to "tmp.json.gz" in the current working directory —
    # concurrent runs in the same cwd would collide; confirm single-run usage
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    # only after a successful load is the temp file moved into place,
    # so a failed run will be redone from scratch next time
    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False,
                               use_dataset_latest=False,
                               start_date=None,
                               end_date=None,
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into
    separate days.  It is ordered in time, however.  To update it, a new day's
    logs are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.

    If it already exists, then run a query on it to see what dates have already
    been done.  Then do all tracking logs except those which have already been
    done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one
                 day overlap, but make sure the SQL query only selects
                 for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log
    data in the processing.  This is done to avoid processing partial data,
    e.g. when tracking log data are incrementally loaded with a delta of less
    than one day.

    start_date = optional argument, giving min start date for logs to process,
                 in YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table
                 must be newer than this datetime, else force_recompute is made True
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)	# destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset)

    if log_dates is None:
        # daily tracking log tables are named tracklog_YYYYMMDD
        log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates)==0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(max(log_dates))	# remove the last day of data from consideration
        max_date = max(log_dates)
        print " --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-','')
        if min_date < start_date:
            print " --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-','')	# end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"	# default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (dataset, table)
            overwrite = True
        else:
            last_dates = [get_date_function(x) for x in pc_last['data']]
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, table_max_date, min_date, max_date)
                sys.stdout.flush()
                return
            # resume from (last date in table) + days_delta
            min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, table_max_date, min_date, max_date)
            sys.stdout.flush()
            overwrite = 'append'

    if overwrite=='append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (table, course_id, min_date, max_date, last_date, datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date	# save max_date information

        # advance min_date to the next existing tracklog date
        # NOTE(review): if no log date in (min_date, max_date] exists, the inner
        # for loop leaves min_date unchanged and this while loop never
        # terminates — and the "not in log_dates" check below becomes
        # unreachable; confirm intended behavior
        while min_date not in log_dates:
            print " tracklog_%s does not exist!" % min_date
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print ' min_date = max_date = %s' % min_date

        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0*1024)
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")
            # split the day's query into nhashes pieces keyed on HASH(username)
            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                print " Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")
            try:
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done; recurse (append mode) for the remainder
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(SQL, table, course_id,
                                       force_recompute=False,
                                       use_dataset_latest=use_dataset_latest,
                                       end_date=end_date,
                                       get_date_function=get_date_function,
                                       existing=existing,
                                       log_dates=log_dates,
                                       has_hash_limit=has_hash_limit,
                                       table_max_size_mb=table_max_size_mb,
                                       limit_query_size=limit_query_size,
                                       )
        return

    # normal (non-size-limited) path: query over all tracklog_* tables in the date range
    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")

    try:
        bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
    except Exception as err:
        if ( ('Response too large to return.' in str(err)) and has_hash_limit ):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(SQL, table, course_id,
                                                      force_recompute=force_recompute,
                                                      use_dataset_latest=use_dataset_latest,
                                                      end_date=max_date,
                                                      get_date_function=get_date_function,
                                                      has_hash_limit=has_hash_limit,
                                                      # existing=existing,
                                                      log_dates=log_dates,
                                                      limit_query_size=True,
                                                      )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise

                nd1 = int(ndays/2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date,
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                # second half must NOT force recompute, else part 1 is clobbered
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False,
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date,
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return

            # dead code (if False) kept from an earlier month-by-month retry strategy
            if False:
                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])
                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                               use_dataset_latest=use_dataset_latest,
                                               end_date=end_date,
                                               get_date_function=get_date_function,
                                               has_hash_limit=has_hash_limit,
                                               # existing=existing,
                                               log_dates=log_dates)
                    force_recompute = False		# after first, don't force recompute
                return

        else:
            print the_sql
            raise

    if overwrite=='append':
        txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def do_combine(
        course_id_set,
        project_id,
        outdir="DATA",
        nskip=0,
        output_project_id=None,
        output_dataset_id=None,
        output_bucket=None,
        use_dataset_latest=False,
        extract_subset_tables=True,
):
    '''
    Combine individual person_course tables (from the set of specified course_id's) to create one single large
    person_course table.  Do this by downloading each file, checking to make sure they all have the same fields,
    concatenating, and uploading back to bigquery.  This is cheaper than doing a select *, and also uncovers
    person_course files which have the wrong schema (and it works around BQ's limitation on large result sizes).
    The result is stored in the course_report_latest dataset (if use_dataset_latest), else in course_report_ORG,
    where ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (ie "participants"), and the subset
    of those who enrolled for IDV, are extracted and saved as person_course_viewed, and person_course_idv.
    (those are created using a select *, for efficiency, despite the cost).
    '''
    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)
        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue
        if ofn.exists():
            # local copy exists: compare its mtime to the gs file date to
            # decide whether re-download is needed
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing! skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    # organization name from the first course_id, e.g. "MITx/6.002x/..." -> "MITx"
    org = course_id_set[0].split('/', 1)[0]

    # combined output file is timestamped so each run yields a distinct table name
    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            # sanity check: all per-course files must share the first file's header
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning! header mismatch for %s vs %s" % (zfn, firstfn)
                print " %s has: %s" % (firstfn, header)
                print " but %s has: %s" % (zfn, new_header)
                sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname

    table = ofn[:-4].replace('-', '_')  # strip ".csv"; BQ table names disallow '-'

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)
    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)

    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
def rephrase_forum_json_for_course(course_id, gsbucket="gs://x-data", basedir="X-Year-2-data-sql", datedir=None, do_gs_copy=False, use_dataset_latest=False, ): print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now()) sys.stdout.flush() lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest) print "Using this directory for local files: ", lfp sys.stdout.flush() fn = 'forum.mongo' gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest) def openfile(fn, mode='r'): if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')): fn += ".gz" if fn.endswith('.gz'): return gzip.GzipFile(lfp / fn, mode) return open(lfp / fn, mode) fp = openfile(fn) ofn = lfp / "forum-rephrased.json.gz" dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) if os.path.exists(ofn): tables = bqutil.get_list_of_table_ids(dataset) if not 'forum' in tables: print "Already done? But no forums table loaded into datasaet %s. Redoing." % dataset else: print "Already done %s -> %s (skipping)" % (fn, ofn) sys.stdout.flush() return print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now()) sys.stdout.flush() cnt = 0 ofp = gzip.GzipFile('tmp.json.gz', 'w') for line in fp: cnt += 1 newline = do_rephrase_line(line, linecnt=cnt) ofp.write(newline) ofp.close() print "...done (%s)" % datetime.datetime.now() if cnt==0: print "...but cnt=0 entries found, skipping forum loading" sys.stdout.flush() return print "...copying to gsc" sys.stdout.flush() # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away? 
gsfn = gsdir + '/' + "forum-rephrased.json.gz" cmd = 'gsutil cp tmp.json.gz %s' % (gsfn) os.system(cmd) os.system(cmd) table = 'forum' bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True) msg = "Original data from %s" % (lfp / fn) bqutil.add_description_to_table(dataset, table, msg, append=True) os.system('mv tmp.json.gz "%s"' % (ofn)) print "...done (%s)" % datetime.datetime.now() sys.stdout.flush()
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False): ''' Save course axis data to bigquery cid = course_id caset = list of course axis data in dict format xbundle = XML bundle of course (everything except static files) datadir = directory where output files should be written log_msg = list of messages about processing errors and issues ''' # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema mypath = os.path.dirname(os.path.realpath(__file__)) the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis'] dict_schema = schema2dict(the_schema) caset = copy.deepcopy(caset_in) datadir = path(datadir) cafn = datadir / 'course_axis.json' xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/','__'))) fp = open(cafn, 'w') linecnt = 0 for ca in caset: linecnt += 1 ca['course_id'] = cid data = ca['data'] if data and not type(data)==dict: try: ca['data'] = json.loads(data) # make it native, for mongo except Exception as err: print "failed to create json for %s, error=%s" % (data, err) if ca['start'] is not None: ca['start'] = str(ca['start']) # datetime to string if ca['due'] is not None: ca['due'] = str(ca['due']) # datetime to string if (ca['data'] is None) or (ca['data']==''): ca.pop('data') check_schema(linecnt, ca, the_ds=dict_schema, coerce=True) try: # db.course_axis.insert(ca) fp.write(json.dumps(ca)+'\n') except Exception as err: print "Failed to save! 
Error=%s, data=%s" % (err, ca) fp.close() # upload axis.json file and course xbundle gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest)) if 1: gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False) gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False) # import into BigQuery dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest) bqutil.create_dataset_if_nonexistent(dataset) # create dataset if not already existent table = "course_axis" bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema) msg = "="*100 + '\n' msg += "Course axis for %s\n" % (cid) msg += "="*100 + '\n' msg += '\n'.join(log_msg) msg = msg[:16184] # max message length 16384 bqutil.add_description_to_table(dataset, table, msg, append=True) print " Done - inserted %s records into course_axis" % len(caset)
def rephrase_forum_json_for_course(
        course_id,
        gsbucket="gs://x-data",
        basedir="X-Year-2-data-sql",
        datedir=None,
        do_gs_copy=False,
        use_dataset_latest=False,
):
    '''
    Rephrase a course's forum.mongo dump into BigQuery-compatible JSON AND a
    gzipped CSV, upload the JSON to google cloud storage, and load it into the
    course dataset as the "forum" table.  Skips all work when both local output
    files exist and the "forum" table is already loaded.
    '''
    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest)

    def openfile(fn, mode='r'):
        # fall back to the gzipped version when the plain file is absent
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"
    ofncsv = "forum.csv.gz"  # To match table name in BQ
    ofncsv_lfp = lfp / ofncsv

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn) and os.path.exists(ofncsv_lfp):
        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done? But no forums table loaded into datasaet %s. Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp)
            sys.stdout.flush()
            return

    print "Processing %s -> writing to %s and %s (%s)" % (
        fn, ofn, ofncsv, datetime.datetime.now())
    sys.stdout.flush()

    # Setup CSV header
    # NOTE(review): assumes SCHEMA_DICT key order matches the BQ schema column
    # order — confirm SCHEMA_DICT is an OrderedDict
    ocsv = csv.DictWriter(openfile(ofncsv, 'w'),
                          fieldnames=SCHEMA_DICT.keys(),
                          quoting=csv.QUOTE_NONNUMERIC)
    ocsv.writeheader()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    data = OrderedDict()
    for line in fp:
        cnt += 1
        # Write JSON row
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
        try:
            # Write CSV row
            data = json.loads(newline)
            ocsv.writerow(data)
        except Exception as err:
            print "Error writing CSV output row %s=%s" % (cnt, data)
            raise
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    # move the temp JSON into place only after a successful load
    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
def do_combine(course_id_set, project_id,
               outdir="DATA",
               nskip=0,
               output_project_id=None,
               output_dataset_id=None,
               output_bucket=None,
               use_dataset_latest=False,
               extract_subset_tables=True,
               ):
    '''
    Combine individual person_course tables (from the set of specified course_id's) to create one single large
    person_course table.  Do this by downloading each file, checking to make sure they all have the same fields,
    concatenating, and uploading back to bigquery.  This is cheaper than doing a select *, and also uncovers
    person_course files which have the wrong schema (and it works around BQ's limitation on large result sizes).
    The result is stored in the course_report_latest dataset (if use_dataset_latest), else in course_report_ORG,
    where ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (ie "participants"), and the subset
    of those who enrolled for IDV, are extracted and saved as person_course_viewed, and person_course_idv.
    (those are created using a select *, for efficiency, despite the cost).

    NOTE(review): this appears to duplicate an earlier do_combine definition in
    this file; at import time the later definition takes effect — confirm which
    version is intended to survive.
    '''
    print "="*77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-"*77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)
        if (nskip>0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue
        if ofn.exists():
            # local copy exists: compare its mtime to the gs file date to
            # decide whether re-download is needed
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing! skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    # organization name from the first course_id, e.g. "MITx/6.002x/..." -> "MITx"
    org = course_id_set[0].split('/',1)[0]

    # combined output file is timestamped so each run yields a distinct table name
    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "="*77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip>1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)	# first row is header; don't keep when concatenating
            print cmd
            first = 0
            # sanity check: all per-course files must share the first file's header
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning! header mismatch for %s vs %s" % (zfn, firstfn)
                print " %s has: %s" % (firstfn, header)
                print " but %s has: %s" % (zfn, new_header)
                sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname

    table = ofn[:-4].replace('-','_')  # strip ".csv"; BQ table names disallow '-'

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "="*100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)
    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)

    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, use_dataset_latest=False,
                               start_date=None, end_date=None, get_date_function=None, existing=None,
                               log_dates=None, days_delta=1, skip_last_day=False, has_hash_limit=False,
                               newer_than=None, table_max_size_mb=800, limit_query_size=False):
    '''
    Make a certain table (with SQL given) for specified course_id, by running
    the SQL over the course's daily tracking-log tables in BigQuery.

    The master table holds all the data for a course.  It isn't split into
    separate days.  It is ordered in time, however.  To update it, a new day's
    logs are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.

    If it already exists, then run a query on it to see what dates have already
    been done.  Then do all tracking logs except those which have already been
    done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time (this function recurses
    on itself with a smaller date range to accomplish that).

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    Arguments:
      SQL               = query template; formatted with course_id, DATASETS,
                          last_date and hash_limit before being run
      table             = destination table name (in the course's dataset)
      course_id         = course whose tracking logs are to be processed
      force_recompute   = if True, overwrite the destination table instead of appending
      use_dataset_latest = passed through to dataset-name resolution
      start_date        = optional min start date for logs to process, in YYYY-MM-DD format
      end_date          = optional max date cutoff, in YYYY-MM-DD format
      get_date_function = callable mapping a destination-table row to its date;
                          used to find where the existing table leaves off
      existing          = optional pre-fetched list of table ids in the destination dataset
      log_dates         = optional pre-fetched list of YYYYMMDD dates with tracklog tables
      days_delta        = integer number of days to increase each time; specify 0 for one day
                          overlap, but make sure the SQL query only selects
                          for time > TIMESTAMP("{last_date}")
      skip_last_day     = if True then do not include the last day of tracking log data in
                          the processing.  This is done to avoid processing partial data,
                          e.g. when tracking log data are incrementally loaded with a
                          delta of less than one day.
      has_hash_limit    = if True, the SQL accepts a {hash_limit} clause, allowing a large
                          day's logs to be processed in several username-hash partitions
      newer_than        = if specified, as datetime, then any existing destination table
                          must be newer than this datetime, else force_recompute is made True
      table_max_size_mb = size threshold (MB) above which hash partitioning is used
                          when limit_query_size is set
      limit_query_size  = if True, process only one day at a time, hash-partitioning any
                          day whose tracklog table exceeds table_max_size_mb
    '''
    # Resolve destination dataset (per-course) and the dataset holding tracklog_YYYYMMDD tables.
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)  # destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset)

    if log_dates is None:
        # Dates are taken from table names of the form tracklog_YYYYMMDD ([9:] strips "tracklog_").
        log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates) == 0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        # Drop the most recent day, which may hold only partial data.
        old_max_date = max(log_dates)
        log_dates.remove(max(log_dates))  # remove the last day of data from consideration
        max_date = max(log_dates)
        print " --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        # Normalize YYYY-MM-DD -> YYYYMMDD so string comparison orders correctly.
        start_date = start_date.replace('-', '')
        if min_date < start_date:
            print " --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-', '')  # end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table; if it predates newer_than, rebuild from scratch
        table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"  # default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table, by sampling its last rows
        pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (dataset, table)
            overwrite = True
        else:
            last_dates = []
            for x in pc_last['data']:
                try:
                    add_date = get_date_function(x)
                    last_dates.append(add_date)
                except Exception as err:
                    # best-effort: skip rows whose date cannot be parsed
                    print "Error with get_date_function occurred. %s" % str(err)
                    continue
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                # existing table already covers all available log dates
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, table_max_date, min_date, max_date)
                sys.stdout.flush()
                return
            # resume processing days_delta days after the last date present in the table
            min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, table_max_date, min_date, max_date)
            sys.stdout.flush()
            overwrite = 'append'

    if overwrite == 'append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (table, course_id, min_date, max_date, last_date, datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date  # save max_date information

        # If no tracklog table exists for min_date, advance to the next available date <= max_date.
        while min_date not in log_dates:
            print " tracklog_%s does not exist!" % min_date
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date  # process exactly one day
        print ' min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0 * 1024)
        # number of username-hash partitions needed to keep each query under table_max_size_mb
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            # run the query once per hash partition, appending after the first
            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                print " Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            # table small enough: single query over the whole day
            the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")
            try:
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done: recurse (force_recompute=False so further days append)
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(SQL, table, course_id,
                                       force_recompute=False,
                                       use_dataset_latest=use_dataset_latest,
                                       end_date=end_date,
                                       get_date_function=get_date_function,
                                       existing=existing,
                                       log_dates=log_dates,
                                       has_hash_limit=has_hash_limit,
                                       table_max_size_mb=table_max_size_mb,
                                       limit_query_size=limit_query_size,
                                       )
        return

    # Normal (not limit_query_size) path: query over all tracklog tables in [min_date, max_date]
    # using legacy-SQL TABLE_QUERY to select tables by the date embedded in their names.
    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")

    try:
        bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
    except Exception as err:
        if (('Response too large to return.' in str(err)) and has_hash_limit):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    # cannot split by date; fall back to one-day-at-a-time with hash partitioning
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(SQL, table, course_id,
                                                      force_recompute=force_recompute,
                                                      use_dataset_latest=use_dataset_latest,
                                                      end_date=max_date,
                                                      get_date_function=get_date_function,
                                                      has_hash_limit=has_hash_limit,
                                                      # existing=existing,
                                                      log_dates=log_dates,
                                                      limit_query_size=True,
                                                      )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise

                # split the interval roughly in half and recurse on each part;
                # part 2 must not force recompute, so part 1's results are kept
                nd1 = int(ndays / 2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id,
                                           force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date,
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id,
                                           force_recompute=False,
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date,
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return

            # NOTE(review): dead alternative strategy (month-at-a-time splitting),
            # kept disabled by the if False guard.
            if False:
                def get_ym(x):
                    # split a YYYYMMDD string into (year, month, day) ints
                    return int(x[0:4]), int(x[4:6]), int(x[6:])
                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(SQL, table, course_id,
                                               force_recompute=force_recompute,
                                               use_dataset_latest=use_dataset_latest,
                                               end_date=end_date,
                                               get_date_function=get_date_function,
                                               has_hash_limit=has_hash_limit,
                                               # existing=existing,
                                               log_dates=log_dates)
                    force_recompute = False  # after first, don't force recompute
                return
        else:
            # unrecognized failure: show the SQL that broke, then re-raise
            print the_sql
            raise

    if overwrite == 'append':
        txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
def do_combine(course_id_set, project_id, outdir="DATA", nskip=0,
               output_project_id=None, output_dataset_id=None, output_bucket=None,
               use_dataset_latest=False,
               ):
    '''
    Combine the per-course person_course CSV datasets for all courses in
    course_id_set into a single CSV, upload it to Google Cloud Storage, and
    load it into BigQuery as a combined person-course table.

    Arguments:
      course_id_set     = list of course_id strings (org/course/run format);
                          the org of the first entry names the output files
      project_id        = BigQuery project id used in status messages
      outdir            = local directory for downloaded per-course CSV files
      nskip             = skip-level: >0 skips re-downloading existing local files,
                          >1 also skips re-combining if the combined CSV exists
      output_project_id = project used when loading/annotating the BigQuery table
      output_dataset_id = destination dataset; defaults to course_report_<org>
                          (or course_report_latest when use_dataset_latest)
      output_bucket     = GCS bucket for the combined CSV upload
      use_dataset_latest = passed through to per-course GCS path resolution

    Side effects only (shell commands via os.system, GCS uploads, BigQuery load);
    returns None.
    '''
    print "="*77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-"*77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []   # local filenames of all per-course gzipped CSVs, in course order
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip>0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            # local copy exists: compare its mtime against the GCS file date
            # and only re-download when the remote file is newer
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing! skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    # output filenames are based on the org of the first course in the set
    org = course_id_set[0].split('/',1)[0]

    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "="*77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip>1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        # concatenate the gzipped CSVs; only the first file keeps its header row
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    # -z csv: gsutil gzip-compresses the CSV in transit/storage; TMPDIR avoids filling /tmp
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname

    # table name = combined filename minus ".csv", with dashes made BigQuery-safe
    table = ofn[:-4].replace('-','_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()

    # schema is read from the person_course JSON schema file next to this module
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    # skiprows=1: the combined CSV retains a single header row
    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "="*100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)

    print "Done"
    sys.stdout.flush()