# -----------------------------------------------------------------------------
# Reconstructed imports (inferred from usage below).  bqutil and gsutil come
# from the surrounding edx2bigquery codebase; "mem" (a memcache-like client
# used by cached_get_bq_table) is provided by the host application and is not
# defined in this excerpt.

import os
import re
import sys
import json
import math
import codecs
import logging
import datetime

from path import path          # path.py
from jinja2 import Template    # template rendering for external scripts

import bqutil
import gsutil


def already_exists(course_id, use_dataset_latest=False):
    '''
    Return True if course_axis table already exists, and is sufficiently new.
    '''
    table = "course_axis"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    # tables = bqutil.get_list_of_table_ids(dataset)
    # return table in tables
    try:
        mdt = bqutil.get_bq_table_last_modified_datetime(dataset, table)
    except Exception as err:
        return False
    if mdt < datetime.datetime(2015, 10, 31, 17, 00):
        return False
    return True
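
# Hedged usage sketch: the course_id below is hypothetical, and bqutil is
# assumed to be configured with credentials for the course's BigQuery project.
# Tables last modified before the 2015-10-31 17:00 cutoff above count as stale.
def _example_check_course_axis(course_id="MITx/6.002x/2013_Spring"):
    if already_exists(course_id, use_dataset_latest=True):
        print "course_axis for %s is present and sufficiently new" % course_id
    else:
        print "course_axis for %s is missing or stale; recompute it" % course_id
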
def obsolete_process_course(course_id, force_recompute=False, check_dates=True):
    '''
    Make person_course_day tables for specified course_id.  This version
    produces one table for each day.  It is inefficient when there are many
    days with very small daily tracking log tables.
    '''

    PCDAY_SQL = """
    select username,
           "{course_id}" as course_id,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo,
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG(case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                    else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end) as avg_dt,
           STDDEV(case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                       else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end) as sdv_dt,
           MAX(case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                    else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end) as max_dt,
           COUNT(case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                      else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end) as n_dt,
           SUM(case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                    else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end) as sum_dt
    from (
        SELECT username,
               case when event_type = "play_video" then 1 else 0 end as bvideo,
               case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
               case when username != "" then 1 else 0 end as bevent,
               case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
               case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
               case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
               case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
               case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
               case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
               case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
               # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
               # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll,
               time,
               lag(time, 1) over (partition by username order by time) last_time
        FROM [{dataset}.{table_id}]
        WHERE NOT event_type contains "/xblock/"
              AND username != ""
    )
    group by course_id, username
    order by sdv_dt desc
    """

    course_dir = course_id.replace('/', '__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))

    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
        date = table_id[9:]

        table_out = 'pcday_%s' % date

        if (table_out in pcday_tables) and not force_recompute:
            skip = True
            if check_dates:
                table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out)
                log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id)
                if log_table_date > table_out_date:
                    skip = False
                    print "%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (table_out, table_out_date, log_table_date)
            if skip:
                print "%s...already done, skipping" % table_out
                sys.stdout.flush()
                continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out),

        the_sql = PCDAY_SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id)

        sys.stdout.flush()

        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
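
# The 5-minute cutoff in PCDAY_SQL treats any inter-event gap longer than 300
# seconds as NULL so long absences don't inflate time-on-task statistics.  A
# pure-Python sketch of the same capping rule (illustrative only; event_times
# is a sorted list of hypothetical epoch-second timestamps for one user):
def _example_capped_dt_stats(event_times, cutoff=5 * 60):
    dts = [t2 - t1 for t1, t2 in zip(event_times, event_times[1:])]
    kept = [dt for dt in dts if dt <= cutoff]   # dt > cutoff -> NULL in the SQL
    avg_dt = sum(kept) / float(len(kept)) if kept else None
    return {'n_dt': len(kept), 'sum_dt': sum(kept), 'avg_dt': avg_dt}
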
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False,
                               use_dataset_latest=False,
                               start_date=None,
                               end_date=None,
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    Make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into
    separate days.  It is ordered in time, however.  To update it, a new day's
    logs are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.

    If it already exists, then run a query on it to see what dates have already
    been done.  Then do all tracking logs except those which have already been
    done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for
                 one day overlap, but make sure the SQL query only selects
                 for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log
    data in the processing.  This is done to avoid processing partial data,
    e.g. when tracking log data are incrementally loaded with a delta of less
    than one day.

    start_date = optional argument, giving min start date for logs to process,
                 in YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table
                 must be newer than this datetime, else force_recompute is made True
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)  # destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset)

    if log_dates is None:
        log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates) == 0:
        print "--> no tracking logs in %s, aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(max(log_dates))    # remove the last day of data from consideration
        max_date = max(log_dates)
        print "  --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-', '')
        if min_date < start_date:
            print "  --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-', '')    # end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"       # default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (dataset, table)
            overwrite = True
        else:
            last_dates = []
            for x in pc_last['data']:
                try:
                    last_dates.append(get_date_function(x))
                except Exception as err:
                    print "Error with get_date_function occurred. %s" % str(err)
                    continue
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, table_max_date, min_date, max_date)
                sys.stdout.flush()
                return
            min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, table_max_date, min_date, max_date)
            sys.stdout.flush()
            overwrite = 'append'

    if overwrite == 'append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (table, course_id, min_date, max_date, last_date, datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date    # save max_date information

        while min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
            if min_date not in log_dates:
                print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
                raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '  min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0 * 1024)
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")
            try:
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(SQL, table, course_id,
                                       force_recompute=False,
                                       use_dataset_latest=use_dataset_latest,
                                       end_date=end_date,
                                       get_date_function=get_date_function,
                                       existing=existing,
                                       log_dates=log_dates,
                                       has_hash_limit=has_hash_limit,
                                       table_max_size_mb=table_max_size_mb,
                                       limit_query_size=limit_query_size,
                                       )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")

    try:
        bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
    except Exception as err:
        if ('Response too large to return.' in str(err)) and has_hash_limit:
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"

        elif 'Resources exceeded during query execution' in str(err):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(SQL, table, course_id,
                                                      force_recompute=force_recompute,
                                                      use_dataset_latest=use_dataset_latest,
                                                      end_date=max_date,
                                                      get_date_function=get_date_function,
                                                      has_hash_limit=has_hash_limit,
                                                      # existing=existing,
                                                      log_dates=log_dates,
                                                      limit_query_size=True,
                                                      )
                elif ndays < 1:
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise

                nd1 = int(ndays / 2)
                nd2 = ndays - nd1
                # if nd2 > nd1:
                #     nd1 = nd2
                #     nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id,
                                           force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date,
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id,
                                           force_recompute=False,
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date,
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return

            if False:
                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])
                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(SQL, table, course_id,
                                               force_recompute=force_recompute,
                                               use_dataset_latest=use_dataset_latest,
                                               end_date=end_date,
                                               get_date_function=get_date_function,
                                               has_hash_limit=has_hash_limit,
                                               # existing=existing,
                                               log_dates=log_dates)
                    force_recompute = False    # after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite == 'append':
        txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
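
# Hedged usage sketch for run_query_on_tracking_logs.  The SQL must accept the
# {DATASETS}, {last_date}, and {hash_limit} placeholders, emit rows in
# increasing date order, and filter on time > {last_date} so appends don't
# duplicate rows.  The table name, course_id, and the row format assumed by
# get_date_function are hypothetical.
def _example_daily_event_counts(course_id="MITx/6.002x/2013_Spring"):
    SQL = """
        SELECT DATE(time) as date, COUNT(*) as nevents
        FROM {DATASETS}
        WHERE time > TIMESTAMP("{last_date}")
              {hash_limit}
        GROUP BY date
        ORDER BY date
    """
    def get_date_function(row):
        # rows come back date-ordered; parse the date field of each row
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')
    run_query_on_tracking_logs(SQL, "stats_events_per_day", course_id,
                               get_date_function=get_date_function,
                               skip_last_day=True)
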
def run_external_script(extcmd, param, ecinfo, course_id):
    """
    Run external script on specified course.

    extcmd    = string specifying external command to run
    param     = command line parameters, including extparam
    ecinfo    = external command info from edx2bigquery_config
    course_id = course_id to run external command on
    """
    # use default for base set of parameters
    ed_name = ecinfo.get('default_parameters', 'DEFAULT')
    settings = ecinfo.get(ed_name, {})
    settings.update(ecinfo.get(extcmd, {}))
    # print "settings: ", json.dumps(settings, indent=4)

    print settings['name']

    if param.verbose:
        print settings.get('description', '')

    cidns = course_id.replace('/', '__')
    cidns_nodots = course_id.replace('/', '__').replace('.', '_').replace('-', '_')

    mypath = path(os.path.realpath(__file__)).dirname()
    edx2bigquery_context = {'lib': mypath / "lib",
                            'bin': mypath / "bin",
                            }

    the_template = settings['template'].format(**edx2bigquery_context)
    fnpre = settings['filename_prefix']
    lfn = "%s-%s.log" % (fnpre, cidns)
    if settings.get('logs_dir'):
        lfn = path(settings['logs_dir']) / lfn

    try:
        ofn = settings['script_fn'].format(filename_prefix=fnpre, cidns=cidns)
    except Exception as err:
        print "oops, error %s" % str(err)
        print "settings=", json.dumps(settings, indent=4)
        raise

    cwd = os.getcwd()

    the_date = str(datetime.datetime.now())

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=param.use_dataset_latest)
    table_prefix = dataset

    if param.force_recompute:
        param.force_recompute = 1
    else:
        param.force_recompute = 0

    context = {'course_id': course_id,
               'script_name': ofn,
               'the_date': the_date,
               'cidns': cidns,
               'cidns_nodots': cidns_nodots,
               'template_file': the_template,
               'log_file': lfn,
               'filename_prefix': fnpre,
               'filename_prefix_cidns': "%s__%s" % (fnpre, cidns),
               'working_dir': cwd,
               'table_prefix': table_prefix,
               'lib_dir': edx2bigquery_context['lib'],
               'bin_dir': edx2bigquery_context['bin'],
               }
    context.update(settings)
    context.update(param.__dict__)

    rundir = settings['run_dir'].format(**context)
    runcmd = settings['script_cmd'].format(**context)

    tem = codecs.open(the_template).read()
    tem = unicode(tem)
    try:
        # script_file = tem.format(**context)
        script_file = Template(tem).render(**context)
    except Exception as err:
        print "Oops, cannot properly format template %s" % the_template
        print "Error %s" % str(err)
        print "context: ", json.dumps(context, indent=4)
        raise

    ofndir = path(ofn).dirname()
    if not os.path.exists(ofndir):
        print "[Warning] Directory %s doesn't exist - creating it" % ofndir
        os.mkdir(ofndir)
    fp = codecs.open(ofn, 'w', encoding="utf8")
    fp.write(script_file)
    fp.close()
    print "Generated %s" % ofn

    # if depends_on is defined, and force_recompute is not true, then skip
    # run if output already exists and is newer than all depends_on tables.
    depends_on = settings.get('depends_on')
    output_table = settings.get('output_table')
    if depends_on and not type(depends_on) == list:
        depends_on = [depends_on]
    do_compute = param.force_recompute
    if (not param.force_recompute) and depends_on and output_table:
        # does output already exist?
        has_output = False
        try:
            tinfo = bqutil.get_bq_table_info(dataset, output_table)
            if tinfo:
                has_output = True
        except:
            pass
        if not has_output:
            print "Output table %s.%s doesn't exist: running" % (dataset, output_table)
            do_compute = True
        else:
            table_date = tinfo['lastModifiedTime']
            for deptab in depends_on:
                try:
                    dtab_date = bqutil.get_bq_table_last_modified_datetime(dataset, deptab)
                except Exception as err:
                    raise Exception("[run_external] missing dependent table %s.%s" % (dataset, deptab))
                if not dtab_date:
                    raise Exception("[run_external] missing dependent table %s.%s" % (dataset, deptab))
                if table_date and dtab_date > table_date:
                    do_compute = True
                    break
            if not do_compute:
                print "Output table %s.%s exists and is newer than %s, skipping" % (dataset, output_table, depends_on)

    if do_compute:
        os.chdir(rundir)
        print "Working directory: %s" % rundir
        print "Logging to %s" % lfn
        print "Run command: %s" % runcmd
        sys.stdout.flush()
        if not param.skiprun:
            start = datetime.datetime.now()

            if param.submit_condor:
                condor_template_fn = settings.get('condor_job_template', '').format(**edx2bigquery_context)
                if not condor_template_fn:
                    raise Exception("[run_external] missing condor_job_template specification for %s" % (extcmd))
                condor_submit_fn = "CONDOR/{filename_prefix}-{cidns}.submit".format(**context)
                context.update({'MEMORY': 32768,
                                'arguments': '{script_name}'.format(**context),
                                'executable': context['script_cmd'],
                                'input_file': '',
                                'filename': condor_submit_fn,
                                })
                condor_template = Template(open(condor_template_fn).read()).render(**context)
                dirs = ['CONDOR', 'JOBS']
                for dir in dirs:
                    if not os.path.exists(dir):
                        os.mkdir(dir)
                fp = open(condor_submit_fn, 'w')
                fp.write(condor_template)
                fp.close()
                cmd = "condor_submit %s" % condor_submit_fn
                print cmd
                jobid = None
                for k in os.popen(cmd):
                    m = re.search('submitted to cluster ([0-9]+)', k)
                    if m:
                        jobid = m.group(1)
                dt = str(datetime.datetime.now())
                jobfile = 'condor_jobs.csv'
                open(jobfile, 'a').write("%s,%s,%s,%s\n" % (course_id, dt, jobid, lfn))
                print "[%s] Submitted as condor job %s at %s" % (course_id, jobid, dt)
                # print "[run_external] submitted %s, job=%s" % (extcmd, jobnum)
                return
            else:
                os.system(runcmd)

            if settings.get('type') == "stata":
                # cleanup leftover log file after stata batch run
                batch_log = ofn.split('.')[0] + ".log"
                if os.path.exists(batch_log):
                    os.unlink(batch_log)
                    print "Removed old log file %s" % batch_log

            end = datetime.datetime.now()
            has_output = False
            try:
                tinfo = bqutil.get_bq_table_info(dataset, output_table)
                if tinfo:
                    has_output = True
            except:
                pass
            success = has_output
            dt = end - start
            print "[run_external] DONE WITH %s, success=%s, dt=%s" % (extcmd, success, dt)
            sys.stdout.flush()
            if param.parallel and not success:
                raise Exception("[run_external] External command %s failed on %s" % (extcmd, course_id))
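
# Hedged sketch of the ecinfo structure run_external_script expects, inferred
# from the keys it reads above.  All names, paths, and commands here are
# hypothetical, not a real edx2bigquery_config:
_EXAMPLE_ECINFO = {
    'default_parameters': 'DEFAULT',
    'DEFAULT': {'logs_dir': 'LOGS'},
    'make_report': {
        'name': 'make_report',
        'description': 'render and run the course report script',
        'template': '{lib}/report_template.do',        # rendered with jinja2
        'filename_prefix': 'report',
        'script_fn': 'SCRIPTS/{filename_prefix}-{cidns}.do',
        'run_dir': '.',
        'script_cmd': 'stata -b do {script_name}',
        'type': 'stata',
        'depends_on': 'person_course',   # output must be newer than this table
        'output_table': 'course_report',
    },
}
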
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.

    If wait=True then waits for loading jobs to be completed.  It's desirable
    to wait if subsequent jobs which need these tables (like person_day) are
    to be run immediately afterwards.
    '''

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)

    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")

    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]

    if verbose:
        print "-" * 77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-" * 77
        sys.stdout.flush()

    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size']) <= 45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search(r'(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace('-', '')    # YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)

        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date)
                    skip = False
            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (tablename, fn)
                sys.stdout.flush()
                continue

        # if date < '2014-07-27':
        #     continue

        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000)

    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
        print "=" * 77
    sys.stdout.flush()
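
# Hedged usage sketch: bucket name and course_id are hypothetical.  wait=True
# blocks until the load jobs finish, so downstream queries (e.g. person_day)
# can run immediately against the fresh tracklog_YYYYMMDD tables.
def _example_load_daily_logs(course_id="MITx/6.002x/2013_Spring"):
    load_all_daily_logs_for_course(course_id, gsbucket="gs://my-edx-data",
                                   wait=True, check_dates=True)
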
def cached_get_bq_table(self, dataset, table, sql=None, key=None, drop=None,
                        logger=None, ignore_cache=False,
                        depends_on=None,
                        force_query=False,
                        force_newer_than=None,
                        startIndex=0, maxResults=1000000,
                        allowLargeResults=False,
                        raise_exception=False,
                        project_id=None,
                        ):
    '''
    Get a dataset from BigQuery; use memcache.

    If "depends_on" is provided (as a list of strings), and if the desired
    table already exists, then check to make sure it is newer than any of the
    tables listed in "depends_on".

    If force_newer_than is set (should be a datetime) then in the depends_on
    testing, use that date as an override, such that the SQL is re-run if the
    existing table is older than this date.

    project_id: if specified, overrides the default BigQuery project ID (for the actual query)
    '''
    if logger is None:
        logger = logging.info
    memset = '%s.%s' % (dataset, table)
    if startIndex:
        memset += '-%d-%d' % (startIndex, maxResults)
    data = mem.get(memset)

    optargs = {}
    if project_id:
        optargs['project_id'] = project_id

    table_date = None
    if depends_on is not None:
        # get the latest mod time of tables in depends_on:
        modtimes = [bqutil.get_bq_table_last_modified_datetime(*(x.split('.', 1)), **optargs) for x in depends_on]
        latest = max([x for x in modtimes if x is not None] or [None])

        if not latest:
            raise Exception("[datasource.cached_get_bq_table] Cannot get last mod time for %s (got %s), needed by %s.%s" % (depends_on, modtimes, dataset, table))

        if force_newer_than and force_newer_than > latest:
            latest = force_newer_than

        if data and data.get('lastModifiedTime', None):
            # data has a mod time, let's see if that has expired
            if data.get('lastModifiedTime', None) < latest:
                ignore_cache = True

        # get the mod time of the computed table, if it exists
        try:
            table_date = bqutil.get_bq_table_last_modified_datetime(dataset, table, **optargs)
        except Exception as err:
            if 'Not Found' in str(err):
                table_date = None
                ignore_cache = True
                logging.info("[datasource.cached_get_bq_table] Table %s.%s doesn't exist, forcing recomputation" % (dataset, table))
            else:
                raise

        if table_date and table_date < latest:
            ignore_cache = True
            if sql:
                force_query = True
                logging.info("[datasource.cached_get_bq_table] Forcing query recomputation of %s.%s, table_date=%s, latest=%s" % (dataset, table, table_date, latest))
            else:
                logging.info("[datasource.cached_get_bq_table] Forcing cache reload of %s.%s, table_date=%s, latest=%s" % (dataset, table, table_date, latest))

        # logging.info("[datasource.cached_get_bq_table] %s.%s table_date=%s, latest=%s, force_query=%s" % (dataset, table, table_date, latest, force_query))

    if (not data) or ignore_cache or (not data['data']):    # data['data']=None if table was empty, and in that case try again
        try:
            data = bqutil.get_bq_table(dataset, table, sql, key=key, logger=logger,
                                       force_query=force_query,
                                       startIndex=startIndex,
                                       maxResults=maxResults,
                                       allowLargeResults=allowLargeResults,
                                       **optargs)
            if force_query or not table_date:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, table, **optargs)
            data['last_modified_date'] = table_date
        except Exception as err:
            logging.error(err)
            if raise_exception:
                raise
            data = {'fields': {}, 'field_names': [], 'data': [], 'data_by_key': {}}
            return data        # don't cache empty result
        data['depends_on'] = depends_on
        if (drop is not None) and drop:
            for key in drop:
                data.pop(key)  # because data can be too huge for memcache ("Values may not be more than 1000000 bytes in length")
        try:
            mem.set(memset, data, time=3600 * 12)
        except Exception as err:
            logging.error('error doing mem.set for %s.%s from bigquery' % (dataset, table))
    self.bqdata[table] = data
    return data
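
# Hedged usage sketch (dataset and table names are hypothetical; "self" is an
# instance of the host class that defines cached_get_bq_table and bqdata).
# The cached result is re-fetched, and the SQL re-run, whenever person_course
# is newer than the computed stats_overall table:
#
#     data = self.cached_get_bq_table(
#         "course_report_latest", "stats_overall",
#         sql="SELECT count(*) as n FROM [course_report_latest.person_course]",
#         depends_on=["course_report_latest.person_course"],
#     )
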