def do_extract_subset_person_course_tables(the_dataset, pc_table):
    '''
    Extract (from the latest person_course table, specified by dataset and table),
    rows for which viewed = True (as person_course_viewed), and verified_enroll_time
    is not Null (for person_course_idv).
    '''
    the_sql = "SELECT * from [%s.%s] where viewed" % (the_dataset, pc_table)
    tablename = "person_course_viewed"
    try:
        ret = bqutil.create_bq_table(the_dataset, tablename, the_sql,
                                     overwrite=True,
                                     allowLargeResults=True,
                                     )
    except Exception as err:
        print "ERROR! Failed on SQL="
        print the_sql
        raise
    print " --> created %s" % tablename

    the_sql = "SELECT * from [%s.%s] where verified_enroll_time is not NULL" % (the_dataset, pc_table)
    tablename = "person_course_idv"
    try:
        ret = bqutil.create_bq_table(the_dataset, tablename, the_sql,
                                     overwrite=True,
                                     allowLargeResults=True,
                                     )
    except Exception as err:
        print "ERROR! Failed on SQL="
        print the_sql
        raise
    print " --> created %s" % tablename
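
# Usage sketch (illustrative only): the dataset name below is hypothetical; any
# dataset that already contains a person_course table would work the same way.
def _example_extract_viewed_and_idv_subsets():
    """Minimal sketch of how the subset-extraction helper above might be called."""
    the_dataset = "MITx__6_002x__2013_Spring_latest"   # assumed dataset name
    do_extract_subset_person_course_tables(the_dataset, "person_course")
    # --> creates person_course_viewed and person_course_idv in the_dataset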
def do_table(self, the_sql, tablename, the_dataset=None, sql_for_description=None, check_skip=True):

    if check_skip:
        if self.skip_or_do_step(tablename) < 0:
            return	# skip step

    if the_dataset is None:
        the_dataset = self.dataset

    print("Computing %s in BigQuery" % tablename)
    sys.stdout.flush()
    try:
        ret = bqutil.create_bq_table(the_dataset, tablename, the_sql,
                                     overwrite=True,
                                     output_project_id=self.output_project_id,
                                     sql_for_description=sql_for_description or the_sql,
                                     )
    except Exception as err:
        print "ERROR! Failed on SQL="
        print the_sql
        raise

    gsfn = "%s/%s.csv" % (self.gsbucket, tablename)
    bqutil.extract_table_to_gs(the_dataset, tablename, gsfn,
                               format='csv',
                               do_gzip=False,
                               wait=False)

    msg = "CSV download link: %s" % gsutil.gs_download_link(gsfn)
    print msg
    bqutil.add_description_to_table(the_dataset, tablename, msg, append=True, project_id=self.output_project_id)
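
# Call-pattern sketch (illustrative): do_table is a method, so "self" is assumed
# to be an analysis object providing dataset, gsbucket, output_project_id and
# skip_or_do_step.  A hypothetical stats table might be built like this:
#
#   the_sql = "SELECT countryLabel, count(*) as n FROM [%s.person_course] GROUP BY countryLabel" % self.dataset
#   self.do_table(the_sql, "stats_country_counts")
#
# The result is written to BigQuery, exported as CSV to self.gsbucket, and the
# CSV download link is appended to the output table's description.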
def obsolete_process_course(course_id, force_recompute=False, check_dates=True):
    '''
    make person_course_day tables for specified course_id.  This version
    produces one table for each day.  It is inefficient when there are
    many days with very small daily tracking log tables.
    '''

    PCDAY_SQL = """
    select username,
           "{course_id}" as course_id,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo,
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                     else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as avg_dt,
           STDDEV( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                     else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as sdv_dt,
           MAX( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                     else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as max_dt,
           COUNT( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                     else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as n_dt,
           SUM( case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
                     else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end ) as sum_dt
    from
      (SELECT username,
        case when event_type = "play_video" then 1 else 0 end as bvideo,
        case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
        case when username != "" then 1 else 0 end as bevent,
        case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
        case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
        case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
        case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
        case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
        case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
        case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
        # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
        # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
        time,
        lag(time, 1) over (partition by username order by time) last_time
      FROM [{dataset}.{table_id}]
      WHERE NOT event_type contains "/xblock/"
         AND username != ""
      )
    group by course_id, username
    order by sdv_dt desc
    """

    course_dir = course_id.replace('/', '__')

    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))

    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
        date = table_id[9:]
        table_out = 'pcday_%s' % date

        if (table_out in pcday_tables) and not force_recompute:
            skip = True
            if check_dates:
                table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out)
                log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id)
                if log_table_date > table_out_date:
                    skip = False
                    print "%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (table_out, table_out_date, log_table_date)
            if skip:
                print "%s...already done, skipping" % table_out
                sys.stdout.flush()
                continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out),

        the_sql = PCDAY_SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id)

        sys.stdout.flush()

        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
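
# Naming-convention sketch (illustrative): the per-day loop above assumes daily
# tracking-log tables named "tracklog_YYYYMMDD"; table_id[9:] strips the
# "tracklog_" prefix to recover the date used in the output table name.
def _example_pcday_table_name(table_id="tracklog_20130901"):    # hypothetical table id
    """Return the pcday_* output table name for a given daily tracking-log table."""
    assert table_id.startswith('tracklog')
    return 'pcday_%s' % table_id[9:]    # --> "pcday_20130901"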
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False,
                               use_dataset_latest=False,
                               start_date=None,
                               end_date=None,
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into
    separate days.  It is ordered in time, however.  To update it, a new day's
    logs are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all the existing tracking logs.

    If it already exists, then run a query on it to see what dates have already
    been done.  Then do all tracking logs except those which have already been
    done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day
                 overlap, but make sure the SQL query only selects for
                 time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument, giving min start date for logs to process, in
                 YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table must
                 be newer than this datetime, else force_recompute is made True
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)	# destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset)

    if log_dates is None:
        log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates) == 0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(max(log_dates))	# remove the last day of data from consideration
        max_date = max(log_dates)
        print " --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-', '')
        if min_date < start_date:
            print " --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-', '')	# end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"    	# default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (dataset, table)
            overwrite = True
        else:
            last_dates = [get_date_function(x) for x in pc_last['data']]
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, table_max_date, min_date, max_date)
                sys.stdout.flush()
                return
            min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, table_max_date, min_date, max_date)
            sys.stdout.flush()
            overwrite = 'append'

    if overwrite == 'append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (table, course_id, min_date, max_date, last_date, datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date	# save max_date information

        while min_date not in log_dates:
            print " tracklog_%s does not exist!" % min_date
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print ' min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0 * 1024)
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                print " Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")
            try:
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False,
                                       use_dataset_latest=use_dataset_latest,
                                       end_date=end_date,
                                       get_date_function=get_date_function,
                                       existing=existing,
                                       log_dates=log_dates,
                                       has_hash_limit=has_hash_limit,
                                       table_max_size_mb=table_max_size_mb,
                                       limit_query_size=limit_query_size,
                                       )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")

    try:
        bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
    except Exception as err:
        if (('Response too large to return.' in str(err)) and has_hash_limit):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                      use_dataset_latest=use_dataset_latest,
                                                      end_date=max_date,
                                                      get_date_function=get_date_function,
                                                      has_hash_limit=has_hash_limit,
                                                      # existing=existing,
                                                      log_dates=log_dates,
                                                      limit_query_size=True,
                                                      )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise

                nd1 = int(ndays / 2)
                nd2 = ndays - nd1
                # if nd2 > nd1:
                #     nd1 = nd2
                #     nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date,
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False,
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date,
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return

            if False:
                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])
                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                               use_dataset_latest=use_dataset_latest,
                                               end_date=end_date,
                                               get_date_function=get_date_function,
                                               has_hash_limit=has_hash_limit,
                                               # existing=existing,
                                               log_dates=log_dates)
                    force_recompute = False		# after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite == 'append':
        txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
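
# Usage sketch (illustrative): run_query_on_tracking_logs expects an SQL template
# with {DATASETS}, {course_id}, {last_date} and {hash_limit} placeholders, plus a
# get_date_function that extracts a datetime from each returned row.  The table
# name and SQL below are hypothetical, not part of the pipeline.
EXAMPLE_DAILY_EVENT_SQL = """
    SELECT "{course_id}" as course_id,
           date(time) as date,
           count(*) as nevents,
    FROM {DATASETS}
    WHERE time > TIMESTAMP("{last_date}")
          {hash_limit}
    GROUP BY course_id, date
    ORDER BY date
"""

def _example_get_date(row):
    """Date-extraction callback for the sketch above (row is a dict for one result row)."""
    return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

def _example_run_daily_event_counts(course_id):
    """Append any new days of hypothetical stats_daily_events data for course_id."""
    run_query_on_tracking_logs(EXAMPLE_DAILY_EVENT_SQL, "stats_daily_events", course_id,
                               get_date_function=_example_get_date,
                               has_hash_limit=True)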
def old_process_course(course_id, force_recompute=False):
    '''
    DEPRECATED - instead of creating one table per day, because there is so little
    total data, create one enrollday_all table (see other function below).

    make enrollday2_* tables for specified course_id
    '''

    SQL = """
            SELECT
                "{course_id}" as course_id,
                time,
                event_struct.user_id as user_id,
                (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "honor") then 1
                      when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "honor") then -1
                      else 0 end) as diff_enrollment_honor,
                (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "verified") then 1
                      when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "verified") then -1
                      else 0 end) as diff_enrollment_verified,
                (case when (event_type = "edx.course.enrollment.activated" and event_struct.mode = "audit") then 1
                      when (event_type = "edx.course.enrollment.deactivated" and event_struct.mode = "audit") then -1
                      else 0 end) as diff_enrollment_audit,
            FROM [{dataset}.{table_id}]
            where (event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated")
            order by time;
            """

    course_dir = course_id.replace('/', '__')

    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))

    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    # print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
        date = table_id[9:]
        table_out = 'enrollday2_%s' % date
        if (table_out in pcday_tables) and not force_recompute:
            print "%s...already done, skipping" % table_id
            sys.stdout.flush()
            continue
        if bqutil.get_bq_table_size_rows(log_dataset, table_id) == 0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue
        print ("Creating %s " % table_out),
        the_sql = SQL.format(course_id=course_id, dataset=log_dataset, table_id=table_id)
        sys.stdout.flush()
        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
def CreateForumPerson(course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None,
                      has_hash_limit=False, hash_limit=HASH):
    '''
    Create Forum Person table, based on forum events and forum posts tables.
    This table contains both read and writes for all forum posts, for all users.
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_PERSON

    original_the_sql = """
        SELECT (case when PP.username_fp is not null then PP.username_fp else FE.username_fe end) as username,
               "{course_id}" as course_id,
               (case when PP.username_fp is not null then PP.slug_id else FE.slug_id end) as slug_id,
               (case when PP.username_fp is not null then PP.slug_type else FE.slug_type end) as slug_type,
               (case when PP.username_fp is not null then PP.thread_id else FE.thread_id end) as thread_id,
               (case when PP.username_fp is not null then PP.parent_id else FE.parent_id end) as parent_id,
               (case when PP.original_poster is not null then PP.original_poster else FE.original_poster end) as original_poster,
               (case when PP.responded_to is not null then PP.responded_to else FE.responded_to end) as responded_to,
               (case when PP.username_fp is not null then PP.title else FE.title end) as title,
               (case when PP.username_fp is not null then PP.wrote else 0 end) as wrote,
               FE.read as read,
               FE.pin as pinned,
               FE.upvote as upvoted,
               FE.unvote as unvoted,
               #FE.del as deleted,
               FE.follow as followed,
               (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) <= TIMESTAMP(FE.last_time)) then TIMESTAMP(PP.first_time)
                     else (case when PP.first_time is not null and FE.last_time is null then TIMESTAMP(PP.first_time)
                                else (case when FE.first_time is not null then TIMESTAMP(FE.first_time) else FE.last_time end) end) end) as first_time,
               (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) >= TIMESTAMP(FE.last_time)) then TIMESTAMP(PP.first_time)
                     else (case when PP.first_time is not null and FE.last_time is null then TIMESTAMP(PP.first_time)
                                else (case when FE.last_time is not null then TIMESTAMP(FE.last_time) else FE.first_time end) end) end) as last_time,
        FROM
        (
            # Find 1st level posting => "original_post"
            SELECT username as username_fp,
                   slug_id,
                   slug_type,
                   thread_id,
                   parent_id,
                   original_poster,
                   responded_to,
                   title,
                   1 as wrote,
                   #created_at as first_time,
                   first_time
            FROM [{dataset}.{forum_posts}]
            {hash_limit_where}
            ORDER by username_fp, first_time
        ) PP
        FULL OUTER JOIN EACH
        (
            SELECT username as username_fe,
                   MIN(TIMESTAMP(time)) as first_time,
                   MAX(TIMESTAMP(time)) as last_time,
                   slug_id,
                   FE.thread_id as thread_id,
                   FIRST(parent_id) as parent_id,
                   F.slug_type as slug_type,
                   F.original_poster as original_poster,
                   F.responded_to as responded_to,
                   F.title as title,
                   #1 as read,
                   sum(case when forum_action = "read" or forum_action = "read_inline" then 1 else 0 end) as read,
                   sum(case when forum_action = "pin" then 1 else 0 end) as pin,
                   sum(case when forum_action = "upvote" then 1 else 0 end) as upvote,
                   sum(case when forum_action = "unvote" then 1 else 0 end) as unvote,
                   #sum(case when forum_action = "delete" then 1 else 0 end) as del,
                   sum(case when forum_action = "follow_thread" then 1 else 0 end) as follow,
            FROM [{dataset}.{forum_events}] FE
            JOIN EACH
            (
                SELECT username as username_fe,
                       slug_id,
                       slug_type,
                       thread_id,
                       parent_id,
                       original_poster,
                       responded_to,
                       title,
                       first_time,
                FROM [{dataset}.{forum_posts}]
                {hash_limit_where}
            ) as F
            ON F.thread_id = FE.thread_id
            WHERE ((FE.forum_action = "read") or
                   (FE.forum_action = "read_inline") or
                   (FE.forum_action = "pin") or
                   (FE.forum_action = "upvote") or
                   (FE.forum_action = "unvote") or
                   #(FE.forum_action = "delete") or
                   (FE.forum_action = "follow_thread"))
                   {hash_limit_and}
            GROUP BY username_fe, slug_id, thread_id, slug_type, original_poster, responded_to, title
        ) as FE
        ON (PP.username_fp = FE.username_fe) AND (PP.slug_id = FE.slug_id)
        WHERE (PP.username_fp is not null and PP.username_fp != '') or (FE.username_fe is not null and FE.username_fe != '')
    """

    the_sql = original_the_sql.format(dataset=dataset, course_id=course_id, forum=TABLE_FORUM,
                                      forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS,
                                      hash_limit_and='', hash_limit_where='')

    print "[make_forum_analysis] Creating %s.%s table for %s" % (dataset, TABLE_FORUM_PERSON, course_id)
    sys.stdout.flush()

    try:
        tinfo_fe = bqutil.get_bq_table_info(dataset, TABLE_FORUM_EVENTS)
        trows_fe = int(tinfo_fe['numRows'])
        print "[make_forum_analysis] %s Forum Events found " % trows_fe
        tinfo_fp = bqutil.get_bq_table_info(dataset, TABLE_FORUM_POSTS)
        trows_fp = int(tinfo_fp['numRows'])
        print "[make_forum_analysis] %s Forum Posts found " % trows_fp

        assert tinfo_fe is not None and trows_fe != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % (TABLE_FORUM_PERSON, TABLE_FORUM_EVENTS)
        assert tinfo_fp is not None and trows_fp != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % (TABLE_FORUM_PERSON, TABLE_FORUM_POSTS)

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s and/or %s (including 0 rows in table)? Skipping creation of %s" % (dataset, TABLE_FORUM_POSTS, TABLE_FORUM_EVENTS, TABLE_FORUM_PERSON)
        sys.stdout.flush()
        return

    # Now try to create table
    try:
        if has_hash_limit:
            overwrite = True
            hash_limit = int(hash_limit)
            for k in range(hash_limit):
                hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % (hash_limit, k)
                hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % (hash_limit, k)
                retry_the_sql = original_the_sql.format(dataset=dataset, forum=TABLE_FORUM,
                                                        forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS,
                                                        hash_limit_and=hash_limit_and, hash_limit_where=hash_limit_where)
                print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                sys.stdout.flush()
                bqutil.create_bq_table(dataset, table, retry_the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"
        else:
            overwrite = True
            bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)

    except Exception as err:
        has_hash_limit = True
        if ((('Response too large to return.' in str(err)) or ('Resources exceeded during query execution' in str(err))) and has_hash_limit):
            # 'Resources exceeded during query execution'
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit
            try:
                for k in range(hash_limit):
                    hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % (hash_limit, k)
                    hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % (hash_limit, k)
                    retry_the_sql = original_the_sql.format(dataset=dataset, forum=TABLE_FORUM,
                                                            forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS,
                                                            hash_limit_and=hash_limit_and, hash_limit_where=hash_limit_where)
                    print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                    sys.stdout.flush()
                    bqutil.create_bq_table(dataset, table, retry_the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                    overwrite = "append"

            except Exception as err:
                if ((('Response too large to return.' in str(err)) or ('Resources exceeded during query execution' in str(err))) and has_hash_limit):
                    hash_limit = int(hash_limit * 2.0)
                    print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit
                    CreateForumPerson(course_id, force_recompute, use_dataset_latest, skip_last_day, end_date,
                                      has_hash_limit=True, hash_limit=hash_limit)
                else:
                    print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
                    raise
        else:
            print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
            raise

    print "Done with Forum Person for %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()

    return
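
# Usage sketch (illustrative): forum_events and forum_posts (TABLE_FORUM_EVENTS /
# TABLE_FORUM_POSTS) must already exist in the course dataset; the course id below
# is a hypothetical example.
def _example_create_forum_person():
    """Build the forum person table, letting the hashed-retry path handle large courses."""
    CreateForumPerson("MITx/6.002x/2013_Spring", use_dataset_latest=True)
    # For a course known to be very large, hashing can be requested up front:
    # CreateForumPerson("MITx/6.002x/2013_Spring", has_hash_limit=True, hash_limit=HASH)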
def actual_ajax_get_report_data(self, report_name=None):
    '''
    get data for custom report.
    parameters like course_id, chapter_id, problem_id are passed in as GET or POST parameters

    Defined parameters for SQL:

    {person_course} --> person_course table for the specific course
    {dataset} --> dataset for the specific course
    {course_report} --> course_report_* dataset for the ORG or latest
    {course_report_org} --> course_report_ORG dataset for ORG = ORGANIZATION_NAME
    {orgname} --> organization name
    '''
    crm, pdata, auth_ok, msg = self.custom_report_auth_check(report_name)	# crm = CourseReport model
    if not auth_ok:
        return self.no_auth_sorry()
    course_id = pdata['course_id']
    force_query = pdata.get('force_query', False)
    if force_query == 'false':
        force_query = False
    ignore_cache = pdata.get('ignore_cache', False) or force_query

    # project_id specified?
    optargs = {}
    if 'project_id' in (crm.meta_info or {}):
        optargs['project_id'] = crm.meta_info['project_id']

    if course_id:
        dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest())
        pdata['person_course'] = '[%s.person_course]' % dataset
    elif 'dataset' in (crm.meta_info or {}):
        dataset = crm.meta_info['dataset']
    else:
        dataset = self.get_course_report_dataset()
        # using course report dataset; list the tables, to determine which is the latest
        # person_course dataset, and use that for {person_course}
        pdata['person_course_latest'] = self.find_latest_person_course_table(dataset, project_id=optargs.get('project_id'))
        pdata['person_course'] = '[%s.%s]' % (dataset, pdata['person_course_latest'])
    pdata['dataset'] = dataset
    pdata['course_report'] = self.get_course_report_dataset()
    pdata['course_report_org'] = self.get_course_report_dataset(force_use_org=True)
    pdata['orgname'] = self.ORGNAME
    pdata['sane_username'] = self.user.replace(' ', '_').replace('.', '_').replace('@', '_')

    if 'module_id' in pdata and pdata['module_id']:
        url_name = pdata['module_id'].rsplit('/', 1)[-1]
        pdata['module_url_name'] = url_name.replace(':', '__').replace('-', '_')

    # what table?  get custom course report configuration metadata for report name as specified
    table = crm.table_name
    if not table or table == "None":
        error = "No table name defined!  Cannot process this custom report"
        data = {'error': error}
        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json.dumps(data))
        return

    # multiple table names?  use parameters to select one
    if ',' in table:
        tables = table.split(',')
        try:
            table_number = int(pdata.get('table_number', 0) or 0)
            table = tables[table_number]
        except Exception as err:
            raise Exception("[custom_reports] Cannot select table from tables=%s, table_number=%s, err=%s" % (tables, pdata.get('table_number'), err))

    # allow parameters in table name
    if '{' in table:
        table = table.format(**pdata)
        table = table.replace('-', '_').replace(' ', '_')
    if not ('dataset' in (crm.meta_info or {})) and not table.startswith('stats_') and not (crm.meta_info.get('no_stats_ok')):
        table = "stats_" + table

    # special handling for person_course table from particular dataset
    for m in re.findall('{person_course__([^ \}]+)}', crm.sql):
        org = m
        org_dataset = self.get_course_report_dataset(orgname=org)
        pcd = '[%s.%s]' % (org_dataset, self.find_latest_person_course_table(org_dataset, project_id=optargs.get('project_id')))
        pdata['person_course__' + org] = pcd
        logging.info('[cr] org=%s, pc=%s.%s' % (org, org_dataset, pcd))

    # special handling for course_report tables for specific orgs
    for m in re.findall('{course_report__([^ \}]+)}', crm.sql):
        org = m
        org_dataset = self.get_course_report_dataset(orgname=org)
        pdata['course_report__' + org] = org_dataset

    logging.info("Using %s for custom report %s person_course" % (pdata.get('person_course'), report_name))

    error = None

    def setup_sql_flags():
        if 'sql_flags' in pdata:
            if not type(pdata['sql_flags']) == dict:
                try:
                    pdata['sql_flags'] = json.loads(pdata['sql_flags'])
                except Exception as err:
                    msg = "Cannot parse sql_flags as JSON!  sql_flags=%s" % pdata['sql_flags']
                    msg += str(err)
                    logging.error(msg)
                    raise Exception(msg)

    # dynamic sql: the SQL is allowed to change based on input parameters
    # do this by treating the SQL as a jinja2 template
    if crm.meta_info.get('dynamic_sql'):
        setup_sql_flags()
        # a little sanity checking - disallow spaces in any sql_flags values
        sf = pdata['sql_flags']
        for k in sf:
            if ' ' in sf[k]:
                msg = "Illegal sql_flags %s=%s!" % (k, sf[k])
                msg += "sql_flags = %s" % json.dumps(sf, indent=4)
                error = "<pre>%s</pre>" % (msg.replace('<', '&lt;').replace('>', '&gt;'))
                data = {'error': error}
                self.response.headers['Content-Type'] = 'application/json'
                self.response.out.write(json.dumps(data))
                return

        try:
            sql_template = Template(crm.sql)
            sql = sql_template.render(pdata)
        except Exception as err:
            msg = 'Custom report data: failed to render dynamic SQL with pdata=%s, err=%s' % (pdata, err)
            logging.error(msg)
            logging.error('sql=%s' % crm.sql)
            error = "<pre>%s</pre>" % (msg.replace('<', '&lt;').replace('>', '&gt;'))
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(data))
            return

        # append username to table name
        table = table + "_%s" % pdata['sane_username']

        force_query = True		# for now, all dynamic_sql is done with force_query
        ignore_cache = True
        the_sql = sql
    else:
        the_sql = crm.sql

    if 1:
        # generate SQL and depends_on
        try:
            sql = the_sql.format(**pdata)
        except Exception as err:
            msg = "Custom report data: failed to prepare SQL, err=%s" % str(err)
            msg += '\npdata = %s' % pdata
            logging.error(msg)
            if self.is_superuser():
                error = "<pre>%s</pre>" % (str(msg).replace('<', '&lt;').replace('>', '&gt;'))
                data = {'error': error}
                self.response.headers['Content-Type'] = 'application/json'
                self.response.out.write(json.dumps(data))
                logging.error("Returning with error message")
                return
            raise

    def strip_brackets(x):
        x = x.strip()
        if x.startswith('[') and x.endswith(']'):
            x = x[1:-1]
            return x
        return x

    if crm.meta_info.get('debug_sql'):
        msg = "debug_sql is true; not running SQL.  This is the SQL which would have been run:\n"
        msg += sql
        msg += "\n\nwith these parameters:\n"
        msg += json.dumps(pdata, indent=4)
        msg += "\n\nproducing the output table: %s.%s\n" % (dataset, table)
        error = "<pre>%s</pre>" % (msg.replace('<', '&lt;').replace('>', '&gt;'))
        data = {'error': error}
        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json.dumps(data))
        return

    try:
        if crm.depends_on and (not crm.depends_on == "None"):
            depends_on = [strip_brackets(x.format(**pdata)) for x in (json.loads(crm.depends_on or "[]"))]
        else:
            depends_on = None
    except Exception as err:
        logging.error("for course report %s, cannot process depends_on=%s" % (report_name, crm.depends_on))
        raise Exception("Bad format for the 'depends_on' setting in the custom report specification")

    # get the data, and do query if needed
    logging.info('custom report get_report_data name=%s, table=%s.%s, depends_on=%s, pdata=%s' % (report_name, dataset, table, depends_on, pdata))

    the_msg = []

    def my_logger(msg):
        logging.info(msg)
        the_msg.append(msg)

    def output_error(err):
        error = "<pre>%s</pre>" % (str(err).replace('<', '&lt;').replace('>', '&gt;'))
        logging.error('custom report error %s' % error)
        logging.error(err)
        logging.error(traceback.format_exc())
        # raise
        if self.is_superuser():
            msg = ('\n'.join(the_msg))
            msg = msg.replace('<', '&lt;').replace('>', '&gt;')
            error += "<pre>%s</pre>" % msg
            error += "SQL: <pre>%s</pre>" % sql
            error += "Parameters: <pre>%s</pre>" % json.dumps(pdata, indent=4)
            error += "optargs: <pre>%s</pre>" % json.dumps(optargs, indent=4)
        data = {'error': error}
        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json.dumps(data))

    # is the request "indexed", meaning only matching rows of the table are to be returned?
    indexed_column = crm.meta_info.get('indexed')
    if indexed_column:
        if type(indexed_column) == list:
            indexed_columns = indexed_column
            try:
                table_number = int(pdata.get('table_number', 0) or 0)
                indexed_column = indexed_columns[table_number]
            except Exception as err:
                raise Exception("[custom_reports] Cannot select indexed_column from indexed_columns=%s, table_number=%s, err=%s" % (indexed_columns, pdata.get('table_number'), err))
        setup_sql_flags()
        indexed_value = pdata.get('sql_flags', {}).get('indexed_value')
        logging.info("[custom_reports] retrieving %s.%s with indexing on %s to match value %s" % (dataset, table, indexed_column, indexed_value))
        if not indexed_value:
            my_logger('Error: missing sql_flags.indexed_value to match indexed column %s in %s.%s' % (indexed_column, dataset, table))
            data = {'error': msg}
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(data))
            return

        # ensure existence of indexed version of table.  By convention, that is a table named tablename + "__indexed_" + indexed_column
        # the table has a SHA1 hash of the indexed column added, and is sorted according to the last few characters
        # of the SHA1 hash.
        indexed_table = table + "__indexed_" + indexed_column
        indexing_sql_template = """select *,
                              SUBSTR(TO_BASE64(SHA1(STRING({indexed_column}))),-3,2) as index_sha1_2ch,
                              ROW_NUMBER() over (order by index_sha1_2ch, {indexed_column}) as index_row_number{subnum},
                          from [{dataset}.{table}]
                          {where_clause}
                          order by index_sha1_2ch, {indexed_column}
                       """
        indexing_sql = indexing_sql_template.format(dataset=dataset, table=table, indexed_column=indexed_column,
                                                    where_clause="", subnum="")
        try:
            bqdata = self.cached_get_bq_table(dataset, indexed_table,
                                              sql=indexing_sql,
                                              logger=my_logger,
                                              depends_on=["%s.%s" % (dataset, table)],
                                              raise_exception=True,
                                              ignore_cache=ignore_cache,
                                              force_query=force_query,
                                              startIndex=0,
                                              maxResults=1,
                                              **optargs)
        except Exception as err:
            if "Response too large to return" in str(the_msg):
                # hmm - table too large!  can't workaround using allowLargeResult because the return results
                # need to be ordered.  So let's break it up into multiple queries, appending each,
                # by index_sha1_2ch
                b64chars = "+/0123456789" + ''.join(map(chr, range(ord('A'), ord('Z') + 1))) + ''.join(map(chr, range(ord('a'), ord('z') + 1)))
                # get table size, divide by 64M, to get number of divisions to use
                tinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
                nbytes = int(tinfo['numBytes'])
                ndivs = int(round(nbytes / (64 * 1024 * 1024)))
                logging.info("Response too large - nbytes=%s, so trying ndivs=%s" % (nbytes, ndivs))
                end_idx = None
                start_idx = None
                dn = int(64 / ndivs)
                offset = dn
                overwrite = True
                nrows = 0
                while (offset < 65):
                    start_idx = end_idx
                    last_row_index = nrows	# note ROW_NUMBER() starts with 1 (not zero)
                    if (offset < 64):
                        end_idx = b64chars[offset] + "+"
                    else:
                        end_idx = None	# boundary case
                    wc = "where "
                    if start_idx:
                        wc += '(SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) >= "%s") ' % (indexed_column, start_idx)
                    else:
                        wc += "True "
                    if end_idx:
                        wc += 'AND (SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) < "%s") ' % (indexed_column, end_idx)
                    logging.info("--> start_idx=%s, end_idx=%s, starting row %d" % (start_idx, end_idx, last_row_index))
                    tmp_sql = indexing_sql_template.format(dataset=dataset, table=table, indexed_column=indexed_column,
                                                           where_clause=wc, subnum="_sub")
                    indexing_sql = "SELECT *, index_row_number_sub + %d as index_row_number FROM (%s)" % (last_row_index, tmp_sql)
                    try:
                        bqutil.create_bq_table(dataset, indexed_table,
                                               sql=indexing_sql,
                                               overwrite=overwrite,
                                               logger=my_logger,
                                               **optargs)
                        cnt = 0
                        tinfo = None
                        while (not tinfo) and (cnt < 10):
                            tinfo = bqutil.get_bq_table_info(dataset, indexed_table, **optargs)
                            if not tinfo:
                                logging.info("==> ERROR?  got unexpected None for get_bq_table_info %s.%s" % (dataset, indexed_table))
                                time.sleep(10)
                            cnt += 1

                        nrows = int(tinfo['numRows'])
                        logging.info("--> Result from %s to %s has %d rows" % (start_idx, end_idx, nrows))
                    except Exception as err:
                        bqdata = {'data': None}
                        sql = indexing_sql
                        output_error(err)
                        return
                    overwrite = "append"
                    offset += dn

            else:
                bqdata = {'data': None}
                sql = indexing_sql
                output_error(err)
                return

        # now ensure table index, and retrieve it.  It has just two columns: index_sha1_2ch, start_row
        tindex_table = table + "__index_for_" + indexed_column
        tindex_sql = """SELECT index_sha1_2ch,
                            min(index_row_number) as start_row,
                            # max(index_row_number) as end_row,   # don't need this - just take next start_row
                        FROM [{dataset}.{indexed_table}]
                        group by index_sha1_2ch
                        order by index_sha1_2ch
                     """.format(dataset=dataset, indexed_table=indexed_table)
        try:
            bqdata = self.cached_get_bq_table(dataset, tindex_table,
                                              sql=tindex_sql,
                                              logger=my_logger,
                                              depends_on=["%s.%s" % (dataset, indexed_table)],
                                              raise_exception=True,
                                              ignore_cache=ignore_cache,
                                              force_query=force_query,
                                              startIndex=0,
                                              maxResults=10000,
                                              **optargs)
        except Exception as err:
            bqdata = {'data': None}
            sql = tindex_sql
            output_error(err)
            return

        # find the start and end rows to retrieve, based on the last characters of the SHA1 hash of the indexed value
        sha1_2ch = base64.b64encode(hashlib.sha1(indexed_value).digest())[-3:-1]
        start_row = None
        end_row = None
        for k in bqdata['data']:
            if start_row and not end_row:
                end_row = int(k['start_row'])
            if (k['index_sha1_2ch'] == sha1_2ch):
                start_row = int(k['start_row'])
        logging.info("Retrieving iv=%s, sha1_2ch=%s, rows %s to %s of %s.%s" % (indexed_value, sha1_2ch, start_row, end_row, dataset, indexed_table))
        if not start_row:
            output_error("Cannot find %s=%s in %s.%s" % (indexed_column, indexed_value, dataset, indexed_table))
            bqdata = {'data': None}
            return

        max_results = (end_row or (start_row + 4000)) - start_row
        bqdata = self.cached_get_bq_table(dataset, indexed_table, ignore_cache=True,
                                          startIndex=start_row - 1,
                                          maxResults=max_results,
                                          **optargs)

        # extract just the row(s) with indexed_column value matching indexed_value (the hash is many to one)
        newdata = []
        for k in range(len(bqdata['data'])):
            datum = bqdata['data'][k]
            if (datum[indexed_column] == indexed_value):
                newdata.append(datum)
        logging.info("--> Result has %d items, of which %d match" % (len(bqdata['data']), len(newdata)))
        bqdata['data'] = newdata

        table = indexed_table	# so that columns are retrieved properly

    if not indexed_column:
        # retrieve full table
        try:
            bqdata = self.cached_get_bq_table(dataset, table, sql=sql,
                                              logger=my_logger,
                                              depends_on=depends_on,
                                              startIndex=int(pdata['start'] or 0),
                                              maxResults=int(pdata['length'] or 100000),
                                              raise_exception=True,
                                              ignore_cache=ignore_cache,
                                              force_query=force_query,
                                              **optargs)
            self.fix_bq_dates(bqdata)
        except Exception as err:
            bqdata = {'data': None}
            output_error(err)
            return

    tablecolumns = []
    if pdata['get_table_columns']:
        try:
            tableinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
        except Exception as err:
            error = (error or "\n") + str(err)
            tableinfo = None
            raise

        if tableinfo:
            fields = tableinfo['schema']['fields']
            field_names = [x['name'] for x in fields]
            tablecolumns = [{'data': x, 'title': x, 'class': 'dt-center'} for x in field_names]

    data = self.common_data.copy()
    data.update({'data': bqdata['data'],
                 'draw': pdata['draw'],
                 'last_modified_date': str(bqdata.get('last_modified_date')),
                 'fields': bqdata['fields'],
                 'recordsTotal': bqdata.get('numRows', 0),
                 'recordsFiltered': bqdata.get('numRows', 0),
                 'error': error,
                 'tablecolumns': tablecolumns,
                 'output_table': table,
                 'output_dataset': dataset,
                 })

    # logging.info('[cr] data=%s' % data)

    self.response.headers['Content-Type'] = 'application/json'
    self.response.out.write(json.dumps(data))
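
# Index-bucket sketch (illustrative): the indexed-table convention above keys each
# row by the last two characters (slice [-3:-1]) of the base64-encoded SHA1 of the
# indexed value; this standalone helper mirrors that computation on the Python side.
def _example_index_sha1_2ch(indexed_value):
    """Return the two-character SHA1 bucket used to look up start_row in the __index_for_ table."""
    import base64
    import hashlib
    return base64.b64encode(hashlib.sha1(indexed_value).digest())[-3:-1]

# e.g. _example_index_sha1_2ch("someuser") yields the same bucket that
# SUBSTR(TO_BASE64(SHA1(STRING(indexed_column))),-3,2) produces in BigQuery.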
% ndays raise nd1 = int(ndays / 2) nd2 = ndays - nd1 #if nd2 > nd1: # nd1 = nd2 # nd2 = ndays - nd1 print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % ( ndays, nd1, nd2) sys.stdout.flush() end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d') print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date) sys.stdout.flush() run_query_on_tracking_logs( SQL, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, end_date=end_date, get_date_function=get_date_function, has_hash_limit=has_hash_limit, # existing=existing, log_dates=log_dates) end_date = max_date print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date) sys.stdout.flush() run_query_on_tracking_logs( SQL, table, course_id, force_recompute=False, use_dataset_latest=use_dataset_latest, end_date=end_date, get_date_function=get_date_function, has_hash_limit=has_hash_limit, # existing=existing, log_dates=log_dates) print "--> Done with %d + %d days!" % (nd1, nd2) return if False: def get_ym(x): return int(x[0:4]), int(x[4:6]), int(x[6:]) (min_year, min_month, min_day) = get_ym(min_date) (max_year, max_month, max_day) = get_ym(max_date) nmonths = max_month - min_month + 12 * (max_year - min_year) print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time" sys.stdout.flush() (end_year, end_month) = (min_year, min_month) for dm in range(nmonths): end_month += 1 if end_month > 12: end_month = 1 end_year += 1 end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day) print "--> with end_date=%s" % end_date sys.stdout.flush() run_query_on_tracking_logs( SQL, table, course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest, end_date=end_date, get_date_function=get_date_function, has_hash_limit=has_hash_limit, # existing=existing, log_dates=log_dates) force_recompute = False # after first, don't force recompute return else: print the_sql raise if overwrite == 'append': txt = '[%s] added tracking log data from %s to %s' % ( datetime.datetime.now(), min_date, max_date) bqutil.add_description_to_table(dataset, table, txt, append=True) print "Done with course %s (end %s)" % (course_id, datetime.datetime.now()) print "=" * 77 sys.stdout.flush()
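# ---------------------------------------------------------------------------
# Illustrative sketches (not called by the pipeline) of two pieces of the
# logic in run_query_on_tracking_logs above.  The function names here
# (sketch_incremental_window, sketch_hash_limits) are made up for
# illustration; only the computations they show come from the code above.
import math
import datetime


def sketch_incremental_window(last_date, log_dates, days_delta=1):
    '''
    Given last_date (datetime of the newest row already in the destination
    table) and log_dates (sorted list of 'YYYYMMDD' strings for the existing
    tracklog_* tables), return the (min_date, max_date) window of new days
    to query and append, or None if the destination is already up to date.
    '''
    max_date = max(log_dates)
    table_max_date = last_date.strftime('%Y%m%d')
    if max_date <= table_max_date:
        return None          # nothing new - same early return as above
    min_date = (last_date +
                datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
    return (min_date, max_date)
    # with days_delta=0 the window overlaps the last processed day, so the
    # SQL itself must filter on time > TIMESTAMP("{last_date}")


def sketch_hash_limits(tablesize_mb, table_max_size_mb=800):
    '''
    Return the hash_limit clauses used above to split one oversized day of
    tracking logs into per-user buckets; each bucket is run separately and
    the results are appended to the destination table.
    '''
    nhashes = int(math.ceil(tablesize_mb / float(table_max_size_mb)))
    return ["AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
            for k in range(nhashes)]
    # e.g. a 2000 MB day with the default 800 MB cap gives three clauses:
    # "AND ABS(HASH(username)) % 3 = 0", "... % 3 = 1", "... % 3 = 2"
# ---------------------------------------------------------------------------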
def actual_ajax_get_report_data(self, report_name=None): ''' get data for custom report. parameters like course_id, chapter_id, problem_id are passed in as GET or POST parameters Defined parameters for SQL: {person_course} --> person_course table for the specific course {dataset} --> dataset for the specific course {course_report} --> course_report_* dataset for the ORG or latest {course_report_org} --> course_report_ORG dataset for ORG = ORGANIZATION_NAME {orgname} --> organization name ''' crm, pdata, auth_ok, msg = self.custom_report_auth_check(report_name) # crm = CourseReport model if not auth_ok: return self.no_auth_sorry() course_id = pdata['course_id'] force_query = pdata.get('force_query', False) if force_query == 'false': force_query = False ignore_cache = pdata.get('ignore_cache', False) or force_query # project_id specified? optargs = {} if 'project_id' in (crm.meta_info or {}): optargs['project_id'] = crm.meta_info['project_id'] if course_id: dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest()) pdata['person_course'] = '[%s.person_course]' % dataset elif 'dataset' in (crm.meta_info or {}): dataset = crm.meta_info['dataset'] else: dataset = self.get_course_report_dataset() # using course report dataset; list the tables, to determine which is the latest # person_course dataset, and use that for {person_course} pdata['person_course_latest'] = self.find_latest_person_course_table(dataset, project_id=optargs.get('project_id')) pdata['person_course'] = '[%s.%s]' % (dataset, pdata['person_course_latest']) pdata['dataset'] = dataset pdata['course_report'] = self.get_course_report_dataset() pdata['course_report_org'] = self.get_course_report_dataset(force_use_org=True) pdata['orgname'] = self.ORGNAME pdata['sane_username'] = self.user.replace(' ', '_').replace('.', '_').replace('@', '_') if 'module_id' in pdata and pdata['module_id']: url_name = pdata['module_id'].rsplit('/',1)[-1] pdata['module_url_name'] = url_name.replace(':','__').replace('-','_') # what table? get custom course report configuration metadata for report name as specified table = crm.table_name if not table or table=="None": error = "No table name defined! Cannot process this custom report" data = {'error': error} self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data)) return # multiple table names? 
use parameters to select one if ',' in table: tables = table.split(',') try: table_number = int(pdata.get('table_number', 0) or 0) table = tables[table_number] except Exception as err: raise Exception("[custom_reports] Cannot select table from tables=%s, table_number=%s, err=%s" % (tables, pdata.get('table_number'), err)) # allow parameters in table name if '{' in table: table = table.format(**pdata) table = table.replace('-', '_').replace(' ', '_') if not ('dataset' in (crm.meta_info or {})) and not table.startswith('stats_') and not (crm.meta_info.get('no_stats_ok')): table = "stats_" + table # special handling for person_course table from particular dataset for m in re.findall('{person_course__([^ \}]+)}', crm.sql): org = m org_dataset = self.get_course_report_dataset(orgname=org) pcd = '[%s.%s]' % (org_dataset, self.find_latest_person_course_table(org_dataset, project_id=optargs.get('project_id'))) pdata['person_course__' + org] = pcd logging.info('[cr] org=%s, pc=%s.%s' % (org, org_dataset, pcd)) # special handling for course_report tables for specific orgs for m in re.findall('{course_report__([^ \}]+)}', crm.sql): org = m org_dataset = self.get_course_report_dataset(orgname=org) pdata['course_report__' + org] = org_dataset logging.info("Using %s for custom report %s person_course" % (pdata.get('person_course'), report_name)) error = None def setup_sql_flags(): if 'sql_flags' in pdata: if not type(pdata['sql_flags'])==dict: try: pdata['sql_flags'] = json.loads(pdata['sql_flags']) except Exception as err: msg = "Cannot parse sql_flags as JSON! sql_flags=%s" % pdata['sql_flags'] msg += err logging.error(msg) raise Exception(msg) # dynamic sql: the SQL is allowed to change based on input parameters # do this by treating the SQL as a jinja2 tempate if crm.meta_info.get('dynamic_sql'): setup_sql_flags() # a little sanity checking - disallow spaces in any sql_flags values sf = pdata['sql_flags'] for k in sf: if ' ' in sf[k]: msg = "Illegal sql_flags %s=%s!" 
% (k, sf[k]) msg += "sql_flags = %s" % json.dumps(sf, indent=4) error = "<pre>%s</pre>" % (msg.replace('<','<').replace('<','>')) data = {'error': error} self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data)) return try: sql_template = Template(crm.sql) sql = sql_template.render(pdata) except Exception as err: msg = 'Custom report data: failed to render dynamic SQL with pdata=%s, err=%s' % (pdata, err) logging.error(msg) logging.error('sql=%s' % crm.sql) error = "<pre>%s</pre>" % (msg.replace('<','<').replace('<','>')) data = {'error': error} self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data)) return # append username to table name table = table + "_%s" % pdata['sane_username'] force_query = True # for now, all dynamic_sql is done with force_query ignore_cache = True the_sql = sql else: the_sql = crm.sql if 1: # generate SQL and depends_on try: sql = the_sql.format(**pdata) except Exception as err: msg = "Custom report data: failed to prepare SQL, err=%s" % str(err) msg += '\npdata = %s' % pdata logging.error(msg) if self.is_superuser(): error = "<pre>%s</pre>" % (str(msg).replace('<','<').replace('<','>')) data = {'error': error} self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data)) logging.error("Returning with error message") return raise def strip_brackets(x): x = x.strip() if x.startswith('[') and x.endswith(']'): x = x[1:-1] return x return x if crm.meta_info.get('debug_sql'): msg = "debug_sql is true; not running SQL. This is the SQL which would have been run:\n" msg += sql msg += "\n\nwith these parameters:\n" msg += json.dumps(pdata, indent=4) msg += "\n\nproducing the output table: %s.%s\n" % (dataset, table) error = "<pre>%s</pre>" % (msg.replace('<','<').replace('<','>')) data = {'error': error} self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data)) return try: if crm.depends_on and (not crm.depends_on=="None"): depends_on = [ strip_brackets(x.format(**pdata)) for x in (json.loads(crm.depends_on or "[]")) ] else: depends_on = None except Exception as err: logging.error("for course report %s, cannot process depends_on=%s" % (report_name, crm.depends_on)) raise Exception("Bad format for the 'depends_on' setting in the custom report specification") raise # get the data, and do query if needed logging.info('custom report get_report_data name=%s, table=%s.%s, depends_on=%s, pdata=%s' % (report_name, dataset, table, depends_on, pdata)) the_msg = [] def my_logger(msg): logging.info(msg) the_msg.append(msg) def output_error(err): error = "<pre>%s</pre>" % (str(err).replace('<','<').replace('<','>')) logging.error('custom report error %s' % error) logging.error(err) logging.error(traceback.format_exc()) # raise if self.is_superuser(): msg = ('\n'.join(the_msg)) msg = msg.replace('<','<').replace('<','>') error += "<pre>%s</pre>" % msg error += "SQL: <pre>%s</pre>" % sql error += "Parameters: <pre>%s</pre>" % json.dumps(pdata, indent=4) error += "optargs: <pre>%s</pre>" % json.dumps(optargs, indent=4) data = {'error': error} self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data)) # is the request "indexed", meaning only matching rows of the table are to be returned? 
indexed_column = crm.meta_info.get('indexed') if indexed_column: if type(indexed_column)==list: indexed_columns = indexed_column try: table_number = int(pdata.get('table_number', 0) or 0) indexed_column = indexed_columns[table_number] except Exception as err: raise Exception("[custom_reports] Cannot select indexed_column from indexed_columns=%s, table_number=%s, err=%s" % (indexed_columns, pdata.get('table_number'), err)) setup_sql_flags() indexed_value = pdata.get('sql_flags', {}).get('indexed_value') logging.info("[custom_reports] retrieving %s.%s with indexing on %s to match value %s" % (dataset, table, indexed_column, indexed_value)) if not indexed_value: my_logger('Error: missing sql_flags.indexed_value to match indexed column %s in %s.%s' % (indexed_column, dataset, table)) data = {'error': msg} self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data)) return # ensure existence of indexed version of table. By convention, that is a table named tablename + "__indexed_" + indexed_column # the table has a SHA1 hash of the indexed column added, and is sorted according to the last few characters # of the SHA1 hash. indexed_table = table + "__indexed_" + indexed_column indexing_sql_template = """select *, SUBSTR(TO_BASE64(SHA1(STRING({indexed_column}))),-3,2) as index_sha1_2ch, ROW_NUMBER() over (order by index_sha1_2ch, {indexed_column}) as index_row_number{subnum}, from [{dataset}.{table}] {where_clause} order by index_sha1_2ch, {indexed_column} """ indexing_sql = indexing_sql_template.format(dataset=dataset, table=table, indexed_column=indexed_column, where_clause="", subnum="") try: bqdata = self.cached_get_bq_table(dataset, indexed_table, sql=indexing_sql, logger=my_logger, depends_on=["%s.%s" % (dataset, table)], raise_exception=True, ignore_cache=ignore_cache, force_query=force_query, startIndex=0, maxResults=1, **optargs ) except Exception as err: if "Response too large to return" in str(the_msg): # hmm - table too large! can't workaround using allowLargeResult because the return results # need to be ordered. 
So let's break it up into multiple queries, appending each, # by index_sha1_2ch b64chars = "+/0123456789" + ''.join(map(chr,range(ord('A'), ord('Z')+1))) + ''.join(map(chr,range(ord('a'), ord('z')+1))) # get table size, divide by 64M, to get number of divisions to use tinfo = bqutil.get_bq_table_info(dataset, table, **optargs) nbytes = int(tinfo['numBytes']) ndivs = int(round(nbytes / (64*1024*1024))) logging.info("Response too large - nbytes=%s, so trying ndivs=%s" % (nbytes, ndivs)) end_idx = None start_idx = None dn = int(64 / ndivs) offset = dn overwrite = True nrows = 0 while (offset < 65): start_idx = end_idx last_row_index = nrows # note ROW_NUMBER() starts with 1 (not zero) if (offset < 64): end_idx = b64chars[offset] + "+" else: end_idx = None # boundary case wc = "where " if start_idx: wc += '(SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) >= "%s") ' % (indexed_column, start_idx) else: wc += "True " if end_idx: wc += 'AND (SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) < "%s") ' % (indexed_column, end_idx) logging.info("--> start_idx=%s, end_idx=%s, starting row %d" % (start_idx, end_idx, last_row_index)) tmp_sql = indexing_sql_template.format(dataset=dataset, table=table, indexed_column=indexed_column, where_clause=wc, subnum="_sub") indexing_sql = "SELECT *, index_row_number_sub + %d as index_row_number FROM (%s)" % (last_row_index, tmp_sql) try: bqutil.create_bq_table(dataset, indexed_table, sql=indexing_sql, overwrite=overwrite, logger=my_logger, **optargs ) cnt = 0 tinfo = None while (not tinfo) and (cnt < 10): tinfo = bqutil.get_bq_table_info(dataset, indexed_table, **optargs) if not tinfo: logging.info("==> ERROR? got unexpected None for get_bq_table_info %s.%s" % (dataset, indexed_table)) time.sleep(10) cnt += 1 nrows = int(tinfo['numRows']) logging.info("--> Result from %s to %s has %d rows" % (start_idx, end_idx, nrows)) except Exception as err: bqdata = {'data': None} sql = indexing_sql output_error(err) return overwrite = "append" offset += dn else: bqdata = {'data': None} sql = indexing_sql output_error(err) return # now ensure table index, and retrieve it. 
It has just two columns: index_sha1_2ch, start_row tindex_table = table + "__index_for_" + indexed_column tindex_sql = """SELECT index_sha1_2ch, min(index_row_number) as start_row, # max(index_row_number) as end_row, # don't need this - just take next start_row FROM [{dataset}.{indexed_table}] group by index_sha1_2ch order by index_sha1_2ch """.format(dataset=dataset, indexed_table=indexed_table) try: bqdata = self.cached_get_bq_table(dataset, tindex_table, sql=tindex_sql, logger=my_logger, depends_on=["%s.%s" % (dataset, indexed_table)], raise_exception=True, ignore_cache=ignore_cache, force_query=force_query, startIndex=0, maxResults=10000, **optargs ) except Exception as err: bqdata = {'data': None} sql = tindex_sql output_error(err) return # find the start and end rows to retrieve, based the last characters of the SHA1 hash of the indexed value sha1_2ch = base64.b64encode(hashlib.sha1(indexed_value).digest())[-3:-1] start_row = None end_row = None for k in bqdata['data']: if start_row and not end_row: end_row = int(k['start_row']) if (k['index_sha1_2ch']==sha1_2ch): start_row = int(k['start_row']) logging.info("Retrieving iv=%s, sha1_2ch=%s, rows %s to %s of %s.%s" % (indexed_value, sha1_2ch, start_row, end_row, dataset, indexed_table)) if not start_row: output_error("Cannot find %s=%s in %s.%s" % (indexed_column, indexed_value, dataset, indexed_table)) bqdata = {'data': None} return max_results = (end_row or (start_row+4000)) - start_row bqdata = self.cached_get_bq_table(dataset, indexed_table, ignore_cache=True, startIndex=start_row-1, maxResults=max_results, **optargs ) # extract just the row(s) with indexed_column value matching indexed_value (the hash is many to one) newdata = [] for k in range(len(bqdata['data'])): datum = bqdata['data'][k] if (datum[indexed_column]==indexed_value): newdata.append(datum) logging.info("--> Result has %d items, of which %d match" % (len(bqdata['data']), len(newdata))) bqdata['data'] = newdata table = indexed_table # so that columns are retrieved properly if not indexed_column: # retrieve full table try: bqdata = self.cached_get_bq_table(dataset, table, sql=sql, logger=my_logger, depends_on=depends_on, startIndex=int(pdata['start'] or 0), maxResults=int(pdata['length'] or 100000), raise_exception=True, ignore_cache=ignore_cache, force_query=force_query, **optargs ) self.fix_bq_dates(bqdata) except Exception as err: bqdata = {'data': None} output_error(err) return tablecolumns = [] if pdata['get_table_columns']: try: tableinfo = bqutil.get_bq_table_info(dataset, table, **optargs) except Exception as err: error = (error or "\n") + str(err) tableinfo = None raise if tableinfo: fields = tableinfo['schema']['fields'] field_names = [x['name'] for x in fields] tablecolumns = [ { 'data': x, 'title': x, 'class': 'dt-center' } for x in field_names ] data = self.common_data.copy() data.update({'data': bqdata['data'], 'draw': pdata['draw'], 'last_modified_date': str(bqdata.get('last_modified_date')), 'fields': bqdata['fields'], 'recordsTotal': bqdata.get('numRows', 0), 'recordsFiltered': bqdata.get('numRows', 0), 'error': error, 'tablecolumns': tablecolumns, 'output_table': table, 'output_dataset': dataset, }) # logging.info('[cr] data=%s' % data) self.response.headers['Content-Type'] = 'application/json' self.response.out.write(json.dumps(data))
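# ---------------------------------------------------------------------------
# Minimal sketch (illustrative only, not part of the handler) of the SHA1
# two-character indexing used above for "indexed" custom reports.  The
# function names are made up; the computations mirror the code above.
# BigQuery stores SUBSTR(TO_BASE64(SHA1(STRING(col))),-3,2) for each row as
# index_sha1_2ch and orders the indexed table by it; the handler computes the
# same two characters in Python and uses the small "__index_for_" table to
# find which block of rows to fetch.  The in-memory index list below stands
# in for that table.
import base64
import hashlib


def sketch_sha1_2ch(indexed_value):
    '''Same two base64 characters as the BigQuery expression above.'''
    return base64.b64encode(hashlib.sha1(indexed_value).digest())[-3:-1]


def sketch_row_range(index_rows, indexed_value):
    '''
    index_rows: list of {'index_sha1_2ch': ..., 'start_row': ...} dicts,
    ordered by index_sha1_2ch (the shape returned by the tindex_table query).
    Returns (start_row, end_row) for the matching bucket; end_row is the next
    bucket's start_row, or None if the match falls in the last bucket.
    '''
    sha1_2ch = sketch_sha1_2ch(indexed_value)
    start_row = None
    end_row = None
    for k in index_rows:
        if start_row and not end_row:
            end_row = int(k['start_row'])
        if k['index_sha1_2ch'] == sha1_2ch:
            start_row = int(k['start_row'])
    return start_row, end_row
    # the handler then fetches maxResults = (end_row or start_row + 4000) -
    # start_row rows starting at startIndex = start_row - 1, and finally
    # keeps only rows whose indexed_column equals indexed_value, since many
    # values share the same two hash characters.
# ---------------------------------------------------------------------------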