def do_table(self, the_sql, tablename, the_dataset=None, sql_for_description=None, check_skip=True):

    if check_skip:
        if self.skip_or_do_step(tablename) < 0:
            return  # skip step

    if the_dataset is None:
        the_dataset = self.dataset

    print("Computing %s in BigQuery" % tablename)
    sys.stdout.flush()
    try:
        ret = bqutil.create_bq_table(the_dataset, tablename, the_sql,
                                     overwrite=True,
                                     output_project_id=self.output_project_id,
                                     sql_for_description=sql_for_description or the_sql,
                                     )
    except Exception as err:
        print "ERROR! Failed on SQL="
        print the_sql
        raise

    gsfn = "%s/%s.csv" % (self.gsbucket, tablename)
    bqutil.extract_table_to_gs(the_dataset, tablename, gsfn,
                               format='csv',
                               do_gzip=False,
                               wait=False)

    msg = "CSV download link: %s" % gsutil.gs_download_link(gsfn)
    print msg
    bqutil.add_description_to_table(the_dataset, tablename, msg, append=True, project_id=self.output_project_id)
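A minimal, hypothetical call sketch for do_table. The attribute names (dataset, gsbucket, output_project_id) and the step-skipping helper come from the method body above, but the surrounding instance, the SQL, and the table name are placeholders:

# Hypothetical usage, assuming `pipeline` is an instance of the class that defines
# do_table together with self.dataset, self.gsbucket, self.output_project_id and
# self.skip_or_do_step (as referenced above); dataset and table names are illustrative.
the_sql = """
SELECT username, COUNT(*) AS n_events
FROM [{dataset}.tracklog_20140101]
GROUP BY username
""".format(dataset="placeholder_logs_dataset")

pipeline.do_table(the_sql, "user_event_counts",
                  sql_for_description="event counts per username (illustrative)")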
def do_combine(
    course_id_set,
    project_id,
    outdir="DATA",
    nskip=0,
    output_project_id=None,
    output_dataset_id=None,
    output_bucket=None,
    use_dataset_latest=False,
):

    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/', 1)[0]

    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org,
                                       gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfn,
                              the_schema,
                              format='csv',
                              skiprows=1,
                              project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset,
                                    table,
                                    msg,
                                    append=True,
                                    project_id=output_project_id)

    print "Done"
    sys.stdout.flush()
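The zcat/tail pipeline above keeps the header row from the first file only. For reference, a pure standard-library sketch of the same concatenation (an assumed equivalent; the function name is illustrative):

import gzip

def concat_gzipped_csvs(infns, outfn):
    # Keep the header line from the first file; drop it from the rest,
    # mirroring the "zcat ... | tail -n +2" pipeline used above.
    with open(outfn, 'w') as out:
        for i, fn in enumerate(infns):
            fp = gzip.open(fn)
            header = fp.readline()
            if i == 0:
                out.write(header)
            for line in fp:
                out.write(line)
            fp.close()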
Example 3
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GCS file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
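A hedged invocation sketch for rephrase_forum_json_for_course; the course ids below are placeholders, not values from the source:

# Hypothetical driver loop (course ids are placeholders).
for course_id in ["MITx/6.002x/2013_Spring", "MITx/8.01x/2013_Fall"]:
    rephrase_forum_json_for_course(course_id,
                                   gsbucket="gs://x-data",
                                   basedir="X-Year-2-data-sql",
                                   use_dataset_latest=True)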
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, use_dataset_latest=False, 
                               start_date=None,
                               end_date=None, 
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day overlap,
                 but make sure the SQL query only selects for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument, giving min start date for logs to process, in YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table must
    be newer than this datetime, else force_recompute is made True
    '''
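    # Illustrative shape of the SQL template this function expects (not from the source):
    # the string is run through SQL.format(...) below, so it may use the {course_id},
    # {DATASETS}, {last_date} and {hash_limit} placeholders, and it must return rows
    # ordered by date, e.g.
    #
    #   SELECT time, username, event_type
    #   FROM {DATASETS}
    #   WHERE time > TIMESTAMP("{last_date}") {hash_limit}
    #   ORDER BY time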

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)	# destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset)

    if log_dates is None:
        log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates)==0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(max(log_dates))	# remove the last day of data from consideration
        max_date = max(log_dates)
        print "         --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-','')
        if min_date < start_date:
            print "        --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-','')	# end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"    	# default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (dataset, table)
            overwrite = True
        else:
            last_dates = [get_date_function(x) for x in pc_last['data']]
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, 
                                                                                                                  table_max_date,
                                                                                                                  min_date,
                                                                                                                  max_date)
                sys.stdout.flush()
                return
            min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, 
                                                                                                      table_max_date,
                                                                                                      min_date,
                                                                                                      max_date)
            sys.stdout.flush()
            overwrite = 'append'
    
    if overwrite=='append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]"  % (table, course_id, min_date, max_date, last_date, datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]"  % (table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date	# save max_date information

        while min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            next_date = None
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    next_date = ld
                    break
            if next_date is None:
                break  # no later log date in range; fall through to the error below
            min_date = next_date
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '    min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0*1024)
        nhashes =  int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, 
                                                                                                             tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")
            try:
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, 
                                       use_dataset_latest=use_dataset_latest,
                                       end_date=end_date, 
                                       get_date_function=get_date_function,
                                       existing=existing,
                                       log_dates=log_dates,
                                       has_hash_limit=has_hash_limit,
                                       table_max_size_mb=table_max_size_mb,
                                       limit_query_size=limit_query_size,
                                   )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")

    try:
        bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
    except Exception as err:
        if ( ('Response too large to return.' in str(err)) and has_hash_limit ):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0

            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                                      use_dataset_latest=use_dataset_latest,
                                                      end_date=max_date, 
                                                      get_date_function=get_date_function,
                                                      has_hash_limit=has_hash_limit,
                                                      # existing=existing,
                                                      log_dates=log_dates,
                                                      limit_query_size=True,
                                                  )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise
                
                nd1 = int(ndays/2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date, 
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, 
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date, 
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return


            if False:
                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])
                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                               use_dataset_latest=use_dataset_latest,
                                               end_date=end_date, 
                                               get_date_function=get_date_function,
                                               has_hash_limit=has_hash_limit,
                                               # existing=existing,
                                               log_dates=log_dates)
                    force_recompute = False		# after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite=='append':
        txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
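A minimal, hypothetical call sketch for run_query_on_tracking_logs; the SQL, table name, course id, and the 'date' field read by the date function are assumptions, not taken from the source:

import datetime

def _get_date(row):
    # Assumption: rows returned by bqutil.get_table_data carry a 'date' field;
    # the function must return a datetime so the caller can strftime/timedelta it.
    return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

EXAMPLE_SQL = """
SELECT DATE(time) AS date, COUNT(*) AS n_events
FROM {DATASETS}
WHERE time > TIMESTAMP("{last_date}") {hash_limit}
GROUP BY date
ORDER BY date
"""

run_query_on_tracking_logs(EXAMPLE_SQL, "stats_daily_events", "MITx/6.002x/2013_Spring",
                           get_date_function=_get_date,
                           has_hash_limit=True)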
Example 5
def do_combine(
    course_id_set,
    project_id,
    outdir="DATA",
    nskip=0,
    output_project_id=None,
    output_dataset_id=None,
    output_bucket=None,
    use_dataset_latest=False,
    extract_subset_tables=True,
):
    '''
    Combine individual person_course tables (from the set of specified course_id's) to create one single large
    person_course table.  Do this by downloading each file, checking to make sure they all have the same
    fields, concatenating, and uploading back to bigquery.  This is cheaper than doing a select *, and also
    uncovers person_course files which have the wrong schema (and it works around BQ's limitation on large
    result sizes).  The result is stored in the course_report_latest dataset (if use_dataset_latest), else 
    in course_report_ORG, where ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (ie "participants"), and the subset
    of those who enrolled for IDV, are extracted and saved as person_course_viewed, and person_course_idv.
    (those are created using a select *, for efficiency, despite the cost).
    '''

    print "=" * 77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-" * 77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)

    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(
            course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' %
                        (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip > 0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (
                    ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/', 1)[0]

    ofn = "person_course_%s_%s.csv" % (
        org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "=" * 77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip > 1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (
                    zfn, ofn
                )  # first row is header; don't keep when concatenating
            print cmd
            first = 0
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning!  header mismatch for %s vs %s" % (zfn,
                                                                      firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org,
                                       gsbucket=output_bucket)

    print "=" * 77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(
        gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-', '_')

    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfn,
                              the_schema,
                              format='csv',
                              skiprows=1,
                              project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "=" * 100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset,
                                    table,
                                    msg,
                                    append=True,
                                    project_id=output_project_id)

    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
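do_extract_subset_person_course_tables is not shown in this listing. Based on the docstring above (subsets of viewers and IDV enrollees pulled out with a select *), a sketch of what such a helper could look like; the 'viewed' and 'mode' column names are assumptions about the person_course schema, not confirmed by the source:

def do_extract_subset_person_course_tables(the_dataset, pc_table):
    # Sketch only: extract "participants" (those who viewed) and IDV enrollees
    # as separate tables via SELECT *, as described in the do_combine docstring.
    bqutil.create_bq_table(the_dataset, "person_course_viewed",
                           "SELECT * FROM [%s.%s] WHERE viewed" % (the_dataset, pc_table),
                           overwrite=True, allowLargeResults=True)
    bqutil.create_bq_table(the_dataset, "person_course_idv",
                           "SELECT * FROM [%s.%s] WHERE mode = 'verified'" % (the_dataset, pc_table),
                           overwrite=True, allowLargeResults=True)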
def rephrase_forum_json_for_course(course_id, gsbucket="gs://x-data", 
                                   basedir="X-Year-2-data-sql", 
                                   datedir=None, 
                                   do_gs_copy=False,
                                   use_dataset_latest=False,
                                   ):
    
    print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest)
    
    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt==0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GCS file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
Example 7
def do_save(cid, caset_in, xbundle, datadir, log_msg, use_dataset_latest=False):
    '''
    Save course axis data to bigquery
    
    cid = course_id
    caset = list of course axis data in dict format
    xbundle = XML bundle of course (everything except static files)
    datadir = directory where output files should be written
    log_msg = list of messages about processing errors and issues
    '''

    # BigQuery requires data to fit within a schema; let's make sure our lines all fit the schema
    mypath = os.path.dirname(os.path.realpath(__file__))
    the_schema = json.loads(open('%s/schemas/schema_course_axis.json' % mypath).read())['course_axis']
    dict_schema = schema2dict(the_schema)

    caset = copy.deepcopy(caset_in)

    datadir = path(datadir)
    cafn = datadir / 'course_axis.json' 
    xbfn = datadir / ('xbundle_%s.xml' % (cid.replace('/','__')))
    fp = open(cafn, 'w')
    linecnt = 0

    for ca in caset:
        linecnt += 1
        ca['course_id'] = cid
        data = ca['data']
        if data and not type(data)==dict:
            try:
                ca['data'] = json.loads(data)	# make it native, for mongo
            except Exception as err:
                print "failed to create json for %s, error=%s" % (data, err)
        if ca['start'] is not None:
            ca['start'] = str(ca['start'])	# datetime to string
        if  ca['due'] is not None:
            ca['due'] = str(ca['due'])	# datetime to string
        if (ca['data'] is None) or (ca['data']==''):
            ca.pop('data')
        check_schema(linecnt, ca, the_ds=dict_schema, coerce=True)
        try:
            # db.course_axis.insert(ca)
            fp.write(json.dumps(ca)+'\n')
        except Exception as err:
            print "Failed to save!  Error=%s, data=%s" % (err, ca)
    fp.close()

    # upload axis.json file and course xbundle
    gsdir = path(gsutil.gs_path_from_course_id(cid, use_dataset_latest=use_dataset_latest))
    if 1:
        gsutil.upload_file_to_gs(cafn, gsdir, options="-z json", verbose=False)
        gsutil.upload_file_to_gs(xbfn, gsdir, options='-z xml', verbose=False)

    # import into BigQuery
    dataset = bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)	# create dataset if not already existent
    table = "course_axis"
    bqutil.load_data_to_table(dataset, table, gsdir / (cafn.basename()), the_schema)

    msg = "="*100 + '\n'
    msg += "Course axis for %s\n" % (cid)
    msg += "="*100 + '\n'
    msg += '\n'.join(log_msg)
    msg = msg[:16184]    # truncate to stay under the 16384-character table description limit
    
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    print "    Done - inserted %s records into course_axis" % len(caset)
Example 8
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"
    ofncsv = "forum.csv.gz"  # To match table name in BQ
    ofncsv_lfp = lfp / ofncsv

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn) and os.path.exists(ofncsv_lfp):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp)
            sys.stdout.flush()
            return

    print "Processing %s -> writing to %s and %s (%s)" % (
        fn, ofn, ofncsv, datetime.datetime.now())
    sys.stdout.flush()

    # Setup CSV header
    ocsv = csv.DictWriter(openfile(ofncsv, 'w'),
                          fieldnames=SCHEMA_DICT.keys(),
                          quoting=csv.QUOTE_NONNUMERIC)
    ocsv.writeheader()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    data = OrderedDict()
    for line in fp:
        cnt += 1
        # Write JSON row
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)

        try:
            #Write CSV row
            data = json.loads(newline)
            ocsv.writerow(data)
        except Exception as err:
            print "Error writing CSV output row %s=%s" % (cnt, data)
            raise

    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GCS file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
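SCHEMA and SCHEMA_DICT are module-level globals not shown in this listing. A plausible construction, assuming SCHEMA is the usual BigQuery field list loaded from a schema JSON file, would derive the ordered mapping used for the CSV fieldnames from it:

from collections import OrderedDict

# Assumption: SCHEMA is a BigQuery-style list of field dicts; SCHEMA_DICT maps
# field name -> field definition, preserving order so that
# csv.DictWriter(fieldnames=SCHEMA_DICT.keys()) matches the BigQuery table layout.
SCHEMA_DICT = OrderedDict((field['name'], field) for field in SCHEMA)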
def do_combine(course_id_set, project_id, outdir="DATA", nskip=0,
               output_project_id=None, output_dataset_id=None, output_bucket=None,
               use_dataset_latest=False,
               extract_subset_tables=True,
               ):
    '''
    Combine individual person_course tables (from the set of specified course_id's) to create one single large
    person_course table.  Do this by downloading each file, checking to make sure they all have the same
    fields, concatenating, and uploading back to bigquery.  This is cheaper than doing a select *, and also
    uncovers person_course files which have the wrong schema (and it works around BQ's limitation on large
    result sizes).  The result is stored in the course_report_latest dataset (if use_dataset_latest), else 
    in course_report_ORG, where ORG is the configured organization name.

    If extract_subset_tables is True, then the subset of those who viewed (ie "participants"), and the subset
    of those who enrolled for IDV, are extracted and saved as person_course_viewed, and person_course_idv.
    (those are created using a select *, for efficiency, despite the cost).
    '''

    print "="*77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-"*77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)
        
    ofnset = []
    cnt = 0
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip>0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/',1)[0]

    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "="*77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip>1) and os.path.exists(ofn):
        print "%s already exists, not downloading" % ofn
    else:
        first = 1
        header = None
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
                header = os.popen("zcat %s | head -1" % zfn).read().strip()
                firstfn = zfn
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)	# first row is header; don't keep when concatenating
            print cmd
            first = 0
            new_header = os.popen("zcat %s | head -1" % zfn).read().strip()
            if not header == new_header:
                print "==> Warning!  header mismatch for %s vs %s" % (zfn, firstfn)
                print "    %s has: %s" % (firstfn, header)
                print "    but %s has: %s" % (zfn, new_header)
            sys.stdout.flush()
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-','_')
    
    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "="*100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)
    
    # copy the new table (which has a specific date in its name) to a generically named "person_course_latest"
    # so that future SQL queries can simply use this as the latest person course table
    print "-> Copying %s to %s.person_course_latest" % (table, dataset)
    bqutil.copy_bq_table(dataset, table, "person_course_latest")

    if extract_subset_tables:
        do_extract_subset_person_course_tables(dataset, table)

    print "Done"
    sys.stdout.flush()
Example 10
def run_query_on_tracking_logs(SQL,
                               table,
                               course_id,
                               force_recompute=False,
                               use_dataset_latest=False,
                               start_date=None,
                               end_date=None,
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day overlap,
                 but make sure the SQL query only selects for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument, giving min start date for logs to process, in YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table must
    be newer than this datetime, else force_recompute is made True
    '''

    dataset = bqutil.course_id2dataset(
        course_id, use_dataset_latest=use_dataset_latest)  # destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (
            len(existing or []), dataset)

    if log_dates is None:
        log_tables = [
            x for x in bqutil.get_list_of_table_ids(log_dataset)
            if x.startswith('tracklog_20')
        ]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates) == 0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(
            max(log_dates))  # remove the last day of data from consideration
        max_date = max(log_dates)
        print "         --> skip_last_day is True: dropping %s, new max_date=%s" % (
            old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-', '')
        if min_date < start_date:
            print "        --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (
                min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (
            table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-',
                                        '')  # end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(
            dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (
                dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"  # default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset,
                                        table,
                                        startIndex=-10,
                                        maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (
                dataset, table)
            overwrite = True
        else:
            last_dates = []
            for x in pc_last['data']:
                try:
                    add_date = get_date_function(x)
                    last_dates.append(add_date)
                except Exception as err:
                    print "Error with get_date_function occurred. %s" % str(
                        err)
                    continue
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (
                    table, table_max_date, min_date, max_date)
                sys.stdout.flush()
                return
            min_date = (last_date +
                        datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (
                table, table_max_date, min_date, max_date)
            sys.stdout.flush()
            overwrite = 'append'

    if overwrite == 'append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (
            table, course_id, min_date, max_date, last_date,
            datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (
            table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date  # save max_date information

        while min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            next_date = None
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    next_date = ld
                    break
            if next_date is None:
                break  # no later log date in range; fall through to the error below
            min_date = next_date
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '    min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(
            log_dataset, tablename) / (1024.0 * 1024)
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (
            log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (
                    log_dataset, tablename, tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes,
                                                                     k)
                the_sql = SQL.format(course_id=course_id,
                                     DATASETS=from_datasets,
                                     last_date=last_date,
                                     hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset,
                                           table,
                                           the_sql,
                                           wait=True,
                                           overwrite=overwrite,
                                           allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id,
                                 DATASETS=from_datasets,
                                 last_date=last_date,
                                 hash_limit="")
            try:
                bqutil.create_bq_table(dataset,
                                       table,
                                       the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (
            datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(
                SQL,
                table,
                course_id,
                force_recompute=False,
                use_dataset_latest=use_dataset_latest,
                end_date=end_date,
                get_date_function=get_date_function,
                existing=existing,
                log_dates=log_dates,
                has_hash_limit=has_hash_limit,
                table_max_size_mb=table_max_size_mb,
                limit_query_size=limit_query_size,
            )
        return

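    # normal path: query all daily tracklog_YYYYMMDD tables between min_date
    # and max_date in one shot, selected via TABLE_QUERY on the table name
    # (e.g. BETWEEN 20160901 AND 20160930)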
    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id,
                         DATASETS=from_datasets,
                         last_date=last_date,
                         hash_limit="")

    try:
        bqutil.create_bq_table(dataset,
                               table,
                               the_sql,
                               wait=True,
                               overwrite=overwrite,
                               allowLargeResults=True)
    except Exception as err:
        if (('Response too large to return.' in str(err)) and has_hash_limit):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0

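            # run the query once per hash bucket; buckets after the first are
            # appended to the output table (overwrite switches to "append")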
            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id,
                                     DATASETS=from_datasets,
                                     last_date=last_date,
                                     hash_limit=hash_limit)
                bqutil.create_bq_table(dataset,
                                       table,
                                       the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
                overwrite = "append"

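        # the query exceeded BigQuery's resource limits: split the requested
        # date range and recurse on the smaller pieces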
        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(
                        SQL,
                        table,
                        course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest,
                        end_date=max_date,
                        get_date_function=get_date_function,
                        has_hash_limit=has_hash_limit,
                        # existing=existing,
                        log_dates=log_dates,
                        limit_query_size=True,
                    )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise

                nd1 = int(ndays / 2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (
                    ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date +
                            datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(
                    SQL,
                    table,
                    course_id,
                    force_recompute=force_recompute,
                    use_dataset_latest=use_dataset_latest,
                    end_date=end_date,
                    get_date_function=get_date_function,
                    has_hash_limit=has_hash_limit,
                    # existing=existing,
                    log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(
                    SQL,
                    table,
                    course_id,
                    force_recompute=False,
                    use_dataset_latest=use_dataset_latest,
                    end_date=end_date,
                    get_date_function=get_date_function,
                    has_hash_limit=has_hash_limit,
                    # existing=existing,
                    log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return

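            # NOTE: disabled alternative (if False): instead of halving the
            # date range, advance one month at a time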
            if False:

                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])

                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month,
                                                   min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(
                        SQL,
                        table,
                        course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest,
                        end_date=end_date,
                        get_date_function=get_date_function,
                        has_hash_limit=has_hash_limit,
                        # existing=existing,
                        log_dates=log_dates)
                    force_recompute = False  # after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite == 'append':
        txt = '[%s] added tracking log data from %s to %s' % (
            datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
def do_combine(course_id_set, project_id, outdir="DATA", nskip=0,
               output_project_id=None, output_dataset_id=None, output_bucket=None,
               use_dataset_latest=False,
               ):

    print "="*77
    print "Concatenating person course datasets from the following courses:"
    print course_id_set
    print "-"*77

    outdir = path(outdir)
    if not outdir.exists():
        os.mkdir(outdir)
        
    ofnset = []
    cnt = 0
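    # fetch each course's person_course.csv.gz from GCS, skipping downloads
    # when an up-to-date local copy already exists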
    for course_id in course_id_set:
        gb = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest)
        ofn = outdir / ('person_course_%s.csv.gz' % (course_id.replace('/', '__')))
        ofnset.append(ofn)

        if (nskip>0) and ofn.exists():
            print "%s already exists, not downloading" % ofn
            sys.stdout.flush()
            continue

        if ofn.exists():
            fnset = gsutil.get_gs_file_list(gb)
            local_dt = gsutil.get_local_file_mtime_in_utc(ofn)
            fnb = 'person_course.csv.gz'
            if not fnb in fnset:
                print "%s/%s missing!  skipping %s" % (gb, fnb, course_id)
                continue
            if (fnb in fnset) and (local_dt >= fnset[fnb]['date']):
                print "%s already exists with date %s (gs file date %s), not re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()
                continue
            else:
                print "%s already exists but has date %s (gs file date %s), so re-downloading" % (ofn, local_dt, fnset[fnb]['date'])
                sys.stdout.flush()

        cmd = 'gsutil cp %s/person_course.csv.gz %s' % (gb, ofn)
        print "Retrieving %s via %s" % (course_id, cmd)
        sys.stdout.flush()
        os.system(cmd)
        cnt += 1
        #if cnt>2:
        #    break

    org = course_id_set[0].split('/',1)[0]

    ofn = "person_course_%s_%s.csv" % (org, datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    print "="*77
    print "Combining CSV files to produce %s" % ofn
    sys.stdout.flush()

    if (nskip>1) and os.path.exists(ofn):
        print "%s already exists, not regenerating" % ofn
    else:
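        # concatenate the per-course CSVs; the header row is kept only from
        # the first file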
        first = 1
        for zfn in ofnset:
            if first:
                cmd = "zcat %s > %s" % (zfn, ofn)
            else:
                cmd = "zcat %s | tail -n +2 >> %s" % (zfn, ofn)	# first row is header; don't keep when concatenating
            print cmd
            first = 0
            os.system(cmd)

    gb = gsutil.gs_path_from_course_id('course_report_%s' % org, gsbucket=output_bucket)

    print "="*77
    print "Uploading combined CSV file to google cloud storage in bucket: %s" % gb
    sys.stdout.flush()
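    # -z csv has gsutil gzip-compress the CSV for upload; TMPDIR is pointed
    # at /var/tmp, presumably so the temporary compressed copy has room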
    cmd = "TMPDIR=/var/tmp gsutil cp -z csv %s %s/" % (ofn, gb)
    print cmd
    os.system(cmd)

    gsfn = gb + '/' + ofn
    print "Combined person_course dataset CSV download link: %s" % gsutil.gs_download_link(gsfn)

    # import into BigQuery
    crname = ('course_report_%s' % org)
    if use_dataset_latest:
        crname = 'course_report_latest'
    dataset = output_dataset_id or crname
    table = ofn[:-4].replace('-','_')
    
    print "Importing into BigQuery as %s:%s.%s" % (project_id, dataset, table)
    sys.stdout.flush()
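    # load the person_course schema that ships alongside this module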
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_person_course.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())['person_course']

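    # load the combined CSV from GCS into BigQuery; skiprows=1 drops the
    # header row from the data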
    bqutil.load_data_to_table(dataset, table, gsfn, the_schema, format='csv', skiprows=1, project_id=output_project_id)

    msg = ''
    msg += "Combined person-course dataset, with data from:\n"
    msg += str(course_id_set)
    msg += "\n\n"
    msg += "="*100 + "\n"
    msg += "CSV download link: %s" % gsutil.gs_download_link(gsfn)

    bqutil.add_description_to_table(dataset, table, msg, append=True, project_id=output_project_id)
    
    print "Done"
    sys.stdout.flush()