Example #1
    def compute_activity_by_day(self, course_id, start=None, end="2015-01-01"):
        '''
        Compute course activity by day, based on person_course_day table
        '''
        dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest())

        ongoing = self.get_collection_metadata('ONGOING', False)
        if ongoing:
            end = datetime.datetime.now().strftime('%Y-%m-%d')
        else:
            end = self.get_collection_metadata('END_DATE', end)

        start = start or self.get_collection_metadata('START_DATE', "2012-08-20")

        logging.info('[compute_activity_by_day] start=%s, end=%s' % (start, end))

        tables = bqutil.get_list_of_table_ids(dataset)
        if 'person_course_day' not in tables:
            logging.info('--> Warning: using old *_pcday tables for activity_by_day for %s' % course_id)
            return self.OLD_compute_activity_by_day(course_id, start=start, end=end)

        sql = """
          SELECT 
              date(last_event) as date,
              sum(nevents) as nevents,
              sum(nvideo) as nvideo,
              sum(nshow_answer) as nshow_answer,
              sum(nproblem_check) as nproblem_check,
              sum(nforum) as nforum,
              sum(ntranscript) as ntranscript,
              sum(nseq_goto) as nseq_goto,
              sum(nseek_video) as nseek_video,
              sum(nprogcheck) as nprogcheck,
              sum(npause_video) as npause_video,
              sum(sum_dt) as sum_dt,
              avg(avg_dt) as avg_dt,
              sum(n_dt) as n_dt,
          FROM [{dataset}.person_course_day]
          WHERE date <= "{end}" AND  date >= "{start}"
          group by date
          order by date
        """.format(dataset=dataset, course_id=course_id, start=start, end=end)

        table = self.add_collection_name_prefix('stats_activity_by_day')
        key = None
        ret = self.cached_get_bq_table(dataset, table, sql=sql, key=key,
                                       depends_on=['%s.person_course_day' % dataset ],
                                       logger=logging.error)
        return ret
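
A minimal sketch of the end-date precedence used above (ONGOING metadata clamps the range to today, otherwise END_DATE metadata, otherwise the keyword default); resolve_end_date and the dict-backed lookup are illustrative stand-ins, not part of the example:

import datetime

def resolve_end_date(get_metadata, default_end="2015-01-01"):
    # Mirror the precedence in compute_activity_by_day: an ongoing collection
    # is clamped to "today"; otherwise END_DATE metadata (or the default) wins.
    if get_metadata('ONGOING', False):
        return datetime.datetime.now().strftime('%Y-%m-%d')
    return get_metadata('END_DATE', default_end)

meta = {'ONGOING': False, 'END_DATE': '2014-12-31'}     # made-up metadata values
end = resolve_end_date(lambda key, default=None: meta.get(key, default))
assert end == '2014-12-31'
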
Example #2
    def find_latest_person_course_table(self, dataset):
        '''
        Return the table_id for the most recent person_course table in specified dataset
        '''
        pctable = ""
        pcdate = ""
        for table in bqutil.get_list_of_table_ids(dataset):
            if not table.startswith('person_course_'):
                continue
            m = re.search('person_course_.*_(20\d\d_\d\d_\d\d_[0-9]+)', table)
            if not m:
                continue
            tdate = m.group(1)
            if tdate > pcdate:
                pcdate = tdate
                pctable = table
        return pctable
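
The latest-table selection above works because the 20YY_MM_DD_NNNN suffix sorts lexicographically in date order; a small self-contained illustration with made-up table names:

import re

tables = ['person_course_mitx_2013_12_31_235959',
          'person_course_mitx_2014_01_15_112233',
          'person_course_mitx_2014_11_02_090000',
          'some_other_table']

pctable, pcdate = "", ""
for t in tables:
    m = re.search(r'person_course_.*_(20\d\d_\d\d_\d\d_[0-9]+)', t)
    if m and m.group(1) > pcdate:
        pcdate, pctable = m.group(1), t

assert pctable == 'person_course_mitx_2014_11_02_090000'
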
Example #3
    def OLD_compute_activity_by_day(self, course_id, start="2012-08-20", end="2015-01-01"):
        '''
        Compute course activity by day, based on *_pcday tables (DEPRECATED)
        '''
        dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest())
        input_dataset = bqutil.course_id2dataset(course_id, 'pcday')
        sql = """
          SELECT 
              date(last_event) as date,
              sum(nevents) as nevents,
              sum(nvideo) as nvideo,
              sum(nshow_answer) as nshow_answer,
              sum(nproblem_check) as nproblem_check,
              sum(nforum) as nforum,
              sum(ntranscript) as ntranscript,
              sum(nseq_goto) as nseq_goto,
              sum(nseek_video) as nseek_video,
              sum(nprogcheck) as nprogcheck,
              sum(npause_video) as npause_video,
              sum(sum_dt) as sum_dt,
              avg(avg_dt) as avg_dt,
              sum(n_dt) as n_dt,
          FROM (TABLE_DATE_RANGE([{dataset}.pcday_],
                                  TIMESTAMP('{start}'),
                                  TIMESTAMP('{end}'))) 
          group by date
          order by date
        """.format(dataset=input_dataset, course_id=course_id, start=start, end=end)

        pcday_tables = bqutil.get_list_of_table_ids(input_dataset)
        last_pcday = None
        for k in pcday_tables:
            if k.startswith('pcday_'):
                if last_pcday is None or k > last_pcday:
                    last_pcday = k

        table = 'stats_activity_by_day'
        key = None
        return self.cached_get_bq_table(dataset, table, sql=sql, key=key,
                                        depends_on=['%s.%s' % (input_dataset, last_pcday),
                                                    '%s.person_course' % dataset ],
                                        logger=logging.error)
    def count_tracking_log_events(self):
        '''
        Loop over all tracking logs up to the cutoff date and sum up the number of entries,
        using table info lookups only (no SQL queries).
        '''
        if self.skip_or_do_step("count_events") < 0:
            return	# skip step

        tlend = self.end_date.replace('-', '')	# end_date normally specified as YYYY-MM-DD

        log_event_counts = {}

        # iterate over each course, one at a time
        for course_id in self.course_id_set:
            log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
            # get list of all tracking log files for this course
            log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]

            log_tables_todo = [x for x in log_tables if x[9:] <= tlend]
            log_tables_todo.sort()
            print "[count_tracking_log_events] for course %s using %d tracking log tables, from %s to %s" % (course_id, 
                                                                                                             len(log_tables_todo),
                                                                                                             log_tables_todo[0], 
                                                                                                             log_tables_todo[-1])
            sys.stdout.flush()

            # go through all log files and get size on each
            row_sizes = [ bqutil.get_bq_table_size_rows(log_dataset, x) for x in log_tables_todo ]
            
            log_event_counts[course_id] = sum(row_sizes)
            print "                         For %s found %d total tracking log events" % (course_id, log_event_counts[course_id])
            sys.stdout.flush()

        self.log_event_counts = log_event_counts
        
        self.total_events = sum(log_event_counts.values())
        print "--> Total number of events for %s = %d" % (self.org, self.total_events)
Example #5
    def find_latest_person_course_table(self, dataset, project_id=None):
        '''
        Return the table_id for the most recent person_course table in specified dataset.
        Uses default project_id if one is not specified.
        '''
        pctable = ""
        pcdate = ""

        # project_id specified?
        optargs = {}
        if project_id:
            optargs['project_id'] = project_id

        for table in bqutil.get_list_of_table_ids(dataset, **optargs):
            if not table.startswith('person_course_'):
                continue
            m = re.search('person_course_.*_(20\d\d_\d\d_\d\d_[0-9]+)', table)
            if not m:
                continue
            tdate = m.group(1)
            if tdate > pcdate:
                pcdate = tdate
                pctable = table
        return pctable
def rephrase_forum_json_for_course(course_id, gsbucket="gs://x-data", 
                                   basedir="X-Year-2-data-sql", 
                                   datedir=None, 
                                   do_gs_copy=False,
                                   use_dataset_latest=False,
                                   ):
    
    print "Loading SQL for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket, use_dataset_latest)
    
    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if not 'forum' in tables:
            print "Already done?  But no forums table loaded into datasaet %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt==0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
Example #8
def already_exists(course_id, use_dataset_latest, table="grading_policy"):
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tables = bqutil.get_list_of_table_ids(dataset)
    return table in tables
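
A plausible way to use already_exists as a guard before an expensive load step; the course_id value is made up and the load step is only indicated by a comment:

course_id = "MITx/6.002x/2013_Spring"              # hypothetical course_id
if already_exists(course_id, use_dataset_latest=False, table="grading_policy"):
    print "grading_policy already loaded for %s, skipping" % course_id
else:
    print "grading_policy not found for %s, loading it" % course_id
    # ... run the actual grading_policy load here ...
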
Example #9
def load_all_daily_logs_for_course(course_id,
                                   gsbucket="gs://x-data",
                                   verbose=True,
                                   wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.
    
    If wait=True then waits for loading jobs to be completed.  It's desirable to wait
    if subsequent jobs which need these tables (like person_day) are to be run
    immediately afterwards.
    '''

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(
        open('%s/schemas/schema_tracking_log.json' %
             mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)

    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")

    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]

    if verbose:
        print "-" * 77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-" * 77
        sys.stdout.flush()

    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size']) <= 45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace(
            '-', '')  # YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)

        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(
                    dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (
                        tablename, fn, file_date, table_date)
                    skip = False

            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (
                        tablename, fn)
                    sys.stdout.flush()
                continue

        #if date < '2014-07-27':
        #  continue

        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset,
                                        tablename,
                                        gsfn,
                                        SCHEMA,
                                        wait=wait,
                                        maxbad=1000)

    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
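
The table-name convention above (tracklog_YYYYMMDD, chosen so BigQuery table wildcards work) boils down to a regex on the file name plus stripping the dashes; the file path here is made up:

import re

fn = 'gs://x-data/MITx__6_002x__2013_Spring/DAILY/tracklog-2014-07-27.json.gz'   # made-up path
m = re.search(r'(\d\d\d\d-\d\d-\d\d)', fn)
tablename = "tracklog_%s" % m.group(1).replace('-', '')
assert tablename == 'tracklog_20140727'
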
def run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, use_dataset_latest=False, 
                               start_date=None,
                               end_date=None, 
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    Make the specified table (using the given SQL) for the specified course_id.

    The master table holds all the data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution",
    then retry with end_date set, so that only part of the date range is processed at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day overlap,
                 but make sure the SQL query only selects for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument, giving min start date for logs to process, in YYYY-MM-DD format.

    newer_than = optional datetime; if specified, any existing destination table must
    be newer than this datetime, else force_recompute is set to True
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)	# destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (len(existing or []), dataset)

    if log_dates is None:
        log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates)==0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(max(log_dates))	# remove the last day of data from consideration
        max_date = max(log_dates)
        print "         --> skip_last_day is True: dropping %s, new max_date=%s" % (old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-','')
        if min_date < start_date:
            print "        --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-','')	# end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"    	# default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset, table, startIndex=-10, maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (dataset, table)
            overwrite = True
        else:
            last_dates = [get_date_function(x) for x in pc_last['data']]
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (table, 
                                                                                                                  table_max_date,
                                                                                                                  min_date,
                                                                                                                  max_date)
                sys.stdout.flush()
                return
            min_date = (last_date + datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (table, 
                                                                                                      table_max_date,
                                                                                                      min_date,
                                                                                                      max_date)
            sys.stdout.flush()
            overwrite = 'append'
    
    if overwrite=='append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]"  % (table, course_id, min_date, max_date, last_date, datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]"  % (table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date	# save max_date information

        while min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
            if min_date not in log_dates:
                print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
                raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '    min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(log_dataset, tablename) / (1024.0*1024)
        nhashes =  int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (log_dataset, tablename, 
                                                                                                             tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")
            try:
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, 
                                       use_dataset_latest=use_dataset_latest,
                                       end_date=end_date, 
                                       get_date_function=get_date_function,
                                       existing=existing,
                                       log_dates=log_dates,
                                       has_hash_limit=has_hash_limit,
                                       table_max_size_mb=table_max_size_mb,
                                       limit_query_size=limit_query_size,
                                   )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit="")

    try:
        bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
    except Exception as err:
        if ( ('Response too large to return.' in str(err)) and has_hash_limit ):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0

            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id, DATASETS=from_datasets, last_date=last_date, hash_limit=hash_limit)
                bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                                      use_dataset_latest=use_dataset_latest,
                                                      end_date=max_date, 
                                                      get_date_function=get_date_function,
                                                      has_hash_limit=has_hash_limit,
                                                      # existing=existing,
                                                      log_dates=log_dates,
                                                      limit_query_size=True,
                                                  )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise
                
                nd1 = int(ndays/2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date + datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date, 
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(SQL, table, course_id, force_recompute=False, 
                                           use_dataset_latest=use_dataset_latest,
                                           end_date=end_date, 
                                           get_date_function=get_date_function,
                                           has_hash_limit=has_hash_limit,
                                           # existing=existing,
                                           log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return


            if False:
                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])
                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month, min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute, 
                                               use_dataset_latest=use_dataset_latest,
                                               end_date=end_date, 
                                               get_date_function=get_date_function,
                                               has_hash_limit=has_hash_limit,
                                               # existing=existing,
                                               log_dates=log_dates)
                    force_recompute = False		# after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite=='append':
        txt = '[%s] added tracking log data from %s to %s' % (datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
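
When a query overflows BigQuery limits, the code above re-runs it in several passes, each restricted to a slice of users via a hash of the username, appending each pass to the destination table; a sketch of how those hash_limit clauses are built, with a stand-in SQL template:

SQL = "SELECT ... FROM {DATASETS} WHERE time > TIMESTAMP('{last_date}') {hash_limit}"   # stand-in template
nhashes = 4
for k in range(nhashes):
    hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes, k)
    the_sql = SQL.format(DATASETS="[course_logs.tracklog_20140101]",
                         last_date="2010-10-01 01:02:03",
                         hash_limit=hash_limit)
    # first pass overwrites the destination table; later passes use overwrite="append"
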
def analyze_course_content(course_id, 
                           listings_file=None,
                           basedir="X-Year-2-data-sql", 
                           datedir="2013-09-21", 
                           use_dataset_latest=False,
                           do_upload=False,
                           courses=None,
                           verbose=True,
                           ):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out staff-only and un-shown content.
    The exclusion is based on the count of each module's appearances in the studentmodule table,
    as recorded in stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split('/',1)[0]	# extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False,
                                      format='csv', skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses])
        sql = "select * from {course_tables}".format(course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(dataset, table, sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)


        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max([ x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset, latest_person_course)
        
        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'})

        # rates for registrants before and during course
        
        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci['certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {'msbc_viewed': "viewed_median_stats_by_course",
                       'msbc_explored': 'explored_median_stats_by_course',
                       'msbc_certified': 'certified_median_stats_by_course',
                       'msbc_verified': 'verified_median_stats_by_course',
                       }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        def make_key(key):
            key = key.strip()
            key = key.replace(' ', '_').replace("'", "_").replace('/', '_').replace('(','').replace(')','').replace('-', '_').replace(',', '')
            return key

        listings_keys = map(make_key, ["Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", 
                                       "Course Number", "Short Title", "Andrew's Short Titles", "Title", 
                                       "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id",
                                       "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade",
                                       '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)"
                                       ])
        listings_keys.reverse()
        
        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = ['discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 
                          'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 
                          'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 
                          'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 
                          'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 
                          'option', 'lti', 'annotationresponse', 
                          'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 
                          'options', 'comment_prompt', 'conditional', 
                          'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 
                          'customtag', 'transcript', 
                          'split_test', 'word_cloud', 
                          'openended', 'openendedparam', 'answer_display', 'code', 
                          'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 
                          'textannotation', 'imageannotation', 'videosequence', 
                          'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion']

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith('listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [ json.loads(x) for x in open(listings_file) ]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read())
        schema_dict = { x['name'] : x for x in the_schema }

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in list(entry):  # copy the keys first, since entries may be popped below
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False)

        return

    
    print "-"*60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    for k in csv.DictReader(open(lfn)):
        if k['course_id']==course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)
    
    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (xbfn, course_id)
        return

    print "[analyze_content] For %s using %s" % (course_id, xbfn)
    
    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()
    
    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = ['html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong',
              'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata',
              'grading_policy', 'br', 'center',  'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 
              'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title']

    def walk_tree(elem):
        if  type(elem.tag)==str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    print "    -> excluding %s (%s), ncount=%s" % (k.get('display_name', '<no_display_name>').encode('utf8'), 
                                                                   midfrag, 
                                                                   mudata.get(midfrag, {}).get('ncount'))
                continue
            walk_tree(k)

    walk_tree(xml)
    print counts

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = ['openassessment', 'optionresponse', 'multiplechoiceresponse', 
                 # 'discussion', 
                 'choiceresponse', 'word_cloud', 
                 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti']
    quant_axis = ['formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse',
                  'imageresponse']

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count
    
    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k
    
    ccd[course_id] = {'course_id': course_id,
                      'nweeks': nweeks,
                      'nqual_per_week': nqual_per_week,
                      'nquant_per_week': nquant_per_week,
                      'total_assessments_per_week' : total_per_week,
                      }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = ['course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks']
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update({ ('listings_%s' % key) : value for key, value in data.items() })	# from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    for key in sorted(counts):	# store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] =  str(value) 	# from content counts

    cmfields.update({ ('nexcluded_sub_20_%s' % key) : str(value) for key, value in nexcluded.items() })	# from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id']==course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'], ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    
    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})
        
    fp.close()

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table)

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
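The block above writes course metainfo in long (course_id, key, value) form and then lets rows from a CMINFO_OVERRIDES file win over the computed values. A minimal sketch of that shape, with a hypothetical course_id and made-up values:

import csv

def write_course_metainfo(csvfn, course_id, cmfields, overrides=None):
    '''Write cmfields as long-format (course_id, key, value) rows; override values win over computed ones.'''
    for key, value in (overrides or {}).items():
        cmfields[key] = value
    with open(csvfn, 'w') as fp:
        cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
        cdw.writeheader()
        for k, v in cmfields.items():
            cdw.writerow({'course_id': course_id, 'key': k, 'value': v})

# hypothetical values, just to show the row shape
write_course_metainfo('course_metainfo.csv', 'ORGX/Course101/2015_Spring',
                      {'course_length_days': '91', 'nweeks': '13.0', 'count_problem': '214'},
                      overrides={'nweeks': '12.0'})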
Example #12
0
    def compute_enrollment_by_day(self, course_id, start="2012-08-20", end="2115-01-01"):
        '''
        Compute enrollment by day, based on enrollday_* tables
        '''
        dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest())	# where to store result
        input_dataset = bqutil.course_id2dataset(course_id, 'pcday')				# source data

        ongoing = self.get_collection_metadata('ONGOING', False)
        if ongoing:
            end = datetime.datetime.now().strftime('%Y-%m-%d')
        else:
            end = self.get_collection_metadata('END_DATE', end)

        start = self.get_collection_metadata('START_DATE', start)

        sql_enrollday = """
          SELECT 
              '{course_id}' as course_id,
              date,
              nenroll,
              sum(nenroll) over(order by date) as nenroll_cum,
          FROM (
                  SELECT 
                      date(time) as date,
                      sum(diff_enrollment) as nenroll,
                  FROM (
                   # TABLE_DATE_RANGE([{dataset}.enrollday_],
                   #                       TIMESTAMP('{start}'),
                   #                       TIMESTAMP('{end}'))) 
                   TABLE_QUERY({dataset}, 
                       "integer(regexp_extract(table_id, r'enrollday_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
                  group by date
                  order by date
        )
        group by date, nenroll
        order by date
        """.format(dataset=input_dataset, course_id=course_id, 
                   start=start.replace('-',''), 
                   end=end.replace('-',''))

        sql_enrollday2 = """
          SELECT 
              '{course_id}' as course_id,
              date,
              nenroll,
              sum(nenroll) over(order by date) as nenroll_cum,
          FROM (
                  SELECT 
                      date(time) as date,
                      sum(diff_enrollment_honor) as nenroll_honor,
                      sum(diff_enrollment_audit) as nenroll_audit,
                      sum(diff_enrollment_verified) as nenroll_verified,
        	      # and, for backwards compatibility with old enrollday_* :
                      sum(diff_enrollment_honor) + sum(diff_enrollment_audit) + sum(diff_enrollment_verified) as nenroll,
                  FROM (
                   # TABLE_DATE_RANGE([{dataset}.enrollday2_],
                   #                       TIMESTAMP('{start}'),
                   #                       TIMESTAMP('{end}'))) 
                   TABLE_QUERY({dataset}, 
                       "integer(regexp_extract(table_id, r'enrollday2_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
                  group by date
                  order by date
        )
        group by date, nenroll
        order by date
        """.format(dataset=input_dataset, course_id=course_id, 
                   start=start.replace('-',''), 
                   end=end.replace('-',''))

        sql_enrollday_all = """
          SELECT 
              '{course_id}' as course_id,
              date,
              nenroll,
              sum(nenroll) over(order by date) as nenroll_cum,
          FROM (
                  SELECT 
                      date(time) as date,
                      sum(diff_enrollment_honor) as nenroll_honor,
                      sum(diff_enrollment_audit) as nenroll_audit,
                      sum(diff_enrollment_verified) as nenroll_verified,
                      sum(diff_enrollment_honor) + sum(diff_enrollment_audit) + sum(diff_enrollment_verified) as nenroll,
                  FROM {dataset}.enrollday_all
                  WHERE date(time) <= "{end}" AND  date(time) >= "{start}"
                  group by date
                  order by date
        )
        group by date, nenroll
        order by date
        """.format(dataset=dataset, course_id=course_id, start=start, end=end)

        # special handling: use new enrollday_all tables if available, instead of enrollday* in *_pcday dataset
        tables = bqutil.get_list_of_table_ids(dataset)
        if 'enrollday_all' in tables:
            sql = sql_enrollday_all
            logging.info('[compute_enrollment_by_day] using enrollday_all for %s' % course_id)
            depends_on = [ "%s.enrollday_all" % dataset ]
        else:
            # old special handling: use new enrollday2_* tables if available, instead of enrollday_* 
            tables = bqutil.get_list_of_table_ids(input_dataset)
            prefixes = [x.split('_')[0] for x in tables]
            if 'enrollday2' in prefixes:
                sql = sql_enrollday2
                logging.info('[compute_enrollment_by_day] using enrollday2 for %s' % course_id)
                tpre = 'enrollday2'
            else:
                sql = sql_enrollday
                tpre = 'enrollday'
    
            latest_table = None
            for k in tables:
                if k.startswith('%s_' % tpre):
                    if latest_table is None or k > latest_table:
                        latest_table = k
            depends_on = ['%s.%s' % (input_dataset, latest_table)]

        logging.info('enrollment_day depends_on=%s' % depends_on)
        table = self.add_collection_name_prefix('stats_enrollment_by_day')
        key = None
        return self.cached_get_bq_table(dataset, table, sql=sql, key=key,
                                        depends_on=depends_on,
                                        logger=logging.error, ignore_cache=False)
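A condensed sketch of the source-selection fallback used above (table ids are illustrative): prefer a single enrollday_all table in the course dataset, else the dated enrollday2_* tables in the *_pcday dataset, else the legacy enrollday_* tables, depending on what exists.

def pick_enrollment_source(course_tables, pcday_tables):
    '''Return (variant, depends_on) given the table ids present in the two datasets.'''
    if 'enrollday_all' in course_tables:
        return 'enrollday_all', ['enrollday_all']
    prefix = 'enrollday2' if any(t.startswith('enrollday2_') for t in pcday_tables) else 'enrollday'
    dated = sorted(t for t in pcday_tables if t.startswith(prefix + '_'))
    return prefix, dated[-1:]    # depend on the most recent dated table, if any exists

# illustrative table ids
variant, depends_on = pick_enrollment_source(['person_course_day'],
                                             ['enrollday2_20150101', 'enrollday2_20150102'])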
Example #13
0
def run_query_on_tracking_logs(SQL,
                               table,
                               course_id,
                               force_recompute=False,
                               use_dataset_latest=False,
                               start_date=None,
                               end_date=None,
                               get_date_function=None,
                               existing=None,
                               log_dates=None,
                               days_delta=1,
                               skip_last_day=False,
                               has_hash_limit=False,
                               newer_than=None,
                               table_max_size_mb=800,
                               limit_query_size=False):
    '''
    make a certain table (with SQL given) for specified course_id.

    The master table holds all the data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.

    NOTE: the SQL must produce a result which is ordered by date, in increasing order.

    days_delta = integer number of days to increase each time; specify 0 for one day overlap,
                 but make sure the SQL query only selects for time > TIMESTAMP("{last_date}")

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.

    start_date = optional argument, giving min start date for logs to process, in YYYY-MM-DD format.

    newer_than = if specified, as datetime, then any existing destination table must
    be newer than this datetime, else force_recompute is made True
    '''

    dataset = bqutil.course_id2dataset(
        course_id, use_dataset_latest=use_dataset_latest)  # destination
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")

    if existing is None:
        existing = bqutil.get_list_of_table_ids(dataset)
        print "[run_query_on_tracking_logs] got %s existing tables in dataset %s" % (
            len(existing or []), dataset)

    if log_dates is None:
        log_tables = [
            x for x in bqutil.get_list_of_table_ids(log_dataset)
            if x.startswith('tracklog_20')
        ]
        log_dates = [x[9:] for x in log_tables]
        log_dates.sort()

    if len(log_dates) == 0:
        print "--> no tracking logs in %s aborting!" % (log_dataset)
        return

    if skip_last_day:
        old_max_date = max(log_dates)
        log_dates.remove(
            max(log_dates))  # remove the last day of data from consideration
        max_date = max(log_dates)
        print "         --> skip_last_day is True: dropping %s, new max_date=%s" % (
            old_max_date, max_date)
        sys.stdout.flush()

    min_date = min(log_dates)
    max_date = max(log_dates)

    if start_date:
        start_date = start_date.replace('-', '')
        if min_date < start_date:
            print "        --> logs start at %s, but that is before start_date, so using min_date=start_date=%s" % (
                min_date, start_date)
            min_date = start_date

    if end_date is not None:
        print "[run_query_on_tracking_logs] %s: min_date=%s, max_date=%s, using end_date=%s for max_date cutoff" % (
            table, min_date, max_date, end_date)
        sys.stdout.flush()
        the_end_date = end_date.replace('-',
                                        '')  # end_date should be YYYY-MM-DD
        if the_end_date < max_date:
            max_date = the_end_date

    if (table in existing) and newer_than:
        # get date on existing table
        table_datetime = bqutil.get_bq_table_last_modified_datetime(
            dataset, table)
        if (table_datetime < newer_than):
            print "[run_query_on_tracking_logs] existing %s.%s table last modified on %s, older than newer_than=%s, forcing recompute!" % (
                dataset, table, table_datetime, newer_than)
            force_recompute = True

    if force_recompute:
        overwrite = True
    else:
        overwrite = False

    last_date = "2010-10-01 01:02:03"  # default last date

    if (not overwrite) and table in existing:
        # find out what the end date is of the current table
        pc_last = bqutil.get_table_data(dataset,
                                        table,
                                        startIndex=-10,
                                        maxResults=100)
        if (pc_last is None):
            print "--> no data in table %s.%s, starting from scratch!" % (
                dataset, table)
            overwrite = True
        else:
            last_dates = []
            for x in pc_last['data']:
                try:
                    add_date = get_date_function(x)
                    last_dates.append(add_date)
                except Exception as err:
                    print "Error with get_date_function occurred. %s" % str(
                        err)
                    continue
            last_date = max(last_dates)
            table_max_date = last_date.strftime('%Y%m%d')
            if max_date <= table_max_date:
                print '--> %s already exists, max_date=%s, but tracking log data min=%s, max=%s, nothing new!' % (
                    table, table_max_date, min_date, max_date)
                sys.stdout.flush()
                return
            min_date = (last_date +
                        datetime.timedelta(days=days_delta)).strftime('%Y%m%d')
            print '--> %s already exists, max_date=%s, adding tracking log data from %s to max=%s' % (
                table, table_max_date, min_date, max_date)
            sys.stdout.flush()
            overwrite = 'append'

    if overwrite == 'append':
        print "Appending to %s table for course %s (start=%s, end=%s, last_date=%s) [%s]" % (
            table, course_id, min_date, max_date, last_date,
            datetime.datetime.now())
    else:
        print "Making new %s table for course %s (start=%s, end=%s) [%s]" % (
            table, course_id, min_date, max_date, datetime.datetime.now())
    sys.stdout.flush()

    if limit_query_size:
        # do only one day's tracking logs, and force use of hash if table is too large
        print '--> limiting query size, so doing only one day at a time, and checking tracking log table size as we go (max=%s MB)' % table_max_size_mb
        the_max_date = max_date  # save max_date information

        while min_date not in log_dates:
            print "  tracklog_%s does not exist!" % min_date
            for ld in log_dates:
                if ld < min_date:
                    continue
                if (ld > min_date) and (ld <= max_date):
                    min_date = ld
                    break
        if min_date not in log_dates:
            print "--> ERROR! Cannot find tracking log file for %s, aborting!" % min_date
            raise Exception("[process_tracking_logs] missing tracking log")

        max_date = min_date
        print '    min_date = max_date = %s' % min_date
        tablename = 'tracklog_%s' % min_date.replace('-', '')
        tablesize_mb = bqutil.get_bq_table_size_bytes(
            log_dataset, tablename) / (1024.0 * 1024)
        nhashes = int(math.ceil(tablesize_mb / table_max_size_mb))
        from_datasets = "[%s.%s]" % (log_dataset, tablename)
        print "--> table %s.%s size %s MB > max=%s MB, using %d hashes" % (
            log_dataset, tablename, tablesize_mb, table_max_size_mb, nhashes)
        sys.stdout.flush()

        if nhashes:
            if not has_hash_limit:
                print "--> ERROR! table %s.%s size %s MB > max=%s MB, but no hash_limit in SQL available" % (
                    log_dataset, tablename, tablesize_mb, table_max_size_mb)
                print "SQL: ", SQL
                raise Exception("[process_tracking_logs] table too large")

            for k in range(nhashes):
                hash_limit = "AND ABS(HASH(username)) %% %d = %d" % (nhashes,
                                                                     k)
                the_sql = SQL.format(course_id=course_id,
                                     DATASETS=from_datasets,
                                     last_date=last_date,
                                     hash_limit=hash_limit)
                print "   Hash %d" % k
                sys.stdout.flush()
                try:
                    bqutil.create_bq_table(dataset,
                                           table,
                                           the_sql,
                                           wait=True,
                                           overwrite=overwrite,
                                           allowLargeResults=True)
                except Exception as err:
                    print the_sql
                    raise
                overwrite = "append"
        else:
            the_sql = SQL.format(course_id=course_id,
                                 DATASETS=from_datasets,
                                 last_date=last_date,
                                 hash_limit="")
            try:
                bqutil.create_bq_table(dataset,
                                       table,
                                       the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
            except Exception as err:
                print the_sql
                raise

        txt = '[%s] added tracking log data from %s' % (
            datetime.datetime.now(), tablename)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

        print "----> Done with day %s" % max_date

        if the_max_date > max_date:
            # more days still to be done
            print "--> Moving on to another day (max_date=%s)" % the_max_date
            run_query_on_tracking_logs(
                SQL,
                table,
                course_id,
                force_recompute=False,
                use_dataset_latest=use_dataset_latest,
                end_date=end_date,
                get_date_function=get_date_function,
                existing=existing,
                log_dates=log_dates,
                has_hash_limit=has_hash_limit,
                table_max_size_mb=table_max_size_mb,
                limit_query_size=limit_query_size,
            )
        return

    from_datasets = """(
                  TABLE_QUERY({dataset},
                       "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start} and {end}"
                     )
                  )
         """.format(dataset=log_dataset, start=min_date, end=max_date)

    the_sql = SQL.format(course_id=course_id,
                         DATASETS=from_datasets,
                         last_date=last_date,
                         hash_limit="")

    try:
        bqutil.create_bq_table(dataset,
                               table,
                               the_sql,
                               wait=True,
                               overwrite=overwrite,
                               allowLargeResults=True)
    except Exception as err:
        if (('Response too large to return.' in str(err)) and has_hash_limit):
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0

            for k in range(4):
                hash_limit = "AND ABS(HASH(username)) %% 4 = %d" % k
                the_sql = SQL.format(course_id=course_id,
                                     DATASETS=from_datasets,
                                     last_date=last_date,
                                     hash_limit=hash_limit)
                bqutil.create_bq_table(dataset,
                                       table,
                                       the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
                overwrite = "append"

        elif ('Resources exceeded during query execution' in str(err)):
            if True:
                # figure out time interval in days, and split that in half
                start_date = datetime.datetime.strptime(min_date, '%Y%m%d')
                end_date = datetime.datetime.strptime(max_date, '%Y%m%d')
                ndays = (end_date - start_date).days
                if (ndays < 1) and has_hash_limit:
                    print "----> ndays=%d; retrying with limit_query_size" % ndays
                    sys.stdout.flush()
                    return run_query_on_tracking_logs(
                        SQL,
                        table,
                        course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest,
                        end_date=max_date,
                        get_date_function=get_date_function,
                        has_hash_limit=has_hash_limit,
                        # existing=existing,
                        log_dates=log_dates,
                        limit_query_size=True,
                    )
                elif (ndays < 1):
                    print "====> ERROR with resources exceeded during query execution; ndays=%d, cannot split -- ABORTING!" % ndays
                    raise

                nd1 = int(ndays / 2)
                nd2 = ndays - nd1
                #if nd2 > nd1:
                #    nd1 = nd2
                #    nd2 = ndays - nd1
                print "====> ERROR with resources exceeded during query execution; re-trying based on splitting %d days into %d + %d days" % (
                    ndays, nd1, nd2)
                sys.stdout.flush()

                end_date = (start_date +
                            datetime.timedelta(days=nd1)).strftime('%Y%m%d')
                print "--> part 1 with %d days (end_date=%s)" % (nd1, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(
                    SQL,
                    table,
                    course_id,
                    force_recompute=force_recompute,
                    use_dataset_latest=use_dataset_latest,
                    end_date=end_date,
                    get_date_function=get_date_function,
                    has_hash_limit=has_hash_limit,
                    # existing=existing,
                    log_dates=log_dates)

                end_date = max_date
                print "--> part 2 with %d days (end_date=%s)" % (nd2, end_date)
                sys.stdout.flush()
                run_query_on_tracking_logs(
                    SQL,
                    table,
                    course_id,
                    force_recompute=False,
                    use_dataset_latest=use_dataset_latest,
                    end_date=end_date,
                    get_date_function=get_date_function,
                    has_hash_limit=has_hash_limit,
                    # existing=existing,
                    log_dates=log_dates)
                print "--> Done with %d + %d days!" % (nd1, nd2)
                return

            if False:

                def get_ym(x):
                    return int(x[0:4]), int(x[4:6]), int(x[6:])

                (min_year, min_month, min_day) = get_ym(min_date)
                (max_year, max_month, max_day) = get_ym(max_date)
                nmonths = max_month - min_month + 12 * (max_year - min_year)
                print "====> ERROR with resources exceeded during query execution; re-trying based on one month's data at a time"
                sys.stdout.flush()
                (end_year, end_month) = (min_year, min_month)
                for dm in range(nmonths):
                    end_month += 1
                    if end_month > 12:
                        end_month = 1
                        end_year += 1
                    end_date = "%04d-%02d-%02d" % (end_year, end_month,
                                                   min_day)
                    print "--> with end_date=%s" % end_date
                    sys.stdout.flush()
                    run_query_on_tracking_logs(
                        SQL,
                        table,
                        course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest,
                        end_date=end_date,
                        get_date_function=get_date_function,
                        has_hash_limit=has_hash_limit,
                        # existing=existing,
                        log_dates=log_dates)
                    force_recompute = False  # after first, don't force recompute
                return
        else:
            print the_sql
            raise

    if overwrite == 'append':
        txt = '[%s] added tracking log data from %s to %s' % (
            datetime.datetime.now(), min_date, max_date)
        bqutil.add_description_to_table(dataset, table, txt, append=True)

    print "Done with course %s (end %s)" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
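A hypothetical caller's sketch of the contract run_query_on_tracking_logs expects from its SQL template: the template is formatted with {course_id}, {DATASETS}, {last_date} and {hash_limit}, should only select events newer than last_date, and must return rows ordered by date so incremental appends stay consistent. The column names below are assumptions about the tracking-log schema, not taken from this module.

DAILY_EVENTS_SQL = """
    SELECT
        "{course_id}" as course_id,
        date(time) as date,
        count(*) as nevents,
    FROM {DATASETS}
    WHERE time > TIMESTAMP("{last_date}")
          {hash_limit}
    GROUP BY course_id, date
    ORDER BY date
"""

# roughly how the placeholders get filled inside run_query_on_tracking_logs
example_sql = DAILY_EVENTS_SQL.format(
    course_id='ORGX/Course101/2015_Spring',
    DATASETS='[orgx_course101_2015_spring_logs.tracklog_20150101]',
    last_date='2010-10-01 01:02:03',
    hash_limit='AND ABS(HASH(username)) % 4 = 0',
)
# get_date_function would then pull a datetime back out of each returned row, e.g.
# lambda row: datetime.datetime.strptime(row['date'], '%Y-%m-%d')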
Example #14
0
def already_exists(course_id, use_dataset_latest=False):
    table = "course_axis"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tables = bqutil.get_list_of_table_ids(dataset)
    return table in tables
Example #15
0
def already_exists(course_id, use_dataset_latest, table="user_partitions"):
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tables = bqutil.get_list_of_table_ids(dataset)
    return table in tables
Example #16
0
    def __init__(self, course_id_set, output_project_id=None, nskip=0, 
                 output_dataset_id=None, 
                 output_bucket=None,
                 use_dataset_latest=False,
                 only_step=None,
                 end_date=None,
                 ):
        '''
        Compute course report tables, based on combination of all person_course and other individual course tables.

        only_step: specify a single course report step to be executed; runs all reports, if None
        '''
        
        if only_step and ',' in only_step:
            only_step = only_step.split(',')
        self.only_step = only_step

        self.end_date = end_date

        if not course_id_set:
            print "ERROR! Must specify list of course_id's for report.  Aborting."
            return

        org = course_id_set[0].split('/',1)[0]	# extract org from first course_id
        self.org = org

        self.output_project_id = output_project_id

        crname = ('course_report_%s' % org)
        if use_dataset_latest:
            crname = 'course_report_latest'
        self.dataset = output_dataset_id or crname

        self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket)
        self.course_id_set = course_id_set

        course_datasets = [ bqutil.course_id2dataset(x, use_dataset_latest=use_dataset_latest) for x in course_id_set]

        # check to see which datasets have person_course tables
        datasets_with_pc = []
        self.all_pc_tables = OrderedDict()
        self.all_pcday_ip_counts_tables = OrderedDict()
        self.all_uic_tables = OrderedDict()
        self.all_tott_tables = OrderedDict()
        for cd in course_datasets:
            try:
                table = bqutil.get_bq_table_info(cd, 'person_course')
            except Exception as err:
                print "[make-course_report_tables] Err: %s" % str(err)
                table = None
            if table is not None:
                self.all_pc_tables[cd] = table
                datasets_with_pc.append(cd)

            try:
                table = bqutil.get_bq_table_info(cd, 'pcday_ip_counts')
            except Exception as err:
                table = None
            if table is not None:
                self.all_pcday_ip_counts_tables[cd] = table

            try:
                table = bqutil.get_bq_table_info(cd, 'user_info_combo')
            except Exception as err:
                table = None
            if table is not None:
                self.all_uic_tables[cd] = table

            try:
                table = bqutil.get_bq_table_info(cd, 'time_on_task_totals')
            except Exception as err:
                print "[make-course_report_tables] Err: %s" % str(err)
                table = None
            if table is not None:
                self.all_tott_tables[cd] = table

        pc_tables = ',\n'.join(['[%s.person_course]' % x for x in datasets_with_pc])
        pcday_ip_counts_tables = ',\n'.join(['[%s.pcday_ip_counts]' % x for x in self.all_pcday_ip_counts_tables])
        uic_tables = ',\n'.join(['[%s.user_info_combo]' % x for x in self.all_uic_tables])
        tott_tables = ',\n'.join(['[%s.time_on_task_totals]' % x for x in self.all_tott_tables])

        print "%d time_on_task tables: %s" % (len(self.all_tott_tables), tott_tables)
        sys.stdout.flush()

        # find latest combined person_course table
        cpc_tables = [ x for x in bqutil.get_list_of_table_ids(self.dataset) if x.startswith("person_course_") ]
        if cpc_tables:
            the_cpc_table = "[%s.%s]" % (self.dataset, max(cpc_tables))
        else:
            the_cpc_table = None
        print "[make_course_report_tables] ==> Using %s as the latest combined person_course table" % the_cpc_table

        self.parameters = {'dataset': self.dataset,
                           'pc_tables': pc_tables,
                           'uic_tables': uic_tables,
                           'tott_tables': tott_tables,
                           'pcday_ip_counts_tables': pcday_ip_counts_tables,
                           'combined_person_course': the_cpc_table,
                           }
        print "[make_course_report_tables] ==> Using these datasets (with person_course tables): %s" % datasets_with_pc

        self.course_datasets = course_datasets
    
        print "="*100
        print "Generating course report tables -> dataset=%s, project=%s" % (self.dataset, self.output_project_id)
        sys.stdout.flush()

        bqutil.create_dataset_if_nonexistent(self.dataset, project_id=output_project_id)

        self.nskip = nskip
        if 1:
            self.combine_show_answer_stats_by_course()
            self.make_totals_by_course()
            self.make_medians_by_course()
            self.make_table_of_email_addresses()
            self.make_global_modal_ip_table()
            self.make_enrollment_by_day()
            self.make_time_on_task_stats_by_course()
            self.make_total_populations_by_course()
            self.make_table_of_n_courses_registered()
            self.make_geographic_distributions()
            # self.count_tracking_log_events()
            self.make_overall_totals()
    
        print "="*100
        print "Done with course report tables"
        sys.stdout.flush()
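A small sketch (step names are hypothetical) of how the only_step value prepared above could gate individual report steps: after the comma split it is either None, a single string, or a list, so one membership-style test covers all three forms.

def step_enabled(only_step, name):
    '''True if the named report step should run, given only_step as stored in __init__.'''
    if not only_step:
        return True
    if isinstance(only_step, list):
        return name in only_step
    return only_step == name

# illustrative checks
assert step_enabled(None, 'make_totals_by_course')
assert step_enabled('make_totals_by_course,make_overall_totals'.split(','), 'make_overall_totals')
assert not step_enabled('make_geographic_distributions', 'make_overall_totals')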
Example #17
0
    def make_table(self, pcds_table=None, org=None, nskip=0):
        '''
        Get the specified person_course table.  Find IP addresses for which the country code is missing.
        Get country codes for each of those IP addresses using a local copy of the maxmind geoip database.
        Store result in bigquery geoip table.
    
        Does not overwrite existing bigquery geoip table - adds new entries.
        '''

        if pcds_table is None:
            if org is None:
                print "Error!  Must specify either --table or --org"
                return
            dataset = 'course_report_' + org

            pctables = []
            for table in bqutil.get_list_of_table_ids(dataset):
                m = re.match('person_course_%s_([\d_]+)$' % org, table)
                if m:
                    pctables.append(table)
            pctables.sort()
            if not pctables:
                print "Error!  No person_course_%s_* tables found in dataset %s!" % (
                    org, dataset)
                return
            pctable = pctables[-1]
        else:
            (dataset, pctable) = pcds_table.split('.', 1)

        print "[make_geoip_table] Using person course from %s.%s" % (dataset,
                                                                     pctable)

        if nskip <= 0:
            pimc_table = "%s_ip_no_cc" % pctable
            sql = """SELECT ip, count(*) as n
                     FROM [{dataset}.{table}] 
                     where cc_by_ip is Null
                     group by ip
                  """.format(dataset=dataset, table=pctable)

            noips = bqutil.get_bq_table(dataset, pimc_table, sql)

            print "%d IP addresses missing geoip information in %s" % (len(
                noips['data']), pimc_table)
            # print noips['data'][:10]
            sys.stdout.flush()

            self.load_geoip()

            for entry in noips['data']:
                ip = entry['ip']
                if ip is None:
                    continue
                if ip in self.geoipdat:
                    # print "--> Already have geoip for %s, skipping" % ip
                    continue

                self.lookup_ip(ip)

                if (self.nchanged % 100 == 0):
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    break    # note: exits the lookup loop as soon as nchanged is a multiple of 100

            print "Added %d new geoip entries" % self.nchanged
            sys.stdout.flush()
        else:
            nskip -= 1
        self.write_geoip_table()

        print "--> Done"
        sys.stdout.flush()
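A stripped-down sketch of the enrichment loop in make_table above; geoip_cache, lookup_country, and the IP values are placeholders rather than the real maxmind interface. Only IPs not already cached get looked up, so repeated runs add new entries without overwriting existing ones.

def enrich_missing_ips(missing_ips, geoip_cache, lookup_country):
    '''Look up country codes for IPs missing geoip data; return the number of new entries added.'''
    nchanged = 0
    for ip in missing_ips:
        if not ip or ip in geoip_cache:
            continue                      # skip blanks and IPs we already have geoip data for
        cc = lookup_country(ip)
        if cc is not None:
            geoip_cache[ip] = {'ip': ip, 'cc': cc}
            nchanged += 1
    return nchanged

# illustrative run against a fake lookup table
fake_db = {'198.51.100.7': 'US'}
cache = {}
nnew = enrich_missing_ips(['198.51.100.7', '198.51.100.7', None], cache, fake_db.get)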
Example #18
0
def analyze_course_content(
    course_id,
    listings_file=None,
    basedir="X-Year-2-data-sql",
    datedir="2013-09-21",
    use_dataset_latest=False,
    do_upload=False,
    courses=None,
    verbose=True,
    pin_date=None,
):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out staff-only and un-shown content.
    Do the exclusion based on the count of each module appearing in the studentmodule table,
    as recorded in stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split(
                '/', 1)[0]  # extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (
                tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset,
                                      tableid,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False,
                                      format='csv',
                                      skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([
            ('[%s.course_metainfo]' % bqutil.course_id2dataset(x))
            for x in courses
        ])
        sql = "select * from {course_tables}".format(
            course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(
                dataset,
                table,
                sql=sql,
                newer_than=datetime.datetime(2015, 1, 16, 3, 0),
            )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)

        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max(
            [x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset,
                                                          latest_person_course)

        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset,
                                         'nregistered_by_wrap',
                                         sql=sql,
                                         key={'name': 'course_id'})

        # rates for registrants before and during course

        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset,
                                          'stats_cert_rates_by_registration',
                                          sql=sql,
                                          newer_than=datetime.datetime(
                                              2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(
                entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci[
                    'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {
            'msbc_viewed': "viewed_median_stats_by_course",
            'msbc_explored': 'explored_median_stats_by_course',
            'msbc_certified': 'certified_median_stats_by_course',
            'msbc_verified': 'verified_median_stats_by_course',
        }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        listings_keys = map(make_key, [
            "Institution", "Semester", "New or Rerun",
            "Andrew Recodes New/Rerun", "Course Number", "Short Title",
            "Andrew's Short Titles", "Title", "Instructors",
            "Registration Open", "Course Launch", "Course Wrap", "course_id",
            "Empirical Course Wrap", "Andrew's Order", "certifies",
            "MinPassGrade", '4-way Category by name',
            "4-way (CS, STEM, HSocSciGov, HumHistRel)"
        ])
        listings_keys.reverse()

        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = [
            'discussion', 'problem', 'optionresponse', 'checkboxgroup',
            'optioninput', 'choiceresponse', 'video', 'choicegroup',
            'vertical', 'choice', 'sequential', 'multiplechoiceresponse',
            'numericalresponse', 'chapter', 'solution', 'img',
            'formulaequationinput', 'responseparam', 'selfassessment', 'track',
            'task', 'rubric', 'stringresponse', 'combinedopenended',
            'description', 'textline', 'prompt', 'category', 'option', 'lti',
            'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt',
            'comment', 'annotationinput', 'image', 'options', 'comment_prompt',
            'conditional', 'answer', 'poll_question', 'section', 'wrapper',
            'map', 'area', 'customtag', 'transcript', 'split_test',
            'word_cloud', 'openended', 'openendedparam', 'answer_display',
            'code', 'drag_and_drop_input', 'customresponse', 'draggable',
            'mentoring', 'textannotation', 'imageannotation', 'videosequence',
            'feedbackprompt', 'assessments', 'openassessment', 'assessment',
            'explanation', 'criterion'
        ]

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith(
                    'listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (
                    course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [json.loads(x) for x in open(listings_file)]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_combined_course_summary_stats.json' %
                 mypath).read())
        schema_dict = {x['name']: x for x in the_schema}

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in entry.keys():	# iterate over a list copy of the keys (Python 2), since keys may be popped below
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset,
                                      css_table,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False)

        return

    print "-" * 60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    if listings_file.endswith('.json'):
        data_feed = map(json.loads, open(lfn))
    else:
        data_feed = csv.DictReader(open(lfn))
    for k in data_feed:
        if not 'course_id' in k:
            print "Strange course listings row, no course_id in %s" % k
            raise Exception("Missing course_id")
        if k['course_id'] == course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (
            course_id, lfn)
        return

    def date_parse(field):
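        # course listings dates are month/day/year strings, e.g. "9/5/2013"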
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    if pin_date:
        datedir = pin_date
    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest and not pin_date)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)

    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (
            xbfn, course_id)

        if use_dataset_latest:
            # try looking in earlier directories for xbundle file
            import glob
            spath = course_dir / ("../*/xbundle_%s.xml" % cfn)
            files = list(glob.glob(spath))
            if files:
                xbfn = path(files[-1])
            if not xbfn.exists():
                print "   --> also cannot find any %s ; aborting!" % spath
            else:
                print "   --> Found and using instead: %s " % xbfn
        if not xbfn.exists():
            raise Exception("[analyze_content] missing xbundle file %s" % xbfn)

    # if there is an xbundle*.fixed file, use that instead of the normal one
    if os.path.exists(str(xbfn) + ".fixed"):
        xbfn = path(str(xbfn) + ".fixed")

    print "[analyze_content] For %s using %s" % (course_id, xbfn)

    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir,
                                    use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()

    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = [
        'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1',
        'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike',
        'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's',
        'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki',
        'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4',
        'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p',
        'P', 'TABLE', 'TD', 'small', 'text', 'title'
    ]

    problem_stats = defaultdict(int)

    def does_problem_have_random_script(problem):
        '''
        return 1 if problem has a script with "random." in it
        else return 0
        '''
        for elem in problem.findall('.//script'):
            if elem.text and ('random.' in elem.text):
                return 1
        return 0

    # walk through xbundle
    def walk_tree(elem, policy=None):
        '''
        Walk XML tree recursively.
        elem = current element
        policy = dict of attributes for children to inherit, with fields like due, graded, showanswer
        '''
        policy = policy or {}
        if type(elem.tag) == str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        if elem.tag in [
                "sequential", "problem", "problemset", "course", "chapter"
        ]:  # very old courses may use inheritance from course & chapter
            keys = ["due", "graded", "format", "showanswer", "start"]
            for k in keys:  # copy inheritable attributes, if they are specified
                val = elem.get(k)
                if val:
                    policy[k] = val
        if elem.tag == "problem":  # accumulate statistics about problems: how many have show_answer = [past_due, closed] ?  have random. in script?
            problem_stats['n_capa_problems'] += 1
            if policy.get('showanswer'):
                problem_stats["n_showanswer_%s" %
                              policy.get('showanswer')] += 1
            else:
                problem_stats[
                    'n_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)
                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118
                # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed.
            problem_stats[
                'n_random_script'] += does_problem_have_random_script(elem)

            if policy.get('graded') == 'true' or policy.get(
                    'graded') == 'True':
                problem_stats['n_capa_problems_graded'] += 1
                problem_stats[
                    'n_graded_random_script'] += does_problem_have_random_script(
                        elem)
                if policy.get('showanswer'):
                    problem_stats["n_graded_showanswer_%s" %
                                  policy.get('showanswer')] += 1
                else:
                    problem_stats[
                        'n_graded_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)

        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
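            # skip children whose module-usage count (ncount) is below 20, so
            # rarely-accessed modules don't inflate the content counts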
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    try:
                        print "    -> excluding %s (%s), ncount=%s" % (
                            k.get('display_name',
                                  '<no_display_name>').encode('utf8'), midfrag,
                            mudata.get(midfrag, {}).get('ncount'))
                    except Exception as err:
                        print "    -> excluding ", k
                continue
            walk_tree(k, policy.copy())

    walk_tree(xml)
    print "--> Count of individual element tags throughout XML: ", counts

    print "--> problem_stats:", json.dumps(problem_stats, indent=4)

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = [
        'openassessment',
        'optionresponse',
        'multiplechoiceresponse',
        # 'discussion',
        'choiceresponse',
        'word_cloud',
        'combinedopenended',
        'choiceresponse',
        'stringresponse',
        'textannotation',
        'openended',
        'lti'
    ]
    quant_axis = [
        'formularesponse', 'numericalresponse', 'customresponse',
        'symbolicresponse', 'coderesponse', 'imageresponse'
    ]

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count

    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (
        nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k

    ccd[course_id] = {
        'course_id': course_id,
        'nweeks': nweeks,
        'nqual_per_week': nqual_per_week,
        'nquant_per_week': nquant_per_week,
        'total_assessments_per_week': total_per_week,
    }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = [
        'course_id', 'nquant_per_week', 'total_assessments_per_week',
        'nqual_per_week', 'nweeks'
    ]
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update(
        {make_key('listings_%s' % key): value
         for key, value in data.items()})  # from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    cmfields['filename_xbundle'] = xbfn
    cmfields['filename_listings'] = lfn

    # store counts in sorted key order, so that the CSV file generated later has a predictable column structure
    for key in sorted(counts):
        value = counts[key]
        cmfields['count_%s' % key] = str(value)  # from content counts

    for key in sorted(problem_stats):  # store problem stats
        value = problem_stats[key]
        cmfields['problem_stat_%s' % key] = str(value)

    cmfields.update({('nexcluded_sub_20_%s' % key): str(value)
                     for key, value in nexcluded.items()
                     })  # from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id'] == course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'],
                                                           ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()

    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})

    fp.close()

    # build and output course_listings_and_metainfo

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    mypath = os.path.dirname(os.path.realpath(__file__))
    clm_table = "course_listing_and_metainfo"
    clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table)
    clm_schema = json.loads(open(clm_schema_file).read())

    clm = {}
    for finfo in clm_schema:
        field = finfo['name']
        clm[field] = cmfields.get(field)
    clm_fnb = clm_table + ".json"
    clm_fn = course_dir / clm_fnb
    open(clm_fn, 'w').write(json.dumps(clm))

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / clm_fnb
    print "--> Course listing + metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, clm_table)
    sys.stdout.flush()
    gsutil.upload_file_to_gs(clm_fn, gsfnp)
    bqutil.load_data_to_table(dataset,
                              clm_table,
                              gsfnp,
                              clm_schema,
                              wait=True,
                              verbose=False)

    # output course_metainfo

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, table)
    sys.stdout.flush()

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfnp,
                              the_schema,
                              wait=True,
                              verbose=False,
                              format='csv',
                              skiprows=1)

    def make_table(self, pcds_table=None, org=None, nskip=0):
        '''
        Get the specified person_course table.  Find IP addresses for which the country code is missing.
        Get country codes for each of those IP addresses using a local copy of the maxmind geoip database.
        Store result in bigquery geoip table.

        Does not overwrite existing bigquery geoip table - adds new entries.
        '''

        if pcds_table is None:
            if org is None:
                print "Error!  Must specify either --table or --org"
                return
            dataset = 'course_report_' + org

            pctables = []
            for table in bqutil.get_list_of_table_ids(dataset):
                m = re.match('person_course_%s_([\d_]+)$' % org, table)
                if m:
                    pctables.append(table)
            pctables.sort()
            if not pctables:
                print "Error!  No person_course_%s_* tables found in dataset %s!" % (org, dataset)
                return
            pctable = pctables[-1]
        else:
            (dataset, pctable) = pcds_table.split('.', 1)

        print "[make_geoip_table] Using person course from %s.%s" % (dataset, pctable)

        if nskip <= 0:
            pimc_table = "%s_ip_no_cc" % pctable
            sql = """SELECT ip, count(*) as n
                     FROM [{dataset}.{table}]
                     where cc_by_ip is Null
                     group by ip
                  """.format(dataset=dataset, table=pctable)

            noips = bqutil.get_bq_table(dataset, pimc_table, sql)

            print "%d IP addresses missing geoip information in %s" % (len(noips['data']), pimc_table)
            # print noips['data'][:10]
            sys.stdout.flush()

            self.load_geoip()

            for entry in noips['data']:
                ip = entry['ip']
                if ip is None:
                    continue
                if ip in self.geoipdat:
                    # print "--> Already have geoip for %s, skipping" % ip
                    continue

                self.lookup_ip(ip)

                if (self.nchanged % 100 == 0):
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    break  # note: the loop stops once self.nchanged is a multiple of 100; write_geoip_table() below saves the new entries

            print "Added %d new geoip entries" % self.nchanged
            sys.stdout.flush()
        else:
            nskip -= 1
        self.write_geoip_table()

        print "--> Done"
        sys.stdout.flush()

def already_exists(course_id, use_dataset_latest):
    table = "grading_policy"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tables = bqutil.get_list_of_table_ids(dataset)
    return table in tables
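
# Usage sketch (not from the original source; the course_id below is a placeholder):
# already_exists() just reports whether the course dataset already has a
# "grading_policy" table, so a caller can use it to skip a redundant load step.
def _example_skip_if_grading_policy_loaded(course_id="MITx/6.002x/2013_Spring",
                                           use_dataset_latest=True):
    if already_exists(course_id, use_dataset_latest):
        print "grading_policy already loaded for %s, skipping" % course_id
        return True
    return False
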
def load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", verbose=True, wait=False,
                                   check_dates=True):
    '''
    Load daily tracking logs for course from google storage into BigQuery.
    
    If wait=True, waits for the loading jobs to complete.  Waiting is desirable when
    subsequent jobs that need these tables (like person_day) are to be run
    immediately afterwards.
    '''

    print "Loading daily tracking logs for course %s into BigQuery (start: %s)" % (course_id, datetime.datetime.now())
    sys.stdout.flush()
    gsroot = gsutil.path_from_course_id(course_id)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA = json.loads(open('%s/schemas/schema_tracking_log.json' % mypath).read())['tracking_log']

    gsdir = '%s/%s/DAILY/' % (gsbucket, gsroot)

    fnset = gsutil.get_gs_file_list(gsdir)
  
    dataset = bqutil.course_id2dataset(gsroot, dtype="logs")
  
    # create this dataset if necessary
    bqutil.create_dataset_if_nonexistent(dataset)

    tables = bqutil.get_list_of_table_ids(dataset)
    tables = [x for x in tables if x.startswith('track')]
  
    if verbose:
        print "-"*77
        print "current tables loaded:", json.dumps(tables, indent=4)
        print "files to load: ", json.dumps(fnset.keys(), indent=4)
        print "-"*77
        sys.stdout.flush()
  
    for fn, fninfo in fnset.iteritems():

        if int(fninfo['size'])<=45:
            print "Zero size file %s, skipping" % fn
            continue

        m = re.search('(\d\d\d\d-\d\d-\d\d)', fn)
        if not m:
            continue
        date = m.group(1)
        tablename = "tracklog_%s" % date.replace('-','')	# YYYYMMDD for compatibility with table wildcards

        # file_date = gsutil.get_local_file_mtime_in_utc(fn, make_tz_unaware=True)
        file_date = fninfo['date'].replace(tzinfo=None)
  
        if tablename in tables:
            skip = True
            if check_dates:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, tablename)
                if not (table_date > file_date):
                    print "Already have table %s, but %s file_date=%s, table_date=%s; re-loading from gs" % (tablename, fn, file_date, table_date)
                    skip = False
                    
            if skip:
                if verbose:
                    print "Already have table %s, skipping file %s" % (tablename, fn)
                    sys.stdout.flush()
                continue

        #if date < '2014-07-27':
        #  continue
  
        print "Loading %s into table %s " % (fn, tablename)
        if verbose:
            print "start [%s]" % datetime.datetime.now()
        sys.stdout.flush()
        gsfn = fninfo['name']
        ret = bqutil.load_data_to_table(dataset, tablename, gsfn, SCHEMA, wait=wait, maxbad=1000)
  
    if verbose:
        print "-" * 77
        print "done with %s [%s]" % (course_id, datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
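
# Invocation sketch (not from the original source; the course_id is a placeholder and
# "gs://x-data" is just the function's default bucket).  wait=True blocks until the
# BigQuery load jobs finish, which matters when person-course-day jobs run right after.
def _example_load_daily_logs(course_id="MITx/6.002x/2013_Spring"):
    load_all_daily_logs_for_course(course_id, gsbucket="gs://x-data", wait=True)
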
Example #22
0
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn):

        tables = bqutil.get_list_of_table_ids(dataset)
        if 'forum' not in tables:
            print "Already done?  But no 'forum' table loaded into dataset %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            sys.stdout.flush()
            return

    print "Processing %s -> %s (%s)" % (fn, ofn, datetime.datetime.now())
    sys.stdout.flush()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    for line in fp:
        cnt += 1
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)
    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
Example #23
0
def rephrase_forum_json_for_course(
    course_id,
    gsbucket="gs://x-data",
    basedir="X-Year-2-data-sql",
    datedir=None,
    do_gs_copy=False,
    use_dataset_latest=False,
):

    print "Loading SQL for course %s into BigQuery (start: %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    lfp = find_course_sql_dir(course_id,
                              basedir,
                              datedir,
                              use_dataset_latest=use_dataset_latest)

    print "Using this directory for local files: ", lfp
    sys.stdout.flush()

    fn = 'forum.mongo'
    gsdir = gsutil.gs_path_from_course_id(course_id, gsbucket,
                                          use_dataset_latest)

    def openfile(fn, mode='r'):
        if (not os.path.exists(lfp / fn)) and (not fn.endswith('.gz')):
            fn += ".gz"
        if fn.endswith('.gz'):
            return gzip.GzipFile(lfp / fn, mode)
        return open(lfp / fn, mode)

    fp = openfile(fn)

    ofn = lfp / "forum-rephrased.json.gz"
    ofncsv = "forum.csv.gz"  # To match table name in BQ
    ofncsv_lfp = lfp / ofncsv

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    bqutil.create_dataset_if_nonexistent(dataset)

    if os.path.exists(ofn) and os.path.exists(ofncsv_lfp):

        tables = bqutil.get_list_of_table_ids(dataset)
        if 'forum' not in tables:
            print "Already done?  But no 'forum' table loaded into dataset %s.  Redoing." % dataset
        else:
            print "Already done %s -> %s (skipping)" % (fn, ofn)
            print "Already done %s -> %s (skipping)" % (fn, ofncsv_lfp)
            sys.stdout.flush()
            return

    print "Processing %s -> writing to %s and %s (%s)" % (
        fn, ofn, ofncsv, datetime.datetime.now())
    sys.stdout.flush()

    # Setup CSV header
    ocsv = csv.DictWriter(openfile(ofncsv, 'w'),
                          fieldnames=SCHEMA_DICT.keys(),
                          quoting=csv.QUOTE_NONNUMERIC)
    ocsv.writeheader()

    cnt = 0
    ofp = gzip.GzipFile('tmp.json.gz', 'w')
    data = OrderedDict()
    for line in fp:
        cnt += 1
        # Write JSON row
        newline = do_rephrase_line(line, linecnt=cnt)
        ofp.write(newline)

        try:
            #Write CSV row
            data = json.loads(newline)
            ocsv.writerow(data)
        except Exception as err:
            print "Error writing CSV output row %s=%s" % (cnt, data)
            raise

    ofp.close()

    print "...done (%s)" % datetime.datetime.now()

    if cnt == 0:
        print "...but cnt=0 entries found, skipping forum loading"
        sys.stdout.flush()
        return

    print "...copying to gsc"
    sys.stdout.flush()

    # do upload twice, because GSE file metadata doesn't always make it to BigQuery right away?
    gsfn = gsdir + '/' + "forum-rephrased.json.gz"
    cmd = 'gsutil cp tmp.json.gz %s' % (gsfn)
    os.system(cmd)
    os.system(cmd)

    table = 'forum'
    bqutil.load_data_to_table(dataset, table, gsfn, SCHEMA, wait=True)
    msg = "Original data from %s" % (lfp / fn)
    bqutil.add_description_to_table(dataset, table, msg, append=True)

    os.system('mv tmp.json.gz "%s"' % (ofn))

    print "...done (%s)" % datetime.datetime.now()
    sys.stdout.flush()
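
# Invocation sketch (not from the original source; the course_id is a placeholder and
# the other arguments shown are the function's defaults).  This rephrases forum.mongo
# into gzipped JSON and CSV, uploads the JSON to Google Storage, and loads it into the
# course dataset's "forum" table.
def _example_rephrase_forum(course_id="MITx/6.002x/2013_Spring"):
    rephrase_forum_json_for_course(course_id,
                                   gsbucket="gs://x-data",
                                   basedir="X-Year-2-data-sql",
                                   use_dataset_latest=False)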