def createVideoAxis(course_id,
                    force_recompute=False,
                    use_dataset_latest=False):
    '''
    The video axis depends on the current course axis, and looks for rows whose category field is defined as video.
    In addition, the edx video id is extracted with the full path stripped, in order to generalize tracking log
    searches for video ids (some courses were found to contain the full path beginning with i4x, while other courses
    only had the edx video id), along with the youtube id and the chapter name / index for each video.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    table = TABLE_VIDEO_AXIS

    # Get Video results
    the_sql = """
                SELECT chapters.index as index_chapter,
                       videos.index as index_video,
                       videos.category as category,
                       videos.course_id as course_id,
                       videos.name as name,
                       videos.vid_id as video_id,
                       videos.yt_id as youtube_id,
                       chapters.name as chapter_name
                      FROM ( SELECT index, category, course_id, name, chapter_mid, 
                             #REGEXP_REPLACE(module_id, '[.]', '_') as vid_id, # vid id containing full path
                             REGEXP_EXTRACT(REGEXP_REPLACE(module_id, '[.]', '_'), r'(?:.*\/)(.*)') as vid_id, # Only containing video id
                             REGEXP_EXTRACT(data.ytid, r'\:(.*)') as yt_id,
                      FROM [{dataset}.course_axis]
                      WHERE category = "video") as videos
                      LEFT JOIN 
                      ( SELECT name, module_id, index
                        FROM [{dataset}.course_axis]
                      ) as chapters
                      ON videos.chapter_mid = chapters.module_id
                      ORDER BY videos.index asc
              """.format(dataset=dataset)

    print "[analyze_videos] Creating %s.%s table for %s" % (
        dataset, TABLE_VIDEO_AXIS, course_id)
    sys.stdout.flush()

    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        assert tinfo is not None, "[analyze videos] %s table depends on %s, which does not exist" % (
            TABLE_VIDEO_AXIS, TABLE_COURSE_AXIS)

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s?  Skipping creation of %s" % (
            dataset, TABLE_COURSE_AXIS, TABLE_VIDEO_AXIS)
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.course_axis" % (dataset)],
    )
    return bqdat
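# --- Illustrative sketch, not from the original source: the vid_id expression in the SQL
# above strips the leading i4x path from module_id so that tracking-log searches can match
# on the bare edx video id.  The module_id value below is a made-up example.
def _demo_vid_id_extraction():
    import re
    module_id = "i4x://MITx/6.002x/video/Welcome_Video"        # hypothetical module_id
    flat = re.sub(r"[.]", "_", module_id)                       # REGEXP_REPLACE(module_id, '[.]', '_')
    vid_id = re.search(r"(?:.*/)(.*)", flat).group(1)           # keep only the segment after the last '/'
    return vid_id                                               # -> "Welcome_Video"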
def make_enrollment_verified_events_per_user(course_id,
                                             force_recompute=False,
                                             use_dataset_latest=False,
                                             end_date=None):
    '''
    Create the TABLE_PERSON_ENROLLMENT_VERIFIED table, with one row per (course_id, user_id), giving the
    verified-mode enrollment time and, when a later distinct event exists, the verified un-enrollment time,
    based on the enrollment_events table.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    table = TABLE_PERSON_ENROLLMENT_VERIFIED

    SQL = """

		SELECT "{course_id}" as course_id, 
                       user_id,
		       min(TIMESTAMP(time)) as verified_enroll_time,
		       case when max(TIMESTAMP_TO_SEC(time)) == min(TIMESTAMP_TO_SEC(time)) then null else max(TIMESTAMP(time)) end as verified_unenroll_time,
		FROM [{dataset}.{enrollment_events}]
		WHERE ((mode = 'verified' and deactivated) or # Unenrolled
		       (mode='verified' and not activated and mode_changed) # Enrolled
		      )
		GROUP BY course_id, user_id
		ORDER BY verified_enroll_time asc

         """.format(dataset=dataset,
                    course_id=course_id,
                    enrollment_events=TABLE_ENROLLMENT_EVENTS)

    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        SQL,
        force_query=force_recompute,
        depends_on=["%s.%s" % (dataset, TABLE_ENROLLMENT_EVENTS)],
    )
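# --- Illustrative sketch, not from the original source: a plain-Python analogue of the CASE
# expression above.  The earliest verified-mode event time becomes verified_enroll_time;
# verified_unenroll_time is reported only when a later, distinct event exists (otherwise null).
# The timestamps below are made up.
def _demo_verified_enroll_unenroll(times):
    verified_enroll_time = min(times)
    verified_unenroll_time = max(times) if max(times) != min(times) else None
    return verified_enroll_time, verified_unenroll_time
# _demo_verified_enroll_unenroll(["2014-09-01 12:00:00", "2014-10-15 08:30:00"])
#   -> ("2014-09-01 12:00:00", "2014-10-15 08:30:00")
# _demo_verified_enroll_unenroll(["2014-09-01 12:00:00"]) -> ("2014-09-01 12:00:00", None)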
def createPersonCourseVideo( course_id, force_recompute=False, use_dataset_latest=False ):
    '''
    Create the person_course_video_watched table, based on video_stats.
    Each row gives the number of unique videos watched by a given user, for the given course.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_PERSON_COURSE_VIDEO_WATCHED

    the_sql = """
                  SELECT user_id, 
                      "{course_id}" as course_id,
                      count(*) n_unique_videos_watched,
                      count(*) / n_total_videos as fract_total_videos_watched,
                      viewed, certified, verified
                  FROM
                  (
                      SELECT PC.user_id as user_id, UV.username as username,
                          video_id, 
                          n_views,
                          NV.n_total_videos as n_total_videos,
                          certified,
                          viewed,
                          (mode=="verified") as verified,
                      FROM
                      (
                          SELECT username, video_id, count(*) as n_views
                          FROM [{dataset}.video_stats_day] 
                          GROUP BY username, video_id
                      ) UV
                      JOIN [{dataset}.person_course] PC
                      on UV.username = PC.username
                      CROSS JOIN 
                      (
                          SELECT count(*) as n_total_videos
                          FROM [{dataset}.video_axis]
                      ) NV
                      WHERE ((PC.roles = 'Student') OR (PC.roles is NULL))	# accommodate case when roles.csv is missing
                      # WHERE PC.roles = 'Student'
                  )
                  GROUP BY user_id, certified, viewed, verified, n_total_videos
                  order by user_id
              """

    the_sql = the_sql.format(course_id=course_id, dataset=dataset)
    bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute,
                                depends_on=["%s.%s" % (dataset, TABLE_VIDEO_STATS)],
                                newer_than=datetime.datetime( 2017, 2, 6, 18, 30 ),
                                startIndex=-2)
    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, table)
    print "--> Done with %s for %s, %d entries found" % (table, course_id, nfound)
    sys.stdout.flush()

    return bqdat
def create_course_problem_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Generate course_problem table, with one row per (problem_id), giving average points, standard deviation on points,
    number of unique users attempted, max points possible.

    Uses person_item and course_item.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tablename = "course_problem"

    the_sql = """
# compute course_problem table for {course_id}
SELECT problem_nid, problem_id, problem_short_id, 
  avg(problem_grade) as avg_problem_raw_score,
  stddev(problem_grade) as sdv_problem_raw_score,
  # max(problem_grade) as max_problem_raw_score,
  max(possible_raw_score) as max_possible_raw_score,
  avg(problem_grade / possible_raw_score * 100) as avg_problem_pct_score,
  count(unique(user_id)) as n_unique_users_attempted,
  problem_name,
  is_split,
  split_name,
FROM
(
    SELECT problem_nid, problem_id, problem_short_id, sum(item_grade) as problem_grade, user_id,
        sum(CI.item_points_possible) as possible_raw_score, problem_name, is_split, split_name,
    FROM [{dataset}.person_item] PI
    JOIN [{dataset}.course_item] CI
    on PI.item_nid = CI.item_nid
    group by problem_nid, problem_short_id, problem_id, user_id, problem_name, is_split, split_name
)
group by problem_nid, problem_id, problem_short_id, problem_name, is_split, split_name
# order by problem_short_id
order by avg_problem_pct_score desc
    """.format(dataset=dataset, course_id=course_id)

    depends_on = [ "%s.course_item" % dataset,
                   "%s.person_item" % dataset
               ]

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, 
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    startIndex=-2)
    except Exception as err:
        print "[make_course_problem_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound)
    sys.stdout.flush()
def create_person_problem_table(course_id,
                                force_recompute=False,
                                use_dataset_latest=False):
    '''
    Generate person_problem table, with one row per (user_id, problem_id), giving problem raw_score earned, attempts,
    and datestamp.

    Computed by aggregating over person_item, and joining with course_item
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    tablename = "person_problem"

    the_sql = """
# compute person-problem table for {course_id}

SELECT user_id,
       course_id,
    CI.problem_nid as problem_nid,
    sum(item_grade) as problem_raw_score,
    sum(item_grade) / sum(CI.item_points_possible) * 100 as problem_pct_score,
    max(PI.grade) as grade,
    max(n_attempts) as n_attempts,
    max(date) as date,
    
FROM [{dataset}.person_item] PI
JOIN [{dataset}.course_item] CI
    
on PI.item_nid = CI.item_nid
group by user_id, course_id, problem_nid
order by user_id, course_id, problem_nid
    """.format(dataset=dataset, course_id=course_id)

    depends_on = ["%s.course_item" % dataset, "%s.person_item" % dataset]

    try:
        bqdat = bqutil.get_bq_table(dataset,
                                    tablename,
                                    the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    startIndex=-2)
    except Exception as err:
        print "[make_person_problem_table] ERR! failed in creating %s.%s using this sql:" % (
            dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id,
                                                         nfound)
    sys.stdout.flush()
def get_stats_module_usage(
    course_id,
    basedir="X-Year-2-data-sql",
    datedir="2013-09-21",
    use_dataset_latest=False,
):
    '''
    Get data from the stats_module_usage table, if it doesn't already exist as a local file.
    Compute it if necessary.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    sql = """   SELECT 
                    module_type, module_id, count(*) as ncount 
                FROM [{dataset}.studentmodule] 
                group by module_id, module_type
                order by module_id
          """.format(dataset=dataset)

    table = 'stats_module_usage'
    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest)
    csvfn = course_dir / (table + ".csv")

    data = {}
    if csvfn.exists():
        # read file into data structure
        for k in list(csv.DictReader(open(csvfn))):
            midfrag = tuple(k['module_id'].split('/')[-2:])
            data[midfrag] = k
    else:
        # download if it is already computed, or recompute if needed
        bqdat = bqutil.get_bq_table(dataset, table, sql=sql)
        if bqdat is None:
            bqdat = {'data': []}

        fields = ["module_type", "module_id", "ncount"]
        fp = open(csvfn, 'w')
        cdw = csv.DictWriter(fp, fieldnames=fields)
        cdw.writeheader()
        for k in bqdat['data']:
            midfrag = tuple(k['module_id'].split('/')[-2:])
            data[midfrag] = k
            try:
                k['module_id'] = k['module_id'].encode('utf8')
                cdw.writerow(k)
            except Exception as err:
                print "Error writing row %s, err=%s" % (k, str(err))
        fp.close()

    print "[analyze_content] got %d lines of studentmodule usage data" % len(
        data)
    return data
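# --- Illustrative usage sketch, not from the original source: the dict returned by
# get_stats_module_usage is keyed by the last two components of each module_id, which is
# how analyze_course_content later looks up per-module usage counts.  The course_id and
# module fragment below are hypothetical.
def _demo_module_usage_lookup():
    mudata = get_stats_module_usage("MITx/6.002x/2013_Spring")
    midfrag = ("video", "Welcome_Video")                 # tuple(module_id.split('/')[-2:])
    ncount = int(mudata.get(midfrag, {}).get('ncount', 0))
    return ncount                                        # studentmodule row count for that module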
def create_person_item_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Generate person_item table, with one row per (user_id, item_id), giving grade points earned, attempts,
    and datestamp.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tablename = "person_item"

    the_sql = """
# compute person-item table

SELECT user_id, 
    # PA.item_id as item_id,
    CI.item_short_id as item_short_id,
    CI.item_nid as item_nid,
    item_grade,
    n_attempts,
    date
FROM
(
    SELECT user_id,
        item.answer_id as item_id,
        if(item.correct_bool, 1, 0) as item_grade,
        attempts as n_attempts,
        max(created) as date,
    FROM [{dataset}.problem_analysis]
    group by user_id, item_id, item_grade, n_attempts  # force (user_id, item_id) to be unique (it should always be, even w/o this)
) PA
JOIN [{dataset}.course_item] CI
on PA.item_id = CI.item_id
order by user_id, CI.content_index, CI.item_number
    """.format(dataset=dataset, course_id=course_id)

    depends_on = [ "%s.course_item" % dataset,
                   "%s.problem_analysis" % dataset
               ]

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, 
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    startIndex=-2)
    except Exception as err:
        print "[make_person_item_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound)
    sys.stdout.flush()
def process_time_on_task_totals(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Compute the time_on_task_totals table, with one row per (course_id, username), summing each
    of the time-on-task columns (overall, video, problem, forum, and text; both the 5- and
    30-minute variants) from the time_on_task table.
    '''
    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
                    username, 

                    sum(total_time_5) as total_time_5,
                    sum(total_time_30) as total_time_30,

                    sum(total_video_time_5) as total_video_time_5,
                    sum(total_video_time_30) as total_video_time_30,
                    sum(serial_video_time_30) as serial_video_time_30,

                    sum(total_problem_time_5) as total_problem_time_5,
                    sum(total_problem_time_30) as total_problem_time_30,
                    sum(serial_problem_time_30) as serial_problem_time_30,

                    sum(total_forum_time_5) as total_forum_time_5,
                    sum(total_forum_time_30) as total_forum_time_30,
                    sum(serial_forum_time_30) as serial_forum_time_30,

                    sum(total_text_time_5) as total_text_time_5,
                    sum(total_text_time_30) as total_text_time_30,
                    sum(serial_text_time_30) as serial_text_time_30,

            FROM [{dataset}.time_on_task]
            GROUP BY course_id, username
            order by username
         """

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    the_sql = SQL.format(dataset=dataset, course_id=course_id)

    tablename = 'time_on_task_totals'

    print "Computing %s for %s" % (tablename, dataset)
    sys.stdout.flush()

    bqdat = bqutil.get_bq_table(dataset, tablename, the_sql,
                                force_query=force_recompute,
                                depends_on=[ '%s.time_on_task' % dataset ],
                                )

    return bqdat
def analyze_course_content(course_id, 
                           listings_file=None,
                           basedir="X-Year-2-data-sql", 
                           datedir="2013-09-21", 
                           use_dataset_latest=False,
                           do_upload=False,
                           courses=None,
                           verbose=True,
                           ):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out staff-only and un-shown content.
    The exclusion is based on the count of each module appearing in the studentmodule table,
    taken from stats_module_usage for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split('/',1)[0]	# extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset, tableid, gsfnp, the_schema, wait=True, verbose=False,
                                      format='csv', skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([('[%s.course_metainfo]' % bqutil.course_id2dataset(x)) for x in courses])
        sql = "select * from {course_tables}".format(course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(dataset, table, sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)


        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max([ x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset, latest_person_course)
        
        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset, 'nregistered_by_wrap', sql=sql, key={'name': 'course_id'})

        # rates for registrants before and during course
        
        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset, 'stats_cert_rates_by_registration', sql=sql, 
                                          newer_than=datetime.datetime(2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci['certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {'msbc_viewed': "viewed_median_stats_by_course",
                       'msbc_explored': 'explored_median_stats_by_course',
                       'msbc_certified': 'certified_median_stats_by_course',
                       'msbc_verified': 'verified_median_stats_by_course',
                       }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field=='course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        def make_key(key):
            key = key.strip()
            key = key.replace(' ', '_').replace("'", "_").replace('/', '_').replace('(','').replace(')','').replace('-', '_').replace(',', '')
            return key

        listings_keys = map(make_key, ["Institution", "Semester", "New or Rerun", "Andrew Recodes New/Rerun", 
                                       "Course Number", "Short Title", "Andrew's Short Titles", "Title", 
                                       "Instructors", "Registration Open", "Course Launch", "Course Wrap", "course_id",
                                       "Empirical Course Wrap", "Andrew's Order", "certifies", "MinPassGrade",
                                       '4-way Category by name', "4-way (CS, STEM, HSocSciGov, HumHistRel)"
                                       ])
        listings_keys.reverse()
        
        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = ['discussion', 'problem', 'optionresponse', 'checkboxgroup', 'optioninput', 
                          'choiceresponse', 'video', 'choicegroup', 'vertical', 'choice', 'sequential', 
                          'multiplechoiceresponse', 'numericalresponse', 'chapter', 'solution', 'img', 
                          'formulaequationinput', 'responseparam', 'selfassessment', 'track', 'task', 'rubric', 
                          'stringresponse', 'combinedopenended', 'description', 'textline', 'prompt', 'category', 
                          'option', 'lti', 'annotationresponse', 
                          'annotatable', 'colgroup', 'tag_prompt', 'comment', 'annotationinput', 'image', 
                          'options', 'comment_prompt', 'conditional', 
                          'answer', 'poll_question', 'section', 'wrapper', 'map', 'area', 
                          'customtag', 'transcript', 
                          'split_test', 'word_cloud', 
                          'openended', 'openendedparam', 'answer_display', 'code', 
                          'drag_and_drop_input', 'customresponse', 'draggable', 'mentoring', 
                          'textannotation', 'imageannotation', 'videosequence', 
                          'feedbackprompt', 'assessments', 'openassessment', 'assessment', 'explanation', 'criterion']

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith('listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [ json.loads(x) for x in open(listings_file) ]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(open('%s/schemas/schema_combined_course_summary_stats.json' % mypath).read())
        schema_dict = { x['name'] : x for x in the_schema }

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in list(entry):  # iterate over a copy of the keys, since keys may be popped below
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False)

        return

    
    print "-"*60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    for k in csv.DictReader(open(lfn)):
        if k['course_id']==course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)
    
    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (xbfn, course_id)
        return

    print "[analyze_content] For %s using %s" % (course_id, xbfn)
    
    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir, use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()
    
    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = ['html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1', 'em', 'b', 'h2', 'h3', 'body', 'span', 'strong',
              'a', 'sub', 'strike', 'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's', 'pre', 'policy', 'metadata',
              'grading_policy', 'br', 'center',  'wiki', 'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4', 
              'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p', 'P', 'TABLE', 'TD', 'small', 'text', 'title']

    def walk_tree(elem):
        if  type(elem.tag)==str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    print "    -> excluding %s (%s), ncount=%s" % (k.get('display_name', '<no_display_name>').encode('utf8'), 
                                                                   midfrag, 
                                                                   mudata.get(midfrag, {}).get('ncount'))
                continue
            walk_tree(k)

    walk_tree(xml)
    print counts

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = ['openassessment', 'optionresponse', 'multiplechoiceresponse', 
                 # 'discussion', 
                 'choiceresponse', 'word_cloud', 
                 'combinedopenended', 'choiceresponse', 'stringresponse', 'textannotation', 'openended', 'lti']
    quant_axis = ['formularesponse', 'numericalresponse', 'customresponse', 'symbolicresponse', 'coderesponse',
                  'imageresponse']

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count
    
    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k
    
    ccd[course_id] = {'course_id': course_id,
                      'nweeks': nweeks,
                      'nqual_per_week': nqual_per_week,
                      'nquant_per_week': nquant_per_week,
                      'total_assessments_per_week' : total_per_week,
                      }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = ['course_id', 'nquant_per_week', 'total_assessments_per_week', 'nqual_per_week', 'nweeks']
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update({ ('listings_%s' % key) : value for key, value in data.items() })	# from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    for key in sorted(counts):	# store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] =  str(value) 	# from content counts

    cmfields.update({ ('nexcluded_sub_20_%s' % key) : str(value) for key, value in nexcluded.items() })	# from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id']==course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'], ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()
    
    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})
        
    fp.close()

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (gsfnp, dataset, table)

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset, table, gsfnp, the_schema, wait=True, verbose=False, format='csv', skiprows=1)
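# --- Illustrative usage sketch, not from the original source: analyze_course_content is
# typically run once per course to build each course_metainfo table, then once more with
# do_upload=True to aggregate everything into the course_report dataset.  The course ids
# and listings file name below are hypothetical.
def _demo_analyze_course_content_run(courses):
    for cid in courses:
        analyze_course_content(cid, listings_file="course_listings.csv",
                               basedir="X-Year-2-data-sql", datedir="2013-09-21")
    analyze_course_content(courses[0], listings_file="course_listings.csv",
                           do_upload=True, courses=courses)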
    def cached_get_bq_table(
        self,
        dataset,
        table,
        sql=None,
        key=None,
        drop=None,
        logger=None,
        ignore_cache=False,
        depends_on=None,
        force_query=False,
        force_newer_than=None,
        startIndex=0,
        maxResults=1000000,
        allowLargeResults=False,
        raise_exception=False,
        project_id=None,
    ):
        '''
        Get a dataset from BigQuery; use memcache.

        If "depends_on" is provided (as a list of strings), and if the desired table
        already exists, then check to make sure it is newer than any of the tables
        listed in "depends_on".

        If force_newer_than is set (it should be a datetime), then in the depends_on
        testing, use that date as an override, such that the SQL is re-run if
        the existing table is older than this date.

        project_id: if specified, overrides the default BigQuery project ID (for the actual query)
        '''
        if logger is None:
            logger = logging.info
        memset = '%s.%s' % (dataset, table)
        if startIndex:
            memset += '-%d-%d' % (startIndex, maxResults)
        data = mem.get(memset)

        optargs = {}
        if project_id:
            optargs['project_id'] = project_id

        table_date = None
        if depends_on is not None:
            # get the latest mod time of tables in depends_on:
            modtimes = [
                bqutil.get_bq_table_last_modified_datetime(
                    *(x.split('.', 1)), **optargs) for x in depends_on
            ]
            latest = max([x for x in modtimes if x is not None] or [None])

            if not latest:
                raise Exception(
                    "[datasource.cached_get_bq_table] Cannot get last mod time for %s (got %s), needed by %s.%s"
                    % (depends_on, modtimes, dataset, table))

            if force_newer_than and force_newer_than > latest:
                latest = force_newer_than

            if data and data.get('lastModifiedTime', None):
                # data has a mod time, let's see if that has expired
                if data.get('lastModifiedTime', None) < latest:
                    ignore_cache = True

            # get the mod time of the computed table, if it exists
            try:
                table_date = bqutil.get_bq_table_last_modified_datetime(
                    dataset, table, **optargs)
            except Exception as err:
                if 'Not Found' in str(err):
                    table_date = None
                    ignore_cache = True
                    logging.info(
                        "[datasource.cached_get_bq_table] Table %s.%s doesn't exist, forcing recomputation"
                        % (dataset, table))
                else:
                    raise

            if table_date and table_date < latest:
                ignore_cache = True
                if sql:
                    force_query = True
                    logging.info(
                        "[datasource.cached_get_bq_table] Forcing query recomputation of %s.%s, table_date=%s, latest=%s"
                        % (dataset, table, table_date, latest))
                else:
                    logging.info(
                        "[datasource.cached_get_bq_table] Forcing cache reload of %s.%s, table_date=%s, latest=%s"
                        % (dataset, table, table_date, latest))

            # logging.info("[datasource.cached_get_bq_table] %s.%s table_date=%s, latest=%s, force_query=%s" % (dataset, table, table_date, latest, force_query))

        if (not data) or ignore_cache or (
                not data['data']
        ):  # data['data']=None if table was empty, and in that case try again
            try:
                data = bqutil.get_bq_table(dataset,
                                           table,
                                           sql,
                                           key=key,
                                           logger=logger,
                                           force_query=force_query,
                                           startIndex=startIndex,
                                           maxResults=maxResults,
                                           allowLargeResults=allowLargeResults,
                                           **optargs)
                if force_query or not table_date:
                    table_date = bqutil.get_bq_table_last_modified_datetime(
                        dataset, table, **optargs)
                data['last_modified_date'] = table_date
            except Exception as err:
                logging.error(err)
                if raise_exception:
                    raise
                data = {
                    'fields': {},
                    'field_names': [],
                    'data': [],
                    'data_by_key': {}
                }
                return data  # don't cache empty result
            data['depends_on'] = depends_on
            if (drop is not None) and drop:
                for key in drop:
                    data.pop(
                        key
                    )  # because data can be too huge for memcache ("Values may not be more than 1000000 bytes in length")
            try:
                mem.set(memset, data, time=3600 * 12)
            except Exception as err:
                logging.error('error doing mem.set for %s.%s from bigquery' %
                              (dataset, table))
        self.bqdata[table] = data
        return data
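# Hedged usage sketch, added for illustration (not part of the original source).
# It assumes a hypothetical "datasource" object exposing the cached_get_bq_table
# method above, and shows how depends_on / force_newer_than drive the cache and
# recomputation behavior described in the docstring.  The table name and SQL here
# are made up.
def example_cached_get_bq_table_usage(datasource, dataset):
    import datetime
    # Re-run the SQL only if person_course or course_axis is newer than the cached
    # table, or if the existing table predates the given cutoff datetime; otherwise
    # serve the memcached copy.
    data = datasource.cached_get_bq_table(
        dataset,
        "stats_overall_example",
        sql="SELECT course_id, count(*) as n FROM [%s.person_course] group by course_id" % dataset,
        key={'name': 'course_id'},
        depends_on=["%s.person_course" % dataset, "%s.course_axis" % dataset],
        force_newer_than=datetime.datetime(2015, 1, 16, 3, 0),
    )
    return data['data_by_key']    # rows indexed by course_id, per key={'name': ...}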
Example #16
def analyze_course_content(
    course_id,
    listings_file=None,
    basedir="X-Year-2-data-sql",
    datedir="2013-09-21",
    use_dataset_latest=False,
    do_upload=False,
    courses=None,
    verbose=True,
    pin_date=None,
):
    '''
    Compute course_content table, which quantifies:

    - number of chapter, sequential, vertical modules
    - number of video modules
    - number of problem, *openended, mentoring modules
    - number of discussion, annotatable, word_cloud modules

    Do this using the course "xbundle" file, produced when the course axis is computed.

    Include only modules which had nontrivial use, to rule out staff-only and un-shown content.
    The exclusion is based on the count of each module appearing in the studentmodule table,
    via the stats_module_usage data for each course.

    Also, from the course listings file, compute the number of weeks the course was open.

    If do_upload (triggered by --force-recompute) then upload all accumulated data to the course report dataset 
    as the "stats_course_content" table.  Also generate a "course_summary_stats" table, stored in the
    course_report_ORG or course_report_latest dataset.  The course_summary_stats table combines
    data from many reports, including stats_course_content, the medians report, the listings file,
    broad_stats_by_course, and time_on_task_stats_by_course.
    
    '''

    if do_upload:
        if use_dataset_latest:
            org = "latest"
        else:
            org = courses[0].split(
                '/', 1)[0]  # extract org from first course_id in courses

        crname = 'course_report_%s' % org

        gspath = gsutil.gs_path_from_course_id(crname)
        gsfnp = gspath / CCDATA
        gsutil.upload_file_to_gs(CCDATA, gsfnp)
        tableid = "stats_course_content"
        dataset = crname

        mypath = os.path.dirname(os.path.realpath(__file__))
        SCHEMA_FILE = '%s/schemas/schema_content_stats.json' % mypath

        try:
            the_schema = json.loads(open(SCHEMA_FILE).read())[tableid]
        except Exception as err:
            print "Oops!  Failed to load schema file for %s.  Error: %s" % (
                tableid, str(err))
            raise

        if 0:
            bqutil.load_data_to_table(dataset,
                                      tableid,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False,
                                      format='csv',
                                      skiprows=1)

        table = 'course_metainfo'
        course_tables = ',\n'.join([
            ('[%s.course_metainfo]' % bqutil.course_id2dataset(x))
            for x in courses
        ])
        sql = "select * from {course_tables}".format(
            course_tables=course_tables)
        print "--> Creating %s.%s using %s" % (dataset, table, sql)

        if 1:
            metainfo_dataset = bqutil.get_bq_table(
                dataset,
                table,
                sql=sql,
                newer_than=datetime.datetime(2015, 1, 16, 3, 0),
            )
            # bqutil.create_bq_table(dataset, table, sql, overwrite=True)

        #-----------------------------------------------------------------------------
        # make course_summary_stats table
        #
        # This is a combination of the broad_stats_by_course table (if that exists), and course_metainfo.
        # Also use (and create if necessary) the nregistered_by_wrap table.

        # get the broad_stats_by_course data
        bsbc = bqutil.get_table_data(dataset, 'broad_stats_by_course')

        table_list = bqutil.get_list_of_table_ids(dataset)

        latest_person_course = max(
            [x for x in table_list if x.startswith('person_course_')])
        print "Latest person_course table in %s is %s" % (dataset,
                                                          latest_person_course)

        sql = """
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        nr_by_wrap = bqutil.get_bq_table(dataset,
                                         'nregistered_by_wrap',
                                         sql=sql,
                                         key={'name': 'course_id'})

        # rates for registrants before and during course

        sql = """
                SELECT 
                    *,
                    ncertified / nregistered * 100 as pct_certified_of_reg,
                    ncertified_and_registered_before_launch / nregistered_before_launch * 100 as pct_certified_reg_before_launch,
                    ncertified_and_registered_during_course / nregistered_during_course * 100 as pct_certified_reg_during_course,
                    ncertified / nregistered_by_wrap * 100 as pct_certified_of_reg_by_wrap,
                    ncertified / nviewed * 100 as pct_certified_of_viewed,
                    ncertified / nviewed_by_wrap * 100 as pct_certified_of_viewed_by_wrap,
                    ncertified_by_ewrap / nviewed_by_ewrap * 100 as pct_certified_of_viewed_by_ewrap,
                FROM
                (
                # ------------------------
                # get aggregate data
                SELECT pc.course_id as course_id, 
                    cminfo.wrap_date as wrap_date,
                    count(*) as nregistered,
                    sum(case when pc.certified then 1 else 0 end) ncertified,
                    sum(case when (TIMESTAMP(pc.cert_created_date) < cminfo.ewrap_date) and (pc.certified and pc.viewed) then 1 else 0 end) ncertified_by_ewrap,
                    sum(case when pc.viewed then 1 else 0 end) nviewed,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) nregistered_by_wrap,
                    sum(case when pc.start_time < cminfo.wrap_date then 1 else 0 end) / nregistered * 100 nregistered_by_wrap_pct,
                    sum(case when (pc.start_time < cminfo.wrap_date) and pc.viewed then 1 else 0 end) nviewed_by_wrap,
                    sum(case when (pc.start_time < cminfo.ewrap_date) and pc.viewed then 1 else 0 end) nviewed_by_ewrap,
                    sum(case when pc.start_time < cminfo.launch_date then 1 else 0 end) nregistered_before_launch,
                    sum(case when pc.start_time < cminfo.launch_date 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_before_launch,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) then 1 else 0 end) nregistered_during_course,
                    sum(case when (pc.start_time >= cminfo.launch_date) 
                              and (pc.start_time < cminfo.wrap_date) 
                              and pc.certified
                              then 1 else 0 end) ncertified_and_registered_during_course,
                FROM
                    [{dataset}.{person_course}] as pc
                left join (
                
                # --------------------
                #  get course launch and wrap dates from course_metainfo

       SELECT AA.course_id as course_id, 
              AA.wrap_date as wrap_date,
              AA.launch_date as launch_date,
              BB.ewrap_date as ewrap_date,
       FROM (
               #  inner get course launch and wrap dates from course_metainfo
                SELECT A.course_id as course_id,
                  A.wrap_date as wrap_date,
                  B.launch_date as launch_date,
                from
                (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as wrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Wrap'
                 )
                ) as A
                left outer join 
                (
                 SELECT course_id,
                      TIMESTAMP(concat(launch_year, "-", launch_month, '-', launch_day)) as launch_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as launch_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as launch_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as launch_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Course Launch'
                 )
                ) as B
                on A.course_id = B.course_id 
                # end inner course_metainfo subquery
            ) as AA
            left outer join
            (
                 SELECT course_id,
                      TIMESTAMP(concat(wrap_year, "-", wrap_month, '-', wrap_day, ' 23:59:59')) as ewrap_date,
                 FROM (
                  SELECT course_id, 
                    regexp_extract(value, r'(\d+)/\d+/\d+') as wrap_month,
                    regexp_extract(value, r'\d+/(\d+)/\d+') as wrap_day,
                    regexp_extract(value, r'\d+/\d+/(\d+)') as wrap_year,
                  FROM [{dataset}.course_metainfo]
                  where key='listings_Empirical Course Wrap'
                 )
            ) as BB
            on AA.course_id = BB.course_id

                # end course_metainfo subquery
                # --------------------
                
                ) as cminfo
                on pc.course_id = cminfo.course_id
                
                group by course_id, wrap_date
                order by course_id
                # ---- end get aggregate data
                )
                order by course_id
        """.format(dataset=dataset, person_course=latest_person_course)

        print "--> Assembling course_summary_stats from %s" % 'stats_cert_rates_by_registration'
        sys.stdout.flush()
        cert_by_reg = bqutil.get_bq_table(dataset,
                                          'stats_cert_rates_by_registration',
                                          sql=sql,
                                          newer_than=datetime.datetime(
                                              2015, 1, 16, 3, 0),
                                          key={'name': 'course_id'})

        # start assembling course_summary_stats

        c_sum_stats = defaultdict(OrderedDict)
        for entry in bsbc['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            cmci.update(entry)
            cnbw = nr_by_wrap['data_by_key'][course_id]
            nbw = int(cnbw['nregistered_by_wrap'])
            cmci['nbw_wrap_date'] = cnbw['wrap_date']
            cmci['nregistered_by_wrap'] = nbw
            cmci['nregistered_by_wrap_pct'] = cnbw['nregistered_by_wrap_pct']
            cmci['frac_female'] = float(entry['n_female_viewed']) / (float(
                entry['n_male_viewed']) + float(entry['n_female_viewed']))
            ncert = float(cmci['certified_sum'])
            if ncert:
                cmci[
                    'certified_of_nregistered_by_wrap_pct'] = nbw / ncert * 100.0
            else:
                cmci['certified_of_nregistered_by_wrap_pct'] = None
            cbr = cert_by_reg['data_by_key'][course_id]
            for field, value in cbr.items():
                cmci['cbr_%s' % field] = value

        # add medians for viewed, explored, and certified

        msbc_tables = {
            'msbc_viewed': "viewed_median_stats_by_course",
            'msbc_explored': 'explored_median_stats_by_course',
            'msbc_certified': 'certified_median_stats_by_course',
            'msbc_verified': 'verified_median_stats_by_course',
        }
        for prefix, mtab in msbc_tables.items():
            print "--> Merging median stats data from %s" % mtab
            sys.stdout.flush()
            bqdat = bqutil.get_table_data(dataset, mtab)
            for entry in bqdat['data']:
                course_id = entry['course_id']
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    cmci['%s_%s' % (prefix, field)] = value

        # add time on task data

        tot_table = "time_on_task_stats_by_course"
        prefix = "ToT"
        print "--> Merging time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add serial time on task data

        tot_table = "time_on_task_serial_stats_by_course"
        prefix = "SToT"
        print "--> Merging serial time on task data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # add show_answer stats

        tot_table = "show_answer_stats_by_course"
        prefix = "SAS"
        print "--> Merging show_answer stats data from %s" % tot_table
        sys.stdout.flush()
        try:
            bqdat = bqutil.get_table_data(dataset, tot_table)
        except Exception as err:
            bqdat = {'data': {}}
        for entry in bqdat['data']:
            course_id = entry['course_id']
            cmci = c_sum_stats[course_id]
            for field, value in entry.items():
                if field == 'course_id':
                    continue
                cmci['%s_%s' % (prefix, field)] = value

        # setup list of keys, for CSV output

        css_keys = c_sum_stats.values()[0].keys()

        # retrieve course_metainfo table, pivot, add that to summary_stats

        print "--> Merging course_metainfo from %s" % table
        sys.stdout.flush()
        bqdat = bqutil.get_table_data(dataset, table)

        listings_keys = map(make_key, [
            "Institution", "Semester", "New or Rerun",
            "Andrew Recodes New/Rerun", "Course Number", "Short Title",
            "Andrew's Short Titles", "Title", "Instructors",
            "Registration Open", "Course Launch", "Course Wrap", "course_id",
            "Empirical Course Wrap", "Andrew's Order", "certifies",
            "MinPassGrade", '4-way Category by name',
            "4-way (CS, STEM, HSocSciGov, HumHistRel)"
        ])
        listings_keys.reverse()

        for lk in listings_keys:
            css_keys.insert(1, "listings_%s" % lk)

        COUNTS_TO_KEEP = [
            'discussion', 'problem', 'optionresponse', 'checkboxgroup',
            'optioninput', 'choiceresponse', 'video', 'choicegroup',
            'vertical', 'choice', 'sequential', 'multiplechoiceresponse',
            'numericalresponse', 'chapter', 'solution', 'img',
            'formulaequationinput', 'responseparam', 'selfassessment', 'track',
            'task', 'rubric', 'stringresponse', 'combinedopenended',
            'description', 'textline', 'prompt', 'category', 'option', 'lti',
            'annotationresponse', 'annotatable', 'colgroup', 'tag_prompt',
            'comment', 'annotationinput', 'image', 'options', 'comment_prompt',
            'conditional', 'answer', 'poll_question', 'section', 'wrapper',
            'map', 'area', 'customtag', 'transcript', 'split_test',
            'word_cloud', 'openended', 'openendedparam', 'answer_display',
            'code', 'drag_and_drop_input', 'customresponse', 'draggable',
            'mentoring', 'textannotation', 'imageannotation', 'videosequence',
            'feedbackprompt', 'assessments', 'openassessment', 'assessment',
            'explanation', 'criterion'
        ]

        for entry in bqdat['data']:
            thekey = make_key(entry['key'])
            # if thekey.startswith('count_') and thekey[6:] not in COUNTS_TO_KEEP:
            #     continue
            if thekey.startswith(
                    'listings_') and thekey[9:] not in listings_keys:
                # print "dropping key=%s for course_id=%s" % (thekey, entry['course_id'])
                continue
            c_sum_stats[entry['course_id']][thekey] = entry['value']
            #if 'certifies' in thekey:
            #    print "course_id=%s, key=%s, value=%s" % (entry['course_id'], thekey, entry['value'])
            if thekey not in css_keys:
                css_keys.append(thekey)

        # compute forum_posts_per_week
        for course_id, entry in c_sum_stats.items():
            nfps = entry.get('nforum_posts_sum', 0)
            if nfps:
                fppw = int(nfps) / float(entry['nweeks'])
                entry['nforum_posts_per_week'] = fppw
                print "    course: %s, assessments_per_week=%s, forum_posts_per_week=%s" % (
                    course_id, entry['total_assessments_per_week'], fppw)
            else:
                entry['nforum_posts_per_week'] = None
        css_keys.append('nforum_posts_per_week')

        # read in listings file and merge that in also
        if listings_file:
            if listings_file.endswith('.csv'):
                listings = csv.DictReader(open(listings_file))
            else:
                listings = [json.loads(x) for x in open(listings_file)]
            for entry in listings:
                course_id = entry['course_id']
                if course_id not in c_sum_stats:
                    continue
                cmci = c_sum_stats[course_id]
                for field, value in entry.items():
                    lkey = "listings_%s" % make_key(field)
                    if not (lkey in cmci) or (not cmci[lkey]):
                        cmci[lkey] = value

        print "Storing these fields: %s" % css_keys

        # get schema
        mypath = os.path.dirname(os.path.realpath(__file__))
        the_schema = json.loads(
            open('%s/schemas/schema_combined_course_summary_stats.json' %
                 mypath).read())
        schema_dict = {x['name']: x for x in the_schema}

        # write out CSV
        css_table = "course_summary_stats"
        ofn = "%s__%s.csv" % (dataset, css_table)
        ofn2 = "%s__%s.json" % (dataset, css_table)
        print "Writing data to %s and %s" % (ofn, ofn2)

        ofp = open(ofn, 'w')
        ofp2 = open(ofn2, 'w')
        dw = csv.DictWriter(ofp, fieldnames=css_keys)
        dw.writeheader()
        for cid, entry in c_sum_stats.items():
            for ek in entry.keys():  # iterate over a copy of the keys; popping while iterating the dict itself raises RuntimeError
                if ek not in schema_dict:
                    entry.pop(ek)
                # entry[ek] = str(entry[ek])	# coerce to be string
            ofp2.write(json.dumps(entry) + "\n")
            for key in css_keys:
                if key not in entry:
                    entry[key] = None
            dw.writerow(entry)
        ofp.close()
        ofp2.close()

        # upload to bigquery
        # the_schema = [ { 'type': 'STRING', 'name': x } for x in css_keys ]
        if 1:
            gsfnp = gspath / dataset / (css_table + ".json")
            gsutil.upload_file_to_gs(ofn2, gsfnp)
            # bqutil.load_data_to_table(dataset, css_table, gsfnp, the_schema, wait=True, verbose=False,
            #                           format='csv', skiprows=1)
            bqutil.load_data_to_table(dataset,
                                      css_table,
                                      gsfnp,
                                      the_schema,
                                      wait=True,
                                      verbose=False)

        return

    print "-" * 60 + " %s" % course_id

    # get nweeks from listings
    lfn = path(listings_file)
    if not lfn.exists():
        print "[analyze_content] course listings file %s doesn't exist!" % lfn
        return

    data = None
    if listings_file.endswith('.json'):
        data_feed = map(json.loads, open(lfn))
    else:
        data_feed = csv.DictReader(open(lfn))
    for k in data_feed:
        if not 'course_id' in k:
            print "Strange course listings row, no course_id in %s" % k
            raise Exception("Missing course_id")
        if k['course_id'] == course_id:
            data = k
            break

    if not data:
        print "[analyze_content] no entry for %s found in course listings file %s!" % (
            course_id, lfn)
        return

    def date_parse(field):
        (m, d, y) = map(int, data[field].split('/'))
        return datetime.datetime(y, m, d)

    launch = date_parse('Course Launch')
    wrap = date_parse('Course Wrap')
    ndays = (wrap - launch).days
    nweeks = ndays / 7.0

    print "Course length = %6.2f weeks (%d days)" % (nweeks, ndays)

    if pin_date:
        datedir = pin_date
    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest and not pin_date)
    cfn = gsutil.path_from_course_id(course_id)

    xbfn = course_dir / ("xbundle_%s.xml" % cfn)

    if not xbfn.exists():
        print "[analyze_content] cannot find xbundle file %s for %s!" % (
            xbfn, course_id)

        if use_dataset_latest:
            # try looking in earlier directories for xbundle file
            import glob
            spath = course_dir / ("../*/xbundle_%s.xml" % cfn)
            files = list(glob.glob(spath))
            if files:
                xbfn = path(files[-1])
            if not xbfn.exists():
                print "   --> also cannot find any %s ; aborting!" % spath
            else:
                print "   --> Found and using instead: %s " % xbfn
        if not xbfn.exists():
            raise Exception("[analyze_content] missing xbundle file %s" % xbfn)

    # if there is an xbundle*.fixed file, use that instead of the normal one
    if os.path.exists(str(xbfn) + ".fixed"):
        xbfn = path(str(xbfn) + ".fixed")

    print "[analyze_content] For %s using %s" % (course_id, xbfn)

    # get module usage data
    mudata = get_stats_module_usage(course_id, basedir, datedir,
                                    use_dataset_latest)

    xml = etree.parse(open(xbfn)).getroot()

    counts = defaultdict(int)
    nexcluded = defaultdict(int)

    IGNORE = [
        'html', 'p', 'div', 'iframe', 'ol', 'li', 'ul', 'blockquote', 'h1',
        'em', 'b', 'h2', 'h3', 'body', 'span', 'strong', 'a', 'sub', 'strike',
        'table', 'td', 'tr', 's', 'tbody', 'sup', 'sub', 'strike', 'i', 's',
        'pre', 'policy', 'metadata', 'grading_policy', 'br', 'center', 'wiki',
        'course', 'font', 'tt', 'it', 'dl', 'startouttext', 'endouttext', 'h4',
        'head', 'source', 'dt', 'hr', 'u', 'style', 'dd', 'script', 'th', 'p',
        'P', 'TABLE', 'TD', 'small', 'text', 'title'
    ]

    problem_stats = defaultdict(int)

    def does_problem_have_random_script(problem):
        '''
        return 1 if problem has a script with "random." in it
        else return 0
        '''
        for elem in problem.findall('.//script'):
            if elem.text and ('random.' in elem.text):
                return 1
        return 0

    # walk through xbundle
    def walk_tree(elem, policy=None):
        '''
        Walk XML tree recursively.
        elem = current element
        policy = dict of attributes for children to inherit, with fields like due, graded, showanswer
        '''
        policy = policy or {}
        if type(elem.tag) == str and (elem.tag.lower() not in IGNORE):
            counts[elem.tag.lower()] += 1
        if elem.tag in [
                "sequential", "problem", "problemset", "course", "chapter"
        ]:  # very old courses may use inheritance from course & chapter
            keys = ["due", "graded", "format", "showanswer", "start"]
            for k in keys:  # copy inheritable attributes, if they are specified
                val = elem.get(k)
                if val:
                    policy[k] = val
        if elem.tag == "problem":  # accumulate statistics about problems: how many have show_answer = [past_due, closed] ?  have random. in script?
            problem_stats['n_capa_problems'] += 1
            if policy.get('showanswer'):
                problem_stats["n_showanswer_%s" %
                              policy.get('showanswer')] += 1
            else:
                problem_stats[
                    'n_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)
                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/capa_base.py#L118
                # finished = Show the answer after the student has answered the problem correctly, the student has no attempts left, or the problem due date has passed.
            problem_stats[
                'n_random_script'] += does_problem_have_random_script(elem)

            if policy.get('graded') == 'true' or policy.get(
                    'graded') == 'True':
                problem_stats['n_capa_problems_graded'] += 1
                problem_stats[
                    'n_graded_random_script'] += does_problem_have_random_script(
                        elem)
                if policy.get('showanswer'):
                    problem_stats["n_graded_showanswer_%s" %
                                  policy.get('showanswer')] += 1
                else:
                    problem_stats[
                        'n_graded_shownanswer_finished'] += 1  # DEFAULT showanswer = finished  (make sure this remains true)

        for k in elem:
            midfrag = (k.tag, k.get('url_name_orig', None))
            if (midfrag in mudata) and int(mudata[midfrag]['ncount']) < 20:
                nexcluded[k.tag] += 1
                if verbose:
                    try:
                        print "    -> excluding %s (%s), ncount=%s" % (
                            k.get('display_name',
                                  '<no_display_name>').encode('utf8'), midfrag,
                            mudata.get(midfrag, {}).get('ncount'))
                    except Exception as err:
                        print "    -> excluding ", k
                continue
            walk_tree(k, policy.copy())

    walk_tree(xml)
    print "--> Count of individual element tags throughout XML: ", counts

    print "--> problem_stats:", json.dumps(problem_stats, indent=4)

    # combine some into "qual_axis" and others into "quant_axis"
    qual_axis = [
        'openassessment',
        'optionresponse',
        'multiplechoiceresponse',
        # 'discussion',
        'choiceresponse',
        'word_cloud',
        'combinedopenended',
        'choiceresponse',
        'stringresponse',
        'textannotation',
        'openended',
        'lti'
    ]
    quant_axis = [
        'formularesponse', 'numericalresponse', 'customresponse',
        'symbolicresponse', 'coderesponse', 'imageresponse'
    ]

    nqual = 0
    nquant = 0
    for tag, count in counts.items():
        if tag in qual_axis:
            nqual += count
        if tag in quant_axis:
            nquant += count

    print "nqual=%d, nquant=%d" % (nqual, nquant)

    nqual_per_week = nqual / nweeks
    nquant_per_week = nquant / nweeks
    total_per_week = nqual_per_week + nquant_per_week

    print "per week: nqual=%6.2f, nquant=%6.2f total=%6.2f" % (
        nqual_per_week, nquant_per_week, total_per_week)

    # save this overall data in CCDATA
    lock_file(CCDATA)
    ccdfn = path(CCDATA)
    ccd = {}
    if ccdfn.exists():
        for k in csv.DictReader(open(ccdfn)):
            ccd[k['course_id']] = k

    ccd[course_id] = {
        'course_id': course_id,
        'nweeks': nweeks,
        'nqual_per_week': nqual_per_week,
        'nquant_per_week': nquant_per_week,
        'total_assessments_per_week': total_per_week,
    }

    # fields = ccd[ccd.keys()[0]].keys()
    fields = [
        'course_id', 'nquant_per_week', 'total_assessments_per_week',
        'nqual_per_week', 'nweeks'
    ]
    cfp = open(ccdfn, 'w')
    dw = csv.DictWriter(cfp, fieldnames=fields)
    dw.writeheader()
    for cid, entry in ccd.items():
        dw.writerow(entry)
    cfp.close()
    lock_file(CCDATA, release=True)

    # store data in course_metainfo table, which has one (course_id, key, value) on each line
    # keys include nweeks, nqual, nquant, count_* for module types *

    cmfields = OrderedDict()
    cmfields['course_id'] = course_id
    cmfields['course_length_days'] = str(ndays)
    cmfields.update(
        {make_key('listings_%s' % key): value
         for key, value in data.items()})  # from course listings
    cmfields.update(ccd[course_id].copy())

    # cmfields.update({ ('count_%s' % key) : str(value) for key, value in counts.items() })	# from content counts

    cmfields['filename_xbundle'] = xbfn
    cmfields['filename_listings'] = lfn

    for key in sorted(
            counts
    ):  # store counts in sorted order, so that the later generated CSV file can have a predictable structure
        value = counts[key]
        cmfields['count_%s' % key] = str(value)  # from content counts

    for key in sorted(problem_stats):  # store problem stats
        value = problem_stats[key]
        cmfields['problem_stat_%s' % key] = str(value)

    cmfields.update({('nexcluded_sub_20_%s' % key): str(value)
                     for key, value in nexcluded.items()
                     })  # from content counts

    course_dir = find_course_sql_dir(course_id, basedir, datedir,
                                     use_dataset_latest)
    csvfn = course_dir / CMINFO

    # manual overriding of the automatically computed fields can be done by storing course_id,key,value data
    # in the CMINFO_OVERRIDES file

    csvfn_overrides = course_dir / CMINFO_OVERRIDES
    if csvfn_overrides.exists():
        print "--> Loading manual override information from %s" % csvfn_overrides
        for ovent in csv.DictReader(open(csvfn_overrides)):
            if not ovent['course_id'] == course_id:
                print "===> ERROR! override file has entry with wrong course_id: %s" % ovent
                continue
            print "    overriding key=%s with value=%s" % (ovent['key'],
                                                           ovent['value'])
            cmfields[ovent['key']] = ovent['value']

    print "--> Course metainfo writing to %s" % csvfn

    fp = open(csvfn, 'w')

    cdw = csv.DictWriter(fp, fieldnames=['course_id', 'key', 'value'])
    cdw.writeheader()

    for k, v in cmfields.items():
        cdw.writerow({'course_id': course_id, 'key': k, 'value': v})

    fp.close()

    # build and output course_listings_and_metainfo

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    mypath = os.path.dirname(os.path.realpath(__file__))
    clm_table = "course_listing_and_metainfo"
    clm_schema_file = '%s/schemas/schema_%s.json' % (mypath, clm_table)
    clm_schema = json.loads(open(clm_schema_file).read())

    clm = {}
    for finfo in clm_schema:
        field = finfo['name']
        clm[field] = cmfields.get(field)
    clm_fnb = clm_table + ".json"
    clm_fn = course_dir / clm_fnb
    open(clm_fn, 'w').write(json.dumps(clm))

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / clm_fnb
    print "--> Course listing + metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, clm_table)
    sys.stdout.flush()
    gsutil.upload_file_to_gs(clm_fn, gsfnp)
    bqutil.load_data_to_table(dataset,
                              clm_table,
                              gsfnp,
                              clm_schema,
                              wait=True,
                              verbose=False)

    # output course_metainfo

    table = 'course_metainfo'
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    gsfnp = gsutil.gs_path_from_course_id(
        course_id, use_dataset_latest=use_dataset_latest) / CMINFO
    print "--> Course metainfo uploading to %s then to %s.%s" % (
        gsfnp, dataset, table)
    sys.stdout.flush()

    gsutil.upload_file_to_gs(csvfn, gsfnp)

    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/schemas/schema_course_metainfo.json' % mypath
    the_schema = json.loads(open(SCHEMA_FILE).read())[table]

    bqutil.load_data_to_table(dataset,
                              table,
                              gsfnp,
                              the_schema,
                              wait=True,
                              verbose=False,
                              format='csv',
                              skiprows=1)
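# Minimal standalone sketch, added for illustration (not part of the original
# source), of the xbundle tag-counting and policy-inheritance idea used by
# analyze_course_content above: walk the course XML, count element tags outside
# an ignore list, and let children inherit attributes such as "graded".  lxml
# and the toy ignore list / XML below are assumptions of this sketch.
def example_count_xbundle_tags(xml_string, ignore=('html', 'p', 'div')):
    from collections import defaultdict
    from lxml import etree

    counts = defaultdict(int)

    def walk(elem, policy=None):
        policy = dict(policy or {})
        for attr in ("graded", "showanswer", "due"):    # inheritable attributes
            if elem.get(attr):
                policy[attr] = elem.get(attr)
        if isinstance(elem.tag, str) and elem.tag.lower() not in ignore:
            counts[elem.tag.lower()] += 1
            if elem.tag == "problem" and policy.get("graded", "").lower() == "true":
                counts["problem_graded"] += 1
        for child in elem:
            walk(child, policy)

    walk(etree.fromstring(xml_string))
    return dict(counts)

# e.g. example_count_xbundle_tags('<course><chapter graded="true"><problem/></chapter></course>')
# -> {'course': 1, 'chapter': 1, 'problem': 1, 'problem_graded': 1}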
def process_time_on_asset_totals(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Compute total time-on-asset values, across various subpopulations of users, for different
    assets (labeled by their module IDs).  This requires the time_on_asset_daily table.

    The time_on_asset_totals table has these columns:

    - course_id
    - module_id: ID for the asset (including video, problem, text, vertical, sequential - see course axis)
    - n_unique_users: number of unique users who accessed the asset
    - n_unique_certified: number of unique users who accessed the asset and also earned a certificate
    - mean_tmid5: mean time spent on asset [sec], for the given module_id, with a 5-minute timeout
    - cert_mean_tmid5: mean time spent on asset by certified users [sec], for the given module_id, with a 5-minute timeout
    - mean_tmid30: mean time spent on asset [sec], for the given module_id, with a 30-minute timeout
    - cert_mean_tmid30: mean time spent on asset by certified users [sec], for the given module_id, with a 30-minute timeout
    - median_tmid5: median time spent on asset [sec], for the given module_id, with a 5-minute timeout
    - cert_median_tmid5: median time spent on asset by certified users [sec], for the given module_id, with a 5-minute timeout
    - median_tmid30: median time spent on asset [sec], for the given module_id, with a 30-minute timeout
    - cert_median_tmid30: median time spent on asset by certified users [sec], for the given module_id, with a 30-minute timeout
    - total_tmid5: total time spent on given module_id, in seconds, with a 5-minute timeout
    - cert_total_tmid5: total time spent on given module_id, in seconds, with a 5-minute timeout, by certified users
    - total_tmid30: total time spent on given module_id, in seconds, with a 30-minute timeout
    - cert_total_tmid30: total time spent on given module_id, in seconds, with a 30-minute timeout, by certified users
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    SQL = """
      SELECT 
          "{course_id}" as course_id,
          module_id, 
          EXACT_COUNT_DISTINCT(username) as n_unique_users,
          EXACT_COUNT_DISTINCT (certified_username) as n_unique_certified,
          AVG(time_umid5) as mean_tmid5,
          AVG(cert_time_umid5) as cert_mean_tmid5,
          AVG(time_umid30) as mean_tmid30,
          AVG(cert_time_umid30) as cert_mean_tmid30,
          NTH(26, QUANTILES(time_umid5, 50)) as median_tmid5,
          NTH(26, QUANTILES(cert_time_umid5, 50)) as cert_median_tmid5,
          NTH(26, QUANTILES(time_umid30, 50)) as median_tmid30,
          NTH(26, QUANTILES(cert_time_umid30, 50)) as cert_median_tmid30,
          sum(time_umid5) as total_tmid5,   # total time on module (by module_id) in seconds
          sum(cert_time_umid5) as cert_total_tmid5,
          sum(time_umid30) as total_tmid30, # mid5 has 5 minute timeout, mid30 has 30 min timeout
          sum(cert_time_umid30) as cert_total_tmid30,
      FROM (
          SELECT
            TL.module_id as module_id,
            TL.username as username,
            (case when certified then TL.username else null end) as certified_username,
            sum(time_umid5) as time_umid5,
            sum(time_umid30) as time_umid30,
            sum(case when certified then time_umid5 else null end) as cert_time_umid5,
            sum(case when certified then time_umid30 else null end) as cert_time_umid30,

          FROM [{dataset}.time_on_asset_daily] TL
          JOIN [{dataset}.person_course] PC	# join to know who certified or attempted a problem
          ON TL.username = PC.username
          WHERE TL.time_umid5 is not null
                AND PC.nproblem_check > 0	# limit to users who attempted at least one problem
          GROUP BY module_id, username, certified_username
          ORDER BY module_id, username
        )
        GROUP BY module_id
        order by module_id
         """

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    the_sql = SQL.format(dataset=dataset, course_id=course_id)

    tablename = 'time_on_asset_totals'

    print "Computing %s for %s" % (tablename, dataset)
    sys.stdout.flush()

    bqdat = bqutil.get_bq_table(dataset, tablename, the_sql,
                                force_query=force_recompute,
                                depends_on=[ '%s.time_on_asset_daily' % dataset ],
                                )

    return bqdat
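# Hedged illustration, added (not part of the original source), of what the
# mean_tmid5 / median_tmid5 / total_tmid5 columns represent: first sum each user's
# daily time-on-asset values per module, then take the mean / median / total across
# users.  NTH(26, QUANTILES(x, 50)) in the legacy-SQL query above is BigQuery's way
# of approximating that per-module median.
def example_time_on_asset_totals(daily_rows):
    '''
    daily_rows: iterable of dicts with keys module_id, username, time_umid5
                (seconds for one user, one module, one day), mimicking rows of
                time_on_asset_daily.
    Returns {module_id: {'n_unique_users', 'mean_tmid5', 'median_tmid5', 'total_tmid5'}}
    '''
    from collections import defaultdict

    per_user = defaultdict(float)           # (module_id, username) -> total seconds
    for row in daily_rows:
        per_user[(row['module_id'], row['username'])] += row['time_umid5']

    per_module = defaultdict(list)          # module_id -> list of per-user totals
    for (module_id, _username), total in per_user.items():
        per_module[module_id].append(total)

    out = {}
    for module_id, totals in per_module.items():
        totals.sort()
        n = len(totals)
        median = totals[n // 2] if n % 2 else (totals[n // 2 - 1] + totals[n // 2]) / 2.0
        out[module_id] = {
            'n_unique_users': n,
            'mean_tmid5': sum(totals) / n,
            'median_tmid5': median,
            'total_tmid5': sum(totals),
        }
    return out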
Example #18
def create_problem_first_attempt_correct_table(course_id,
                                               force_recompute=False,
                                               use_dataset_latest=False):
    '''
    It is very useful to know, for each graded problem, the percentage of users who got the problem
    correct on their first attempt.  This information is computed and stored in the problem_first_attempt_correct
    table, for explorers, users who completed, and users who certified.  Problems are indexed by problem_nid,
    which is a unique index used by course_problem and course_item.
    '''

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    tablename = "problem_first_attempt_correct"

    the_sql = """
# compute problem_first_attempt_correct table for {course_id}
SELECT
    problem_nid,
    n_first_attempt_correct_by_certified,
    n_certified_users_attempted,
    n_first_attempt_correct_by_certified / n_certified_users_attempted * 100 as pct_correct_first_attempt_by_certified,
    n_first_attempt_correct_by_completed,
    n_completed_users_attempted,
    n_first_attempt_correct_by_completed / n_completed_users_attempted * 100 as pct_correct_first_attempt_by_completed,
    n_first_attempt_correct_by_explored,
    n_explored_users_attempted,
    n_first_attempt_correct_by_explored / n_explored_users_attempted * 100 as pct_correct_first_attempt_by_explored,
FROM (
    SELECT 

      PP.problem_nid as problem_nid,
      sum(case when PC.certified and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_certified,
      sum(case when PC.completed and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_completed,
      sum(case when PC.explored and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_explored,
      count(case when PC.certified then PP.user_id else null end) as n_certified_users_attempted,
      count(case when PC.completed then PP.user_id else null end) as n_completed_users_attempted,
      count(case when PC.explored then PP.user_id else null end) as n_explored_users_attempted,

    FROM [{dataset}.person_problem] PP
    JOIN [{dataset}.person_course] PC
    on PP.user_id = PC.user_id
    WHERE PC.certified or PC.completed or PC.explored
    group by problem_nid
    order by problem_nid
)
    """.format(dataset=dataset, course_id=course_id)

    depends_on = [
        "%s.person_problem" % dataset,
        "%s.person_course" % dataset,
    ]

    try:
        bqdat = bqutil.get_bq_table(dataset,
                                    tablename,
                                    the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    startIndex=-2)
    except Exception as err:
        print "[create_problem_first_attempt_correct_table] ERR! failed in creating %s.%s using this sql:" % (
            dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id,
                                                         nfound)
    sys.stdout.flush()
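# Hedged pure-Python sketch, added (not part of the original source), of the
# per-problem statistic computed above: among certified users who attempted a
# problem, the percentage who scored 100% on their first (and only) attempt.
# Rows mimic a join of person_problem with person_course.
def example_pct_first_attempt_correct(rows):
    '''
    rows: iterable of dicts with keys problem_nid, certified (bool),
          n_attempts (int), problem_pct_score (0-100).
    Returns {problem_nid: pct_correct_first_attempt_by_certified}
    '''
    from collections import defaultdict

    attempted = defaultdict(int)      # problem_nid -> certified users who attempted
    first_correct = defaultdict(int)  # problem_nid -> certified users correct on first attempt
    for row in rows:
        if not row['certified']:
            continue
        attempted[row['problem_nid']] += 1
        if row['n_attempts'] == 1 and row['problem_pct_score'] == 100:
            first_correct[row['problem_nid']] += 1

    return {pnid: (first_correct[pnid] * 100.0 / n if n else None)
            for pnid, n in attempted.items()}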
Example #20
def create_course_item_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    the course_item dataset has these columns:

    Field Name                              Type    Example         Description
    item_id                                 string  i4x-MITx-8_MReV-problem-CheckPoint_1_Newton_s_First_Law_2_1     
                                                                    Unique ID for an assessment item (constructed using the problem module_id, and linked to problem_analysis table keys)
    problem_id                              string  CheckPoint_1_Newton_s_First_Law 
                                                                    Unique ID for an assessment problem (constructed using problem url_name)
    problem_nid                             integer 27              unique problem numerical id (equal to the sequential count of problems up to this one)
    assignment_short_id                     string  HW_4            Unique short ID for assignment, using assignment short name + "_" + assignment_seq_num (should be same as what shows up in user's edX platform progress page)
    item_weight                             float   6.59E-05        Fraction of overall grade (between 0 and 1) contributed by this item
    n_user_responses                        integer 4868            Number of users who provided a response to this assessment item
    problem_name                            string  CheckPoint 1: Newton's First Law        
                                                                    Name of problem within which this item exists
    chapter_name                            string  Chapter 1       Name of chapter within which the problem exists
    section_name                            string  Section 1       Name of section (aka sequential) within which the problem exists
    assignment_id                           string  Checkpoint_ch3  Unique ID for the assignment within which the problem exists
    n_problems_in_assignment                integer 23              Number of problems within the assignment
    assignment_type                         string  Checkpoint      The assignment type within which the assignment exists
    assignment_type_weight                  float   0.1             Fraction of the overall grade contributed by the assignment type
    n_assignments_of_type                   integer 11              Number of assignments of this type
    assignment_seq_num                      integer 3               Sequential number of the assignment_type within the course
    chapter_number                          integer 3               Number of the chapter within which the problem exists
    section_number                          integer 3               Number of the section (aka sequential) within which the problem exists
    content_index                           integer 141             Index number of the problem within the content course axis
    problem_weight                          integer 1               Weight of the problem within the assignment
    item_points_possible                    float   1               Always 1 (used for debugging - number of points assigned to an item)
    problem_points_possible                 integer 6               Always equal to the number of items in the assignment (used for debugging)
    emperical_item_points_possible          integer 1               Empirical value of the item's point value, based on user data in the problem_analysis table (for debugging)
    emperical_problem_points_possible       integer 6               Empirical value of the maximum number of points possible for the problem, based on problem_analysis (for debugging)
    item_number                             integer 1               Number of the item, within the problem (in order of presentation, starting from 1)
    n_items                                 integer 6               Number of items within the problem
    start_date                              date    2013-06-01 00:01:00 UTC 
                                                                    Date when problem was issued
    due_date                                date    2013-06-23 23:59:00 UTC 
                                                                    Date when problem was due
    problem_path                            string  /Unit_1/Newtons_First_Law/2/1   
                                                                    Path of problem within course content, specifying chapter and sequential
    problem_short_id                        string  HW_7__3         short (and unique) problem ID, made using assignment short ID + "__" + problem number
    item_short_id                           string  HW_7__3_1       short (and unique) item ID, made using problem short ID + "_" + item number
    item_nid                                integer 41              unique item numerical id (equal to the row number of this entry in the course_item table)
    cumulative_item_weight                  float   6.59E-05        Cumulative fraction of item weights (for debugging: should increase to 1.0 by the end of table)
    is_split                                boolean False           Boolean flag indicating if this item was within an A/B split_test or not
    split_name                              string  CircMotionAB    Name of the split_test within which this item is placed, if is_split is True

    '''
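    # Added illustration (not part of the original source): a worked example, with
    # made-up numbers, of the item_weight formula used in the SQL below.  For an
    # assignment type worth 30% of the grade (fraction_of_overall_grade = 0.3) with
    # 10 assignments of that type, a problem of weight 1 inside an assignment whose
    # problem weights sum to 5, and with 2 items in the problem:
    if 0:    # never executed; kept in this source's own "if 0:" style
        example_item_weight = 1 * 0.3 / 2 / 5 / 10    # = 0.003
        # Summed over all items in all assignments of all types, these weights add
        # up to 1.0, which is why cumulative_item_weight should reach 1.0 by the
        # end of the table.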
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tablename = "course_item"

    the_sql = """
SELECT 
    # '{course_id}' as course_id,
    *,
    CONCAT(assignment_short_id, "__", STRING(problem_number)) as problem_short_id,
    CONCAT(assignment_short_id, "__", STRING(problem_number), "_", STRING(item_number)) as item_short_id,
    row_number() over (order by content_index, item_number) as item_nid,
    sum(item_weight) over (order by content_index, item_number) cumulative_item_weight
FROM
(
    # items with additional data about fraction_of_overall_grade from grading_policy
    SELECT item_id, 
        problem_id,
        max(if(item_number=1, x_item_nid, null)) over (partition by problem_id) as problem_nid,
        CONCAT(GP.short_label, "_", STRING(assignment_seq_num)) as assignment_short_id,
        (problem_weight * GP.fraction_of_overall_grade / n_items / sum_problem_weight_in_assignment / n_assignments_of_type) as item_weight,
        n_user_responses,
        chapter_name,
        section_name,
        vertical_name,
        problem_name,
        CI.assignment_id as assignment_id,
        n_problems_in_assignment,
        CI.assignment_type as assignment_type,
        GP.fraction_of_overall_grade as assignment_type_weight,
        n_assignments_of_type,
        assignment_seq_num,
        chapter_number,
        content_index,
        section_number,
        problem_number,
        problem_weight,
        item_points_possible,
        problem_points_possible,
        emperical_item_points_possible,
        emperical_problem_points_possible,
        item_number,
        n_items,
        start_date,
        due_date,
        is_split,
        split_name,
        problem_path,
    FROM
    (
        # items with number of problems per assignment
        SELECT item_id, item_number,
            n_items,
            problem_id,
            row_number() over (partition by item_number order by content_index) as x_item_nid,
            n_user_responses,
            chapter_name,
            section_name,
            vertical_name,
            problem_name,
            assignment_id,
            sum(if(assignment_id is not null and item_number=1, 1, 0)) over (partition by assignment_id) n_problems_in_assignment,
            sum(if(assignment_id is not null and item_number=1, problem_weight, 0)) 
                over (partition by assignment_id) sum_problem_weight_in_assignment,
            assignment_type,
            n_assignments_of_type,
            assignment_seq_num,
            chapter_number,
            section_number,
            problem_number,
            problem_path,
            content_index,
            start_date,
            due_date,
            is_split,
            split_name,
            problem_weight,
            item_points_possible,
            problem_points_possible,
            emperical_item_points_possible,
            emperical_problem_points_possible,
        FROM
        (
            # items from problem_analysis with metadata from course_axis
            SELECT item_id, item_number,
                n_items,
                problem_id,
                n_user_responses,
                CA.name as problem_name,
                chapter_name,
                section_name,
                vertical_name,
                assignment_id,
                assignment_type,
                n_assignments_of_type,
                CA.assignment_seq_num as assignment_seq_num,
                CA.chapter_number as chapter_number,
                CA.section_number as section_number,
                CA.problem_number as problem_number,
                CA.path as problem_path,
                CA.index as content_index,
                CA.start as start_date,
                CA.due as due_date,
                CA.is_split as is_split,
                CA.split_name as split_name,
                if(CA.weight is null, 1.0, CA.weight) as problem_weight,
                item_points_possible,
                problem_points_possible,
                emperical_item_points_possible,
                emperical_problem_points_possible,
            FROM
            (
                # get items with item metadata from problem_analysis table
                SELECT item_id, item_number,
                    n_items,
                    problem_id,
                    n_user_responses,
                    1.0 as item_points_possible,
                    1.0 * n_items as problem_points_possible,
                    problem_points_possible / n_items as emperical_item_points_possible,
                    problem_points_possible as emperical_problem_points_possible,
                FROM
                (
                    SELECT item_id, item_number,
                        max(item_number) over (partition by problem_id) n_items,
                        problem_id,
                        problem_points_possible,
                        n_user_responses,
                    FROM
                    (
                        SELECT item_id,
                            row_number() over (partition by problem_id order by item_id) item_number,
                            problem_id,
                            problem_points_possible,
                            n_user_responses,
                        FROM
                        (
                            SELECT item.answer_id as item_id,
                                problem_url_name as problem_id,
                                max_grade as problem_points_possible,
                                count(*) as n_user_responses,
                            FROM [{dataset}.problem_analysis]
                            group by item_id, problem_id, problem_points_possible
                            having n_user_responses > 5   # minimum cutoff for an item to be included
                        )
                    )
                )
                order by item_id, item_number
            ) as PA
            JOIN 
            (
                # -------------------------------------------------- graded problems from course axis
                # master table of graded problems from course_axis, with assignment metadata
                SELECT module_id,
                    url_name,
                    index,
                    weight,
                    assignment_type,
                    MAX(IF(problem_number=1, x_assignment_seq_num, null)) over (partition by assignment_id) as assignment_seq_num,
                    problem_number,
                    assignment_id,
                    n_assignments_of_type,
                    chapter_name,
                    section_name,
                    vertical_name,
                    name,
                    path,
                    start,
                    due,
                    is_split,
                    split_name,
                    chapter_number,
                    section_number,
                FROM
                (
                    # course_axis with chapter number and number of assignments of type
                    SELECT *,  # add column with number of assignments of type
                        SUM(IF(problem_number=1, 1, 0)) over (partition by assignment_type) n_assignments_of_type,
                        row_number() over (partition by assignment_type, problem_number order by index) as x_assignment_seq_num,
                    FROM
                    (
                        # ---------------------------------------- course axis with vertical name
                        SELECT module_id,
                            url_name,
                            index,
                            weight,
                            assignment_type,
                            chapter_number,
                            section_number,
                            assignment_id,  
                            chapter_name,
                            section_name,
                            vertical_name,
                            name,
                            path,
                            start,
                            due,
                            is_split,
                            split_name,
                            # add column with problem number within assignment_id
                            row_number() over (partition by assignment_id order by index) problem_number,
                        FROM
                        (
                            # course axis of problems which have non-null grading_format, including chapter number
                            # and section (aka sequential) number (within the chapter)
                            SELECT CAI.module_id as module_id,
                                CAI.url_name as url_name,
                                index,
                                weight,
                                assignment_type,
                                chapter_number,
                                section_number,
                                #  assignment_id = assignment_type + ch_chapter_number + sec_section_number
                                CONCAT(assignment_type, "_ch", STRING(chapter_number), "_sec", STRING(section_number)) as assignment_id,  
                                chapter_name,
                                section_name,
                                name,
                                path,
                                start,
                                due,
                                is_split,
                                split_name,
                                parent,
                            FROM 
                            (
                                # course axis entries of things which have non-null grading format, with section_mid from path
                                SELECT module_id,
                                    url_name,
                                    index,
                                    If(data.weight is null, 1.0, data.weight) as weight,
                                    gformat as assignment_type,
                                    chapter_mid as chapter_mid,
                                    REGEXP_EXTRACT(path, '^/[^/]+/([^/]+)') as section_mid,
                                    name,
                                    path,
                                    start,
                                    due,
                                    is_split,
                                    split_url_name as split_name,
                                    parent,
                                FROM [{dataset}.course_axis] CAI
                                where gformat is not null 
                                and category = "problem"
                                order by index
                            ) CAI
                            LEFT JOIN  # join course_axis with itself to get chapter_number and section_number
                            (   
                                # get chapters and sections (aka sequentials) with module_id, chapter_number, and section_number
                                # each assignment is identified by assignment_type + chapter_number + section_number
                                # note in some previous calculations, the section_number was left out by mistake
                                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/course_module.py#L1305
                                SELECT module_id, url_name, name as section_name,
                                    max(if(category="chapter", x_chapter_number, null)) over (partition by chapter_mid order by index) as chapter_number,
                                    section_number,
                                    chapter_name,
                                FROM
                                (
                                    SELECT module_id, url_name,
                                        row_number() over (partition by category order by index) as x_chapter_number,
                                        row_number() over (partition by chapter_mid, category order by index) as section_number,
                                        FIRST_VALUE(name) over (partition by chapter_mid order by index) as chapter_name,
                                        index,
                                        category,
                                        name,
                                        if(category="chapter", module_id, chapter_mid) as chapter_mid,
                                    FROM  [{dataset}.course_axis] 
                                    where category = "chapter" or category = "sequential" or category = "videosequence"
                                    order by index
                                )
                                order by index
                            ) CHN
                            # ON CAI.chapter_mid = CHN.chapter_mid  # old, for assignments by chapter
                            ON CAI.section_mid = CHN.url_name     # correct way, for assignments by section (aka sequential)
                            # where gformat is not null
                        ) CAPN
                        LEFT JOIN # join with course_axis to get names of verticals in which problems reside
                        (
                            # get verticals
                            SELECT url_name as vertical_url_name, 
                                name as vertical_name,
                            FROM  [{dataset}.course_axis] 
                            where category = "vertical"
                        ) CAV
                        ON CAPN.parent = CAV.vertical_url_name
                        # ---------------------------------------- END course axis with vertical_name
                    )
                )
                order by index
                # -------------------------------------------------- END graded problems from course axis
            ) CA
            ON PA.problem_id = CA.url_name
        )
    ) CI
    LEFT JOIN [{dataset}.grading_policy] GP
    ON CI.assignment_type = GP.assignment_type
    order by content_index, item_number
)
order by content_index, item_number
    """.format(dataset=dataset, course_id=course_id)

    depends_on = [ "%s.course_axis" % dataset,
                   "%s.grading_policy" % dataset,
                   "%s.problem_analysis" % dataset
               ]

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, 
                                    newer_than=datetime.datetime(2015, 10, 31, 17, 00),
                                    depends_on=depends_on,
                                    force_query=force_recompute)
    except Exception as err:
        print "[make_course_item_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = len(bqdat['data'])
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound)
    sys.stdout.flush()
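# Hedged usage sketch (not from the source): the "[make_course_item_table]" log
# prefix above suggests the enclosing function is named make_course_item_table;
# the course_id below is purely illustrative.
#
#   make_course_item_table("MITx/8.01x/2014_T1",
#                          force_recompute=False,
#                          use_dataset_latest=True)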
def createVideoStats_obsolete( course_id, force_recompute=False, use_dataset_latest=False, startDate=DATE_DEFAULT_START, endDate=DATE_DEFAULT_END ):
    '''
    Create video statistics: a video counts as "viewed" for a user whose playback position was > 0, and as "watched" for a user whose
    playback position exceeded 95% of the total video length.
    This was the original method, but it is inefficient because it queries the entire log set. Instead, generate video stats per day, then
    incrementally append to that table as the daily log data arrives.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_VIDEO_STATS
    
    the_sql = """
                 SELECT index_chapter,
                        index_video,
                        name,
                        video_id, 
                        chapter_name,
                        sum(case when position > 0 then 1 else 0 end) as videos_viewed, 
                        sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched,
                 FROM (SELECT username,
                              #module_id as video_id,
                              #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                              (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'[-]' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                              max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else  float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
                       FROM (TABLE_QUERY({logs},
                             "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start_date} and {end_date}"))
                       WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and
                              event is not null
                       group by username, video_id
                       order by username, video_id) as video_log,
                       LEFT JOIN EACH
                       (SELECT video_length,
                                video_id as vid_id,
                                name,
                                index_video,
                                index_chapter,
                                chapter_name
                        FROM [{dataset}.{videoaxis}]
                        ) as {videoaxis}
                        ON video_log.video_id = {videoaxis}.vid_id
                        WHERE video_id is not null
                        group by video_id, name, index_chapter, index_video, chapter_name
                        order by index_video asc;
                """.format(dataset=dataset,start_date=startDate,end_date=endDate,logs=logs, videoaxis=TABLE_VIDEO_AXIS)

    print "[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS, course_id)
    sys.stdout.flush()
        
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS )
        assert tinfo is not None, "[analyze videos] %s table depends on %s, which does not exist" % ( TABLE_VIDEO_STATS, TABLE_VIDEO_AXIS ) 

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s?  Skipping creation of %s" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS )
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute,
                                depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)],
                                )
    return bqdat
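# Minimal, self-contained sketch (not from the source) of the viewed/watched
# classification the SQL above applies per (username, video_id): "viewed" means
# the furthest playback position is past the start, "watched" means past 95%
# of the video length. The function name is illustrative only.
def _classify_video_position(position, video_length):
    # position: furthest playback position (seconds); video_length: total length (seconds)
    viewed = position > 0
    watched = video_length is not None and position > 0.95 * video_length
    return viewed, watched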
    def make_table(self, pcds_table=None, org=None, nskip=0):
        '''
        Get the specified person_course table.  Find IP addresses for which the country code is missing.
        Get country codes for each of those IP addreses using local copy of the maxmind geoip database.
        Store result in bigquery geoip table.
    
        Does not overwrite existing bigquery geoip table - adds new entries.
        '''

        if pcds_table is None:
            if org is None:
                print "Error!  Must specify either --table or --org"
                return
            dataset = 'course_report_' + org

            pctables = []
            for table in bqutil.get_list_of_table_ids(dataset):
                m = re.match('person_course_%s_([\d_]+)$' % org, table)
                if m:
                    pctables.append(table)
            pctables.sort()
            if not pctables:
                print "Error!  No person_course_%s_* tables found in dataset %s!" % (
                    org, dataset)
                return
            pctable = pctables[-1]
        else:
            (dataset, pctable) = pcds_table.split('.', 1)

        print "[make_geoip_table] Using person course from %s.%s" % (dataset,
                                                                     pctable)

        if nskip <= 0:
            pimc_table = "%s_ip_no_cc" % pctable
            sql = """SELECT ip, count(*) as n
                     FROM [{dataset}.{table}] 
                     where cc_by_ip is Null
                     group by ip
                  """.format(dataset=dataset, table=pctable)

            noips = bqutil.get_bq_table(dataset, pimc_table, sql)

            print "%d IP addresses missing geoip information in %s" % (len(
                noips['data']), pimc_table)
            # print noips['data'][:10]
            sys.stdout.flush()

            self.load_geoip()

            for entry in noips['data']:
                ip = entry['ip']
                if ip is None:
                    continue
                if ip in self.geoipdat:
                    # print "--> Already have geoip for %s, skipping" % ip
                    continue

                self.lookup_ip(ip)

                if (self.nchanged % 100 == 0):
                    sys.stdout.write('.')
                    sys.stdout.flush()
                    break

            print "Added %d new geoip entries" % self.nchanged
            sys.stdout.flush()
        else:
            nskip -= 1
        self.write_geoip_table()

        print "--> Done"
        sys.stdout.flush()
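# Minimal sketch (not from the source) of the person_course table-name matching
# used above; the org and table names below are illustrative only.
import re

_example_tables = ["person_course_MITx_2015_01_15",
                   "person_course_MITx_2014_09_21",
                   "person_course_viewed"]
_matches = sorted(t for t in _example_tables
                  if re.match(r'person_course_%s_([\d_]+)$' % "MITx", t))
# _matches[-1] == "person_course_MITx_2015_01_15"  (the latest dump is used)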
def createVideoStats(course_id,
                     force_recompute=False,
                     use_dataset_latest=False):
    '''
    Final step for video stats: aggregate the daily video stats table over the entire course to compute videos viewed and videos watched.
    Join the results with the video axis to get detailed per-video metadata for the dashboard data.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_VIDEO_STATS

    the_sql = """
              SELECT index_chapter,
                     index_video,
                     name,
                     video_id,
                     chapter_name,
                     sum(case when position > 0 then 1 else 0 end) as videos_viewed, 
                     sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched,
               FROM (
                     SELECT username, index_chapter,
                            index_video,
                            name,
                            video_id, 
                            chapter_name,
                            max(position) as position,
                            video_length,
                     FROM (SELECT * FROM [{dataset}.{videostatsperday}]) as video_log,
                           LEFT JOIN EACH
                          (SELECT video_length,
                                  video_id as vid_id,
                                  name,
                                  index_video,
                                  index_chapter,
                                  chapter_name
                           FROM [{dataset}.{videoaxis}]
                           ) as video_axis
                           ON video_log.video_id = video_axis.vid_id
                           WHERE video_id is not null and username is not null
                           group by username, video_id, name, index_chapter, index_video, chapter_name, video_length
                           order by video_id asc)
                GROUP BY video_id, index_chapter, index_video, name, chapter_name
                ORDER BY index_video asc;
                """.format(dataset=dataset,
                           videoaxis=TABLE_VIDEO_AXIS,
                           videostatsperday=TABLE_VIDEO_STATS_PER_DAY)

    print "[analyze_videos] Creating %s.%s table for %s" % (
        dataset, TABLE_VIDEO_STATS, course_id)
    sys.stdout.flush()

    try:
        tinfo_va = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        trows_va = int(tinfo_va['numRows'])
        tinfo_va_day = bqutil.get_bq_table_info(dataset,
                                                TABLE_VIDEO_STATS_PER_DAY)
        trows_va_day = int(tinfo_va_day['numRows'])
        assert tinfo_va is not None and trows_va != 0, "[analyze videos] %s table depends on %s, which does not exist" % (
            TABLE_VIDEO_STATS, TABLE_VIDEO_AXIS)
        assert tinfo_va_day is not None and trows_va_day != 0, "[analyze videos] %s table depends on %s, which does not exist" % (
            TABLE_VIDEO_STATS, TABLE_VIDEO_STATS_PER_DAY)

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s and/or %s (including 0 rows in table)?  Skipping creation of %s" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS_PER_DAY,
            TABLE_VIDEO_STATS)
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)],
    )
    return bqdat
def process_time_on_task_totals(course_id,
                                force_recompute=False,
                                use_dataset_latest=False,
                                config_parameter_overrides=None):
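    '''
    Aggregate the time_on_task table into per-user totals for the course: total time
    and per-activity (video, problem, forum, text) time, for both the short and long
    timeout windows, grouped by username.
    '''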

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    config_parameters = {
        'timeout_short': 5,
        'timeout_long': 30,
        'time_on_task_table_name': 'time_on_task',
        'time_on_task_totals_table_name': 'time_on_task_totals',
        'course_id': course_id,
        'dataset': dataset,
    }

    config_parameters.update(config_parameter_overrides or {})

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
                    username, 

                    sum(total_time_{timeout_short}) as total_time_{timeout_short},
                    sum(total_time_{timeout_long}) as total_time_{timeout_long},

                    sum(total_video_time_{timeout_short}) as total_video_time_{timeout_short},
                    sum(total_video_time_{timeout_long}) as total_video_time_{timeout_long},
                    sum(serial_video_time_{timeout_long}) as serial_video_time_{timeout_long},

                    sum(total_problem_time_{timeout_short}) as total_problem_time_{timeout_short},
                    sum(total_problem_time_{timeout_long}) as total_problem_time_{timeout_long},
                    sum(serial_problem_time_{timeout_long}) as serial_problem_time_{timeout_long},

                    sum(total_forum_time_{timeout_short}) as total_forum_time_{timeout_short},
                    sum(total_forum_time_{timeout_long}) as total_forum_time_{timeout_long},
                    sum(serial_forum_time_{timeout_long}) as serial_forum_time_{timeout_long},

                    sum(total_text_time_{timeout_short}) as total_text_time_{timeout_short},
                    sum(total_text_time_{timeout_long}) as total_text_time_{timeout_long},
                    sum(serial_text_time_{timeout_long}) as serial_text_time_{timeout_long},

            FROM [{dataset}.{time_on_task_table_name}]
            GROUP BY course_id, username
            order by username
         """

    the_sql = SQL.format(**config_parameters)

    tablename = config_parameters.get(
        'time_on_task_totals_table_name') or 'time_on_task_totals'

    print "Computing %s for %s" % (tablename, dataset)
    sys.stdout.flush()

    bqdat = bqutil.get_bq_table(
        dataset,
        tablename,
        the_sql,
        force_query=force_recompute,
        depends_on=['%s.time_on_task' % dataset],
    )

    return bqdat
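# Minimal sketch (not from the source) of how the SQL template above turns the
# timeout settings into column names via str.format:
_params = {'timeout_short': 5, 'timeout_long': 30}
_col = "sum(total_time_{timeout_short}) as total_time_{timeout_short}".format(**_params)
# _col == "sum(total_time_5) as total_time_5"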
def createPersonCourseVideo(course_id,
                            force_recompute=False,
                            use_dataset_latest=False):
    '''
    Create the person_course_video_watched table, based on video_stats.
    Each row gives the number of unique videos watched by a given user, for the given course.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    table = TABLE_PERSON_COURSE_VIDEO_WATCHED

    the_sql = """
                  SELECT user_id, 
                      "{course_id}" as course_id,
                      count(*) n_unique_videos_watched,
                      count(*) / n_total_videos as fract_total_videos_watched,
                      viewed, certified, verified
                  FROM
                  (
                      SELECT PC.user_id as user_id, UV.username as username,
                          video_id, 
                          n_views,
                          NV.n_total_videos as n_total_videos,
                          certified,
                          viewed,
                          (mode=="verified") as verified,
                      FROM
                      (
                          SELECT username, video_id, count(*) as n_views
                          FROM [{dataset}.video_stats_day] 
                          GROUP BY username, video_id
                      ) UV
                      JOIN [{dataset}.person_course] PC
                      on UV.username = PC.username
                      CROSS JOIN 
                      (
                          SELECT count(*) as n_total_videos
                          FROM [{dataset}.video_axis]
                      ) NV
                      WHERE ((PC.roles = 'Student') OR (PC.roles is NULL))	# accommodate case when roles.csv is missing
                      # WHERE PC.roles = 'Student'
                  )
                  GROUP BY user_id, certified, viewed, verified, n_total_videos
                  order by user_id
              """

    the_sql = the_sql.format(course_id=course_id, dataset=dataset)
    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.%s" % (dataset, TABLE_VIDEO_STATS)],
        newer_than=datetime.datetime(2017, 2, 6, 18, 30),
        startIndex=-2)
    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, table)
    print "--> Done with %s for %s, %d entries found" % (table, course_id,
                                                         nfound)
    sys.stdout.flush()

    return bqdat
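# Usage sketch (the course_id below is illustrative):
#
#   createPersonCourseVideo("MITx/8.01x/2014_T1", force_recompute=True)
#
# builds the person_course_video_watched table from video_stats_day, person_course,
# and video_axis, counting unique videos watched per (non-staff) user.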
def AnalyzeIDV(course_id, force_recompute=False, use_dataset_latest=False):
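    '''
    Create the idv_analysis table: engagement and problem-grade statistics for
    verified-ID (IDV) and non-IDV enrollees, computed as of the point of the last
    IDV enrollment, joined with counts of IDV enrollments and certificates the
    same users earned in other courses (via person_course_viewed, when available).
    '''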

    tablename = "idv_analysis"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    print "="*77
    print "Creating %s.%s table for %s" % (dataset, tablename, course_id)
    print "-"*77

    org = course_id.split('/',1)[0]
    dataset_cr = ('course_report_%s' % org)
    if use_dataset_latest:
        dataset_cr = 'course_report_latest'

    pcv = "person_course_viewed"
    try:
        tinfo = bqutil.get_bq_table_info(dataset_cr, "person_course_viewed")
        assert tinfo is not None
    except Exception as err:
        print " --> missing %s.%s ;  using dummy instead" % (dataset_cr, pcv)
        sys.stdout.flush()
        dataset_cr = dataset
        pcv = "person_course"

    the_sql = """
# IDV and non-IDV enrollee engagement at point of last IDV enrollment, before course end, including # IDV and certificates in other courses
SELECT
    "{course_id}" as course_id,    
    OC.user_id as user_id,
    OC.username as username,
    (OC.verified_enroll_time is not NULL) as is_idv,
    sum(case when PCI.verified_enroll_time is not NULL then 1 else 0 end) as n_other_idv,
    sum(case when PCI.verified_enroll_time is not NULL and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_idv,
    sum(case when PCI.certified and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_certified,
    sum(case when PCI.viewed and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_participated,
    sum(case when (PCI.verified_enroll_time is not NULL and PCI.start_time > OC.start_time and PCI.certified) then 1 else 0 end) as n_previous_idv_certified,
    first(gender) as gender,
    first(YoB) as YoB,
    first(LoE) as LoE,
    OC.n_problem_records as n_problem_records,
    OC.n_correct as n_correct,
    OC.n_incorrect as n_incorrect,
    OC.total_problem_points as total_problem_points,
    OC.verified_enroll_time as verified_enroll_time,
    OC.verified_unenroll_time as verified_unenroll_time,
    OC.verified_enroll_date as verified_enroll_date,
    OC.verified_unenroll_date as verified_unenroll_date,
    OC.nforum_pinned as nforum_pinned,
    OC.is_forum_moderator as is_forum_moderator,
    OC.final_course_grade as final_course_grade,
    OC.earned_certificate as earned_certificate,
    OC.n_show_answer as n_show_answer,
    OC.nprogcheck as nprogcheck,
    OC.nvideo as nvideo,
    OC.nforum_reads as nforum_reads,
    OC.nforum_posts as nforum_posts,
    OC.hours_on_system as hours_on_system,
    OC.countryLabel as countryLabel,
    OC.start_time as start_time,
FROM
(
    # engagement stats for NON verified ID versus verified ID, as of the date of the last IDV signup
    SELECT *
    FROM
    (
        # stats for NON verified ID, as of the date of the last IDV signup
        SELECT
        PAC.user_id as user_id,
            PAC.username as username,
            PAC.n_problem_records as n_problem_records,
            PAC.n_correct as n_correct,
            PAC.n_incorrect as n_incorrect,
            PAC.total_problem_points as total_problem_points,
            PAC.verified_enroll_time as verified_enroll_time,
            PAC.verified_unenroll_time as verified_unenroll_time,
            DATE(PAC.verified_enroll_time) as verified_enroll_date,
            DATE(PAC.verified_unenroll_time) as verified_unenroll_date,
            PAC.nforum_pinned as nforum_pinned,
            PAC.is_forum_moderator as is_forum_moderator,
            PAC.final_course_grade as final_course_grade,
            PAC.earned_certificate as earned_certificate,
            PAC.countryLabel as countryLabel,
            PAC.start_time as start_time,
            sum(PCD.nshow_answer) as n_show_answer,
            sum(PCD.nprogcheck) as nprogcheck,
            sum(PCD.nvideo) as nvideo,
            sum(PCD.nforum_reads) as nforum_reads,
            sum(PCD.nforum_posts) as nforum_posts,
            sum(PCD.sum_dt / 60 / 60) as hours_on_system,
        FROM
        (
            # get problem grade and activity counts up to date of verified ID enrollment
            SELECT PA.user_id as user_id,
                PC.username as username,
                count(*) as n_problem_records,
                sum(case when PA.item.correct_bool then 1 else 0 end) as n_correct,
                sum(case when PA.item.correct_bool==False then 1 else 0 end) as n_incorrect,
                sum(PA.grade) as total_problem_points,
                PC.verified_enroll_time as verified_enroll_time,
                PC.verified_unenroll_time as verified_unenroll_time,
                PC.nforum_pinned as nforum_pinned,
                PC.forumRoles_isModerator as is_forum_moderator,
                PC.grade as final_course_grade,
                PC.certified as earned_certificate,
                PC.countryLabel as countryLabel,
                PC.start_time as start_time,
                max_verified_enroll_time,
            FROM [{dataset}.problem_analysis] PA
            JOIN
            (
                SELECT user_id, username, verified_enroll_time, verified_unenroll_time, nforum_pinned,
                    forumRoles_isModerator, grade, certified, max_verified_enroll_time, countryLabel, start_time
                      FROM [{dataset}.person_course] PC
                CROSS JOIN
                (
                    SELECT max(verified_enroll_time) as max_verified_enroll_time
                            FROM [{dataset}.person_course]
                ) VET
                where viewed
            ) PC
            ON PA.user_id = PC.user_id
            where PA.created <= PC.max_verified_enroll_time
                and PC.verified_enroll_time is null
            group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate,
                verified_unenroll_time, max_verified_enroll_time, countryLabel, start_time
            order by user_id
        ) PAC
        JOIN [{dataset}.person_course_day] PCD
            ON PAC.username = PCD.username
        WHERE PCD.date < DATE(max_verified_enroll_time)
        group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate,
                 verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator,
                 verified_enroll_date, verified_unenroll_date, countryLabel, start_time
        order by user_id
    ),
    (
        # stats for those who DID enroll verified ID, as of the date of their IDV enrollment
        # include nprogcheck, nshow_answer, nproblem_check, nvideo, hours_on_system
        SELECT
        PAC.user_id as user_id,
            PAC.username as username,
            PAC.n_problem_records as n_problem_records,
            PAC.n_correct as n_correct,
            PAC.n_incorrect as n_incorrect,
            PAC.total_problem_points as total_problem_points,
            PAC.verified_enroll_time as verified_enroll_time,
            PAC.verified_unenroll_time as verified_unenroll_time,
            DATE(PAC.verified_enroll_time) as verified_enroll_date,
            DATE(PAC.verified_unenroll_time) as verified_unenroll_date,
            PAC.nforum_pinned as nforum_pinned,
            PAC.is_forum_moderator as is_forum_moderator,
            PAC.final_course_grade as final_course_grade,
            PAC.earned_certificate as earned_certificate,
            PAC.countryLabel as countryLabel,
            PAC.start_time as start_time,
            sum(PCD.nshow_answer) as n_show_answer,
            sum(PCD.nprogcheck) as nprogcheck,
            sum(PCD.nvideo) as nvideo,
            sum(PCD.nforum_reads) as nforum_reads,
            sum(PCD.nforum_posts) as nforum_posts,
            sum(PCD.sum_dt / 60 / 60) as hours_on_system,
        FROM
        (
            # get problem grade and activity counts up to date of verified ID enrollment
            SELECT PA.user_id as user_id,
                PC.username as username,
                count(*) as n_problem_records,
                sum(case when PA.item.correct_bool then 1 else 0 end) as n_correct,
                sum(case when PA.item.correct_bool==False then 1 else 0 end) as n_incorrect,
                sum(PA.grade) as total_problem_points,
                PC.verified_enroll_time as verified_enroll_time,
                PC.verified_unenroll_time as verified_unenroll_time,
                PC.nforum_pinned as nforum_pinned,
                PC.forumRoles_isModerator as is_forum_moderator,
                PC.grade as final_course_grade,
                PC.certified as earned_certificate,
                PC.countryLabel as countryLabel,
                PC.start_time as start_time,
            FROM [{dataset}.problem_analysis] PA
            JOIN [{dataset}.person_course] PC
               ON PA.user_id = PC.user_id
            where PA.created <= PC.verified_enroll_time
            group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade,
                     earned_certificate, verified_unenroll_time, countryLabel, start_time
            order by user_id
        ) PAC
        JOIN [{dataset}.person_course_day] PCD
            ON PAC.username = PCD.username
        WHERE PCD.date < DATE(PAC.verified_enroll_time)
        group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate,
                 verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator,
                 verified_enroll_date, verified_unenroll_date, countryLabel, start_time
        order by user_id
    )
    order by verified_enroll_date, user_id
) OC
LEFT JOIN [{dataset_cr}.{pcv}] PCI
on OC.user_id = PCI.user_id
#where (PCI.verified_enroll_time is null) or (PCI.verified_enroll_time <= OC.verified_enroll_time)
group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate,
         verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator,
         verified_enroll_date, verified_unenroll_date,
         n_show_answer, nprogcheck, nvideo, nforum_reads, nforum_posts, hours_on_system, countryLabel, start_time, is_idv
order by verified_enroll_date, user_id
"""

    the_sql = the_sql.format(dataset=dataset, dataset_cr=dataset_cr, pcv=pcv, course_id=course_id)

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, force_query=force_recompute,
                                    depends_on=["%s.problem_course" % dataset, "%s.person_course_day" % dataset, "%s.problem_analysis" % dataset],
                                    allowLargeResults=True,
                                    startIndex=-2)
    except Exception as err:
        print "ERROR! Failed on SQL="
        print the_sql
        raise
    
    print "  --> created %s.%s" % (dataset, tablename)
    sys.stdout.flush()
def CreateForumPosts(course_id,
                     force_recompute=True,
                     use_dataset_latest=False,
                     skip_last_day=False,
                     end_date=None):
    '''
    Create Forum posts table, based on forum data. Categorizes forum posts as initial_post, response_post or comment.
    Also extracts first 100 characters of the post content as a preview.
    '''

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_POSTS

    the_sql = """
                        SELECT * FROM
                        (
                             SELECT ADDED_TITLE.FA.username as username,
				    "{course_id}" as course_id,
                                    ADDED_TITLE.FA.slug_id as slug_id,
                                    ADDED_TITLE.FA.slug_type as slug_type,
                                    ADDED_TITLE.FA.thread_id as thread_id,
                                    ADDED_TITLE.FA.parent_id as parent_id,
                                    ADDED_TITLE.IP.username as original_poster,
                                    ADD_RESPOND_TO.username as responded_to,
                                    ADDED_TITLE.IP.title as title,
                                    ADDED_TITLE.FA.first_time as first_time,
                                    ADDED_TITLE.FA.body_preview as body_preview,
                             FROM 
                             (
                                  SELECT * FROM 
                                  (
                                       SELECT author_username as username,
                                              mongoid as slug_id,
                                              (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                              comment_thread_id as thread_id,
                                              parent_id,
                                              title,
                                              created_at as first_time,
                                              SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                       FROM [{dataset}.{forum}]
                                       WHERE (_type = "Comment" and parent_id is not null)
                                  ) as FA # 3rd level comment
                                  LEFT JOIN EACH
                                  ( 
                                       SELECT * FROM
                                       ( 
                                            SELECT author_username as username,
                                                   mongoid as slug_id,
                                                   (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                     (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                                   comment_thread_id as thread_id,
                                                   parent_id,
                                                   title,
                                                   created_at as first_time,
                                                   SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                            FROM [{dataset}.{forum}]
                                            WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                                       )
                                  ) as IP
                                  ON FA.thread_id = IP.slug_id
                             ) as ADDED_TITLE
                             LEFT JOIN EACH
                             (
                                  SELECT author_username as username,
                                         mongoid as slug_id,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                  FROM [{dataset}.{forum}]
                             ) as ADD_RESPOND_TO
                             ON ADDED_TITLE.FA.parent_id = ADD_RESPOND_TO.slug_id
                             WHERE ADDED_TITLE.FA.slug_type = "comment"
                        ) as RC,
                        (
                             SELECT FA.username as username,
				    "{course_id}" as course_id,
                                    FA.slug_id as slug_id,
                                    FA.slug_type as slug_type,
                                    FA.thread_id as thread_id,
                                    FA.parent_id as parent_id,
                                    IP.username as original_poster,
                                    IP.title as title,
                                    FA.first_time as first_time,
                                    FA.body_preview as body_preview
                             FROM 
                             (
                                  SELECT author_username as username,
                                         mongoid as slug_id,
                                         (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                           (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                         title,
                                         created_at as first_time,
                                         SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                  FROM [{dataset}.{forum}]
                                  WHERE (_type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null)
                             ) as FA # 2nd level comment
                             LEFT JOIN EACH
                             (
                                  SELECT * FROM 
                                  (    
                                       SELECT author_username as username,
                                              mongoid as slug_id,
                                              (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                              comment_thread_id as thread_id,
                                              parent_id,
                                              title,
                                              created_at as first_time,
                                              SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                       FROM [{dataset}.{forum}]
                                       WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                                  )
                             ) as IP
                             ON FA.thread_id = IP.slug_id
                        ) as RC2,
                        (
                             SELECT * FROM
                             (
                                  SELECT author_username as username,
				         "{course_id}" as course_id,
                                         mongoid as slug_id,
                                         (case when _type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null then "initial_post" end) as slug_type,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                         title,
                                         created_at as first_time,
                                         SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                  FROM [{dataset}.{forum}]
                                  WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                             )
                        ) as NA
              """.format(dataset=dataset,
                         course_id=course_id,
                         forum=TABLE_FORUM,
                         post_preview_char_count=POST_PREVIEW_CHAR_COUNT)

    print "[make_forum_analysis] Creating %s.%s table for %s" % (
        dataset, TABLE_FORUM_POSTS, course_id)
    sys.stdout.flush()

    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_FORUM)
        assert tinfo is not None, "[make_forum_analysis] %s table depends on %s, which does not exist" % (
            TABLE_FORUM_POSTS, TABLE_FORUM)

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s?  Skipping creation of %s" % (
            dataset, TABLE_FORUM, TABLE_FORUM_POSTS)
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.%s" % (dataset, TABLE_FORUM)],
    )

    return bqdat
def CreateForumPosts( course_id, force_recompute=True, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create Forum posts table, based on forum data. Categorizes forum posts as initial_post, response_post or comment.
    Also extracts first 100 characters of the post content as a preview.
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_POSTS

    the_sql = """
                        SELECT * FROM
                        (
                             SELECT ADDED_TITLE.FA.username as username,
				    "{course_id}" as course_id,
                                    ADDED_TITLE.FA.slug_id as slug_id,
                                    ADDED_TITLE.FA.slug_type as slug_type,
                                    ADDED_TITLE.FA.thread_id as thread_id,
                                    ADDED_TITLE.FA.parent_id as parent_id,
                                    ADDED_TITLE.IP.username as original_poster,
                                    ADD_RESPOND_TO.username as responded_to,
                                    ADDED_TITLE.IP.title as title,
                                    ADDED_TITLE.FA.first_time as first_time,
                                    ADDED_TITLE.FA.body_preview as body_preview,
                             FROM 
                             (
                                  SELECT * FROM 
                                  (
                                       SELECT author_username as username,
                                              mongoid as slug_id,
                                              (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                              comment_thread_id as thread_id,
                                              parent_id,
                                              title,
                                              created_at as first_time,
                                              SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                       FROM [{dataset}.{forum}]
                                       WHERE (_type = "Comment" and parent_id is not null)
                                  ) as FA # 3rd level comment
                                  LEFT JOIN EACH
                                  ( 
                                       SELECT * FROM
                                       ( 
                                            SELECT author_username as username,
                                                   mongoid as slug_id,
                                                   (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                     (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                                   comment_thread_id as thread_id,
                                                   parent_id,
                                                   title,
                                                   created_at as first_time,
                                                   SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                            FROM [{dataset}.{forum}]
                                            WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                                       )
                                  ) as IP
                                  ON FA.thread_id = IP.slug_id
                             ) as ADDED_TITLE
                             LEFT JOIN EACH
                             (
                                  SELECT author_username as username,
                                         mongoid as slug_id,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                  FROM [{dataset}.{forum}]
                             ) as ADD_RESPOND_TO
                             ON ADDED_TITLE.FA.parent_id = ADD_RESPOND_TO.slug_id
                             WHERE ADDED_TITLE.FA.slug_type = "comment"
                        ) as RC,
                        (
                             SELECT FA.username as username,
				    "{course_id}" as course_id,
                                    FA.slug_id as slug_id,
                                    FA.slug_type as slug_type,
                                    FA.thread_id as thread_id,
                                    FA.parent_id as parent_id,
                                    IP.username as original_poster,
                                    IP.title as title,
                                    FA.first_time as first_time,
                                    FA.body_preview as body_preview
                             FROM 
                             (
                                  SELECT author_username as username,
                                         mongoid as slug_id,
                                         (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                           (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                         title,
                                         created_at as first_time,
                                         SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                  FROM [{dataset}.{forum}]
                                  WHERE (_type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null)
                             ) as FA # 2nd level comment
                             LEFT JOIN EACH
                             (
                                  SELECT * FROM 
                                  (    
                                       SELECT author_username as username,
                                              mongoid as slug_id,
                                              (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                              comment_thread_id as thread_id,
                                              parent_id,
                                              title,
                                              created_at as first_time,
                                              SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                       FROM [{dataset}.{forum}]
                                       WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                                  )
                             ) as IP
                             ON FA.thread_id = IP.slug_id
                        ) as RC2,
                        (
                             SELECT * FROM
                             (
                                  SELECT author_username as username,
				         "{course_id}" as course_id,
                                         mongoid as slug_id,
                                         (case when _type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null then "initial_post" end) as slug_type,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                         title,
                                         created_at as first_time,
                                         SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                  FROM [{dataset}.{forum}]
                                  WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                             )
                        ) as NA
              """.format( dataset=dataset, course_id=course_id, forum=TABLE_FORUM, post_preview_char_count=POST_PREVIEW_CHAR_COUNT )

    print "[make_forum_analysis] Creating %s.%s table for %s" % ( dataset, TABLE_FORUM_POSTS, course_id )
    sys.stdout.flush()

    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_FORUM )
        assert tinfo is not None, "[make_forum_analysis] %s table depends on %s, which does not exist" % ( TABLE_FORUM_POSTS, TABLE_FORUM )

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s?  Skipping creation of %s" % ( dataset, TABLE_FORUM, TABLE_FORUM_POSTS )
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute,
                                depends_on=["%s.%s" % (dataset, TABLE_FORUM)],
                                )

    return bqdat
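# Illustrative sketch (not part of the pipeline): the nested CASE WHEN expressions in the
# query above classify each forum document into one of three slug types.  A hypothetical
# pure-Python equivalent, handy for sanity-checking individual mongo records:
def classify_forum_entry(entry):
    '''Return "initial_post", "response_post", "comment", or None for a forum document (dict).'''
    _type = entry.get("_type")
    thread_id = entry.get("comment_thread_id")
    parent_id = entry.get("parent_id")
    mongoid = entry.get("mongoid")
    if _type == "CommentThread" and thread_id is None and parent_id is None and mongoid is not None:
        return "initial_post"       # top-level thread
    if _type == "Comment" and thread_id is not None and parent_id is None and mongoid is not None:
        return "response_post"      # response to a thread
    if _type == "Comment" and parent_id is not None:
        return "comment"            # comment on a response
    return None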
    def cached_get_bq_table(self, dataset, table, sql=None, key=None, drop=None,
                            logger=None, ignore_cache=False, 
                            depends_on=None,
                            force_query=False,
                            force_newer_than=None,
                            startIndex=0, maxResults=1000000,
                            allowLargeResults=False,
                            raise_exception=False,
                            project_id=None,
    ):
        '''
        Get a dataset from BigQuery; use memcache.

        If "depends_on" is provided (as a list of strings), and if the desired table
        already exists, then check to make sure it is newer than any of the tables
        listed in "depends_on".

        If force_newer_than is set (it should be a datetime), use that date as an
        override in the depends_on testing, so that the SQL is re-run if the
        existing table is older than this date.

        project_id: if specified, overrides the default BigQuery project ID (for the actual query)
        '''
        if logger is None:
            logger = logging.info
        memset = '%s.%s' % (dataset,table)
        if startIndex:
            memset += '-%d-%d' % (startIndex, maxResults)
        data = mem.get(memset)

        optargs = {}
        if project_id:
            optargs['project_id'] = project_id

        if depends_on is not None:
            # get the latest mod time of tables in depends_on:
            modtimes = [ bqutil.get_bq_table_last_modified_datetime(*(x.split('.',1)), **optargs) for x in depends_on]
            latest = max([x for x in modtimes if x is not None] or [None])
            
            if not latest:
                raise Exception("[datasource.cached_get_bq_table] Cannot get last mod time for %s (got %s), needed by %s.%s" % (depends_on, modtimes, dataset, table))

            if force_newer_than and force_newer_than > latest:
                latest = force_newer_than

            if data and data.get('lastModifiedTime', None):
                # data has a mod time, let's see if that has expired
                if data.get('lastModifiedTime', None) < latest:
                    ignore_cache = True

            # get the mod time of the computed table, if it exists
            try:
                table_date = bqutil.get_bq_table_last_modified_datetime(dataset, table, **optargs)
            except Exception as err:
                if 'Not Found' in str(err):
                    table_date = None
                    ignore_cache = True
                    logging.info("[datasource.cached_get_bq_table] Table %s.%s doesn't exist, forcing recomputation" % (dataset, table))
                else:
                    raise

            if table_date and table_date < latest:
                ignore_cache = True
                if sql:
                    force_query = True
                    logging.info("[datasource.cached_get_bq_table] Forcing query recomputation of %s.%s, table_date=%s, latest=%s" % (dataset, table,
                                                                                                                                      table_date, latest))
                else:
                    logging.info("[datasource.cached_get_bq_table] Forcing cache reload of %s.%s, table_date=%s, latest=%s" % (dataset, table,
                                                                                                                               table_date, latest))

            # logging.info("[datasource.cached_get_bq_table] %s.%s table_date=%s, latest=%s, force_query=%s" % (dataset, table, table_date, latest, force_query))

        if (not data) or ignore_cache or (not data['data']):	# data['data']=None if table was empty, and in that case try again
            try:
                data = bqutil.get_bq_table(dataset, table, sql, key=key, logger=logger,
                                           force_query=force_query,
                                           startIndex=startIndex, 
                                           maxResults=maxResults, 
                                           allowLargeResults=allowLargeResults,
                                           **optargs)
            except Exception as err:
                logging.error(err)
                if raise_exception:
                    raise
                data = {'fields': {}, 'field_names': [], 'data': [], 'data_by_key': {}}
                return data		# don't cache empty result
            data['depends_on'] = depends_on
            if (drop is not None) and drop:
                for key in drop:
                    data.pop(key)	# because data can be too huge for memcache ("Values may not be more than 1000000 bytes in length")
            try:
                mem.set(memset, data, time=3600*12)
            except Exception as err:
                logging.error('error doing mem.set for %s.%s from bigquery' % (dataset, table))
        self.bqdata[table] = data
        return data
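# Usage sketch, assuming an object `ds` exposing cached_get_bq_table as defined above;
# the dataset/table names, the SQL, the cutoff date, and fetch_report itself are
# placeholders for illustration only, not real pipeline objects.
import datetime

def fetch_report(ds, dataset, table):
    '''Fetch a report table, re-running its SQL only when an upstream table is newer.'''
    data = ds.cached_get_bq_table(
        dataset, table,
        sql="SELECT course_id, count(*) AS n FROM [%s.person_course] GROUP BY course_id" % dataset,
        depends_on=["%s.person_course" % dataset],          # recompute if person_course is newer
        force_newer_than=datetime.datetime(2016, 1, 1),     # or if the table predates this cutoff
        drop=["data_by_key"],                               # keep the memcache entry under ~1 MB
    )
    return data["data"]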
def make_irt_report(course_id,
                    force_recompute=False,
                    use_dataset_latest=False):
    '''
    Build the item_response_theory_report table for a course, combining IRT difficulty
    and discrimination estimates (from item_irt_grm or item_irt_grm_R, whichever is
    newest) with course_item, course_problem, and, when available, item_reliabilities.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    the_sql_alpha = """
    IR.itemtestcorr as item_test,
    IR.itemrestcorr as item_rest,
    IR.alpha as alpha,
    """

    the_sql_no_alpha = """
    null as item_test,
    null as item_rest,
    null as alpha,
    """

    the_sql_alpha_join = """
    JOIN [{dataset}.item_reliabilities] IR
    on IR.item = CP.problem_yid
    """.format(dataset=dataset)

    the_sql = """
# item_response_theory_report for {course_id}
#
# problem_nid,problem_short_id,chapter,assignment_type,problem_label,problem_id,IRT item number,avg_problem_raw_score,avg_problem_pct_score,
# n_unique_users_attempted,item_test,item_rest,alpha ,Discrimination,Difficulty

SELECT 
    "{course_id}" as course_id,
    IG.problem_nid as problem_nid,
    CP.problem_short_id as problem_short_id,
    CI.chapter_name as chapter,
    assignment_type,
    CONCAT("[", STRING(IG.problem_nid), "] ", CI.chapter_name, " / ", CI.section_name, " / ", CP.problem_name) as problem_label,
    CP.problem_id as problem_id,
    CONCAT(STRING(CP.problem_nid), "/", STRING(cutnum)) as IRT_item_number,
    CP.avg_problem_raw_score avg_problem_raw_score,
    CP.avg_problem_pct_score avg_problem_pct_score,
    CP.n_unique_users_attempted n_unique_users_attempted,
    {sql_alpha}
    irt_diff as Difficulty,
    irt_disc as Discrimination,
    diff_se as Difficulty_SE,
    disc_se as Discrimination_SE,
    "{irt_method}" as irt_method,

FROM [{dataset}.{item_irt_grm}] IG
JOIN [{dataset}.course_item] CI
on IG.problem_nid = CI.problem_nid
JOIN 
(
    SELECT *, CONCAT("y", STRING(problem_nid)) as problem_yid,
    FROM [{dataset}.course_problem]
) CP
on IG.problem_nid = CP.problem_nid
{sql_alpha_join}
where CI.item_number = 1
    """

    tablename = "item_response_theory_report"
    RELIABILITIES_TABLE = "item_reliabilities"
    IRT_TABLES = OrderedDict([
        ("item_irt_grm", "STATA GRM"),
        ("item_irt_grm_R", "R mirt GRM"),
    ])

    irt_table_to_use = None
    irt_table_date = None

    # use newest of the existing IRT tables
    for irt_tablename in IRT_TABLES:
        try:
            tinfo = bqutil.get_bq_table_info(dataset, irt_tablename)
            assert tinfo is not None, "%s.%s does not exist" % (dataset,
                                                                irt_tablename)
            lmt = tinfo.get('lastModifiedTime')
            use_table = lmt and ((not irt_table_date) or
                                 (irt_table_date and lmt > irt_table_date))
            if use_table:
                irt_table_date = lmt
                irt_table_to_use = irt_tablename
            else:
                print "[make_irt_report] Not using IRT table %s (date %s) - older than %s (date %s)" % (
                    irt_tablename, lmt, irt_table_to_use, irt_table_date)
        except Exception as err:
            pass

    if not irt_table_to_use:
        raise Exception(
            "[make_irt_report] Cannot generate IRT report; requires one of %s"
            % (','.join(IRT_TABLES.keys())))

    # SQL changes depending on whether item_reliabilities exists or not
    have_reliabilities = False
    try:
        tinfo = bqutil.get_bq_table_info(dataset, RELIABILITIES_TABLE)
        assert tinfo is not None, "%s.%s does not exist" % (
            dataset, RELIABILITIES_TABLE)
        if tinfo is not None:
            have_reliabilities = True
    except Exception as err:
        pass

    if have_reliabilities:
        sql_alpha = {
            'sql_alpha': the_sql_alpha,
            "sql_alpha_join": the_sql_alpha_join
        }
    else:
        sql_alpha = {'sql_alpha': the_sql_no_alpha, "sql_alpha_join": ""}

    the_sql = the_sql.format(dataset=dataset,
                             course_id=course_id,
                             item_irt_grm=irt_table_to_use,
                             irt_method=IRT_TABLES[irt_table_to_use],
                             **sql_alpha)

    depends_on = [
        "%s.course_item" % dataset,
        "%s.course_problem" % dataset,
        "%s.%s" % (dataset, irt_table_to_use),
    ]

    if have_reliabilities:
        depends_on.append("%s.item_reliabilities" % dataset)

    try:
        bqdat = bqutil.get_bq_table(dataset,
                                    tablename,
                                    the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    newer_than=datetime.datetime(
                                        2016, 9, 27, 14, 48),
                                    startIndex=-2)
    except Exception as err:
        print "[make_irt_report] ERR! failed in creating %s.%s using this sql:" % (
            dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d problem items found" % (
        tablename, course_id, nfound)
    sys.stdout.flush()
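# Minimal sketch of the "use the newest available IRT table" rule implemented above, with
# hypothetical table-info dicts standing in for bqutil.get_bq_table_info results:
def newest_irt_table(table_infos):
    '''table_infos: {tablename: {"lastModifiedTime": datetime} or None}; return the newest name, or None.'''
    best_name, best_date = None, None
    for name, tinfo in table_infos.items():
        lmt = tinfo and tinfo.get("lastModifiedTime")
        if lmt and (best_date is None or lmt > best_date):
            best_name, best_date = name, lmt
    return best_name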
def process_time_on_asset_totals(course_id,
                                 force_recompute=False,
                                 use_dataset_latest=False):
    '''
    Compute total time on asset values, across various subpopulations of users, for different
    assets (labeled by their module id's).  This requires time_on_asset_daily.

    The time_on_asset_totals table has these columns:

    - course_id
    - module_id: ID for the asset (including video, problem, text, vertical, sequential - see course axis)
    - n_unique_users: number of unique users who accessed the asset
    - n_unique_certified: number of unique users who accessed the asset and also earned a certificate
    - mean_tmid5: mean time spent on asset [sec], for the given module_id, with a 5-minute timeout
    - cert_mean_tmid5: mean time spent on asset by certified users [sec], for the given module_id, with a 5-minute timeout
    - mean_tmid30: mean time spent on asset [sec], for the given module_id, with a 30-minute timeout
    - cert_mean_tmid30: mean time spent on asset by certified users [sec], for the given module_id, with a 30-minute timeout
    - median_tmid5: median time spent on asset [sec], for the given module_id, with a 5-minute timeout
    - cert_median_tmid5: median time spent on asset by certified users [sec], for the given module_id, with a 5-minute timeout
    - median_tmid30: median time spent on asset [sec], for the given module_id, with a 30-minute timeout
    - cert_median_tmid30: median time spent on asset by certified users [sec], for the given module_id, with a 30-minute timeout
    - total_tmid5: total time spent on given module_id, in seconds, with a 5-minute timeout
    - cert_total_tmid5: total time spent on given module_id, in seconds, with a 5-minute timeout, by certified users
    - total_tmid30: total time spent on given module_id, in seconds, with a 30-minute timeout
    - cert_total_tmid30: total time spent on given module_id, in seconds, with a 30-minute timeout, by certified users
    '''

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    SQL = """
      SELECT 
          "{course_id}" as course_id,
          module_id, 
          EXACT_COUNT_DISTINCT(username) as n_unique_users,
          EXACT_COUNT_DISTINCT (certified_username) as n_unique_certified,
          AVG(time_umid5) as mean_tmid5,
          AVG(cert_time_umid5) as cert_mean_tmid5,
          AVG(time_umid30) as mean_tmid30,
          AVG(cert_time_umid30) as cert_mean_tmid30,
          NTH(26, QUANTILES(time_umid5, 50)) as median_tmid5,
          NTH(26, QUANTILES(cert_time_umid5, 50)) as cert_median_tmid5,
          NTH(26, QUANTILES(time_umid30, 50)) as median_tmid30,
          NTH(26, QUANTILES(cert_time_umid30, 50)) as cert_median_tmid30,
          sum(time_umid5) as total_tmid5,   # total time on module (by module_id) in seconds
          sum(cert_time_umid5) as cert_total_tmid5,
          sum(time_umid30) as total_tmid30, # mid5 has 5 minute timeout, mid30 has 30 min timeout
          sum(cert_time_umid30) as cert_total_tmid30,
      FROM (
          SELECT
            TL.module_id as module_id,
            TL.username as username,
            (case when certified then TL.username else null end) as certified_username,
            sum(time_umid5) as time_umid5,
            sum(time_umid30) as time_umid30,
            sum(case when certified then time_umid5 else null end) as cert_time_umid5,
            sum(case when certified then time_umid30 else null end) as cert_time_umid30,

          FROM [{dataset}.time_on_asset_daily] TL
          JOIN [{dataset}.person_course] PC	# join to know who certified or attempted a problem
          ON TL.username = PC.username
          WHERE TL.time_umid5 is not null
                AND PC.nproblem_check > 0	# limit to users who attempted at least one problem
          GROUP BY module_id, username, certified_username
          ORDER BY module_id, username
        )
        GROUP BY module_id
        order by module_id
         """

    the_sql = SQL.format(dataset=dataset, course_id=course_id)

    tablename = 'time_on_asset_totals'

    print "Computing %s for %s" % (tablename, dataset)
    sys.stdout.flush()

    bqdat = bqutil.get_bq_table(
        dataset,
        tablename,
        the_sql,
        force_query=force_recompute,
        depends_on=['%s.time_on_asset_daily' % dataset],
    )

    return bqdat
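# The NTH(26, QUANTILES(x, 50)) idiom in the query above is legacy BigQuery's way of taking
# an approximate median: QUANTILES returns 50 evenly spaced boundaries and NTH(26) picks the
# middle one.  A rough pure-Python analogue (assuming simple nearest-rank quantiles and
# ignoring BigQuery's approximation details):
def approx_median(values):
    vals = sorted(v for v in values if v is not None)
    if not vals:
        return None
    boundaries = [vals[int(round(i * (len(vals) - 1) / 49.0))] for i in range(50)]
    return boundaries[25]    # the 26th boundary (1-indexed), i.e. roughly the median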
def createVideoStats_obsolete(course_id,
                              force_recompute=False,
                              use_dataset_latest=False,
                              startDate=DATE_DEFAULT_START,
                              endDate=DATE_DEFAULT_END):
    '''
    Create video statistics: a video counts as "viewed" for users whose video position was > 0, and as "watched" for users whose
    position exceeded 95% of the total video length.
    This was the original method, but it is inefficient because it queries the entire log set. Instead, generate video stats per day,
    then incrementally append to that table as the daily log data comes in.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_VIDEO_STATS

    the_sql = """
                 SELECT index_chapter,
                        index_video,
                        name,
                        video_id, 
                        chapter_name,
                        sum(case when position > 0 then 1 else 0 end) as videos_viewed, 
                        sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched,
                 FROM (SELECT username,
                              #module_id as video_id,
                              #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                              (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'[-]' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                              max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else  float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
                       FROM (TABLE_QUERY({logs},
                             "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start_date} and {end_date}"))
                       WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and
                              event is not null
                       group by username, video_id
                       order by username, video_id) as video_log,
                       LEFT JOIN EACH
                       (SELECT video_length,
                                video_id as vid_id,
                                name,
                                index_video,
                                index_chapter,
                                chapter_name
                        FROM [{dataset}.{videoaxis}]
                        ) as {videoaxis}
                        ON video_log.video_id = {videoaxis}.vid_id
                        WHERE video_id is not null
                        group by video_id, name, index_chapter, index_video, chapter_name
                        order by index_video asc;
                """.format(dataset=dataset,
                           start_date=startDate,
                           end_date=endDate,
                           logs=logs,
                           videoaxis=TABLE_VIDEO_AXIS)

    print "[analyze_videos] Creating %s.%s table for %s" % (
        dataset, TABLE_VIDEO_STATS, course_id)
    sys.stdout.flush()

    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        assert tinfo is not None, "[analyze videos] %s table depends on %s, which does not exist" % (
            TABLE_VIDEO_STATS, TABLE_VIDEO_AXIS)

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s?  Skipping creation of %s" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS)
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)],
    )
    return bqdat
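# Illustrative sketch of the video-id normalization done in the query above: event ids may
# arrive either as a full "i4x-..." module path or as a bare edX video id, and only the
# trailing id segment should be kept.  Hypothetical helper, not part of the pipeline:
import re

def extract_video_id(raw_id):
    '''raw_id: the JSON-extracted value of the event "$.id" field (quotes included).'''
    vid = raw_id.replace('"', '')
    if '-' in vid:
        vid = vid.replace('-', '/').replace('i4x/', '')
        m = re.search(r'(?:.*/)(.*)', vid)
        return m.group(1) if m else vid
    return vid.replace('i4x/', '')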
def createVideoStats( course_id, force_recompute=False, use_dataset_latest=False ):
    '''
    Final step for video stats: run through the daily video stats table and aggregate over the entire course to get videos viewed and videos watched.
    Join the results with the video axis to get detailed per-video metadata for the dashboard.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_VIDEO_STATS

    the_sql = """
              SELECT index_chapter,
                     index_video,
                     name,
                     video_id,
                     chapter_name,
                     sum(case when position > 0 then 1 else 0 end) as videos_viewed, 
                     sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched,
               FROM (
                     SELECT username, index_chapter,
                            index_video,
                            name,
                            video_id, 
                            chapter_name,
                            max(position) as position,
                            video_length,
                     FROM (SELECT * FROM [{dataset}.{videostatsperday}]) as video_log,
                           LEFT JOIN EACH
                          (SELECT video_length,
                                  video_id as vid_id,
                                  name,
                                  index_video,
                                  index_chapter,
                                  chapter_name
                           FROM [{dataset}.{videoaxis}]
                           ) as video_axis
                           ON video_log.video_id = video_axis.vid_id
                           WHERE video_id is not null and username is not null
                           group by username, video_id, name, index_chapter, index_video, chapter_name, video_length
                           order by video_id asc)
                GROUP BY video_id, index_chapter, index_video, name, chapter_name
                ORDER BY index_video asc;
                """.format(dataset=dataset, videoaxis=TABLE_VIDEO_AXIS, videostatsperday=TABLE_VIDEO_STATS_PER_DAY)

    print "[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS, course_id)
    sys.stdout.flush()
        
    try:
        tinfo_va = bqutil.get_bq_table_info( dataset, TABLE_VIDEO_AXIS )
        trows_va = int(tinfo_va['numRows'])
        tinfo_va_day = bqutil.get_bq_table_info( dataset, TABLE_VIDEO_STATS_PER_DAY )
        trows_va_day = int(tinfo_va_day['numRows'])
        assert tinfo_va is not None and trows_va != 0, "[analyze videos] %s table depends on %s, which does not exist" % ( TABLE_VIDEO_STATS, TABLE_VIDEO_AXIS ) 
        assert tinfo_va_day is not None and trows_va_day != 0, "[analyze videos] %s table depends on %s, which does not exist" % ( TABLE_VIDEO_STATS, TABLE_VIDEO_STATS_PER_DAY ) 

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s and/or %s (including 0 rows in table)?  Skipping creation of %s" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS_PER_DAY, TABLE_VIDEO_STATS )
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute,
                                depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)],
                                )
    return bqdat
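# Sketch of the engagement thresholds applied above: a video counts as "viewed" once any
# position > 0 is recorded for a user, and as "watched" once that user's furthest position
# exceeds 95% of the video length.  Hypothetical helper operating on per-(user, video) rows:
def count_video_engagement(rows):
    '''rows: iterable of (max_position, video_length) pairs, one per (user, video).'''
    rows = list(rows)
    viewed = sum(1 for pos, _ in rows if pos > 0)
    watched = sum(1 for pos, length in rows if length and pos > length * 0.95)
    return viewed, watched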