Example 1
def make_enrollment_events(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    Create an enrollment_events table, based on scanning the tracking logs.
    Record all enrollment events, ordered by time.
    '''

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
	            time,
                    (case when (event_struct.user_id is not null) then event_struct.user_id
                         when (context.user_id is not null) then context.user_id end) as user_id,
                    (case when (event_struct.mode is not null) then event_struct.mode
                         else JSON_EXTRACT_SCALAR(event, "$.mode") end) as mode,
                    (case when (event_type = "edx.course.enrollment.activated") then True else False end) as activated,
                    (case when (event_type = "edx.course.enrollment.deactivated") then True else False end) as deactivated,
                    (case when (event_type = "edx.course.enrollment.mode_changed") then True else False end) as mode_changed,
                    (case when (event_type = "edx.course.enrollment.upgrade.succeeded") then True else False end) as upgraded,
                    event_type,
            FROM {DATASETS} 
            where (event_type contains "edx.course.enrollment")
                  and time > TIMESTAMP("{last_date}")
            order by time;
            """

    table = 'enrollment_events'

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     days_delta=0)
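
# Hedged usage sketch, not from the original source: the course id below is purely
# illustrative, and this assumes the usual edx2bigquery setup (BigQuery credentials,
# tracking-log datasets already loaded).
#
#   make_enrollment_events("MITx/6.002x/2013_Spring", force_recompute=True)
#
# On the first run this scans all the tracking logs; subsequent runs append only
# events newer than the latest time already present in the enrollment_events table.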
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    Create one enrollday_all table, containing time, user_id, and enrollment stats.
    Just as is done for problem_check tables, the master table holds all the events 
    extracted from the tracking logs for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the enrollday_all table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.
    '''

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
	            time, 
                    event_struct.user_id as user_id, 
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "honor")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "honor")
                          then -1 
                          else 0 end) as diff_enrollment_honor,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "verified")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "verified")
                          then -1 
                          else 0 end) as diff_enrollment_verified,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "audit")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "audit")
                          then -1 
                          else 0 end) as diff_enrollment_audit,
            FROM {DATASETS} 
            where ((event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated"))
                  and time > TIMESTAMP("{last_date}")
            order by time;
            """

    table = 'enrollday_all'

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     days_delta=0)
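
# The docstring above describes the incremental-append pattern implemented inside
# process_tracking_logs.run_query_on_tracking_logs (not shown here).  The helper
# below is a hedged, pure-python illustration of that idea only -- it is not the
# real implementation, which runs the SQL against the BigQuery tracking-log tables.
def append_new_days(master_rows, new_rows, get_date):
    '''Keep master_rows ordered by date and append only rows newer than what is there.

    master_rows, new_rows: lists of row dicts; get_date: function mapping a row to a
    datetime (analogous to the gdf callbacks passed to run_query_on_tracking_logs).
    '''
    last_date = max(get_date(r) for r in master_rows) if master_rows else None
    fresh = [r for r in new_rows if last_date is None or get_date(r) > last_date]
    master_rows.extend(sorted(fresh, key=get_date))
    return master_rows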
def createVideoStats_day( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create per-day video statistics.  A video counts as "viewed" when a user had a
    video position > 0, and as "watched" when a user had a video position > 95% of
    the total video duration.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_VIDEO_STATS_PER_DAY
    
    the_sql = """
              SELECT date(time)as date, username,
                              #module_id as video_id,
                              #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                              (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'([-])' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                              max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else  float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
                       FROM {DATASETS}
                       WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and
                              event is not null
                       group by username, video_id, date
                       order by date
              """
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_STATS_PER_DAY )
        assert tinfo is not None, "[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS_PER_DAY, course_id)

        print "[analyze_videos] Appending latest data to %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS_PER_DAY, course_id)
        sys.stdout.flush()

    except (AssertionError, Exception) as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % ( dataset, TABLE_VIDEO_STATS_PER_DAY )
        sys.stdout.flush()
        pass

    print "=== Processing Video Stats Per Day for %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     skip_last_day=skip_last_day)

    print "Done with Video Stats Per Day for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
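
# Hedged follow-on sketch, not from the original source: one way to derive the
# "viewed" / "watched" flags described in the docstring from the video_stats_day
# table built above.  The [{dataset}.video_durations] table and its
# (video_id, duration) columns are assumptions made purely for illustration.
VIDEO_VIEWED_WATCHED_SQL = """
          SELECT S.username as username,
                 S.video_id as video_id,
                 MAX(S.position) > 0 as viewed,
                 MAX(S.position) >= 0.95 * MAX(D.duration) as watched,
          FROM [{dataset}.video_stats_day] S
          JOIN [{dataset}.video_durations] D
          ON S.video_id = D.video_id
          GROUP BY username, video_id
          """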
def compute_person_course_day_ip_table(course_id,
                                       force_recompute=False,
                                       use_dataset_latest=False,
                                       end_date=None):
    '''
    make pcday_ip_counts table for specified course_id.

    The master table holds all the (username, course_id, date, ip_addr, nip_count)
    data for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    This is used in computing the modal IP address of users.

    If the pcday_ip_counts table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    If the query fails because of "Resources exceeded during query execution"
    then try setting the end_date, to do part at a time.
    '''

    SQL = """
                    SELECT username, ip, 
                       date(time) as date, 
                       count(*) as ipcount,
                       '{course_id}' as course_id,
                    FROM {DATASETS}
                    where username != ""
                    group by username, ip, date
                    order by date 
          """

    table = 'pcday_ip_counts'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(
        SQL,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        end_date=end_date,
        get_date_function=gdf)
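
# Hedged sketch, not part of the original source: the docstring notes that
# pcday_ip_counts feeds the modal-IP computation.  A legacy-SQL query in the same
# style could pick each user's most frequent IP address like this:
MODAL_IP_SQL = """
          SELECT username, ip, sumipcount
          FROM (
              SELECT username, ip, sumipcount,
                     ROW_NUMBER() OVER (PARTITION BY username ORDER BY sumipcount DESC) as n_rank,
              FROM (
                  SELECT username, ip,
                         SUM(ipcount) as sumipcount,
                  FROM [{dataset}.pcday_ip_counts]
                  GROUP BY username, ip
              )
          )
          WHERE n_rank = 1
          """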
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None, 
                   check_dates=True, skip_last_day=False):
    '''
    Make {course_id}.person_course_day table for specified course_id.

    This is a single course-specific table, which contains all day's data.
    It is incrementally updated when new tracking logs data comes in,
    by appending rows to the end.  The rows are kept in time order.

    check_dates is disregarded.

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    videoTableExists = False
    try:

        tinfo_video = bqutil.get_bq_table_info(dataset, 'video_stats_day')
        assert tinfo_video is not None, "Video stats table missing... Not including video stats"
        videoTableExists = True

    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass
        #return

    forumTableExists = False
    try:

        tinfo_forum = bqutil.get_bq_table_info(dataset, 'forum_events')
        assert tinfo_forum is not None, "Forum events table missing... Not including forum stats"
        forumTableExists = True

    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass
        #return

    problemTableExists = False
    try:

        tinfo_personproblem = bqutil.get_bq_table_info(dataset, 'person_problem')
        tinfo_courseproblem = bqutil.get_bq_table_info(dataset, 'course_problem')
        tinfo_courseaxis = bqutil.get_bq_table_info(dataset, 'course_axis')
        tinfo_personcourse = bqutil.get_bq_table_info(dataset, 'person_course')
        # Check course axis and person course, course problem
        assert tinfo_personproblem is not None, "Person problem table missing... Not including problem stats"
        assert tinfo_courseproblem is not None, "Course problem table missing... Not including problem stats"
        assert tinfo_courseaxis is not None, "Course axis table missing... Not including problem stats"
        assert tinfo_personcourse is not None, "Person Course table missing... Not including problem stats"
        problemTableExists = True

    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass

    PCDAY_SQL_BASE_SELECT = """
			  SELECT username,
				 '{course_id}' AS course_id,
				 DATE(time) AS date,
				 SUM(bevent) AS nevents,
				 SUM(bprogress) AS nprogcheck,
				 SUM(bshow_answer) AS nshow_answer,
				 SUM(bvideo) AS nvideo,
				 SUM(bproblem_check) AS nproblem_check,
				 SUM(bforum) AS nforum,
				 SUM(bshow_transcript) AS ntranscript,
				 SUM(bseq_goto) AS nseq_goto,
				 SUM(bseek_video) AS nseek_video,
				 SUM(bpause_video) AS npause_video,
		    """

    PCDAY_SQL_VIDEO_EXISTS = """
			  	 COUNT(DISTINCT video_id) AS nvideos_viewed, # New Video - Unique videos viewed
				 SUM(case when position is not null then FLOAT(position) else FLOAT(0.0) end) AS nvideos_watched_sec, # New Video - # sec watched using max video position
		    """

    PCDAY_SQL_VIDEO_DNE = """
				 0 AS nvideos_viewed, # New Video - Unique videos viewed
				 FLOAT(0.0) AS nvideos_watched_sec, # New Video - # sec watched using max video position
		    """
    PCDAY_SQL_VIDEO_SELECT = PCDAY_SQL_VIDEO_EXISTS if videoTableExists else PCDAY_SQL_VIDEO_DNE

    PCDAY_SQL_FORUM_EXISTS = """
				 SUM(case when read is not null then read else 0 end) AS nforum_reads, # New discussion - Forum reads
				 SUM(case when write is not null then write else 0 end) AS nforum_posts, # New discussion - Forum posts
				 COUNT(DISTINCT thread_id ) AS nforum_threads, # New discussion - Unique forum threads interacted with
		    """

    PCDAY_SQL_FORUM_DNE = """
				 0 AS nforum_reads, # New discussion - Forum reads
				 0 AS nforum_posts, # New discussion - Forum posts
				 0 AS nforum_threads, # New discussion - Unique forum threads interacted with
		    """
    PCDAY_SQL_FORUM_SELECT = PCDAY_SQL_FORUM_EXISTS if forumTableExists else PCDAY_SQL_FORUM_DNE

    PCDAY_SQL_PROBLEM_EXISTS = """
				 COUNT(DISTINCT problem_nid ) AS nproblems_answered, # New Problem - Unique problems attempted
				 SUM(case when n_attempts is not null then n_attempts else 0 end) AS nproblems_attempted, # New Problem - Total attempts
				 SUM(case when ncount_problem_multiplechoice is not null then ncount_problem_multiplechoice else 0 end) as nproblems_multiplechoice,
				 SUM(case when ncount_problem_choice is not null then ncount_problem_choice else 0 end) as nproblems_choice,
				 SUM(case when ncount_problem_numerical is not null then ncount_problem_numerical else 0 end) as nproblems_numerical,
				 SUM(case when ncount_problem_option is not null then ncount_problem_option else 0 end) as nproblems_option,
				 SUM(case when ncount_problem_custom is not null then ncount_problem_custom else 0 end) as nproblems_custom,
				 SUM(case when ncount_problem_string is not null then ncount_problem_string else 0 end) as nproblems_string,
				 SUM(case when ncount_problem_mixed is not null then ncount_problem_mixed else 0 end) as nproblems_mixed,
				 SUM(case when ncount_problem_formula is not null then ncount_problem_formula else 0 end) as nproblems_forumula,
				 SUM(case when ncount_problem_other is not null then ncount_problem_other else 0 end) as nproblems_other,
		    """

    PCDAY_SQL_PROBLEM_DNE = """
				 0 AS nproblems_answered, # New Problem - Unique problems attempted
				 0 AS nproblems_attempted, # New Problem - Total attempts
				 0 AS nproblems_multiplechoice,
				 0 AS nproblems_choice,
				 0 AS nproblems_numerical,
				 0 AS nproblems_option,
				 0 AS nproblems_custom,
				 0 AS nproblems_string,
				 0 AS nproblems_mixed,
				 0 AS nproblems_forumula,
				 0 AS nproblems_other,
		    """
    PCDAY_SQL_PROBLEM_SELECT = PCDAY_SQL_PROBLEM_EXISTS if problemTableExists else PCDAY_SQL_PROBLEM_DNE

    PCDAY_SQL_MID = """
				 MAX(time) AS last_event,
				 AVG( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS avg_dt,
				 STDDEV( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sdv_dt,
				 MAX( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS max_dt,
				 COUNT( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS n_dt,
				 SUM( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sum_dt
			FROM (
			  SELECT
			    *
			  FROM (
			    SELECT
			      username,
			      CASE WHEN event_type = "play_video" THEN 1 ELSE 0 END AS bvideo,
			      CASE WHEN event_type = "problem_check" THEN 1 ELSE 0 END AS bproblem_check,
			      CASE WHEN username != "" THEN 1 ELSE 0 END AS bevent,
			      CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
			      CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
			      CASE WHEN event_type IN ("show_answer",
				"showanswer") THEN 1 ELSE 0 END AS bshow_answer,
			      CASE WHEN event_type = 'show_transcript' THEN 1 ELSE 0 END AS bshow_transcript,
			      CASE WHEN event_type = 'seq_goto' THEN 1 ELSE 0 END AS bseq_goto,
			      CASE WHEN event_type = 'seek_video' THEN 1 ELSE 0 END AS bseek_video,
			      CASE WHEN event_type = 'pause_video' THEN 1 ELSE 0 END AS bpause_video,
			      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
			      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
			      time,
			      LAG(time, 1) OVER (PARTITION BY username ORDER BY time) last_time
			    FROM {DATASETS}
			    WHERE
			      NOT event_type CONTAINS "/xblock/"
			      AND username != "" )
		    """


    PCDAY_SQL_VIDEO = """ ,
			  ( # Video events
				  SELECT TIMESTAMP(date) as time,
				         '{course_id}' as course_id,
				         username,
				         video_id,
				         position,
				  FROM [{dataset}.video_stats_day]
				  WHERE TIMESTAMP(date)>= TIMESTAMP("{min_date_start}") and TIMESTAMP(date) <= TIMESTAMP("{max_date_end}")

			  )
                      """
    PCDAY_SQL_ADD = PCDAY_SQL_VIDEO if videoTableExists else ''

    PCDAY_SQL_FORUM = """ ,
			  ( # Forum Events
				   SELECT time,
					  username,
				          '{course_id}' as course_id,
				          thread_id,
				          (CASE WHEN (forum_action == "reply" or forum_action == "comment_reply"
						      or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment")
						THEN 1 ELSE 0 END) AS write,
					  (CASE WHEN (forum_action == "read" or forum_action == "read_inline") THEN 1 ELSE 0 END) AS read,
				   FROM [{dataset}.forum_events]
				   WHERE (forum_action == "reply" or forum_action == "comment_reply"
					  or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment"
					  or forum_action == "read" or forum_action == "read_inline")
				          and ( time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}") )
			  )
                      """
    PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_FORUM if forumTableExists else PCDAY_SQL_ADD

    PCDAY_SQL_PROBLEM = """,
			  ( # Problems
				   SELECT pc.username AS username,
				          pp.problem_nid AS problem_nid,
				          pp.n_attempts AS n_attempts,
				          pp.time AS time,
				          '{course_id}' as course_id,
					  pp.ncount_problem_multiplechoice as ncount_problem_multiplechoice,
					  pp.ncount_problem_choice as ncount_problem_choice,
					  pp.ncount_problem_numerical as ncount_problem_numerical,
					  pp.ncount_problem_option as ncount_problem_option,
					  pp.ncount_problem_custom as ncount_problem_custom,
					  pp.ncount_problem_string as ncount_problem_string,
					  pp.ncount_problem_mixed as ncount_problem_mixed,
					  pp.ncount_problem_formula as ncount_problem_formula,
					  pp.ncount_problem_other as ncount_problem_other,
				   FROM (

					   (
					      SELECT PP.user_id as user_id,
						     PP.problem_nid AS problem_nid,
						     PP.n_attempts as n_attempts,
						     PP.date as time,
						     (Case when CP_CA.data_itype == "multiplechoiceresponse" then 1 else 0 end) as ncount_problem_multiplechoice, # Choice
					             (Case when CP_CA.data_itype == "choiceresponse" then 1 else 0 end) as ncount_problem_choice,       # Choice
						     (Case when CP_CA.data_itype == "numericalresponse" then 1 else 0 end) as ncount_problem_numerical, #input
						     (Case when CP_CA.data_itype == "optionresponse" then 1 else 0 end) as ncount_problem_option,       # Choice
					             (Case when CP_CA.data_itype == "customresponse" then 1 else 0 end) as ncount_problem_custom,       # Custom
					             (Case when CP_CA.data_itype == "stringresponse" then 1 else 0 end) as ncount_problem_string,       # Input
					             (Case when CP_CA.data_itype == "mixed" then 1 else 0 end) as ncount_problem_mixed,                 # Mixed
					             (Case when CP_CA.data_itype == "forumula" then 1 else 0 end) as ncount_problem_formula,            # Input
					             (Case when CP_CA.data_itype != "multiplechoiceresponse" and
							        CP_CA.data_itype != "choiceresponse" and
							        CP_CA.data_itype != "numericalresponse" and
							        CP_CA.data_itype != "optionresponse" and
							        CP_CA.data_itype != "customresponse" and
							        CP_CA.data_itype != "stringresponse" and
							        CP_CA.data_itype != "mixed" and
							        CP_CA.data_itype != "forumula"
							   then 1 else 0 end) as ncount_problem_other, # Input
						     #MAX(n_attempts) AS n_attempts,
						     #MAX(date) AS time,
					      FROM [{dataset}.person_problem] PP
					      LEFT JOIN
					      (
							SELECT CP.problem_nid as problem_nid,
							       INTEGER(CP.problem_id) as problem_id,
							       CA.data.itype as data_itype,
						        FROM [{dataset}.course_problem] CP
						        LEFT JOIN [{dataset}.course_axis] CA
						        ON CP.problem_id == CA.url_name
					      ) as CP_CA
					      ON PP.problem_nid == CP_CA.problem_nid
					      GROUP BY time, user_id, problem_nid, n_attempts,
						       ncount_problem_multiplechoice,
						       ncount_problem_choice,
						       ncount_problem_choice,
						       ncount_problem_numerical,
						       ncount_problem_option,
						       ncount_problem_custom,
						       ncount_problem_string,
						       ncount_problem_mixed,
						       ncount_problem_formula,
						       ncount_problem_other
					      )

					      #FROM [{dataset}.person_item] PI
					      #JOIN [{dataset}.course_item] CI
					      #ON PI.item_nid = CI.item_nid
					      #GROUP BY user_id,
						       #problem_nid
					      #ORDER BY
						       #user_id,
						       #problem_nid
					) AS pp
				        LEFT JOIN (
							      SELECT username,
								     user_id
							      FROM [{dataset}.person_course] 
					) AS pc
					ON pc.user_id = pp.user_id
				        WHERE time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}")
			  )
 
                        """
    PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_PROBLEM if problemTableExists else PCDAY_SQL_ADD

    PCDAY_SQL_END = """
			  )
			  WHERE time > TIMESTAMP("{last_date}")
			  GROUP BY course_id,
				   username,
				   date
			  ORDER BY date
		    """


    PCDAY_SQL_NEW = PCDAY_SQL_BASE_SELECT + PCDAY_SQL_VIDEO_SELECT + PCDAY_SQL_FORUM_SELECT + PCDAY_SQL_PROBLEM_SELECT + PCDAY_SQL_MID + PCDAY_SQL_ADD + PCDAY_SQL_END
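
    # Note: the .format() call below fills in only {dataset}, while re-emitting
    # "{course_id}", "{DATASETS}", "{last_date}", "{min_date_start}" and
    # "{max_date_end}" as literal placeholders for
    # process_tracking_logs.run_query_on_tracking_logs to substitute later.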

    PCDAY_SQL = PCDAY_SQL_NEW.format( dataset=dataset, course_id="{course_id}", DATASETS="{DATASETS}", last_date="{last_date}", min_date_start="{min_date_start}", max_date_end="{max_date_end}")

    table = 'person_course_day'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    print "=== Processing person_course_day for %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    # Major person_course_day schema revision 19-Jan-2016 adds new fields; if table exists, ensure it 
    # has new schema, else force recompute.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table)
    except Exception as err:
        tinfo = None
    if tinfo:
        fields = tinfo['schema']['fields']
        field_names = [x['name'] for x in fields]
        if not 'nvideos_viewed' in field_names:
            cdt = tinfo['creationTime']
            print "    --> person_course_day created %s; missing nvideos_viewed field in schema; forcing recompute - this may take a long time!" % cdt
            sys.stdout.flush()
            force_recompute = True

    process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     newer_than=datetime.datetime( 2016, 1, 19, 22, 30 ),
                                                     skip_last_day=skip_last_day)
    
    print "Done with person_course_day for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
Example 9
def get_ora_events(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    make ora_events table for specified course_id.

    The master table holds all the openassessment events extracted from
    the tracking logs for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the ora_events table doesn't exist, then run it once on all
    the existing tracking logs.  
    '''
    
    SQL = """
        SELECT 
           (case when module_id_sometimes is not null then REGEXP_EXTRACT(module_id_sometimes, ".*/([^/]+)")
                when submission_uuid is not null then REGEXP_EXTRACT(last_module_id, ".*/([^/]+)")
                end) as problem_id,
           *,
           (case when module_id_sometimes is not null then module_id_sometimes
                when submission_uuid is not null then last_module_id
                end) as module_id,
        FROM        
          (SELECT 
             *,
             # must aggregate to get module_id, which is not available for some events, like peer_assess
             # this means module_id may be missing, e.g. if the creation event happens in a different day's tracking log
             NTH_VALUE(module_id_sometimes,1) 
                 over (partition by submission_uuid order by module_id_sometimes desc) last_module_id,
           from
               (SELECT 
                   time, 
                   username,
                   '{course_id}' as course_id,
               (case when REGEXP_MATCH(event_type, "openassessmentblock..*") then 
                          REGEXP_EXTRACT(event_type, "openassessmentblock.(.*)")
                     when REGEXP_MATCH(event_type, "/courses/.*") then
                          REGEXP_EXTRACT(event_type, "/courses/.*/(handler/.*)")
                     end) as action,
               (case when event_type = "openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                     when event_type = "openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                     when event_type = "openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                     when event_type = "openassessmentblock.get_peer_submission" then JSON_EXTRACT_SCALAR(event, "$.submission_returned_uuid")
                    end) as submission_uuid,
               (case when event_type="openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].option.points")
                     when event_type="openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].option.points")
                    end) as part1_points,
               (case when event_type="openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].feedback")
                     when event_type="openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].feedback")
                    end) as part1_feedback,
               (case when event_type="openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.attempt_number")
                    end) as attempt_num,
               (case 
                     when event_type="openassessmentblock.get_peer_submission" then JSON_EXTRACT_SCALAR(event, "$.item_id")
                     when event_type="openassessmentblock.create_submission" then 
                                  REGEXP_EXTRACT( JSON_EXTRACT_SCALAR(event, "$.answer.file_key"), ".*(i4x://.*)")
                     when REGEXP_MATCH(event_type, "/courses/.*/xblock/i4x:.*") then
                                  REGEXP_REPLACE( REGEXP_EXTRACT( event_type, "/courses/.*/xblock/(i4x:;_[^/]+)/handler.*" ),
                                                  ";_", "/")
                     end) as module_id_sometimes,
               event_type,
               event,
               event_source,
               (case when event_type="openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.answer.text")
                    end) as answer_text,
               from {DATASETS}
               where event_type contains "openassessment"
                  and event != '{{"POST": {{}}, "GET": {{}}}}'	# disregard if user did nothing but look at the page
                  and time > TIMESTAMP("{last_date}")
               order by time
             )
          order by time
         )
        order by time
            """

    table = 'ora_events'

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     days_delta=0)
        
Example 10
def process_course_time_on_asset(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                skip_totals=False, start_date=None, config_parameter_overrides=None):
    '''
    Create the time_on_asset_daily table, containing module_id, username, date, 
    and time on asset stats.  This table has separate rows for each date, 
    and is ordered in time.  To update it, a new day's logs are
    processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    Compute totals and store in time_on_asset_totals, by summing over all dates, 
    grouped by module_id.

    How are time_on_asset numbers computed?

    See discussion in make_time_on_task.py

    The time_on_asset_daily table has these columns:

    - date: gives day for the data
    - username
    - module_id
    - time_umid5: total time on module (by module_id) in seconds, with a 5-minute timeout
    - time_umid30: total time on module (by module_id) in seconds, with a 30-minute timeout

    '''


    if just_do_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute, 
                                           use_dataset_latest=use_dataset_latest)

    SQL = """
            SELECT
                    "{course_id}" as course_id,
                    date(time) as date,
                    username,
                    module_id,
                    # time_umid5 = total time on module (by module_id) in seconds
                    # time_mid5 has 5 minute timeout, time_mid30 has 30 min timeout
                    SUM( case when dt_umid < 5*60 then dt_umid end ) as time_umid5,
                    SUM( case when dt_umid < 30*60 then dt_umid end ) as time_umid30,
            FROM (
              SELECT time,
                username,
                module_id,
                (time - last_time)/1.0E6 as dt,         # dt is in seconds
                (time - last_time_umid)/1.0E6 as dt_umid,   # dt for (user, module_id) in seconds
                last_time_umid,
              FROM
                (
                SELECT time,
                    username,
                    last_username,
                    module_id,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    USEC_TO_TIMESTAMP(last_time_umid) as last_time_umid,                    
                FROM (
                  SELECT time,
                    username,
                    module_id,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    lag(time, 1) over (partition by username, module_id order by time) last_time_umid,
                  FROM
                    (SELECT time, 
                      username,
                      (case when REGEXP_MATCH(module_id, r'.*\"\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\}}$')
                            when REGEXP_MATCH(module_id, r'.*\"\]\}}\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\]\}}\}}$')
                           else module_id end) as module_id,	# fix some errors in module_id names
                    FROM {DATASETS}
                  )
         
                    WHERE       
                                     module_id is not null
                                     AND username is not null
                                     AND username != ""
                                     and time > TIMESTAMP("{last_date}")
                                     
                    )
                )
              )
              WHERE module_id is not null
                    AND NOT module_id CONTAINS '"'
              GROUP BY date, module_id, username
              ORDER BY date, module_id, username
          """

    table = 'time_on_asset_daily'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015,3,15),	# schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_asset_totals(course_id, force_recompute=force_recompute, 
                                           use_dataset_latest=use_dataset_latest)

    return
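
# Hedged, pure-python illustration, not from the original source, of the dt_umid
# timeout logic in the SQL above: successive event times for one
# (username, module_id) pair are differenced, and any gap of 5 minutes or more is
# dropped rather than counted as time spent on the asset (30 minutes for time_umid30).
def time_on_asset_seconds(event_times, timeout=5 * 60):
    '''event_times: time-sorted list of datetime objects for one (username, module_id).'''
    total = 0.0
    for prev, cur in zip(event_times, event_times[1:]):
        dt = (cur - prev).total_seconds()
        if dt < timeout:  # mirrors: SUM(case when dt_umid < 5*60 then dt_umid end)
            total += dt
    return total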
def CreateForumEvents(course_id,
                      force_recompute=False,
                      use_dataset_latest=False,
                      skip_last_day=False,
                      end_date=None):
    '''
    Create forum events table, based on tracking logs.  Extracts all forum-related events, including forum post reads,
    into the date-time ordered table.  Repeated calls to this procedure will append new events to the table.  If no
    new events are found, the existing table is left unchanged.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_FORUM_EVENTS

    # event_type for forums may be like:
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed
    #  /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline
    #  /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members
    #  /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership
    #     \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"}
    #
    # module_id will be like:
    # "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a"

    the_sql = """
              SELECT time, 
                     username,
                     '{course_id}' as course_id,
                     (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote"  # does this happen?
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)')
                           when event_type = "edx.forum.thread.created" then "created_thread"
                           when event_type = "edx.forum.response.created" then "created_response"
                           when event_type = "edx.forum.comment.created" then "created_comment"
                           when event_type = "edx.forum.searched" then "searched"
                           else event_type end) as forum_action,
                           (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)') # For old-school courses with transparent course ids
                                      else (case when module_id is null # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null
                                               then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)') # read
                                                      else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)') # upvote, pinned, upvoted, unvoted, deleted, followed
                                                             else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end) # comment
                                                                end) end) end) as thread_id,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id,
                     event_struct.query as search_query,   # unavailable before June 1, 2015
                     event_struct.GET as event_GET,        # unavailable before June 1, 2015
              FROM {DATASETS}
              WHERE  (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                      or event_type contains "/discussion/forum"
                      or event_type contains "/discussion/threads"
                      or event_type contains "/discussion/comments"
                      or event_type contains "list-forum-"
                      or event_type contains "list_forum_"
                      or event_type contains "add-forum-"
                      or event_type contains "add_forum_"
                      or event_type contains "remove-forum-"
                      or event_type contains "remove_forum_"
                      or event_type contains "update_forum_"
                     ) 
                    AND username is not null
                    AND event is not null
                    and time > TIMESTAMP("{last_date}")
                    {hash_limit}
              order by time
              """

    try:
        tinfo = bqutil.get_bq_table_info(dataset, table)
        assert tinfo is not None, "[make_forum_analysis] Creating %s.%s table for %s" % (
            dataset, table, course_id)

        print "[make_forum_analysis] Appending latest data to %s.%s table for %s" % (
            dataset, table, course_id)
        sys.stdout.flush()

    except (AssertionError, Exception) as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % (dataset, table)
        sys.stdout.flush()
        pass

    print "=== Processing Forum Events for %s (start %s)" % (
        course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(
        the_sql,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        get_date_function=gdf,
        has_hash_limit=True,
        end_date=end_date,
        skip_last_day=skip_last_day)

    print "Done with Forum Events for %s (end %s)" % (course_id,
                                                      datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()
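
# Illustrative mapping, derived from the CASE expression above, of sample event_type
# values (see the comment block in CreateForumEvents) to the forum_action they receive:
#
#   /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply   -> "reply"
#   /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote  -> "upvote"
#   /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply  -> "comment_reply"
#   /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa -> "read"
#   /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline                     -> "read_inline"
#   edx.forum.thread.created                                                         -> "created_thread"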
def get_ora_events(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    make ora_events table for specified course_id.

    The master table holds all the openassessment events extracted from
    the tracking logs for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the ora_events table doesn't exist, then run it once on all
    the existing tracking logs.  
    '''
    
    SQL = """
        SELECT 
           (case when module_id_sometimes is not null then REGEXP_EXTRACT(module_id_sometimes, ".*/([^/]+)")
                when submission_uuid is not null then REGEXP_EXTRACT(last_module_id, ".*/([^/]+)")
                end) as problem_id,
           *,
           (case when module_id_sometimes is not null then module_id_sometimes
                when submission_uuid is not null then last_module_id
                end) as module_id,
        FROM        
          (SELECT 
             *,
             # must aggregate to get module_id, which is not available for some events, like peer_assess
             # this means module_id may be missing, e.g. if the creation event happens in a different day's tracking log
             NTH_VALUE(module_id_sometimes,1) 
                 over (partition by submission_uuid order by module_id_sometimes desc) last_module_id,
           from
               (SELECT 
                   time, 
                   username,
                   '{course_id}' as course_id,
               (case when REGEXP_MATCH(event_type, "openassessmentblock..*") then 
                          REGEXP_EXTRACT(event_type, "openassessmentblock.(.*)")
                     when REGEXP_MATCH(event_type, "/courses/.*") then
                          REGEXP_EXTRACT(event_type, "/courses/.*/(handler/.*)")
                     end) as action,
               (case when event_type = "openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                     when event_type = "openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                     when event_type = "openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.submission_uuid")
                     when event_type = "openassessmentblock.get_peer_submission" then JSON_EXTRACT_SCALAR(event, "$.submission_returned_uuid")
                    end) as submission_uuid,
               (case when event_type="openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].option.points")
                     when event_type="openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].option.points")
                    end) as part1_points,
               (case when event_type="openassessmentblock.peer_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].feedback")
                     when event_type="openassessmentblock.self_assess" then JSON_EXTRACT_SCALAR(event, "$.parts[0].feedback")
                    end) as part1_feedback,
               (case when event_type="openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.attempt_number")
                    end) as attempt_num,
               (case 
                     when event_type="openassessmentblock.get_peer_submission" then JSON_EXTRACT_SCALAR(event, "$.item_id")
                     when event_type="openassessmentblock.create_submission" then 
                                  REGEXP_EXTRACT( JSON_EXTRACT_SCALAR(event, "$.answer.file_key"), ".*(i4x://.*)")
                     when REGEXP_MATCH(event_type, "/courses/.*/xblock/i4x:.*") then
                                  REGEXP_REPLACE( REGEXP_EXTRACT( event_type, "/courses/.*/xblock/(i4x:;_[^/]+)/handler.*" ),
                                                  ";_", "/")
                     end) as module_id_sometimes,
               event_type,
               event,
               event_source,
               (case when event_type="openassessmentblock.create_submission" then JSON_EXTRACT_SCALAR(event, "$.answer.text")
                    end) as answer_text,
               from {DATASETS}
               where event_type contains "openassessment"
                  and event != '{{"POST": {{}}, "GET": {{}}}}'	# disregard if user did nothing but look at the page
                  and time > TIMESTAMP("{last_date}")
               order by time
             )
          order by time
         )
        order by time
            """

    table = 'ora_events'

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     days_delta=0)
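
# Illustrative sketch (hypothetical helper, not part of the pipeline): a pure-Python
# version of the module_id backfill that the NTH_VALUE(...) OVER (PARTITION BY
# submission_uuid ...) window above performs.  Events such as peer_assess carry no
# module_id of their own, so it is copied from another event sharing the same
# submission_uuid.  The dict keys mirror the SQL column names.

def backfill_module_id(ora_events):
    '''ora_events: list of dicts with 'submission_uuid' and 'module_id_sometimes' keys.'''
    known = {}
    for ev in ora_events:
        uuid = ev.get('submission_uuid')
        mid = ev.get('module_id_sometimes')
        if uuid and mid:
            known[uuid] = mid
    for ev in ora_events:
        ev['module_id'] = ev.get('module_id_sometimes') or known.get(ev.get('submission_uuid'))
    return ora_events
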
def ExtractProblemEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_PROBLEM_EVENTS
    the_sql = """
SELECT  
    context.user_id as user_id, 
    time,
    event_source,
    REGEXP_EXTRACT(
      (CASE when module_id is not null then module_id 
          when event_type contains "/xblock/i4x:;_" then REPLACE(REGEXP_EXTRACT(event_type, r"i4x:;_;_(.*)/handler/xmodule"),";_", "/")
          else REPLACE(event_struct.problem, "i4x://", "")
          end),
      "[^/]+/problem/([^/]+)") as problem_url,
    (CASE when event_type contains "/xblock/i4x:;_" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          when event_type contains "type@problem+block" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          else event_type
          end) as event_type,
   event_struct.attempts as attempts,
   event_struct.success as success,
   event_struct.grade as grade,          
FROM {DATASETS}
WHERE       
   ( REGEXP_MATCH(event_type, r'problem_\w+') 
     OR event_type = "showanswer"
   )
   AND context.user_id is not null
   and time > TIMESTAMP("{last_date}")
   {hash_limit}
order by user_id, time
    """

    try:
        bqutil.create_dataset_if_nonexistent(dataset)
        tinfo = bqutil.get_bq_table_info(dataset, table )
        assert tinfo is not None, "[make_problem_events] Creating %s.%s table for %s" % (dataset, table, course_id)

        print "[make_problem_events] Appending latest data to %s.%s table for %s" % (dataset, table, course_id)
        sys.stdout.flush()

    except (AssertionError, Exception) as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % ( dataset, table )
        sys.stdout.flush()
        pass

    print "=== Processing Forum Events for %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day
                                                    )

    print "Done with Problem Events for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
def ExtractProblemEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_PROBLEM_EVENTS
    the_sql = """
SELECT  
    context.user_id as user_id, 
    time,
    event_source,
    REGEXP_EXTRACT(
      (CASE when module_id is not null then module_id 
          when event_type contains "/xblock/i4x:;_" then REPLACE(REGEXP_EXTRACT(event_type, r"i4x:;_;_(.*)/handler/xmodule"),";_", "/")
          else REPLACE(event_struct.problem, "i4x://", "")
          end),
      "[^/]+/problem/([^/]+)") as problem_url,
    (CASE when event_type contains "/xblock/i4x:;_" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          when event_type contains "type@problem+block" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          else event_type
          end) as event_type,
   event_struct.attempts as attempts,
   event_struct.success as success,
   event_struct.grade as grade,          
FROM {DATASETS}
WHERE       
   ( REGEXP_MATCH(event_type, r'problem_\w+') 
     OR event_type = "showanswer"
   )
   AND context.user_id is not null
   and time > TIMESTAMP("{last_date}")
   {hash_limit}
order by user_id, time
    """

    try:
        tinfo = bqutil.get_bq_table_info(dataset, table )
        assert tinfo is not None, "[make_problem_events] Creating %s.%s table for %s" % (dataset, table, course_id)

        print "[make_problem_events] Appending latest data to %s.%s table for %s" % (dataset, table, course_id)
        sys.stdout.flush()

    except (AssertionError, Exception) as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % ( dataset, table )
        sys.stdout.flush()
        pass

    print "=== Processing Forum Events for %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day
                                                    )

    print "Done with Problem Events for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
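
# Illustrative sketch (hypothetical helper, not part of the pipeline): the problem_url
# extraction that the REGEXP_EXTRACT above applies to module_id.  The example string
# is made up and assumes the "i4x://" prefix has already been stripped, as the query does.
import re

def extract_problem_url(module_id):
    m = re.search(r"[^/]+/problem/([^/]+)", module_id or "")
    return m.group(1) if m else None

# extract_problem_url("MITx/6.002x/problem/Q1_2_1")  ->  "Q1_2_1"
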
Example n. 15
0
def process_course_time_on_task(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                skip_totals=False, start_date=None):
    '''
    Create the time_on_task table, containing time, user_id, and time
    on task stats.  This table isn't split into separate days.  It is
    ordered in time, however.  To update it, a new day's logs are
    processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    Compute totals and store in time_on_task_totals, by summing over all dates, 
    grouped by user.
    '''

    if just_do_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
                    date(time) as date,
                    username, 

                    # total time spent on system
                    SUM( case when dt < 5*60 then dt end ) as total_time_5,
                    SUM( case when dt < 30*60 then dt end ) as total_time_30,

                    # total time spent watching videos
                    SUM( case when (dt_video is not null) and (dt_video < 5*60) then dt_video end ) as total_video_time_5,
                    SUM( case when (dt_video is not null) and (dt_video < 30*60) then dt_video end ) as total_video_time_30,
                    SUM( case when (serial_dt_video is not null) and (serial_dt_video < 30*60) then serial_dt_video end ) as serial_video_time_30,

                    # total time spent doing problems
                    SUM( case when (dt_problem is not null) and (dt_problem < 5*60) then dt_problem end ) as total_problem_time_5,
                    SUM( case when (dt_problem is not null) and (dt_problem < 30*60) then dt_problem end ) as total_problem_time_30,
                    SUM( case when (serial_dt_problem is not null) and (serial_dt_problem < 30*60) then serial_dt_problem end ) as serial_problem_time_30,

                    # total time spent on forum
                    SUM( case when (dt_forum is not null) and (dt_forum < 5*60) then dt_forum end ) as total_forum_time_5,
                    SUM( case when (dt_forum is not null) and (dt_forum < 30*60) then dt_forum end ) as total_forum_time_30,
                    SUM( case when (serial_dt_forum is not null) and (serial_dt_forum < 30*60) then serial_dt_forum end ) as serial_forum_time_30,

                    # total time spent with textbook or wiki
                    SUM( case when (dt_text is not null) and (dt_text < 5*60) then dt_text end ) as total_text_time_5,
                    SUM( case when (dt_text is not null) and (dt_text < 30*60) then dt_text end ) as total_text_time_30,
                    SUM( case when (serial_dt_text is not null) and (serial_dt_text < 30*60) then serial_dt_text end ) as serial_text_time_30,
            FROM
              (
              SELECT time,
                username,
                (time - last_time)/1.0E6 as dt,         # dt is in seconds
                case when is_video then (time - last_time_video)/1.0E6 end as dt_video,
                case when is_problem then (time - last_time_problem)/1.0E6 end as dt_problem,
                case when is_forum then (time - last_time_forum)/1.0E6 end as dt_forum,
                case when is_text then (time - last_time_text)/1.0E6 end as dt_text,

                case when is_video then (time - last_time_xevent)/1.0E6 end as serial_dt_video,
                case when is_problem then (time - last_time_xevent)/1.0E6 end as serial_dt_problem,
                case when is_forum then (time - last_time_xevent)/1.0E6 end as serial_dt_forum,
                case when is_text then (time - last_time_xevent)/1.0E6 end as serial_dt_text,
              FROM
                (
                SELECT time, 
                    username,
                    last_username,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    (case when is_video then USEC_TO_TIMESTAMP(last_time_video) end) as last_time_video,
                    # last_username_video,
                    # last_event_video,
                    is_problem,
                    is_video,
                    (case when is_problem then USEC_TO_TIMESTAMP(last_time_problem) end) as last_time_problem,
                    # last_username_problem,
                    # last_event_problem,
                    is_forum,
                    is_text,
                    (case when is_forum then USEC_TO_TIMESTAMP(last_time_forum) end) as last_time_forum,
                    (case when is_text then USEC_TO_TIMESTAMP(last_time_text) end) as last_time_text,
                    is_xevent,
                    (case when is_xevent then USEC_TO_TIMESTAMP(last_time_xevent) end) as last_time_xevent,
                FROM
                  (SELECT time,
                    username,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    is_video,
                    is_problem,
                    is_forum,
                    is_text,
                    (is_video or is_problem or is_forum or is_text) as is_xevent,   # x = video, problem, forum, or text: any event
                    case when is_problem then username else '' end as uname_problem,
                    case when is_video then username else '' end as uname_video,
                    case when is_forum then username else '' end as uname_forum,
                    case when is_text then username else '' end as uname_text,
                    case when (is_video or is_problem or is_forum or is_text) then username else '' end as uname_xevent,
                    lag(time, 1) over (partition by uname_video order by time) last_time_video,
                    # lag(event_type, 1) over (partition by uname_video order by time) last_event_video,
                    # lag(uname_video, 1) over (partition by uname_video order by time) last_username_video,
                    lag(time, 1) over (partition by uname_problem order by time) last_time_problem,
                    # lag(event_type, 1) over (partition by uname_problem order by time) last_event_problem,
                    # lag(uname_problem, 1) over (partition by uname_problem order by time) last_username_problem,
                    lag(time, 1) over (partition by uname_forum order by time) last_time_forum,
                    lag(time, 1) over (partition by uname_text order by time) last_time_text,
                    lag(time, 1) over (partition by uname_xevent order by time) last_time_xevent,
                  FROM
                    (SELECT time, 
                      username,
                      event_type,
                      case when (REGEXP_MATCH(event_type ,r'\w+_video')
                               or REGEXP_MATCH(event_type, r'\w+_transcript')
                              ) then True else False end as is_video,
                      case when REGEXP_MATCH(event_type, r'problem_\w+') then True else False end as is_problem,
                      case when (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                               or event_type contains "/discussion/forum"
                              ) then True else False end as is_forum,
                      case when (REGEXP_MATCH(event_type ,r'^textbook\..*')
                               or event_type contains "/wiki/"
                              ) then True else False end as is_text,
                    FROM {DATASETS}
                    WHERE       
                                     NOT event_type contains "/xblock/"
                                     AND username is not null
                                     AND username != ""
                                     and time > TIMESTAMP("{last_date}")
                                     {hash_limit}
                    )
                  )
                WHERE last_time is not null
                ORDER BY username, time
                )
              )
            group by course_id, username, date
            order by date, username;
          """

    table = 'time_on_task'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015,3,15),	# schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

    return
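
# Illustrative sketch (hypothetical helper, not part of the pipeline): the time-on-task
# accumulation the query above performs.  Per user, events are time-ordered, each gap dt
# to the previous event is computed, and only gaps shorter than the timeout are summed
# (matching the "case when dt < 5*60 / 30*60" columns).  Timestamps are in seconds.

def time_on_task(event_times, timeout_minutes=5):
    '''event_times: sorted list of event timestamps (seconds) for one user on one day.'''
    total = 0.0
    for prev, cur in zip(event_times, event_times[1:]):
        dt = cur - prev
        if dt < timeout_minutes * 60:
            total += dt
    return total

# time_on_task([0, 60, 200, 5000], timeout_minutes=5)  ->  200.0  (the 4800s gap is dropped)
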
def process_course(course_id,
                   force_recompute=False,
                   use_dataset_latest=False,
                   end_date=None):
    '''
    Create one enrollday_all table, containing time, user_id, and enrollment stats.
    Just as is done for problem_check tables, the master table holds all the events 
    extracted from the tracking logs for a course.  It isn't split into separate
    days.  It is ordered in time, however.  To update it, a new day's logs
    are processed, then the results appended to this table.

    If the enrollday_all table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.
    '''

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
                time, 
                    event_struct.user_id as user_id, 
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "honor")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "honor")
                          then -1 
                          else 0 end) as diff_enrollment_honor,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "verified")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "verified")
                          then -1 
                          when (event_type = "edx.course.enrollment.mode_changed" 
                                and event_struct.mode = "verified")
                          then 1 
                          when (event_type = "edx.course.enrollment.mode_changed" 
                                and event_struct.mode = "honor")
                          then -1 
                          else 0 end) as diff_enrollment_verified,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "audit")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "audit")
                          then -1 
                          else 0 end) as diff_enrollment_audit,
            FROM {DATASETS} 
            where ((event_type = "edx.course.enrollment.activated") or
                   (event_type = "edx.course.enrollment.deactivated") or
                   (event_type = "edx.course.enrollment.mode_changed"))
                  and time > TIMESTAMP("{last_date}")
            order by time;
            """

    table = 'enrollday_all'

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(
        SQL,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        end_date=end_date,
        get_date_function=gdf,
        days_delta=0)
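
# Illustrative sketch (hypothetical helper, not part of the pipeline): the per-mode
# enrollment delta the query above computes for each event: +1 for an activation in a
# given mode, -1 for a deactivation, and for "verified" a mode_changed event counts
# +1 (changed into verified) or -1 (changed back to honor).

def enrollment_diffs(event_type, mode):
    '''Return the per-mode enrollment deltas for a single tracking-log event.'''
    diff = {'honor': 0, 'verified': 0, 'audit': 0}
    if event_type == "edx.course.enrollment.activated" and mode in diff:
        diff[mode] = 1
    elif event_type == "edx.course.enrollment.deactivated" and mode in diff:
        diff[mode] = -1
    elif event_type == "edx.course.enrollment.mode_changed":
        if mode == "verified":
            diff['verified'] = 1
        elif mode == "honor":
            diff['verified'] = -1
    return diff
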
Example n. 17
0
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None, 
                   check_dates=True, skip_last_day=False):
    '''
    Make {course_id}.person_course_day table for specified course_id.

    This is a single course-specific table, which contains all days' data.
    It is incrementally updated when new tracking logs data comes in,
    by appending rows to the end.  The rows are kept in time order.

    check_dates is disregarded.

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.
    '''

    PCDAY_SQL = """
    select username, 
           "{course_id}" as course_id,
           date(time) as date,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo, 
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as avg_dt,
           STDDEV(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sdv_dt,
           MAX(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as max_dt,
           COUNT(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as n_dt,
           SUM(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sum_dt
    from
    (SELECT username, 
      case when event_type = "play_video" then 1 else 0 end as bvideo,
      case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
      case when username != "" then 1 else 0 end as bevent,
      case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
      case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
      case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
      case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
      case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
      case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
      case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
      time,
      lag(time, 1) over (partition by username order by time) last_time
      FROM {DATASETS}
      WHERE
        NOT event_type contains "/xblock/"
        AND username != ""
    )
    group by course_id, username, date
    order by date
    """

    table = 'person_course_day'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    print "=== Processing person_course_day for %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     skip_last_day=skip_last_day)
    
    print "Done with person_course_day for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
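
# Illustrative sketch (hypothetical helper, not part of the pipeline): the per-event
# boolean flags the inner SELECT above derives before daily aggregation (bvideo,
# bproblem_check, bforum, ...).  Only a subset of the flags is shown.
import re

def event_flags(event_type, course_id):
    return {
        'bevent': 1,  # every event with a non-empty username counts
        'bvideo': int(event_type == "play_video"),
        'bproblem_check': int(event_type == "problem_check"),
        'bshow_answer': int(event_type in ("show_answer", "showanswer")),
        'bforum': int(bool(re.match(r"/courses/%s/discussion/" % re.escape(course_id), event_type))),
        'bprogress': int(bool(re.match(r"/courses/%s/progress" % re.escape(course_id), event_type))),
    }
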
def process_course_time_on_task(course_id,
                                force_recompute=False,
                                use_dataset_latest=False,
                                end_date=None,
                                just_do_totals=False,
                                limit_query_size=False,
                                table_max_size_mb=800,
                                skip_totals=False,
                                start_date=None,
                                config_parameter_overrides=None):
    '''Create the time_on_task table, containing time, user_id, and time
    on task stats.  This table isn't split into separate days.  It is
    ordered in time, however.  To update it, a new day's logs are
    processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    Compute totals and store in time_on_task_totals, by summing over all dates, 
    grouped by user.

    config_parameter_overrides may be provided as a dict, which
    overrides the default timeout parameters (5 min and 30 min) and
    output table name; used for boundary analysis testing.

    How are time_on_task numbers computed?

    Let TO be the timeout (e.g. 5 min or 30 min).  For all time-ordered
    pairs of consecutive events, at times t1 and t2, with dt = t2 - t1,
    the total time is the sum of all dt such that dt < TO.

    Solo (unpaired) events do not add any time to the total.

    For the video (or problem, forum, text) subtotals, the events are
    first filtered to include only events of that type.

    This methodology undercounts when, for example, the learner stops
    watching a video by closing the browser, or by switching elsewhere
    without stopping the video player first.  (Note that a stop_video
    event does exist; it occurs when the video stop button is clicked.)

    This methodology overcounts when, for example, the learner watches a
    video for a while, switches to a problem, then returns to the video.
    If the initial and final video intervals fall within TO of each
    other, the intermediate interval is counted towards the video total
    time, even though the learner was actually doing something else (the
    problem) during that time.

    '''

    if just_do_totals:
        return process_time_on_task_totals(
            course_id,
            force_recompute=force_recompute,
            use_dataset_latest=use_dataset_latest,
            config_parameter_overrides=config_parameter_overrides)

    config_parameters = {
        'timeout_short': 5,
        'timeout_long': 30,
        'time_on_task_table_name': 'time_on_task',
        'course_id': course_id,
    }

    config_parameters.update(config_parameter_overrides or {})

    SQL_TOP = """
            SELECT 
  		    "{course_id}" as course_id,
                    date(time) as date,
                    username, 

                    # total time spent on system
                    SUM( case when dt < {timeout_short}*60 then dt end ) as total_time_{timeout_short},
                    SUM( case when dt < {timeout_long}*60 then dt end ) as total_time_{timeout_long},

                    # total time spent watching videos
                    SUM( case when (dt_video is not null) and (dt_video < {timeout_short}*60) then dt_video end ) as total_video_time_{timeout_short},
                    SUM( case when (dt_video is not null) and (dt_video < {timeout_long}*60) then dt_video end ) as total_video_time_{timeout_long},
                    SUM( case when (serial_dt_video is not null) and (serial_dt_video < {timeout_long}*60) then serial_dt_video end ) as serial_video_time_{timeout_long},

                    # total time spent doing problems
                    SUM( case when (dt_problem is not null) and (dt_problem < {timeout_short}*60) then dt_problem end ) as total_problem_time_{timeout_short},
                    SUM( case when (dt_problem is not null) and (dt_problem < {timeout_long}*60) then dt_problem end ) as total_problem_time_{timeout_long},
                    SUM( case when (serial_dt_problem is not null) and (serial_dt_problem < {timeout_long}*60) then serial_dt_problem end ) as serial_problem_time_{timeout_long},

                    # total time spent on forum
                    SUM( case when (dt_forum is not null) and (dt_forum < {timeout_short}*60) then dt_forum end ) as total_forum_time_{timeout_short},
                    SUM( case when (dt_forum is not null) and (dt_forum < {timeout_long}*60) then dt_forum end ) as total_forum_time_{timeout_long},
                    SUM( case when (serial_dt_forum is not null) and (serial_dt_forum < {timeout_long}*60) then serial_dt_forum end ) as serial_forum_time_{timeout_long},

                    # total time spent with textbook or wiki
                    SUM( case when (dt_text is not null) and (dt_text < {timeout_short}*60) then dt_text end ) as total_text_time_{timeout_short},
                    SUM( case when (dt_text is not null) and (dt_text < {timeout_long}*60) then dt_text end ) as total_text_time_{timeout_long},
                    SUM( case when (serial_dt_text is not null) and (serial_dt_text < {timeout_long}*60) then serial_dt_text end ) as serial_text_time_{timeout_long},

    """

    SQL_BOT = """
            FROM
              (
              SELECT time,
                username,
                (time - last_time)/1.0E6 as dt,         # dt is in seconds
                case when is_video then (time - last_time_video)/1.0E6 end as dt_video,
                case when is_problem then (time - last_time_problem)/1.0E6 end as dt_problem,
                case when is_forum then (time - last_time_forum)/1.0E6 end as dt_forum,
                case when is_text then (time - last_time_text)/1.0E6 end as dt_text,

                case when is_video then (time - last_time_xevent)/1.0E6 end as serial_dt_video,
                case when is_problem then (time - last_time_xevent)/1.0E6 end as serial_dt_problem,
                case when is_forum then (time - last_time_xevent)/1.0E6 end as serial_dt_forum,
                case when is_text then (time - last_time_xevent)/1.0E6 end as serial_dt_text,
              FROM
                (
                SELECT time, 
                    username,
                    last_username,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    (case when is_video then USEC_TO_TIMESTAMP(last_time_video) end) as last_time_video,
                    # last_username_video,
                    # last_event_video,
                    is_problem,
                    is_video,
                    (case when is_problem then USEC_TO_TIMESTAMP(last_time_problem) end) as last_time_problem,
                    # last_username_problem,
                    # last_event_problem,
                    is_forum,
                    is_text,
                    (case when is_forum then USEC_TO_TIMESTAMP(last_time_forum) end) as last_time_forum,
                    (case when is_text then USEC_TO_TIMESTAMP(last_time_text) end) as last_time_text,
                    is_xevent,
                    (case when is_xevent then USEC_TO_TIMESTAMP(last_time_xevent) end) as last_time_xevent,
                FROM
                  (SELECT time,
                    username,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    is_video,
                    is_problem,
                    is_forum,
                    is_text,
                    (is_video or is_problem or is_forum or is_text) as is_xevent,   # x = video, problem, forum, or text: any event
                    case when is_problem then username else '' end as uname_problem,
                    case when is_video then username else '' end as uname_video,
                    case when is_forum then username else '' end as uname_forum,
                    case when is_text then username else '' end as uname_text,
                    case when (is_video or is_problem or is_forum or is_text) then username else '' end as uname_xevent,
                    lag(time, 1) over (partition by uname_video order by time) last_time_video,
                    # lag(event_type, 1) over (partition by uname_video order by time) last_event_video,
                    # lag(uname_video, 1) over (partition by uname_video order by time) last_username_video,
                    lag(time, 1) over (partition by uname_problem order by time) last_time_problem,
                    # lag(event_type, 1) over (partition by uname_problem order by time) last_event_problem,
                    # lag(uname_problem, 1) over (partition by uname_problem order by time) last_username_problem,
                    lag(time, 1) over (partition by uname_forum order by time) last_time_forum,
                    lag(time, 1) over (partition by uname_text order by time) last_time_text,
                    lag(time, 1) over (partition by uname_xevent order by time) last_time_xevent,
                  FROM
                    (SELECT time, 
                      username,
                      event_type,
                      case when (REGEXP_MATCH(event_type ,r'\w+_video')
                               or REGEXP_MATCH(event_type, r'\w+_transcript')
                              ) then True else False end as is_video,
                      case when REGEXP_MATCH(event_type, r'problem_\w+') then True else False end as is_problem,
                      case when (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                               or event_type contains "/discussion/forum"
                               or event_type contains "/discussion/threads"
                               or event_type contains "/discussion/comments"
                              ) then True else False end as is_forum,
                      case when (REGEXP_MATCH(event_type ,r'^textbook\..*')
                               or event_type contains "/wiki/"
                              ) then True else False end as is_text,
                    FROM {DATASETS}
                    WHERE       
                                     NOT event_type contains "/xblock/"
                                     AND username is not null
                                     AND username != ""
                                     and time > TIMESTAMP("{last_date}")
                                     {hash_limit}
                    )
                  )
                WHERE last_time is not null
                ORDER BY username, time
                )
              )
            group by course_id, username, date
            order by date, username;
          """

    SQL = SQL_TOP.format(**config_parameters) + SQL_BOT

    table = config_parameters.get('time_on_task_table_name') or 'time_on_task'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(
        SQL,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        end_date=end_date,
        start_date=start_date,
        get_date_function=gdf,
        days_delta=0,
        has_hash_limit=True,
        newer_than=datetime.datetime(2015, 3, 15),  # schema change
        table_max_size_mb=table_max_size_mb,
        limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_task_totals(
            course_id,
            force_recompute=force_recompute,
            use_dataset_latest=use_dataset_latest,
            config_parameter_overrides=config_parameter_overrides)

    return
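
# Illustration (hypothetical call, values made up): how config_parameter_overrides
# reshapes the generated query.  Overriding the timeouts renames the output columns
# (e.g. total_time_10, total_time_60), and the table name override redirects where
# the results are written.
#
# process_course_time_on_task(
#     "MITx/6.002x/2013_Spring",
#     config_parameter_overrides={'timeout_short': 10,
#                                 'timeout_long': 60,
#                                 'time_on_task_table_name': 'time_on_task_10_60'})
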
def process_course_time_on_asset(course_id,
                                 force_recompute=False,
                                 use_dataset_latest=False,
                                 end_date=None,
                                 just_do_totals=False,
                                 limit_query_size=False,
                                 table_max_size_mb=800,
                                 skip_totals=False,
                                 start_date=None,
                                 config_parameter_overrides=None):
    '''
    Create the time_on_asset_daily table, containing module_id, username, date, 
    and time on asset stats.  This table has separate rows for each date, 
    and is ordered in time.  To update it, a new day's logs are
    processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    Compute totals and store in time_on_asset_totals, by summing over all dates, 
    grouped by module_id.

    How are time_on_asset numbers computed?

    See discussion in make_time_on_task.py

    The time_on_asset_daily table has these columns:

    - date: gives day for the data
    - username
    - module_id
    - time_umid5: total time on module (by module_id) in seconds, with a 5-minute timeout
    - time_umid30: total time on module (by module_id) in seconds, with a 30-minute timeout

    '''

    if just_do_totals:
        return process_time_on_asset_totals(
            course_id,
            force_recompute=force_recompute,
            use_dataset_latest=use_dataset_latest)

    SQL = """
            SELECT
                    "{course_id}" as course_id,
                    date(time) as date,
                    username,
                    module_id,
                    # time_umid5 = total time on module (by module_id) in seconds
                    # time_umid5 has a 5-minute timeout, time_umid30 has a 30-minute timeout
                    SUM( case when dt_umid < 5*60 then dt_umid end ) as time_umid5,
                    SUM( case when dt_umid < 30*60 then dt_umid end ) as time_umid30,
            FROM (
              SELECT time,
                username,
                module_id,
                (time - last_time)/1.0E6 as dt,         # dt is in seconds
                (time - last_time_umid)/1.0E6 as dt_umid,   # dt for (user, module_id) in seconds
                last_time_umid,
              FROM
                (
                SELECT time,
                    username,
                    last_username,
                    module_id,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    USEC_TO_TIMESTAMP(last_time_umid) as last_time_umid,                    
                FROM (
                  SELECT time,
                    username,
                    module_id,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    lag(time, 1) over (partition by username, module_id order by time) last_time_umid,
                  FROM
                    (SELECT time, 
                      username,
                      (case when REGEXP_MATCH(module_id, r'.*\"\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\}}$')
                            when REGEXP_MATCH(module_id, r'.*\"\]\}}\}}$') then REGEXP_EXTRACT(module_id, r'(.*)\"\]\}}\}}$')
                           else module_id end) as module_id,	# fix some errors in module_id names
                    FROM {DATASETS}
                  )
         
                    WHERE       
                                     module_id is not null
                                     AND username is not null
                                     AND username != ""
                                     and time > TIMESTAMP("{last_date}")
                                     
                    )
                )
              )
              WHERE module_id is not null
                    AND NOT module_id CONTAINS '"'
              GROUP BY date, module_id, username
              ORDER BY date, module_id, username
          """

    table = 'time_on_asset_daily'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(
        SQL,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        end_date=end_date,
        start_date=start_date,
        get_date_function=gdf,
        days_delta=0,
        has_hash_limit=True,
        newer_than=datetime.datetime(2015, 3, 15),  # schema change
        table_max_size_mb=table_max_size_mb,
        limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_asset_totals(
            course_id,
            force_recompute=force_recompute,
            use_dataset_latest=use_dataset_latest)

    return
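
# Illustrative sketch (hypothetical helper, not part of the pipeline): the
# per-(user, module_id) gap computation behind time_umid5 / time_umid30 above.
# Events for each user/module pair are ordered in time and only gaps below the
# timeout are summed.  The tuple input format is an assumption for illustration.
from collections import defaultdict

def time_on_asset(events, timeout_minutes=5):
    '''events: iterable of (username, module_id, time_in_seconds) tuples.'''
    by_key = defaultdict(list)
    for username, module_id, t in events:
        by_key[(username, module_id)].append(t)
    totals = {}
    for key, times in by_key.items():
        times.sort()
        totals[key] = sum(t2 - t1 for t1, t2 in zip(times, times[1:])
                          if (t2 - t1) < timeout_minutes * 60)
    return totals
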
Example n. 20
0
def compute_person_course_day_trlang_table(course_id, force_recompute=False, use_dataset_latest=False, end_date=None):
    '''
    make pcday_trlang_counts for specified course_id
    
    This course table holds all the (username, course_id, date, transcript_language, count)
    data for a course.

    This is used in computing the modal video transcript language of users

    '''
    table = 'pcday_trlang_counts'

    SQL = '''
		SELECT
		  username,
		  course_id,
		  DATE(time) AS date,
		  LAST(time) AS last_time,
		  resource,
		  CASE
		    WHEN resource_event_type = 'transcript_download' AND prev_event_data IS NOT NULL THEN prev_event_data
		    WHEN resource_event_type = 'transcript_download'
		  AND prev_event_data IS NULL THEN 'en'
		    WHEN resource_event_type = 'transcript_language' THEN resource_event_data
		    ELSE resource_event_data
		  END AS resource_event_data,
		  resource_event_type,
		  SUM(lang_count) AS langcount
		FROM (
		  SELECT
		    course_id,
		    username,
		    resource,
		    resource_event_data,
		    resource_event_type,
		    LAG(time, 1) OVER (PARTITION BY username ORDER BY time) AS prev_time,
		    LAG(resource_event_type, 1) OVER (PARTITION BY username ORDER BY time) AS prev_event_type,
		    LAG(resource_event_data, 1) OVER (PARTITION BY username ORDER BY time) AS prev_event_data,
		    time,
		    COUNT(*) AS lang_count,
		    event_type
		  FROM (
		    SELECT
		      time,
		      course_id,
		      username,
		      'video' AS resource,
		      CASE
			WHEN module_id IS NOT NULL THEN REGEXP_EXTRACT(module_id, r'.*video/(.*)')                          # Newer data .. May 2016?
			ELSE REGEXP_EXTRACT(event_type, r'.*;_video;_(.*)\/handler\/transcript\/translation\/.*') END   # Older data
		      AS resource_id,
		      CASE
			WHEN (event_type = 'edx.ui.lms.link_clicked'                                                     # Transcript Download
			AND REGEXP_MATCH(JSON_EXTRACT(event, '$.target_url'), r'(.*handler/transcript/download)') ) THEN 'transcript_download'
			WHEN (REGEXP_MATCH(event_type, "/transcript/translation/.*") ) THEN 'transcript_language'
			ELSE NULL
		      END AS resource_event_type,
		      REGEXP_EXTRACT(event_type, r'.*\/handler\/transcript\/translation\/(.*)') AS resource_event_data,
		      event_type,
		    FROM {DATASETS}
		    WHERE
		      time > TIMESTAMP("2010-10-01 01:02:03")
		      AND username != ""
		      AND ( (NOT event_type CONTAINS "/xblock/"                                                          # Transcript Download
			  AND event_type = 'edx.ui.lms.link_clicked'                                                  # Transcript Download
			  AND REGEXP_MATCH(JSON_EXTRACT(event, '$.target_url'), r'(.*handler/transcript/download)') ) # Transcript Download
			OR (REGEXP_MATCH(event_type, "/transcript/translation/.*") )                                   # Transcript Language Toggle
			)
		    ORDER BY
		      time )
		  GROUP BY
		    username,
		    course_id,
		    resource,
		    resource_event_data,
		    resource_event_type,
		    event_type,
		    time )
		GROUP BY
		  username,
		  course_id,
		  date,
		  resource,
		  resource_event_data,
		  resource_event_type,
		ORDER BY
		  date ASC,
		  username ASC
		  '''

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf)
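
# Illustrative sketch (hypothetical helper, not part of the pipeline): the downstream
# "modal transcript language" computation the docstring mentions.  Given
# (username, language, count) rows aggregated from pcday_trlang_counts, pick each
# user's most frequently used language.  The tuple format is an assumption.
from collections import defaultdict

def modal_language(rows):
    '''rows: iterable of (username, language, count) tuples.'''
    counts = defaultdict(lambda: defaultdict(int))
    for username, lang, n in rows:
        counts[username][lang] += n
    return dict((u, max(langs.items(), key=lambda kv: kv[1])[0]) for u, langs in counts.items())
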
Example n. 21
0
def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None, 
                   check_dates=True, skip_last_day=False):
    '''
    Make {course_id}.person_course_day table for specified course_id.

    This is a single course-specific table, which contains all days' data.
    It is incrementally updated when new tracking logs data comes in,
    by appending rows to the end.  The rows are kept in time order.

    check_dates is disregarded.

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    videoTableExists = False
    try:

        tinfo_video = bqutil.get_bq_table_info(dataset, 'video_stats_day')
        assert tinfo_video is not None, "Video stats table missing... Not including video stats"
        videoTableExists = True

    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass
        #return

    forumTableExists = False
    try:

        tinfo_forum = bqutil.get_bq_table_info(dataset, 'forum_events')
        assert tinfo_forum is not None, "Forum events table missing... Not including forum stats"
        forumTableExists = True

    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass
        #return

    problemTableExists = False
    try:

        tinfo_personproblem = bqutil.get_bq_table_info(dataset, 'person_problem')
        tinfo_courseproblem = bqutil.get_bq_table_info(dataset, 'course_problem')
        tinfo_courseaxis = bqutil.get_bq_table_info(dataset, 'course_axis')
        tinfo_personcourse = bqutil.get_bq_table_info(dataset, 'person_course')
        # Check course axis and person course, course problem
        assert tinfo_personproblem is not None, "Person problem table missing... Not including problem stats"
        assert tinfo_courseproblem is not None, "Course problem table missing... Not including problem stats"
        assert tinfo_courseaxis is not None, "Course axis table missing... Not including problem stats"
        assert tinfo_personcourse is not None, "Person Course table missing... Not including problem stats"
        problemTableExists = True

    except (AssertionError, Exception) as err:
        #print " --> Err: missing %s.%s?  Skipping creation of chapter_grades" % (dataset, "course_axis")
        sys.stdout.flush()
        pass

    PCDAY_SQL_BASE_SELECT = """
			  SELECT username,
				 '{course_id}' AS course_id,
				 DATE(time) AS date,
				 SUM(bevent) AS nevents,
				 SUM(bprogress) AS nprogcheck,
				 SUM(bshow_answer) AS nshow_answer,
				 SUM(bvideo) AS nvideo,
				 SUM(bproblem_check) AS nproblem_check,
				 SUM(bforum) AS nforum,
				 SUM(bshow_transcript) AS ntranscript,
				 SUM(bseq_goto) AS nseq_goto,
				 SUM(bseek_video) AS nseek_video,
				 SUM(bpause_video) AS npause_video,
		    """

    PCDAY_SQL_VIDEO_EXISTS = """
			  	 COUNT(DISTINCT video_id) AS nvideos_viewed, # New Video - Unique videos viewed
				 SUM(case when position is not null then FLOAT(position) else FLOAT(0.0) end) AS nvideos_watched_sec, # New Video - # sec watched using max video position
		    """

    PCDAY_SQL_VIDEO_DNE = """
				 0 AS nvideos_viewed, # New Video - Unique videos viewed
				 FLOAT(0.0) AS nvideos_watched_sec, # New Video - # sec watched using max video position
		    """
    PCDAY_SQL_VIDEO_SELECT = PCDAY_SQL_VIDEO_EXISTS if videoTableExists else PCDAY_SQL_VIDEO_DNE

    PCDAY_SQL_FORUM_EXISTS = """
				 SUM(case when read is not null then read else 0 end) AS nforum_reads, # New discussion - Forum reads
				 SUM(case when write is not null then write else 0 end) AS nforum_posts, # New discussion - Forum posts
				 COUNT(DISTINCT thread_id ) AS nforum_threads, # New discussion - Unique forum threads interacted with
		    """

    PCDAY_SQL_FORUM_DNE = """
				 0 AS nforum_reads, # New discussion - Forum reads
				 0 AS nforum_posts, # New discussion - Forum posts
				 0 AS nforum_threads, # New discussion - Unique forum threads interacted with
		    """
    PCDAY_SQL_FORUM_SELECT = PCDAY_SQL_FORUM_EXISTS if forumTableExists else PCDAY_SQL_FORUM_DNE

    PCDAY_SQL_PROBLEM_EXISTS = """
				 COUNT(DISTINCT problem_nid ) AS nproblems_answered, # New Problem - Unique problems attempted
				 SUM(case when n_attempts is not null then n_attempts else 0 end) AS nproblems_attempted, # New Problem - Total attempts
				 SUM(case when ncount_problem_multiplechoice is not null then ncount_problem_multiplechoice else 0 end) as nproblems_multiplechoice,
				 SUM(case when ncount_problem_choice is not null then ncount_problem_choice else 0 end) as nproblems_choice,
				 SUM(case when ncount_problem_numerical is not null then ncount_problem_numerical else 0 end) as nproblems_numerical,
				 SUM(case when ncount_problem_option is not null then ncount_problem_option else 0 end) as nproblems_option,
				 SUM(case when ncount_problem_custom is not null then ncount_problem_custom else 0 end) as nproblems_custom,
				 SUM(case when ncount_problem_string is not null then ncount_problem_string else 0 end) as nproblems_string,
				 SUM(case when ncount_problem_mixed is not null then ncount_problem_mixed else 0 end) as nproblems_mixed,
				 SUM(case when ncount_problem_formula is not null then ncount_problem_formula else 0 end) as nproblems_forumula,
				 SUM(case when ncount_problem_other is not null then ncount_problem_other else 0 end) as nproblems_other,
		    """

    PCDAY_SQL_PROBLEM_DNE = """
				 0 AS nproblems_answered, # New Problem - Unique problems attempted
				 0 AS nproblems_attempted, # New Problem - Total attempts
				 0 AS nproblems_multiplechoice,
				 0 AS nproblems_choice,
				 0 AS nproblems_numerical,
				 0 AS nproblems_option,
				 0 AS nproblems_custom,
				 0 AS nproblems_string,
				 0 AS nproblems_mixed,
				 0 AS nproblems_forumula,
				 0 AS nproblems_other,
		    """
    PCDAY_SQL_PROBLEM_SELECT = PCDAY_SQL_PROBLEM_EXISTS if problemTableExists else PCDAY_SQL_PROBLEM_DNE

    PCDAY_SQL_MID = """
				 MIN(time) AS first_event,
				 MAX(time) AS last_event,
				 AVG( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS avg_dt,
				 STDDEV( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sdv_dt,
				 MAX( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS max_dt,
				 COUNT( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS n_dt,
				 SUM( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sum_dt
			FROM (
			  SELECT
			    *
			  FROM (
			    SELECT
			      username,
			      CASE WHEN event_type = "play_video" THEN 1 ELSE 0 END AS bvideo,
			      CASE WHEN event_type = "problem_check" THEN 1 ELSE 0 END AS bproblem_check,
			      CASE WHEN username != "" THEN 1 ELSE 0 END AS bevent,
			      CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
			      CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
			      CASE WHEN event_type IN ("show_answer",
				"showanswer") THEN 1 ELSE 0 END AS bshow_answer,
			      CASE WHEN event_type = 'show_transcript' THEN 1 ELSE 0 END AS bshow_transcript,
			      CASE WHEN event_type = 'seq_goto' THEN 1 ELSE 0 END AS bseq_goto,
			      CASE WHEN event_type = 'seek_video' THEN 1 ELSE 0 END AS bseek_video,
			      CASE WHEN event_type = 'pause_video' THEN 1 ELSE 0 END AS bpause_video,
			      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
			      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
			      time,
			      LAG(time, 1) OVER (PARTITION BY username ORDER BY time) last_time
			    FROM {DATASETS}
			    WHERE
			      NOT event_type CONTAINS "/xblock/"
			      AND username != "" )
		    """


    PCDAY_SQL_VIDEO = """ ,
			  ( # Video events
				  SELECT TIMESTAMP(date) as time,
				         '{course_id}' as course_id,
				         username,
				         video_id,
				         position,
				  FROM [{dataset}.video_stats_day]
				  WHERE TIMESTAMP(date)>= TIMESTAMP("{min_date_start}") and TIMESTAMP(date) <= TIMESTAMP("{max_date_end}")

			  )
                      """
    PCDAY_SQL_ADD = PCDAY_SQL_VIDEO if videoTableExists else ''

    PCDAY_SQL_FORUM = """ ,
			  ( # Forum Events
				   SELECT time,
					  username,
				          '{course_id}' as course_id,
				          thread_id,
				          (CASE WHEN (forum_action == "reply" or forum_action == "comment_reply"
						      or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment")
						THEN 1 ELSE 0 END) AS write,
					  (CASE WHEN (forum_action == "read" or forum_action == "read_inline") THEN 1 ELSE 0 END) AS read,
				   FROM [{dataset}.forum_events]
				   WHERE (forum_action == "reply" or forum_action == "comment_reply"
					  or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment"
					  or forum_action == "read" or forum_action == "read_inline")
				          and ( time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}") )
			  )
                      """
    PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_FORUM if forumTableExists else PCDAY_SQL_ADD

    PCDAY_SQL_PROBLEM = """,
			  ( # Problems
				   SELECT pc.username AS username,
				          pp.problem_nid AS problem_nid,
				          pp.n_attempts AS n_attempts,
				          pp.time AS time,
				          '{course_id}' as course_id,
					  pp.ncount_problem_multiplechoice as ncount_problem_multiplechoice,
					  pp.ncount_problem_choice as ncount_problem_choice,
					  pp.ncount_problem_numerical as ncount_problem_numerical,
					  pp.ncount_problem_option as ncount_problem_option,
					  pp.ncount_problem_custom as ncount_problem_custom,
					  pp.ncount_problem_string as ncount_problem_string,
					  pp.ncount_problem_mixed as ncount_problem_mixed,
					  pp.ncount_problem_formula as ncount_problem_formula,
					  pp.ncount_problem_other as ncount_problem_other,
				   FROM (

					   (
					      SELECT PP.user_id as user_id,
						     PP.problem_nid AS problem_nid,
						     PP.n_attempts as n_attempts,
						     PP.date as time,
						     (Case when CP_CA.data_itype == "multiplechoiceresponse" then 1 else 0 end) as ncount_problem_multiplechoice, # Choice
					             (Case when CP_CA.data_itype == "choiceresponse" then 1 else 0 end) as ncount_problem_choice,       # Choice
						     (Case when CP_CA.data_itype == "numericalresponse" then 1 else 0 end) as ncount_problem_numerical, #input
						     (Case when CP_CA.data_itype == "optionresponse" then 1 else 0 end) as ncount_problem_option,       # Choice
					             (Case when CP_CA.data_itype == "customresponse" then 1 else 0 end) as ncount_problem_custom,       # Custom
					             (Case when CP_CA.data_itype == "stringresponse" then 1 else 0 end) as ncount_problem_string,       # Input
					             (Case when CP_CA.data_itype == "mixed" then 1 else 0 end) as ncount_problem_mixed,                 # Mixed
					             (Case when CP_CA.data_itype == "forumula" then 1 else 0 end) as ncount_problem_formula,            # Input
					             (Case when CP_CA.data_itype != "multiplechoiceresponse" and
							        CP_CA.data_itype != "choiceresponse" and
							        CP_CA.data_itype != "numericalresponse" and
							        CP_CA.data_itype != "optionresponse" and
							        CP_CA.data_itype != "customresponse" and
							        CP_CA.data_itype != "stringresponse" and
							        CP_CA.data_itype != "mixed" and
							        CP_CA.data_itype != "forumula"
							   then 1 else 0 end) as ncount_problem_other, # Input
						     #MAX(n_attempts) AS n_attempts,
						     #MAX(date) AS time,
					      FROM [{dataset}.person_problem] PP
					      LEFT JOIN
					      (
							SELECT CP.problem_nid as problem_nid,
							       INTEGER(CP.problem_id) as problem_id,
							       CA.data.itype as data_itype,
						        FROM [{dataset}.course_problem] CP
						        LEFT JOIN [{dataset}.course_axis] CA
						        ON CP.problem_id == CA.url_name
					      ) as CP_CA
					      ON PP.problem_nid == CP_CA.problem_nid
					      GROUP BY time, user_id, problem_nid, n_attempts,
						       ncount_problem_multiplechoice,
						       ncount_problem_choice,
						       ncount_problem_numerical,
						       ncount_problem_option,
						       ncount_problem_custom,
						       ncount_problem_string,
						       ncount_problem_mixed,
						       ncount_problem_formula,
						       ncount_problem_other
					      )

					      #FROM [{dataset}.person_item] PI
					      #JOIN [{dataset}.course_item] CI
					      #ON PI.item_nid = CI.item_nid
					      #GROUP BY user_id,
						       #problem_nid
					      #ORDER BY
						       #user_id,
						       #problem_nid
					) AS pp
				        LEFT JOIN (
							      SELECT username,
								     user_id
							      FROM [{dataset}.person_course] 
					) AS pc
					ON pc.user_id = pp.user_id
				        WHERE time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}")
			  )
 
                        """
    PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_PROBLEM if problemTableExists else PCDAY_SQL_ADD

    PCDAY_SQL_END = """
			  )
			  WHERE time > TIMESTAMP("{last_date}")
			  GROUP BY course_id,
				   username,
				   date
			  ORDER BY date
		    """


    PCDAY_SQL_NEW = PCDAY_SQL_BASE_SELECT + PCDAY_SQL_VIDEO_SELECT + PCDAY_SQL_FORUM_SELECT + PCDAY_SQL_PROBLEM_SELECT + PCDAY_SQL_MID + PCDAY_SQL_ADD + PCDAY_SQL_END

    PCDAY_SQL = PCDAY_SQL_NEW.format( dataset=dataset, course_id="{course_id}", DATASETS="{DATASETS}", last_date="{last_date}", min_date_start="{min_date_start}", max_date_end="{max_date_end}")

    table = 'person_course_day'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    print "=== Processing person_course_day for %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    # Major person_course_day schema revision 19-Jan-2016 adds new fields; if table exists, ensure it 
    # has new schema, else force recompute.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table)
    except Exception as err:
        tinfo = None
    if tinfo:
        fields = tinfo['schema']['fields']
        field_names = [x['name'] for x in fields]
        if not 'nvideos_viewed' in field_names:
            cdt = tinfo['creationTime']
            print "    --> person_course_day created %s; missing nvideos_viewed field in schema; forcing recompute - this may take a long time!" % cdt
            sys.stdout.flush()
            force_recompute = True

    process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     newer_than=datetime.datetime( 2017, 2, 8, 16, 30 ),
                                                     skip_last_day=skip_last_day)
    
    print "Done with person_course_day for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
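
# Note on the query assembly above: the person_course_day SQL is built by concatenating
# optional sub-query strings (video, forum, problem) onto the base query only when the
# corresponding source tables exist, and the final format() call deliberately leaves
# {course_id}, {DATASETS}, {last_date}, {min_date_start} and {max_date_end} as literal
# placeholders, evidently for run_query_on_tracking_logs to fill in on each run.
# A minimal sketch of that pattern, using toy SQL fragments and hypothetical flag names
# rather than the real query text:

def assemble_pcday_sql_sketch(dataset, video_exists, forum_exists):
    # Toy stand-ins for PCDAY_SQL_BASE_SELECT / PCDAY_SQL_VIDEO / PCDAY_SQL_FORUM
    base = "SELECT username, date FROM {DATASETS} "
    video_fragment = ", (SELECT ... FROM [{dataset}.video_stats_day]) "
    forum_fragment = ", (SELECT ... FROM [{dataset}.forum_events]) "

    sql_add = video_fragment if video_exists else ''
    sql_add = sql_add + forum_fragment if forum_exists else sql_add

    # Substitute only {dataset} now; keep {DATASETS} (and, in the real query,
    # {course_id}, {last_date}, ...) for the tracking-log runner to fill in later.
    return (base + sql_add).format(dataset=dataset, DATASETS="{DATASETS}")

# e.g. assemble_pcday_sql_sketch("MITx__6_00x__2013_Spring", True, False)
#  -> 'SELECT username, date FROM {DATASETS} , (SELECT ... FROM [MITx__6_00x__2013_Spring.video_stats_day]) '
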
def CreateForumEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create forum events table, based on tracking logs.  Extracts all forum-related events, including forum post reads,
    into the date-time ordered table.  Repeated calls to this procedure will append new events to the table.  If no
    new events are found, the existing table is left unchanged.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_FORUM_EVENTS
    
    # event_type for forums may be like:
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed
    #  /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline
    #  /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members
    #  /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership
    #     \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"}
    #
    # module_id will be like:
    # "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a"

    the_sql = """
              SELECT time, 
                     username,
                     '{course_id}' as course_id,
                     (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote"  # does this happen?
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)')
                           when event_type = "edx.forum.thread.created" then "created_thread"
                           when event_type = "edx.forum.response.created" then "created_response"
                           when event_type = "edx.forum.comment.created" then "created_comment"
                           when event_type = "edx.forum.searched" then "searched"
                           else event_type end) as forum_action,
                           (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)') # For old-school courses with transparent course ids
                                      else (case when module_id is null # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null
                                               then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)') # read
                                                      else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)') # upvote, pinned, upvoted, unvoted, deleted, followed
                                                             else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end) # comment
                                                                end) end) end) as thread_id,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id,
                     event_struct.query as search_query,   # unavailable before June 1, 2015
                     event_struct.GET as event_GET,        # unavailable before June 1, 2015
              FROM {DATASETS}
              WHERE  (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                      or event_type contains "/discussion/forum"
                      or event_type contains "/discussion/threads"
                      or event_type contains "/discussion/comments"
                      or event_type contains "list-forum-"
                      or event_type contains "list_forum_"
                      or event_type contains "add-forum-"
                      or event_type contains "add_forum_"
                      or event_type contains "remove-forum-"
                      or event_type contains "remove_forum_"
                      or event_type contains "update_forum_"
                     ) 
                    AND username is not null
                    AND event is not null
                    and time > TIMESTAMP("{last_date}")
                    {hash_limit}
              order by time
              """

    try:
        tinfo = bqutil.get_bq_table_info(dataset, table )
        assert tinfo is not None, "[make_forum_analysis] Creating %s.%s table for %s" % (dataset, table, course_id)

        print "[make_forum_analysis] Appending latest data to %s.%s table for %s" % (dataset, table, course_id)
        sys.stdout.flush()

    except Exception as err:
        print str(err)
        sys.stdout.flush()
        print " --> Missing %s.%s?  Attempting to create..." % ( dataset, table )
        sys.stdout.flush()
        pass

    print "=== Processing Forum Events for %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    def gdf(row):
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day
                                                    )

    print "Done with Forum Events for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
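
# As the docstring notes, repeated calls to CreateForumEvents simply append any new events
# to the existing forum_events table. A hedged usage sketch for a single course follows;
# the course id and keyword values are illustrative only, and it assumes the same module
# context (bqutil, process_tracking_logs, etc.) as the code above.

if __name__ == "__main__":
    CreateForumEvents("UnivX/123.4x/2T2015",
                      force_recompute=False,
                      use_dataset_latest=True,
                      skip_last_day=True)
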
Example n. 23
0
def process_course_time_on_task(course_id, force_recompute=False, use_dataset_latest=False, end_date=None,
                                just_do_totals=False, limit_query_size=False, table_max_size_mb=800,
                                skip_totals=False, start_date=None):
    '''
    Create the time_on_task table, containing time, user_id, and time
    on task stats.  This table isn't split into separate days.  It is
    ordered in time, however.  To update it, a new day's logs are
    processed, then the results appended to this table.

    If the table doesn't exist, then run it once on all
    the existing tracking logs.  

    If it already exists, then run a query on it to see what dates have
    already been done.  Then do all tracking logs except those which
    have already been done.  Append the results to the existing table.

    Compute totals and store in time_on_task_totals, by summing over all dates, 
    grouped by user.
    '''

    if just_do_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
                    date(time) as date,
                    username, 

                    # total time spent on system
                    SUM( case when dt < 5*60 then dt end ) as total_time_5,
                    SUM( case when dt < 30*60 then dt end ) as total_time_30,

                    # total time spent watching videos
                    SUM( case when (dt_video is not null) and (dt_video < 5*60) then dt_video end ) as total_video_time_5,
                    SUM( case when (dt_video is not null) and (dt_video < 30*60) then dt_video end ) as total_video_time_30,
                    SUM( case when (serial_dt_video is not null) and (serial_dt_video < 30*60) then serial_dt_video end ) as serial_video_time_30,

                    # total time spent doing problems
                    SUM( case when (dt_problem is not null) and (dt_problem < 5*60) then dt_problem end ) as total_problem_time_5,
                    SUM( case when (dt_problem is not null) and (dt_problem < 30*60) then dt_problem end ) as total_problem_time_30,
                    SUM( case when (serial_dt_problem is not null) and (serial_dt_problem < 30*60) then serial_dt_problem end ) as serial_problem_time_30,

                    # total time spent on forum
                    SUM( case when (dt_forum is not null) and (dt_forum < 5*60) then dt_forum end ) as total_forum_time_5,
                    SUM( case when (dt_forum is not null) and (dt_forum < 30*60) then dt_forum end ) as total_forum_time_30,
                    SUM( case when (serial_dt_forum is not null) and (serial_dt_forum < 30*60) then serial_dt_forum end ) as serial_forum_time_30,

                    # total time spent with textbook or wiki
                    SUM( case when (dt_text is not null) and (dt_text < 5*60) then dt_text end ) as total_text_time_5,
                    SUM( case when (dt_text is not null) and (dt_text < 30*60) then dt_text end ) as total_text_time_30,
                    SUM( case when (serial_dt_text is not null) and (serial_dt_text < 30*60) then serial_dt_text end ) as serial_text_time_30,
            FROM
              (
              SELECT time,
                username,
                (time - last_time)/1.0E6 as dt,         # dt is in seconds
                case when is_video then (time - last_time_video)/1.0E6 end as dt_video,
                case when is_problem then (time - last_time_problem)/1.0E6 end as dt_problem,
                case when is_forum then (time - last_time_forum)/1.0E6 end as dt_forum,
                case when is_text then (time - last_time_text)/1.0E6 end as dt_text,

                case when is_video then (time - last_time_xevent)/1.0E6 end as serial_dt_video,
                case when is_problem then (time - last_time_xevent)/1.0E6 end as serial_dt_problem,
                case when is_forum then (time - last_time_xevent)/1.0E6 end as serial_dt_forum,
                case when is_text then (time - last_time_xevent)/1.0E6 end as serial_dt_text,
              FROM
                (
                SELECT time, 
                    username,
                    last_username,
                    USEC_TO_TIMESTAMP(last_time) as last_time,
                    (case when is_video then USEC_TO_TIMESTAMP(last_time_video) end) as last_time_video,
                    # last_username_video,
                    # last_event_video,
                    is_problem,
                    is_video,
                    (case when is_problem then USEC_TO_TIMESTAMP(last_time_problem) end) as last_time_problem,
                    # last_username_problem,
                    # last_event_problem,
                    is_forum,
                    is_text,
                    (case when is_forum then USEC_TO_TIMESTAMP(last_time_forum) end) as last_time_forum,
                    (case when is_text then USEC_TO_TIMESTAMP(last_time_text) end) as last_time_text,
                    is_xevent,
                    (case when is_xevent then USEC_TO_TIMESTAMP(last_time_xevent) end) as last_time_xevent,
                FROM
                  (SELECT time,
                    username,
                    lag(time, 1) over (partition by username order by time) last_time,
                    lag(username, 1) over (partition by username order by time) last_username,
                    is_video,
                    is_problem,
                    is_forum,
                    is_text,
                    (is_video or is_problem or is_forum or is_text) as is_xevent,   # x = video, problem, forum, or text: any event
                    case when is_problem then username else '' end as uname_problem,
                    case when is_video then username else '' end as uname_video,
                    case when is_forum then username else '' end as uname_forum,
                    case when is_text then username else '' end as uname_text,
                    case when (is_video or is_problem or is_forum or is_text) then username else '' end as uname_xevent,
                    lag(time, 1) over (partition by uname_video order by time) last_time_video,
                    # lag(event_type, 1) over (partition by uname_video order by time) last_event_video,
                    # lag(uname_video, 1) over (partition by uname_video order by time) last_username_video,
                    lag(time, 1) over (partition by uname_problem order by time) last_time_problem,
                    # lag(event_type, 1) over (partition by uname_problem order by time) last_event_problem,
                    # lag(uname_problem, 1) over (partition by uname_problem order by time) last_username_problem,
                    lag(time, 1) over (partition by uname_forum order by time) last_time_forum,
                    lag(time, 1) over (partition by uname_text order by time) last_time_text,
                    lag(time, 1) over (partition by uname_xevent order by time) last_time_xevent,
                  FROM
                    (SELECT time, 
                      username,
                      event_type,
                      case when (REGEXP_MATCH(event_type ,r'\w+_video')
                               or REGEXP_MATCH(event_type, r'\w+_transcript')
                              ) then True else False end as is_video,
                      case when REGEXP_MATCH(event_type, r'problem_\w+') then True else False end as is_problem,
                      case when (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                               or event_type contains "/discussion/forum"
                               or event_type contains "/discussion/threads"
                               or event_type contains "/discussion/comments"
                              ) then True else False end as is_forum,
                      case when (REGEXP_MATCH(event_type ,r'^textbook\..*')
                               or event_type contains "/wiki/"
                              ) then True else False end as is_text,
                    FROM {DATASETS}
                    WHERE       
                                     NOT event_type contains "/xblock/"
                                     AND username is not null
                                     AND username != ""
                                     and time > TIMESTAMP("{last_date}")
                                     {hash_limit}
                    )
                  )
                WHERE last_time is not null
                ORDER BY username, time
                )
              )
            group by course_id, username, date
            order by date, username;
          """

    table = 'time_on_task'

    def gdf(row):
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     start_date=start_date,
                                                     get_date_function=gdf,
                                                     days_delta=0,
                                                     has_hash_limit=True,
                                                     newer_than=datetime.datetime(2015,3,15),	# schema change
                                                     table_max_size_mb=table_max_size_mb,
                                                     limit_query_size=limit_query_size)

    if not skip_totals:
        return process_time_on_task_totals(course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

    return
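
# The query above implements a capped time-on-task heuristic: for each user, the gap
# between consecutive events counts toward time on task only when it is below a cutoff
# (5 or 30 minutes in the columns above), so long idle periods are discarded. Below is
# a minimal Python sketch of the same idea on hypothetical in-memory events, not the
# BigQuery implementation.

from collections import defaultdict
from datetime import datetime

def time_on_task_sketch(events, cutoff_seconds=30 * 60):
    """Sum gaps between consecutive events per (username, date), ignoring gaps >= cutoff.

    events: iterable of (username, datetime) pairs; illustrative stand-in for tracking-log rows.
    """
    by_user = defaultdict(list)
    for username, t in events:
        by_user[username].append(t)

    totals = defaultdict(float)          # (username, date) -> seconds on task
    for username, times in by_user.items():
        times.sort()
        for prev, cur in zip(times, times[1:]):
            dt = (cur - prev).total_seconds()
            if dt < cutoff_seconds:      # same capping rule as total_time_30 above
                totals[(username, cur.date())] += dt
    return dict(totals)

# Example: two close events and one long idle gap for a single user.
sample = [("alice", datetime(2015, 3, 16, 10, 0, 0)),
          ("alice", datetime(2015, 3, 16, 10, 4, 0)),   # 4 min gap -> counted
          ("alice", datetime(2015, 3, 16, 12, 0, 0))]   # ~2 h gap  -> ignored
print time_on_task_sketch(sample)    # {('alice', datetime.date(2015, 3, 16)): 240.0}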