Example #1
def create_course_problem_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Generate the course_problem table, with one row per problem_id, giving the average raw score,
    the standard deviation of raw scores, the number of unique users who attempted it, and the
    maximum points possible.

    Uses person_item and course_item.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tablename = "course_problem"

    the_sql = """
# compute course_problem table for {course_id}
SELECT problem_nid, problem_id, problem_short_id, 
  avg(problem_grade) as avg_problem_raw_score,
  stddev(problem_grade) as sdv_problem_raw_score,
  # max(problem_grade) as max_problem_raw_score,
  max(possible_raw_score) as max_possible_raw_score,
  avg(problem_grade / possible_raw_score * 100) as avg_problem_pct_score,
  count(unique(user_id)) as n_unique_users_attempted,
  problem_name,
  is_split,
  split_name,
FROM
(
    SELECT problem_nid, problem_id, problem_short_id, sum(item_grade) as problem_grade, user_id,
        sum(CI.item_points_possible) as possible_raw_score, problem_name, is_split, split_name,
    FROM [{dataset}.person_item] PI
    JOIN [{dataset}.course_item] CI
    on PI.item_nid = CI.item_nid
    group by problem_nid, problem_short_id, problem_id, user_id, problem_name, is_split, split_name
)
group by problem_nid, problem_id, problem_short_id, problem_name, is_split, split_name
# order by problem_short_id
order by avg_problem_pct_score desc
    """.format(dataset=dataset, course_id=course_id)

    depends_on = [ "%s.course_item" % dataset,
                   "%s.person_item" % dataset
               ]

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, 
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    startIndex=-2)
    except Exception as err:
        print "[make_course_problem_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound)
    sys.stdout.flush()
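
The SQL above aggregates in two stages: the inner SELECT sums item grades into one row per
(user_id, problem_id), and the outer SELECT averages those per-user problem grades across users.
A toy pure-Python sketch of that two-stage aggregation (illustration only, not part of
edx2bigquery; the input tuples are hypothetical):

from collections import defaultdict

def course_problem_rows(person_items):
    # person_items: iterable of (user_id, problem_id, item_grade, item_points_possible)
    # Stage 1: per-(user, problem) sums, as in the inner SELECT.
    per_user = defaultdict(lambda: [0.0, 0.0])
    for user_id, problem_id, item_grade, item_points in person_items:
        per_user[(user_id, problem_id)][0] += item_grade
        per_user[(user_id, problem_id)][1] += item_points
    # Stage 2: average across users per problem, as in the outer SELECT.
    by_problem = defaultdict(list)
    for (user_id, problem_id), gp in per_user.items():
        by_problem[problem_id].append(gp)
    out = {}
    for problem_id, rows in by_problem.items():
        out[problem_id] = {
            'avg_problem_raw_score': sum(g for g, p in rows) / len(rows),
            'avg_problem_pct_score': sum(g / p * 100 for g, p in rows) / len(rows),
            'n_unique_users_attempted': len(rows),
        }
    return out
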
def createPersonCourseVideo( course_id, force_recompute=False, use_dataset_latest=False ):
    '''
    Create the person_course_video_watched table, based on video_stats.
    Each row gives the number of unique videos watched by a given user in the given course.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_PERSON_COURSE_VIDEO_WATCHED

    the_sql = """
                  SELECT user_id, 
                      "{course_id}" as course_id,
                      count(*) n_unique_videos_watched,
                      count(*) / n_total_videos as fract_total_videos_watched,
                      viewed, certified, verified
                  FROM
                  (
                      SELECT PC.user_id as user_id, UV.username as username,
                          video_id, 
                          n_views,
                          NV.n_total_videos as n_total_videos,
                          certified,
                          viewed,
                          (mode=="verified") as verified,
                      FROM
                      (
                          SELECT username, video_id, count(*) as n_views
                          FROM [{dataset}.video_stats_day] 
                          GROUP BY username, video_id
                      ) UV
                      JOIN [{dataset}.person_course] PC
                      on UV.username = PC.username
                      CROSS JOIN 
                      (
                          SELECT count(*) as n_total_videos
                          FROM [{dataset}.video_axis]
                      ) NV
                      WHERE ((PC.roles = 'Student') OR (PC.roles is NULL))	# accommodate case when roles.csv is missing
                      # WHERE PC.roles = 'Student'
                  )
                  GROUP BY user_id, certified, viewed, verified, n_total_videos
                  order by user_id
              """

    the_sql = the_sql.format(course_id=course_id, dataset=dataset)
    bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute,
                                depends_on=["%s.%s" % (dataset, TABLE_VIDEO_STATS)],
                                newer_than=datetime.datetime( 2017, 2, 6, 18, 30 ),
                                startIndex=-2)
    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, table)
    print "--> Done with %s for %s, %d entries found" % (table, course_id, nfound)
    sys.stdout.flush()

    return bqdat
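
The CROSS JOIN above attaches the single scalar n_total_videos (the row count of video_axis) to
every per-user row, so the fraction of videos watched can be computed per user. A minimal sketch
of the same idea (hypothetical inputs, not edx2bigquery code):

def fraction_watched(videos_watched_by_user, n_total_videos):
    # videos_watched_by_user: dict of user_id -> set of video_ids watched
    return dict((user, len(vids) / float(n_total_videos))
                for user, vids in videos_watched_by_user.items())

print(fraction_watched({'u1': set(['v1', 'v2'])}, 4))   # {'u1': 0.5}
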
Example #3
def create_person_problem_table(course_id,
                                force_recompute=False,
                                use_dataset_latest=False):
    '''
    Generate the person_problem table, with one row per (user_id, problem_id), giving the problem
    raw score earned, the number of attempts, and a datestamp.

    Computed by aggregating over person_item and joining with course_item.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    tablename = "person_problem"

    the_sql = """
# compute person-problem table for {course_id}

SELECT user_id,
       course_id,
    CI.problem_nid as problem_nid,
    sum(item_grade) as problem_raw_score,
    sum(item_grade) / sum(CI.item_points_possible) * 100 as problem_pct_score,
    max(PI.grade) as grade,
    max(n_attempts) as n_attempts,
    max(date) as date,
    
FROM [{dataset}.person_item] PI
JOIN [{dataset}.course_item] CI
    
on PI.item_nid = CI.item_nid
group by user_id, course_id, problem_nid
order by user_id, course_id, problem_nid
    """.format(dataset=dataset, course_id=course_id)

    depends_on = ["%s.course_item" % dataset, "%s.person_item" % dataset]

    try:
        bqdat = bqutil.get_bq_table(dataset,
                                    tablename,
                                    the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    startIndex=-2)
    except Exception as err:
        print "[make_person_problem_table] ERR! failed in creating %s.%s using this sql:" % (
            dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id,
                                                         nfound)
    sys.stdout.flush()
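
Reading the depends_on lists across these examples, the tables form a small pipeline:
problem_analysis and course_item feed person_item, which in turn feeds person_problem and
course_problem. A hedged usage sketch of the build order (assumes these functions and a
configured bqutil are importable; the course_id is hypothetical):

course_id = "MITx/6.002x/2013_Spring"      # hypothetical
create_person_item_table(course_id)        # from problem_analysis + course_item (defined below)
create_person_problem_table(course_id)     # from person_item + course_item
create_course_problem_table(course_id)     # from person_item + course_item
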
Example #4
def create_person_item_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Generate the person_item table, with one row per (user_id, item_id), giving the grade points
    earned, the number of attempts, and a datestamp.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tablename = "person_item"

    the_sql = """
# compute person-item table

SELECT user_id, 
    # PA.item_id as item_id,
    CI.item_short_id as item_short_id,
    CI.item_nid as item_nid,
    item_grade,
    n_attempts,
    date
FROM
(
    SELECT user_id,
        item.answer_id as item_id,
        if(item.correct_bool, 1, 0) as item_grade,
        attempts as n_attempts,
        max(created) as date,
    FROM [{dataset}.problem_analysis]
    group by user_id, item_id, item_grade, n_attempts  # force (user_id, item_id) to be unique (it should always be, even w/o this)
) PA
JOIN [{dataset}.course_item] CI
on PA.item_id = CI.item_id
order by user_id, CI.content_index, CI.item_number
    """.format(dataset=dataset, course_id=course_id)

    depends_on = [ "%s.course_item" % dataset,
                   "%s.problem_analysis" % dataset
               ]

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, 
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    startIndex=-2)
    except Exception as err:
        print "[make_person_item_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound)
    sys.stdout.flush()
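
The inner SELECT above grades each (user, item) pair with if(item.correct_bool, 1, 0) and keeps
the latest timestamp. A toy Python rendering of that step (hypothetical row tuples, illustration
only):

def person_item_rows(problem_analysis_rows):
    # problem_analysis_rows: iterable of (user_id, answer_id, correct_bool, attempts, created)
    latest = {}
    for user_id, answer_id, correct_bool, attempts, created in problem_analysis_rows:
        grade = 1 if correct_bool else 0
        prev = latest.get((user_id, answer_id))
        if prev is None or created > prev[2]:
            latest[(user_id, answer_id)] = (grade, attempts, created)
    return latest
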
    def count_tracking_log_events(self):
        '''
        Loop over all tracking logs up to the cutoff date and sum the number of entries,
        using table info lookups only (no SQL queries).
        '''
        if self.skip_or_do_step("count_events") < 0:
            return	# skip step

        tlend = self.end_date.replace('-', '')	# end_date normally specified as YYYY-MM-DD

        log_event_counts = {}

        # iterate over each course, one at a time
        for course_id in self.course_id_set:
            log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
            # get list of all tracking log files for this course
            log_tables = [x for x in bqutil.get_list_of_table_ids(log_dataset) if x.startswith('tracklog_20')]

            log_tables_todo = [x for x in log_tables if x[9:] <= tlend]
            log_tables_todo.sort()
            print "[count_tracking_log_events] for course %s using %d tracking log tables, from %s to %s" % (course_id, 
                                                                                                             len(log_tables_todo),
                                                                                                             log_tables_todo[0], 
                                                                                                             log_tables_todo[-1])
            sys.stdout.flush()

            # go through all log files and get size on each
            row_sizes = [ bqutil.get_bq_table_size_rows(log_dataset, x) for x in log_tables_todo ]
            
            log_event_counts[course_id] = sum(row_sizes)
            print "                         For %s found %d total tracking log events" % (course_id, log_event_counts[course_id])
            sys.stdout.flush()

        self.log_event_counts = log_event_counts
        
        self.total_events = sum(log_event_counts.values())
        print "--> Total number of events for %s = %d" % (self.org, self.total_events)
Example #7
def obsolete_process_course(course_id, force_recompute=False, check_dates=True):
    '''
    make person_course_day tables for the specified course_id.  This version
    produces one table for each day, which is inefficient when there are
    many days with very small daily tracking log tables.
    '''

    PCDAY_SQL = """
    select username, 
           "{course_id}" as course_id,
           sum(bevent) as nevents,
           sum(bprogress) as nprogcheck,
           sum(bshow_answer) as nshow_answer,
           sum(bvideo) as nvideo, 
           sum(bproblem_check) as nproblem_check,
           sum(bforum) as nforum,
           sum(bshow_transcript) as ntranscript,
           sum(bseq_goto) as nseq_goto,
           sum(bseek_video) as nseek_video,
           sum(bpause_video) as npause_video,
           MAX(time) as last_event,
           AVG(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
               ) as avg_dt,
           STDDEV(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sdv_dt,
           MAX(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as max_dt,
           COUNT(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as n_dt,
           SUM(
               case when (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 then null
               else (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 end
           ) as sum_dt
    from
    (SELECT username, 
      case when event_type = "play_video" then 1 else 0 end as bvideo,
      case when event_type = "problem_check" then 1 else 0 end as bproblem_check,
      case when username != "" then 1 else 0 end as bevent,
      case when regexp_match(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
      case when regexp_match(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
      case when event_type in ("show_answer", "showanswer") then 1 else 0 end as bshow_answer,
      case when event_type = 'show_transcript' then 1 else 0 end as bshow_transcript,
      case when event_type = 'seq_goto' then 1 else 0 end as bseq_goto,
      case when event_type = 'seek_video' then 1 else 0 end as bseek_video,
      case when event_type = 'pause_video' then 1 else 0 end as bpause_video,
      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
      time,
      lag(time, 1) over (partition by username order by time) last_time
      FROM [{dataset}.{table_id}]
      WHERE
        NOT event_type contains "/xblock/"
        AND username != ""
    )
    group by course_id, username
    order by sdv_dt desc
    """

    course_dir = course_id.replace('/','__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))
        
    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
    
        date = table_id[9:]
    
        table_out = 'pcday_%s' % date
    
        if (table_out in pcday_tables) and not force_recompute:
            skip = True
            if check_dates:
                table_out_date = bqutil.get_bq_table_last_modified_datetime(pcd_dataset, table_out)
                log_table_date = bqutil.get_bq_table_last_modified_datetime(log_dataset, table_id)
                if log_table_date > table_out_date:
                    skip = False
                    print "%s...already exists, but table_out date=%s and log_table date=%s, so re-computing" % (table_out,
                                                                                                                 table_out_date,
                                                                                                                 log_table_date)
            if skip:
                print "%s...already done, skipping" % table_out
                sys.stdout.flush()
                continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out),
        
        the_sql = PCDAY_SQL.format(course_id=course_id, 
                                   dataset=log_dataset,
                                   table_id=table_id)
    
        sys.stdout.flush()
    
        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
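
The lag(time, 1) window above yields, per user, each event's predecessor in time; gaps longer
than five minutes are nulled out before the AVG/STDDEV/MAX/SUM aggregations. A toy Python
equivalent of the avg_dt computation (illustration only):

def avg_dt_seconds(event_times, cutoff=5 * 60):
    # event_times: time-sorted event timestamps (seconds) for one user
    dts = [t2 - t1 for t1, t2 in zip(event_times, event_times[1:])
           if (t2 - t1) <= cutoff]
    return sum(dts) / float(len(dts)) if dts else None

print(avg_dt_seconds([0, 30, 90, 5000]))   # 45.0 -- the 4910 s gap is dropped
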
def old_process_course(course_id, force_recompute=False):
    '''
    DEPRECATED - because there is so little total data, create one enrollday_all
    table instead of one table per day (see other function below).

    make enrollday2_* tables for the specified course_id
    '''

    SQL = """
            SELECT 
  		    "{course_id}" as course_id,
	            time, 
                    event_struct.user_id as user_id, 
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "honor")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "honor")
                          then -1 
                          else 0 end) as diff_enrollment_honor,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "verified")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "verified")
                          then -1 
                          else 0 end) as diff_enrollment_verified,
                    (case when (event_type = "edx.course.enrollment.activated" 
                                and event_struct.mode = "audit")
                          then 1 
                          when (event_type = "edx.course.enrollment.deactivated" 
                                and event_struct.mode = "audit")
                          then -1 
                          else 0 end) as diff_enrollment_audit,
            FROM [{dataset}.{table_id}] 
            where (event_type = "edx.course.enrollment.activated") or
                  (event_type = "edx.course.enrollment.deactivated")
            order by time;
            """

    course_dir = course_id.replace('/','__')
    dataset = bqutil.course_id2dataset(course_id)
    log_dataset = bqutil.course_id2dataset(course_id, dtype="logs")
    pcd_dataset = bqutil.course_id2dataset(course_id, dtype="pcday")

    print "Processing course %s (start %s)"  % (course_id, datetime.datetime.now())
    sys.stdout.flush()

    log_tables = bqutil.get_tables(log_dataset)

    try:
        bqutil.create_dataset_if_nonexistent(pcd_dataset)
    except Exception as err:
        print "Oops, err when creating %s, err=%s" % (pcd_dataset, str(err))
        
    pcday_tables_info = bqutil.get_tables(pcd_dataset)
    pcday_tables = [x['tableReference']['tableId'] for x in pcday_tables_info.get('tables', [])]

    # print "pcday_tables = ", pcday_tables

    log_table_list = log_tables['tables']
    log_table_list.sort()

    for table in log_table_list:
        tr = table['tableReference']
        table_id = tr['tableId']
        if not table_id.startswith('tracklog'):
            continue
    
        date = table_id[9:]
    
        table_out = 'enrollday2_%s' % date
    
        if (table_out in pcday_tables) and not force_recompute:
            print "%s...already done, skipping" % table_id
            sys.stdout.flush()
            continue

        if bqutil.get_bq_table_size_rows(log_dataset, table_id)==0:
            print "...zero size table %s, skipping" % table_id
            sys.stdout.flush()
            continue

        print ("Creating %s " % table_out), 
        
        the_sql = SQL.format(course_id=course_id, 
                             dataset=log_dataset,
                             table_id=table_id)
    
        sys.stdout.flush()
    
        bqutil.create_bq_table(pcd_dataset, table_out, the_sql, wait=False)
    
    print "Done with course %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()
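
The diff_enrollment_* columns above encode each enrollment event as +1 (activated) or -1
(deactivated) per mode, so a downstream consumer can recover enrollment counts with a running
sum over time. A toy sketch of that cumulative step (the downstream use is an assumption, not
shown in this source):

def running_enrollment(diffs):
    # diffs: time-ordered +1 / -1 / 0 enrollment deltas for one mode
    total, out = 0, []
    for d in diffs:
        total += d
        out.append(total)
    return out

print(running_enrollment([1, 1, -1, 1]))   # [1, 2, 1, 2]
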
def make_irt_report(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Generate the item_response_theory_report table, joining IRT GRM parameter estimates
    (from item_irt_grm or item_irt_grm_R, whichever is newer) with course_item and
    course_problem, plus item reliability statistics when the item_reliabilities table exists.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    the_sql_alpha = """
    IR.itemtestcorr as item_test,
    IR.itemrestcorr as item_rest,
    IR.alpha as alpha,
    """

    the_sql_no_alpha = """
    null as item_test,
    null as item_rest,
    null as alpha,
    """

    the_sql_alpha_join = """
    JOIN [{dataset}.item_reliabilities] IR
    on IR.item = CP.problem_yid
    """.format(dataset=dataset)

    the_sql = """
# item_response_theory_report for {course_id}
#
# problem_nid,problem_short_id,chapter,assignment_type,problem_label,problem_id,IRT item number,avg_problem_raw_score,avg_problem_pct_score,
# n_unique_users_attempted,item_test,item_rest,alpha ,Discrimination,Difficulty

SELECT 
    "{course_id}" as course_id,
    IG.problem_nid as problem_nid,
    CP.problem_short_id as problem_short_id,
    CI.chapter_name as chapter,
    assignment_type,
    CONCAT("[", STRING(IG.problem_nid), "] ", CI.chapter_name, " / ", CI.section_name, " / ", CP.problem_name) as problem_label,
    CP.problem_id as problem_id,
    CONCAT(STRING(CP.problem_nid), "/", STRING(cutnum)) as IRT_item_number,
    CP.avg_problem_raw_score avg_problem_raw_score,
    CP.avg_problem_pct_score avg_problem_pct_score,
    CP.n_unique_users_attempted n_unique_users_attempted,
    {sql_alpha}
    irt_diff as Difficulty,
    irt_disc as Discrimination,
    diff_se as Difficulty_SE,
    disc_se as Discrimination_SE,
    "{irt_method}" as irt_method,

FROM [{dataset}.{item_irt_grm}] IG
JOIN [{dataset}.course_item] CI
on IG.problem_nid = CI.problem_nid
JOIN 
(
    SELECT *, CONCAT("y", STRING(problem_nid)) as problem_yid,
    FROM [{dataset}.course_problem]
) CP
on IG.problem_nid = CP.problem_nid
{sql_alpha_join}
where CI.item_number = 1
    """

    tablename = "item_response_theory_report"
    RELIABILITIES_TABLE = "item_reliabilities"
    IRT_TABLES = OrderedDict([ ("item_irt_grm", "STATA GRM"),
                               ("item_irt_grm_R", "R mirt GRM"),
                           ])
    
    irt_table_to_use = None
    irt_table_date = None

    # use newest of the existing IRT tables
    for irt_tablename in IRT_TABLES:
        try:
            tinfo = bqutil.get_bq_table_info(dataset, irt_tablename )
            assert tinfo is not None, "%s.%s does not exist" % ( dataset, irt_tablename )
            lmt = tinfo.get('lastModifiedTime')
            use_table = lmt and ( (not irt_table_date) or (irt_table_date and lmt > irt_table_date) )
            if use_table:
                irt_table_date = lmt
                irt_table_to_use = irt_tablename
            else:
                print "[make_irt_report] Not using IRT table %s (date %s) - older than %s (date %s)" % ( irt_tablename,
                                                                                                         lmt,
                                                                                                         irt_table_to_use,
                                                                                                         irt_table_date )
        except Exception as err:
            pass
    
    if not irt_table_to_use:
        raise Exception("[make_irt_report] Cannot generate IRT report; requires one of %s" % (','.join(IRT_TABLES.keys())))

    # SQL changes depending on whether item_reliabilities exists or not
    have_reliabilities = False
    try:
        tinfo = bqutil.get_bq_table_info(dataset, RELIABILITIES_TABLE)
        assert tinfo is not None, "%s.%s does not exist" % ( dataset, RELIABILITIES_TABLE )
        if tinfo is not None:
            have_reliabilities = True
    except Exception as err:
        pass

    if have_reliabilities:
        sql_alpha = {'sql_alpha': the_sql_alpha, "sql_alpha_join": the_sql_alpha_join }
    else:
        sql_alpha = {'sql_alpha': the_sql_no_alpha, "sql_alpha_join": "" }

    the_sql = the_sql.format(dataset=dataset, course_id=course_id, item_irt_grm=irt_table_to_use, 
                             irt_method=IRT_TABLES[irt_table_to_use],
                             **sql_alpha)

    depends_on = [ "%s.course_item" % dataset,
                   "%s.course_problem" % dataset,
                   "%s.%s" % (dataset, irt_table_to_use),
               ]

    if have_reliabilities:
        depends_on.append("%s.item_reliabilities" % dataset)

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, 
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    newer_than=datetime.datetime(2016, 9, 27, 14, 48),
                                    startIndex=-2)
    except Exception as err:
        print "[make_irt_report] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d problem items found" % (tablename, course_id, nfound)
    sys.stdout.flush()
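
The loop above keeps whichever IRT table has the newest lastModifiedTime. A stripped-down sketch
of that selection, with hypothetical timestamps standing in for the bqutil table-info lookups:

table_dates = {'item_irt_grm': 1474985000000, 'item_irt_grm_R': 1474990000000}  # hypothetical

irt_table_to_use, irt_table_date = None, None
for name in ('item_irt_grm', 'item_irt_grm_R'):
    lmt = table_dates.get(name)
    if lmt and (not irt_table_date or lmt > irt_table_date):
        irt_table_to_use, irt_table_date = name, lmt

print(irt_table_to_use)   # 'item_irt_grm_R' (newer timestamp wins)
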
Example #13
def create_problem_first_attempt_correct_table(course_id,
                                               force_recompute=False,
                                               use_dataset_latest=False):
    '''
    It is very useful to know, for each graded problem, the percentage of users who got the problem
    correct on their first attempt.  This information is computed and stored in the problem_first_attempt_correct
    table, for explorers, users who completed, and users who certified.  Problems are indexed by problem_nid,
    which is a unique index used by course_problem and course_item.
    '''

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    tablename = "problem_first_attempt_correct"

    the_sql = """
# compute problem_first_attempt_correct table for {course_id}
SELECT
    problem_nid,
    n_first_attempt_correct_by_certified,
    n_certified_users_attempted,
    n_first_attempt_correct_by_certified / n_certified_users_attempted * 100 as pct_correct_first_attempt_by_certified,
    n_first_attempt_correct_by_completed,
    n_completed_users_attempted,
    n_first_attempt_correct_by_completed / n_completed_users_attempted * 100 as pct_correct_first_attempt_by_completed,
    n_first_attempt_correct_by_explored,
    n_explored_users_attempted,
    n_first_attempt_correct_by_explored / n_explored_users_attempted * 100 as pct_correct_first_attempt_by_explored,
FROM (
    SELECT 

      PP.problem_nid as problem_nid,
      sum(case when PC.certified and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_certified,
      sum(case when PC.completed and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_completed,
      sum(case when PC.explored and PP.n_attempts=1 and PP.problem_pct_score=100 then 1 else 0 end) as n_first_attempt_correct_by_explored,
      count(case when PC.certified then PP.user_id else null end) as n_certified_users_attempted,
      count(case when PC.completed then PP.user_id else null end) as n_completed_users_attempted,
      count(case when PC.explored then PP.user_id else null end) as n_explored_users_attempted,

    FROM [{dataset}.person_problem] PP
    JOIN [{dataset}.person_course] PC
    on PP.user_id = PC.user_id
    WHERE PC.certified or PC.completed or PC.explored
    group by problem_nid
    order by problem_nid
)
    """.format(dataset=dataset, course_id=course_id)

    depends_on = [
        "%s.person_problem" % dataset,
        "%s.person_course" % dataset,
    ]

    try:
        bqdat = bqutil.get_bq_table(dataset,
                                    tablename,
                                    the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    startIndex=-2)
    except Exception as err:
        print "[create_problem_first_attempt_correct_table] ERR! failed in creating %s.%s using this sql:" % (
            dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id,
                                                         nfound)
    sys.stdout.flush()
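
In the SQL above, a user counts as correct-on-first-attempt when n_attempts = 1 and
problem_pct_score = 100, and the percentage is taken over the cohort's users who attempted the
problem. A toy check of that arithmetic (hypothetical rows):

def pct_correct_first_attempt(rows):
    # rows: list of (n_attempts, problem_pct_score) for users who attempted the problem
    n_first = sum(1 for a, pct in rows if a == 1 and pct == 100)
    return 100.0 * n_first / len(rows)

print(pct_correct_first_attempt([(1, 100), (2, 100), (1, 50)]))   # 33.33...
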