Python get_bq_table_infoの例

プログラミング言語: Python

名前空間/パッケージ名: bqutil

メソッド/関数: get_bq_table_info

hotexamples.comのコード掲載数: 37

Python get_bq_table_info - 37件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのbqutil.get_bq_table_infoの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: stats.py プロジェクト: CGNx/xanalytics

 def get_report_broad_stats(self):
     """
     Fetch the 'broad_stats_by_course' table from the course report dataset.

     Returns a (data, tableinfo) tuple, where tableinfo is the result of
     bqutil.get_bq_table_info and data is the result of
     self.cached_get_bq_table (called with key=None), both for the same
     dataset and table.
     """
     report_table = 'broad_stats_by_course'
     report_dataset = self.get_course_report_dataset()
     # The table info is looked up before the table contents are read.
     info = bqutil.get_bq_table_info(report_dataset, report_table)
     rows = self.cached_get_bq_table(report_dataset, report_table, key=None)
     return (rows, info)

コード例 #2

ファイルを表示

ファイル: make_course_report_tables.py プロジェクト: maxliu/edx2bigquery

    def combine_show_answer_stats_by_course(self):
        '''
        Combine show_answer_stats_by_course over all courses, into one table,
        stored in the course_report dataset.

        Every dataset in self.course_datasets is probed with
        bqutil.get_bq_table_info; datasets for which the lookup raises or
        returns None are left out.  When no dataset has the table, a warning
        is printed and the method returns without creating anything.

        Returns None in all cases.
        '''
        tablename = "show_answer_stats_by_course"
        if self.skip_or_do_step(tablename) < 0:
            return  # skip step

        # which datasets have show_answer_stats_by_course?
        datasets_with_sasbc = []
        for cd in self.course_datasets:
            try:
                table = bqutil.get_bq_table_info(cd, tablename)
            except Exception as err:
                # best effort: a failing lookup only excludes that dataset
                print("[make-course_report_tables] Err: %s" % str(err))
                continue
            if table is not None:
                datasets_with_sasbc.append(cd)

        if not datasets_with_sasbc:
            print('[make_course_report_tables] combine_show_answer_stats_by_course: no datasets have show_answer_stats_by_course!')
            print('--> Aborting creation of %s' % tablename)
            print('--> This may cause problems with report creation.  Run analyze_problems on at least one course to resolve')
            # With no source tables the query below would be "SELECT * from "
            # followed by nothing, so actually abort as the message says.
            return

        sasbc_tables = ',\n'.join(['[%s.%s]' % (x, tablename) for x in datasets_with_sasbc])

        SQL = """
              SELECT * from {tables}
              """.format(tables=sasbc_tables)

        self.do_table(SQL, tablename, check_skip=False)

コード例 #3

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: musixhine/edx2bigquery

def createVideoAxis(course_id,
                    force_recompute=False,
                    use_dataset_latest=False):
    '''
    Build the video axis table for a course from its course axis table.

    The query keeps the course_axis rows whose category is "video" and, for
    each of them, derives:
      - the edX video id: module_id with '.' replaced by '_', keeping only the
        part after the last '/' (some courses store the full path beginning
        with i4x, others only the video id; stripping the path generalizes
        tracking log searches for video ids)
      - the youtube id: the part of data.ytid after the first ':'
      - the chapter name / index, via a left join of course_axis on
        chapter_mid = module_id

    course_id          = course whose dataset is resolved with bqutil.course_id2dataset
    force_recompute    = passed to bqutil.get_bq_table as force_query
    use_dataset_latest = passed to bqutil.course_id2dataset

    Returns the result of bqutil.get_bq_table, or None when the course axis
    table info cannot be retrieved (lookup raises or returns None).
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    table = TABLE_VIDEO_AXIS

    # Get Video results
    the_sql = """
                SELECT chapters.index as index_chapter,
                       videos.index as index_video,
                       videos.category as category,
                       videos.course_id as course_id,
                       videos.name as name,
                       videos.vid_id as video_id,
                       videos.yt_id as youtube_id,
                       chapters.name as chapter_name
                      FROM ( SELECT index, category, course_id, name, chapter_mid, 
                             #REGEXP_REPLACE(module_id, '[.]', '_') as vid_id, # vid id containing full path
                             REGEXP_EXTRACT(REGEXP_REPLACE(module_id, '[.]', '_'), r'(?:.*\/)(.*)') as vid_id, # Only containing video id
                             REGEXP_EXTRACT(data.ytid, r'\:(.*)') as yt_id,
                      FROM [{dataset}.course_axis]
                      WHERE category = "video") as videos
                      LEFT JOIN 
                      ( SELECT name, module_id, index
                        FROM [{dataset}.course_axis]
                      ) as chapters
                      ON videos.chapter_mid = chapters.module_id
                      ORDER BY videos.index asc
              """.format(dataset=dataset)

    print("[analyze_videos] Creating %s.%s table for %s" % (
        dataset, TABLE_VIDEO_AXIS, course_id))
    sys.stdout.flush()

    # Explicit None check instead of an assert: assert statements are removed
    # under "python -O", which would let a missing course axis slip through.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
    except Exception:
        # a failing lookup is treated the same as a missing table
        tinfo = None

    if tinfo is None:
        print(" --> Err: missing %s.%s?  Skipping creation of %s" % (
            dataset, TABLE_COURSE_AXIS, TABLE_VIDEO_AXIS))
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.course_axis" % (dataset)],
    )
    return bqdat

コード例 #4

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: kesiena115/edx2bigquery

def createVideoAxis(course_id, force_recompute=False, use_dataset_latest=False):
    """
    Build the video axis table for a course from its course axis table.

    The query keeps the course_axis rows whose category is "video" and, for
    each of them, derives:
      - the edX video id: module_id with '.' replaced by '_', keeping only the
        part after the last '/' (some courses store the full path beginning
        with i4x, others only the video id; stripping the path generalizes
        tracking log searches for video ids)
      - the youtube id: the part of data.ytid after the first ':'
      - the chapter name / index, via a left join of course_axis on
        chapter_mid = module_id

    course_id          = course whose dataset is resolved with bqutil.course_id2dataset
    force_recompute    = passed to bqutil.get_bq_table as force_query
    use_dataset_latest = passed to bqutil.course_id2dataset

    Returns the result of bqutil.get_bq_table, or None when the course axis
    table info cannot be retrieved (lookup raises or returns None).
    """
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_VIDEO_AXIS

    # Get Video results
    the_sql = """
                SELECT chapters.index as index_chapter,
                       videos.index as index_video,
                       videos.category as category,
                       videos.course_id as course_id,
                       videos.name as name,
                       videos.vid_id as video_id,
                       videos.yt_id as youtube_id,
                       chapters.name as chapter_name
                      FROM ( SELECT index, category, course_id, name, chapter_mid, 
                             #REGEXP_REPLACE(module_id, '[.]', '_') as vid_id, # vid id containing full path
                             REGEXP_EXTRACT(REGEXP_REPLACE(module_id, '[.]', '_'), r'(?:.*\/)(.*)') as vid_id, # Only containing video id
                             REGEXP_EXTRACT(data.ytid, r'\:(.*)') as yt_id,
                      FROM [{dataset}.course_axis]
                      WHERE category = "video") as videos
                      LEFT JOIN 
                      ( SELECT name, module_id, index
                        FROM [{dataset}.course_axis]
                      ) as chapters
                      ON videos.chapter_mid = chapters.module_id
                      ORDER BY videos.index asc
              """.format(
        dataset=dataset
    )

    print("[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_AXIS, course_id))
    sys.stdout.flush()

    # Explicit None check instead of an assert: assert statements are removed
    # under "python -O", which would let a missing course axis slip through.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
    except Exception:
        # a failing lookup is treated the same as a missing table
        tinfo = None

    if tinfo is None:
        print(" --> Err: missing %s.%s?  Skipping creation of %s" % (dataset, TABLE_COURSE_AXIS, TABLE_VIDEO_AXIS))
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(
        dataset, table, the_sql, force_query=force_recompute, depends_on=["%s.course_axis" % (dataset)]
    )
    return bqdat

コード例 #5

ファイルを表示

    def get_table(self,
                  dataset=None,
                  table=None,
                  org=None,
                  number=None,
                  semester=None):
        '''
        Render show_table.html for an arbitrary bigquery table -- mainly for debugging.

        With an explicit dataset, only users listed in self.AUTHORIZED_USERS
        get through.  Without one, the dataset is derived from the course id
        org/number/semester, and the user must be authorized for the course,
        hold the instructor role when the table name contains 'person',
        'track' or 'student', and hold the researcher role.

        Returns self.no_auth_sorry() when access is refused; otherwise writes
        the rendered template to self.response.out and returns None.
        '''
        if dataset is not None:
            course_id = None
            if self.user not in self.AUTHORIZED_USERS:
                return self.no_auth_sorry()
        else:
            course_id = '/'.join([org, number, semester])
            dataset = bqutil.course_id2dataset(
                course_id, use_dataset_latest=self.use_dataset_latest())
            if not self.is_user_authorized_for_course(course_id):
                return self.no_auth_sorry()

            # tables whose name hints at per-user data need the instructor role
            sensitive = any(word in table
                            for word in ('person', 'track', 'student'))
            if sensitive and not self.does_user_have_role('instructor',
                                                          course_id):
                return self.no_auth_sorry()

            # be more restrictive: researchers only
            if not self.does_user_have_role('researcher', course_id):
                return self.no_auth_sorry()

        tableinfo = bqutil.get_bq_table_info(dataset, table)

        # one column description per schema field, serialized to JSON
        column_specs = []
        for field in tableinfo['schema']['fields']:
            column_name = field['name']
            column_specs.append({
                'data': column_name,
                'title': column_name,
                'class': 'dt-center'
            })
        tablecolumns = json.dumps(column_specs)
        logging.info(tablecolumns)

        # note: this updates the object held in self.common_data in place
        data = self.common_data
        data.update(
            dataset=dataset,
            table=table,
            course_id=course_id,
            tablecolumns=tablecolumns,
        )

        page = JINJA_ENVIRONMENT.get_template('show_table.html')
        self.response.out.write(page.render(data))

コード例 #6

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: AbdouSeck/edx2bigquery

def createVideoStats_day( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create per-day video statistics from the tracking logs.

    The query reads play_video / pause_video / stop_video events and, for each
    (username, video_id, date), keeps the maximum video position: currentTime,
    multiplied by speed when the event carries a speed.  The video id is taken
    from the event's id field with any i4x path stripped.

    Per the original author's notes, a position > 0 is what counts as "viewed"
    and a position > 95% of the total video length as "watched"; those
    thresholds are not applied in this function.

    course_id          = course whose tracking logs are processed
    force_recompute    = passed through to run_query_on_tracking_logs
    use_dataset_latest = passed to bqutil.course_id2dataset and run_query_on_tracking_logs
    skip_last_day      = passed through to run_query_on_tracking_logs
    end_date           = passed through to run_query_on_tracking_logs when not None

    Returns None.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    table = TABLE_VIDEO_STATS_PER_DAY

    the_sql = """
              SELECT date(time)as date, username,
                              #module_id as video_id,
                              #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                              (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'([-])' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                              max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else  float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
                       FROM {DATASETS}
                       WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and
                              event is not null
                       group by username, video_id, date
                       order by date
              """

    # Only decides which progress message is shown; processing continues either
    # way.  An explicit None check is used instead of an assert, because assert
    # statements are removed under "python -O".
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_STATS_PER_DAY)
        if tinfo is None:
            problem = "[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS_PER_DAY, course_id)
        else:
            problem = None
    except Exception as err:
        problem = str(err)

    if problem is None:
        print("[analyze_videos] Appending latest data to %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS_PER_DAY, course_id))
        sys.stdout.flush()
    else:
        print(problem)
        sys.stdout.flush()
        print(" --> Missing %s.%s?  Attempting to create..." % ( dataset, TABLE_VIDEO_STATS_PER_DAY ))
        sys.stdout.flush()

    print("=== Processing Video Stats Per Day for %s (start %s)"  % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    def gdf(row):
        # date of a result row, parsed from its 'date' column
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    query_options = dict(force_recompute=force_recompute,
                         use_dataset_latest=use_dataset_latest,
                         get_date_function=gdf,
                         skip_last_day=skip_last_day)
    # end_date used to be accepted but silently dropped.  It is forwarded only
    # when set, so calls relying on the default behave exactly as before.
    # NOTE(review): assumes run_query_on_tracking_logs takes an end_date
    # keyword in this code base -- confirm.
    if end_date is not None:
        query_options['end_date'] = end_date

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, **query_options)

    print("Done with Video Stats Per Day for %s (end %s)"  % (course_id, datetime.datetime.now()))
    print("="*77)
    sys.stdout.flush()

コード例 #7

ファイルを表示

ファイル: main.py プロジェクト: pombredanne/xanalytics

    def get_table(self, dataset=None, table=None, org=None, number=None, semester=None):
        '''
        Display an arbitrary bigquery table via show_table.html -- mainly for debugging.

        When no dataset is given it is derived from the course id
        org/number/semester; the user then has to be authorized for that
        course, be an instructor if the table name contains 'person', 'track'
        or 'student', and be a researcher.  When a dataset is given, the user
        has to be in self.AUTHORIZED_USERS.

        Returns self.no_auth_sorry() on refusal; otherwise writes the rendered
        page to self.response.out and returns None.
        '''
        if dataset is None:
            course_id = '/'.join([org, number, semester])
            dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest())
            if not self.is_user_authorized_for_course(course_id):
                return self.no_auth_sorry()

            touches_user_data = ('person' in table) or ('track' in table) or ('student' in table)
            if touches_user_data and not self.does_user_have_role('instructor', course_id):
                return self.no_auth_sorry()

            # be more restrictive: researchers only
            if not self.does_user_have_role('researcher', course_id):
                return self.no_auth_sorry()
        else:
            course_id = None
            if self.user not in self.AUTHORIZED_USERS:
                return self.no_auth_sorry()

        schema_fields = bqutil.get_bq_table_info(dataset, table)['schema']['fields']

        # column descriptions, one per schema field, as a JSON string
        column_specs = []
        for field in schema_fields:
            name = field['name']
            column_specs.append({ 'data': name, 'title': name, 'class': 'dt-center' })
        tablecolumns = json.dumps(column_specs)
        logging.info(tablecolumns)

        # self.common_data itself is updated here, not a copy of it
        data = self.common_data
        data.update(dataset=dataset,
                    table=table,
                    course_id=course_id,
                    tablecolumns=tablecolumns)

        rendered = JINJA_ENVIRONMENT.get_template('show_table.html').render(data)
        self.response.out.write(rendered)

コード例 #8

ファイルを表示

ファイル: make_problem_events.py プロジェクト: proversity-org/edx2bigquery

def ExtractProblemEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Extract problem-related events from the tracking logs into the problem events table.

    The query keeps events whose event_type matches problem_\\w+ or equals
    "showanswer", with a non-null context.user_id and a time later than
    {last_date}.  For each event it emits user_id, time, event_source, a
    problem_url extracted from module_id / event_type / event_struct.problem,
    a normalized event_type, and the attempts, success and grade fields of
    event_struct.  {DATASETS}, {last_date} and {hash_limit} are left in the
    query text for run_query_on_tracking_logs.

    course_id          = course whose tracking logs are processed
    force_recompute    = passed through to run_query_on_tracking_logs
    use_dataset_latest = passed to bqutil.course_id2dataset and run_query_on_tracking_logs
    skip_last_day      = passed through to run_query_on_tracking_logs
    end_date           = passed through to run_query_on_tracking_logs

    Returns None.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_PROBLEM_EVENTS
    the_sql = """
SELECT  
    context.user_id as user_id, 
    time,
    event_source,
    REGEXP_EXTRACT(
      (CASE when module_id is not null then module_id 
          when event_type contains "/xblock/i4x:;_" then REPLACE(REGEXP_EXTRACT(event_type, r"i4x:;_;_(.*)/handler/xmodule"),";_", "/")
          else REPLACE(event_struct.problem, "i4x://", "")
          end),
      "[^/]+/problem/([^/]+)") as problem_url,
    (CASE when event_type contains "/xblock/i4x:;_" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          when event_type contains "type@problem+block" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          else event_type
          end) as event_type,
   event_struct.attempts as attempts,
   event_struct.success as success,
   event_struct.grade as grade,          
FROM {DATASETS}
WHERE       
   ( REGEXP_MATCH(event_type, r'problem_\w+') 
     OR event_type = "showanswer"
   )
   AND context.user_id is not null
   and time > TIMESTAMP("{last_date}")
   {hash_limit}
order by user_id, time
    """

    # Only decides which progress message is shown; processing continues either
    # way.  An explicit None check is used instead of an assert, because assert
    # statements are removed under "python -O".
    try:
        bqutil.create_dataset_if_nonexistent(dataset)
        tinfo = bqutil.get_bq_table_info(dataset, table)
        if tinfo is None:
            problem = "[make_problem_events] Creating %s.%s table for %s" % (dataset, table, course_id)
        else:
            problem = None
    except Exception as err:
        problem = str(err)

    if problem is None:
        print("[make_problem_events] Appending latest data to %s.%s table for %s" % (dataset, table, course_id))
        sys.stdout.flush()
    else:
        print(problem)
        sys.stdout.flush()
        print(" --> Missing %s.%s?  Attempting to create..." % ( dataset, table ))
        sys.stdout.flush()

    # The start message used to say "Forum Events" (copy-paste slip); it now
    # matches the "Done with Problem Events" message below.
    print("=== Processing Problem Events for %s (start %s)"  % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    def gdf(row):
        # timestamp of a result row: its 'time' column read as seconds since the epoch (UTC)
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day
                                                    )

    print("Done with Problem Events for %s (end %s)"  % (course_id, datetime.datetime.now()))
    print("="*77)
    sys.stdout.flush()

コード例 #9

ファイルを表示

ファイル: make_forum_analysis.py プロジェクト: kesiena115/edx2bigquery

def CreateForumPosts( course_id, force_recompute=True, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create Forum posts table, based on forum data. Categorizes forum posts as initial_post, response_post or comment.
    Also extracts the first POST_PREVIEW_CHAR_COUNT characters of the post content as a preview.

    The query is made of three sub-selects over the forum table:
      RC  - comments (_type "Comment" with a parent_id), joined to the initial
            post of their thread (title, original poster) and to the post they
            respond to (responded_to)
      RC2 - response posts (_type "Comment" with a thread but no parent_id),
            joined to the initial post of their thread
      NA  - initial posts (_type "CommentThread")
    The three are listed comma-separated in the outer FROM; presumably this is
    BigQuery legacy SQL, where that acts as a union -- confirm.

    course_id          = course whose dataset is resolved with bqutil.course_id2dataset
    force_recompute    = passed to bqutil.get_bq_table as force_query (defaults to True here)
    use_dataset_latest = passed to bqutil.course_id2dataset
    skip_last_day      = not used by this function
    end_date           = not used by this function

    Returns the result of bqutil.get_bq_table, or None when the forum table
    info cannot be retrieved (lookup raises or returns None).
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_POSTS

    the_sql = """
                        SELECT * FROM
                        (
                             SELECT ADDED_TITLE.FA.username as username,
				    "{course_id}" as course_id,
                                    ADDED_TITLE.FA.slug_id as slug_id,
                                    ADDED_TITLE.FA.slug_type as slug_type,
                                    ADDED_TITLE.FA.thread_id as thread_id,
                                    ADDED_TITLE.FA.parent_id as parent_id,
                                    ADDED_TITLE.IP.username as original_poster,
                                    ADD_RESPOND_TO.username as responded_to,
                                    ADDED_TITLE.IP.title as title,
                                    ADDED_TITLE.FA.first_time as first_time,
                                    ADDED_TITLE.FA.body_preview as body_preview,
                             FROM 
                             (
                                  SELECT * FROM 
                                  (
                                       SELECT author_username as username,
                                              mongoid as slug_id,
                                              (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                              comment_thread_id as thread_id,
                                              parent_id,
                                              title,
                                              created_at as first_time,
                                              SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                       FROM [{dataset}.{forum}]
                                       WHERE (_type = "Comment" and parent_id is not null)
                                  ) as FA # 3rd level comment
                                  LEFT JOIN EACH
                                  ( 
                                       SELECT * FROM
                                       ( 
                                            SELECT author_username as username,
                                                   mongoid as slug_id,
                                                   (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                     (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                                   comment_thread_id as thread_id,
                                                   parent_id,
                                                   title,
                                                   created_at as first_time,
                                                   SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                            FROM [{dataset}.{forum}]
                                            WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                                       )
                                  ) as IP
                                  ON FA.thread_id = IP.slug_id
                             ) as ADDED_TITLE
                             LEFT JOIN EACH
                             (
                                  SELECT author_username as username,
                                         mongoid as slug_id,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                  FROM [{dataset}.{forum}]
                             ) as ADD_RESPOND_TO
                             ON ADDED_TITLE.FA.parent_id = ADD_RESPOND_TO.slug_id
                             WHERE ADDED_TITLE.FA.slug_type = "comment"
                        ) as RC,
                        (
                             SELECT FA.username as username,
				    "{course_id}" as course_id,
                                    FA.slug_id as slug_id,
                                    FA.slug_type as slug_type,
                                    FA.thread_id as thread_id,
                                    FA.parent_id as parent_id,
                                    IP.username as original_poster,
                                    IP.title as title,
                                    FA.first_time as first_time,
                                    FA.body_preview as body_preview
                             FROM 
                             (
                                  SELECT author_username as username,
                                         mongoid as slug_id,
                                         (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                           (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                         title,
                                         created_at as first_time,
                                         SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                  FROM [{dataset}.{forum}]
                                  WHERE (_type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null)
                             ) as FA # 2nd level comment
                             LEFT JOIN EACH
                             (
                                  SELECT * FROM 
                                  (    
                                       SELECT author_username as username,
                                              mongoid as slug_id,
                                              (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                              comment_thread_id as thread_id,
                                              parent_id,
                                              title,
                                              created_at as first_time,
                                              SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                       FROM [{dataset}.{forum}]
                                       WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                                  )
                             ) as IP
                             ON FA.thread_id = IP.slug_id
                        ) as RC2,
                        (
                             SELECT * FROM
                             (
                                  SELECT author_username as username,
				         "{course_id}" as course_id,
                                         mongoid as slug_id,
                                         (case when _type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null then "initial_post" end) as slug_type,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                         title,
                                         created_at as first_time,
                                         SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                  FROM [{dataset}.{forum}]
                                  WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                             )
                        ) as NA
              """.format( dataset=dataset, course_id=course_id, forum=TABLE_FORUM, post_preview_char_count=POST_PREVIEW_CHAR_COUNT )

    print("[make_forum_analysis] Creating %s.%s table for %s" % ( dataset, TABLE_FORUM_POSTS, course_id ))
    sys.stdout.flush()

    # Explicit None check instead of an assert: assert statements are removed
    # under "python -O", which would let a missing forum table slip through.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_FORUM)
    except Exception:
        # a failing lookup is treated the same as a missing table
        tinfo = None

    if tinfo is None:
        print(" --> Err: missing %s.%s?  Skipping creation of %s" % ( dataset, TABLE_FORUM, TABLE_FORUM_POSTS ))
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute,
                                depends_on=["%s.%s" % (dataset, TABLE_FORUM)],
                                )

    return bqdat

コード例 #10

ファイルを表示

ファイル: make_item_tables.py プロジェクト: AbdouSeck/edx2bigquery

def create_course_item_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build (or refresh) the course_item table for one course.

    The table is produced by a single legacy-SQL BigQuery query which joins
    the per-item response counts from problem_analysis with problem metadata
    from course_axis and, when available, assignment weights from
    grading_policy.  If grading_policy cannot be found, a one-row dummy
    policy (empty assignment type, weight 1.0, label "none") is joined
    instead.

    Parameters:

    course_id          = course_id string, used to determine the dataset
    force_recompute    = passed to bqutil.get_bq_table as force_query
    use_dataset_latest = passed to bqutil.course_id2dataset

    Returns nothing; prints the number of entries found.  Any exception
    raised by bqutil.get_bq_table is re-raised after printing the SQL.

    the course_item dataset has these columns:

    Field Name                              Type    Example         Description
    item_id                                 string  i4x-MITx-8_MReV-problem-CheckPoint_1_Newton_s_First_Law_2_1
                                                                    Unique ID for an assessment item (constructed using the problem module_id, and linked to problem_analysis table keys)
    problem_id                              string  CheckPoint_1_Newton_s_First_Law
                                                                    Unique ID for an assessment problem (constructed using problem url_name)
    problem_nid                             integer 27              unique problem numerical id (equal to the sequential count of problems up to this one)
    assignment_short_id                     string  HW_4            Unique short ID for assignment, using assignment short name + "_" + assignment_seq_num (should be same as what shows up in user's edX platform progress page)
    item_weight                             float   6.59E-05        Fraction of overall grade (between 0 and 1) contributed by this item
    n_user_responses                        integer 4868            Number of users who provided a response to this assessment item
    problem_name                            string  CheckPoint 1: Newton's First Law
                                                                    Name of problem within which this item exists
    chapter_name                            string  Chapter 1       Name of chapter within which the problem exists
    section_name                            string  Section 1       Name of section (aka sequential) within which the problem exists
    assignment_id                           string  Checkpoint_ch3  Unique ID for the assignment within which the problem exists
    n_problems_in_assignment                integer 23              Number of problems within the assignment
    assignment_type                         string  Checkpoint      The assignment type within which the assignment exists
    assignment_type_weight                  float   0.1             Fraction of the overall grade contributed by the assignment type
    n_assignments_of_type                   integer 11              Number of assignments of this type
    assignment_seq_num                      integer 3               Sequential number of the assignment_type within the course
    chapter_number                          integer 3               Number of the chapter within which the problem exists
    section_number                          integer 3               Number of the section (aka sequential) within which the problem exists
    content_index                           integer 141             Index number of the problem within the content course axis
    problem_weight                          integer 1               Weight of the problem within the assignment
    item_points_possible                    float   1               Always 1 (used for debugging - number of points assigned to an item)
    problem_points_possible                 integer 6               Always equal to the number of items in the assignment (used for debugging)
    emperical_item_points_possible          integer 1               Emperical value of point value of item, based on user data in problem_analysis table (for debugging)
    emperical_problem_points_possible       integer 6               Emperical value of maximum number of points possible for problem based on problem_analysis (for debugging)
    item_number                             integer 1               Number of the item, within the problem (in order of presentation, starting from 1)
    n_items                                 integer 6               Number of items within the problem
    start_date                              date    2013-06-01 00:01:00 UTC
                                                                    Date when problem was issued
    due_date                                date    2013-06-23 23:59:00 UTC
                                                                    Date when problem was due
    problem_path                            string  /Unit_1/Newtons_First_Law/2/1
                                                                    Path of problem within course content, specifying chapter and sequential
    problem_short_id                        string  HW_7__3         short (and unique) problem ID, made using assignment short ID + "__" + problem number
    item_short_id                           string  HW_7__3_1       short (and unique) item ID, made using problem short ID + "_" + item number
    item_nid                                integer 41              unique item numerical id (equal to the row number of this entry in the course_item table)
    cumulative_item_weight                  float   6.59E-05        Cumulative fraction of item weights (for debugging: should increase to 1.0 by the end of table)
    is_split                                boolean False           Boolean flag indicating if this item was within an A/B split_test or not
    split_name                              string  CircMotionAB    Name of the split_test within which this item is placed, if is_split is True

    (the query below also selects vertical_name and problem_number, which are not listed above)

    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tablename = "course_item"

    # determine if grading_policy exists or not
    GP_TABLE = "grading_policy"
    try:
        have_grading_policy = bqutil.get_bq_table_info(dataset, GP_TABLE) is not None
    except Exception as err:
        # Best effort: a failed lookup is treated like a missing table (the dummy
        # grading policy is used), but the reason is reported instead of being
        # silently discarded.
        print("[make_course_item_table] could not get table info for %s.%s (%s)" % (dataset, GP_TABLE, str(err)))
        have_grading_policy = False

    # change SQL if grading_policy doesn't exist
    if have_grading_policy:
        disable_gformat = ""
        alternate_gp = ""
    else:
        print("Warning - grading_policy doest NOT exist, using a dummy grading policy instead, and allowing gformat=null")
        sys.stdout.flush()
        # "#" turns the "[dataset.grading_policy] GP" line of the SQL into a comment;
        # alternate_gp then supplies the table expression for the LEFT JOIN.
        disable_gformat = "#"
        alternate_gp = '( SELECT "" as assignment_type, 1.0 as fraction_of_overall_grade, "none" as short_label ) GP'

    the_sql = """
SELECT 
    # '{course_id}' as course_id,
    *,
    CONCAT(assignment_short_id, "__", STRING(problem_number)) as problem_short_id,
    CONCAT(assignment_short_id, "__", STRING(problem_number), "_", STRING(item_number)) as item_short_id,
    row_number() over (order by content_index, item_number) as item_nid,
    sum(item_weight) over (order by content_index, item_number) cumulative_item_weight
FROM
(
    # items with additional data about fraction_of_overall_grade from grading_policy
    SELECT item_id, 
        problem_id,
        max(if(item_number=1, x_item_nid, null)) over (partition by problem_id) as problem_nid,
        CONCAT((case when GP.short_label is null then "" else GP.short_label end),
               "_", STRING(assignment_seq_num)) as assignment_short_id,
        (problem_weight * (case when GP.fraction_of_overall_grade is null then 1.0 else GP.fraction_of_overall_grade end)
             / n_items / sum_problem_weight_in_assignment / n_assignments_of_type) as item_weight,
        n_user_responses,
        chapter_name,
        section_name,
        vertical_name,
        problem_name,
        CI.assignment_id as assignment_id,
        n_problems_in_assignment,
        CI.assignment_type as assignment_type,
        (case when GP.fraction_of_overall_grade is null then 1.0 else GP.fraction_of_overall_grade end) as assignment_type_weight,
        n_assignments_of_type,
        assignment_seq_num,
        chapter_number,
        content_index,
        section_number,
        problem_number,
        problem_weight,
        item_points_possible,
        problem_points_possible,
        emperical_item_points_possible,
        emperical_problem_points_possible,
        item_number,
        n_items,
        start_date,
        due_date,
        is_split,
        split_name,
        problem_path,
    FROM
    (
        # items with number of problems per assignment
        SELECT item_id, item_number,
            n_items,
            problem_id,
            row_number() over (partition by item_number order by content_index) as x_item_nid,
            n_user_responses,
            chapter_name,
            section_name,
            vertical_name,
            problem_name,
            assignment_id,
            sum(if(assignment_id is not null and item_number=1, 1, 0)) over (partition by assignment_id) n_problems_in_assignment,
            sum(if(assignment_id is not null and item_number=1, problem_weight, 0)) 
                over (partition by assignment_id) sum_problem_weight_in_assignment,
            assignment_type,
            n_assignments_of_type,
            assignment_seq_num,
            chapter_number,
            section_number,
            problem_number,
            problem_path,
            content_index,
            start_date,
            due_date,
            is_split,
            split_name,
            problem_weight,
            item_points_possible,
            problem_points_possible,
            emperical_item_points_possible,
            emperical_problem_points_possible,
        FROM
        (
            # items from problem_analysis with metadata from course_axis
            SELECT item_id, item_number,
                n_items,
                problem_id,
                n_user_responses,
                CA.name as problem_name,
                chapter_name,
                section_name,
                vertical_name,
                assignment_id,
                assignment_type,
                n_assignments_of_type,
                CA.assignment_seq_num as assignment_seq_num,
                CA.chapter_number as chapter_number,
                CA.section_number as section_number,
                CA.problem_number as problem_number,
                CA.path as problem_path,
                CA.index as content_index,
                CA.start as start_date,
                CA.due as due_date,
                CA.is_split as is_split,
                CA.split_name as split_name,
                if(CA.weight is null, 1.0, CA.weight) as problem_weight,
                item_points_possible,
                problem_points_possible,
                emperical_item_points_possible,
                emperical_problem_points_possible,
            FROM
            (
                # get items with item metadata from problem_analysis table
                SELECT item_id, item_number,
                    n_items,
                    problem_id,
                    n_user_responses,
                    1.0 as item_points_possible,
                    1.0 * n_items as problem_points_possible,
                    problem_points_possible / n_items as emperical_item_points_possible,
                    problem_points_possible as emperical_problem_points_possible,
                FROM
                (
                    SELECT item_id, item_number,
                        max(item_number) over (partition by problem_id) n_items,
                        problem_id,
                        problem_points_possible,
                        n_user_responses,
                    FROM
                    (
                        SELECT item_id,
                            row_number() over (partition by problem_id order by item_id) item_number,
                            problem_id,
                            problem_points_possible,
                            n_user_responses,
                        FROM
                        (
                            SELECT item.answer_id as item_id,
                                problem_url_name as problem_id,
                                max_grade as problem_points_possible,
                                count(*) as n_user_responses,
                            FROM [{dataset}.problem_analysis]
                            group by item_id, problem_id, problem_points_possible
                            having n_user_responses > 5   # minimum cutoff for an item to be included
                        )
                    )
                )
                order by item_id, item_number
            ) as PA
            JOIN 
            (
                # -------------------------------------------------- graded problems from course axis
                # master table of graded problems from course_axis, with assignment metadata
                SELECT module_id,
                    url_name,
                    index,
                    weight,
                    assignment_type,
                    MAX(IF(problem_number=1, x_assignment_seq_num, null)) over (partition by assignment_id) as assignment_seq_num,
                    problem_number,
                    assignment_id,
                    n_assignments_of_type,
                    chapter_name,
                    section_name,
                    vertical_name,
                    name,
                    path,
                    start,
                    due,
                    is_split,
                    split_name,
                    chapter_number,
                    section_number,
                FROM
                (
                    # course_axis with chapter number and number of assignments of type
                    SELECT *,  # add column with number of assignments of type
                        SUM(IF(problem_number=1, 1, 0)) over (partition by assignment_type) n_assignments_of_type,
                        row_number() over (partition by assignment_type, problem_number order by index) as x_assignment_seq_num,
                    FROM
                    (
                        # ---------------------------------------- course axis with vertical name
                        SELECT module_id,
                            url_name,
                            index,
                            weight,
                            assignment_type,
                            chapter_number,
                            section_number,
                            assignment_id,  
                            chapter_name,
                            section_name,
                            vertical_name,
                            name,
                            path,
                            start,
                            due,
                            is_split,
                            split_name,
                            # add column with problem number within assignment_id
                            row_number() over (partition by assignment_id order by index) problem_number,
                        FROM
                        (
                            # course axis of problems which have non-null grading_format, including chapter number
                            # and section (aka sequential) number (within the chapter)
                            SELECT CAI.module_id as module_id,
                                CAI.url_name as url_name,
                                index,
                                weight,
                                assignment_type,
                                chapter_number,
                                section_number,
                                #  assignment_id = assignment_type + ch_chapter_number + sec_section_number
                                CONCAT(assignment_type, "_ch", STRING(chapter_number), "_sec", STRING(section_number)) as assignment_id,  
                                chapter_name,
                                section_name,
                                name,
                                path,
                                start,
                                due,
                                is_split,
                                split_name,
                                parent,
                            FROM 
                            (
                                # course axis entries of things which have non-null grading format, with section_mid from path
                                SELECT module_id,
                                    url_name,
                                    index,
                                    If(data.weight is null, 1.0, data.weight) as weight,
                                    (case when gformat is null then "" else gformat end) as assignment_type,
                                    chapter_mid as chapter_mid,
                                    REGEXP_EXTRACT(path, '^/[^/]+/([^/]+)') as section_mid,
                                    name,
                                    path,
                                    start,
                                    due,
                                    is_split,
                                    split_url_name as split_name,
                                    parent,
                                FROM [{dataset}.course_axis] CAI
                                where 
                                #{disable_gformat} gformat is not null and
                                category = "problem"
                                order by index
                            ) CAI
                            LEFT JOIN  # join course_axis with itself to get chapter_number and section_number
                            (   
                                # get chapters and sections (aka sequentials) with module_id, chapter_number, and section_number
                                # each assignment is identified by assignment_type + chapter_number + section_number
                                # note in some previous calculations, the section_number was left out by mistake
                                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/course_module.py#L1305
                                SELECT module_id, url_name, name as section_name,
                                    max(if(category="chapter", x_chapter_number, null)) over (partition by chapter_mid order by index) as chapter_number,
                                    section_number,
                                    chapter_name,
                                FROM
                                (
                                    SELECT module_id, url_name,
                                        row_number() over (partition by category order by index) as x_chapter_number,
                                        row_number() over (partition by chapter_mid, category order by index) as section_number,
                                        FIRST_VALUE(name) over (partition by chapter_mid order by index) as chapter_name,
                                        index,
                                        category,
                                        name,
                                        if(category="chapter", module_id, chapter_mid) as chapter_mid,
                                    FROM  [{dataset}.course_axis] 
                                    where category = "chapter" or category = "sequential" or category = "videosequence"
                                    order by index
                                )
                                order by index
                            ) CHN
                            # ON CAI.chapter_mid = CHN.chapter_mid  # old, for assignments by chapter
                            ON CAI.section_mid = CHN.url_name     # correct way, for assignments by section (aka sequential)
                            # where gformat is not null
                        ) CAPN
                        LEFT JOIN # join with course_axis to get names of verticals in which problems reside
                        (
                            # get verticals
                            SELECT url_name as vertical_url_name, 
                                name as vertical_name,
                            FROM  [{dataset}.course_axis] 
                            where category = "vertical"
                        ) CAV
                        ON CAPN.parent = CAV.vertical_url_name
                        # ---------------------------------------- END course axis with vertical_name
                    )
                )
                order by index
                # -------------------------------------------------- END graded problems from course axis
            ) CA
            ON PA.problem_id = CA.url_name
        )
    ) CI
    LEFT JOIN 
    {disable_gformat} [{dataset}.grading_policy] GP
    {alternate_gp}
    ON CI.assignment_type = GP.assignment_type
    order by content_index, item_number
)
order by content_index, item_number
    """.format(dataset=dataset, course_id=course_id, disable_gformat=disable_gformat, alternate_gp=alternate_gp)

    # Only list tables the generated SQL actually reads: when the dummy grading
    # policy is used, the grading_policy table is not referenced by the query.
    depends_on = ["%s.course_axis" % dataset]
    if have_grading_policy:
        depends_on.append("%s.%s" % (dataset, GP_TABLE))
    depends_on.append("%s.problem_analysis" % dataset)

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql,
                                    newer_than=datetime.datetime(2015, 10, 31, 17, 0),
                                    depends_on=depends_on,
                                    force_query=force_recompute)
    except Exception:
        # show the SQL which failed, to help with debugging, then propagate the error
        print("[make_course_item_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename))
        print(the_sql)
        raise

    nfound = len(bqdat['data']) if bqdat else 0
    print("--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound))
    sys.stdout.flush()

コード例 #11

ファイルを表示

ファイル: make_idv_features.py プロジェクト: scmc/edx2bigquery

def AnalyzeIDV(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Create the idv_analysis table for one course.

    The table has one row per user, comparing ID-verified (IDV) enrollees with
    non-IDV enrollees: problem and activity counts taken from problem_analysis
    and person_course_day, plus counts of IDV enrollments / certificates in
    other courses, taken from the course_report person_course_viewed table.
    If person_course_viewed cannot be found, the course's own person_course
    table is used as a stand-in.

    Parameters:

    course_id          = course_id string ("org/..."); the part before the first
                         "/" selects the course_report_<org> dataset
    force_recompute    = passed to bqutil.get_bq_table as force_query
    use_dataset_latest = passed to bqutil.course_id2dataset; when true,
                         course_report_latest is used for the course report dataset

    Returns nothing.  Any exception raised by bqutil.get_bq_table is re-raised
    after printing the SQL.

    NOTE(review): the n_previous_* columns count rows with
    PCI.start_time > OC.start_time; confirm that this comparison direction
    is what "previous" is meant to express.
    '''

    tablename = "idv_analysis"
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    print("="*77)
    print("Creating %s.%s table for %s" % (dataset, tablename, course_id))
    print("-"*77)

    org = course_id.split('/',1)[0]
    dataset_cr = ('course_report_%s' % org)
    if use_dataset_latest:
        dataset_cr = 'course_report_latest'

    pcv = "person_course_viewed"
    # Explicit existence check (not an assert, which is removed under "python -O"):
    # both a failed lookup and a None result select the fallback table.
    try:
        have_pcv = bqutil.get_bq_table_info(dataset_cr, pcv) is not None
    except Exception:
        have_pcv = False
    if not have_pcv:
        print(" --> missing %s.%s ;  using dummy instead" % (dataset_cr, pcv))
        sys.stdout.flush()
        dataset_cr = dataset
        pcv = "person_course"

    the_sql = """
# IDV and non-IDV enrollee engagement at point of last IDV enrollment, before course end, including # IDV and certificates in other courses
SELECT
    "{course_id}" as course_id,    
    OC.user_id as user_id,
    OC.username as username,
    (OC.verified_enroll_time is not NULL) as is_idv,
    sum(case when PCI.verified_enroll_time is not NULL then 1 else 0 end) as n_other_idv,
    sum(case when PCI.verified_enroll_time is not NULL and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_idv,
    sum(case when PCI.certified and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_certified,
    sum(case when PCI.viewed and PCI.start_time > OC.start_time then 1 else 0 end) as n_previous_participated,
    sum(case when (PCI.verified_enroll_time is not NULL and PCI.start_time > OC.start_time and PCI.certified) then 1 else 0 end) as n_previous_idv_certified,
    first(gender) as gender,
    first(YoB) as YoB,
    first(LoE) as LoE,
    OC.n_problem_records as n_problem_records,
    OC.n_correct as n_correct,
    OC.n_incorrect as n_incorrect,
    OC.total_problem_points as total_problem_points,
    OC.verified_enroll_time as verified_enroll_time,
    OC.verified_unenroll_time as verified_unenroll_time,
    OC.verified_enroll_date as verified_enroll_date,
    OC.verified_unenroll_date as verified_unenroll_date,
    OC.nforum_pinned as nforum_pinned,
    OC.is_forum_moderator as is_forum_moderator,
    OC.final_course_grade as final_course_grade,
    OC.earned_certificate as earned_certificate,
    OC.n_show_answer as n_show_answer,
    OC.nprogcheck as nprogcheck,
    OC.nvideo as nvideo,
    OC.nforum_reads as nforum_reads,
    OC.nforum_posts as nforum_posts,
    OC.hours_on_system as hours_on_system,
    OC.countryLabel as countryLabel,
    OC.start_time as start_time,
FROM
(
    # engagement stats for NON verified ID versus verified ID, as of the date of the last IDV signup
    SELECT *
    FROM
    (
        # stats for NON verified ID, as of the date of the last IDV signup
        SELECT
        PAC.user_id as user_id,
            PAC.username as username,
            PAC.n_problem_records as n_problem_records,
            PAC.n_correct as n_correct,
            PAC.n_incorrect as n_incorrect,
            PAC.total_problem_points as total_problem_points,
            PAC.verified_enroll_time as verified_enroll_time,
            PAC.verified_unenroll_time as verified_unenroll_time,
            DATE(PAC.verified_enroll_time) as verified_enroll_date,
            DATE(PAC.verified_unenroll_time) as verified_unenroll_date,
            PAC.nforum_pinned as nforum_pinned,
            PAC.is_forum_moderator as is_forum_moderator,
            PAC.final_course_grade as final_course_grade,
            PAC.earned_certificate as earned_certificate,
            PAC.countryLabel as countryLabel,
            PAC.start_time as start_time,
            sum(PCD.nshow_answer) as n_show_answer,
            sum(PCD.nprogcheck) as nprogcheck,
            sum(PCD.nvideo) as nvideo,
            sum(PCD.nforum_reads) as nforum_reads,
            sum(PCD.nforum_posts) as nforum_posts,
            sum(PCD.sum_dt / 60 / 60) as hours_on_system,
        FROM
        (
            # get problem grade and activity counts up to date of verified ID enrollment
            SELECT PA.user_id as user_id,
                PC.username as username,
                count(*) as n_problem_records,
                sum(case when PA.item.correct_bool then 1 else 0 end) as n_correct,
                sum(case when PA.item.correct_bool==False then 1 else 0 end) as n_incorrect,
                sum(PA.grade) as total_problem_points,
                PC.verified_enroll_time as verified_enroll_time,
                PC.verified_unenroll_time as verified_unenroll_time,
                PC.nforum_pinned as nforum_pinned,
                PC.forumRoles_isModerator as is_forum_moderator,
                PC.grade as final_course_grade,
                PC.certified as earned_certificate,
                PC.countryLabel as countryLabel,
                PC.start_time as start_time,
                max_verified_enroll_time,
            FROM [{dataset}.problem_analysis] PA
            JOIN
            (
                SELECT user_id, username, verified_enroll_time, verified_unenroll_time, nforum_pinned,
                    forumRoles_isModerator, grade, certified, max_verified_enroll_time, countryLabel, start_time
                      FROM [{dataset}.person_course] PC
                CROSS JOIN
                (
                    SELECT max(verified_enroll_time) as max_verified_enroll_time
                            FROM [{dataset}.person_course]
                ) VET
                where viewed
            ) PC
            ON PA.user_id = PC.user_id
            where PA.created <= PC.max_verified_enroll_time
                and PC.verified_enroll_time is null
            group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate,
                verified_unenroll_time, max_verified_enroll_time, countryLabel, start_time
            order by user_id
        ) PAC
        JOIN [{dataset}.person_course_day] PCD
            ON PAC.username = PCD.username
        WHERE PCD.date < DATE(max_verified_enroll_time)
        group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate,
                 verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator,
                 verified_enroll_date, verified_unenroll_date, countryLabel, start_time
        order by user_id
    ),
    (
        # stats for those who DID enroll verified ID, as of the date of their IDV enrollment
        # include nprogcheck, nshow_answer, nproblem_check, nvideo, hours_on_system
        SELECT
        PAC.user_id as user_id,
            PAC.username as username,
            PAC.n_problem_records as n_problem_records,
            PAC.n_correct as n_correct,
            PAC.n_incorrect as n_incorrect,
            PAC.total_problem_points as total_problem_points,
            PAC.verified_enroll_time as verified_enroll_time,
            PAC.verified_unenroll_time as verified_unenroll_time,
            DATE(PAC.verified_enroll_time) as verified_enroll_date,
            DATE(PAC.verified_unenroll_time) as verified_unenroll_date,
            PAC.nforum_pinned as nforum_pinned,
            PAC.is_forum_moderator as is_forum_moderator,
            PAC.final_course_grade as final_course_grade,
            PAC.earned_certificate as earned_certificate,
            PAC.countryLabel as countryLabel,
            PAC.start_time as start_time,
            sum(PCD.nshow_answer) as n_show_answer,
            sum(PCD.nprogcheck) as nprogcheck,
            sum(PCD.nvideo) as nvideo,
            sum(PCD.nforum_reads) as nforum_reads,
            sum(PCD.nforum_posts) as nforum_posts,
            sum(PCD.sum_dt / 60 / 60) as hours_on_system,
        FROM
        (
            # get problem grade and activity counts up to date of verified ID enrollment
            SELECT PA.user_id as user_id,
                PC.username as username,
                count(*) as n_problem_records,
                sum(case when PA.item.correct_bool then 1 else 0 end) as n_correct,
                sum(case when PA.item.correct_bool==False then 1 else 0 end) as n_incorrect,
                sum(PA.grade) as total_problem_points,
                PC.verified_enroll_time as verified_enroll_time,
                PC.verified_unenroll_time as verified_unenroll_time,
                PC.nforum_pinned as nforum_pinned,
                PC.forumRoles_isModerator as is_forum_moderator,
                PC.grade as final_course_grade,
                PC.certified as earned_certificate,
                PC.countryLabel as countryLabel,
                PC.start_time as start_time,
            FROM [{dataset}.problem_analysis] PA
            JOIN [{dataset}.person_course] PC
               ON PA.user_id = PC.user_id
            where PA.created <= PC.verified_enroll_time
            group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade,
                     earned_certificate, verified_unenroll_time, countryLabel, start_time
            order by user_id
        ) PAC
        JOIN [{dataset}.person_course_day] PCD
            ON PAC.username = PCD.username
        WHERE PCD.date < DATE(PAC.verified_enroll_time)
        group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate,
                 verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator,
                 verified_enroll_date, verified_unenroll_date, countryLabel, start_time
        order by user_id
    )
    order by verified_enroll_date, user_id
) OC
LEFT JOIN [{dataset_cr}.{pcv}] PCI
on OC.user_id = PCI.user_id
#where (PCI.verified_enroll_time is null) or (PCI.verified_enroll_time <= OC.verified_enroll_time)
group by user_id, username, verified_enroll_time, nforum_pinned, is_forum_moderator, final_course_grade, earned_certificate,
         verified_unenroll_time, n_problem_records, n_correct, n_incorrect, total_problem_points, nforum_pinned, is_forum_moderator,
         verified_enroll_date, verified_unenroll_date,
         n_show_answer, nprogcheck, nvideo, nforum_reads, nforum_posts, hours_on_system, countryLabel, start_time, is_idv
order by verified_enroll_date, user_id
"""

    the_sql = the_sql.format(dataset=dataset, dataset_cr=dataset_cr, pcv=pcv, course_id=course_id)

    # Dependencies are the course tables which the SQL reads.  The first entry was
    # "problem_course", which the query never references; the table it actually
    # reads is person_course.
    depends_on = ["%s.person_course" % dataset,
                  "%s.person_course_day" % dataset,
                  "%s.problem_analysis" % dataset,
                  ]

    try:
        bqutil.get_bq_table(dataset, tablename, the_sql, force_query=force_recompute,
                            depends_on=depends_on,
                            allowLargeResults=True,
                            startIndex=-2)
    except Exception:
        # show the SQL which failed, to help with debugging, then propagate the error
        print("ERROR! Failed on SQL=")
        print(the_sql)
        raise

    print("  --> created %s.%s" % (dataset, tablename))
    sys.stdout.flush()

コード例 #12

ファイルを表示

ファイル: custom_reports.py プロジェクト: CGNx/xanalytics

    def actual_ajax_get_report_data(self, report_name=None):
        '''
        Get the data for a custom report and write it to the response as JSON.

        Parameters like course_id, chapter_id, problem_id are passed in as GET or POST
        parameters; they arrive here in the pdata dict returned by
        custom_report_auth_check, which is also used as the substitution context
        for the report's table name, SQL and depends_on entries.

        Defined parameters for SQL:

        {person_course} --> person_course table for the specific course
        {dataset} --> dataset for the specific course
        {course_report} --> course_report_* dataset for the ORG or latest
        {course_report_org} --> course_report_ORG dataset for ORG = ORGANIZATION_NAME
        {orgname} --> organization name
        {sane_username} --> current user name with ' ', '.' and '@' replaced by '_'
        {person_course__ORG} --> latest person_course table of the course report dataset for ORG
        {course_report__ORG} --> course report dataset for ORG

        report_name: name of the custom report, handed to custom_report_auth_check.

        Returns the result of self.no_auth_sorry() when the auth check fails, and None
        otherwise.  Every other outcome (report data, or an {'error': ...} payload) is
        written to self.response as JSON.

        Raises Exception when sql_flags cannot be parsed as JSON or when depends_on is
        malformed; re-raises SQL preparation errors for non-superusers; errors from
        bqutil.get_bq_table_info propagate.
        '''
        def html_pre(text):
            # escape angle brackets so that the text is displayed literally inside <pre>
            return "<pre>%s</pre>" % text.replace('<', '&lt;').replace('>', '&gt;')

        def send_json(payload):
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(payload))

        def strip_brackets(x):
            # "[dataset.table]" --> "dataset.table"
            x = x.strip()
            if x.startswith('[') and x.endswith(']'):
                return x[1:-1]
            return x

        crm, pdata, auth_ok, auth_msg = self.custom_report_auth_check(report_name)  # crm = CourseReport model
        if not auth_ok:
            return self.no_auth_sorry()

        # meta_info may be empty / None; use one guarded copy everywhere below
        meta_info = crm.meta_info or {}

        course_id = pdata['course_id']
        force_query = pdata.get('force_query', False)
        if force_query == 'false':
            force_query = False
        ignore_cache = pdata.get('ignore_cache', False) or force_query

        if course_id:
            dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest())
            pdata['person_course'] = '[%s.person_course]' % dataset
        elif 'dataset' in meta_info:
            dataset = meta_info['dataset']
        else:
            dataset = self.get_course_report_dataset()
            # using course report dataset; list the tables, to determine which is the latest
            # person_course dataset, and use that for {person_course}
            pdata['person_course'] = '[%s.%s]' % (dataset, self.find_latest_person_course_table(dataset))
        pdata['dataset'] = dataset
        pdata['course_report'] = self.get_course_report_dataset()
        pdata['course_report_org'] = self.get_course_report_dataset(force_use_org=True)
        pdata['orgname'] = self.ORGNAME
        pdata['sane_username'] = self.user.replace(' ', '_').replace('.', '_').replace('@', '_')

        # project_id specified?
        optargs = {}
        if 'project_id' in meta_info:
            optargs['project_id'] = meta_info['project_id']

        # what table?  get custom course report configuration metadata for report name as specified
        table = crm.table_name
        if not table or table == "None":
            send_json({'error': "No table name defined!  Cannot process this custom report"})
            return
        if '{' in table:
            table = table.format(**pdata)
            table = table.replace('-', '_').replace(' ', '_')
        if 'dataset' not in meta_info and not table.startswith('stats_'):
            table = "stats_" + table

        # special handling for person_course table from particular dataset
        for org in re.findall(r'{person_course__([^ \}]+)}', crm.sql):
            org_dataset = self.get_course_report_dataset(orgname=org)
            pcd = '[%s.%s]' % (org_dataset, self.find_latest_person_course_table(org_dataset))
            pdata['person_course__' + org] = pcd
            logging.info('[cr] org=%s, pc=%s.%s' % (org, org_dataset, pcd))

        # special handling for course_report tables for specific orgs
        for org in re.findall(r'{course_report__([^ \}]+)}', crm.sql):
            pdata['course_report__' + org] = self.get_course_report_dataset(orgname=org)

        logging.info("Using %s for custom report %s person_course" % (pdata.get('person_course'), report_name))

        error = None

        # dynamic sql: the SQL is allowed to change based on input parameters
        # do this by treating the SQL as a jinja2 template
        if meta_info.get('dynamic_sql'):
            if 'sql_flags' in pdata and not isinstance(pdata['sql_flags'], dict):
                try:
                    pdata['sql_flags'] = json.loads(pdata['sql_flags'])
                except Exception as err:
                    msg = "Cannot parse sql_flags as JSON!  sql_flags=%s" % pdata['sql_flags']
                    msg += " err=%s" % err      # the exception must be formatted, not concatenated
                    logging.error(msg)
                    raise Exception(msg)

            # a little sanity checking - disallow spaces in any sql_flags values
            # (a request without sql_flags simply has nothing to check)
            sf = pdata.get('sql_flags') or {}
            for k in sf:
                if ' ' in sf[k]:
                    msg = "Illegal sql_flags %s=%s!" % (k, sf[k])
                    msg += "sql_flags = %s" % json.dumps(sf, indent=4)
                    send_json({'error': html_pre(msg)})
                    return

            try:
                the_sql = Template(crm.sql).render(pdata)
            except Exception as err:
                msg = 'Custom report data: failed to render dynamic SQL with pdata=%s, err=%s' % (pdata, err)
                logging.error(msg)
                logging.error('sql=%s' % crm.sql)
                send_json({'error': html_pre(msg)})
                return

            # append username to table name
            table = table + "_%s" % pdata['sane_username']

            force_query = True      # for now, all dynamic_sql is done with force_query
            ignore_cache = True
        else:
            the_sql = crm.sql

        # generate SQL
        try:
            sql = the_sql.format(**pdata)
        except Exception as err:
            msg = "Custom report data: failed to prepare SQL, err=%s" % str(err)
            msg += '\npdata = %s' % pdata
            logging.error(msg)
            if self.is_superuser():
                send_json({'error': html_pre(msg)})
                logging.error("Returning with error message")
                return
            raise

        if meta_info.get('debug_sql'):
            msg = "debug_sql is true; not running SQL.  This is the SQL which would have been run:\n"
            msg += sql
            msg += "\n\nwith these parameters:\n"
            msg += json.dumps(pdata, indent=4)
            msg += "\n\nproducing the output table: %s.%s\n" % (dataset, table)
            send_json({'error': html_pre(msg)})
            return

        # depends_on is stored as a JSON list of (possibly bracketed, parameterized) table names
        try:
            if crm.depends_on and crm.depends_on != "None":
                depends_on = [strip_brackets(x.format(**pdata)) for x in json.loads(crm.depends_on)]
            else:
                depends_on = None
        except Exception:
            logging.error("for course report %s, cannot process depends_on=%s" % (report_name, crm.depends_on))
            raise Exception("Bad format for the 'depends_on' setting in the custom report specification")

        # get the data, and do query if needed

        logging.info('custom report get_report_data name=%s, table=%s.%s, depends_on=%s, pdata=%s' % (report_name, dataset, table, depends_on, pdata))

        the_msg = []

        def my_logger(msg):
            # log as usual, and keep a copy so superusers can see it in the error report
            logging.info(msg)
            the_msg.append(msg)

        try:
            bqdata = self.cached_get_bq_table(dataset, table,
                                              sql=sql,
                                              logger=my_logger,
                                              depends_on=depends_on,
                                              startIndex=int(pdata['start'] or 0),
                                              maxResults=int(pdata['length'] or 100000),
                                              raise_exception=True,
                                              ignore_cache=ignore_cache,
                                              force_query=force_query,
                                              **optargs
            )
            self.fix_bq_dates(bqdata)
        except Exception as err:
            error = html_pre(str(err))
            logging.error('custom report error %s' % error)
            logging.error(err)
            logging.error(traceback.format_exc())
            if self.is_superuser():
                # superusers additionally get the query log, SQL and parameters (all escaped)
                error += html_pre('\n'.join(the_msg))
                error += "SQL: " + html_pre(sql)
                error += "Parameters: " + html_pre(json.dumps(pdata, indent=4))
                error += "optargs: " + html_pre(json.dumps(optargs, indent=4))
            send_json({'error': error})
            return

        tablecolumns = []
        if pdata['get_table_columns']:
            # errors from get_bq_table_info propagate to the caller, as before
            tableinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
            if tableinfo:
                field_names = [x['name'] for x in tableinfo['schema']['fields']]
                tablecolumns = [{'data': x, 'title': x, 'class': 'dt-center'} for x in field_names]

        data = self.common_data.copy()
        data.update({'data': bqdata['data'],
                     'draw': pdata['draw'],
                     'fields': bqdata['fields'],
                     'recordsTotal': bqdata.get('numRows', 0),
                     'recordsFiltered': bqdata.get('numRows', 0),
                     'error': error,
                     'tablecolumns': tablecolumns,
                     'output_table': table,
                     'output_dataset': dataset,
                 })

        send_json(data)

コード例 #13

ファイルを表示

ファイル: make_forum_analysis.py プロジェクト: sibycharley/edx2bigquery

def CreateForumEvents(course_id,
                      force_recompute=False,
                      use_dataset_latest=False,
                      skip_last_day=False,
                      end_date=None):
    '''
    Create forum events table, based on tracking logs.  Extracts all forum-related events, including forum post reads,
    into the date-time ordered table.  Repeated calls to this procedure will append new events to the table.  If no
    new events are found, the existing table is left unchanged.

    course_id: course whose tracking logs are processed; also substituted into the SQL.
    force_recompute, use_dataset_latest, skip_last_day, end_date: passed through unchanged to
        process_tracking_logs.run_query_on_tracking_logs.

    Progress is reported on stdout.  Failure to look up the existing table is not an error:
    it is reported and the query is run anyway.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    table = TABLE_FORUM_EVENTS

    # event_type for forums may be like:
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed
    #  /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline
    #  /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members
    #  /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership
    #     \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"}
    #
    # module_id will be like:
    # "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a"

    # Raw string: the only backslashes are the regex escapes "\." in the WHERE clause, so the
    # value is identical to the non-raw form but no invalid-escape warning is triggered.
    # {course_id} appears in the SELECT list; {DATASETS}, {last_date} and {hash_limit} are
    # left as placeholders here and the string is handed to run_query_on_tracking_logs.
    the_sql = r"""
              SELECT time, 
                     username,
                     '{course_id}' as course_id,
                     (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote"  # does this happen?
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)')
                           when event_type = "edx.forum.thread.created" then "created_thread"
                           when event_type = "edx.forum.response.created" then "created_response"
                           when event_type = "edx.forum.comment.created" then "created_comment"
                           when event_type = "edx.forum.searched" then "searched"
                           else event_type end) as forum_action,
                           (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)') # For old-school courses with transparent course ids
                                      else (case when module_id is null # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null
                                               then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)') # read
                                                      else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)') # upvote, pinned, upvoted, unvoted, deleted, followed
                                                             else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end) # comment
                                                                end) end) end) as thread_id,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id,
                     event_struct.query as search_query,   # unavailable before June 1, 2015
                     event_struct.GET as event_GET,        # unavailable before June 1, 2015
              FROM {DATASETS}
              WHERE  (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                      or event_type contains "/discussion/forum"
                      or event_type contains "/discussion/threads"
                      or event_type contains "/discussion/comments"
                      or event_type contains "list-forum-"
                      or event_type contains "list_forum_"
                      or event_type contains "add-forum-"
                      or event_type contains "add_forum_"
                      or event_type contains "remove-forum-"
                      or event_type contains "remove_forum_"
                      or event_type contains "update_forum_"
                     ) 
                    AND username is not null
                    AND event is not null
                    and time > TIMESTAMP("{last_date}")
                    {hash_limit}
              order by time
              """

    # Report whether the table already exists.  This is informational only; an explicit
    # None check is used instead of `assert`, which is stripped when running with -O.
    problem = None
    try:
        if bqutil.get_bq_table_info(dataset, table) is None:
            problem = "[make_forum_analysis] Creating %s.%s table for %s" % (
                dataset, table, course_id)
    except Exception as err:
        problem = str(err)

    if problem is None:
        print("[make_forum_analysis] Appending latest data to %s.%s table for %s" % (
            dataset, table, course_id))
        sys.stdout.flush()
    else:
        print(problem)
        sys.stdout.flush()
        print(" --> Missing %s.%s?  Attempting to create..." % (dataset, table))
        sys.stdout.flush()

    print("=== Processing Forum Events for %s (start %s)" % (
        course_id, datetime.datetime.now()))
    sys.stdout.flush()

    def gdf(row):
        # row['time'] is converted via float() and read as a UTC epoch timestamp
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(
        the_sql,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        get_date_function=gdf,
        has_hash_limit=True,
        end_date=end_date,
        skip_last_day=skip_last_day)

    print("Done with Forum Events for %s (end %s)" % (course_id,
                                                      datetime.datetime.now()))
    print("=" * 77)
    sys.stdout.flush()

コード例 #14

ファイルを表示

ファイル: make_research_data_tables.py プロジェクト: AbdouSeck/edx2bigquery

    def __init__(self, course_id_set, basedir='', datedir='', output_project_id=None, nskip=0,
                 output_dataset_id=None,
                 output_bucket=None,
                 use_dataset_latest=False,
                 only_step=None,
                 end_date=None,
                 ):
        '''
        Extract Research Datasets, based on defined list of tables.

        For every table name in RESEARCH_DATA_PRODUCTS that exists in the course dataset,
        self.extractResearchData is called (rdp_format='csv').  All of this work happens
        in the constructor; progress is reported on stdout.

        course_id_set: course id(s) for the report; if empty, an error is printed and the
            constructor returns early (before self.org, self.dataset etc. are set).
        basedir, datedir, output_bucket: passed through to extractResearchData
            (output_bucket is also used to build self.gsbucket).
        output_project_id: stored as self.output_project_id.
        nskip: accepted but not used here.
        output_dataset_id: overrides the default course_report_* dataset name.
        use_dataset_latest: use 'course_report_latest' and the "latest" course dataset.
        only_step: step name(s); a comma-separated string is split into a list.
        end_date: stored as self.end_date.
        '''
        if only_step and ',' in only_step:
            only_step = only_step.split(',')
        self.only_step = only_step

        self.end_date = end_date

        if not course_id_set:
            print("ERROR! Must specify list of course_id's for report.  Aborting.")
            return

        org = course_id_set[0].split('/', 1)[0]  # extract org from first course_id
        self.org = org

        self.output_project_id = output_project_id

        crname = 'course_report_latest' if use_dataset_latest else ('course_report_%s' % org)
        self.dataset = output_dataset_id or crname

        self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket)
        self.course_id_set = course_id_set

        # NOTE(review): course_id_set is indexed like a sequence above, yet is passed whole
        # to course_id2dataset here as if it were a single course_id -- confirm with callers.
        course_id = course_id_set
        course_dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

        # research data product (table name) --> (course_id, course_dataset)
        self.rdp_matrix = collections.OrderedDict()

        print("[researchData] Processing data for course %s" % (course_id))
        sys.stdout.flush()
        for rdp in RESEARCH_DATA_PRODUCTS.keys():
            try:
                table = bqutil.get_bq_table_info(course_dataset, rdp)
            except Exception:
                print("[researchData] Err: %s not found for %s dataset" % (rdp, course_id))
                continue
            if table is None:
                continue
            print("[researchData] %s found" % (rdp))
            sys.stdout.flush()
            # Each product is visited once, so a plain assignment suffices.  (The previous
            # "already present" branch called .append() on this tuple and could never work.)
            self.rdp_matrix[str(rdp)] = (course_id, course_dataset)

        # Extract to archival storage
        for product, (product_course_id, the_dataset) in self.rdp_matrix.items():
            self.extractResearchData(course_id=product_course_id, tablename=product, the_dataset=the_dataset,
                                     rdp=product, rdp_format='csv', output_bucket=output_bucket,
                                     basedir=basedir, datedir=datedir)

        print("=" * 100)
        print("Done extracting Research Data tables -> %s" % RESEARCH_DATA_PRODUCTS.keys())
        print("=" * 100)
        sys.stdout.flush()

コード例 #15

ファイルを表示

    def actual_ajax_get_report_data(self, report_name=None):
        '''
        get data for custom report.
        parameters like course_id, chapter_id, problem_id are passed in as GET or POST parameters

        Defined parameters for SQL:

        {person_course} --> person_course table for the specific course
        {dataset} --> dataset for the specific course
        {course_report} --> course_report_* dataset for the ORG or latest
        {course_report_org} --> course_report_ORG dataset for ORG = ORGANIZATION_NAME
        {orgname} --> organization name
        
        '''
        crm, pdata, auth_ok, msg = self.custom_report_auth_check(
            report_name)  # crm = CourseReport model
        if not auth_ok:
            return self.no_auth_sorry()
        course_id = pdata['course_id']
        force_query = pdata.get('force_query', False)
        if force_query == 'false':
            force_query = False
        ignore_cache = pdata.get('ignore_cache', False) or force_query

        # project_id specified?
        optargs = {}
        if 'project_id' in (crm.meta_info or {}):
            optargs['project_id'] = crm.meta_info['project_id']

        if course_id:
            dataset = bqutil.course_id2dataset(
                course_id, use_dataset_latest=self.use_dataset_latest())
            pdata['person_course'] = '[%s.person_course]' % dataset
        elif 'dataset' in (crm.meta_info or {}):
            dataset = crm.meta_info['dataset']
        else:
            dataset = self.get_course_report_dataset()
            # using course report dataset; list the tables, to determine which is the latest
            # person_course dataset, and use that for {person_course}
            pdata[
                'person_course_latest'] = self.find_latest_person_course_table(
                    dataset, project_id=optargs.get('project_id'))
            pdata['person_course'] = '[%s.%s]' % (
                dataset, pdata['person_course_latest'])
        pdata['dataset'] = dataset
        pdata['course_report'] = self.get_course_report_dataset()
        pdata['course_report_org'] = self.get_course_report_dataset(
            force_use_org=True)
        pdata['orgname'] = self.ORGNAME
        pdata['sane_username'] = self.user.replace(' ', '_').replace(
            '.', '_').replace('@', '_')

        if 'module_id' in pdata and pdata['module_id']:
            url_name = pdata['module_id'].rsplit('/', 1)[-1]
            pdata['module_url_name'] = url_name.replace(':', '__').replace(
                '-', '_')

        # what table?  get custom course report configuration metadata for report name as specified
        table = crm.table_name
        if not table or table == "None":
            error = "No table name defined!  Cannot process this custom report"
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(data))
            return

        # multiple table names?  use parameters to select one
        if ',' in table:
            tables = table.split(',')
            try:
                table_number = int(pdata.get('table_number', 0) or 0)
                table = tables[table_number]
            except Exception as err:
                raise Exception(
                    "[custom_reports] Cannot select table from tables=%s, table_number=%s, err=%s"
                    % (tables, pdata.get('table_number'), err))

        # allow parameters in table name
        if '{' in table:
            table = table.format(**pdata)
            table = table.replace('-', '_').replace(' ', '_')
        if not ('dataset' in (crm.meta_info or {})) and not table.startswith(
                'stats_') and not (crm.meta_info.get('no_stats_ok')):
            table = "stats_" + table

        # special handling for person_course table from particular dataset
        for m in re.findall('{person_course__([^ \}]+)}', crm.sql):
            org = m
            org_dataset = self.get_course_report_dataset(orgname=org)
            pcd = '[%s.%s]' % (org_dataset,
                               self.find_latest_person_course_table(
                                   org_dataset,
                                   project_id=optargs.get('project_id')))
            pdata['person_course__' + org] = pcd
            logging.info('[cr] org=%s, pc=%s.%s' % (org, org_dataset, pcd))

        # special handling for course_report tables for specific orgs
        for m in re.findall('{course_report__([^ \}]+)}', crm.sql):
            org = m
            org_dataset = self.get_course_report_dataset(orgname=org)
            pdata['course_report__' + org] = org_dataset

        logging.info("Using %s for custom report %s person_course" %
                     (pdata.get('person_course'), report_name))

        error = None

        def setup_sql_flags():
            if 'sql_flags' in pdata:
                if not type(pdata['sql_flags']) == dict:
                    try:
                        pdata['sql_flags'] = json.loads(pdata['sql_flags'])
                    except Exception as err:
                        msg = "Cannot parse sql_flags as JSON!  sql_flags=%s" % pdata[
                            'sql_flags']
                        msg += err
                        logging.error(msg)
                        raise Exception(msg)

        # dynamic sql: the SQL is allowed to change based on input parameters
        # do this by treating the SQL as a jinja2 tempate
        if crm.meta_info.get('dynamic_sql'):
            setup_sql_flags()
            # a little sanity checking - disallow spaces in any sql_flags values
            sf = pdata['sql_flags']
            for k in sf:
                if ' ' in sf[k]:
                    msg = "Illegal sql_flags %s=%s!" % (k, sf[k])
                    msg += "sql_flags = %s" % json.dumps(sf, indent=4)
                    error = "<pre>%s</pre>" % (msg.replace(
                        '<', '&lt;').replace('<', '&gt;'))
                    data = {'error': error}
                    self.response.headers['Content-Type'] = 'application/json'
                    self.response.out.write(json.dumps(data))
                    return

            try:
                sql_template = Template(crm.sql)
                sql = sql_template.render(pdata)
            except Exception as err:
                msg = 'Custom report data: failed to render dynamic SQL with pdata=%s, err=%s' % (
                    pdata, err)
                logging.error(msg)
                logging.error('sql=%s' % crm.sql)
                error = "<pre>%s</pre>" % (msg.replace('<', '&lt;').replace(
                    '<', '&gt;'))
                data = {'error': error}
                self.response.headers['Content-Type'] = 'application/json'
                self.response.out.write(json.dumps(data))
                return

            # append username to table name
            table = table + "_%s" % pdata['sane_username']

            force_query = True  # for now, all dynamic_sql is done with force_query
            ignore_cache = True
            the_sql = sql
        else:
            the_sql = crm.sql

        if 1:
            # generate SQL and depends_on
            try:
                sql = the_sql.format(**pdata)
            except Exception as err:
                msg = "Custom report data: failed to prepare SQL, err=%s" % str(
                    err)
                msg += '\npdata = %s' % pdata
                logging.error(msg)
                if self.is_superuser():
                    error = "<pre>%s</pre>" % (str(msg).replace(
                        '<', '&lt;').replace('<', '&gt;'))
                    data = {'error': error}
                    self.response.headers['Content-Type'] = 'application/json'
                    self.response.out.write(json.dumps(data))
                    logging.error("Returning with error message")
                    return
                raise

        def strip_brackets(x):
            """
            Return x without surrounding whitespace and, when the trimmed text
            both starts with '[' and ends with ']', without that first and last
            character.
            """
            trimmed = x.strip()
            is_bracketed = trimmed.startswith('[') and trimmed.endswith(']')
            return trimmed[1:-1] if is_bracketed else trimmed

        if crm.meta_info.get('debug_sql'):
            msg = "debug_sql is true; not running SQL.  This is the SQL which would have been run:\n"
            msg += sql
            msg += "\n\nwith these parameters:\n"
            msg += json.dumps(pdata, indent=4)
            msg += "\n\nproducing the output table: %s.%s\n" % (dataset, table)
            error = "<pre>%s</pre>" % (msg.replace('<', '&lt;').replace(
                '<', '&gt;'))
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(data))
            return

        try:
            if crm.depends_on and (not crm.depends_on == "None"):
                depends_on = [
                    strip_brackets(x.format(**pdata))
                    for x in (json.loads(crm.depends_on or "[]"))
                ]
            else:
                depends_on = None
        except Exception as err:
            logging.error(
                "for course report %s, cannot process depends_on=%s" %
                (report_name, crm.depends_on))
            raise Exception(
                "Bad format for the 'depends_on' setting in the custom report specification"
            )
            raise

        # get the data, and do query if needed

        logging.info(
            'custom report get_report_data name=%s, table=%s.%s, depends_on=%s, pdata=%s'
            % (report_name, dataset, table, depends_on, pdata))

        the_msg = []

        def my_logger(msg):
            # Log msg at INFO level and also keep it in the_msg, so the collected
            # messages can be inspected and reported later by the enclosing method.
            logging.info(msg)
            the_msg.append(msg)

        def output_error(err):
            """
            Log err and write it to the HTTP response as JSON: {'error': <html>}.

            err may be an exception or a plain string.  For superusers the
            collected log messages (the_msg), the current SQL, the request
            parameters (pdata) and optargs are appended to the error text.
            Nothing is raised or returned; the caller is expected to return
            right after calling this.
            """

            def esc(value):
                # Format first so that non-string values (exceptions, None) are
                # accepted, then neutralize both angle brackets for the <pre> blocks.
                text = "%s" % (value, )
                return text.replace('<', '&lt;').replace('>', '&gt;')

            error = "<pre>%s</pre>" % esc(err)
            logging.error('custom report error %s' % error)
            logging.error(err)
            logging.error(traceback.format_exc())
            if self.is_superuser():
                error += "<pre>%s</pre>" % esc('\n'.join(the_msg))
                # SQL and parameters can contain '<' / '>' (comparisons, user
                # input), so they are escaped like the message itself
                error += "SQL: <pre>%s</pre>" % esc(sql)
                error += "Parameters: <pre>%s</pre>" % esc(
                    json.dumps(pdata, indent=4))
                error += "optargs: <pre>%s</pre>" % esc(
                    json.dumps(optargs, indent=4))
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'
            self.response.out.write(json.dumps(data))

        # is the request "indexed", meaning only matching rows of the table are to be returned?
        indexed_column = crm.meta_info.get('indexed')
        if indexed_column:
            if type(indexed_column) == list:
                indexed_columns = indexed_column
                try:
                    table_number = int(pdata.get('table_number', 0) or 0)
                    indexed_column = indexed_columns[table_number]
                except Exception as err:
                    raise Exception(
                        "[custom_reports] Cannot select indexed_column from indexed_columns=%s, table_number=%s, err=%s"
                        % (indexed_columns, pdata.get('table_number'), err))
            setup_sql_flags()
            indexed_value = pdata.get('sql_flags', {}).get('indexed_value')
            logging.info(
                "[custom_reports] retrieving %s.%s with indexing on %s to match value %s"
                % (dataset, table, indexed_column, indexed_value))
            if not indexed_value:
                my_logger(
                    'Error: missing sql_flags.indexed_value to match indexed column %s in %s.%s'
                    % (indexed_column, dataset, table))
                data = {'error': msg}
                self.response.headers['Content-Type'] = 'application/json'
                self.response.out.write(json.dumps(data))
                return
            # ensure existence of indexed version of table.  By convention, that is a table named tablename + "__indexed_" + indexed_column
            # the table has a SHA1 hash of the indexed column added, and is sorted according to the last few characters
            # of the SHA1 hash.
            indexed_table = table + "__indexed_" + indexed_column
            indexing_sql_template = """select *,
                                  SUBSTR(TO_BASE64(SHA1(STRING({indexed_column}))),-3,2) as index_sha1_2ch,
                                  ROW_NUMBER() over (order by index_sha1_2ch, {indexed_column}) as index_row_number{subnum},
                              from [{dataset}.{table}]
                              {where_clause}
                              order by index_sha1_2ch, {indexed_column}
                           """
            indexing_sql = indexing_sql_template.format(
                dataset=dataset,
                table=table,
                indexed_column=indexed_column,
                where_clause="",
                subnum="")
            try:
                bqdata = self.cached_get_bq_table(
                    dataset,
                    indexed_table,
                    sql=indexing_sql,
                    logger=my_logger,
                    depends_on=["%s.%s" % (dataset, table)],
                    raise_exception=True,
                    ignore_cache=ignore_cache,
                    force_query=force_query,
                    startIndex=0,
                    maxResults=1,
                    **optargs)
            except Exception as err:
                if "Response too large to return" in str(the_msg):
                    # hmm - table too large!  can't workaround using allowLargeResult because the return results
                    # need to be ordered.  So let's break it up into multiple queries, appending each,
                    # by index_sha1_2ch
                    b64chars = "+/0123456789" + ''.join(
                        map(chr, range(ord('A'),
                                       ord('Z') + 1))) + ''.join(
                                           map(chr,
                                               range(ord('a'),
                                                     ord('z') + 1)))
                    # get table size, divide by 64M, to get number of divisions to use
                    tinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
                    nbytes = int(tinfo['numBytes'])
                    ndivs = int(round(nbytes / (64 * 1024 * 1024)))
                    logging.info(
                        "Response too large - nbytes=%s, so trying ndivs=%s" %
                        (nbytes, ndivs))
                    end_idx = None
                    start_idx = None
                    dn = int(64 / ndivs)
                    offset = dn
                    overwrite = True
                    nrows = 0
                    while (offset < 65):
                        start_idx = end_idx
                        last_row_index = nrows  # note ROW_NUMBER() starts with 1 (not zero)
                        if (offset < 64):
                            end_idx = b64chars[offset] + "+"
                        else:
                            end_idx = None  # boundary case
                        wc = "where "
                        if start_idx:
                            wc += '(SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) >= "%s") ' % (
                                indexed_column, start_idx)
                        else:
                            wc += "True "
                        if end_idx:
                            wc += 'AND (SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) < "%s") ' % (
                                indexed_column, end_idx)
                        logging.info(
                            "--> start_idx=%s, end_idx=%s, starting row %d" %
                            (start_idx, end_idx, last_row_index))
                        tmp_sql = indexing_sql_template.format(
                            dataset=dataset,
                            table=table,
                            indexed_column=indexed_column,
                            where_clause=wc,
                            subnum="_sub")
                        indexing_sql = "SELECT *, index_row_number_sub + %d as index_row_number FROM (%s)" % (
                            last_row_index, tmp_sql)
                        try:
                            bqutil.create_bq_table(dataset,
                                                   indexed_table,
                                                   sql=indexing_sql,
                                                   overwrite=overwrite,
                                                   logger=my_logger,
                                                   **optargs)
                            cnt = 0
                            tinfo = None
                            while (not tinfo) and (cnt < 10):
                                tinfo = bqutil.get_bq_table_info(
                                    dataset, indexed_table, **optargs)
                                if not tinfo:
                                    logging.info(
                                        "==> ERROR?  got unexpected None for get_bq_table_info %s.%s"
                                        % (dataset, indexed_table))
                                    time.sleep(10)
                                    cnt += 1

                            nrows = int(tinfo['numRows'])
                            logging.info(
                                "--> Result from %s to %s has %d rows" %
                                (start_idx, end_idx, nrows))
                        except Exception as err:
                            bqdata = {'data': None}
                            sql = indexing_sql
                            output_error(err)
                            return
                        overwrite = "append"
                        offset += dn

                else:
                    bqdata = {'data': None}
                    sql = indexing_sql
                    output_error(err)
                    return

            # now ensure table index, and retrieve it.  It has just two columns: index_sha1_2ch, start_row
            tindex_table = table + "__index_for_" + indexed_column
            tindex_sql = """SELECT index_sha1_2ch, 
                                min(index_row_number) as start_row,
                                # max(index_row_number) as end_row,   # don't need this - just take next start_row
                            FROM [{dataset}.{indexed_table}]
                            group by index_sha1_2ch
                            order by index_sha1_2ch
                         """.format(dataset=dataset,
                                    indexed_table=indexed_table)
            try:
                bqdata = self.cached_get_bq_table(
                    dataset,
                    tindex_table,
                    sql=tindex_sql,
                    logger=my_logger,
                    depends_on=["%s.%s" % (dataset, indexed_table)],
                    raise_exception=True,
                    ignore_cache=ignore_cache,
                    force_query=force_query,
                    startIndex=0,
                    maxResults=10000,
                    **optargs)
            except Exception as err:
                bqdata = {'data': None}
                sql = tindex_sql
                output_error(err)
                return

            # find the start and end rows to retrieve, based the last characters of the SHA1 hash of the indexed value
            sha1_2ch = base64.b64encode(
                hashlib.sha1(indexed_value).digest())[-3:-1]
            start_row = None
            end_row = None
            for k in bqdata['data']:
                if start_row and not end_row:
                    end_row = int(k['start_row'])
                if (k['index_sha1_2ch'] == sha1_2ch):
                    start_row = int(k['start_row'])
            logging.info(
                "Retrieving iv=%s, sha1_2ch=%s, rows %s to %s of %s.%s" %
                (indexed_value, sha1_2ch, start_row, end_row, dataset,
                 indexed_table))
            if not start_row:
                output_error(
                    "Cannot find %s=%s in %s.%s" %
                    (indexed_column, indexed_value, dataset, indexed_table))
                bqdata = {'data': None}
                return

            max_results = (end_row or (start_row + 4000)) - start_row
            bqdata = self.cached_get_bq_table(dataset,
                                              indexed_table,
                                              ignore_cache=True,
                                              startIndex=start_row - 1,
                                              maxResults=max_results,
                                              **optargs)

            # extract just the row(s) with indexed_column value matching indexed_value (the hash is many to one)
            newdata = []
            for k in range(len(bqdata['data'])):
                datum = bqdata['data'][k]
                if (datum[indexed_column] == indexed_value):
                    newdata.append(datum)
            logging.info("--> Result has %d items, of which %d match" %
                         (len(bqdata['data']), len(newdata)))
            bqdata['data'] = newdata

            table = indexed_table  # so that columns are retrieved properly

        if not indexed_column:
            # retrieve full table
            try:
                bqdata = self.cached_get_bq_table(
                    dataset,
                    table,
                    sql=sql,
                    logger=my_logger,
                    depends_on=depends_on,
                    startIndex=int(pdata['start'] or 0),
                    maxResults=int(pdata['length'] or 100000),
                    raise_exception=True,
                    ignore_cache=ignore_cache,
                    force_query=force_query,
                    **optargs)
                self.fix_bq_dates(bqdata)
            except Exception as err:
                bqdata = {'data': None}
                output_error(err)
                return

        tablecolumns = []
        if pdata['get_table_columns']:
            try:
                tableinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
            except Exception as err:
                error = (error or "\n") + str(err)
                tableinfo = None
                raise

            if tableinfo:
                fields = tableinfo['schema']['fields']
                field_names = [x['name'] for x in fields]
                tablecolumns = [{
                    'data': x,
                    'title': x,
                    'class': 'dt-center'
                } for x in field_names]

        data = self.common_data.copy()
        data.update({
            'data': bqdata['data'],
            'draw': pdata['draw'],
            'last_modified_date': str(bqdata.get('last_modified_date')),
            'fields': bqdata['fields'],
            'recordsTotal': bqdata.get('numRows', 0),
            'recordsFiltered': bqdata.get('numRows', 0),
            'error': error,
            'tablecolumns': tablecolumns,
            'output_table': table,
            'output_dataset': dataset,
        })

        # logging.info('[cr] data=%s' % data)

        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json.dumps(data))

コード例 #16

ファイルを表示

ファイル: make_irt_report.py プロジェクト: scmc/edx2bigquery

def make_irt_report(course_id,
                    force_recompute=False,
                    use_dataset_latest=False):
    '''
    Build the item_response_theory_report table for one course.

    Looks at the known IRT result tables (item_irt_grm, item_irt_grm_R) in the
    course dataset, uses the one with the largest lastModifiedTime, joins it
    with course_item and course_problem (and with item_reliabilities, when
    that table exists), and creates the report table via bqutil.get_bq_table.
    A one-line summary with the number of rows found is printed at the end.

    course_id          = course whose dataset is obtained from bqutil.course_id2dataset
    force_recompute    = passed to bqutil.get_bq_table as force_query
    use_dataset_latest = passed to bqutil.course_id2dataset

    Raises Exception if none of the IRT tables is available.  Any error from
    bqutil.get_bq_table is re-raised after printing the SQL which failed.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    # column list used when item_reliabilities is available
    the_sql_alpha = """
    IR.itemtestcorr as item_test,
    IR.itemrestcorr as item_rest,
    IR.alpha as alpha,
    """

    # placeholder columns, so the report schema is the same without item_reliabilities
    the_sql_no_alpha = """
    null as item_test,
    null as item_rest,
    null as alpha,
    """

    the_sql_alpha_join = """
    JOIN [{dataset}.item_reliabilities] IR
    on IR.item = CP.problem_yid
    """.format(dataset=dataset)

    the_sql = """
# item_response_theory_report for {course_id}
#
# problem_nid,problem_short_id,chapter,assignment_type,problem_label,problem_id,IRT item number,avg_problem_raw_score,avg_problem_pct_score,
# n_unique_users_attempted,item_test,item_rest,alpha ,Discrimination,Difficulty

SELECT 
    "{course_id}" as course_id,
    IG.problem_nid as problem_nid,
    CP.problem_short_id as problem_short_id,
    CI.chapter_name as chapter,
    assignment_type,
    CONCAT("[", STRING(IG.problem_nid), "] ", CI.chapter_name, " / ", CI.section_name, " / ", CP.problem_name) as problem_label,
    CP.problem_id as problem_id,
    CONCAT(STRING(CP.problem_nid), "/", STRING(cutnum)) as IRT_item_number,
    CP.avg_problem_raw_score avg_problem_raw_score,
    CP.avg_problem_pct_score avg_problem_pct_score,
    CP.n_unique_users_attempted n_unique_users_attempted,
    {sql_alpha}
    irt_diff as Difficulty,
    irt_disc as Discrimination,
    diff_se as Difficulty_SE,
    disc_se as Discrimination_SE,
    "{irt_method}" as irt_method,

FROM [{dataset}.{item_irt_grm}] IG
JOIN [{dataset}.course_item] CI
on IG.problem_nid = CI.problem_nid
JOIN 
(
    SELECT *, CONCAT("y", STRING(problem_nid)) as problem_yid,
    FROM [{dataset}.course_problem]
) CP
on IG.problem_nid = CP.problem_nid
{sql_alpha_join}
where CI.item_number = 1
    """

    tablename = "item_response_theory_report"
    RELIABILITIES_TABLE = "item_reliabilities"
    # candidate IRT tables, mapped to the label reported in the irt_method column
    IRT_TABLES = OrderedDict([
        ("item_irt_grm", "STATA GRM"),
        ("item_irt_grm_R", "R mirt GRM"),
    ])

    def table_info_or_none(name):
        # Best-effort probe: a failed lookup is deliberately treated the same
        # as a table which does not exist.
        try:
            return bqutil.get_bq_table_info(dataset, name)
        except Exception:
            return None

    irt_table_to_use = None
    irt_table_date = None

    # use newest of the existing IRT tables
    for irt_tablename in IRT_TABLES:
        tinfo = table_info_or_none(irt_tablename)
        if tinfo is None:
            continue
        lmt = tinfo.get('lastModifiedTime')
        # NOTE(review): lastModifiedTime values are compared as returned by
        # bqutil; confirm they are of a type which orders chronologically
        if lmt and (not irt_table_date or lmt > irt_table_date):
            irt_table_date = lmt
            irt_table_to_use = irt_tablename
        else:
            print(
                "[make_irt_report] Not using IRT table %s (date %s) - older than %s (date %s)"
                % (irt_tablename, lmt, irt_table_to_use, irt_table_date))

    if not irt_table_to_use:
        raise Exception(
            "[make_irt_report] Cannot generate IRT report; requires one of %s"
            % (','.join(IRT_TABLES.keys())))

    # SQL changes depending on whether item_reliabilities exists or not
    have_reliabilities = table_info_or_none(RELIABILITIES_TABLE) is not None

    if have_reliabilities:
        sql_alpha = {
            'sql_alpha': the_sql_alpha,
            "sql_alpha_join": the_sql_alpha_join
        }
    else:
        sql_alpha = {'sql_alpha': the_sql_no_alpha, "sql_alpha_join": ""}

    the_sql = the_sql.format(dataset=dataset,
                             course_id=course_id,
                             item_irt_grm=irt_table_to_use,
                             irt_method=IRT_TABLES[irt_table_to_use],
                             **sql_alpha)

    depends_on = [
        "%s.course_item" % dataset,
        "%s.course_problem" % dataset,
        "%s.%s" % (dataset, irt_table_to_use),
    ]

    if have_reliabilities:
        depends_on.append("%s.%s" % (dataset, RELIABILITIES_TABLE))

    try:
        # NOTE(review): newer_than and startIndex=-2 are passed through to
        # bqutil.get_bq_table unchanged; their exact meaning is defined there
        bqdat = bqutil.get_bq_table(dataset,
                                    tablename,
                                    the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    newer_than=datetime.datetime(
                                        2016, 9, 27, 14, 48),
                                    startIndex=-2)
    except Exception:
        print(
            "[make_irt_report] ERR! failed in creating %s.%s using this sql:" %
            (dataset, tablename))
        print(the_sql)
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print("--> Done with %s for %s, %d problem items found" %
          (tablename, course_id, nfound))
    sys.stdout.flush()

コード例 #17

ファイルを表示

ファイル: make_item_tables.py プロジェクト: lstmemery/edx2bigquery

def create_course_item_table(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    the course_item dataset has these columns:

    Field Name                              Type    Example         Description
    item_id                                 string  i4x-MITx-8_MReV-problem-CheckPoint_1_Newton_s_First_Law_2_1     
                                                                    Unique ID for an assessment item (constructed using the problem module_id, and linked to problem_analysis table keys)
    problem_id                              string  CheckPoint_1_Newton_s_First_Law 
                                                                    Unique ID for an assessment problem (constructed using problem url_name)
    problem_nid                             integer 27              unique problem numerical id (equal to the sequential count of problems up to this one)
    assignment_short_id                     string  HW_4            Unique short ID for assignment, using assignment short name + "_" + assignment_seq_num (should be same as what shows up in user's edX platform progress page)
    item_weight                             float   6.59E-05        Fraction of overall grade (between 0 and 1) contributed by this item
    n_user_responses                        integer 4868            Number of users who provided a response to this assessment item
    problem_name                            string  CheckPoint 1: Newton's First Law        
                                                                    Name of problem within which this item exists
    chapter_name                            string  Chapter 1       Name of chapter within which the problem exists
    section_name                            string  Section 1       Name of section (aka sequential) within which the problem exists
    assignment_id                           string  Checkpoint_ch3  Unique ID for the assignment within which the problem exists
    n_problems_in_assignment                integer 23              Number of problems within the assignment
    assignment_type                         string  Checkpoint      The assignment type within which the assignment exists
    assignment_type_weight                  float   0.1             Fraction of the overall grade contributed by the assignment type
    n_assignments_of_type                   integer 11              Number of assignments of this type
    assignment_seq_num                      integer 3               Sequential number of the assignment_type within the course
    chapter_number                          integer 3               Number of the chapter within which the problem exists
    section_number                          integer 3               Number of the section (aka sequential) within which the problem exists
    content_index                           integer 141             Index number of the problem within the content course axis
    problem_weight                          integer 1               Weight of the problem within the assignment
    item_points_possible                    float   1               Always 1 (used for debugging - number of points assigned to an item)
    problem_points_possible                 integer 6               Always equal to the number of items in the assignment (used for debugging)
    emperical_item_points_possible          integer 1               Emperical value of point value of item, based on user data in problem_analysis table (for debugging)
    emperical_problem_points_possible       integer 6               Emperical value of maximum number of points possible for problem based on problem_analysis (for debugging)
    item_number                             integer 1               Number of the item, within the problem (in order of presentation, starting from 1)
    n_items                                 integer 6               Number of items within the problem
    start_date                              date    2013-06-01 00:01:00 UTC 
                                                                    Date when problem was issued
    due_date                                date    2013-06-23 23:59:00 UTC 
                                                                    Date when problem was due
    problem_path                            string  /Unit_1/Newtons_First_Law/2/1   
                                                                    Path of problem within course content, specifying chapter and sequential
    problem_short_id                        string  HW_7__3         short (and unique) problem ID, made using assignment short ID + "__" + problem number
    item_short_id                           string  HW_7__3_1       short (and unique) item ID, made using problem short ID + "_" + item number
    item_nid                                integer 41              unique item numerical id (equal to the row number of this entry in the course_itm table)
    cumulative_item_weight                  float   6.59E-05        Cumulative fraction of item weights (for debugging: should increase to 1.0 by the end of table)
    is_split                                boolean False           Boolean flag indicating if this item was within an A/B split_test or not
    split_name                              string  CircMotionAB    Name of the split_test within which this item is placed, if is_split is True

    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    tablename = "course_item"

    # determine if grading_policy exists or not
    GP_TABLE = "grading_policy"
    have_grading_policy = False
    try:
        tinfo = bqutil.get_bq_table_info(dataset, GP_TABLE)
        assert tinfo is not None, "%s.%s does not exist" % ( dataset, GP_TABLE )
        if tinfo is not None:
            have_grading_policy = True
    except Exception as err:
        pass

    # change SQL if grading_policy doesn't exist
    if have_grading_policy:
        disable_gformat = ""
        alternate_gp = ""
    else:
        print "Warning - grading_policy doest NOT exist, using a dummy grading policy instead, and allowing gformat=null"
        sys.stdout.flush()
        disable_gformat = "#"
        alternate_gp = '( SELECT "" as assignment_type, 1.0 as fraction_of_overall_grade, "none" as short_label ) GP'

    the_sql = """
SELECT 
    # '{course_id}' as course_id,
    *,
    CONCAT(assignment_short_id, "__", STRING(problem_number)) as problem_short_id,
    CONCAT(assignment_short_id, "__", STRING(problem_number), "_", STRING(item_number)) as item_short_id,
    row_number() over (order by content_index, item_number) as item_nid,
    sum(item_weight) over (order by content_index, item_number) cumulative_item_weight
FROM
(
    # items with additional data about fraction_of_overall_grade from grading_policy
    SELECT item_id, 
        problem_id,
        max(if(item_number=1, x_item_nid, null)) over (partition by problem_id) as problem_nid,
        CONCAT((case when GP.short_label is null then "" else GP.short_label end),
               "_", STRING(assignment_seq_num)) as assignment_short_id,
        (problem_weight * (case when GP.fraction_of_overall_grade is null then 1.0 else GP.fraction_of_overall_grade end)
             / n_items / sum_problem_weight_in_assignment / n_assignments_of_type) as item_weight,
        n_user_responses,
        chapter_name,
        section_name,
        vertical_name,
        problem_name,
        CI.assignment_id as assignment_id,
        n_problems_in_assignment,
        CI.assignment_type as assignment_type,
        (case when GP.fraction_of_overall_grade is null then 1.0 else GP.fraction_of_overall_grade end) as assignment_type_weight,
        n_assignments_of_type,
        assignment_seq_num,
        chapter_number,
        content_index,
        section_number,
        problem_number,
        problem_weight,
        item_points_possible,
        problem_points_possible,
        emperical_item_points_possible,
        emperical_problem_points_possible,
        item_number,
        n_items,
        start_date,
        due_date,
        is_split,
        split_name,
        problem_path,
    FROM
    (
        # items with number of problems per assignment
        SELECT item_id, item_number,
            n_items,
            problem_id,
            row_number() over (partition by item_number order by content_index) as x_item_nid,
            n_user_responses,
            chapter_name,
            section_name,
            vertical_name,
            problem_name,
            assignment_id,
            sum(if(assignment_id is not null and item_number=1, 1, 0)) over (partition by assignment_id) n_problems_in_assignment,
            sum(if(assignment_id is not null and item_number=1, problem_weight, 0)) 
                over (partition by assignment_id) sum_problem_weight_in_assignment,
            assignment_type,
            n_assignments_of_type,
            assignment_seq_num,
            chapter_number,
            section_number,
            problem_number,
            problem_path,
            content_index,
            start_date,
            due_date,
            is_split,
            split_name,
            problem_weight,
            item_points_possible,
            problem_points_possible,
            emperical_item_points_possible,
            emperical_problem_points_possible,
        FROM
        (
            # items from problem_analysis with metadata from course_axis
            SELECT item_id, item_number,
                n_items,
                problem_id,
                n_user_responses,
                CA.name as problem_name,
                chapter_name,
                section_name,
                vertical_name,
                assignment_id,
                assignment_type,
                n_assignments_of_type,
                CA.assignment_seq_num as assignment_seq_num,
                CA.chapter_number as chapter_number,
                CA.section_number as section_number,
                CA.problem_number as problem_number,
                CA.path as problem_path,
                CA.index as content_index,
                CA.start as start_date,
                CA.due as due_date,
                CA.is_split as is_split,
                CA.split_name as split_name,
                if(CA.weight is null, 1.0, CA.weight) as problem_weight,
                item_points_possible,
                problem_points_possible,
                emperical_item_points_possible,
                emperical_problem_points_possible,
            FROM
            (
                # get items with item metadata from problem_analysis table
                SELECT item_id, item_number,
                    n_items,
                    problem_id,
                    n_user_responses,
                    1.0 as item_points_possible,
                    1.0 * n_items as problem_points_possible,
                    problem_points_possible / n_items as emperical_item_points_possible,
                    problem_points_possible as emperical_problem_points_possible,
                FROM
                (
                    SELECT item_id, item_number,
                        max(item_number) over (partition by problem_id) n_items,
                        problem_id,
                        problem_points_possible,
                        n_user_responses,
                    FROM
                    (
                        SELECT item_id,
                            row_number() over (partition by problem_id order by item_id) item_number,
                            problem_id,
                            problem_points_possible,
                            n_user_responses,
                        FROM
                        (
                            SELECT item.answer_id as item_id,
                                problem_url_name as problem_id,
                                max_grade as problem_points_possible,
                                count(*) as n_user_responses,
                            FROM [{dataset}.problem_analysis]
                            group by item_id, problem_id, problem_points_possible
                            having n_user_responses > 5   # minimum cutoff for an item to be included
                        )
                    )
                )
                order by item_id, item_number
            ) as PA
            JOIN 
            (
                # -------------------------------------------------- graded problems from course axis
                # master table of graded problems from course_axis, with assignment metadata
                SELECT module_id,
                    url_name,
                    index,
                    weight,
                    assignment_type,
                    MAX(IF(problem_number=1, x_assignment_seq_num, null)) over (partition by assignment_id) as assignment_seq_num,
                    problem_number,
                    assignment_id,
                    n_assignments_of_type,
                    chapter_name,
                    section_name,
                    vertical_name,
                    name,
                    path,
                    start,
                    due,
                    is_split,
                    split_name,
                    chapter_number,
                    section_number,
                FROM
                (
                    # course_axis with chapter number and number of assignments of type
                    SELECT *,  # add column with number of assignments of type
                        SUM(IF(problem_number=1, 1, 0)) over (partition by assignment_type) n_assignments_of_type,
                        row_number() over (partition by assignment_type, problem_number order by index) as x_assignment_seq_num,
                    FROM
                    (
                        # ---------------------------------------- course axis with vertical name
                        SELECT module_id,
                            url_name,
                            index,
                            weight,
                            assignment_type,
                            chapter_number,
                            section_number,
                            assignment_id,  
                            chapter_name,
                            section_name,
                            vertical_name,
                            name,
                            path,
                            start,
                            due,
                            is_split,
                            split_name,
                            # add column with problem number within assignment_id
                            row_number() over (partition by assignment_id order by index) problem_number,
                        FROM
                        (
                            # course axis of problems which have non-null grading_format, including chapter number
                            # and section (aka sequential) number (within the chapter)
                            SELECT CAI.module_id as module_id,
                                CAI.url_name as url_name,
                                index,
                                weight,
                                assignment_type,
                                chapter_number,
                                section_number,
                                #  assignment_id = assignment_type + ch_chapter_number + sec_section_number
                                CONCAT(assignment_type, "_ch", STRING(chapter_number), "_sec", STRING(section_number)) as assignment_id,  
                                chapter_name,
                                section_name,
                                name,
                                path,
                                start,
                                due,
                                is_split,
                                split_name,
                                parent,
                            FROM 
                            (
                                # course axis entries of things which have non-null grading format, with section_mid from path
                                SELECT module_id,
                                    url_name,
                                    index,
                                    If(data.weight is null, 1.0, data.weight) as weight,
                                    (case when gformat is null then "" else gformat end) as assignment_type,
                                    chapter_mid as chapter_mid,
                                    REGEXP_EXTRACT(path, '^/[^/]+/([^/]+)') as section_mid,
                                    name,
                                    path,
                                    start,
                                    due,
                                    is_split,
                                    split_url_name as split_name,
                                    parent,
                                FROM [{dataset}.course_axis] CAI
                                where 
                                #{disable_gformat} gformat is not null and
                                category = "problem"
                                order by index
                            ) CAI
                            LEFT JOIN  # join course_axis with itself to get chapter_number and section_number
                            (   
                                # get chapters and sections (aka sequentials) with module_id, chapter_number, and section_number
                                # each assignment is identified by assignment_type + chapter_number + section_number
                                # note in some previous calculations, the section_number was left out by mistake
                                # see https://github.com/edx/edx-platform/blob/master/common/lib/xmodule/xmodule/course_module.py#L1305
                                SELECT module_id, url_name, name as section_name,
                                    max(if(category="chapter", x_chapter_number, null)) over (partition by chapter_mid order by index) as chapter_number,
                                    section_number,
                                    chapter_name,
                                FROM
                                (
                                    SELECT module_id, url_name,
                                        row_number() over (partition by category order by index) as x_chapter_number,
                                        row_number() over (partition by chapter_mid, category order by index) as section_number,
                                        FIRST_VALUE(name) over (partition by chapter_mid order by index) as chapter_name,
                                        index,
                                        category,
                                        name,
                                        if(category="chapter", module_id, chapter_mid) as chapter_mid,
                                    FROM  [{dataset}.course_axis] 
                                    where category = "chapter" or category = "sequential" or category = "videosequence"
                                    order by index
                                )
                                order by index
                            ) CHN
                            # ON CAI.chapter_mid = CHN.chapter_mid  # old, for assignments by chapter
                            ON CAI.section_mid = CHN.url_name     # correct way, for assignments by section (aka sequential)
                            # where gformat is not null
                        ) CAPN
                        LEFT JOIN # join with course_axis to get names of verticals in which problems reside
                        (
                            # get verticals
                            SELECT url_name as vertical_url_name, 
                                name as vertical_name,
                            FROM  [{dataset}.course_axis] 
                            where category = "vertical"
                        ) CAV
                        ON CAPN.parent = CAV.vertical_url_name
                        # ---------------------------------------- END course axis with vertical_name
                    )
                )
                order by index
                # -------------------------------------------------- END graded problems from course axis
            ) CA
            ON PA.problem_id = CA.url_name
        )
    ) CI
    LEFT JOIN 
    {disable_gformat} [{dataset}.grading_policy] GP
    {alternate_gp}
    ON CI.assignment_type = GP.assignment_type
    order by content_index, item_number
)
order by content_index, item_number
    """.format(dataset=dataset, course_id=course_id, disable_gformat=disable_gformat, alternate_gp=alternate_gp)

    depends_on = [ "%s.course_axis" % dataset,
                   "%s.grading_policy" % dataset,
                   "%s.problem_analysis" % dataset
               ]

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql, 
                                    newer_than=datetime.datetime(2015, 10, 31, 17, 00),
                                    depends_on=depends_on,
                                    force_query=force_recompute)
    except Exception as err:
        print "[make_course_item_table] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename)
        print the_sql
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = len(bqdat['data'])
    print "--> Done with %s for %s, %d entries found" % (tablename, course_id, nfound)
    sys.stdout.flush()

コード例 #18

ファイルを表示

def _bq_tables_exist(dataset, tables):
    '''
    Return True only if every table named in `tables` can be found in `dataset`.

    A table counts as missing when bqutil.get_bq_table_info returns None or
    raises.  The probe is deliberately best-effort: any failure simply means
    the optional statistics which depend on that table are left out.
    '''
    for table in tables:
        try:
            tinfo = bqutil.get_bq_table_info(dataset, table)
        except Exception:
            # the lookup itself failed; treat the table as unavailable
            return False
        if tinfo is None:
            return False
    return True


def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None, 
                   check_dates=True, skip_last_day=False):
    '''
    Make {course_id}.person_course_day table for specified course_id.

    This is a single course-specific table, which contains all day's data.
    It is incrementally updated when new tracking logs data comes in,
    by appending rows to the end.  The rows are kept in time order.

    The query is assembled from fragments.  Video, forum and problem statistics
    are computed only when the tables they need exist in the course dataset;
    otherwise constant zero columns are emitted so the output schema is the same
    either way.

    course_id           = course to process
    force_recompute     = passed on to run_query_on_tracking_logs; also forced to True
                          here when an existing person_course_day table lacks the
                          nvideos_viewed column (i.e. it has the old schema)
    use_dataset_latest  = used to pick the dataset, and passed on to run_query_on_tracking_logs
    end_date            = passed on to run_query_on_tracking_logs

    check_dates is disregarded.

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    # Probe for the optional source tables.  Explicit checks are used instead of
    # assert statements, because asserts are removed when python runs with -O.
    videoTableExists = _bq_tables_exist(dataset, ['video_stats_day'])
    if not videoTableExists:
        print("    --> Video stats table missing... Not including video stats")

    forumTableExists = _bq_tables_exist(dataset, ['forum_events'])
    if not forumTableExists:
        print("    --> Forum events table missing... Not including forum stats")

    # problem stats need all four of these tables
    problemTableExists = _bq_tables_exist(dataset, ['person_problem', 'course_problem',
                                                    'course_axis', 'person_course'])
    if not problemTableExists:
        print("    --> Problem tables (person_problem, course_problem, course_axis, person_course) incomplete... Not including problem stats")
    sys.stdout.flush()

    PCDAY_SQL_BASE_SELECT = """
			  SELECT username,
				 '{course_id}' AS course_id,
				 DATE(time) AS date,
				 SUM(bevent) AS nevents,
				 SUM(bprogress) AS nprogcheck,
				 SUM(bshow_answer) AS nshow_answer,
				 SUM(bvideo) AS nvideo,
				 SUM(bproblem_check) AS nproblem_check,
				 SUM(bforum) AS nforum,
				 SUM(bshow_transcript) AS ntranscript,
				 SUM(bseq_goto) AS nseq_goto,
				 SUM(bseek_video) AS nseek_video,
				 SUM(bpause_video) AS npause_video,
		    """

    # Each optional group of columns has an "exists" variant computing real values
    # and a "does not exist" (DNE) variant emitting zeros under the same names.
    PCDAY_SQL_VIDEO_EXISTS = """
			  	 COUNT(DISTINCT video_id) AS nvideos_viewed, # New Video - Unique videos viewed
				 SUM(case when position is not null then FLOAT(position) else FLOAT(0.0) end) AS nvideos_watched_sec, # New Video - # sec watched using max video position
		    """

    PCDAY_SQL_VIDEO_DNE = """
				 0 AS nvideos_viewed, # New Video - Unique videos viewed
				 FLOAT(0.0) AS nvideos_watched_sec, # New Video - # sec watched using max video position
		    """
    PCDAY_SQL_VIDEO_SELECT = PCDAY_SQL_VIDEO_EXISTS if videoTableExists else PCDAY_SQL_VIDEO_DNE

    PCDAY_SQL_FORUM_EXISTS = """
				 SUM(case when read is not null then read else 0 end) AS nforum_reads, # New discussion - Forum reads
				 SUM(case when write is not null then write else 0 end) AS nforum_posts, # New discussion - Forum posts
				 COUNT(DISTINCT thread_id ) AS nforum_threads, # New discussion - Unique forum threads interacted with
		    """

    PCDAY_SQL_FORUM_DNE = """
				 0 AS nforum_reads, # New discussion - Forum reads
				 0 AS nforum_posts, # New discussion - Forum posts
				 0 AS nforum_threads, # New discussion - Unique forum threads interacted with
		    """
    PCDAY_SQL_FORUM_SELECT = PCDAY_SQL_FORUM_EXISTS if forumTableExists else PCDAY_SQL_FORUM_DNE

    PCDAY_SQL_PROBLEM_EXISTS = """
				 COUNT(DISTINCT problem_nid ) AS nproblems_answered, # New Problem - Unique problems attempted
				 SUM(case when n_attempts is not null then n_attempts else 0 end) AS nproblems_attempted, # New Problem - Total attempts
				 SUM(case when ncount_problem_multiplechoice is not null then ncount_problem_multiplechoice else 0 end) as nproblems_multiplechoice,
				 SUM(case when ncount_problem_choice is not null then ncount_problem_choice else 0 end) as nproblems_choice,
				 SUM(case when ncount_problem_numerical is not null then ncount_problem_numerical else 0 end) as nproblems_numerical,
				 SUM(case when ncount_problem_option is not null then ncount_problem_option else 0 end) as nproblems_option,
				 SUM(case when ncount_problem_custom is not null then ncount_problem_custom else 0 end) as nproblems_custom,
				 SUM(case when ncount_problem_string is not null then ncount_problem_string else 0 end) as nproblems_string,
				 SUM(case when ncount_problem_mixed is not null then ncount_problem_mixed else 0 end) as nproblems_mixed,
				 SUM(case when ncount_problem_formula is not null then ncount_problem_formula else 0 end) as nproblems_forumula,
				 SUM(case when ncount_problem_other is not null then ncount_problem_other else 0 end) as nproblems_other,
		    """

    PCDAY_SQL_PROBLEM_DNE = """
				 0 AS nproblems_answered, # New Problem - Unique problems attempted
				 0 AS nproblems_attempted, # New Problem - Total attempts
				 0 AS nproblems_multiplechoice,
				 0 AS nproblems_choice,
				 0 AS nproblems_numerical,
				 0 AS nproblems_option,
				 0 AS nproblems_custom,
				 0 AS nproblems_string,
				 0 AS nproblems_mixed,
				 0 AS nproblems_forumula,
				 0 AS nproblems_other,
		    """
    PCDAY_SQL_PROBLEM_SELECT = PCDAY_SQL_PROBLEM_EXISTS if problemTableExists else PCDAY_SQL_PROBLEM_DNE

    # Timing statistics, plus the opening of the FROM clause: the first source is
    # the tracking logs ({DATASETS}); the optional sources below are appended after it.
    PCDAY_SQL_MID = """
				 MIN(time) AS first_event,
				 MAX(time) AS last_event,
				 AVG( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS avg_dt,
				 STDDEV( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sdv_dt,
				 MAX( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS max_dt,
				 COUNT( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS n_dt,
				 SUM( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sum_dt
			FROM (
			  SELECT
			    *
			  FROM (
			    SELECT
			      username,
			      CASE WHEN event_type = "play_video" THEN 1 ELSE 0 END AS bvideo,
			      CASE WHEN event_type = "problem_check" THEN 1 ELSE 0 END AS bproblem_check,
			      CASE WHEN username != "" THEN 1 ELSE 0 END AS bevent,
			      CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
			      CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
			      CASE WHEN event_type IN ("show_answer",
				"showanswer") THEN 1 ELSE 0 END AS bshow_answer,
			      CASE WHEN event_type = 'show_transcript' THEN 1 ELSE 0 END AS bshow_transcript,
			      CASE WHEN event_type = 'seq_goto' THEN 1 ELSE 0 END AS bseq_goto,
			      CASE WHEN event_type = 'seek_video' THEN 1 ELSE 0 END AS bseek_video,
			      CASE WHEN event_type = 'pause_video' THEN 1 ELSE 0 END AS bpause_video,
			      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
			      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
			      time,
			      LAG(time, 1) OVER (PARTITION BY username ORDER BY time) last_time
			    FROM {DATASETS}
			    WHERE
			      NOT event_type CONTAINS "/xblock/"
			      AND username != "" )
		    """


    PCDAY_SQL_VIDEO = """ ,
			  ( # Video events
				  SELECT TIMESTAMP(date) as time,
				         '{course_id}' as course_id,
				         username,
				         video_id,
				         position,
				  FROM [{dataset}.video_stats_day]
				  WHERE TIMESTAMP(date)>= TIMESTAMP("{min_date_start}") and TIMESTAMP(date) <= TIMESTAMP("{max_date_end}")

			  )
                      """
    # PCDAY_SQL_ADD accumulates the extra sources, in the order video, forum, problem
    PCDAY_SQL_ADD = PCDAY_SQL_VIDEO if videoTableExists else ''

    PCDAY_SQL_FORUM = """ ,
			  ( # Forum Events
				   SELECT time,
					  username,
				          '{course_id}' as course_id,
				          thread_id,
				          (CASE WHEN (forum_action == "reply" or forum_action == "comment_reply"
						      or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment")
						THEN 1 ELSE 0 END) AS write,
					  (CASE WHEN (forum_action == "read" or forum_action == "read_inline") THEN 1 ELSE 0 END) AS read,
				   FROM [{dataset}.forum_events]
				   WHERE (forum_action == "reply" or forum_action == "comment_reply"
					  or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment"
					  or forum_action == "read" or forum_action == "read_inline")
				          and ( time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}") )
			  )
                      """
    if forumTableExists:
        PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_FORUM

    # NOTE(review): the itype literal "forumula" used below looks like a misspelling
    # (the neighbouring literals all end in "response").  It is left unchanged here,
    # since the intended value cannot be confirmed from this file -- TODO confirm.
    PCDAY_SQL_PROBLEM = """,
			  ( # Problems
				   SELECT pc.username AS username,
				          pp.problem_nid AS problem_nid,
				          pp.n_attempts AS n_attempts,
				          pp.time AS time,
				          '{course_id}' as course_id,
					  pp.ncount_problem_multiplechoice as ncount_problem_multiplechoice,
					  pp.ncount_problem_choice as ncount_problem_choice,
					  pp.ncount_problem_numerical as ncount_problem_numerical,
					  pp.ncount_problem_option as ncount_problem_option,
					  pp.ncount_problem_custom as ncount_problem_custom,
					  pp.ncount_problem_string as ncount_problem_string,
					  pp.ncount_problem_mixed as ncount_problem_mixed,
					  pp.ncount_problem_formula as ncount_problem_formula,
					  pp.ncount_problem_other as ncount_problem_other,
				   FROM (

					   (
					      SELECT PP.user_id as user_id,
						     PP.problem_nid AS problem_nid,
						     PP.n_attempts as n_attempts,
						     PP.date as time,
						     (Case when CP_CA.data_itype == "multiplechoiceresponse" then 1 else 0 end) as ncount_problem_multiplechoice, # Choice
					             (Case when CP_CA.data_itype == "choiceresponse" then 1 else 0 end) as ncount_problem_choice,       # Choice
						     (Case when CP_CA.data_itype == "numericalresponse" then 1 else 0 end) as ncount_problem_numerical, #input
						     (Case when CP_CA.data_itype == "optionresponse" then 1 else 0 end) as ncount_problem_option,       # Choice
					             (Case when CP_CA.data_itype == "customresponse" then 1 else 0 end) as ncount_problem_custom,       # Custom
					             (Case when CP_CA.data_itype == "stringresponse" then 1 else 0 end) as ncount_problem_string,       # Input
					             (Case when CP_CA.data_itype == "mixed" then 1 else 0 end) as ncount_problem_mixed,                 # Mixed
					             (Case when CP_CA.data_itype == "forumula" then 1 else 0 end) as ncount_problem_formula,            # Input
					             (Case when CP_CA.data_itype != "multiplechoiceresponse" and
							        CP_CA.data_itype != "choiceresponse" and
							        CP_CA.data_itype != "numericalresponse" and
							        CP_CA.data_itype != "optionresponse" and
							        CP_CA.data_itype != "customresponse" and
							        CP_CA.data_itype != "stringresponse" and
							        CP_CA.data_itype != "mixed" and
							        CP_CA.data_itype != "forumula"
							   then 1 else 0 end) as ncount_problem_other, # Input
						     #MAX(n_attempts) AS n_attempts,
						     #MAX(date) AS time,
					      FROM [{dataset}.person_problem] PP
					      LEFT JOIN
					      (
							SELECT CP.problem_nid as problem_nid,
							       INTEGER(CP.problem_id) as problem_id,
							       CA.data.itype as data_itype,
						        FROM [{dataset}.course_problem] CP
						        LEFT JOIN [{dataset}.course_axis] CA
						        ON CP.problem_id == CA.url_name
					      ) as CP_CA
					      ON PP.problem_nid == CP_CA.problem_nid
					      GROUP BY time, user_id, problem_nid, n_attempts,
						       ncount_problem_multiplechoice,
						       ncount_problem_choice,
						       ncount_problem_choice,
						       ncount_problem_numerical,
						       ncount_problem_option,
						       ncount_problem_custom,
						       ncount_problem_string,
						       ncount_problem_mixed,
						       ncount_problem_formula,
						       ncount_problem_other
					      )

					      #FROM [{dataset}.person_item] PI
					      #JOIN [{dataset}.course_item] CI
					      #ON PI.item_nid = CI.item_nid
					      #GROUP BY user_id,
						       #problem_nid
					      #ORDER BY
						       #user_id,
						       #problem_nid
					) AS pp
				        LEFT JOIN (
							      SELECT username,
								     user_id
							      FROM [{dataset}.person_course] 
					) AS pc
					ON pc.user_id = pp.user_id
				        WHERE time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}")
			  )
 
                        """
    if problemTableExists:
        PCDAY_SQL_ADD = PCDAY_SQL_ADD + PCDAY_SQL_PROBLEM

    PCDAY_SQL_END = """
			  )
			  WHERE time > TIMESTAMP("{last_date}")
			  GROUP BY course_id,
				   username,
				   date
			  ORDER BY date
		    """


    PCDAY_SQL_NEW = PCDAY_SQL_BASE_SELECT + PCDAY_SQL_VIDEO_SELECT + PCDAY_SQL_FORUM_SELECT + PCDAY_SQL_PROBLEM_SELECT + PCDAY_SQL_MID + PCDAY_SQL_ADD + PCDAY_SQL_END

    # Only {dataset} is substituted at this point.  Every other placeholder is
    # replaced by itself, so that it survives this format() call and is still
    # present in the SQL handed to run_query_on_tracking_logs.
    PCDAY_SQL = PCDAY_SQL_NEW.format( dataset=dataset, course_id="{course_id}", DATASETS="{DATASETS}", last_date="{last_date}", min_date_start="{min_date_start}", max_date_end="{max_date_end}")

    table = 'person_course_day'

    def gdf(row):
        # parse the 'date' column (YYYY-MM-DD) of a result row into a datetime
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    print("=== Processing person_course_day for %s (start %s)" % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    # Major person_course_day schema revision 19-Jan-2016 adds new fields; if table exists, ensure it 
    # has new schema, else force recompute.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table)
    except Exception:
        # no table info available: nothing to check, the table is (re)built as usual
        tinfo = None
    if tinfo:
        field_names = [x['name'] for x in tinfo['schema']['fields']]
        if 'nvideos_viewed' not in field_names:
            cdt = tinfo['creationTime']
            print("    --> person_course_day created %s; missing nvideos_viewed field in schema; forcing recompute - this may take a long time!" % cdt)
            sys.stdout.flush()
            force_recompute = True

    process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     newer_than=datetime.datetime( 2017, 2, 8, 16, 30 ),
                                                     skip_last_day=skip_last_day)

    print("Done with person_course_day for %s (end %s)" % (course_id, datetime.datetime.now()))
    print("="*77)
    sys.stdout.flush()

コード例 #19

ファイルを表示

ファイル: make_problem_events.py プロジェクト: AbdouSeck/edx2bigquery

def ExtractProblemEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Extract problem-related events from the tracking logs of course_id into the
    TABLE_PROBLEM_EVENTS table of the course dataset.

    Selected events are those whose event_type matches problem_\\w+ or equals
    "showanswer", and which carry a non-null context.user_id.

    The function first reports whether the destination table already exists
    (new data will be appended) or appears to be missing (it will be created),
    then hands the SQL to process_tracking_logs.run_query_on_tracking_logs.

    force_recompute, use_dataset_latest, skip_last_day and end_date are passed
    through to run_query_on_tracking_logs (use_dataset_latest is also used to
    pick the dataset).
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_PROBLEM_EVENTS

    # {DATASETS}, {last_date} and {hash_limit} are left as placeholders here;
    # presumably run_query_on_tracking_logs fills them in -- TODO confirm.
    # The backslash of \w is doubled because this is not a raw string; the SQL
    # text produced is unchanged.
    the_sql = """
SELECT  
    context.user_id as user_id, 
    time,
    event_source,
    REGEXP_EXTRACT(
      (CASE when module_id is not null then module_id 
          when event_type contains "/xblock/i4x:;_" then REPLACE(REGEXP_EXTRACT(event_type, r"i4x:;_;_(.*)/handler/xmodule"),";_", "/")
          else REPLACE(event_struct.problem, "i4x://", "")
          end),
      "[^/]+/problem/([^/]+)") as problem_url,
    (CASE when event_type contains "/xblock/i4x:;_" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          when event_type contains "type@problem+block" then REGEXP_EXTRACT(event_type, r"xmodule_handler/(.[^/]+)")
          else event_type
          end) as event_type,
   event_struct.attempts as attempts,
   event_struct.success as success,
   event_struct.grade as grade,          
FROM {DATASETS}
WHERE       
   ( REGEXP_MATCH(event_type, r'problem_\\w+') 
     OR event_type = "showanswer"
   )
   AND context.user_id is not null
   and time > TIMESTAMP("{last_date}")
   {hash_limit}
order by user_id, time
    """

    # Find out whether the destination table is already there.  This is purely
    # informational: the query below runs either way.  An explicit None check is
    # used rather than assert, since asserts are removed when python runs with -O.
    missing_reason = None
    try:
        if bqutil.get_bq_table_info(dataset, table) is None:
            missing_reason = "[make_problem_events] Creating %s.%s table for %s" % (dataset, table, course_id)
    except Exception as err:
        # a failed lookup is reported and then treated like a missing table
        missing_reason = str(err)

    if missing_reason is None:
        print("[make_problem_events] Appending latest data to %s.%s table for %s" % (dataset, table, course_id))
    else:
        print(missing_reason)
        sys.stdout.flush()
        print(" --> Missing %s.%s?  Attempting to create..." % ( dataset, table ))
    sys.stdout.flush()

    # banner previously said "Forum Events", which did not match this function
    print("=== Processing Problem Events for %s (start %s)" % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    def gdf(row):
        # convert the 'time' column of a result row (seconds since epoch) to a UTC datetime
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day
                                                    )

    print("Done with Problem Events for %s (end %s)" % (course_id, datetime.datetime.now()))
    print("="*77)
    sys.stdout.flush()

コード例 #20

ファイルを表示

ファイル: make_irt_report.py プロジェクト: AbdouSeck/edx2bigquery

def make_irt_report(course_id, force_recompute=False, use_dataset_latest=False):
    '''
    Build the item_response_theory_report table for one course.

    The report joins the newest available IRT results table (item_irt_grm or
    item_irt_grm_R, chosen by lastModifiedTime) with course_item and
    course_problem.  When an item_reliabilities table exists, its item_test /
    item_rest / alpha columns are joined in as well; otherwise those columns
    are emitted as nulls.

    course_id = course to generate the report for
    force_recompute = passed to bqutil.get_bq_table as force_query
    use_dataset_latest = passed to bqutil.course_id2dataset when resolving the dataset

    Returns None.  Raises Exception if neither IRT table is usable, and
    re-raises whatever bqutil.get_bq_table raises (after printing the SQL).
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    tablename = "item_response_theory_report"
    RELIABILITIES_TABLE = "item_reliabilities"
    # candidate IRT result tables -> label written to the irt_method column
    IRT_TABLES = OrderedDict([ ("item_irt_grm", "STATA GRM"),
                               ("item_irt_grm_R", "R mirt GRM"),
                           ])

    the_sql_alpha = """
    IR.itemtestcorr as item_test,
    IR.itemrestcorr as item_rest,
    IR.alpha as alpha,
    """

    the_sql_no_alpha = """
    null as item_test,
    null as item_rest,
    null as alpha,
    """

    the_sql_alpha_join = """
    JOIN [{dataset}.{reliabilities}] IR
    on IR.item = CP.problem_yid
    """.format(dataset=dataset, reliabilities=RELIABILITIES_TABLE)

    the_sql = """
# item_response_theory_report for {course_id}
#
# problem_nid,problem_short_id,chapter,assignment_type,problem_label,problem_id,IRT item number,avg_problem_raw_score,avg_problem_pct_score,
# n_unique_users_attempted,item_test,item_rest,alpha ,Discrimination,Difficulty

SELECT 
    "{course_id}" as course_id,
    IG.problem_nid as problem_nid,
    CP.problem_short_id as problem_short_id,
    CI.chapter_name as chapter,
    assignment_type,
    CONCAT("[", STRING(IG.problem_nid), "] ", CI.chapter_name, " / ", CI.section_name, " / ", CP.problem_name) as problem_label,
    CP.problem_id as problem_id,
    CONCAT(STRING(CP.problem_nid), "/", STRING(cutnum)) as IRT_item_number,
    CP.avg_problem_raw_score avg_problem_raw_score,
    CP.avg_problem_pct_score avg_problem_pct_score,
    CP.n_unique_users_attempted n_unique_users_attempted,
    {sql_alpha}
    irt_diff as Difficulty,
    irt_disc as Discrimination,
    diff_se as Difficulty_SE,
    disc_se as Discrimination_SE,
    "{irt_method}" as irt_method,

FROM [{dataset}.{item_irt_grm}] IG
JOIN [{dataset}.course_item] CI
on IG.problem_nid = CI.problem_nid
JOIN 
(
    SELECT *, CONCAT("y", STRING(problem_nid)) as problem_yid,
    FROM [{dataset}.course_problem]
) CP
on IG.problem_nid = CP.problem_nid
{sql_alpha_join}
where CI.item_number = 1
    """

    irt_table_to_use = None
    irt_table_date = None

    # use newest of the existing IRT tables
    for irt_tablename in IRT_TABLES:
        try:
            tinfo = bqutil.get_bq_table_info(dataset, irt_tablename)
        except Exception as err:
            # best effort: a failed lookup is treated like a missing table
            print("[make_irt_report] Cannot get table info for %s.%s (%s)" % (dataset, irt_tablename, str(err)))
            continue
        if tinfo is None:
            continue        # table does not exist

        lmt = tinfo.get('lastModifiedTime')
        if not lmt:
            print("[make_irt_report] Not using IRT table %s - no lastModifiedTime available" % irt_tablename)
            continue

        # NOTE(review): lastModifiedTime values are compared with ">" exactly as
        # returned by bqutil; if they are strings this is a lexicographic
        # comparison - confirm against bqutil.get_bq_table_info.
        if irt_table_date and not (lmt > irt_table_date):
            print("[make_irt_report] Not using IRT table %s (date %s) - older than %s (date %s)" % ( irt_tablename,
                                                                                                     lmt,
                                                                                                     irt_table_to_use,
                                                                                                     irt_table_date ))
            continue

        irt_table_date = lmt
        irt_table_to_use = irt_tablename

    if not irt_table_to_use:
        raise Exception("[make_irt_report] Cannot generate IRT report; requires one of %s" % (','.join(IRT_TABLES.keys())))

    # SQL changes depending on whether item_reliabilities exists or not
    try:
        have_reliabilities = bqutil.get_bq_table_info(dataset, RELIABILITIES_TABLE) is not None
    except Exception as err:
        # the reliabilities table is optional; fall back to null alpha columns
        print("[make_irt_report] %s.%s not available (%s); alpha columns will be null" % (dataset, RELIABILITIES_TABLE, str(err)))
        have_reliabilities = False

    if have_reliabilities:
        sql_alpha = {'sql_alpha': the_sql_alpha, "sql_alpha_join": the_sql_alpha_join }
    else:
        sql_alpha = {'sql_alpha': the_sql_no_alpha, "sql_alpha_join": "" }

    the_sql = the_sql.format(dataset=dataset, course_id=course_id, item_irt_grm=irt_table_to_use,
                             irt_method=IRT_TABLES[irt_table_to_use],
                             **sql_alpha)

    depends_on = [ "%s.course_item" % dataset,
                   "%s.course_problem" % dataset,
                   "%s.%s" % (dataset, irt_table_to_use),
               ]

    if have_reliabilities:
        depends_on.append("%s.%s" % (dataset, RELIABILITIES_TABLE))

    try:
        bqdat = bqutil.get_bq_table(dataset, tablename, the_sql,
                                    depends_on=depends_on,
                                    force_query=force_recompute,
                                    newer_than=datetime.datetime(2016, 9, 27, 14, 48),
                                    startIndex=-2)
    except Exception as err:
        # show the offending SQL, then let the caller see the original error
        print("[make_irt_report] ERR! failed in creating %s.%s using this sql:" % (dataset, tablename))
        print(the_sql)
        raise

    if not bqdat:
        nfound = 0
    else:
        nfound = bqutil.get_bq_table_size_rows(dataset, tablename)
    print("--> Done with %s for %s, %d problem items found" % (tablename, course_id, nfound))
    sys.stdout.flush()

コード例 #21

ファイルを表示

ファイル: make_person_course_day.py プロジェクト: musixhine/edx2bigquery

def process_course(course_id, force_recompute=False, use_dataset_latest=False, end_date=None, 
                   check_dates=True, skip_last_day=False):
    '''
    Make {course_id}.person_course_day table for specified course_id.

    This is a single course-specific table, which contains all day's data.
    It is incrementally updated when new tracking logs data comes in,
    by appending rows to the end.  The rows are kept in time order.

    The per-day video, forum and problem columns are computed from the
    video_stats_day, forum_events, and person_problem / course_problem /
    course_axis / person_course tables when those exist in the course
    dataset; otherwise the corresponding columns are emitted as constant zeros.

    If an existing person_course_day table lacks the nvideos_viewed field,
    a full recompute is forced.

    check_dates is disregarded.

    If skip_last_day is True then do not include the last day of tracking log data
    in the processing.  This is done to avoid processing partial data, e.g. when
    tracking log data are incrementally loaded with a delta of less than one day.
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    def table_available(table_name, missing_msg):
        '''
        Return True if bqutil reports table info for dataset.table_name.

        A lookup error is treated the same as a missing table (best effort);
        in both cases missing_msg is printed so the omission is visible.
        Explicit checks are used instead of assert so that the result does
        not depend on running without -O.
        '''
        try:
            tinfo = bqutil.get_bq_table_info(dataset, table_name)
        except Exception as err:
            print(" --> %s (%s.%s: %s)" % (missing_msg, dataset, table_name, str(err)))
            sys.stdout.flush()
            return False
        if tinfo is None:
            print(" --> %s (%s.%s)" % (missing_msg, dataset, table_name))
            sys.stdout.flush()
            return False
        return True

    videoTableExists = table_available('video_stats_day',
                                       "Video stats table missing... Not including video stats")

    forumTableExists = table_available('forum_events',
                                       "Forum events table missing... Not including forum stats")

    # Problem stats need all four of these tables (person problem, course problem,
    # course axis, person course)
    problem_tables = [ ('person_problem', "Person problem table missing... Not including problem stats"),
                       ('course_problem', "Course problem table missing... Not including problem stats"),
                       ('course_axis', "Course axis table missing... Not including problem stats"),
                       ('person_course', "Person Course table missing... Not including problem stats"),
                   ]
    problemTableExists = all(table_available(name, msg) for (name, msg) in problem_tables)

    PCDAY_SQL_BASE_SELECT = """
			  SELECT username,
				 '{course_id}' AS course_id,
				 DATE(time) AS date,
				 SUM(bevent) AS nevents,
				 SUM(bprogress) AS nprogcheck,
				 SUM(bshow_answer) AS nshow_answer,
				 SUM(bvideo) AS nvideo,
				 SUM(bproblem_check) AS nproblem_check,
				 SUM(bforum) AS nforum,
				 SUM(bshow_transcript) AS ntranscript,
				 SUM(bseq_goto) AS nseq_goto,
				 SUM(bseek_video) AS nseek_video,
				 SUM(bpause_video) AS npause_video,
		    """

    PCDAY_SQL_VIDEO_EXISTS = """
			  	 COUNT(DISTINCT video_id) AS nvideos_viewed, # New Video - Unique videos viewed
				 SUM(case when position is not null then FLOAT(position) else FLOAT(0.0) end) AS nvideos_watched_sec, # New Video - # sec watched using max video position
		    """

    PCDAY_SQL_VIDEO_DNE = """
				 0 AS nvideos_viewed, # New Video - Unique videos viewed
				 FLOAT(0.0) AS nvideos_watched_sec, # New Video - # sec watched using max video position
		    """
    PCDAY_SQL_VIDEO_SELECT = PCDAY_SQL_VIDEO_EXISTS if videoTableExists else PCDAY_SQL_VIDEO_DNE

    PCDAY_SQL_FORUM_EXISTS = """
				 SUM(case when read is not null then read else 0 end) AS nforum_reads, # New discussion - Forum reads
				 SUM(case when write is not null then write else 0 end) AS nforum_posts, # New discussion - Forum posts
				 COUNT(DISTINCT thread_id ) AS nforum_threads, # New discussion - Unique forum threads interacted with
		    """

    PCDAY_SQL_FORUM_DNE = """
				 0 AS nforum_reads, # New discussion - Forum reads
				 0 AS nforum_posts, # New discussion - Forum posts
				 0 AS nforum_threads, # New discussion - Unique forum threads interacted with
		    """
    PCDAY_SQL_FORUM_SELECT = PCDAY_SQL_FORUM_EXISTS if forumTableExists else PCDAY_SQL_FORUM_DNE

    PCDAY_SQL_PROBLEM_EXISTS = """
				 COUNT(DISTINCT problem_nid ) AS nproblems_answered, # New Problem - Unique problems attempted
				 SUM(case when n_attempts is not null then n_attempts else 0 end) AS nproblems_attempted, # New Problem - Total attempts
				 SUM(case when ncount_problem_multiplechoice is not null then ncount_problem_multiplechoice else 0 end) as nproblems_multiplechoice,
				 SUM(case when ncount_problem_choice is not null then ncount_problem_choice else 0 end) as nproblems_choice,
				 SUM(case when ncount_problem_numerical is not null then ncount_problem_numerical else 0 end) as nproblems_numerical,
				 SUM(case when ncount_problem_option is not null then ncount_problem_option else 0 end) as nproblems_option,
				 SUM(case when ncount_problem_custom is not null then ncount_problem_custom else 0 end) as nproblems_custom,
				 SUM(case when ncount_problem_string is not null then ncount_problem_string else 0 end) as nproblems_string,
				 SUM(case when ncount_problem_mixed is not null then ncount_problem_mixed else 0 end) as nproblems_mixed,
				 SUM(case when ncount_problem_formula is not null then ncount_problem_formula else 0 end) as nproblems_forumula,
				 SUM(case when ncount_problem_other is not null then ncount_problem_other else 0 end) as nproblems_other,
		    """

    PCDAY_SQL_PROBLEM_DNE = """
				 0 AS nproblems_answered, # New Problem - Unique problems attempted
				 0 AS nproblems_attempted, # New Problem - Total attempts
				 0 AS nproblems_multiplechoice,
				 0 AS nproblems_choice,
				 0 AS nproblems_numerical,
				 0 AS nproblems_option,
				 0 AS nproblems_custom,
				 0 AS nproblems_string,
				 0 AS nproblems_mixed,
				 0 AS nproblems_forumula,
				 0 AS nproblems_other,
		    """
    PCDAY_SQL_PROBLEM_SELECT = PCDAY_SQL_PROBLEM_EXISTS if problemTableExists else PCDAY_SQL_PROBLEM_DNE

    PCDAY_SQL_MID = """
				 MAX(time) AS last_event,
				 AVG( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS avg_dt,
				 STDDEV( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sdv_dt,
				 MAX( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS max_dt,
				 COUNT( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS n_dt,
				 SUM( CASE WHEN (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 > 5*60 THEN NULL ELSE (TIMESTAMP_TO_USEC(time) - last_time)/1.0E6 END ) AS sum_dt
			FROM (
			  SELECT
			    *
			  FROM (
			    SELECT
			      username,
			      CASE WHEN event_type = "play_video" THEN 1 ELSE 0 END AS bvideo,
			      CASE WHEN event_type = "problem_check" THEN 1 ELSE 0 END AS bproblem_check,
			      CASE WHEN username != "" THEN 1 ELSE 0 END AS bevent,
			      CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/discussion/.*") then 1 else 0 end as bforum,
			      CASE WHEN REGEXP_MATCH(event_type, "^/courses/{course_id}/progress") then 1 else 0 end as bprogress,
			      CASE WHEN event_type IN ("show_answer",
				"showanswer") THEN 1 ELSE 0 END AS bshow_answer,
			      CASE WHEN event_type = 'show_transcript' THEN 1 ELSE 0 END AS bshow_transcript,
			      CASE WHEN event_type = 'seq_goto' THEN 1 ELSE 0 END AS bseq_goto,
			      CASE WHEN event_type = 'seek_video' THEN 1 ELSE 0 END AS bseek_video,
			      CASE WHEN event_type = 'pause_video' THEN 1 ELSE 0 END AS bpause_video,
			      # case when event_type = 'edx.course.enrollment.activated' then 1 else 0 end as benroll,
			      # case when event_type = 'edx.course.enrollment.deactivated' then 1 else 0 end as bunenroll
			      time,
			      LAG(time, 1) OVER (PARTITION BY username ORDER BY time) last_time
			    FROM {DATASETS}
			    WHERE
			      NOT event_type CONTAINS "/xblock/"
			      AND username != "" )
		    """


    # Extra sub-selects appended to the FROM list, one per optional source that exists
    PCDAY_SQL_ADD = ''

    PCDAY_SQL_VIDEO = """ ,
			  ( # Video events
				  SELECT TIMESTAMP(date) as time,
				         '{course_id}' as course_id,
				         username,
				         video_id,
				         position,
				  FROM [{dataset}.video_stats_day]
				  WHERE TIMESTAMP(date)>= TIMESTAMP("{min_date_start}") and TIMESTAMP(date) <= TIMESTAMP("{max_date_end}")

			  )
                      """
    if videoTableExists:
        PCDAY_SQL_ADD += PCDAY_SQL_VIDEO

    PCDAY_SQL_FORUM = """ ,
			  ( # Forum Events
				   SELECT time,
					  username,
				          '{course_id}' as course_id,
				          thread_id,
				          (CASE WHEN (forum_action == "reply" or forum_action == "comment_reply"
						      or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment")
						THEN 1 ELSE 0 END) AS write,
					  (CASE WHEN (forum_action == "read" or forum_action == "read_inline") THEN 1 ELSE 0 END) AS read,
				   FROM [{dataset}.forum_events]
				   WHERE (forum_action == "reply" or forum_action == "comment_reply"
					  or forum_action == "created_thread" or forum_action == "created_response" or forum_action == "created_comment"
					  or forum_action == "read" or forum_action == "read_inline")
				          and ( time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}") )
			  )
                      """
    if forumTableExists:
        PCDAY_SQL_ADD += PCDAY_SQL_FORUM

    # NOTE(review): the data_itype value "forumula" below looks like a typo (the
    # other values follow the "...response" naming); if so, formula problems are
    # counted under ncount_problem_other.  Left unchanged here - confirm against
    # the course_axis data before editing.  The output column name
    # nproblems_forumula is part of the table schema and must not be renamed casually.
    PCDAY_SQL_PROBLEM = """,
			  ( # Problems
				   SELECT pc.username AS username,
				          pp.problem_nid AS problem_nid,
				          pp.n_attempts AS n_attempts,
				          pp.time AS time,
				          '{course_id}' as course_id,
					  pp.ncount_problem_multiplechoice as ncount_problem_multiplechoice,
					  pp.ncount_problem_choice as ncount_problem_choice,
					  pp.ncount_problem_numerical as ncount_problem_numerical,
					  pp.ncount_problem_option as ncount_problem_option,
					  pp.ncount_problem_custom as ncount_problem_custom,
					  pp.ncount_problem_string as ncount_problem_string,
					  pp.ncount_problem_mixed as ncount_problem_mixed,
					  pp.ncount_problem_formula as ncount_problem_formula,
					  pp.ncount_problem_other as ncount_problem_other,
				   FROM (

					   (
					      SELECT PP.user_id as user_id,
						     PP.problem_nid AS problem_nid,
						     PP.n_attempts as n_attempts,
						     PP.date as time,
						     (Case when CP_CA.data_itype == "multiplechoiceresponse" then 1 else 0 end) as ncount_problem_multiplechoice, # Choice
					             (Case when CP_CA.data_itype == "choiceresponse" then 1 else 0 end) as ncount_problem_choice,       # Choice
						     (Case when CP_CA.data_itype == "numericalresponse" then 1 else 0 end) as ncount_problem_numerical, #input
						     (Case when CP_CA.data_itype == "optionresponse" then 1 else 0 end) as ncount_problem_option,       # Choice
					             (Case when CP_CA.data_itype == "customresponse" then 1 else 0 end) as ncount_problem_custom,       # Custom
					             (Case when CP_CA.data_itype == "stringresponse" then 1 else 0 end) as ncount_problem_string,       # Input
					             (Case when CP_CA.data_itype == "mixed" then 1 else 0 end) as ncount_problem_mixed,                 # Mixed
					             (Case when CP_CA.data_itype == "forumula" then 1 else 0 end) as ncount_problem_formula,            # Input
					             (Case when CP_CA.data_itype != "multiplechoiceresponse" and
							        CP_CA.data_itype != "choiceresponse" and
							        CP_CA.data_itype != "numericalresponse" and
							        CP_CA.data_itype != "optionresponse" and
							        CP_CA.data_itype != "customresponse" and
							        CP_CA.data_itype != "stringresponse" and
							        CP_CA.data_itype != "mixed" and
							        CP_CA.data_itype != "forumula"
							   then 1 else 0 end) as ncount_problem_other, # Input
						     #MAX(n_attempts) AS n_attempts,
						     #MAX(date) AS time,
					      FROM [{dataset}.person_problem] PP
					      LEFT JOIN
					      (
							SELECT CP.problem_nid as problem_nid,
							       INTEGER(CP.problem_id) as problem_id,
							       CA.data.itype as data_itype,
						        FROM [{dataset}.course_problem] CP
						        LEFT JOIN [{dataset}.course_axis] CA
						        ON CP.problem_id == CA.url_name
					      ) as CP_CA
					      ON PP.problem_nid == CP_CA.problem_nid
					      GROUP BY time, user_id, problem_nid, n_attempts,
						       ncount_problem_multiplechoice,
						       ncount_problem_choice,
						       ncount_problem_choice,
						       ncount_problem_numerical,
						       ncount_problem_option,
						       ncount_problem_custom,
						       ncount_problem_string,
						       ncount_problem_mixed,
						       ncount_problem_formula,
						       ncount_problem_other
					      )

					      #FROM [{dataset}.person_item] PI
					      #JOIN [{dataset}.course_item] CI
					      #ON PI.item_nid = CI.item_nid
					      #GROUP BY user_id,
						       #problem_nid
					      #ORDER BY
						       #user_id,
						       #problem_nid
					) AS pp
				        LEFT JOIN (
							      SELECT username,
								     user_id
							      FROM [{dataset}.person_course] 
					) AS pc
					ON pc.user_id = pp.user_id
				        WHERE time >= TIMESTAMP("{min_date_start}") and time <= TIMESTAMP("{max_date_end}")
			  )
 
                        """
    if problemTableExists:
        PCDAY_SQL_ADD += PCDAY_SQL_PROBLEM

    PCDAY_SQL_END = """
			  )
			  WHERE time > TIMESTAMP("{last_date}")
			  GROUP BY course_id,
				   username,
				   date
			  ORDER BY date
		    """


    PCDAY_SQL_NEW = (PCDAY_SQL_BASE_SELECT
                     + PCDAY_SQL_VIDEO_SELECT
                     + PCDAY_SQL_FORUM_SELECT
                     + PCDAY_SQL_PROBLEM_SELECT
                     + PCDAY_SQL_MID
                     + PCDAY_SQL_ADD
                     + PCDAY_SQL_END)

    # Only {dataset} is substituted here; the other fields are re-emitted as
    # literal "{...}" placeholders so the SQL handed to run_query_on_tracking_logs
    # still contains them.
    PCDAY_SQL = PCDAY_SQL_NEW.format( dataset=dataset, course_id="{course_id}", DATASETS="{DATASETS}", last_date="{last_date}", min_date_start="{min_date_start}", max_date_end="{max_date_end}")

    table = 'person_course_day'

    def gdf(row):
        # date of a result row, parsed from its 'date' column (YYYY-MM-DD)
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    print("=== Processing person_course_day for %s (start %s)"  % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    # Major person_course_day schema revision 19-Jan-2016 adds new fields; if table exists, ensure it 
    # has new schema, else force recompute.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table)
    except Exception:
        tinfo = None        # treat a failed lookup as "no existing table"
    if tinfo:
        field_names = [x['name'] for x in tinfo['schema']['fields']]
        if 'nvideos_viewed' not in field_names:
            cdt = tinfo['creationTime']
            print("    --> person_course_day created %s; missing nvideos_viewed field in schema; forcing recompute - this may take a long time!" % cdt)
            sys.stdout.flush()
            force_recompute = True

    process_tracking_logs.run_query_on_tracking_logs(PCDAY_SQL, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     end_date=end_date,
                                                     get_date_function=gdf,
                                                     newer_than=datetime.datetime( 2016, 1, 19, 22, 30 ),
                                                     skip_last_day=skip_last_day)
    
    print("Done with person_course_day for %s (end %s)"  % (course_id, datetime.datetime.now()))
    print("="*77)
    sys.stdout.flush()

コード例 #22

ファイルを表示

ファイル: make_course_report_tables.py プロジェクト: maxliu/edx2bigquery

    def __init__(self, course_id_set, output_project_id=None, nskip=0, 
                 output_dataset_id=None, 
                 output_bucket=None,
                 use_dataset_latest=False,
                 only_step=None,
                 end_date=None,
                 ):
        '''
        Compute course report tables, based on combination of all person_course and other individual course tables.

        Construction does all the work: it probes each course dataset for the
        tables the reports need, records the SQL table lists in self.parameters,
        creates the output dataset if needed, and then runs every report step.

        course_id_set: list of course_id's to include; if empty, an error is printed and
                       construction stops early
        only_step: specify a single course report step to be executed (a comma-separated
                   string is split into a list); runs all reports, if None
        '''

        wants_several_steps = only_step and ',' in only_step
        self.only_step = only_step.split(',') if wants_several_steps else only_step

        self.end_date = end_date

        if not course_id_set:
            print("ERROR! Must specify list of course_id's for report.  Aborting.")
            return

        # extract org from first course_id
        self.org = course_id_set[0].split('/',1)[0]

        self.output_project_id = output_project_id

        if use_dataset_latest:
            crname = 'course_report_latest'
        else:
            crname = 'course_report_%s' % self.org
        self.dataset = output_dataset_id or crname

        self.gsbucket = gsutil.gs_path_from_course_id(crname, gsbucket=output_bucket)
        self.course_id_set = course_id_set

        course_datasets = [ bqutil.course_id2dataset(cid, use_dataset_latest=use_dataset_latest)
                            for cid in course_id_set ]

        # check to see which datasets have person_course (and the other per-course) tables
        datasets_with_pc = []
        self.all_pc_tables = OrderedDict()
        self.all_pcday_ip_counts_tables = OrderedDict()
        self.all_uic_tables = OrderedDict()
        self.all_tott_tables = OrderedDict()

        # (table name, dict collecting its table info per dataset, print lookup errors?)
        probes = ( ('person_course', self.all_pc_tables, True),
                   ('pcday_ip_counts', self.all_pcday_ip_counts_tables, False),
                   ('user_info_combo', self.all_uic_tables, False),
                   ('time_on_task_totals', self.all_tott_tables, True),
               )

        for cd in course_datasets:
            for table_name, found, report_errors in probes:
                try:
                    info = bqutil.get_bq_table_info(cd, table_name)
                except Exception as err:
                    if report_errors:
                        print("[make-course_report_tables] Err: %s" % str(err))
                    continue
                if info is None:
                    continue
                found[cd] = info
                if found is self.all_pc_tables:
                    datasets_with_pc.append(cd)

        def bracketed(table_name, datasets):
            # one "[dataset.table]" reference per dataset, comma + newline separated
            return ',\n'.join(['[%s.%s]' % (ds, table_name) for ds in datasets])

        pc_tables = bracketed('person_course', datasets_with_pc)
        pcday_ip_counts_tables = bracketed('pcday_ip_counts', self.all_pcday_ip_counts_tables)
        uic_tables = bracketed('user_info_combo', self.all_uic_tables)
        tott_tables = bracketed('time_on_task_totals', self.all_tott_tables)

        print("%d time_on_task tables: %s" % (len(self.all_tott_tables), tott_tables))
        sys.stdout.flush()

        # find latest combined person_course table (largest table id with the person_course_ prefix)
        cpc_tables = [ tid for tid in bqutil.get_list_of_table_ids(self.dataset) if tid.startswith("person_course_") ]
        the_cpc_table = ("[%s.%s]" % (self.dataset, max(cpc_tables))) if cpc_tables else None
        print("[make_course_report_tables] ==> Using %s as the latest combined person_course table" % the_cpc_table)

        self.parameters = {'dataset': self.dataset,
                           'pc_tables': pc_tables,
                           'uic_tables': uic_tables,
                           'tott_tables': tott_tables,
                           'pcday_ip_counts_tables': pcday_ip_counts_tables,
                           'combined_person_course': the_cpc_table,
                           }
        print("[make_course_report_tables] ==> Using these datasets (with person_course tables): %s" % datasets_with_pc)

        self.course_datasets = course_datasets

        banner = "="*100
        print(banner)
        print("Generating course report tables -> dataset=%s, project=%s" % (self.dataset, self.output_project_id))
        sys.stdout.flush()

        bqutil.create_dataset_if_nonexistent(self.dataset, project_id=output_project_id)

        self.nskip = nskip

        # run every report step, in this fixed order
        self.combine_show_answer_stats_by_course()
        self.make_totals_by_course()
        self.make_medians_by_course()
        self.make_table_of_email_addresses()
        self.make_global_modal_ip_table()
        self.make_enrollment_by_day()
        self.make_time_on_task_stats_by_course()
        self.make_total_populations_by_course()
        self.make_table_of_n_courses_registered()
        self.make_geographic_distributions()
        # self.count_tracking_log_events()     (step currently disabled)
        self.make_overall_totals()

        print(banner)
        print("Done with course report tables")
        sys.stdout.flush()

コード例 #23

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: musixhine/edx2bigquery

def createVideoStats_day(course_id,
                         force_recompute=False,
                         use_dataset_latest=False,
                         skip_last_day=False,
                         end_date=None):
    '''
    Create (or append to) the per-day video statistics table for a course.

    For each (date, username, video_id) the query keeps the maximum video
    position seen in play_video / pause_video / stop_video tracking log
    events, where position is currentTime multiplied by the playback speed
    when a speed is present in the event.

    course_id = course to process
    force_recompute, use_dataset_latest, skip_last_day, end_date = passed
        through to process_tracking_logs.run_query_on_tracking_logs
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    table = TABLE_VIDEO_STATS_PER_DAY

    # "\\/" below yields the two characters \/ in the SQL text (same string as
    # before), without relying on Python passing an unknown escape through.
    the_sql = """
              SELECT date(time)as date, username,
                              #module_id as video_id,
                              #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                              (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'([-])' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                              max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else  float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
                       FROM {DATASETS}
                       WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and
                              event is not null
                       group by username, video_id, date
                       order by date
              """

    # Report whether the table is being appended to or created.  This is
    # informational only; the query below runs either way.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table)
    except Exception as err:
        print(str(err))
        tinfo = None
    else:
        if tinfo is None:
            print("[analyze_videos] Creating %s.%s table for %s" % (
                dataset, table, course_id))

    if tinfo is not None:
        print("[analyze_videos] Appending latest data to %s.%s table for %s" % (
            dataset, table, course_id))
    else:
        sys.stdout.flush()
        print(" --> Missing %s.%s?  Attempting to create..." % (
            dataset, table))
    sys.stdout.flush()

    print("=== Processing Video Stats Per Day for %s (start %s)" % (
        course_id, datetime.datetime.now()))
    sys.stdout.flush()

    def gdf(row):
        # date of a result row, parsed from its 'date' column (YYYY-MM-DD)
        return datetime.datetime.strptime(row['date'], '%Y-%m-%d')

    process_tracking_logs.run_query_on_tracking_logs(
        the_sql,
        table,
        course_id,
        force_recompute=force_recompute,
        use_dataset_latest=use_dataset_latest,
        get_date_function=gdf,
        end_date=end_date,          # previously accepted by this function but never forwarded
        skip_last_day=skip_last_day)

    print("Done with Video Stats Per Day for %s (end %s)" % (
        course_id, datetime.datetime.now()))
    print("=" * 77)
    sys.stdout.flush()

コード例 #24

ファイルを表示

ファイル: run_external.py プロジェクト: musixhine/edx2bigquery

def run_external_script(extcmd, param, ecinfo, course_id):
    """
    Run external script on specified course.

    extcmd = string specifying external command to run
    param = command line parameters, including extparam
    ecinfo = external command info from edx2bigquery_config
    course_id = course_id to run external command on
    """
    # use default for base set of parameters
    ed_name = ecinfo.get('default_parameters', 'DEFAULT')
    settings = ecinfo.get(ed_name, {})
    settings.update(ecinfo.get(extcmd))
    # print "settings: ", json.dumps(settings, indent=4)
    
    print settings['name']
    
    if param.verbose:
        print settings.get('description', '')

    cidns = course_id.replace('/', '__')
    cidns_nodots = course_id.replace('/', '__').replace('.', '_').replace('-', '_')

    mypath = path(os.path.realpath(__file__)).dirname()
    edx2bigquery_context = {'lib': mypath / "lib",
                            'bin': mypath / "bin",
                        }

    the_template = settings['template'].format(**edx2bigquery_context)
    fnpre = settings['filename_prefix']
    lfn = "%s-%s.log" % (fnpre, cidns)
    if settings.get('logs_dir'):
        lfn = path(settings['logs_dir']) / lfn

    try:
        ofn = settings['script_fn'].format(filename_prefix=fnpre, cidns=cidns)
    except Exception as err:
        print "oops, errr %s" % str(err)
        print "settings=", json.dumps(settings, indent=4)
        raise
    cwd = os.getcwd()

    the_date = str(datetime.datetime.now())

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=param.use_dataset_latest)
    table_prefix = dataset

    if param.force_recompute:
        param.force_recompute = 1
    else:
        param.force_recompute = 0

    context = {'course_id': course_id,
               'script_name': ofn,
               'the_date': the_date,
               'cidns': cidns,
               'cidns_nodots': cidns,
               'template_file': the_template,
               'log_file': lfn,
               'filename_prefix': fnpre,
               'filename_prefix_cidns': "%s__%s" % (fnpre, cidns),
               'working_dir': cwd,
               'table_prefix': table_prefix,
               'lib_dir': edx2bigquery_context['lib'],
               'bin_dir': edx2bigquery_context['bin'],
    }
    context.update(settings)
    context.update(param.__dict__)

    rundir = settings['run_dir'].format(**context)
    runcmd = settings['script_cmd'].format(**context)

    tem = codecs.open(the_template).read()
    tem = unicode(tem)
    try:
        # script_file = tem.format(**context)
        script_file = Template(tem).render(**context)
    except Exception as err:
        print "Oops, cannot properly format template %s" % the_template
        print "Error %s" % str(err)
        print "context: ", json.dumps(context, indent=4)
        raise
    ofndir = path(ofn).dirname()
    if not os.path.exists(ofndir):
        print "[Warning] Directory %s doesn't exist - creating it" % ofndir
        os.mkdir(ofndir)
    fp = codecs.open(ofn, 'w', encoding="utf8")
    fp.write(script_file)
    fp.close()
    print "Generated %s" % ofn

    # if depends_on is defined, and force_recompute is not true, then skip
    # run if output already exists and is newer than all depends_on tables.

    depends_on = settings.get('depends_on')
    output_table = settings.get('output_table')
    if depends_on and not type(depends_on)==list:
        depends_on = [ depends_on ]
    do_compute = param.force_recompute
    if (not param.force_recompute) and depends_on and output_table:
        # does output already exist?
        has_output = False
        try:
            tinfo = bqutil.get_bq_table_info(dataset, output_table)
            if tinfo:
                has_output = True
        except:
            pass
        if not has_output:
            print "Output table %s.%s doesn't exist: running" % (dataset, output_table)
            do_compute = True
        else:
            table_date = tinfo['lastModifiedTime']
            for deptab in depends_on:
                try:
                    dtab_date = bqutil.get_bq_table_last_modified_datetime(dataset, deptab)
                except Exception as err:
                    raise Exception("[run_external] missing dependent table %s.%s" % (dataset, deptab))
                if not dtab_date:
                    raise Exception("[run_external] missing dependent table %s.%s" % (dataset, deptab))
                if table_date and dtab_date > table_date:
                    do_compute = True
                    break
            if not do_compute:
                print "Output table %s.%s exists and is newer than %s, skipping" % (dataset, output_table, depends_on)
            
    if do_compute:
        os.chdir(rundir)
        print "Working directory: %s" % rundir
        print "Logging to %s" % lfn
        print "Run command: %s" % runcmd
        sys.stdout.flush()
        if not param.skiprun:
            start = datetime.datetime.now()

            if param.submit_condor:
                condor_template_fn = settings.get('condor_job_template', '').format(**edx2bigquery_context)
                if not condor_template_fn:
                    raise Exception("[run_external] missing condor_job_template specification for %s" % (extcmd))
                condor_submit_fn = "CONDOR/{filename_prefix}-{cidns}.submit".format(**context)
                context.update({ 'MEMORY': 32768,
                                 'arguments': '{script_name}'.format(**context),
                                 'executable': context['script_cmd'],
                                 'input_file': '',
                                 'filename': condor_submit_fn,
                                 })
                condor_template = Template(open(condor_template_fn).read()).render(**context)
                dirs = ['CONDOR', 'JOBS']
                for dir in dirs:
                    if not os.path.exists(dir):
                        os.mkdir(dir)
                fp = open(condor_submit_fn, 'w')
                fp.write(condor_template)
                fp.close()
                cmd = "condor_submit %s" % condor_submit_fn
                print cmd
                jobid = None
                for k in os.popen(cmd):
                    m = re.search('submitted to cluster ([0-9]+)', k)
                    if m:
                        jobid = m.group(1)
                dt = str(datetime.datetime.now())
                jobfile = 'condor_jobs.csv'
                open(jobfile, 'a').write("%s,%s,%s,%s\n" % (course_id, dt, jobid, lfn))
                print "[%s] Submitted as condor job %s at %s" % (course_id, jobid, dt)
                # print "[run_external] submitted %s, job=%s" % (extcmd, jobnum)
                return
            else:
                os.system(runcmd)

            if settings.get('type')=="stata":
                # cleanup leftover log file after stata batch run
                batch_log = ofn.split('.')[0] + ".log"
                if os.path.exists(batch_log):
                    os.unlink(batch_log)
                    print "Removed old log file %s" % batch_log

            end = datetime.datetime.now()
            has_output = False
            try:
                tinfo = bqutil.get_bq_table_info(dataset, output_table)
                if tinfo:
                    has_output = True
            except:
                pass
            success = has_output
            dt = end-start
            print "[run_external] DONE WITH %s, success=%s, dt=%s" % (extcmd, success, dt)
            sys.stdout.flush()
            if param.parallel and not success:
                raise Exception("[run_external] External command %s failed on %s" % (extcmd, course_id))

コード例 #25

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: musixhine/edx2bigquery

def createVideoStats_obsolete(course_id,
                              force_recompute=False,
                              use_dataset_latest=False,
                              startDate=DATE_DEFAULT_START,
                              endDate=DATE_DEFAULT_END):
    '''
    Create video statistics for viewed by looking for users who had a video position > 0, and watched by looking for users who had a video
    position > 95% of the total video length duration.
    This was the original method used, but is not the most efficient since it queries entire log set. Instead, generate video stats per day, then incrementally
    append to that data table as the daily log data comes in.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_VIDEO_STATS

    the_sql = """
                 SELECT index_chapter,
                        index_video,
                        name,
                        video_id, 
                        chapter_name,
                        sum(case when position > 0 then 1 else 0 end) as videos_viewed, 
                        sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched,
                 FROM (SELECT username,
                              #module_id as video_id,
                              #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                              (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'[-]' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                              max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else  float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
                       FROM (TABLE_QUERY({logs},
                             "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start_date} and {end_date}"))
                       WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and
                              event is not null
                       group by username, video_id
                       order by username, video_id) as video_log,
                       LEFT JOIN EACH
                       (SELECT video_length,
                                video_id as vid_id,
                                name,
                                index_video,
                                index_chapter,
                                chapter_name
                        FROM [{dataset}.{videoaxis}]
                        ) as {videoaxis}
                        ON video_log.video_id = {videoaxis}.vid_id
                        WHERE video_id is not null
                        group by video_id, name, index_chapter, index_video, chapter_name
                        order by index_video asc;
                """.format(dataset=dataset,
                           start_date=startDate,
                           end_date=endDate,
                           logs=logs,
                           videoaxis=TABLE_VIDEO_AXIS)

    print "[analyze_videos] Creating %s.%s table for %s" % (
        dataset, TABLE_VIDEO_STATS, course_id)
    sys.stdout.flush()

    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        assert tinfo is not None, "[analyze videos] %s table depends on %s, which does not exist" % (
            TABLE_VIDEO_STATS, TABLE_VIDEO_AXIS)

    except (AssertionError, Exception) as err:
        print " --> Err: missing %s.%s?  Skipping creation of %s" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS)
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)],
    )
    return bqdat

コード例 #26

ファイルを表示

ファイル: make_forum_analysis.py プロジェクト: sibycharley/edx2bigquery

def CreateForumPerson(course_id,
                      force_recompute=False,
                      use_dataset_latest=False,
                      skip_last_day=False,
                      end_date=None,
                      has_hash_limit=False,
                      hash_limit=HASH):
    '''
    Create Forum Person table, based on forum events and forum posts tables. 
    This table contains both read and writes for all forum posts, for all users.
    '''

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_PERSON

    original_the_sql = """

                  SELECT (case when PP.username_fp is not null then PP.username_fp else FE.username_fe end) as username,
			 "{course_id}" as course_id,
                         (case when PP.username_fp is not null then PP.slug_id else FE.slug_id end) as slug_id,
                         (case when PP.username_fp is not null then PP.slug_type else FE.slug_type end) as slug_type,
                         (case when PP.username_fp is not null then PP.thread_id else FE.thread_id end) as thread_id,
                         (case when PP.username_fp is not null then PP.parent_id else FE.parent_id end) as parent_id,
                         (case when PP.original_poster is not null then PP.original_poster else FE.original_poster end) as original_poster,
                         (case when PP.responded_to is not null then PP.responded_to else FE.responded_to end) as responded_to,
                         (case when PP.username_fp is not null then PP.title else FE.title end) as title,
                         (case when PP.username_fp is not null then PP.wrote else 0 end) as wrote,
                         FE.read as read,
                         FE.pin as pinned,
                         FE.upvote as upvoted,
                         FE.unvote as unvoted,
                         #FE.del as deleted,
                         FE.follow as followed,
                         (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) <= TIMESTAMP(FE.last_time))
                                   then TIMESTAMP(PP.first_time)
                               else (case when PP.first_time is not null and FE.last_time is null
                                     then TIMESTAMP(PP.first_time) else
                                         (case when FE.first_time is not null
                                               then TIMESTAMP(FE.first_time)
                                              else FE.last_time end) end) end) as first_time,
                         (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) >= TIMESTAMP(FE.last_time))
                                   then TIMESTAMP(PP.first_time)
                               else (case when PP.first_time is not null and FE.last_time is null
                                     then TIMESTAMP(PP.first_time) else
                                         (case when FE.last_time is not null
                                               then TIMESTAMP(FE.last_time)
                                              else FE.first_time end) end) end) as last_time,


                  FROM
                  (
                          # Find 1st level posting => "original_post"
                          SELECT username as username_fp,
                                 slug_id,
                                 slug_type,
                                 thread_id,
                                 parent_id,
                                 original_poster,
                                 responded_to,
                                 title,
                                 1 as wrote,
                                 #created_at as first_time,
                                 first_time
                          FROM [{dataset}.{forum_posts}]
                          {hash_limit_where}
                          ORDER by username_fp, first_time
                  ) PP
                  FULL OUTER JOIN EACH
                  (
                          SELECT username as username_fe, 
                                 MIN(TIMESTAMP(time)) as first_time,
                                 MAX(TIMESTAMP(time)) as last_time,
                                 slug_id,
                                 FE.thread_id as thread_id,
                                 FIRST(parent_id) as parent_id,
				 F.slug_type as slug_type,
				 F.original_poster as original_poster,
				 F.responded_to as responded_to,
				 F.title as title,
                                 #1 as read,
                                 sum(case when forum_action = "read" or forum_action = "read_inline" then 1 else 0 end) as read,
                                 sum(case when forum_action = "pin" then 1 else 0 end) as pin,
                                 sum(case when forum_action = "upvote" then 1 else 0 end) as upvote,
                                 sum(case when forum_action = "unvote" then 1 else 0 end) as unvote,
                                 #sum(case when forum_action = "delete" then 1 else 0 end) as del,
                                 sum(case when forum_action = "follow_thread" then 1 else 0 end) as follow,      
                          FROM [{dataset}.{forum_events}] FE
                          JOIN EACH 
                          (
                                 SELECT username as username_fe,
                                        slug_id,
                                        slug_type,
                                        thread_id,
                                        parent_id,
                                        original_poster,
                                        responded_to,
                                        title,
                                        first_time,
                                 FROM [{dataset}.{forum_posts}]
                                 {hash_limit_where}
                          ) as F
                          ON F.thread_id = FE.thread_id
                          WHERE ((FE.forum_action = "read") or 
                                (FE.forum_action = "read_inline") or
                                (FE.forum_action = "pin") or 
                                (FE.forum_action = "upvote") or 
                                (FE.forum_action = "unvote") or
                                #(FE.forum_action = "delete") or
                                (FE.forum_action = "follow_thread"))
                                {hash_limit_and}
                          GROUP BY username_fe, slug_id, thread_id, slug_type, original_poster, responded_to, title
                  ) as FE
                  ON (PP.username_fp = FE.username_fe) AND (PP.slug_id = FE.slug_id)
                  WHERE (PP.username_fp is not null and PP.username_fp != '') or (FE.username_fe is not null and FE.username_fe != '')

              """

    the_sql = original_the_sql.format(dataset=dataset,
                                      course_id=course_id,
                                      forum=TABLE_FORUM,
                                      forum_posts=TABLE_FORUM_POSTS,
                                      forum_events=TABLE_FORUM_EVENTS,
                                      hash_limit_and='',
                                      hash_limit_where='')

    print "[make_forum_analysis] Creating %s.%s table for %s" % (
        dataset, TABLE_FORUM_PERSON, course_id)
    sys.stdout.flush()

    try:

        tinfo_fe = bqutil.get_bq_table_info(dataset, TABLE_FORUM_EVENTS)
        trows_fe = int(tinfo_fe['numRows'])
        print "[make_forum_analysis] %s Forum Events found " % trows_fe
        tinfo_fp = bqutil.get_bq_table_info(dataset, TABLE_FORUM_POSTS)
        trows_fp = int(tinfo_fp['numRows'])
        print "[make_forum_analysis] %s Forum Posts found " % trows_fp

        assert tinfo_fe is not None and trows_fe != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % (
            TABLE_FORUM_PERSON, TABLE_FORUM_EVENTS)
        assert tinfo_fp is not None and trows_fp != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % (
            TABLE_FORUM_PERSON, TABLE_FORUM_POSTS)

    except (AssertionError, Exception) as err:

        print " --> Err: missing %s.%s and/or %s (including 0 rows in table)?  Skipping creation of %s" % (
            dataset, TABLE_FORUM_POSTS, TABLE_FORUM_EVENTS, TABLE_FORUM_PERSON)
        sys.stdout.flush()
        return

    # Now try to create table
    try:

        if has_hash_limit:

            overwrite = True
            hash_limit = int(hash_limit)
            for k in range(hash_limit):
                hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % (
                    hash_limit, k)
                hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % (
                    hash_limit, k)

                retry_the_sql = original_the_sql.format(
                    dataset=dataset,
                    forum=TABLE_FORUM,
                    forum_posts=TABLE_FORUM_POSTS,
                    forum_events=TABLE_FORUM_EVENTS,
                    hash_limit_and=hash_limit_and,
                    hash_limit_where=hash_limit_where)
                print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                sys.stdout.flush()
                bqutil.create_bq_table(dataset,
                                       table,
                                       retry_the_sql,
                                       wait=True,
                                       overwrite=overwrite,
                                       allowLargeResults=True)
                overwrite = "append"

        else:

            overwrite = True
            bqutil.create_bq_table(dataset,
                                   table,
                                   the_sql,
                                   wait=True,
                                   overwrite=overwrite,
                                   allowLargeResults=True)

    except Exception as err:

        has_hash_limit = True
        if ((('Response too large to return.' in str(err)) or
             ('Resources exceeded during query execution' in str(err)))
                and has_hash_limit):

            # 'Resources exceeded during query execution'
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit

            try:

                for k in range(hash_limit):

                    hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % (
                        hash_limit, k)
                    hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % (
                        hash_limit, k)

                    retry_the_sql = original_the_sql.format(
                        dataset=dataset,
                        forum=TABLE_FORUM,
                        forum_posts=TABLE_FORUM_POSTS,
                        forum_events=TABLE_FORUM_EVENTS,
                        hash_limit_and=hash_limit_and,
                        hash_limit_where=hash_limit_where)
                    print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                    sys.stdout.flush()
                    bqutil.create_bq_table(dataset,
                                           table,
                                           retry_the_sql,
                                           wait=True,
                                           overwrite=overwrite,
                                           allowLargeResults=True)
                    overwrite = "append"

            except Exception as err:

                if ((('Response too large to return.' in str(err)) or
                     ('Resources exceeded during query execution' in str(err)))
                        and has_hash_limit):

                    hash_limit = int(hash_limit * 2.0)
                    print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit
                    CreateForumPerson(course_id,
                                      force_recompute,
                                      use_dataset_latest,
                                      skip_last_day,
                                      end_date,
                                      has_hash_limit=True,
                                      hash_limit=hash_limit)

                else:

                    print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
                    raise

        else:

            print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
            raise

    print "Done with Forum Person for %s (end %s)" % (course_id,
                                                      datetime.datetime.now())
    print "=" * 77
    sys.stdout.flush()

    return

コード例 #27

ファイルを表示

ファイル: reports.py プロジェクト: Excel-Chart/xanalytics

            def __getitem__(self, report_name):
                """
                Return the rendered HTML container for custom report `report_name`.

                Returns the string "Missing custom report <name>" when no
                report metadata is found, and an empty string when
                is_authorized_for_custom_report denies access or when the table
                named by the report's `require_table` meta-info is not found.

                `require_table` may contain format fields (filled from the
                report parameters) and may be given as "dataset.table"; without
                a dataset part, the dataset is derived from the `course_id`
                parameter, and the check is skipped if there is no course_id.

                The flags immediate_view, do_no_embed and force_embed on self
                are reset to False before the rendered template is returned.
                """
                lookup_err = None
                try:
                    crm = other.get_custom_report_metadata(report_name)
                except Exception as err:
                    crm = None
                    lookup_err = err
                if not crm:
                    logging.info("No custom report '%s' found, err=%s" % (report_name, lookup_err))
                    return "Missing custom report %s" % report_name

                # check access authorization
                # logging.info('[crc] checking auth for report %s, pdata=%s' % (crm.name, pdata))
                auth_ok, msg = is_authorized_for_custom_report(crm, pdata)
                if not auth_ok:
                    return ""            # return empty string if not authorized

                # logging.info('[cr] name=%s, title=%s' % (crm.name, crm.title))    # debugging

                title = JINJA_ENVIRONMENT.from_string(crm.title)
                try:
                    title_rendered = title.render(pdata)
                except Exception as err:
                    logging.error('[cr] Failed to render report %s title %s' % (crm.name, crm.title))
                    # fall back to the raw title text; without this the name
                    # title_rendered would be undefined further down
                    title_rendered = crm.title

                parameters = {x:v for x,v in pdata.items() if v is not None}
                parameters['orgname'] = other.ORGNAME
                parameters['dashboard_mode'] = other.MODE    # 'mooc' or '' (empty meaning residential, non-mooc)
                parameters['course_report'] = other.get_course_report_dataset()
                parameters['course_report_org'] = other.get_course_report_dataset(force_use_org=True)

                if 'require_table' in (crm.meta_info or []):
                    dataset = None
                    table = crm.meta_info['require_table']
                    if '{' in table:
                        try:
                            table = table.format(**parameters)
                        except Exception as err:
                            # keep going with the unformatted table name
                            logging.error("Cannot substitute for parameters in require_table=%s, err=%s" % (table, err))
                    if '.' in table:
                        (dataset, table) = table.split('.', 1)
                    else:
                        course_id = parameters.get('course_id')
                        if course_id:
                            try:
                                dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=other.use_dataset_latest())
                            except Exception as err:
                                logging.error("failed to get dataset for course_id=%s" % course_id)
                                raise
                        else:
                            logging.info("Suppressing report %s because dataset not specifid in require_table %s" % (title_rendered, table))
                            dataset = None

                    if dataset is not None:
                        try:
                            tinfo = bqutil.get_bq_table_info(dataset, table) or None
                        except Exception as err:
                            tinfo = None
                            # "Not Found" is the expected failure; log anything else
                            if "Not Found" not in str(err):
                                logging.error(err)
                                logging.error(traceback.format_exc())
                        if not tinfo:
                            logging.info("Suppressing report %s because %s.%s doesn't exist" % (title_rendered, dataset, table))
                            return ""
                    else:
                        logging.info("Skipping require_table check")

                report_id = hashlib.sha224("%s %s" % (crm.name, json.dumps(pdata))).hexdigest()
                if crm.description:
                    try:
                        crm.description = crm.description.format(**parameters)
                    except Exception as err:
                        logging.info('[cr] %s cannot format description %s' % (crm.name, crm.description))

                if self.do_no_embed and 'embedded' in (crm.meta_info or {}):
                    crm.meta_info.pop('embedded')

                if self.force_embed or self.do_always_show:
                    # meta_info may be unset (the rest of this method guards
                    # with `or {}`); make sure there is a dict to write into
                    if crm.meta_info is None:
                        crm.meta_info = {}
                    if self.force_embed:
                        crm.meta_info['embedded'] = True
                    if self.do_always_show:
                        crm.meta_info['always_show'] = True

                template = JINJA_ENVIRONMENT.get_template('custom_report_container.html')
                data = {'is_staff': other.is_superuser(),
                        'report': crm,
                        'report_params': json.dumps(parameters),
                        'report_is_staff': pdata.get('staff'),
                        'report_meta_info': json.dumps(crm.meta_info or {}),
                        'immediate_view': json.dumps(self.immediate_view),
                        'do_embed' : (crm.meta_info or {}).get('embedded') or self.force_embed,
                        'always_show': self.do_always_show,
                        'title': title_rendered,
                        'id': report_id,
                }
                self.immediate_view = False    # return to non-immediate view by default
                self.do_no_embed = False        # return to default
                self.force_embed = False        # return to default
                return template.render(data)

コード例 #28

ファイルを表示

ファイル: make_forum_analysis.py プロジェクト: sibycharley/edx2bigquery

def CreateForumPosts(course_id,
                     force_recompute=True,
                     use_dataset_latest=False,
                     skip_last_day=False,
                     end_date=None):
    '''
    Create Forum posts table, based on forum data. Categorizes forum posts as initial_post, response_post or comment.
    Also extracts the first POST_PREVIEW_CHAR_COUNT characters of the post content as a preview.

    course_id          -- course identifier; used to derive the dataset and stored in the course_id column
    force_recompute    -- passed to bqutil.get_bq_table as force_query
    use_dataset_latest -- passed to bqutil.course_id2dataset when deriving the dataset
    skip_last_day      -- accepted for interface compatibility; not used by this function
    end_date           -- accepted for interface compatibility; not used by this function

    Returns the result of bqutil.get_bq_table, or None (after printing a message)
    when the source forum table cannot be found.
    '''

    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_POSTS

    # The query unions three sub-selects: RC (comments on responses), RC2 (responses
    # to a thread) and NA (thread-opening posts).
    the_sql = """
                        SELECT * FROM
                        (
                             SELECT ADDED_TITLE.FA.username as username,
				    "{course_id}" as course_id,
                                    ADDED_TITLE.FA.slug_id as slug_id,
                                    ADDED_TITLE.FA.slug_type as slug_type,
                                    ADDED_TITLE.FA.thread_id as thread_id,
                                    ADDED_TITLE.FA.parent_id as parent_id,
                                    ADDED_TITLE.IP.username as original_poster,
                                    ADD_RESPOND_TO.username as responded_to,
                                    ADDED_TITLE.IP.title as title,
                                    ADDED_TITLE.FA.first_time as first_time,
                                    ADDED_TITLE.FA.body_preview as body_preview,
                             FROM 
                             (
                                  SELECT * FROM 
                                  (
                                       SELECT author_username as username,
                                              mongoid as slug_id,
                                              (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                              comment_thread_id as thread_id,
                                              parent_id,
                                              title,
                                              created_at as first_time,
                                              SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                       FROM [{dataset}.{forum}]
                                       WHERE (_type = "Comment" and parent_id is not null)
                                  ) as FA # 3rd level comment
                                  LEFT JOIN EACH
                                  ( 
                                       SELECT * FROM
                                       ( 
                                            SELECT author_username as username,
                                                   mongoid as slug_id,
                                                   (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                     (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                                   comment_thread_id as thread_id,
                                                   parent_id,
                                                   title,
                                                   created_at as first_time,
                                                   SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                            FROM [{dataset}.{forum}]
                                            WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                                       )
                                  ) as IP
                                  ON FA.thread_id = IP.slug_id
                             ) as ADDED_TITLE
                             LEFT JOIN EACH
                             (
                                  SELECT author_username as username,
                                         mongoid as slug_id,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                  FROM [{dataset}.{forum}]
                             ) as ADD_RESPOND_TO
                             ON ADDED_TITLE.FA.parent_id = ADD_RESPOND_TO.slug_id
                             WHERE ADDED_TITLE.FA.slug_type = "comment"
                        ) as RC,
                        (
                             SELECT FA.username as username,
				    "{course_id}" as course_id,
                                    FA.slug_id as slug_id,
                                    FA.slug_type as slug_type,
                                    FA.thread_id as thread_id,
                                    FA.parent_id as parent_id,
                                    IP.username as original_poster,
                                    IP.title as title,
                                    FA.first_time as first_time,
                                    FA.body_preview as body_preview
                             FROM 
                             (
                                  SELECT author_username as username,
                                         mongoid as slug_id,
                                         (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                           (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                         title,
                                         created_at as first_time,
                                         SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                  FROM [{dataset}.{forum}]
                                  WHERE (_type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null)
                             ) as FA # 2nd level comment
                             LEFT JOIN EACH
                             (
                                  SELECT * FROM 
                                  (    
                                       SELECT author_username as username,
                                              mongoid as slug_id,
                                              (case when _type = "Comment" and comment_thread_id is not null and parent_id is null and mongoid is not null then "response_post" else
                                                (case when _type = "Comment" and parent_id is not null then "comment" else null end) end) as slug_type,
                                              comment_thread_id as thread_id,
                                              parent_id,
                                              title,
                                              created_at as first_time,
                                              SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                       FROM [{dataset}.{forum}]
                                       WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                                  )
                             ) as IP
                             ON FA.thread_id = IP.slug_id
                        ) as RC2,
                        (
                             SELECT * FROM
                             (
                                  SELECT author_username as username,
				         "{course_id}" as course_id,
                                         mongoid as slug_id,
                                         (case when _type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null then "initial_post" end) as slug_type,
                                         comment_thread_id as thread_id,
                                         parent_id,
                                         title,
                                         created_at as first_time,
                                         SUBSTR(body, 0, {post_preview_char_count}) as body_preview,
                                  FROM [{dataset}.{forum}]
                                  WHERE (_type = "CommentThread" and comment_thread_id is null and parent_id is null and mongoid is not null)
                             )
                        ) as NA
              """.format(dataset=dataset,
                         course_id=course_id,
                         forum=TABLE_FORUM,
                         post_preview_char_count=POST_PREVIEW_CHAR_COUNT)

    print("[make_forum_analysis] Creating %s.%s table for %s" % (
        dataset, TABLE_FORUM_POSTS, course_id))
    sys.stdout.flush()

    # Every sub-select reads the raw forum table, so there is nothing to build
    # without it.  The check is an explicit comparison rather than an assert so
    # that it still runs when Python is started with -O.
    try:
        forum_missing = bqutil.get_bq_table_info(dataset, TABLE_FORUM) is None
    except Exception:
        # any lookup failure is treated the same as a missing table
        forum_missing = True

    if forum_missing:
        print(" --> Err: missing %s.%s?  Skipping creation of %s" % (
            dataset, TABLE_FORUM, TABLE_FORUM_POSTS))
        sys.stdout.flush()
        return

    return bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.%s" % (dataset, TABLE_FORUM)],
    )

コード例 #29

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: AbdouSeck/edx2bigquery

def createVideoStats( course_id, force_recompute=False, use_dataset_latest=False ):
    '''
    Final step for video stats is to run through daily video stats table and aggregate for entire course for videos watch and videos viewed
    Join results with video axis to get detailed metadata per video for dashboard data

    A video counts as "viewed" for a user when the maximum recorded position is > 0, and as
    "watched" when that position exceeds 95% of the video length.

    course_id          -- course identifier; used to derive the BigQuery dataset name
    force_recompute    -- passed to bqutil.get_bq_table as force_query
    use_dataset_latest -- passed to bqutil.course_id2dataset when deriving the dataset

    Returns the result of bqutil.get_bq_table, or None (after printing a message) when the
    video axis table or the per-day video stats table is missing or has zero rows.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    table = TABLE_VIDEO_STATS

    the_sql = """
              SELECT index_chapter,
                     index_video,
                     name,
                     video_id,
                     chapter_name,
                     sum(case when position > 0 then 1 else 0 end) as videos_viewed, 
                     sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched,
               FROM (
                     SELECT username, index_chapter,
                            index_video,
                            name,
                            video_id, 
                            chapter_name,
                            max(position) as position,
                            video_length,
                     FROM (SELECT * FROM [{dataset}.{videostatsperday}]) as video_log,
                           LEFT JOIN EACH
                          (SELECT video_length,
                                  video_id as vid_id,
                                  name,
                                  index_video,
                                  index_chapter,
                                  chapter_name
                           FROM [{dataset}.{videoaxis}]
                           ) as video_axis
                           ON video_log.video_id = video_axis.vid_id
                           WHERE video_id is not null and username is not null
                           group by username, video_id, name, index_chapter, index_video, chapter_name, video_length
                           order by video_id asc)
                GROUP BY video_id, index_chapter, index_video, name, chapter_name
                ORDER BY index_video asc;
                """.format(dataset=dataset, videoaxis=TABLE_VIDEO_AXIS, videostatsperday=TABLE_VIDEO_STATS_PER_DAY)

    print("[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS, course_id))
    sys.stdout.flush()

    # Both input tables must exist and be non-empty.  Each table is judged by its
    # OWN numRows (previously the per-day check reused the video axis row count),
    # and the checks are explicit comparisons so they still run under python -O.
    inputs_ready = True
    try:
        for required_table in ( TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS_PER_DAY ):
            tinfo = bqutil.get_bq_table_info( dataset, required_table )
            if tinfo is None or int(tinfo['numRows']) == 0:
                inputs_ready = False
                break
    except Exception:
        # lookup failures and malformed table info are treated like a missing table
        inputs_ready = False

    if not inputs_ready:
        print(" --> Err: missing %s.%s and/or %s (including 0 rows in table)?  Skipping creation of %s" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS_PER_DAY, TABLE_VIDEO_STATS ))
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute,
                                depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)],
                                )
    return bqdat

コード例 #30

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: AbdouSeck/edx2bigquery

def createVideoStats_obsolete( course_id, force_recompute=False, use_dataset_latest=False, startDate=DATE_DEFAULT_START, endDate=DATE_DEFAULT_END ):
    '''
    Create video statistics for viewed by looking for users who had a video position > 0, and watched by looking for users who had a video
    position > 95% of the total video length duration.
    This was the original method used, but is not the most efficient since it queries entire log set. Instead, generate video stats per day, then incrementally
    append to that data table as the daily log data comes in.

    course_id          -- course identifier; used to derive the course dataset and the logs dataset
    force_recompute    -- passed to bqutil.get_bq_table as force_query
    use_dataset_latest -- passed to bqutil.course_id2dataset when deriving the course dataset
    startDate, endDate -- interpolated into the TABLE_QUERY filter that selects tracklog tables
                          whose numeric table_id suffix lies BETWEEN the two values

    Returns the result of bqutil.get_bq_table, or None (after printing a message) when the
    video axis table cannot be found.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    logs = bqutil.course_id2dataset(course_id, dtype='logs')

    table = TABLE_VIDEO_STATS
    
    the_sql = """
                 SELECT index_chapter,
                        index_video,
                        name,
                        video_id, 
                        chapter_name,
                        sum(case when position > 0 then 1 else 0 end) as videos_viewed, 
                        sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched,
                 FROM (SELECT username,
                              #module_id as video_id,
                              #REGEXP_REPLACE(REGEXP_EXTRACT(JSON_EXTRACT(event, '$.id'), r'(?:i4x-)(.*)(?:"$)'), '-', '/') as video_id, # Old method takes full video id path
                              (case when REGEXP_MATCH( JSON_EXTRACT(event, '$.id') , r'[-]' ) then REGEXP_EXTRACT(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', ''), r'(?:.*\/)(.*)') else REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(JSON_EXTRACT(event, '$.id'), '-', '/'), '"', ''), 'i4x/', '') end) as video_id, # This takes video id only
                              max(case when JSON_EXTRACT_SCALAR(event, '$.speed') is not null then float(JSON_EXTRACT_SCALAR(event,'$.speed'))*float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) else  float(JSON_EXTRACT_SCALAR(event, '$.currentTime')) end) as position,
                       FROM (TABLE_QUERY({logs},
                             "integer(regexp_extract(table_id, r'tracklog_([0-9]+)')) BETWEEN {start_date} and {end_date}"))
                       WHERE (event_type = "play_video" or event_type = "pause_video" or event_type = "stop_video") and
                              event is not null
                       group by username, video_id
                       order by username, video_id) as video_log,
                       LEFT JOIN EACH
                       (SELECT video_length,
                                video_id as vid_id,
                                name,
                                index_video,
                                index_chapter,
                                chapter_name
                        FROM [{dataset}.{videoaxis}]
                        ) as {videoaxis}
                        ON video_log.video_id = {videoaxis}.vid_id
                        WHERE video_id is not null
                        group by video_id, name, index_chapter, index_video, chapter_name
                        order by index_video asc;
                """.format(dataset=dataset,start_date=startDate,end_date=endDate,logs=logs, videoaxis=TABLE_VIDEO_AXIS)

    print("[analyze_videos] Creating %s.%s table for %s" % (dataset, TABLE_VIDEO_STATS, course_id))
    sys.stdout.flush()

    # The join needs the video axis table.  Use an explicit None test rather than
    # an assert so the guard is not stripped when Python runs with -O.
    try:
        video_axis_missing = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS ) is None
    except Exception:
        # any lookup failure is treated the same as a missing table
        video_axis_missing = True

    if video_axis_missing:
        print(" --> Err: missing %s.%s?  Skipping creation of %s" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS ))
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(dataset, table, the_sql, force_query=force_recompute,
                                depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)],
                                )
    return bqdat

コード例 #31

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: AbdouSeck/edx2bigquery

def make_video_stats(course_id, api_key, basedir, datedir, force_recompute, use_dataset_latest, use_latest_sql_dir):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to count up videos viewed and videos watched

    course_id          -- course identifier; used to derive dataset and storage paths
    api_key            -- public API key handed to getYoutubeDurations; must not be None
    basedir, datedir   -- passed to find_course_sql_dir to locate the course SQL directory
    force_recompute    -- rebuild the video axis even if it exists; also forwarded to the stats steps
    use_dataset_latest -- forwarded to the dataset / path helpers and the stats steps
    use_latest_sql_dir -- OR-ed with use_dataset_latest when locating the course SQL directory

    Raises AssertionError when api_key is None, or when the freshly created video axis
    table yields no data.
    '''

    assert api_key is not None, "[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable"

    # Get Course Dir path
    basedir = path(basedir or '')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest or use_latest_sql_dir)
    
    # get schema (read inside a context manager so the file handle is closed promptly)
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % ( mypath, SCHEMA_VIDEO_AXIS )
    with open(SCHEMA_FILE) as schema_fp:
        the_schema = json.load(schema_fp)[ SCHEMA_VIDEO_AXIS_NAME ]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    va_date = None
    problem = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS )
        # Explicit None test (not an assert): under python -O an assert would be
        # stripped and a missing table would wrongly be flagged as existing.
        if tinfo is None:
            problem = "[analyze videos] %s.%s does not exist. First time creating table" % ( dataset, TABLE_VIDEO_AXIS )
        else:
            videoAxisExists = True
            va_date = tinfo['lastModifiedTime']		# datetime
    except Exception as err:
        problem = str(err)

    if problem is not None:
        print("%s --> Attempting to process %s table" % ( problem, TABLE_VIDEO_AXIS ))
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS )
        ca_date = tinfo['lastModifiedTime']		# datetime
    except Exception:
        # best effort: without a course axis date the staleness check below is skipped
        pass

    # A course axis newer than the video axis means the video axis is stale
    if videoAxisExists and (not force_recompute) and ca_date and va_date and (ca_date > va_date):
        force_recompute = True
        print("video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (va_date, ca_date))
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        # a rebuilt video axis also forces the downstream stats tables to be recomputed
        force_recompute = True
        createVideoAxis(course_id=course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        assert va is not None, "[analyze videos] Possibly no data in video axis table. Check course axis table"
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations( dataset=dataset, bq_table_input=va_bqdata, api_key=api_key, outputfilename=fileoutput, schema=the_dict_schema, force_recompute=force_recompute )

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(course_id, use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print("[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % ( dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS ))

    # Lastly, create video stats
    createVideoStats_day( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )
    createVideoStats( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

    # also create person_course_video_watched
    createPersonCourseVideo( course_id, force_recompute=force_recompute, use_dataset_latest=use_dataset_latest )

コード例 #32

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: musixhine/edx2bigquery

def createVideoStats(course_id,
                     force_recompute=False,
                     use_dataset_latest=False):
    '''
    Final step for video stats is to run through daily video stats table and aggregate for entire course for videos watch and videos viewed
    Join results with video axis to get detailed metadata per video for dashboard data

    A video counts as "viewed" for a user when the maximum recorded position is > 0, and as
    "watched" when that position exceeds 95% of the video length.

    course_id          -- course identifier; used to derive the BigQuery dataset name
    force_recompute    -- passed to bqutil.get_bq_table as force_query
    use_dataset_latest -- passed to bqutil.course_id2dataset when deriving the dataset

    Returns the result of bqutil.get_bq_table, or None (after printing a message) when the
    video axis table or the per-day video stats table is missing or has zero rows.
    '''
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)

    table = TABLE_VIDEO_STATS

    the_sql = """
              SELECT index_chapter,
                     index_video,
                     name,
                     video_id,
                     chapter_name,
                     sum(case when position > 0 then 1 else 0 end) as videos_viewed, 
                     sum(case when position > video_length*0.95 then 1 else 0 end) as videos_watched,
               FROM (
                     SELECT username, index_chapter,
                            index_video,
                            name,
                            video_id, 
                            chapter_name,
                            max(position) as position,
                            video_length,
                     FROM (SELECT * FROM [{dataset}.{videostatsperday}]) as video_log,
                           LEFT JOIN EACH
                          (SELECT video_length,
                                  video_id as vid_id,
                                  name,
                                  index_video,
                                  index_chapter,
                                  chapter_name
                           FROM [{dataset}.{videoaxis}]
                           ) as video_axis
                           ON video_log.video_id = video_axis.vid_id
                           WHERE video_id is not null and username is not null
                           group by username, video_id, name, index_chapter, index_video, chapter_name, video_length
                           order by video_id asc)
                GROUP BY video_id, index_chapter, index_video, name, chapter_name
                ORDER BY index_video asc;
                """.format(dataset=dataset,
                           videoaxis=TABLE_VIDEO_AXIS,
                           videostatsperday=TABLE_VIDEO_STATS_PER_DAY)

    print("[analyze_videos] Creating %s.%s table for %s" % (
        dataset, TABLE_VIDEO_STATS, course_id))
    sys.stdout.flush()

    # Both input tables must exist and be non-empty.  Each table is judged by its
    # OWN numRows (previously the per-day check reused the video axis row count),
    # and the checks are explicit comparisons so they still run under python -O.
    inputs_ready = True
    try:
        for required_table in (TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS_PER_DAY):
            tinfo = bqutil.get_bq_table_info(dataset, required_table)
            if tinfo is None or int(tinfo['numRows']) == 0:
                inputs_ready = False
                break
    except Exception:
        # lookup failures and malformed table info are treated like a missing table
        inputs_ready = False

    if not inputs_ready:
        print(" --> Err: missing %s.%s and/or %s (including 0 rows in table)?  Skipping creation of %s" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_STATS_PER_DAY,
            TABLE_VIDEO_STATS))
        sys.stdout.flush()
        return

    bqdat = bqutil.get_bq_table(
        dataset,
        table,
        the_sql,
        force_query=force_recompute,
        depends_on=["%s.%s" % (dataset, TABLE_VIDEO_AXIS)],
    )
    return bqdat

コード例 #33

ファイルを表示

ファイル: make_forum_analysis.py プロジェクト: kesiena115/edx2bigquery

def CreateForumPerson( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None, has_hash_limit=False, hash_limit=HASH ):
    '''
    Create Forum Person table, based on forum events and forum posts tables. 
    This table contains both read and writes for all forum posts, for all users.
    '''

    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)
    table = TABLE_FORUM_PERSON

    original_the_sql = """

                  SELECT (case when PP.username_fp is not null then PP.username_fp else FE.username_fe end) as username,
			 "{course_id}" as course_id,
                         (case when PP.username_fp is not null then PP.slug_id else FE.slug_id end) as slug_id,
                         (case when PP.username_fp is not null then PP.slug_type else FE.slug_type end) as slug_type,
                         (case when PP.username_fp is not null then PP.thread_id else FE.thread_id end) as thread_id,
                         (case when PP.username_fp is not null then PP.parent_id else FE.parent_id end) as parent_id,
                         (case when PP.original_poster is not null then PP.original_poster else FE.original_poster end) as original_poster,
                         (case when PP.responded_to is not null then PP.responded_to else FE.responded_to end) as responded_to,
                         (case when PP.username_fp is not null then PP.title else FE.title end) as title,
                         (case when PP.username_fp is not null then PP.wrote else 0 end) as wrote,
                         FE.read as read,
                         FE.pin as pinned,
                         FE.upvote as upvoted,
                         FE.unvote as unvoted,
                         #FE.del as deleted,
                         FE.follow as followed,
                         (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) <= TIMESTAMP(FE.last_time))
                                   then TIMESTAMP(PP.first_time)
                               else (case when PP.first_time is not null and FE.last_time is null
                                     then TIMESTAMP(PP.first_time) else
                                         (case when FE.first_time is not null
                                               then TIMESTAMP(FE.first_time)
                                              else FE.last_time end) end) end) as first_time,
                         (case when PP.first_time is not null and FE.last_time is not null and (TIMESTAMP(PP.first_time) >= TIMESTAMP(FE.last_time))
                                   then TIMESTAMP(PP.first_time)
                               else (case when PP.first_time is not null and FE.last_time is null
                                     then TIMESTAMP(PP.first_time) else
                                         (case when FE.last_time is not null
                                               then TIMESTAMP(FE.last_time)
                                              else FE.first_time end) end) end) as last_time,


                  FROM
                  (
                          # Find 1st level posting => "original_post"
                          SELECT username as username_fp,
                                 slug_id,
                                 slug_type,
                                 thread_id,
                                 parent_id,
                                 original_poster,
                                 responded_to,
                                 title,
                                 1 as wrote,
                                 #created_at as first_time,
                                 first_time
                          FROM [{dataset}.{forum_posts}]
                          {hash_limit_where}
                          ORDER by username_fp, first_time
                  ) PP
                  FULL OUTER JOIN EACH
                  (
                          SELECT username as username_fe, 
                                 MIN(TIMESTAMP(time)) as first_time,
                                 MAX(TIMESTAMP(time)) as last_time,
                                 slug_id,
                                 FE.thread_id as thread_id,
                                 FIRST(parent_id) as parent_id,
				 F.slug_type as slug_type,
				 F.original_poster as original_poster,
				 F.responded_to as responded_to,
				 F.title as title,
                                 #1 as read,
                                 sum(case when forum_action = "read" or forum_action = "read_inline" then 1 else 0 end) as read,
                                 sum(case when forum_action = "pin" then 1 else 0 end) as pin,
                                 sum(case when forum_action = "upvote" then 1 else 0 end) as upvote,
                                 sum(case when forum_action = "unvote" then 1 else 0 end) as unvote,
                                 #sum(case when forum_action = "delete" then 1 else 0 end) as del,
                                 sum(case when forum_action = "follow_thread" then 1 else 0 end) as follow,      
                          FROM [{dataset}.{forum_events}] FE
                          JOIN EACH 
                          (
                                 SELECT username as username_fe,
                                        slug_id,
                                        slug_type,
                                        thread_id,
                                        parent_id,
                                        original_poster,
                                        responded_to,
                                        title,
                                        first_time,
                                 FROM [{dataset}.{forum_posts}]
                                 {hash_limit_where}
                          ) as F
                          ON F.thread_id = FE.thread_id
                          WHERE ((FE.forum_action = "read") or 
                                (FE.forum_action = "read_inline") or
                                (FE.forum_action = "pin") or 
                                (FE.forum_action = "upvote") or 
                                (FE.forum_action = "unvote") or
                                #(FE.forum_action = "delete") or
                                (FE.forum_action = "follow_thread"))
                                {hash_limit_and}
                          GROUP BY username_fe, slug_id, thread_id, slug_type, original_poster, responded_to, title
                  ) as FE
                  ON (PP.username_fp = FE.username_fe) AND (PP.slug_id = FE.slug_id)
                  WHERE (PP.username_fp is not null and PP.username_fp != '') or (FE.username_fe is not null and FE.username_fe != '')

              """

    the_sql = original_the_sql.format( dataset=dataset, course_id=course_id, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and='', hash_limit_where='' )

    print "[make_forum_analysis] Creating %s.%s table for %s" % (dataset, TABLE_FORUM_PERSON, course_id)
    sys.stdout.flush()

    try:

        tinfo_fe = bqutil.get_bq_table_info( dataset, TABLE_FORUM_EVENTS )
        trows_fe = int(tinfo_fe['numRows'])
	print "[make_forum_analysis] %s Forum Events found " % trows_fe
        tinfo_fp = bqutil.get_bq_table_info( dataset, TABLE_FORUM_POSTS )
        trows_fp = int(tinfo_fp['numRows'])
	print "[make_forum_analysis] %s Forum Posts found " % trows_fp

        assert tinfo_fe is not None and trows_fe != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % ( TABLE_FORUM_PERSON, TABLE_FORUM_EVENTS )
        assert tinfo_fp is not None and trows_fp != 0, "[make_forum_analysis] %s table depends on %s, which does not exist" % ( TABLE_FORUM_PERSON, TABLE_FORUM_POSTS ) 

    except (AssertionError, Exception) as err:

        print " --> Err: missing %s.%s and/or %s (including 0 rows in table)?  Skipping creation of %s" % ( dataset, TABLE_FORUM_POSTS, TABLE_FORUM_EVENTS, TABLE_FORUM_PERSON )
        sys.stdout.flush()
        return

    # Now try to create table
    try:

        if has_hash_limit:

            overwrite = True
            hash_limit = int( hash_limit )
            for k in range( hash_limit ):
                hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % ( hash_limit, k )
                hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % ( hash_limit, k )

                retry_the_sql = original_the_sql.format( dataset=dataset, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and=hash_limit_and, hash_limit_where=hash_limit_where )
                print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                sys.stdout.flush()
                bqutil.create_bq_table( dataset, table, retry_the_sql, wait=True, overwrite=overwrite, allowLargeResults=True )
                overwrite = "append"

        else:

	    overwrite = True
            bqutil.create_bq_table(dataset, table, the_sql, wait=True, overwrite=overwrite, allowLargeResults=True)

    except Exception as err:

        has_hash_limit = True
        if ( (('Response too large to return.' in str(err)) or ('Resources exceeded during query execution' in str(err))) and has_hash_limit ):

            # 'Resources exceeded during query execution'
            # try using hash limit on username
            # e.g. WHERE ABS(HASH(username)) % 4 = 0
            print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit

            try:

                for k in range( hash_limit ):

		    hash_limit_where = "WHERE ABS(HASH(username)) %% %d = %d" % ( hash_limit, k )
		    hash_limit_and = "and ABS(HASH(username)) %% %d = %d" % ( hash_limit, k )

                    retry_the_sql = original_the_sql.format( dataset=dataset, forum=TABLE_FORUM, forum_posts=TABLE_FORUM_POSTS, forum_events=TABLE_FORUM_EVENTS, hash_limit_and=hash_limit_and, hash_limit_where=hash_limit_where )
                    print "[make_forum_analysis] Retrying with this query...", retry_the_sql
                    sys.stdout.flush()
                    bqutil.create_bq_table( dataset, table, retry_the_sql, wait=True, overwrite=overwrite, allowLargeResults=True )
                    overwrite = "append"
  
            except Exception as err:

                if ( (('Response too large to return.' in str(err)) or ('Resources exceeded during query execution' in str(err))) and has_hash_limit ):

                    hash_limit = int( hash_limit * 2.0 )
                    print '[make_forum_analysis] Response too large to return. Attempting to break down into multiple queries and append instead... using hash of %s' % hash_limit
                    CreateForumPerson( course_id, force_recompute, use_dataset_latest, skip_last_day, end_date, has_hash_limit=True, hash_limit=hash_limit )

                else:

                    print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
                    raise

        else:

	    print '[make_forum_analysis] An error occurred with this query: %s' % the_sql
            raise

    print "Done with Forum Person for %s (end %s)"  % (course_id, datetime.datetime.now())
    print "="*77
    sys.stdout.flush()

    return

コード例 #34

ファイルを表示

ファイル: make_video_analysis.py プロジェクト: musixhine/edx2bigquery

def make_video_stats(course_id, api_key, basedir, datedir, force_recompute,
                     use_dataset_latest):
    '''
    Create Video stats for Videos Viewed and Videos Watched.
    First create a video axis, based on course axis. Then use tracking logs to
    count up videos viewed and videos watched.

    The video axis is (re)built when it does not exist yet, when
    force_recompute is set, or when the course axis table reports a later
    lastModifiedTime than the existing video axis table.

    Parameters:
        course_id          -- course identifier handed to the bqutil / gsutil helpers
        api_key            -- API key passed on to getYoutubeDurations; must not be None
        basedir, datedir   -- forwarded to find_course_sql_dir to locate the local course directory
        force_recompute    -- rebuild the video axis even when the table already exists
        use_dataset_latest -- forwarded unchanged to the dataset / path helpers

    Raises:
        AssertionError -- if api_key is None, or if no video axis data can be
                          read back after the video axis has been (re)created.
    '''

    # Explicit raise instead of `assert`, so the check is not stripped by `python -O`.
    if api_key is None:
        raise AssertionError("[analyze videos]: Public API Key is missing from configuration file. Visit https://developers.google.com/console/help/new/#generatingdevkeys for details on how to generate public key, and then add to edx2bigquery_config.py as API_KEY variable")

    # Get Course Dir path
    basedir = path(basedir or '')
    lfp = find_course_sql_dir(course_id, basedir, datedir, use_dataset_latest)

    # get schema (the file sits next to this module); close the handle deterministically
    mypath = os.path.dirname(os.path.realpath(__file__))
    SCHEMA_FILE = '%s/%s' % (mypath, SCHEMA_VIDEO_AXIS)
    with open(SCHEMA_FILE) as schema_fp:
        the_schema = json.loads(schema_fp.read())[SCHEMA_VIDEO_AXIS_NAME]
    the_dict_schema = schema2dict(the_schema)

    # Create initial video axis
    videoAxisExists = False
    dataset = bqutil.course_id2dataset(course_id,
                                       use_dataset_latest=use_dataset_latest)
    va_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_VIDEO_AXIS)
        # Explicit check: with a bare `assert`, running under -O would mark a
        # missing table as existing before the lookup below failed.
        if tinfo is None:
            raise AssertionError(
                "[analyze videos] %s.%s does not exist. First time creating table" % (
                    dataset, TABLE_VIDEO_AXIS))
        videoAxisExists = True
        va_date = tinfo['lastModifiedTime']  # datetime
    except Exception as err:
        print("%s --> Attempting to process %s table" % (str(err),
                                                         TABLE_VIDEO_AXIS))
        sys.stdout.flush()

    # get course axis time
    ca_date = None
    try:
        tinfo = bqutil.get_bq_table_info(dataset, TABLE_COURSE_AXIS)
        ca_date = tinfo['lastModifiedTime']  # datetime
    except Exception:
        # Best effort only: without a course axis date the staleness check
        # below is simply skipped.
        pass

    # A course axis newer than the video axis makes the video axis stale.
    if videoAxisExists and (not force_recompute) and ca_date and va_date and (
            ca_date > va_date):
        force_recompute = True
        print("video_axis exists, but has date %s, older than course_axis date %s; forcing recompute" % (
            va_date, ca_date))
        sys.stdout.flush()

    if not videoAxisExists or force_recompute:
        # From here on, the downstream stats steps are forced to recompute too.
        force_recompute = True
        createVideoAxis(course_id=course_id,
                        force_recompute=force_recompute,
                        use_dataset_latest=use_dataset_latest)

        # Get video lengths
        va = bqutil.get_table_data(dataset, TABLE_VIDEO_AXIS)
        if va is None:
            raise AssertionError("[analyze videos] Possibly no data in video axis table. Check course axis table")
        va_bqdata = va['data']
        fileoutput = lfp / FILENAME_VIDEO_AXIS
        getYoutubeDurations(dataset=dataset,
                            bq_table_input=va_bqdata,
                            api_key=api_key,
                            outputfilename=fileoutput,
                            schema=the_dict_schema,
                            force_recompute=force_recompute)

        # upload and import video axis
        gsfn = gsutil.gs_path_from_course_id(
            course_id,
            use_dataset_latest=use_dataset_latest) / FILENAME_VIDEO_AXIS
        gsutil.upload_file_to_gs(fileoutput, gsfn)
        table = TABLE_VIDEO_AXIS
        bqutil.load_data_to_table(dataset, table, gsfn, the_schema, wait=True)

    else:
        print("[analyze videos] %s.%s already exists (and force recompute not specified). Skipping step to generate %s using latest course axis" % (
            dataset, TABLE_VIDEO_AXIS, TABLE_VIDEO_AXIS))

    # Lastly, create video stats
    createVideoStats_day(course_id,
                         force_recompute=force_recompute,
                         use_dataset_latest=use_dataset_latest)
    createVideoStats(course_id,
                     force_recompute=force_recompute,
                     use_dataset_latest=use_dataset_latest)

コード例 #35

ファイルを表示

ファイル: make_forum_analysis.py プロジェクト: kesiena115/edx2bigquery

def CreateForumEvents( course_id, force_recompute=False, use_dataset_latest=False, skip_last_day=False, end_date=None):
    '''
    Create forum events table, based on tracking logs.  Extracts all forum-related events, including forum post reads,
    into the date-time ordered table.  Repeated calls to this procedure will append new events to the table.  If no
    new events are found, the existing table is left unchanged.

    Parameters:
        course_id          -- course identifier; also embedded in the SQL as the course_id column
        force_recompute, use_dataset_latest, skip_last_day, end_date
                           -- forwarded unchanged to process_tracking_logs.run_query_on_tracking_logs

    The SQL keeps its {DATASETS}, {last_date} and {hash_limit} placeholders (and {course_id}) unformatted here;
    presumably run_query_on_tracking_logs fills them in -- confirm against that helper.
    '''
    dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=use_dataset_latest)

    table = TABLE_FORUM_EVENTS

    # event_type for forums may be like:
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/threads/5460c918a2a525003a0007fa
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/The_Subject/inline
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/users/4051854/followed
    #  /courses/UnivX/123.4x/2T2015/discussion/comments/54593f21a2a525003a000351/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545e4f5da2a5251aac000672/reply
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/upvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/545770e9dad66c17cd0001d5/unvote
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/5447c22e892b213c7b0001f3/update
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54493025892b2120a1000335/pin
    #  /courses/UnivX/123.4x/2T2015/discussion/threads/54492e9c35c79cb03e00030c/delete
    #  /courses/UnivX/123.4x/2T2015/discussion/forum/General/inline
    #  /courses/UnivX/123.4x/2T2015/instructor/api/list_forum_members
    #  /courses/UnivX/123.4x/2T2015/instructor/api/update_forum_role_membership
    #     \"GET\": {\"action\": [\"allow\"], \"rolename\": [\"Administrator\"], \"unique_student_identifier\": [\"NEW_ADMIN_USER\"]}}"}
    #
    # module_id will be like:
    # "module_id": "UnivX/123.4x/forum/54492f0c892b21597e00030a"

    the_sql = """
              SELECT time, 
                     username,
                     '{course_id}' as course_id,
                     (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/reply') then "reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/upvote') then "upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unvote') then "unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/update') then "update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/delete') then "delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/close') then "close"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/follow') then "follow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unfollow') then "unfollow_thread"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/pin') then "pin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/unpin') then "unpin"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/downvote') then "downvote"  # does this happen?
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/reply') then "comment_reply"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/upvote') then "comment_upvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/update') then "comment_update"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/unvote') then "comment_unvote"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/comments/[^/]+/delete') then "comment_delete"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+/followed') then "follow_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/users/[^/]+$') then "target_user"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then "read"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/inline') then "read_inline"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/search') then "search"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/$') then "enter_forum"
                           when REGEXP_MATCH(event_type, r'/courses/(.*)/instructor/api/(.*)') then REGEXP_EXTRACT(event_type, r'/courses/.*/instructor/api/(.*)')
                           when event_type = "edx.forum.thread.created" then "created_thread"
                           when event_type = "edx.forum.response.created" then "created_response"
                           when event_type = "edx.forum.comment.created" then "created_comment"
                           when event_type = "edx.forum.searched" then "searched"
                           else event_type end) as forum_action,
                           (case when module_id is not null then REGEXP_EXTRACT(module_id, r'[^/]+/[^/]+/forum/([^/]+)') # For old-school courses with transparent course ids
                                      else (case when module_id is null # otherwise, for new opaque course ids, use regex to find thread_id from event_type, since module_id is null
                                               then (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/forum/[^/]+/threads/[^/]+') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/forum/[^/]+/threads/([^/]+)') # read
                                                      else (case when REGEXP_MATCH(event_type, r'/courses/(.*)/discussion/threads/[^/]+/') then REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/threads/([^/]+)') # upvote, pinned, upvoted, unvoted, deleted, followed
                                                             else REGEXP_EXTRACT(event_type, r'/courses/.*/discussion/comments/([^/]+)/') end) # comment
                                                                end) end) end) as thread_id,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/([^/]+)/') as subject,
                     REGEXP_EXTRACT(event_type, r'/courses/.*/forum/users/([^/]+)') as target_user_id,
                     event_struct.query as search_query,   # unavailable before June 1, 2015
                     event_struct.GET as event_GET,        # unavailable before June 1, 2015
              FROM {DATASETS}
              WHERE  (REGEXP_MATCH(event_type ,r'^edx\.forum\..*')
                      or event_type contains "/discussion/forum"
                      or event_type contains "/discussion/threads"
                      or event_type contains "/discussion/comments"
                      or event_type contains "list-forum-"
                      or event_type contains "list_forum_"
                      or event_type contains "add-forum-"
                      or event_type contains "add_forum_"
                      or event_type contains "remove-forum-"
                      or event_type contains "remove_forum_"
                      or event_type contains "update_forum_"
                     ) 
                    AND username is not null
                    AND event is not null
                    and time > TIMESTAMP("{last_date}")
                    {hash_limit}
              order by time
              """

    # Purely informational: report whether the table already exists (append)
    # or is missing / unreadable (create).  Nothing below depends on the outcome.
    try:
        tinfo = bqutil.get_bq_table_info(dataset, table )
        # Explicit raise instead of `assert`, so the "missing table" message is
        # still produced when running under `python -O`.
        if tinfo is None:
            raise AssertionError("[make_forum_analysis] Creating %s.%s table for %s" % (dataset, table, course_id))

        print("[make_forum_analysis] Appending latest data to %s.%s table for %s" % (dataset, table, course_id))
        sys.stdout.flush()

    except Exception as err:
        print(str(err))
        sys.stdout.flush()
        print(" --> Missing %s.%s?  Attempting to create..." % ( dataset, table ))
        sys.stdout.flush()

    print("=== Processing Forum Events for %s (start %s)"  % (course_id, datetime.datetime.now()))
    sys.stdout.flush()

    def gdf(row):
        # Convert the row's 'time' value (epoch seconds) into a naive UTC datetime.
        return datetime.datetime.utcfromtimestamp(float(row['time']))

    process_tracking_logs.run_query_on_tracking_logs(the_sql, table, course_id, force_recompute=force_recompute,
                                                     use_dataset_latest=use_dataset_latest,
                                                     get_date_function=gdf,
                                                     has_hash_limit=True,
                                                     end_date=end_date,
                                                     skip_last_day=skip_last_day
                                                    )

    print("Done with Forum Events for %s (end %s)"  % (course_id, datetime.datetime.now()))
    print("="*77)
    sys.stdout.flush()

コード例 #36

ファイルを表示

ファイル: make_research_data_tables.py プロジェクト: sibycharley/edx2bigquery

    def __init__(
        self,
        course_id_set,
        basedir='',
        datedir='',
        output_project_id=None,
        nskip=0,
        output_dataset_id=None,
        output_bucket=None,
        use_dataset_latest=False,
        only_step=None,
        end_date=None,
    ):
        '''
        Extract Research Datasets, based on defined list of tables.

        For every table name in RESEARCH_DATA_PRODUCTS that exists in the
        course dataset, record it in self.rdp_matrix as
        {table name: (course_id, dataset)} and then call
        self.extractResearchData for it (csv format).

        Parameters:
            course_id_set      -- course id(s) to process; if empty, an error is
                                  printed and construction stops early.
                                  NOTE(review): it is indexed ([0]) to derive the
                                  org, yet also passed whole to
                                  bqutil.course_id2dataset -- confirm what
                                  callers actually pass.
            basedir, datedir   -- forwarded to extractResearchData
            output_project_id  -- stored as self.output_project_id
            nskip              -- accepted but not used in this method
            output_dataset_id  -- overrides the default course_report_* dataset name
            output_bucket      -- forwarded to gsutil.gs_path_from_course_id and extractResearchData
            use_dataset_latest -- use 'course_report_latest' and the latest course dataset
            only_step          -- step name, or comma-separated names (split into a list)
            end_date           -- stored as self.end_date
        '''

        if only_step and ',' in only_step:
            only_step = only_step.split(',')
        self.only_step = only_step

        self.end_date = end_date

        if not course_id_set:
            print("ERROR! Must specify list of course_id's for report.  Aborting.")
            return

        # extract org from first course_id
        org = course_id_set[0].split('/', 1)[0]
        self.org = org

        self.output_project_id = output_project_id

        crname = ('course_report_%s' % org)
        if use_dataset_latest:
            crname = 'course_report_latest'
        self.dataset = output_dataset_id or crname

        self.gsbucket = gsutil.gs_path_from_course_id(crname,
                                                      gsbucket=output_bucket)
        self.course_id_set = course_id_set
        course_id = course_id_set

        course_dataset = bqutil.course_id2dataset(
            course_id, use_dataset_latest=use_dataset_latest)

        # Maps research data product (table name) -> (course_id, dataset).
        self.rdp_matrix = collections.OrderedDict()

        print("[researchData] Processing data for course %s" % (course_id))
        sys.stdout.flush()
        for rdp in RESEARCH_DATA_PRODUCTS.keys():
            try:
                table = bqutil.get_bq_table_info(course_dataset, rdp)
            except Exception:
                print("[researchData] Err: %s not found for %s dataset" % (
                    rdp, course_id))
                continue
            if table is None:
                continue

            print("[researchData] %s found" % (rdp))
            sys.stdout.flush()
            # One (course_id, dataset) tuple per product.  The former
            # "already present -> .append()" branch was dropped: it called
            # .append on a tuple, which can only raise AttributeError.
            self.rdp_matrix[str(rdp)] = (course_id, course_dataset)

        # Extract to archival storage
        for researchDataProduct, (rdp_course_id, the_dataset) in self.rdp_matrix.items():
            self.extractResearchData(course_id=rdp_course_id,
                                     tablename=researchDataProduct,
                                     the_dataset=the_dataset,
                                     rdp=researchDataProduct,
                                     rdp_format='csv',
                                     output_bucket=output_bucket,
                                     basedir=basedir,
                                     datedir=datedir)

        print("=" * 100)
        print("Done extracting Research Data tables -> %s" % RESEARCH_DATA_PRODUCTS.keys())
        print("=" * 100)
        sys.stdout.flush()

コード例 #37

ファイルを表示

ファイル: custom_reports.py プロジェクト: mitodl/xanalytics

    def actual_ajax_get_report_data(self, report_name=None):
        '''
        get data for custom report.
        parameters like course_id, chapter_id, problem_id are passed in as GET or POST parameters

        Defined parameters for SQL:

        {person_course} --> person_course table for the specific course
        {dataset} --> dataset for the specific course
        {course_report} --> course_report_* dataset for the ORG or latest
        {course_report_org} --> course_report_ORG dataset for ORG = ORGANIZATION_NAME
        {orgname} --> organization name
        
        '''
        crm, pdata, auth_ok, msg = self.custom_report_auth_check(report_name)	# crm = CourseReport model
        if not auth_ok:
            return self.no_auth_sorry()
        course_id = pdata['course_id']
        force_query = pdata.get('force_query', False)
        if force_query == 'false':
            force_query = False
        ignore_cache = pdata.get('ignore_cache', False) or force_query

        # project_id specified?
        optargs = {}
        if 'project_id' in (crm.meta_info or {}):
            optargs['project_id'] = crm.meta_info['project_id']

        if course_id:
            dataset = bqutil.course_id2dataset(course_id, use_dataset_latest=self.use_dataset_latest())
            pdata['person_course'] = '[%s.person_course]' % dataset
        elif 'dataset' in (crm.meta_info or {}):
            dataset = crm.meta_info['dataset']
        else:
            dataset = self.get_course_report_dataset()
            # using course report dataset; list the tables, to determine which is the latest
            # person_course dataset, and use that for {person_course}
            pdata['person_course_latest'] = self.find_latest_person_course_table(dataset, project_id=optargs.get('project_id'))
            pdata['person_course'] = '[%s.%s]' % (dataset, pdata['person_course_latest'])
        pdata['dataset'] = dataset
        pdata['course_report'] = self.get_course_report_dataset()
        pdata['course_report_org'] = self.get_course_report_dataset(force_use_org=True)
        pdata['orgname'] = self.ORGNAME
        pdata['sane_username'] = self.user.replace(' ', '_').replace('.', '_').replace('@', '_')

        if 'module_id' in pdata and pdata['module_id']:
            url_name = pdata['module_id'].rsplit('/',1)[-1]
            pdata['module_url_name'] = url_name.replace(':','__').replace('-','_')

        # what table?  get custom course report configuration metadata for report name as specified
        table = crm.table_name
        if not table or table=="None":
            error = "No table name defined!  Cannot process this custom report"
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'   
            self.response.out.write(json.dumps(data))
            return

        # multiple table names?  use parameters to select one
        if ',' in table:
            tables = table.split(',')
            try:
                table_number = int(pdata.get('table_number', 0) or 0)
                table = tables[table_number]
            except Exception as err:
                raise Exception("[custom_reports] Cannot select table from tables=%s, table_number=%s, err=%s" % (tables, pdata.get('table_number'), err))

        # allow parameters in table name
        if '{' in table:
            table = table.format(**pdata)
            table = table.replace('-', '_').replace(' ', '_')
        if not ('dataset' in (crm.meta_info or {})) and not table.startswith('stats_') and not (crm.meta_info.get('no_stats_ok')):
            table = "stats_" + table

        # special handling for person_course table from particular dataset
        for m in re.findall('{person_course__([^ \}]+)}', crm.sql):
            org = m
            org_dataset = self.get_course_report_dataset(orgname=org)
            pcd = '[%s.%s]' % (org_dataset, self.find_latest_person_course_table(org_dataset, project_id=optargs.get('project_id')))
            pdata['person_course__' + org] = pcd
            logging.info('[cr] org=%s, pc=%s.%s' % (org, org_dataset, pcd))

        # special handling for course_report tables for specific orgs
        for m in re.findall('{course_report__([^ \}]+)}', crm.sql):
            org = m
            org_dataset = self.get_course_report_dataset(orgname=org)
            pdata['course_report__' + org] = org_dataset

        logging.info("Using %s for custom report %s person_course" % (pdata.get('person_course'), report_name))

        error = None

        def setup_sql_flags():
            '''
            Make sure pdata['sql_flags'], when present, is a dict.

            A non-dict value is parsed as JSON and the result stored back into
            pdata.  If parsing fails, the problem is logged and re-raised as a
            plain Exception carrying the offending value and the parse error.
            '''
            if 'sql_flags' not in pdata:
                return
            raw_flags = pdata['sql_flags']
            if isinstance(raw_flags, dict):
                return
            try:
                pdata['sql_flags'] = json.loads(raw_flags)
            except Exception as err:
                msg = "Cannot parse sql_flags as JSON!  sql_flags=%s" % (raw_flags,)
                # Format the exception into the message; concatenating the
                # exception object itself to a str raises TypeError.
                msg += " err=%s" % (err,)
                logging.error(msg)
                raise Exception(msg)

        # dynamic sql: the SQL is allowed to change based on input parameters
        # do this by treating the SQL as a jinja2 tempate
        if crm.meta_info.get('dynamic_sql'):
            setup_sql_flags()
            # a little sanity checking - disallow spaces in any sql_flags values
            sf = pdata['sql_flags']
            for k in sf:
                if ' ' in sf[k]:
                    msg = "Illegal sql_flags %s=%s!" % (k, sf[k])
                    msg += "sql_flags = %s" % json.dumps(sf, indent=4)
                    error = "<pre>%s</pre>" % (msg.replace('<','&lt;').replace('<','&gt;'))
                    data = {'error': error}
                    self.response.headers['Content-Type'] = 'application/json'   
                    self.response.out.write(json.dumps(data))
                    return

            try:
                sql_template = Template(crm.sql)
                sql = sql_template.render(pdata)
            except Exception as err:
                msg = 'Custom report data: failed to render dynamic SQL with pdata=%s, err=%s' % (pdata, err)
                logging.error(msg)
                logging.error('sql=%s' % crm.sql)
                error = "<pre>%s</pre>" % (msg.replace('<','&lt;').replace('<','&gt;'))
                data = {'error': error}
                self.response.headers['Content-Type'] = 'application/json'   
                self.response.out.write(json.dumps(data))
                return

            # append username to table name
            table = table + "_%s" % pdata['sane_username']

            force_query = True		# for now, all dynamic_sql is done with force_query
            ignore_cache = True
            the_sql = sql
        else:
            the_sql = crm.sql

        if 1:
            # generate SQL and depends_on
            try:
                sql = the_sql.format(**pdata)
            except Exception as err:
                msg = "Custom report data: failed to prepare SQL, err=%s" % str(err)
                msg += '\npdata = %s' %  pdata
                logging.error(msg)
                if self.is_superuser():
                    error = "<pre>%s</pre>" % (str(msg).replace('<','&lt;').replace('<','&gt;'))
                    data = {'error': error}
                    self.response.headers['Content-Type'] = 'application/json'   
                    self.response.out.write(json.dumps(data))
                    logging.error("Returning with error message")
                    return
                raise

        def strip_brackets(x):
            x = x.strip()
            if x.startswith('[') and x.endswith(']'):
                x = x[1:-1]
                return x
            return x

        if crm.meta_info.get('debug_sql'):
            msg = "debug_sql is true; not running SQL.  This is the SQL which would have been run:\n"
            msg += sql
            msg += "\n\nwith these parameters:\n"
            msg += json.dumps(pdata, indent=4)
            msg += "\n\nproducing the output table: %s.%s\n" % (dataset, table)
            error = "<pre>%s</pre>" % (msg.replace('<','&lt;').replace('<','&gt;'))
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'   
            self.response.out.write(json.dumps(data))
            return

        try:
            if crm.depends_on and (not crm.depends_on=="None"):
                depends_on = [ strip_brackets(x.format(**pdata)) for x in (json.loads(crm.depends_on or "[]")) ]
            else:
                depends_on = None
        except Exception as err:
            logging.error("for course report %s, cannot process depends_on=%s" % (report_name, crm.depends_on))
            raise Exception("Bad format for the 'depends_on' setting in the custom report specification")
            raise

        # get the data, and do query if needed

        logging.info('custom report get_report_data name=%s, table=%s.%s, depends_on=%s, pdata=%s' % (report_name, dataset, table, depends_on, pdata))

        the_msg = []

        def my_logger(msg):
            logging.info(msg)
            the_msg.append(msg)

        def output_error(err):
            '''
            Log err (with the current traceback) and write it to the response
            as a JSON payload of the form {'error': <html>}.

            For superusers the payload additionally carries the messages
            collected in the_msg, the SQL, the parameters (pdata) and optargs.
            '''
            # Escape both angle brackets ('>' was previously left unescaped
            # because '<' was replaced twice).
            error = "<pre>%s</pre>" % (str(err).replace('<','&lt;').replace('>','&gt;'))
            logging.error('custom report error %s' % error)
            logging.error(err)
            logging.error(traceback.format_exc())
            if self.is_superuser():
                msg = ('\n'.join(the_msg))
                msg = msg.replace('<','&lt;').replace('>','&gt;')
                error += "<pre>%s</pre>" % msg
                error += "SQL: <pre>%s</pre>" % sql
                error += "Parameters: <pre>%s</pre>" % json.dumps(pdata, indent=4)
                error += "optargs: <pre>%s</pre>" % json.dumps(optargs, indent=4)
            data = {'error': error}
            self.response.headers['Content-Type'] = 'application/json'   
            self.response.out.write(json.dumps(data))

        # is the request "indexed", meaning only matching rows of the table are to be returned?
        indexed_column = crm.meta_info.get('indexed')
        if indexed_column:
            if type(indexed_column)==list:
                indexed_columns = indexed_column
                try:
                    table_number = int(pdata.get('table_number', 0) or 0)
                    indexed_column = indexed_columns[table_number]
                except Exception as err:
                    raise Exception("[custom_reports] Cannot select indexed_column from indexed_columns=%s, table_number=%s, err=%s" % (indexed_columns, 
                                                                                                                                        pdata.get('table_number'), 
                                                                                                                                        err))
            setup_sql_flags()
            indexed_value = pdata.get('sql_flags', {}).get('indexed_value')
            logging.info("[custom_reports] retrieving %s.%s with indexing on %s to match value %s" % (dataset, table, indexed_column, indexed_value))
            if not indexed_value:
                my_logger('Error: missing sql_flags.indexed_value to match indexed column %s in %s.%s' % (indexed_column, dataset, table))
                data = {'error': msg}
                self.response.headers['Content-Type'] = 'application/json'   
                self.response.out.write(json.dumps(data))
                return
            # ensure existence of indexed version of table.  By convention, that is a table named tablename + "__indexed_" + indexed_column
            # the table has a SHA1 hash of the indexed column added, and is sorted according to the last few characters
            # of the SHA1 hash.
            indexed_table = table + "__indexed_" + indexed_column
            indexing_sql_template = """select *,
                                  SUBSTR(TO_BASE64(SHA1(STRING({indexed_column}))),-3,2) as index_sha1_2ch,
                                  ROW_NUMBER() over (order by index_sha1_2ch, {indexed_column}) as index_row_number{subnum},
                              from [{dataset}.{table}]
                              {where_clause}
                              order by index_sha1_2ch, {indexed_column}
                           """
            indexing_sql = indexing_sql_template.format(dataset=dataset, table=table, indexed_column=indexed_column, 
                                                        where_clause="",
                                                        subnum="")
            try:
                bqdata = self.cached_get_bq_table(dataset, indexed_table,
                                                  sql=indexing_sql,
                                                  logger=my_logger,
                                                  depends_on=["%s.%s" % (dataset, table)],
                                                  raise_exception=True,
                                                  ignore_cache=ignore_cache,
                                                  force_query=force_query,
                                                  startIndex=0,
                                                  maxResults=1,
                                                  **optargs
                                                )
            except Exception as err:
                if "Response too large to return" in str(the_msg):
                    # hmm - table too large!  can't workaround using allowLargeResult because the return results
                    # need to be ordered.  So let's break it up into multiple queries, appending each,
                    # by index_sha1_2ch
                    b64chars = "+/0123456789" + ''.join(map(chr,range(ord('A'), ord('Z')+1))) + ''.join(map(chr,range(ord('a'), ord('z')+1)))
                    # get table size, divide by 64M, to get number of divisions to use
                    tinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
                    nbytes = int(tinfo['numBytes'])
                    ndivs = int(round(nbytes / (64*1024*1024)))
                    logging.info("Response too large - nbytes=%s, so trying ndivs=%s" % (nbytes, ndivs))
                    end_idx = None
                    start_idx = None
                    dn = int(64 / ndivs)
                    offset = dn
                    overwrite = True
                    nrows = 0
                    while (offset < 65):
                        start_idx = end_idx
                        last_row_index = nrows	# note ROW_NUMBER() starts with 1 (not zero)
                        if (offset < 64):
                            end_idx = b64chars[offset] + "+"
                        else:
                            end_idx = None	# boundary case
                        wc = "where "
                        if start_idx:
                            wc += '(SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) >= "%s") ' % (indexed_column, start_idx)
                        else:
                            wc += "True "
                        if end_idx:
                            wc += 'AND (SUBSTR(TO_BASE64(SHA1(STRING(%s))),-3,2) < "%s") ' % (indexed_column, end_idx)
                        logging.info("--> start_idx=%s, end_idx=%s, starting row %d" % (start_idx, end_idx, last_row_index))
                        tmp_sql = indexing_sql_template.format(dataset=dataset, table=table, indexed_column=indexed_column, 
                                                               where_clause=wc, subnum="_sub")
                        indexing_sql = "SELECT *, index_row_number_sub + %d as index_row_number FROM (%s)" % (last_row_index, tmp_sql)
                        try:
                            bqutil.create_bq_table(dataset, indexed_table,
                                                   sql=indexing_sql,
                                                   overwrite=overwrite,
                                                   logger=my_logger,
                                                   **optargs
                                               )
                            cnt = 0
                            tinfo = None
                            while (not tinfo) and (cnt < 10):
                                tinfo = bqutil.get_bq_table_info(dataset, indexed_table, **optargs)
                                if not tinfo:
                                    logging.info("==> ERROR?  got unexpected None for get_bq_table_info %s.%s" % (dataset, indexed_table))
                                    time.sleep(10)
                                    cnt += 1

                            nrows = int(tinfo['numRows'])
                            logging.info("--> Result from %s to %s has %d rows" % (start_idx, end_idx, nrows))
                        except Exception as err:
                            bqdata = {'data': None}
                            sql = indexing_sql
                            output_error(err)
                            return
                        overwrite = "append"
                        offset += dn
                        
                else:
                    bqdata = {'data': None}
                    sql = indexing_sql
                    output_error(err)
                    return

            # now ensure table index, and retrieve it.  It has just two columns: index_sha1_2ch, start_row
            tindex_table = table + "__index_for_" + indexed_column
            tindex_sql = """SELECT index_sha1_2ch, 
                                min(index_row_number) as start_row,
                                # max(index_row_number) as end_row,   # don't need this - just take next start_row
                            FROM [{dataset}.{indexed_table}]
                            group by index_sha1_2ch
                            order by index_sha1_2ch
                         """.format(dataset=dataset, indexed_table=indexed_table)
            try:
                bqdata = self.cached_get_bq_table(dataset, tindex_table,
                                                    sql=tindex_sql,
                                                    logger=my_logger,
                                                    depends_on=["%s.%s" % (dataset, indexed_table)],
                                                    raise_exception=True,
                                                    ignore_cache=ignore_cache,
                                                    force_query=force_query,
                                                    startIndex=0,
                                                    maxResults=10000,
                                                    **optargs
                                                )
            except Exception as err:
                bqdata = {'data': None}
                sql = tindex_sql
                output_error(err)
                return

            # find the start and end rows to retrieve, based on the last characters of the SHA1 hash of the indexed value
            sha1_2ch = base64.b64encode(hashlib.sha1(indexed_value).digest())[-3:-1]
            start_row = None
            end_row = None
            for k in bqdata['data']:
                if start_row and not end_row:
                    end_row =  int(k['start_row'])
                if (k['index_sha1_2ch']==sha1_2ch):
                    start_row = int(k['start_row'])
            logging.info("Retrieving iv=%s, sha1_2ch=%s, rows %s to %s of %s.%s" % (indexed_value,
                                                                                    sha1_2ch,
                                                                                    start_row, end_row, dataset, indexed_table))
            if not start_row:
                output_error("Cannot find %s=%s in %s.%s" % (indexed_column, indexed_value,
                                                             dataset, indexed_table))
                bqdata = {'data': None}
                return
                
            max_results = (end_row or (start_row+4000)) - start_row
            bqdata = self.cached_get_bq_table(dataset, indexed_table, ignore_cache=True,
                                              startIndex=start_row-1,
                                              maxResults=max_results,
                                              **optargs
                                            )

            # extract just the row(s) with indexed_column value matching indexed_value (the hash is many to one)
            newdata = []
            for k in range(len(bqdata['data'])):
                datum = bqdata['data'][k]
                if (datum[indexed_column]==indexed_value):
                    newdata.append(datum)
            logging.info("--> Result has %d items, of which %d match" % (len(bqdata['data']), len(newdata)))
            bqdata['data'] = newdata                
            
            table = indexed_table	# so that columns are retrieved properly
            
        if not indexed_column:
            # retrieve full table
            try:
                bqdata = self.cached_get_bq_table(dataset, table, 
                                                  sql=sql,
                                                  logger=my_logger,
                                                  depends_on=depends_on,
                                                  startIndex=int(pdata['start'] or 0), 
                                                  maxResults=int(pdata['length'] or 100000),
                                                  raise_exception=True,
                                                  ignore_cache=ignore_cache,
                                                  force_query=force_query,
                                                  **optargs
                )
                self.fix_bq_dates(bqdata)
            except Exception as err:
                bqdata = {'data': None}
                output_error(err)
                return

        tablecolumns = []
        if pdata['get_table_columns']:
            try:
                tableinfo = bqutil.get_bq_table_info(dataset, table, **optargs)
            except Exception as err:
                error = (error or "\n") + str(err)
                tableinfo = None
                raise

            if tableinfo:
                fields = tableinfo['schema']['fields']
                field_names = [x['name'] for x in fields]
                tablecolumns = [ { 'data': x, 'title': x, 'class': 'dt-center' } for x in field_names ]

        data = self.common_data.copy()
        data.update({'data': bqdata['data'],
                     'draw': pdata['draw'],
                     'last_modified_date': str(bqdata.get('last_modified_date')),
                     'fields': bqdata['fields'],
                     'recordsTotal': bqdata.get('numRows', 0),
                     'recordsFiltered': bqdata.get('numRows', 0),
                     'error': error,
                     'tablecolumns': tablecolumns,
                     'output_table': table,
                     'output_dataset': dataset,
                 })
        
        
        # logging.info('[cr] data=%s' % data)

        self.response.headers['Content-Type'] = 'application/json'   
        self.response.out.write(json.dumps(data))