Beispiel #1
0
def GetWikiEdits(vars):
    """Return one collaboration item per row of the source ``wiki_revisions`` table.

    The revisions are read from the general source DB, joined to
    ``hash_mapping`` (in ``vars['source']['hash_mapping_db']``) on the column
    named by ``vars['general_anon_col_name']``. When
    ``vars['options']['debug']`` is truthy the query is restricted to the
    users listed in ``vars['hash_map']['qls_general']``.

    Returns:
        list of dicts. ``original_id`` is synthesised as ``wiki_edit_<n>``
        from the row's zero-based position in the query result, and
        ``user_original_id`` is looked up in ``vars['hash_map']['map_general']``.
    """
    src = vars['source']
    selector = db.Selector(src['host'], src['user'], src['password'], src['port'], src['general_db'])

    edit_type_id = moocdb_utils.GetCollaborationTypeMap(vars)['wiki_edit']
    anon_col = vars['general_anon_col_name']

    query_parts = [
        "SELECT * FROM wiki_revisions JOIN `{0}`.hash_mapping USING ({1})".format(
            src['hash_mapping_db'], anon_col)
    ]
    if vars['options']['debug']:
        # Debug runs only look at the sampled users.
        query_parts.append(" WHERE wiki_revisions.{} IN ({})".format(
            anon_col, ",".join(vars['hash_map']['qls_general'])))
    revisions = selector.query("".join(query_parts))

    vars["logger"].Log(vars, "Counts: Read {} wiki edits from source".format(len(revisions)))

    return [
        {
            'original_id': 'wiki_edit_' + str(position),
            'user_original_id': vars['hash_map']['map_general'][revision[anon_col]],
            'collaboration_parent_original_id': None,
            'resource_original_id': revision['page_id'],
            'collaboration_child_number': None,
            'collaboration_timestamp': datetime.fromtimestamp(revision['timestamp']),
            'collaboration_type_id': edit_type_id,
            'collaboration_content': None,
        }
        for position, revision in enumerate(revisions)
    ]
Beispiel #2
0
def GetTutorials(vars):
    """Return one resource item per row of the source ``lecture_metadata`` table.

    Each video is placed under a section when ``items_sections`` has a
    ``'lecture'`` entry whose ``item_id`` equals the video id; otherwise its
    parent id and child number are left as ``None``.

    Returns:
        list of dicts with the keys ``original_id``, ``resource_name``,
        ``resource_uri``, ``resource_parent_original_id``,
        ``resource_child_number`` and ``resource_type_id``.
    """
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'], s['port'], s['general_db'])

    resource_type_id = moocdb_utils.GetResourceTypeMap(vars)['tutorial']

    src_videos = general_db_selector.query("SELECT * FROM lecture_metadata")
    vars["logger"].Log(vars, "\t\tCounts: Read {} videos from source".format(len(src_videos)))

    items_sections = general_db_selector.query("SELECT * FROM items_sections WHERE item_type='lecture'")
    # item_id -> (section_id, order); looked up with a single hashed dict
    # access per video instead of scanning a list of keys.
    placement_by_item_id = {
        x['item_id']: (x['section_id'], x['order'])
        for x in items_sections if x['item_type'] == 'lecture'
    }

    no_placement = (None, None)
    output_items = []
    for video in src_videos:
        parent_id, child_number = placement_by_item_id.get(video['id'], no_placement)
        output_items.append({
            'original_id': video['id'],
            'resource_name': video['title'],
            'resource_uri': "www.coursera.org/{}/lecture/view?lecture_id={}".format(s['course_url_id'], video['id']),
            'resource_parent_original_id': parent_id,
            'resource_child_number': child_number,
            'resource_type_id': resource_type_id,
        })

    return output_items
def GetContentSections(vars):
    """Return one resource item per row of the source ``sections`` table.

    Sections are read ordered by ``display_order, id``; the 1-based position
    in that ordering becomes ``resource_child_number``. Sections have no
    parent and an empty ``resource_uri``.
    """
    src = vars['source']
    selector = db.Selector(src['host'], src['user'], src['password'],
                           src['port'], src['general_db'])

    section_type_id = moocdb_utils.GetResourceTypeMap(vars)['content_section']
    sections = selector.query(
        "SELECT * FROM sections ORDER BY display_order, id")
    vars["logger"].Log(
        vars,
        "\t\tCounts: Read {} content sections from source".format(len(sections)))

    return [
        {
            'original_id': section['id'],
            'resource_name': section['title'],
            'resource_uri': '',
            'resource_parent_original_id': None,
            'resource_child_number': child_number,
            'resource_type_id': section_type_id,
        }
        for child_number, section in enumerate(sections, 1)
    ]
Beispiel #4
0
def GetWikis(vars):
    """Return one resource item per row of the source ``wiki_pages`` table.

    The resource URI is built from ``vars['source']['course_url_id']`` and the
    page's ``canonical_name``. Wiki pages get no parent and no child number.
    """
    src = vars['source']
    selector = db.Selector(src['host'], src['user'], src['password'],
                           src['port'], src['general_db'])

    wiki_type_id = moocdb_utils.GetResourceTypeMap(vars)['wiki']

    pages = selector.query("SELECT * FROM wiki_pages")
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} wikis from source".format(len(pages)))

    def to_resource(page):
        # Map one wiki_pages row onto the resource dict shape.
        uri = "www.coursera.org/{}/wiki/view?page={}".format(
            src['course_url_id'], page['canonical_name'])
        return {
            'original_id': page['id'],
            'resource_name': page['title'],
            'resource_uri': uri,
            'resource_parent_original_id': None,
            'resource_child_number': None,
            'resource_type_id': wiki_type_id
        }

    return [to_resource(page) for page in pages]
Beispiel #5
0
def GetForums(vars):
    """Return one resource item per row of the source ``forum_forums`` table.

    Forums are read from the forum DB ordered by ``display_order, id``. The
    resource URI is built from ``vars['source']['course_url_id']`` and the
    forum id. Forums get no parent and no child number.
    """
    src = vars['source']
    selector = db.Selector(src['host'], src['user'], src['password'],
                           src['port'], src['forum_db'])

    forum_type_id = moocdb_utils.GetResourceTypeMap(vars)['forum']
    forums = selector.query(
        "SELECT * FROM forum_forums ORDER BY display_order, id")

    vars["logger"].Log(
        vars, "\t\tCounts: Read {} forums from source".format(len(forums)))

    uri_template = "www.coursera.org/{}/forum/list?forum_id={}"
    return [
        {
            'original_id': forum['id'],
            'resource_uri': uri_template.format(src['course_url_id'], forum['id']),
            'resource_name': forum['name'],
            'resource_parent_original_id': None,
            'resource_child_number': None,
            'resource_type_id': forum_type_id,
        }
        for forum in forums
    ]
def GetQuizContent(vars, quiz_id):
    """Load and parse the stored XML definition of one quiz.

    The XML is read from the key/value table (``kvs_course.<course_id>.quiz``
    when ``platform_format`` is ``'coursera_1'``, otherwise
    ``kvs_course.quiz``) under the key ``xml.quiz_id:<quiz_id>``. The stored
    value is expected to wrap the XML in double quotes; everything between
    the first and the last double quote is handed to ``ParseQuizXML``.

    Args:
        vars: pipeline state dict (reads ``source`` and ``logger``).
        quiz_id: id of the quiz, interpolated into the lookup key.

    Returns:
        The value returned by ``ParseQuizXML``, or the empty structure
        ``{'question_groups': [], 'question_dict': {}}`` when no row is
        found, the value holds fewer than two double quotes, or parsing fails.
    """
    src = vars['source']
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])

    if src['platform_format'] == 'coursera_1':
        table_name = "kvs_course.{}.quiz".format(src['course_id'])
    else:
        table_name = "kvs_course.quiz"

    q = "SELECT value FROM `{}` WHERE `key`='xml.quiz_id:{}'".format(
        table_name, quiz_id)
    r = general_db_selector.query(q)

    content = {'question_groups': [], 'question_dict': {}}
    if len(r) > 0 and r[0]['value'].count('"') >= 2:
        raw_value = r[0]['value']
        quiz_xml = raw_value[raw_value.find('"') + 1:raw_value.rindex('"')]
        try:
            content = ParseQuizXML(quiz_xml)
        except Exception:
            # Best effort: a quiz that cannot be parsed is reported and
            # skipped. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit still stop the run.
            vars['logger'].Log(
                vars,
                "\t\t\tFailed to parse quiz XML for quiz {}. Skippping this quiz and its submissions"
                .format(quiz_id))

    return content
Beispiel #7
0
def GetCourseraHashMap(vars, gen_anon):
    """Build the user-id lookup structures from the ``hash_mapping`` table.

    In debug mode (``vars['options']['debug']``) only the first
    ``num_users_debug_mode`` rows of the source ``users`` table are
    considered, selected through their ``gen_anon`` column.

    Args:
        vars: pipeline state dict (reads ``source`` and ``options``).
        gen_anon: name of the anonymised-user column used to restrict
            ``hash_mapping`` in debug mode.

    Returns:
        dict with the keys:
          * ``map_forum``   - forum-side id -> ``user_id``
          * ``map_general`` - general-side anonymised id -> ``user_id``
          * ``list_raw``    - list of ``user_id`` values
          * ``qls_general`` - single-quoted general-side ids
          * ``qls_forum``   - forum-side ids (single-quoted for
            ``'coursera_1'``, plain ``str(user_id)`` otherwise)
        For ``platform_format == 'coursera_1'`` the general/forum columns are
        ``anon_user_id`` / ``forum_user_id``; otherwise they are
        ``session_user_id`` / ``user_id``.
    """
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    hm_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                 s['port'], s['hash_mapping_db'])

    # Parenthesised so the statement parses under both Python 2 and Python 3;
    # the printed output is unchanged.
    print(gen_anon)

    q = "SELECT * FROM hash_mapping"
    if vars['options']['debug']:
        users = general_db_selector.query(
            "SELECT * FROM users limit 0,{}".format(
                vars['options']['num_users_debug_mode']))
        user_id_list_string = "','".join([u[gen_anon] for u in users])
        q += " WHERE {} IN ('{}')".format(gen_anon, user_id_list_string)
    rows = hm_db_selector.query(q)

    # The two platform formats differ only in which columns carry the
    # general/forum ids and in how the forum ids are rendered for SQL.
    if s['platform_format'] == 'coursera_1':
        general_col = 'anon_user_id'
        forum_col = 'forum_user_id'
        qls_forum = ["'{}'".format(row[forum_col]) for row in rows]
    else:
        general_col = 'session_user_id'
        forum_col = 'user_id'
        qls_forum = [str(row[forum_col]) for row in rows]

    hash_map = {
        'map_forum': {row[forum_col]: row['user_id'] for row in rows},
        'map_general': {row[general_col]: row['user_id'] for row in rows},
        'list_raw': [row['user_id'] for row in rows],
        'qls_general': ["'{}'".format(row[general_col]) for row in rows],
        'qls_forum': qls_forum,
    }

    return hash_map
Beispiel #8
0
def GetTests(vars):
    """Return resource items for every quiz and every assignment in the source DB.

    Quizzes come from ``quiz_metadata`` and assignments from
    ``assignment_metadata``; quiz items are emitted first. An item is placed
    under a section when ``items_sections`` has an entry of the matching
    ``item_type`` whose ``item_id`` equals the row id; otherwise its parent id
    and child number are ``None``.

    Returns:
        list of dicts. ``original_id`` is ``quiz_<id>`` or
        ``assignment_<id>``; both kinds share the ``'testing'`` resource type.
    """
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'], s['port'], s['general_db'])

    resource_type_id = moocdb_utils.GetResourceTypeMap(vars)['testing']

    # (metadata table, items_sections item_type, noun used in the log line,
    #  original_id prefix, resource URI template)
    test_kinds = (
        ('quiz_metadata', 'quiz', 'quizzes', 'quiz_',
         "www.coursera.org/{}/quiz/start?quiz_id={}"),
        ('assignment_metadata', 'assignment', 'assignments', 'assignment_',
         "www.coursera.org/{}/assignment/view?assignment_id={}"),
    )

    no_placement = (None, None)
    output_items = []
    for table, item_type, noun, id_prefix, uri_template in test_kinds:
        src_rows = general_db_selector.query("SELECT * FROM {}".format(table))
        vars["logger"].Log(vars, "\t\tCounts: Read {} {} from source".format(len(src_rows), noun))

        items_sections = general_db_selector.query(
            "SELECT * FROM items_sections WHERE item_type='{}'".format(item_type))
        # item_id -> (section_id, order); one hashed lookup per row instead
        # of scanning a list of keys.
        placement_by_item_id = {
            x['item_id']: (x['section_id'], x['order'])
            for x in items_sections if x['item_type'] == item_type
        }

        for row in src_rows:
            row_id = row['id']
            parent_id, child_number = placement_by_item_id.get(row_id, no_placement)
            output_items.append({
                'original_id': id_prefix + str(row_id),
                'resource_name': row['title'],
                'resource_uri': uri_template.format(s['course_url_id'], row_id),
                'resource_parent_original_id': parent_id,
                'resource_child_number': child_number,
                'resource_type_id': resource_type_id,
            })

    return output_items
Beispiel #9
0
def GetSubmissionAndAssessmentData(vars, test):
    """Return the submissions (with their assessments) recorded for one quiz.

    Only tests whose ``original_id`` contains ``'quiz_'`` are handled; any
    other test yields an empty list. Responses come from
    ``GetStudentQuizResponses`` and are walked as user -> question -> list of
    answers; the 1-based position of an answer in that list is its
    attempt number.

    Args:
        vars: pipeline state dict.
        test: dict with at least ``original_id``.

    Returns:
        list of submission dicts. Each carries an ``assessments`` list that
        holds one entry unless the answer's ``grade`` is ``-1``.
    """
    src = vars['source']
    # NOTE(review): neither the selector nor the anon column name is used
    # below; both are still evaluated so side effects stay the same.
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])
    submissions = []
    gen_anon = vars['general_anon_col_name']

    if 'quiz_' not in test['original_id']:
        return submissions

    responses_by_user = GetStudentQuizResponses(
        vars, test['original_id'].replace("quiz_", ""))

    vars["logger"].Log(
        vars, "\t\tCounts: Read {} submissions from source for {}".format(
            len(responses_by_user.keys()), test['original_id']))

    for uid in responses_by_user.keys():
        answers_by_question = responses_by_user[uid]
        for qid in answers_by_question.keys():
            for attempt_number, answer in enumerate(answers_by_question[qid], 1):
                submission = {
                    'user_original_id': uid,
                    'problem_original_id': 'quiz_question_' + qid,
                    'submission_timestamp': answer['submission_time'],
                    'submission_answer': json.dumps([answer['answer']]),
                    'submission_attempt_number': attempt_number,
                    'submission_is_submitted': 1,
                    'assessments': [],
                }
                # A grade of -1 produces no assessment entry.
                if answer['grade'] != -1:
                    assessment = {
                        'grader_original_id': 0,
                        'grade': answer['grade'],
                        'max_grade': 1,
                        'assessment_timestamp': answer['submission_time'],
                    }
                    submission['assessments'].append(assessment)
                submissions.append(submission)

    return submissions
def GetForumVotes(vars):
    """Return one collaboration item per row of ``forum_reputation_record``.

    The records are read from the forum DB, joined to ``hash_mapping`` on the
    column named by ``vars['forum_anon_col_name']``. When
    ``vars['options']['debug']`` is truthy the query is restricted to the ids
    in ``vars['hash_map']['qls_forum']``.

    Returns:
        list of dicts. ``original_id`` is synthesised as ``vote_<n>`` from
        the zero-based row position; the parent id is ``<type>_<pc_id>`` and
        the vote ``direction`` is stored as the collaboration content.
    """
    src = vars['source']
    selector = db.Selector(src['host'], src['user'], src['password'],
                           src['port'], src['forum_db'])

    anon_col = vars['forum_anon_col_name']
    vote_type_id = moocdb_utils.GetCollaborationTypeMap(vars)['forum_vote']

    sql = "SELECT * FROM forum_reputation_record JOIN `{0}`.hash_mapping USING ({1})".format(
        src['hash_mapping_db'], anon_col)
    if vars['options']['debug']:
        # Debug runs only look at the sampled users.
        sql = sql + " WHERE {} IN ({})".format(
            anon_col, ",".join(vars['hash_map']['qls_forum']))
    records = selector.query(sql)

    vars["logger"].Log(
        vars,
        "\t\tCounts: Read {} forum_votes from source".format(len(records)))

    return [
        {
            'original_id': 'vote_' + str(position),
            'user_original_id': vars['hash_map']['map_forum'][record[anon_col]],
            'resource_original_id': None,
            'collaboration_parent_original_id':
                record['type'] + '_' + str(record['pc_id']),
            'collaboration_child_number': None,
            'collaboration_content': record['direction'],
            'collaboration_timestamp': record['timestamp'],
            'collaboration_type_id': vote_type_id,
        }
        for position, record in enumerate(records)
    ]
def InsertObservedEvents(vars, events):
    """Insert the given events into the target ``observed_events`` table.

    Args:
        vars: pipeline state dict (reads ``target`` connection settings).
        events: iterable of dicts; each must contain every column listed in
            ``column_types`` below. Any extra keys are ignored.
    """
    # Column name -> type tag passed to db.StaggeredInsert.
    column_types = {
        'observed_event_type_id': 'num',
        'user_id': 'num',
        'item_id': 'num',
        'observed_event_timestamp': 'datetime',
        'observed_event_data': 'string',
    }

    target = vars['target']
    # NOTE(review): this selector is never used below; it is still
    # constructed so side effects stay the same.
    target_db_selector = db.Selector(target['host'], target['user'],
                                     target['password'], target['port'],
                                     target['db'])
    inserter = db.StaggeredInsert(target['host'], target['user'],
                                  target['password'], target['port'],
                                  target['db'], 'observed_events',
                                  column_types)

    for event in events:
        row = dict((column, event[column]) for column in column_types)
        inserter.addRow(row)

    # Push whatever rows are still pending.
    inserter.insertPendingRows()
def GetQuizMetadata(vars, quiz_id):
    """Return the scheduling metadata of one quiz from ``quiz_metadata``.

    Args:
        vars: pipeline state dict (reads ``source`` connection settings).
        quiz_id: id interpolated into the ``WHERE id=...`` clause.

    Returns:
        dict with ``title``, ``open_time``, ``soft_deadline``,
        ``hard_deadline`` and ``max_submissions`` taken from the first
        matching row, or an empty dict when no row matches.
    """
    src = vars['source']
    selector = db.Selector(src['host'], src['user'], src['password'],
                           src['port'], src['general_db'])

    matches = selector.query(
        "SELECT * FROM quiz_metadata WHERE id={}".format(quiz_id))
    if len(matches) == 0:
        return {}

    row = matches[0]
    # (output field, source column)
    field_to_column = (
        ('title', 'title'),
        ('open_time', 'open_time'),
        ('soft_deadline', 'soft_close_time'),
        ('hard_deadline', 'hard_close_time'),
        ('max_submissions', 'maximum_submissions'),
    )
    return dict((field, row[column]) for field, column in field_to_column)
Beispiel #13
0
def GetTransformedCollaborationEvents(vars):
    """Derive one observed event from every row of the target ``collaborations`` table.

    The observed-event type is the one whose name equals the collaboration
    type's name. The event's ``item_id`` is the collaboration's parent id
    when it has one, otherwise its resource id.

    Returns:
        list of dicts with ``user_id``, ``item_id``,
        ``observed_event_type_id``, ``observed_event_timestamp`` and
        ``observed_event_data`` (always the JSON encoding of an empty dict).

    Raises:
        KeyError: if a collaboration type id is missing from the
            collaboration type map, or its name is missing from the
            observed-event type map.
    """
    t = vars['target']
    target_db_selector = db.Selector(t['host'], t['user'], t['password'],
                                     t['port'], t['db'])

    oe_type_map = moocdb_utils.GetObservedEventTypeMap(vars)
    coll_type_map = moocdb_utils.GetCollaborationTypeMap(vars)
    # Reverse mapping: collaboration type id -> type name.
    coll_type_name_by_id = {
        type_id: name
        for name, type_id in coll_type_map.items()
    }

    events = []

    rows = target_db_selector.query(
        "SELECT * FROM collaborations JOIN collaboration_types ON collaborations.collaboration_type_id=collaboration_types.collaboration_type_id"
    )
    for row in rows:
        # Observed-event types share their names with collaboration types.
        oe_type_name = coll_type_name_by_id[row['collaboration_type_id']]
        oe_type_id = oe_type_map[oe_type_name]

        parent_id = row['collaboration_parent_id']
        item_id = parent_id if parent_id is not None else row['resource_id']

        events.append({
            'user_id': row['user_id'],
            'item_id': item_id,
            'observed_event_type_id': oe_type_id,
            'observed_event_timestamp': row['collaboration_timestamp'],
            'observed_event_data': json.dumps({}),
        })

    return events
def GetVideoEvents(vars, original_item_id):
    """Return one ``tutorial_visit`` event per lecture submission of one video.

    Rows are read from ``lecture_submission_metadata`` for the given item,
    joined to ``hash_mapping`` on the general anonymised-user column. When
    ``vars['options']['debug']`` is truthy only the users in
    ``vars['hash_map']['qls_general']`` are read.

    Args:
        vars: pipeline state dict.
        original_item_id: source id of the lecture, interpolated into the
            ``WHERE item_id=...`` clause.

    Returns:
        list of event dicts with ``item_type`` fixed to ``'tutorials'`` and
        ``observed_event_data`` set to the JSON encoding of an empty dict.
    """
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])

    oe_type_map = moocdb_utils.GetObservedEventTypeMap(vars)
    events = []

    gen_anon = vars['general_anon_col_name']

    q = "SELECT * FROM lecture_submission_metadata JOIN `{0}`.hash_mapping USING ({1}) WHERE item_id={2}".format(
        s['hash_mapping_db'], gen_anon, original_item_id)
    if vars['options']['debug']:
        q += " AND {} IN ({})".format(
            gen_anon, ",".join(vars['hash_map']['qls_general']))
    rows = general_db_selector.query(q)

    for row in rows:
        # 'observed_event_data' appears exactly once here: a repeated key in
        # a dict literal silently keeps only the last value.
        events.append({
            'user_original_id': vars['hash_map']['map_general'][row[gen_anon]],
            'item_type': 'tutorials',
            'item_original_id': original_item_id,
            'observed_event_data': json.dumps({}),
            'observed_event_timestamp': datetime.fromtimestamp(row['submission_time']),
            'observed_event_type_id': oe_type_map['tutorial_visit'],
        })

    return events
Beispiel #15
0
# from ..utilities import db
Beispiel #16
0
def TransformUserData(vars):
    """Populate the target ``users`` table and return the user id mapping.

    Users come from ``vars['queries'].GetUsers(vars)`` and are numbered from
    1 in iteration order. Each user dict is updated in place: the IP is
    converted with ``db.ip_aton``, a missing email becomes ``''``, a missing
    country is looked up from the IP in the core ``ip_country`` table, and
    the timezone offset is derived from the country through the core
    ``timezone`` table.

    Returns:
        dict mapping each user's ``original_id`` to its new MOOCdb user id.
    """
    # DB connections
    # --------------
    c = vars['core']
    core_db_selector = db.Selector(c['host'], c['user'], c['password'],
                                   c['port'], c['db'])

    # Populate the users table
    user_id_map = {}
    users = vars['queries'].GetUsers(vars)

    fields = {
        'user_id': 'num',
        'user_email': 'string',
        'user_type_id': 'num',
        'user_join_timestamp': 'datetime',
        'user_ip': 'ip',
        'user_country': 'string',
        'user_timezone_offset': 'num',
        'user_final_grade': 'num',
    }

    # IP-country lookup table
    ip_country_rows = [{
        'start': int(x['ip_numeric_start']),
        'stop': int(x['ip_numeric_stop']),
        'country_code': x['country_code']
    } for x in core_db_selector.query(
        "SELECT ip_numeric_start,ip_numeric_stop,country_code FROM ip_country ORDER BY ip_numeric_start"
    )]

    t = vars['target']
    user_inserter = db.StaggeredInsert(t['host'], t['user'], t['password'],
                                       t['port'], t['db'], 'users', fields)

    # country_code -> timezone offset (or None). The offset depends only on
    # the country, so each country is queried at most once per run.
    timezone_offset_by_country = {}

    for moocdb_user_id, user in enumerate(users, 1):
        # User MOOCdb ID
        user['user_id'] = moocdb_user_id

        # User IP
        user['user_ip'] = db.ip_aton(user['user_ip'])

        # User email cannot be null
        if user.get('user_email') is None:
            user['user_email'] = ''

        # User country
        if 'user_country' not in user:
            user['user_country'] = None
        # Note: Some platforms don't record IP, but do record country.
        # The IP is compared against the string 'null' as the marker for a
        # missing address.
        if user['user_country'] is None and user['user_ip'] != 'null':
            for ipc_row in ip_country_rows:
                if ipc_row['start'] <= user['user_ip'] <= ipc_row['stop']:
                    user['user_country'] = ipc_row['country_code']
                    break

        # User timezone offset
        # Derived from the country, since some platforms provide incorrect
        # data for user timezone. The value used is the middle element of
        # the country's timezone rows in the order the query returns them.
        # NOTE(review): the query has no ORDER BY, so this is not guaranteed
        # to be a median or a mean - confirm the intended statistic.
        country = user['user_country']
        utzo = None
        if country is not None:
            if country not in timezone_offset_by_country:
                r = core_db_selector.query(
                    "SELECT * FROM timezone WHERE country_code='{}'".format(
                        country))
                offsets = [x['gmt_offset'] for x in r]
                # Floor division keeps the index an int on Python 2 and 3.
                timezone_offset_by_country[country] = (
                    offsets[len(offsets) // 2] if offsets else None)
            utzo = timezone_offset_by_country[country]
        user['user_timezone_offset'] = utzo

        # Fields a platform does not provide are inserted as None.
        user_inserter.addRow({k: user.get(k) for k in fields})

        user_id_map[user['original_id']] = moocdb_user_id

    user_inserter.insertPendingRows()

    vars["logger"].Log(
        vars, "Counts: Inserted {} users to target".format(
            user_inserter.num_inserted_rows))

    return user_id_map
from utilities import db
Beispiel #18
0
def GetProblems(vars):
    """Return problem items for every quiz question and every assignment part.

    Quiz questions are taken from ``GetQuizContent`` and numbered from 0
    across all question groups of a quiz; their timing fields come from
    ``GetQuizMetadata``. Assignment parts are read from
    ``assignment_part_metadata`` and numbered from 0 within each assignment;
    their timing fields come from the ``assignment_metadata`` row.

    Returns:
        list of problem dicts, quiz questions first. Only assignment parts
        carry a ``problem_weight`` key (always ``None``).
    """
    src = vars['source']
    selector = db.Selector(src['host'], src['user'], src['password'],
                           src['port'], src['general_db'])

    type_ids = moocdb_utils.GetProblemTypeMap(vars)
    problems = []

    quiz_rows = selector.query("SELECT * FROM quiz_metadata")
    quiz_ids = [quiz_row['id'] for quiz_row in quiz_rows]

    vars["logger"].Log(
        vars, "\t\tCounts: Read {} quizzes from source".format(len(quiz_ids)))

    # --- Quiz questions ---
    for quiz_id in quiz_ids:
        metadata = GetQuizMetadata(vars, quiz_id)
        parent_id = 'quiz_' + str(quiz_id)
        content = GetQuizContent(vars, quiz_id)

        # Flatten the groups so the index runs across the whole quiz.
        all_questions = (question
                         for group in content['question_groups']
                         for question in group)
        for position, question in enumerate(all_questions):
            if 'choice_type' in question.keys():
                choice_type = question['choice_type']
            else:
                choice_type = None

            if choice_type == 'radio':
                type_name = 'question_mc_single'
            elif choice_type == 'select':
                type_name = 'question_mc_multiple'
            else:
                type_name = 'question_free_text'

            problems.append({
                'original_item_type': 'quiz_question',
                'problem_original_id': 'quiz_question_' + question['id'],
                'problem_name': parent_id + "." + question['id'],
                'problem_type_id': type_ids[type_name],
                'problem_parent_original_id': parent_id,
                'problem_child_number': position,
                'problem_release_timestamp': metadata['open_time'],
                'problem_soft_deadline': metadata['soft_deadline'],
                'problem_hard_deadline': metadata['hard_deadline'],
                'problem_max_submission': metadata['max_submissions'],
                'resource_original_id': parent_id,
            })

    # --- Assignment parts ---
    for assignment in selector.query("SELECT * FROM assignment_metadata"):
        assignment_id = assignment['id']
        parent_id = 'assignment_' + str(assignment_id)

        parts = selector.query(
            "SELECT * FROM assignment_part_metadata WHERE assignment_id={}".
            format(assignment_id))
        for position, part in enumerate(parts):
            part_original_id = "assn_part_" + str(part['id'])
            problems.append({
                'original_item_type': 'assignment_part',
                'problem_original_id': part_original_id,
                'problem_name': part_original_id,
                'problem_type_id': type_ids['assignment_part'],
                'problem_parent_original_id': parent_id,
                'problem_child_number': position,
                'problem_release_timestamp': assignment['open_time'],
                'problem_soft_deadline': assignment['soft_close_time'],
                'problem_hard_deadline': assignment['hard_close_time'],
                'problem_max_submission': assignment['maximum_submissions'],
                'problem_weight': None,
                'resource_original_id': parent_id,
            })

    vars["logger"].Log(
        vars,
        "\t\tCounts: Read {} problems from source".format(len(problems)))

    return problems
Beispiel #19
0
def GetAdditionalCollaborationEvents(vars):
    """Derive forum-visit and post-read events from the forum read-record table.

    Each row of the read-record key/value table has a key of the form
    ``<forum part>.<user id>...`` and a PHP-serialised value. Rows whose
    user part is not an integer, or whose user is not in
    ``vars['hash_map']['list_raw']``, are skipped. A value containing
    ``"_all"`` yields one ``forum_visit`` event for the forum; otherwise
    every entry whose key is a known thread id yields a ``forum_post_read``
    event registered on that thread's first post.

    Returns:
        list of event dicts with ``user_original_id``,
        ``observed_event_type_id``, ``item_original_id``, ``item_type``,
        ``observed_event_timestamp`` and ``observed_event_data``.
    """
    s = vars['source']
    forum_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                    s['port'], s['forum_db'])

    oe_type_map = moocdb_utils.GetObservedEventTypeMap(vars)
    events = []

    # Find the id of the first forum_post of each thread. This is the item_id under which the event will be registered
    q = "SELECT id, thread_id FROM forum_posts WHERE id IN (SELECT min(id) FROM forum_posts GROUP BY thread_id)"
    thread_id_to_first_post_id = {
        x['thread_id']: x['id']
        for x in forum_db_selector.query(q)
    }

    if s['platform_format'] == 'coursera_1':
        table_name = "kvs_course.{}.forum_readrecord".format(s['course_id'])
    else:
        table_name = "kvs_course.forum_readrecord"
    q = "SELECT * FROM `{}`".format(table_name)
    rows = forum_db_selector.query(q)

    # Built once so the per-row membership test is a hash lookup rather than
    # a scan of the whole user list.
    known_user_ids = set(vars['hash_map']['list_raw'])

    for row in rows:
        key_parts = row['key'].split(".")
        try:
            user_original_id = int(key_parts[1])
        except ValueError:
            # Keys whose second component is not a user id are skipped.
            continue

        if user_original_id not in known_user_ids:
            continue

        forum_id = int(key_parts[0].replace('forum_', ''))
        value = phpserialize.loads(row['value'])
        if "_all" in value:
            events.append({
                'user_original_id': user_original_id,
                'observed_event_type_id': oe_type_map['forum_visit'],
                'item_original_id': forum_id,
                'item_type': 'forums',
                'observed_event_timestamp': datetime.fromtimestamp(value["_all"]),
                'observed_event_data': json.dumps({}),
            })
        else:
            for k, read_timestamp in value.items():
                if k in thread_id_to_first_post_id:
                    events.append({
                        'user_original_id': user_original_id,
                        'observed_event_type_id': oe_type_map['forum_post_read'],
                        'item_original_id':
                            'post_' + str(thread_id_to_first_post_id[k]),
                        'item_type': 'forum_posts',
                        'observed_event_timestamp': datetime.fromtimestamp(read_timestamp),
                        'observed_event_data': json.dumps({}),
                    })

    return events
Beispiel #20
0
def GetTestEvents(vars, original_item_id):
    """Build visit and submission observed events for one test item.

    Only items whose original ID contains ``'quiz_'`` are handled; any other
    ID yields an empty list.  Submission metadata is read from
    ``quiz_submission_metadata`` (joined to the hash mapping table), then the
    matching serialized submissions are read from the course kvs quiz table.

    Args:
        vars: Pipeline context dict.  Reads ``vars['source']`` (connection
            settings, ``hash_mapping_db``, ``course_id``,
            ``platform_format``), ``vars['general_anon_col_name']``,
            ``vars['options']['debug']``, ``vars['hash_map']`` and
            ``vars['logger']``.
        original_item_id: Original ID of the test, e.g. ``'quiz_<id>'``.

    Returns:
        A list of event dicts with the keys ``user_original_id``,
        ``item_type``, ``item_original_id``, ``observed_event_type_id``,
        ``observed_event_timestamp`` and ``observed_event_data``.  For each
        user, all ``test_visit`` events come first, then all
        ``test_submission`` events.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    oe_type_map = moocdb_utils.GetObservedEventTypeMap(vars)
    events = []

    if 'quiz_' not in original_item_id:
        return events

    anon_col = vars['general_anon_col_name']

    # Get submission metadata
    q = "SELECT * FROM quiz_submission_metadata JOIN `{0}`.hash_mapping USING ({1}) WHERE item_id={2}".format(
        s['hash_mapping_db'], anon_col, original_item_id.replace('quiz_', ''))
    if vars['options']['debug']:
        q += " AND {} IN ({})".format(
            anon_col, ",".join(vars['hash_map']['qls_general']))
    metadata_rows = general_db_selector.query(q)

    map_general = vars['hash_map']['map_general']
    submission_metadata = {
        row['id']: {'user_original_id': map_general[row[anon_col]]}
        for row in metadata_rows
    }
    if not submission_metadata:
        return events

    # Get submission content from kvs table
    if s['platform_format'] == 'coursera_1':
        table_name = "kvs_course.{}.quiz".format(s['course_id'])
    else:
        table_name = "kvs_course.quiz"
    in_list = [
        "'submission.submission_id:{}'".format(row['id'])
        for row in metadata_rows
    ]

    # The IN clause can get too large for a single statement, so the keys are
    # fetched in slices.  Stepping through the list by slice start never
    # produces an empty slice, so no "IN ()" statement (invalid SQL) is sent
    # when the number of keys is an exact multiple of the chunk size.
    chunk_size = 1000
    rows = []
    for start in range(0, len(in_list), chunk_size):
        query = "SELECT * FROM `{}` WHERE `key` IN ({})".format(
            table_name, ",".join(in_list[start:start + chunk_size]))
        rows += general_db_selector.query(query)

    # Per user: (start_time, saved_time) pairs in the order rows were read.
    user_attempts = {}
    for row in rows:
        try:
            # The stored value is serialized twice, hence the nested loads.
            value = phpserialize.loads(phpserialize.loads(row['value']))
        except Exception:
            vars['logger'].Log(
                vars,
                "\t\t\tFailed to deserialize php-serialized string: {}\n\t\t\tSkipping this record"
                .format(row['value']))
            continue

        # Keys look like 'submission.submission_id:<id>'.
        submission_id = int(row['key'].split(":")[1])
        uoid = submission_metadata[submission_id]['user_original_id']
        user_attempts.setdefault(uoid, []).append(
            (value['start_time'], value['saved_time']))

    for uoid, attempts in user_attempts.items():
        start_times = [started for started, _ in attempts]
        submit_times = [saved for _, saved in attempts]

        # A start that follows one of the user's submissions by less than 120
        # seconds is not reported as a separate visit.
        visit_times = [
            started for started in start_times
            if not any(started - 120 < saved < started
                       for saved in submit_times)
        ]

        for timestamp in visit_times:
            events.append({
                'user_original_id': uoid,
                'item_type': 'tests',
                'item_original_id': original_item_id,
                'observed_event_type_id': oe_type_map['test_visit'],
                'observed_event_timestamp': datetime.fromtimestamp(timestamp),
                'observed_event_data': json.dumps({}),
            })

        for timestamp in submit_times:
            events.append({
                'user_original_id': uoid,
                'item_type': 'tests',
                'item_original_id': original_item_id,
                'observed_event_type_id': oe_type_map['test_submission'],
                'observed_event_timestamp': datetime.fromtimestamp(timestamp),
                'observed_event_data': json.dumps({}),
            })

    return events
def _TransformItemEvents(vars, item_type, label, getter_name):
    """Fetch, ID-map and insert the observed events of every item of one type.

    Args:
        vars: Pipeline context dict.
        item_type: Key into ``vars['id_maps']`` whose original IDs are walked.
        label: Item name used in the progress log message.
        getter_name: Name of the function on ``vars['queries'].observations``
            that is called as ``getter(vars, original_item_id)``.
    """
    # Snapshot the IDs so the walk is independent of later id_map changes.
    original_ids = list(vars['id_maps'][item_type].keys())
    n = len(original_ids)
    for i, original_item_id in enumerate(original_ids, 1):
        vars['logger'].Log(
            vars,
            "\tProcessing events for {} {} out of {}: original ID {}".format(
                label, i, n, original_item_id))
        # Looked up per item, so a missing getter only fails when reached.
        get_events = getattr(vars['queries'].observations, getter_name)
        events = get_events(vars, original_item_id)
        for event in events:
            event['user_id'] = vars['id_maps']['users'][
                event['user_original_id']]
            event['item_id'] = vars['id_maps'][event['item_type']][
                event['item_original_id']]
        InsertObservedEvents(vars, events)


def TransformObservationData(vars):
    """Transform all observed events and insert them into the target.

    Processes, in order: tutorial (video) events, test events, wiki visits,
    index visits, events derived from already transformed collaborations, and
    any additional platform-specific collaboration events.  Each event gets
    ``user_id`` and ``item_id`` resolved through ``vars['id_maps']`` before
    being passed to ``InsertObservedEvents``.

    Args:
        vars: Pipeline context dict.  Reads ``vars['target']``,
            ``vars['id_maps']``, ``vars['queries']``,
            ``vars['common_queries']`` and ``vars['logger']``.
    """
    # DB connections
    # --------------
    t = vars['target']
    # NOTE(review): this selector is never used below.  It is kept because
    # constructing it may have effects not visible here - confirm and remove.
    target_db_selector = db.Selector(t['host'], t['user'], t['password'],
                                     t['port'], t['db'])

    # Videos, tests, wikis and indices share the same per-item procedure.
    _TransformItemEvents(vars, 'tutorials', 'tutorial', 'GetVideoEvents')
    _TransformItemEvents(vars, 'tests', 'test', 'GetTestEvents')
    _TransformItemEvents(vars, 'wikis', 'wiki', 'GetWikiVisits')
    _TransformItemEvents(vars, 'indices', 'index', 'GetIndexVisits')

    # Collaborations
    # ---------------
    # Events from what already exists in the collaborations table (so we don't have to redo it for the platforms separately)
    vars['logger'].Log(vars, "\tProcessing collaboration events")
    events = vars['common_queries'].GetTransformedCollaborationEvents(vars)
    InsertObservedEvents(vars, events)

    # If a platform has additional collaboration data aside from what was fetched above
    vars['logger'].Log(
        vars, "\tProcessing collaboration events not transformed previously")
    events = vars['queries'].observations.GetAdditionalCollaborationEvents(
        vars)
    for event in events:
        event['user_id'] = vars['id_maps']['users'][event['user_original_id']]
        # Items that were never transformed get the placeholder ID -1.
        event['item_id'] = vars['id_maps'][event['item_type']].get(
            event['item_original_id'], -1)

    InsertObservedEvents(vars, events)
Beispiel #22
0
def GetUsers(vars):
    """Collect one user record per entry of ``vars['hash_map']['list_raw']``.

    Every user starts as a 'Student' with no IP, country, grade or join
    timestamp.  Rows of the source ``users`` table (joined to the hash mapping
    table) then fill in the IP, registration time, final grade and user type.
    Final grades are ``normal_grade`` values from ``course_grades`` divided by
    the largest ``normal_grade`` read.

    Args:
        vars: Pipeline context dict.  Reads ``vars['source']``,
            ``vars['general_anon_col_name']``, ``vars['options']['debug']``
            and ``vars['hash_map']``.

    Returns:
        A list of user dicts sorted by ``original_id``, each with the keys
        ``original_id``, ``user_ip``, ``user_country``, ``user_final_grade``,
        ``user_join_timestamp`` and ``user_type_id``.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])

    user_type_map = moocdb_utils.GetUserTypeMap(vars)
    anon_col = vars['general_anon_col_name']
    map_general = vars['hash_map']['map_general']

    # Stuff for mapping coursera access group to moocdb user type
    access_group_id_to_name = {
        x['id']: x['name']
        for x in general_db_selector.query("SELECT * FROM access_groups")
    }
    access_group_name_to_user_type_name = {
        'Student': 'Student',
        'Administrator': 'Administrator',
        'Instructor': 'Instructor',
        'Teaching Staff': 'Teaching Staff',
        'Blocked': 'Blocked',
        'Student Access': 'Student Access',
        'External Viewer': 'Student Access',
        'Community TA': 'Community TA',
        'School Administrator': 'School Administrator',
        'Data Coordinator': 'School Administrator',
        'Coursera Tech Support': 'Administrator',
        'Student (Forum Banned)': 'Student (Forum Banned)',
    }

    # Fetch the users data
    # ---------------------
    q = "SELECT {} AS uid, normal_grade FROM course_grades".format(anon_col)
    if vars['options']['debug']:
        q += " WHERE {} IN ({})".format(
            anon_col, ",".join(vars['hash_map']['qls_general']))

    rows = general_db_selector.query(q)
    max_grade = max([row['normal_grade']
                     for row in rows]) if len(rows) > 0 else 1
    if max_grade == 0:
        # All grades are zero: dividing by 1 keeps them at zero instead of
        # raising ZeroDivisionError.
        max_grade = 1
    # Membership is tested on the mapping itself; ``.keys()`` would build and
    # scan a list for every row on Python 2.
    course_grade_dict = {
        map_general[row['uid']]: 1.0 * row['normal_grade'] / max_grade
        for row in rows
        if row['uid'] in map_general
    }

    user_items = {
        x: {
            'original_id': x,
            'user_ip': None,
            'user_country': None,
            'user_final_grade': None,
            'user_join_timestamp': None,
            'user_type_id': user_type_map['Student']
        }
        for x in vars['hash_map']['list_raw']
    }

    # The join below is to ensure that we only fetch users who have corresponding hash_mapping entries
    q = "SELECT * FROM users JOIN `{0}`.hash_mapping USING (`{1}`)".format(
        s['hash_mapping_db'], anon_col)
    if vars['options']['debug']:
        q += " WHERE users.{} IN ({})".format(
            anon_col, ",".join(vars['hash_map']['qls_general']))
    user_metadata_rows = general_db_selector.query(q)

    for row in user_metadata_rows:
        user_id = map_general[row[anon_col]]
        item = user_items[user_id]

        # Row objects come from the DB layer; their key list is consulted
        # explicitly because the column may be absent from the users table.
        item['user_ip'] = row[
            'last_access_ip'] if 'last_access_ip' in row.keys() else None
        item['user_join_timestamp'] = row['registration_time']
        item['user_final_grade'] = course_grade_dict.get(user_id)

        group_name = access_group_id_to_name[row['access_group_id']]
        if group_name not in access_group_name_to_user_type_name:
            # Unknown access groups are treated like 'External Viewer'.
            group_name = 'External Viewer'
        user_type_name = access_group_name_to_user_type_name[group_name]
        item['user_type_id'] = user_type_map[user_type_name]

    output_items = sorted(user_items.values(), key=lambda x: x['original_id'])

    return output_items
def GetForumPosts(vars):
    """Read forum posts and comments and convert them to collaboration items.

    The first post row encountered for a thread becomes a 'forum_post' with
    no parent.  Every later post of that thread, and every row of
    ``forum_comments``, becomes a 'forum_comment' whose parent is
    ``'post_<id>'``.  Posts whose thread was not returned by the threads
    query are skipped.

    Args:
        vars: Pipeline context dict.  Reads ``vars['source']``,
            ``vars['forum_anon_col_name']``, ``vars['options']['debug']``,
            ``vars['hash_map']`` and ``vars['logger']``.

    Returns:
        A list of collaboration dicts (posts first, then comments) with the
        keys ``original_id``, ``resource_original_id``,
        ``collaboration_type_id``, ``collaboration_parent_original_id``,
        ``collaboration_child_number``, ``collaboration_timestamp``,
        ``collaboration_content`` and ``user_original_id``.
    """
    # DB connections
    # --------------
    s = vars['source']
    forum_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                    s['port'], s['forum_db'])

    output_items = []
    collaboration_types = moocdb_utils.GetCollaborationTypeMap(vars)
    post_ctid = collaboration_types['forum_post']
    comment_ctid = collaboration_types['forum_comment']

    anon_col = vars['forum_anon_col_name']
    hash_mapping_db = s['hash_mapping_db']
    map_forum = vars['hash_map']['map_forum']
    debug = vars['options']['debug']
    # In debug mode all three queries are limited to the same user list.
    in_list = ",".join(vars['hash_map']['qls_forum']) if debug else None

    q_threads = "SELECT * FROM forum_threads JOIN `{0}`.hash_mapping USING ({1})".format(
        hash_mapping_db, anon_col)
    if debug:
        q_threads += " WHERE {} IN ({})".format(anon_col, in_list)
    forum_threads_rows = forum_db_selector.query(q_threads)

    vars["logger"].Log(
        vars, "\t\tCounts: Read {} forum posts from source".format(
            len(forum_threads_rows)))

    q_posts = "SELECT forum_posts.id AS post_id, forum_threads.id AS thread_id, forum_forums.id AS forum_id, forum_posts.{0} AS poster_id, post_text, post_time FROM forum_posts JOIN `{1}`.hash_mapping USING ({0}) JOIN forum_threads ON forum_posts.thread_id=forum_threads.id JOIN forum_forums ON forum_threads.forum_id=forum_forums.id".format(
        anon_col, hash_mapping_db)
    if debug:
        q_posts += " WHERE forum_posts.{} IN ({})".format(anon_col, in_list)
    forum_posts_rows = forum_db_selector.query(q_posts)

    thread_forum_ids = {x['id']: x['forum_id'] for x in forum_threads_rows}

    # Counters hold the last child number handed out per parent.  Membership
    # tests below use the dicts directly; ``.keys()`` would rescan a list for
    # every row on Python 2.
    thread_first_post_id = {}
    forum_num_posts = {}
    posts_num_comments = {}
    # NOTE(review): which post counts as a thread's root depends on the row
    # order of the query, which has no ORDER BY - confirm this is reliable.
    for p in forum_posts_rows:
        thread_id = p['thread_id']
        if thread_id not in thread_forum_ids:
            continue

        is_root_post = thread_id not in thread_first_post_id
        if is_root_post:
            thread_first_post_id[thread_id] = p['post_id']
            parent_id = None
            # NOTE(review): every root post has parent_id None, so root posts
            # are numbered in one sequence across all forums - confirm that
            # per-forum numbering was not intended.
            counters = forum_num_posts
        else:
            parent_id = "post_" + str(thread_first_post_id[thread_id])
            counters = posts_num_comments

        x = {
            'original_id': 'post_' + str(p['post_id']),
            'resource_original_id': p['forum_id'],
            'collaboration_type_id':
            post_ctid if is_root_post else comment_ctid,
            'collaboration_parent_original_id': parent_id,
            'user_original_id': map_forum[p['poster_id']],
            'collaboration_content': p['post_text'],
            'collaboration_timestamp': datetime.fromtimestamp(p['post_time']),
        }

        child_number = counters.get(parent_id, 0) + 1
        counters[parent_id] = child_number
        x['collaboration_child_number'] = child_number

        output_items.append(x)

    q_comments = "SELECT forum_comments.id AS comment_id, forum_comments.post_id AS post_id, forum_forums.id AS forum_id, forum_comments.{0} AS poster_id, comment_text, forum_comments.post_time AS post_time FROM forum_comments JOIN `{1}`.hash_mapping USING ({0}) JOIN forum_posts ON forum_comments.post_id=forum_posts.id JOIN forum_threads ON forum_posts.thread_id=forum_threads.id JOIN forum_forums ON forum_threads.forum_id=forum_forums.id".format(
        anon_col, hash_mapping_db)
    if debug:
        q_comments += " WHERE forum_comments.{} IN ({})".format(
            anon_col, in_list)
    forum_comments_rows = forum_db_selector.query(q_comments)

    # Non-root posts are counted as comments, hence posts minus threads.
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} forum_comments from source".format(
            len(forum_posts_rows) + len(forum_comments_rows) -
            len(forum_threads_rows)))

    for c in forum_comments_rows:
        parent_id = "post_" + str(c['post_id'])

        # Numbering continues after any reply posts already counted for the
        # same parent.
        child_number = posts_num_comments.get(parent_id, 0) + 1
        posts_num_comments[parent_id] = child_number

        output_items.append({
            'original_id': 'comment_' + str(c['comment_id']),
            'resource_original_id': c['forum_id'],
            'collaboration_type_id': comment_ctid,
            'collaboration_parent_original_id': parent_id,
            'collaboration_child_number': child_number,
            'collaboration_timestamp': datetime.fromtimestamp(c['post_time']),
            'collaboration_content': c['comment_text'],
            'user_original_id': map_forum[c['poster_id']],
        })

    return output_items
def GetStudentQuizResponses(vars, quiz_original_id):
    """Collect each student's answers to the questions of one quiz.

    For every row of ``quiz_submission_metadata`` for the quiz, the serialized
    submission is fetched from the course kvs quiz table and decoded.  An
    answer is recorded only when it differs from the previous answer recorded
    for the same user and question.

    Args:
        vars: Pipeline context dict.  Reads ``vars['source']``,
            ``vars['general_anon_col_name']``, ``vars['options']['debug']``,
            ``vars['hash_map']`` and ``vars['logger']``.
        quiz_original_id: Original quiz ID of the form ``'quiz_<id>'``.

    Returns:
        ``{user_id: {question_id: [entry, ...]}}`` where each entry is a dict
        with ``submission_time``, ``answer`` and ``grade``.  ``grade`` is -1
        when it could not be determined.  An empty dict is returned when the
        metadata query yields ``None``.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])

    if s['platform_format'] == 'coursera_1':
        table_name = "kvs_course.{}.quiz".format(s['course_id'])
    else:
        table_name = "kvs_course.quiz"
    output = {}

    quiz_id = quiz_original_id.replace("quiz_", "")
    quiz_content = GetQuizContent(vars, quiz_id)
    quiz_question_dict = quiz_content['question_dict']
    anon_col = vars['general_anon_col_name']

    q = "SELECT * FROM quiz_submission_metadata JOIN `{0}`.hash_mapping USING ({1}) WHERE item_id={2}".format(
        s['hash_mapping_db'], anon_col, quiz_id)
    if vars['options']['debug']:
        q += " AND {} IN ({})".format(
            anon_col, ",".join(vars['hash_map']['qls_general']))
    r = general_db_selector.query(q)

    vars["logger"].Log(
        vars, "\t\tCounts: Read {} quiz responses from source for {}".format(
            0 if r is None else len(r), quiz_original_id))

    if r is None:
        return {}

    for sub1 in r:
        anon_user_id = sub1[anon_col]
        try:
            user_id = vars['hash_map']['map_general'][anon_user_id]
        except KeyError:
            vars["logger"].Log(
                vars,
                "\t\t\tSubmission {} skipped: anon_user_id '{}' not found in hash_mapping"
                .format(sub1['id'], anon_user_id))
            continue

        sub2 = general_db_selector.query(
            "SELECT * FROM `{}` WHERE `key`='submission.submission_id:{}'".
            format(table_name, sub1['id']))
        if not sub2:
            vars["logger"].Log(
                vars,
                "\t\t\tSubmission {} skipped: Not found in kvs table".format(
                    sub1['id']))
            continue

        sub2 = sub2[0]

        # The user gets an entry even if the submission fails to decode below.
        if user_id not in output:
            output[user_id] = {}
        serialized = unicode(sub2['value'], errors='ignore')
        try:
            # The stored value is serialized twice, hence the nested loads.
            value = phpserialize.loads(phpserialize.loads(serialized))
        except Exception:
            vars['logger'].Log(
                vars,
                "\t\t\tFailed to load php-serialized string: {}".format(
                    serialized))
            continue

        for qid, question_answer in value['answers'].items():
            grade = -1  # We don't know the grade
            if qid not in quiz_question_dict:
                vars['logger'].Log(
                    vars,
                    "\t\t\tA question id found in a student response does not exist in the quiz XML! Question ID in student response: {}, Question IDs from XML: {}"
                    .format(qid, list(quiz_question_dict.keys())))
                continue

            question = quiz_question_dict[qid]
            if "choice_type" in question:
                # Choice answers are reduced to the list of selected values.
                question_answer = list(question_answer.values())
                if question["choice_type"] in ['select', 'radio']:
                    if len(question_answer) == 0:
                        grade = 0
                        question_answer = None
                    elif len(question_answer) == 1:
                        question_answer = question_answer[0]
                        selected_score_dict = {}
                        for option_group in question['options']:
                            for option in option_group:
                                selected_score_dict[
                                    option['id']] = option['selected_score']
                        if question_answer in selected_score_dict:
                            grade = selected_score_dict[question_answer]
                    else:
                        vars['logger'].Log(
                            vars,
                            "\t\t\t\tRadio choice_type question, but student response contains multiple selections. Question: {}, Answer: {}"
                            .format(qid, question_answer))
                # Any other choice_type (e.g. 'checkbox') keeps the list of
                # selected values and an unknown grade.

            elif (isinstance(question_answer, dict)
                  and len(question_answer) == 1
                  and 'answer' in question_answer):
                question_answer = question_answer['answer']
            else:
                vars['logger'].Log(
                    vars,
                    "\t\t\tUnexpected answer format for question {}:".format(
                        qid))

            # Only record an answer that differs from the last recorded one.
            recorded = output[user_id].setdefault(qid, [])
            if not recorded or recorded[-1]['answer'] != question_answer:
                recorded.append({
                    'submission_time': value.get('saved_time'),
                    'answer': question_answer,
                    'grade': grade
                })

    return output