def GetWikiEdits(vars):
    """Fetch wiki revisions from the source DB and return them as MOOCdb collaboration items."""
    # DB connections
    # --------------
    src = vars['source']
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])

    wiki_edit_ctid = moocdb_utils.GetCollaborationTypeMap(vars)['wiki_edit']
    gen_anon = vars['general_anon_col_name']

    # Join against hash_mapping so only users with a mapping entry come back.
    q = "SELECT * FROM wiki_revisions JOIN `{0}`.hash_mapping USING ({1})".format(
        vars['source']['hash_mapping_db'], gen_anon)
    if vars['options']['debug']:
        q += " WHERE wiki_revisions.{} IN ({})".format(
            gen_anon, ",".join(vars['hash_map']['qls_general']))
    wiki_revisions = general_db_selector.query(q)
    vars["logger"].Log(vars, "Counts: Read {} wiki edits from source".format(len(wiki_revisions)))

    items = []
    for xi, rev in enumerate(wiki_revisions):
        items.append({
            'original_id': 'wiki_edit_' + str(xi),
            'user_original_id': vars['hash_map']['map_general'][rev[gen_anon]],
            'collaboration_parent_original_id': None,
            'resource_original_id': rev['page_id'],
            'collaboration_child_number': None,
            'collaboration_timestamp': datetime.fromtimestamp(rev['timestamp']),
            'collaboration_type_id': wiki_edit_ctid,
            'collaboration_content': None,
        })
    return items
def GetTutorials(vars):
    """Return lecture videos from the source DB as MOOCdb 'tutorial' resource items."""
    # DB connections
    # --------------
    src = vars['source']
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])

    resource_type_id = moocdb_utils.GetResourceTypeMap(vars)['tutorial']

    src_videos = general_db_selector.query("SELECT * FROM lecture_metadata")
    vars["logger"].Log(vars, "\t\tCounts: Read {} videos from source".format(len(src_videos)))

    # Map lecture item id -> (parent section id, order within the section).
    items_sections = general_db_selector.query("SELECT * FROM items_sections WHERE item_type='lecture'")
    items_sections_lookup = {
        x['item_id']: {'resource_parent_id': x['section_id'],
                       'resource_child_number': x['order']}
        for x in items_sections if x['item_type'] == 'lecture'
    }

    output_items = []
    for video in src_videos:
        item = {
            'original_id': video['id'],
            'resource_name': video['title'],
            'resource_uri': "www.coursera.org/{}/lecture/view?lecture_id={}".format(
                vars['source']['course_url_id'], video['id']),
            'resource_parent_original_id': None,
            'resource_child_number': None,
            'resource_type_id': resource_type_id,
        }
        placement = items_sections_lookup.get(video['id'])
        if placement is not None:
            item['resource_parent_original_id'] = placement['resource_parent_id']
            item['resource_child_number'] = placement['resource_child_number']
        output_items.append(item)
    return output_items
def GetContentSections(vars):
    """Return course sections, in display order, as MOOCdb 'content_section' resource items."""
    # DB connections
    # --------------
    src = vars['source']
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])
    resource_type_id = moocdb_utils.GetResourceTypeMap(vars)['content_section']

    src_sections = general_db_selector.query(
        "SELECT * FROM sections ORDER BY display_order, id")
    vars["logger"].Log(
        vars,
        "\t\tCounts: Read {} content sections from source".format(len(src_sections)))

    # Child number is the 1-based position in display order.
    return [{
        'original_id': section['id'],
        'resource_name': section['title'],
        'resource_uri': '',
        'resource_parent_original_id': None,
        'resource_child_number': section_index,
        'resource_type_id': resource_type_id,
    } for section_index, section in enumerate(src_sections, 1)]
def GetWikis(vars):
    """Return wiki pages from the source DB as MOOCdb 'wiki' resource items."""
    # DB connections
    # --------------
    src = vars['source']
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])
    resource_type_id = moocdb_utils.GetResourceTypeMap(vars)['wiki']

    src_items = general_db_selector.query("SELECT * FROM wiki_pages")
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} wikis from source".format(len(src_items)))

    course_url_id = vars['source']['course_url_id']
    return [{
        'original_id': page['id'],
        'resource_name': page['title'],
        'resource_uri': "www.coursera.org/{}/wiki/view?page={}".format(
            course_url_id, page['canonical_name']),
        'resource_parent_original_id': None,
        'resource_child_number': None,
        'resource_type_id': resource_type_id
    } for page in src_items]
def GetForums(vars):
    """Return discussion forums, in display order, as MOOCdb 'forum' resource items."""
    # DB connections
    # --------------
    src = vars['source']
    forum_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                    src['port'], src['forum_db'])
    resource_type_id = moocdb_utils.GetResourceTypeMap(vars)['forum']

    src_forums = forum_db_selector.query(
        "SELECT * FROM forum_forums ORDER BY display_order, id")
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} forums from source".format(len(src_forums)))

    course_url_id = vars['source']['course_url_id']
    return [{
        'original_id': forum['id'],
        'resource_uri': "www.coursera.org/{}/forum/list?forum_id={}".format(
            course_url_id, forum['id']),
        'resource_name': forum['name'],
        'resource_parent_original_id': None,
        'resource_child_number': None,
        'resource_type_id': resource_type_id,
    } for forum in src_forums]
def GetQuizContent(vars, quiz_id):
    """Fetch and parse the XML content of one quiz from the kvs_course key-value table.

    Returns a dict with 'question_groups' and 'question_dict' keys; both are
    empty when the quiz row is missing or its XML fails to parse.

    Fixes vs. original: the local `s` was reused for two unrelated values
    (source config, then XML string); the bare `except:` swallowed
    SystemExit/KeyboardInterrupt; "Skippping" typo in the log message.
    """
    # DB connections
    # --------------
    src = vars['source']
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])

    # coursera_1 embeds the course id in the per-course kvs table name.
    if vars['source']['platform_format'] == 'coursera_1':
        table_name = "kvs_course.{}.quiz".format(vars['source']['course_id'])
    else:
        table_name = "kvs_course.quiz"

    q = "SELECT value FROM `{}` WHERE `key`='xml.quiz_id:{}'".format(table_name, quiz_id)
    r = general_db_selector.query(q)

    content = {'question_groups': [], 'question_dict': {}}
    quiz_xml = "<quiz></quiz>"
    # The stored value wraps the quiz XML in double quotes; keep only the text
    # between the first and last quote.
    if len(r) > 0 and r[0]['value'].count('"') >= 2:
        a = r[0]['value'].find('"') + 1
        b = r[0]['value'].rindex('"')
        quiz_xml = r[0]['value'][a:b]
    try:
        content = ParseQuizXML(quiz_xml)
    except Exception:  # narrowed from a bare except
        vars['logger'].Log(
            vars,
            "\t\t\tFailed to parse quiz XML for quiz {}. Skipping this quiz and its submissions"
            .format(quiz_id))
    return content
def GetCourseraHashMap(vars, gen_anon):
    """Build user-id lookup maps from the hash_mapping table.

    Returns a dict with:
      map_forum   -- forum anon id -> canonical user id
      map_general -- general anon id -> canonical user id
      list_raw    -- canonical user ids
      qls_general -- quoted general anon ids, ready for SQL IN (...) lists
      qls_forum   -- quoted/stringified forum ids, ready for SQL IN (...) lists

    Fixes vs. original: removed a leftover debug `print gen_anon`, removed a
    commented-out hard-coded user query, and renamed the result so it no
    longer shadows the builtin `map`.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    hm_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                 s['port'], s['hash_mapping_db'])

    q = "SELECT * FROM hash_mapping"
    if vars['options']['debug']:
        # Debug mode: restrict the mapping to the first N users.
        users = general_db_selector.query(
            "SELECT * FROM users limit 0,{}".format(
                vars['options']['num_users_debug_mode']))
        user_id_list = [u[gen_anon] for u in users]
        user_id_list_string = "','".join(user_id_list)
        q += " WHERE {} IN ('{}')".format(gen_anon, user_id_list_string)
    rows = hm_db_selector.query(q)

    if vars['source']['platform_format'] == 'coursera_1':
        # coursera_1 keeps distinct anon ids for the forum and general DBs.
        hash_map = {
            'map_forum': {row['forum_user_id']: row['user_id'] for row in rows},
            'map_general': {row['anon_user_id']: row['user_id'] for row in rows},
            'list_raw': [row['user_id'] for row in rows],
            'qls_general': ["'{}'".format(row['anon_user_id']) for row in rows],
            'qls_forum': ["'{}'".format(row['forum_user_id']) for row in rows],
        }
    else:
        # Later formats use the raw user id directly on the forum side.
        hash_map = {
            'map_forum': {row['user_id']: row['user_id'] for row in rows},
            'map_general': {row['session_user_id']: row['user_id'] for row in rows},
            'list_raw': [row['user_id'] for row in rows],
            'qls_general': ["'{}'".format(row['session_user_id']) for row in rows],
            'qls_forum': [str(row['user_id']) for row in rows],
        }
    return hash_map
def _GetSectionLookup(general_db_selector, item_type):
    """Map item id -> parent section id and in-section order for one items_sections type."""
    rows = general_db_selector.query(
        "SELECT * FROM items_sections WHERE item_type='{}'".format(item_type))
    return {
        x['item_id']: {'resource_parent_id': x['section_id'],
                       'resource_child_number': x['order']}
        for x in rows if x['item_type'] == item_type
    }


def GetTests(vars):
    """Return quizzes and assignments as MOOCdb 'testing' resource items.

    Fixes vs. original: the section-placement lookup construction was
    duplicated for quizzes and assignments; extracted to _GetSectionLookup.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    output_items = []
    resource_type_id = moocdb_utils.GetResourceTypeMap(vars)['testing']
    course_url_id = vars['source']['course_url_id']

    # Quizzes
    src_quizzes = general_db_selector.query("SELECT * FROM quiz_metadata")
    vars["logger"].Log(vars, "\t\tCounts: Read {} quizzes from source".format(len(src_quizzes)))
    lookup = _GetSectionLookup(general_db_selector, 'quiz')
    for quiz in src_quizzes:
        item = {
            'original_id': 'quiz_' + str(quiz['id']),
            'resource_name': quiz['title'],
            'resource_uri': "www.coursera.org/{}/quiz/start?quiz_id={}".format(
                course_url_id, quiz['id']),
            'resource_parent_original_id': None,
            'resource_child_number': None,
            'resource_type_id': resource_type_id,
        }
        placement = lookup.get(quiz['id'])
        if placement is not None:
            item['resource_parent_original_id'] = placement['resource_parent_id']
            item['resource_child_number'] = placement['resource_child_number']
        output_items.append(item)

    # Assignments
    src_assignments = general_db_selector.query("SELECT * FROM assignment_metadata")
    vars["logger"].Log(vars, "\t\tCounts: Read {} assignments from source".format(len(src_assignments)))
    lookup = _GetSectionLookup(general_db_selector, 'assignment')
    for assn in src_assignments:
        assn_id = assn['id']
        item = {
            'original_id': 'assignment_' + str(assn_id),
            'resource_name': assn['title'],
            'resource_uri': "www.coursera.org/{}/assignment/view?assignment_id={}".format(
                course_url_id, assn_id),
            'resource_parent_original_id': None,
            'resource_child_number': None,
            'resource_type_id': resource_type_id,
        }
        placement = lookup.get(assn_id)
        if placement is not None:
            item['resource_parent_original_id'] = placement['resource_parent_id']
            item['resource_child_number'] = placement['resource_child_number']
        output_items.append(item)
    return output_items
def GetSubmissionAndAssessmentData(vars, test):
    """Return submission records (with nested assessments) for one test item.

    Only quiz tests ('quiz_<id>') are handled; other test ids yield [].
    """
    # DB connections
    # --------------
    src = vars['source']
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])
    output_items = []
    gen_anon = vars['general_anon_col_name']

    if 'quiz_' in test['original_id']:
        stud_quiz_submissions = GetStudentQuizResponses(
            vars, test['original_id'].replace("quiz_", ""))
        vars["logger"].Log(
            vars, "\t\tCounts: Read {} submissions from source for {}".format(
                len(stud_quiz_submissions.keys()), test['original_id']))
        for uid, questions in stud_quiz_submissions.items():
            for qid, answers in questions.items():
                for attempt_index, answer in enumerate(answers):
                    submission = {
                        'user_original_id': uid,
                        'problem_original_id': 'quiz_question_' + qid,
                        'submission_timestamp': answer['submission_time'],
                        'submission_answer': json.dumps([answer['answer']]),
                        'submission_attempt_number': attempt_index + 1,
                        'submission_is_submitted': 1,
                        'assessments': [],
                    }
                    # A grade of -1 produces no assessment record.
                    if answer['grade'] != -1:
                        submission['assessments'].append({
                            'grader_original_id': 0,
                            'grade': answer['grade'],
                            'max_grade': 1,
                            'assessment_timestamp': answer['submission_time'],
                        })
                    output_items.append(submission)
    return output_items
def GetForumVotes(vars):
    """Return forum up/down votes as MOOCdb collaboration items."""
    # DB connections
    # --------------
    src = vars['source']
    forum_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                    src['port'], src['forum_db'])
    forum_anon = vars['forum_anon_col_name']
    forum_vote_ctid = moocdb_utils.GetCollaborationTypeMap(vars)['forum_vote']

    q = "SELECT * FROM forum_reputation_record JOIN `{0}`.hash_mapping USING ({1})".format(
        vars['source']['hash_mapping_db'], forum_anon)
    if vars['options']['debug']:
        q += " WHERE {} IN ({})".format(
            forum_anon, ",".join(vars['hash_map']['qls_forum']))
    src_forum_voting_records = forum_db_selector.query(q)
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} forum_votes from source".format(
            len(src_forum_voting_records)))

    output_items = []
    for vote_index, vote in enumerate(src_forum_voting_records):
        output_items.append({
            'original_id': 'vote_' + str(vote_index),
            'user_original_id': vars['hash_map']['map_forum'][vote[forum_anon]],
            'resource_original_id': None,
            # A vote hangs off the post/comment it was cast on.
            'collaboration_parent_original_id': vote['type'] + '_' + str(vote['pc_id']),
            'collaboration_child_number': None,
            'collaboration_content': vote['direction'],
            'collaboration_timestamp': vote['timestamp'],
            'collaboration_type_id': forum_vote_ctid,
        })
    return output_items
def InsertObservedEvents(vars, events):
    """Bulk-insert observed-event rows into the target observed_events table.

    Fix vs. original: a db.Selector was constructed here and never used,
    opening a pointless extra DB connection on every call; removed.
    """
    # Target column name -> StaggeredInsert type tag.
    fields = {
        'observed_event_type_id': 'num',
        'user_id': 'num',
        'item_id': 'num',
        'observed_event_timestamp': 'datetime',
        'observed_event_data': 'string',
    }
    t = vars['target']
    oe_inserter = db.StaggeredInsert(t['host'], t['user'], t['password'],
                                     t['port'], t['db'], 'observed_events', fields)
    for event in events:
        oe_inserter.addRow({k: event[k] for k in fields})
    oe_inserter.insertPendingRows()
def GetQuizMetadata(vars, quiz_id):
    """Return title/deadline/submission-limit metadata for one quiz, or {} if absent."""
    # DB connections
    # --------------
    src = vars['source']
    general_db_selector = db.Selector(src['host'], src['user'], src['password'],
                                      src['port'], src['general_db'])

    rows = general_db_selector.query(
        "SELECT * FROM quiz_metadata WHERE id={}".format(quiz_id))
    if len(rows) == 0:
        return {}
    row = rows[0]
    return {
        'title': row['title'],
        'open_time': row['open_time'],
        'soft_deadline': row['soft_close_time'],
        'hard_deadline': row['hard_close_time'],
        'max_submissions': row['maximum_submissions']
    }
def GetTransformedCollaborationEvents(vars):
    """Re-express rows already in the target collaborations table as observed events."""
    # DB connections
    # --------------
    t = vars['target']
    target_db_selector = db.Selector(t['host'], t['user'], t['password'],
                                     t['port'], t['db'])
    oe_type_map = moocdb_utils.GetObservedEventTypeMap(vars)
    coll_type_map = moocdb_utils.GetCollaborationTypeMap(vars)
    # Invert name -> id so each row's type id can be turned back into a name.
    coll_type_id_to_name = {v: k for k, v in coll_type_map.items()}

    rows = target_db_selector.query(
        "SELECT * FROM collaborations JOIN collaboration_types ON collaborations.collaboration_type_id=collaboration_types.collaboration_type_id"
    )
    events = []
    for row in rows:
        # Collaboration type names double as observed-event type names.
        oe_type_id = oe_type_map[coll_type_id_to_name[row['collaboration_type_id']]]
        if row['collaboration_parent_id'] is not None:
            item_id = row['collaboration_parent_id']
        else:
            item_id = row['resource_id']
        events.append({
            'user_id': row['user_id'],
            'item_id': item_id,
            'observed_event_type_id': oe_type_id,
            'observed_event_timestamp': row['collaboration_timestamp'],
            'observed_event_data': json.dumps({}),
        })
    return events
def GetVideoEvents(vars, original_item_id):
    """Return 'tutorial_visit' observed events for one lecture video.

    Fix vs. original: the event literal listed 'observed_event_data' twice;
    the duplicate key is removed (same value both times, so output is unchanged).
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    oe_type_map = moocdb_utils.GetObservedEventTypeMap(vars)
    events = []
    gen_anon = vars['general_anon_col_name']

    q = "SELECT * FROM lecture_submission_metadata JOIN `{0}`.hash_mapping USING ({1}) WHERE item_id={2}".format(
        vars['source']['hash_mapping_db'], gen_anon, original_item_id)
    if vars['options']['debug']:
        q += " AND {} IN ({})".format(
            gen_anon, ",".join(vars['hash_map']['qls_general']))
    rows = general_db_selector.query(q)

    for row in rows:
        events.append({
            'user_original_id': vars['hash_map']['map_general'][row[gen_anon]],
            'item_type': 'tutorials',
            'item_original_id': original_item_id,
            'observed_event_data': json.dumps({}),
            'observed_event_timestamp': datetime.fromtimestamp(row['submission_time']),
            'observed_event_type_id': oe_type_map['tutorial_visit'],
        })
    return events
# from ..utilities import db
def TransformUserData(vars):
    """Transform source users into the target `users` table.

    Reads user rows via vars['queries'].GetUsers, assigns each a sequential
    MOOCdb id, derives a country from the numeric IP when missing, attaches a
    country-level timezone offset, and bulk-inserts the rows.

    Returns a dict mapping each user's original_id to its new MOOCdb user_id.
    """
    # DB connections
    # --------------
    c = vars['core']
    core_db_selector = db.Selector(c['host'], c['user'], c['password'],
                                   c['port'], c['db'])
    # Populate the users table
    user_id_map = {}
    users = vars['queries'].GetUsers(vars)
    # Target column name -> StaggeredInsert type tag.
    fields = {
        'user_id': 'num',
        'user_email': 'string',
        'user_type_id': 'num',
        'user_join_timestamp': 'datetime',
        'user_ip': 'ip',
        'user_country': 'string',
        'user_timezone_offset': 'num',
        'user_final_grade': 'num',
    }
    # IP-country lookup table
    # (numeric IP ranges, sorted ascending; scanned linearly per user below)
    ip_country_rows = [{
        'start': int(x['ip_numeric_start']),
        'stop': int(x['ip_numeric_stop']),
        'country_code': x['country_code']
    } for x in core_db_selector.query(
        "SELECT ip_numeric_start,ip_numeric_stop,country_code FROM ip_country ORDER BY ip_numeric_start"
    )]
    t = vars['target']
    user_inserter = db.StaggeredInsert(t['host'], t['user'], t['password'],
                                       t['port'], t['db'], 'users', fields)
    moocdb_user_id = 1
    for user in users:
        # User MOOCdb ID: sequential, starting at 1.
        user['user_id'] = moocdb_user_id
        # User IP, converted to numeric form.
        # NOTE(review): the comparison with the string 'null' below suggests
        # db.ip_aton returns 'null' for a missing IP -- confirm in utilities.db.
        user['user_ip'] = db.ip_aton(user['user_ip'])
        # User email cannot be null
        if 'user_email' not in user.keys() or user['user_email'] == None:
            user['user_email'] = ''
        # User country: fall back to the IP range lookup when the platform
        # did not supply one.
        if 'user_country' not in user.keys():
            user['user_country'] = None
        if user['user_country'] == None and user[
                'user_ip'] != 'null':  # Note: Some platforms don't record IP, but do record country
            for ipc_row in ip_country_rows:
                if user['user_ip'] >= ipc_row['start'] and user[
                        'user_ip'] <= ipc_row['stop']:
                    user['user_country'] = ipc_row['country_code']
                    break
        # User timezone offset
        # We are computing it as the mean for the country since some platforms provide incorrect data for user timezone
        # NOTE(review): despite the comment above, this actually picks the
        # middle element of the country's offset list (median-like), not the mean.
        utzo = None
        if user['user_country'] != None:
            r = core_db_selector.query(
                "SELECT * FROM timezone WHERE country_code='{}'".format(
                    user['user_country']))
            if len(r) > 0:
                offsets = [x['gmt_offset'] for x in r]
                utzo = offsets[len(offsets) / 2]  # Python-2 integer division
        user['user_timezone_offset'] = utzo
        # Fields the platform never set are inserted as NULL.
        user_inserter.addRow(
            {k: user[k] if k in user.keys() else None for k in fields})
        user_id_map[user['original_id']] = moocdb_user_id
        moocdb_user_id += 1
    user_inserter.insertPendingRows()
    vars["logger"].Log(
        vars, "Counts: Inserted {} users to target".format(
            user_inserter.num_inserted_rows))
    return user_id_map
from utilities import db
def GetProblems(vars):
    """Return quiz questions and assignment parts as MOOCdb problem items.

    Fixes vs. original: removed the dead `assn_metadata = []` initializer
    (immediately rebound by the loop variable of the same name) and replaced
    `.keys()` membership checks with `dict.get`.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    problem_type_map = moocdb_utils.GetProblemTypeMap(vars)
    output_items = []

    # Quiz questions
    quiz_ids = [
        x['id'] for x in general_db_selector.query("SELECT * FROM quiz_metadata")
    ]
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} quizzes from source".format(len(quiz_ids)))
    for quiz_id in quiz_ids:
        quiz_metadata = GetQuizMetadata(vars, quiz_id)
        test_original_id = 'quiz_' + str(quiz_id)
        quiz_content = GetQuizContent(vars, quiz_id)
        question_index = 0
        for question_group in quiz_content['question_groups']:
            for question in question_group:
                # Map the question widget type onto a MOOCdb problem type.
                choice_type = question.get('choice_type')
                if choice_type == 'radio':
                    problem_type = 'question_mc_single'
                elif choice_type == 'select':
                    problem_type = 'question_mc_multiple'
                else:
                    problem_type = 'question_free_text'
                output_items.append({
                    'original_item_type': 'quiz_question',
                    'problem_original_id': 'quiz_question_' + question['id'],
                    'problem_name': test_original_id + "." + question['id'],
                    'problem_type_id': problem_type_map[problem_type],
                    'problem_parent_original_id': test_original_id,
                    'problem_child_number': question_index,
                    'problem_release_timestamp': quiz_metadata['open_time'],
                    'problem_soft_deadline': quiz_metadata['soft_deadline'],
                    'problem_hard_deadline': quiz_metadata['hard_deadline'],
                    'problem_max_submission': quiz_metadata['max_submissions'],
                    'resource_original_id': test_original_id,
                })
                question_index += 1

    # Assignment parts
    assn_metadata_rows = general_db_selector.query(
        "SELECT * FROM assignment_metadata")
    for assn_metadata in assn_metadata_rows:
        assn_id = assn_metadata['id']
        test_original_id = 'assignment_' + str(assn_id)
        assn_parts = general_db_selector.query(
            "SELECT * FROM assignment_part_metadata WHERE assignment_id={}".format(assn_id))
        for assn_part_index, assn_part in enumerate(assn_parts):
            assn_part_original_id = "assn_part_" + str(assn_part['id'])
            output_items.append({
                'original_item_type': 'assignment_part',
                'problem_original_id': assn_part_original_id,
                'problem_name': assn_part_original_id,
                'problem_type_id': problem_type_map['assignment_part'],
                'problem_parent_original_id': test_original_id,
                'problem_child_number': assn_part_index,
                'problem_release_timestamp': assn_metadata['open_time'],
                'problem_soft_deadline': assn_metadata['soft_close_time'],
                'problem_hard_deadline': assn_metadata['hard_close_time'],
                'problem_max_submission': assn_metadata['maximum_submissions'],
                'problem_weight': None,
                'resource_original_id': test_original_id,
            })
    vars["logger"].Log(
        vars,
        "\t\tCounts: Read {} problems from source".format(len(output_items)))
    return output_items
def GetAdditionalCollaborationEvents(vars):
    """Return forum visit / post-read events mined from the forum_readrecord kvs table.

    Fixes vs. original: the bare `except:` around the user-id parse is
    narrowed to ValueError, and the per-row membership test against
    hash_map['list_raw'] (a list, O(n) per row) now uses a set built once.
    """
    # DB connections
    # --------------
    s = vars['source']
    forum_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                    s['port'], s['forum_db'])
    oe_type_map = moocdb_utils.GetObservedEventTypeMap(vars)
    events = []

    # Find the id of the first forum_post of each thread. This is the item_id
    # under which the event will be registered.
    q = "SELECT id, thread_id FROM forum_posts WHERE id IN (SELECT min(id) FROM forum_posts GROUP BY thread_id)"
    thread_id_to_first_post_id = {
        x['thread_id']: x['id'] for x in forum_db_selector.query(q)
    }

    if vars['source']['platform_format'] == 'coursera_1':
        table_name = "kvs_course.{}.forum_readrecord".format(vars['source']['course_id'])
    else:
        table_name = "kvs_course.forum_readrecord"
    rows = forum_db_selector.query("SELECT * FROM `{}`".format(table_name))

    # Build the known-user set once instead of scanning a list per row.
    known_user_ids = set(vars['hash_map']['list_raw'])

    for row in rows:
        # Keys look like "forum_<forum_id>.<segment>"; only numeric segments
        # are user ids.
        key_parts = row['key'].split(".")
        uoid_str = key_parts[1]
        try:
            user_original_id = int(uoid_str)
        except ValueError:  # non-numeric segment: not a per-user record
            continue
        if user_original_id not in known_user_ids:
            continue
        forum_id = int(key_parts[0].replace('forum_', ''))
        value = phpserialize.loads(row['value'])
        if "_all" in value:
            # A single "_all" timestamp marks a whole-forum visit.
            events.append({
                'user_original_id': user_original_id,
                'observed_event_type_id': oe_type_map['forum_visit'],
                'item_original_id': forum_id,
                'item_type': 'forums',
                'observed_event_timestamp': datetime.fromtimestamp(value["_all"]),
                'observed_event_data': json.dumps({}),
            })
        else:
            # Otherwise each key is a thread id read at the stored timestamp;
            # register the read against the thread's first post.
            for k in value:
                if k in thread_id_to_first_post_id:
                    events.append({
                        'user_original_id': user_original_id,
                        'observed_event_type_id': oe_type_map['forum_post_read'],
                        'item_original_id': 'post_' + str(thread_id_to_first_post_id[k]),
                        'item_type': 'forum_posts',
                        'observed_event_timestamp': datetime.fromtimestamp(value[k]),
                        'observed_event_data': json.dumps({}),
                    })
    return events
def GetTestEvents(vars, original_item_id):
    """Return 'test_visit' and 'test_submission' observed events for one test.

    Only quiz tests ('quiz_<id>') are handled; other ids yield [].

    Fixes vs. original:
    - The chunked IN-clause fetch always ran a final "leftover" query, which
      produced invalid SQL ("... IN ()") whenever len(in_list) was an exact
      multiple of 1000; chunking now steps cleanly and skips empty chunks.
    - Removed large blocks of commented-out UNION-query code.
    - Bare `except:` narrowed to `except Exception`.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    oe_type_map = moocdb_utils.GetObservedEventTypeMap(vars)
    events = []
    if 'quiz_' not in original_item_id:
        return events

    # Get submission metadata
    gen_anon = vars['general_anon_col_name']
    q = "SELECT * FROM quiz_submission_metadata JOIN `{0}`.hash_mapping USING ({1}) WHERE item_id={2}".format(
        vars['source']['hash_mapping_db'], gen_anon,
        original_item_id.replace('quiz_', ''))
    if vars['options']['debug']:
        q += " AND {} IN ({})".format(
            gen_anon, ",".join(vars['hash_map']['qls_general']))
    meta_rows = general_db_selector.query(q)
    submission_metadata = {
        row['id']: {'user_original_id': vars['hash_map']['map_general'][row[gen_anon]]}
        for row in meta_rows
    }
    if len(submission_metadata) == 0:
        return events

    # Get submission content from the kvs table.
    if vars['source']['platform_format'] == 'coursera_1':
        table_name = "kvs_course.{}.quiz".format(vars['source']['course_id'])
    else:
        table_name = "kvs_course.quiz"
    in_list = [
        "'submission.submission_id:{}'".format(submission['id'])
        for submission in meta_rows
    ]

    # The IN clause can get too large, so fetch in chunks of 1000 keys.
    rows = []
    for start in range(0, len(in_list), 1000):
        in_list_string = ",".join(in_list[start:start + 1000])
        rows += general_db_selector.query(
            "SELECT * FROM `{}` WHERE `key` IN ({})".format(table_name, in_list_string))

    # Collect per-user 'started' / 'submitted' timestamps.
    user_event_params = {}
    for row in rows:
        try:
            # Values are doubly php-serialized.
            value = phpserialize.loads(phpserialize.loads(row['value']))
        except Exception:  # narrowed from a bare except
            vars['logger'].Log(
                vars,
                "\t\t\tFailed to deserialize php-serialized string: {}\n\t\t\tSkipping this record"
                .format(row['value']))
            continue
        submission_id = int(row['key'].split(":")[1])
        uoid = submission_metadata[submission_id]['user_original_id']
        user_event_params.setdefault(uoid, []).append(
            {'event_type': 'started', 'timestamp': value['start_time']})
        user_event_params[uoid].append(
            {'event_type': 'submitted', 'timestamp': value['saved_time']})

    for uoid in user_event_params.keys():
        params = user_event_params[uoid]
        submits = [x for x in params if x['event_type'] == 'submitted']
        # Drop a 'started' event when a submit landed in the 120 seconds
        # before it: that start belongs to the same sitting.
        filtered_starts = []
        for x in params:
            if x['event_type'] != 'started':
                continue
            retain_start = True
            for y in submits:
                if x['timestamp'] - 120 < y['timestamp'] < x['timestamp']:
                    retain_start = False
                    break
            if retain_start:
                filtered_starts.append(x)
        for x in filtered_starts:
            events.append({
                'user_original_id': uoid,
                'item_type': 'tests',
                'item_original_id': original_item_id,
                'observed_event_type_id': oe_type_map['test_visit'],
                'observed_event_timestamp': datetime.fromtimestamp(x['timestamp']),
                'observed_event_data': json.dumps({}),
            })
        for x in submits:
            events.append({
                'user_original_id': uoid,
                'item_type': 'tests',
                'item_original_id': original_item_id,
                'observed_event_type_id': oe_type_map['test_submission'],
                'observed_event_timestamp': datetime.fromtimestamp(x['timestamp']),
                'observed_event_data': json.dumps({}),
            })
    return events
def _ProcessItemEvents(vars, item_label, id_map_key, get_events_fn):
    """Fetch, id-map, and insert observed events for every item of one resource type."""
    i = 1
    n = len(vars['id_maps'][id_map_key].keys())
    for original_item_id in vars['id_maps'][id_map_key].keys():
        vars['logger'].Log(
            vars, "\tProcessing events for {} {} out of {}: original ID {}".format(
                item_label, i, n, original_item_id))
        events = get_events_fn(vars, original_item_id)
        for event in events:
            event['user_id'] = vars['id_maps']['users'][event['user_original_id']]
            event['item_id'] = vars['id_maps'][event['item_type']][event['item_original_id']]
        InsertObservedEvents(vars, events)
        i += 1


def TransformObservationData(vars):
    """Populate observed_events from all per-resource event sources plus collaborations.

    Fixes vs. original:
    - The test/wiki/index log calls passed three arguments to a
      one-placeholder format string, so the loop counter was logged as the
      item id; all four loops now share _ProcessItemEvents with the correct
      "{} out of {}: original ID {}" message.
    - Removed a db.Selector that was constructed and never used.
    """
    obs = vars['queries'].observations
    # Videos
    _ProcessItemEvents(vars, 'tutorial', 'tutorials', obs.GetVideoEvents)
    # Tests
    _ProcessItemEvents(vars, 'test', 'tests', obs.GetTestEvents)
    # Wikis
    _ProcessItemEvents(vars, 'wiki', 'wikis', obs.GetWikiVisits)
    # Indices
    _ProcessItemEvents(vars, 'index', 'indices', obs.GetIndexVisits)

    # Collaborations
    # ---------------
    # Events from what already exists in the collaborations table (so we don't
    # have to redo it for the platforms separately)
    vars['logger'].Log(vars, "\tProcessing collaboration events")
    events = vars['common_queries'].GetTransformedCollaborationEvents(vars)
    InsertObservedEvents(vars, events)

    # If a platform has additional collaboration data aside from what was fetched above
    vars['logger'].Log(
        vars, "\tProcessing collaboration events not transformed previously")
    events = vars['queries'].observations.GetAdditionalCollaborationEvents(vars)
    for event in events:
        event['user_id'] = vars['id_maps']['users'][event['user_original_id']]
        item_map = vars['id_maps'][event['item_type']]
        if event['item_original_id'] in item_map:
            event['item_id'] = item_map[event['item_original_id']]
        else:
            # Items never ingested get a sentinel id instead of a KeyError.
            event['item_id'] = -1
    InsertObservedEvents(vars, events)
def GetUsers(vars):
    """Return one user item per mapped user, enriched with grade, type, and join metadata.

    Fixes vs. original: when every recorded normal_grade was 0, the grade
    normalization divided by zero; the max is now clamped to 1 so such
    courses produce 0.0 grades instead of crashing.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    user_type_map = moocdb_utils.GetUserTypeMap(vars)

    # Stuff for mapping coursera access group to moocdb user type
    access_group_id_to_name = {
        x['id']: x['name']
        for x in general_db_selector.query("SELECT * FROM access_groups")
    }
    access_group_name_to_user_type_name = {
        'Student': 'Student',
        'Administrator': 'Administrator',
        'Instructor': 'Instructor',
        'Teaching Staff': 'Teaching Staff',
        'Blocked': 'Blocked',
        'Student Access': 'Student Access',
        'External Viewer': 'Student Access',
        'Community TA': 'Community TA',
        'School Administrator': 'School Administrator',
        'Data Coordinator': 'School Administrator',
        'Coursera Tech Support': 'Administrator',
        'Student (Forum Banned)': 'Student (Forum Banned)',
    }

    # Fetch the users data
    # ---------------------
    gen_anon = vars['general_anon_col_name']
    q = "SELECT {} AS uid, normal_grade FROM course_grades".format(gen_anon)
    if vars['options']['debug']:
        q += " WHERE {} IN ({})".format(
            gen_anon, ",".join(vars['hash_map']['qls_general']))
    rows = general_db_selector.query(q)

    # Normalize final grades against the course maximum.
    max_grade = max([row['normal_grade'] for row in rows]) if len(rows) > 0 else 1
    if not max_grade:
        # Guard against ZeroDivisionError when all recorded grades are 0.
        max_grade = 1
    course_grade_dict = {
        vars['hash_map']['map_general'][row['uid']]:
            1.0 * row['normal_grade'] / max_grade
        for row in rows if row['uid'] in vars['hash_map']['map_general']
    }

    # Start every mapped user with empty metadata and the default Student type.
    user_items = {
        x: {
            'original_id': x,
            'user_ip': None,
            'user_country': None,
            'user_final_grade': None,
            'user_join_timestamp': None,
            'user_type_id': user_type_map['Student']
        }
        for x in vars['hash_map']['list_raw']
    }

    # The join below is to ensure that we only fetch users who have
    # corresponding hash_mapping entries
    q = "SELECT * FROM users JOIN `{0}`.hash_mapping USING (`{1}`)".format(
        vars['source']['hash_mapping_db'], gen_anon)
    if vars['options']['debug']:
        q += " WHERE users.{} IN ({})".format(
            gen_anon, ",".join(vars['hash_map']['qls_general']))
    user_metadata_rows = general_db_selector.query(q)

    for row in user_metadata_rows:
        user_id = vars['hash_map']['map_general'][row[gen_anon]]
        user_items[user_id]['user_ip'] = row.get('last_access_ip')
        user_items[user_id]['user_join_timestamp'] = row['registration_time']
        user_items[user_id]['user_final_grade'] = course_grade_dict.get(user_id)
        group_name = access_group_id_to_name[row['access_group_id']]
        if group_name not in access_group_name_to_user_type_name:
            # Unknown access groups are treated like external viewers.
            group_name = 'External Viewer'
        user_items[user_id]['user_type_id'] = user_type_map[
            access_group_name_to_user_type_name[group_name]]

    return sorted(user_items.values(), key=lambda x: x['original_id'])
def GetForumPosts(vars):
    """Extract forum posts and comments as MOOCdb collaboration items.

    A thread's first post (by post id order from the query) is treated as the
    root post (type forum_post, parent None, numbered within its forum); later
    posts in the thread and all forum_comments rows are emitted as
    forum_comment items whose parent is the root post / owning post.

    Args:
        vars: Pipeline context dict. Reads vars['source'], vars['hash_map'],
            vars['forum_anon_col_name'], vars['options']['debug'], vars['logger'].

    Returns:
        List of collaboration item dicts.
    """
    # DB connections
    # --------------
    s = vars['source']
    forum_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                    s['port'], s['forum_db'])
    output_items = []
    post_ctid = moocdb_utils.GetCollaborationTypeMap(vars)['forum_post']
    comment_ctid = moocdb_utils.GetCollaborationTypeMap(vars)['forum_comment']

    # Threads (used to map thread -> forum and to count root posts)
    q_threads = "SELECT * FROM forum_threads JOIN `{0}`.hash_mapping USING ({1})".format(
        vars['source']['hash_mapping_db'], vars['forum_anon_col_name'])
    if vars['options']['debug']:
        in_list = ",".join(vars['hash_map']['qls_forum'])
        q_threads += " WHERE {} IN ({})".format(vars['forum_anon_col_name'],
                                                in_list)
    forum_threads_rows = forum_db_selector.query(q_threads)
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} forum posts from source".format(
            len(forum_threads_rows)))

    # Posts, joined to their thread/forum for resource attribution
    q_posts = "SELECT forum_posts.id AS post_id, forum_threads.id AS thread_id, forum_forums.id AS forum_id, forum_posts.{0} AS poster_id, post_text, post_time FROM forum_posts JOIN `{1}`.hash_mapping USING ({0}) JOIN forum_threads ON forum_posts.thread_id=forum_threads.id JOIN forum_forums ON forum_threads.forum_id=forum_forums.id".format(
        vars['forum_anon_col_name'], vars['source']['hash_mapping_db'])
    if vars['options']['debug']:
        in_list = ",".join(vars['hash_map']['qls_forum'])
        q_posts += " WHERE forum_posts.{} IN ({})".format(
            vars['forum_anon_col_name'], in_list)
    forum_posts_rows = forum_db_selector.query(q_posts)

    thread_forum_ids = {x['id']: x['forum_id'] for x in forum_threads_rows}
    thread_first_post_id = {}   # thread_id -> root post id
    forum_num_posts = {}        # forum_id -> next child number for root posts
    posts_num_comments = {}     # 'post_<id>' -> next child number for replies

    for p in forum_posts_rows:
        # Skip posts whose thread was not fetched (e.g. filtered in debug mode)
        if p['thread_id'] not in thread_forum_ids:
            continue
        forum_id = thread_forum_ids[p['thread_id']]
        thread_id = p['thread_id']
        # First post seen for a thread is its root post
        is_root_post = thread_id not in thread_first_post_id
        if is_root_post:
            thread_first_post_id[thread_id] = p['post_id']
        parent_id = None if is_root_post else "post_" + str(
            thread_first_post_id[thread_id])

        x = {
            'original_id': 'post_' + str(p['post_id']),
            'resource_original_id': p['forum_id'],
            'collaboration_type_id': post_ctid if is_root_post else comment_ctid,
            'collaboration_parent_original_id': parent_id,
            'user_original_id': vars['hash_map']['map_forum'][p['poster_id']],
            'collaboration_content': p['post_text'],
            'collaboration_timestamp': datetime.fromtimestamp(p['post_time']),
        }
        if is_root_post:
            # BUG FIX: this counter was keyed by parent_id, which is always
            # None for root posts, so every root post in the course shared one
            # global counter. Root posts belong to their forum resource, so
            # number them per forum_id instead.
            if forum_id not in forum_num_posts:
                forum_num_posts[forum_id] = 1
            x['collaboration_child_number'] = forum_num_posts[forum_id]
            forum_num_posts[forum_id] += 1
        else:
            if parent_id not in posts_num_comments:
                posts_num_comments[parent_id] = 1
            x['collaboration_child_number'] = posts_num_comments[parent_id]
            posts_num_comments[parent_id] += 1
        output_items.append(x)

    # Comments attached to posts
    q_comments = "SELECT forum_comments.id AS comment_id, forum_comments.post_id AS post_id, forum_forums.id AS forum_id, forum_comments.{0} AS poster_id, comment_text, forum_comments.post_time AS post_time FROM forum_comments JOIN `{1}`.hash_mapping USING ({0}) JOIN forum_posts ON forum_comments.post_id=forum_posts.id JOIN forum_threads ON forum_posts.thread_id=forum_threads.id JOIN forum_forums ON forum_threads.forum_id=forum_forums.id".format(
        vars['forum_anon_col_name'], vars['source']['hash_mapping_db'])
    if vars['options']['debug']:
        in_list = ",".join(vars['hash_map']['qls_forum'])
        q_comments += " WHERE forum_comments.{} IN ({})".format(
            vars['forum_anon_col_name'], in_list)
    forum_comments_rows = forum_db_selector.query(q_comments)
    # Non-root posts + comment rows are all "comments" for counting purposes
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} forum_comments from source".format(
            len(forum_posts_rows) + len(forum_comments_rows) -
            len(forum_threads_rows)))

    for c in forum_comments_rows:
        parent_id = "post_" + str(c['post_id'])
        # Continues the same per-post counter used for non-root posts above
        if parent_id not in posts_num_comments:
            posts_num_comments[parent_id] = 1
        child_number = posts_num_comments[parent_id]
        posts_num_comments[parent_id] += 1
        output_items.append({
            'original_id': 'comment_' + str(c['comment_id']),
            'resource_original_id': c['forum_id'],
            'collaboration_type_id': comment_ctid,
            'collaboration_parent_original_id': parent_id,
            'collaboration_child_number': child_number,
            'collaboration_timestamp': datetime.fromtimestamp(c['post_time']),
            'collaboration_content': c['comment_text'],
            'user_original_id': vars['hash_map']['map_forum'][c['poster_id']],
        })

    return output_items
def GetStudentQuizResponses(vars, quiz_original_id):
    """Extract per-student answers for one quiz from the kvs submission store.

    For each submission row in quiz_submission_metadata, loads the
    php-serialized (twice-serialized) submission blob from the kvs table and
    records, per user and per question, a history of distinct consecutive
    answers with a best-effort grade (-1 when the grade cannot be determined).

    Args:
        vars: Pipeline context dict (source DB config, hash_map, logger, ...).
        quiz_original_id: Quiz id of the form 'quiz_<id>'.

    Returns:
        Dict: user_id -> {question_id -> [{'submission_time', 'answer',
        'grade'}, ...]}.
    """
    # DB connections
    # --------------
    s = vars['source']
    general_db_selector = db.Selector(s['host'], s['user'], s['password'],
                                      s['port'], s['general_db'])
    # kvs table name embeds the course id only for the 'coursera_1' format
    table_name = "kvs_course.{}.quiz".format(
        vars['source']['course_id']) if vars['source'][
            'platform_format'] == 'coursera_1' else "kvs_course.quiz"
    output = {}
    quiz_id = quiz_original_id.replace("quiz_", "")
    # NOTE(review): course_id is assigned but never used below.
    course_id = vars['source']['course_id']
    quiz_content = GetQuizContent(vars, quiz_id)
    quiz_question_dict = quiz_content['question_dict']

    # Submission metadata, restricted to users present in hash_mapping
    q = "SELECT * FROM quiz_submission_metadata JOIN `{0}`.hash_mapping USING ({1}) WHERE item_id={2}".format(
        vars['source']['hash_mapping_db'], vars['general_anon_col_name'],
        quiz_id)
    if vars['options']['debug']:
        q += " AND {} IN ({})".format(
            vars['general_anon_col_name'],
            ",".join(vars['hash_map']['qls_general']))
    r = general_db_selector.query(q)
    vars["logger"].Log(
        vars, "\t\tCounts: Read {} quiz responses from source for {}".format(
            0 if r == None else len(r), quiz_original_id))
    if r == None:
        return {}

    for sub1 in r:
        # Map the anonymized user id back to the raw user id; skip unknowns
        anon_user_id = sub1[vars['general_anon_col_name']]
        try:
            user_id = vars['hash_map']['map_general'][anon_user_id]
        except:
            vars["logger"].Log(
                vars,
                "\t\t\tSubmission {} skipped: anon_user_id '{}' not found in hash_mapping"
                .format(sub1['id'], anon_user_id))
            continue

        # The actual answers live in the kvs table, keyed by submission id
        sub2 = general_db_selector.query(
            "SELECT * FROM `{}` WHERE `key`='submission.submission_id:{}'".
            format(table_name, sub1['id']))
        if len(sub2) == 0:
            vars["logger"].Log(
                vars,
                "\t\t\tSubmission {} skipped: Not found in kvs table".format(
                    sub1['id']))
            continue
        sub2 = sub2[0]
        if user_id not in output.keys():
            output[user_id] = {}

        # Python 2: decode bytes leniently, then unwrap the double
        # php-serialization of the submission payload
        s = unicode(sub2['value'], errors='ignore')
        try:
            value = phpserialize.loads(phpserialize.loads(s))
        except:
            vars['logger'].Log(
                vars,
                "\t\t\tFailed to load php-serialized string: {}".format(s))
            continue

        for qid in value['answers'].keys():
            grade = -1  # We don't know the grade
            question_answer = value['answers'][qid]
            if qid not in quiz_question_dict.keys():
                # NOTE(review): this format string has one '{}' but two
                # .format() args — the XML question-id list is never printed.
                vars['logger'].Log(
                    vars,
                    "\t\t\tA question id found in a student response does not exist in the quiz XML! Question ID in student response: {}, Question IDs from XML: "
                    .format(qid, quiz_question_dict.keys()))
                continue
            if "choice_type" in quiz_question_dict[qid].keys():
                # Choice questions: the answer dict's values are selections
                question_answer = question_answer.values()
                if quiz_question_dict[qid]["choice_type"] in [
                        'select', 'radio'
                ]:
                    if len(question_answer) == 0:
                        # No selection made -> graded 0
                        grade = 0
                        question_answer = None
                    elif len(question_answer) == 1:
                        # Single selection: look up its score among the
                        # question's option groups
                        question_answer = question_answer[0]
                        selected_score_dict = {}
                        for option_group in quiz_question_dict[qid]['options']:
                            for option in option_group:
                                selected_score_dict[
                                    option['id']] = option['selected_score']
                        if question_answer in selected_score_dict.keys():
                            grade = selected_score_dict[question_answer]
                    else:
                        vars['logger'].Log(
                            vars,
                            "\t\t\t\tRadio choice_type question, but student response contains multiple selections. Question: {}, Answer: {}"
                            .format(qid, question_answer))
                elif quiz_question_dict[qid]["choice_type"] == 'checkbox':
                    # Checkbox answers kept as the list of selections, ungraded
                    pass
            elif isinstance(question_answer, dict) and len(
                    question_answer.keys()) == 1 and question_answer.keys(
                    )[0] == 'answer':
                # Free-form answer wrapped as {'answer': ...} (Python 2:
                # keys() is a list, so [0] indexing is valid)
                question_answer = question_answer['answer']
            else:
                vars['logger'].Log(
                    vars,
                    "\t\t\tUnexpected answer format for question {}:".format(
                        qid))
            if qid not in output[user_id].keys():
                output[user_id][qid] = []
            # Only record the answer when it differs from the previous one,
            # i.e. keep a history of distinct consecutive answers
            if len(
                    output[user_id][qid]
            ) == 0 or output[user_id][qid][-1]['answer'] != question_answer:
                output[user_id][qid].append({
                    'submission_time':
                    value['saved_time']
                    if 'saved_time' in value.keys() else None,
                    'answer':
                    question_answer,
                    'grade':
                    grade
                })
    return output