Usage: python activities_with_lower_completion.py ''' from collections import defaultdict from base_edx import EdXConnection from generate_csv_report import CSV # Connect to MongoDB and extra the tracking collection connection = EdXConnection('user_attempts_per_problem_id') collection = connection.get_access_to_collection() cursor = collection['user_attempts_per_problem_id'].find() result = defaultdict(lambda: defaultdict(int)) for index, document in enumerate(cursor): # If there is a correct attempts, accept as answered correctly, else accept #as incorrect only once per student per problem id if 'correct' in document['attempts']: result[document['_id']['problem_id']]['correct'] += 1 else: result[document['_id']['problem_id']]['incorrect'] += 1 csv_result = [[item, result[item]['correct'], result[item]['incorrect']] for item in result] output = CSV(csv_result, ['Problem Id', 'Correct Count', 'Incorrect Count'], output_file='activities_with_lower_completion.csv') output.generate_csv()
cursor = collection['format_tests'].find({'parent_data.chapter_display_name' : 'Test 1'}) users_sessions = defaultdict(list) for index,item in enumerate(cursor): #print index, item['parent_data']['chapter_display_name'] users_sessions[(item['username'], item['session'])].append(item['time']) users_tests_events = defaultdict(int) for (username,session),times in users_sessions.iteritems(): end_time = datetime.strptime(max(times).split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") start_time = datetime.strptime(min(times).split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") cursor = collection['tracking'].find({'username' : username, 'session' : session, '$or' : [{'event_type' : 'seq_goto'},{'event_type':'seq_prev'},{'event_type' : 'seq_next'}]}) for index, document in enumerate(cursor): try: time_stamp = datetime.strptime(document['time'].split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") if start_time <= time_stamp <= end_time: if 'sequential_display_name' in document['parent_data'] and document['parent_data']['sequential_display_name']: sequential_display_name = document['parent_data']['sequential_display_name'] elif document['metadata']['display_name']: sequential_display_name = document['metadata']['display_name'] else: sequential_display_name = None #users_tests_events[(username, session, document['parent_data'].get('chapter_display_name', None),document['parent_data'].get('sequential_display_name', None))] += 1 users_tests_events[(username, session, document['parent_data'].get('chapter_display_name', None),sequential_display_name)] += 1 except: print index,document result = [] for (username,session,chapter_name, sequential_name) in users_tests_events: result.append([username,session,chapter_name, sequential_name, users_tests_events[(username,session,chapter_name, sequential_name)]]) output = CSV(result, ['Username', 'Session ID', 'Chapter Display Name', 'Sequential Display Name', 'Navigation Count'], output_file='test1_analysis.csv') output.generate_csv()
'''
This module determines how many chapters were accessed by each user for a
given course

Usage:
python chapters_accessed_per_user

'''
from collections import defaultdict

from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('tracking', 'course_structure')
collection = connection.get_access_to_collection()

# Get all chapters
chapters = collection['course_structure'].distinct(
    'parent_data.chapter_display_name')
tracking = collection['tracking'].find()

result = []
for document in tracking:
    if 'parent_data' in document:
        # TODO: the per-user chapter aggregation is not implemented yet, so
        # the report currently contains only the header row
        pass

# list.extend() mutates in place and returns None, so the header row must be
# built by concatenation instead
headers = ['Username'] + list(chapters)
output = CSV(result, headers,
             output_file='atoc185x_chapters_accesses_per_user.csv')
output.generate_csv()
'$match': { 'username': { '$in': usernames }, '$or': [{ 'event_type': 'play_video' }, { 'event_type': 'problem_check', 'event_source': 'server' }] } }, { '$group': { '_id': { "username": "******", "chapter_name": "$parent_data.chapter_display_name", "sequential_name": "$parent_data.sequential_display_name", "vertical_name": "$parent_data.vertical_display_name" } } }]) #, {'$out' : 'students_50_to_59_events'}]) result = [[ document['_id']['username'], document['_id']['chapter_name'], document['_id']['sequential_name'], document['_id']['vertical_name'] ] for document in cursor['result'] if 'chapter_name' in document['_id']] output = CSV(result, ['Username', 'Chapter Name', 'Sequential Name', 'Vertical Name'], output_file='failure_analysis_50_to_59.csv') output.generate_csv()
# NOTE(review): this chunk begins inside a list literal (presumably the start
# of temp_result) within a loop whose header is not visible here.
# Visible behaviour: each id in `children` is looked up in course_structure,
# then each of that document's own children is looked up and tallied by
# 'category'. The six category counts are appended to the row, and one row
# per item is written to sequential_aggregation.csv.
# NOTE(review): the inner loop reuses the name `_id` of the outer loop.
# NOTE(review): aggregate_vertical is filled but never read in the visible
# code; 'Number of Verticals' comes from len(children).
# NOTE(review): `except Exception: pass` silently skips a child whenever a
# lookup or key access fails, which under-counts without any trace.
item['_id'], item['parent_data']['chapter_display_name'], item['metadata']['display_name'], len(children) ] aggregate_vertical = defaultdict(int) aggregate_category = defaultdict(int) for _id in children: try: vertical = collection['course_structure'].find_one({'_id': _id}) aggregate_vertical[vertical['category']] += 1 for _id in vertical['children']: child = collection['course_structure'].find_one({'_id': _id}) aggregate_category[child['category']] += 1 except Exception: pass temp_result.extend([ aggregate_category['video'], aggregate_category['html'], aggregate_category['problem'], aggregate_category['discussion'], aggregate_category['poll_question'], aggregate_category['word_cloud'] ]) result.append(temp_result) output = CSV(result, [ 'Sequential ID', 'Chapter Display Name', 'Sequential Name', 'Number of Verticals', 'Number of Videos', 'Number of HTML', 'Number of Problems', 'Number of Discussions', 'Number of Poll Questions', 'Number of Word Cloud' ], output_file='sequential_aggregation.csv') output.generate_csv()
''' import sys import csv from base_edx import EdXConnection from generate_csv_report import CSV db_name = sys.argv[1] # Change name of collection as required connection = EdXConnection(db_name, 'user_id_map' ) collection = connection.get_access_to_collection() with open(sys.argv[2]) as f: headers = next(f) reader = csv.reader(f) data = [row for row in reader] result = [] for row in data: cursor = collection['user_id_map'].find_one({'id' : long(row[0])}) hash_id = cursor['hash_id'] username = cursor['username'] result.append([row[0], username, hash_id] + row[1:]) input_file, extension = sys.argv[2].split('.') output = CSV(result, [headers.split(',')[0],'Username','User Hash ID'] + headers.split(',')[1:], output_file=input_file+'_userid_anon.'+extension) output.generate_csv()
# NOTE(review): this chunk begins inside an enclosing block that is not
# visible here.
# Visible behaviour: the CSV columns are the sorted keys of the entry in
# `result` that has exactly five items and the most distinct keys. Each
# user's answers are then written into the matching column, with the
# username inserted first.
# NOTE(review): survey_question_ids starts as {} (a dict) and is later
# rebound to a set and then to a sorted list.
# NOTE(review): .index(key) runs before the `if key in ...` test, so unknown
# keys raise ValueError, which the bare except swallows; the `in` test is
# therefore redundant.
not_in_auth_user.add(document['student_id']) # For loop to retrieve the names of all the survey pages. Since a student may # not have filled all pages, we look for the longest list and use the values # to retrieve the survey pages survey_question_ids = {} for value in result.values(): if len(value) == 5: temp = {key for item in value for key in item.keys()} if len(temp) > len(survey_question_ids): survey_question_ids = temp survey_question_ids = sorted(list(survey_question_ids)) csv_data = [] for username, survey_info in result.iteritems(): temp = [''] * len(survey_question_ids) for item in survey_info: for key, value in item.iteritems(): try: index = survey_question_ids.index(key) if key in survey_question_ids: temp[index] = value except: pass temp.insert(0, username) csv_data.append(temp) output = CSV(csv_data, ['Username'] + survey_question_ids, output_file=db_name + '_entrance_exit_surveys.csv') output.generate_csv()
''' import csv from base_edx import EdXConnection from generate_csv_report import CSV connection = EdXConnection('tracking_atoc185x') collection = connection.get_access_to_collection() # Can replace csv file with any csv file that contains the list of usernames # who completed the course and achieved a certificate. Alternately, one can # save that info in another collection in mongoDB and extra it from the collection with open('atoc185x/course_completers.csv', 'r') as csv_file: reader = csv.reader(csv_file) reader.next() usernames = {row[1] for row in reader} cursor = collection['tracking_atoc185x'].find( {'event_type': 'show_transcript'}) result = [] seen = set() for document in cursor: if document['username'] in usernames and document['username'] not in seen: seen.add(document['username']) result.append([document['username']]) output = CSV(result, ['Username'], output_file='show_transcript_completers.csv') output.generate_csv()
# Usernames are read from the third column of the grade report
with open('csv_files/McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv', 'r') as csv_file:
    rows = csv.reader(csv_file)
    next(rows)  # skip the header row
    usernames = [row[2] for row in rows]

# Event type (or keyword) -> short name of the navigation tab
NAVIGATION_TABS = {
    '/courses/McGillX/ATOC185x/2T2014/info': 'info',
    '/courses/McGillX/ATOC185x/2T2014/progress': 'progress',
    '/courses/McGillX/ATOC185x/2T2014/109d5374b52040e2a8b737cf90c5618a/': 'syllabus',
    '/courses/McGillX/ATOC185x/2T2014/441b2c519f5c464883e2ddceb26c5559/': 'maps',
    '/courses/McGillX/ATOC185x/2T2014/84f630e833eb4dbabe0a6c45c52bb443/': 'scoreboard',
    '/courses/McGillX/ATOC185x/2T2014/e75195cb39fa4e3890a613a1b3c04c7d/': 'faq',
    'courseware': 'courseware',
    'discussion': 'discussion',
    '/courses/McGillX/ATOC185x/2T2014/instructor': 'instructor',
}

cursor = collection['tracking_atoc185x'].find({
    'username': {'$in': usernames},
    'event_type': {
        '$regex': '^/courses/McGillX/ATOC185x/2T2014/(info$|progress$|instructor$|109d5374b52040e2a8b737cf90c5618a/$|441b2c519f5c464883e2ddceb26c5559/$|84f630e833eb4dbabe0a6c45c52bb443/$|e75195cb39fa4e3890a613a1b3c04c7d/$|courseware|discussion)'
    },
})

# (date, tab) -> number of events
tab_events_per_date = defaultdict(int)
for doc in cursor:
    date = datetime.strptime(doc['time'].split('T')[0], "%Y-%m-%d").date()
    event_type = doc['event_type']
    # All courseware / discussion URLs are folded into a single tab each;
    # every other event keeps its full event type as the tab id
    if 'courseware' in event_type:
        tab = 'courseware'
    elif 'discussion' in event_type:
        tab = 'discussion'
    else:
        tab = event_type
    tab_events_per_date[(date, tab)] += 1

result = []
for (date, tab), event_count in tab_events_per_date.items():
    result.append([date, tab, event_count])

output = CSV(result, ['Date', 'Tab ID', 'Number of Events'],
             output_file='number_of_tab_events_per_date_completers.csv')
output.generate_csv()
'''
This module extracts the student IDs from the collection
certificates_generatedcertificate of the students who completed the course and
achieved a certificate. The ids are then used to extract the usernames of the
course completers

Usage:
python course_completers.py

'''
from collections import defaultdict

from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('certificates_generatedcertificate', 'auth_user')
collection = connection.get_access_to_collection()

# Certificates with status 'downloadable' identify the course completers
completers = collection['certificates_generatedcertificate'].find(
    {'status': 'downloadable'})

result = []
for document in completers:
    user_document = collection['auth_user'].find_one(
        {"id": document['user_id']})
    if user_document is None:
        # find_one returns None when no auth_user record matches; skip the
        # certificate instead of failing with a TypeError on subscripting
        continue
    result.append([
        user_document['id'],
        user_document['username'],
        document['name'],
        document['grade'],
    ])

output = CSV(result, ['User ID', 'Username', 'Name', 'Grade'],
             output_file='course_completers.csv')
output.generate_csv()
from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('seek_video')
collection = connection.get_access_to_collection()

# NOTE(review): defined but never passed to the query below
sort_parameters = [
    ('parent_data.chapter_display_name', 1),
    ('parent_data.sequential_display_name', 1),
    ('parent_data.vertical_display_name', 1),
]

cursor = collection['seek_video'].find()

# One row per seek event; a missing 'old_time' is reported as 0
result = []
for item in cursor:
    old_time = item['event']['old_time'] if 'old_time' in item['event'] else 0
    row = [item['username']]
    row.extend(item['parent_data'][field] for field in (
        'chapter_display_name',
        'sequential_display_name',
        'vertical_display_name',
    ))
    row.extend([old_time, item['event']['new_time']])
    result.append(row)

headers = ['Username', 'Chapter Name', 'Sequential Name', 'Vertical Name',
           'Old Time', 'New Time']
output = CSV(result, headers, output_file='seek_video.csv', row_limit=200000)
output.generate_csv()
username, video associated with load_video event, parent_data: {chapter_display_name, sequential_display_name, vertical_display_name,}, edx_video_id, video watch segments get the event_types : load_video, play_video, pause_video, seek_video sort by "time": "" so that the events are chronologically ordered for each load_video new video watch segment should include ONLY: - time between play_video -> next video event in time (pause_video or seek_video) - time between seek_video : {'new_time' : Time} -> pause video (only with new_time > old_time, this is to avoid including rewinds) watch periods: event_type : pause_video - "event_type":"play_video" {"event":{"currentTime":TIME}} = new video watch segment if seek_video : {'old_time' : TIME} < seek_video : {'new_time' : TIME} "pause_video" {"event":{"currentTime":TIME}} - seek_video : {'new_time': TIME } = new video watch segment if seek_video : {'old_time' : TIME} > seek_video : {'new_time' : TIME} = rewind (exclude from watch segments) ''' from base_edx import EdXConnection from generate_csv_report import CSV connection = EdXConnection('video_watch_duration_collection') collection = connection.get_access_to_collection() cursor = collection['video_watch_duration_collection'].find() result = [] output = CSV(result,['Username',], output_file='video_watch_duration.csv', row_limit=200000) output.generate_csv()
from collections import defaultdict
import json
import sys

from base_edx import EdXConnection
from generate_csv_report import CSV

db_name = sys.argv[1]

# Change name of collection as required
connection = EdXConnection(db_name, 'forum')
collection = connection.get_access_to_collection()

forum_data = collection['forum'].find()


def _forum_row(document):
    '''
    Flatten one forum document into a csv row; a missing title becomes ''
    '''
    return [
        document['_id']['oid'],
        document['author_username'],
        document['_type'],
        document.get('title', ''),
        document['body'],
        document['created_at']['date'],
    ]


csv_data = [_forum_row(document) for document in forum_data]

headers = ['ID', 'Author Username', 'Type', 'Title', 'Body', 'Created At Date']
output = CSV(csv_data, headers, output_file=db_name + '_forum_data.csv')
output.generate_csv()
# NOTE(review): this chunk begins inside an enclosing block that is not
# visible here (the first statements report and collect a failed `item`).
# Visible behaviour: the number of failures is printed and, when there are
# any, they are dumped as JSON to report.txt. Then, for every username and
# session id in users_to_sessions, one row is produced with the number of
# events, the earliest and latest timestamp, and the difference between them.
# Timestamps are compared as raw strings by min()/max() and only afterwards
# parsed, using the text before '+' and the format "%Y-%m-%dT%H:%M:%S.%f"
# -- presumably all timestamps share that format; verify.
print "Fail -> %s" % item fail.append(item) print "Number of fail: " + str(len(fail)) if fail: import json with open('report.txt', 'w') as outfile: json.dump(fail, outfile) else: print "no fail" result = [] for item in users_to_sessions: for nested_item in users_to_sessions[item]: max_time = max(users_to_sessions[item][nested_item]) end_time = datetime.strptime( max_time.split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") min_time = min(users_to_sessions[item][nested_item]) start_time = datetime.strptime( min_time.split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") result.append([ item, nested_item, len(users_to_sessions[item][nested_item]), start_time, end_time, end_time - start_time ]) output = CSV(result, [ 'Username', 'Session ID', 'Number of Events', 'Start Time', 'End Time', 'Time Spent' ], output_file='session_info.csv') output.generate_csv()
from generate_csv_report import CSV

connection = EdXConnection('tracking_atoc185x')
collection = connection.get_access_to_collection()

# Usernames are read from the third column of the grade report
with open('csv_files/McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv', 'r') as csv_file:
    reader = csv.reader(csv_file)
    # Skip the header row so its column title is not treated as a username
    next(reader)
    usernames = [row[2] for row in reader]

# Count browser-side sequential navigation events of the listed users,
# grouped by chapter, display name, event type and old/new position
cursor = collection['tracking_atoc185x'].aggregate([
    {"$match": {
        "event_source": "browser",
        "$or": [{"event_type": "seq_prev"},
                {"event_type": "seq_goto"},
                {"event_type": "seq_next"}],
        'username': {'$in': usernames},
    }},
    {"$group": {
        "_id": {
            'chapter_name': "$parent_data.chapter_display_name",
            "display_name": "$metadata.display_name",
            "event_type": "$event_type",
            "event_old": "$event.old",
            "event_new": "$event.new",
        },
        "count": {"$sum": 1},
    }},
])

result = []
for item in cursor['result']:
    group = item['_id']
    try:
        result.append([
            group['chapter_name'],
            group['display_name'],
            group['event_type'],
            group.get('event_old', 0),
            group['event_new'],
            item['count'],
        ])
    except KeyError:
        # Groups lacking one of the required fields are left out of the
        # report; only that specific failure is tolerated
        continue

output = CSV(result,
             ['Chapter Name', 'Display Name', 'Event Type', 'Event Old',
              'Event New', 'Count'],
             output_file='navigation_frequency_completers.csv')
output.generate_csv()
from datetime import datetime

from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('student_courseenrollment')
collection = connection.get_access_to_collection()

# Any csv file listing the students who completed the course and earned a
# certificate can be used here. Alternatively, that list can be stored in its
# own MongoDB collection and read from there.
with open('atoc185x/course_completers.csv') as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # skip the header row
    # user id (as text) -> username
    users = {row[0]: row[1] for row in reader}

result = []
student_courseenrollment = collection['student_courseenrollment'].find()
seen = set()
for document in student_courseenrollment:
    user_id = document['user_id']
    # The csv holds ids as text, the collection value is converted to match
    if str(user_id) not in users or user_id in seen:
        continue
    seen.add(user_id)
    result.append([
        user_id,
        users[str(user_id)],
        # Keep only the date part of the 'created' timestamp
        document['created'].split()[0],
    ])

# Every row has three values, so the header needs three names; the user id
# column was previously left without one
output = CSV(result, ['User ID', 'Username', 'Date of Registration'],
             output_file='date_of_registration_completers.csv')
output.generate_csv()
def _generate_name_from_problem_id(problem_id):
    '''
    Build the csv output file name for a problem id: the '/'-separated parts
    from the fourth one onwards, joined by underscores, plus '.csv'
    '''
    name_parts = problem_id.split('/')[3:]
    return '%s.csv' % '_'.join(name_parts)


# All problem events recorded for the problem id given on the command line
cursor = collection['atoc185x_problem_ids'].find(
    {'event.problem_id': sys.argv[1]})

# One row per recorded event of that problem
result = []
for document in cursor:
    event = document['event']
    result.append([
        document['username'],
        event['attempts'],
        document['module']['display_name'],
        document['time'],
        event['success'],
        event['grade'],
        event['max_grade'],
        event['answers'],
    ])

csv_report_name = _generate_name_from_problem_id(sys.argv[1])
headers = ['Username', 'Attempt Number', 'Module', 'Time', 'Success',
           'Grade Achieved', 'Max Grade', 'Answers']
output = CSV(result, headers, output_file=csv_report_name)
output.generate_csv()
'^/courses/McGillX/ATOC185x/2T2014/(info$|progress$|instructor$|109d5374b52040e2a8b737cf90c5618a/$|441b2c519f5c464883e2ddceb26c5559/$|84f630e833eb4dbabe0a6c45c52bb443/$|e75195cb39fa4e3890a613a1b3c04c7d/$|courseware|discussion)' } }) unique_users_per_tab = defaultdict(set) for doc in cursor: if 'courseware' in doc['event_type']: unique_users_per_tab['courseware'].add(doc['username']) elif 'discussion' in doc['event_type']: unique_users_per_tab['discussion'].add(doc['username']) else: unique_users_per_tab[doc['event_type']].add(doc['username']) #with open('csv_files/number_of_unique_users_per_navigation_tab.csv', 'w') as csv_file: # writer = csv.writer(csv_file) # writer.writerow(['Navigation Tab', 'Number of Unique Users']) # for key in unique_users_per_tab: # writer.writerow([key, len(unique_users_per_tab[key])]) #with open('csv_files/users_per_navigation_tab.csv', 'w') as csv_file: # writer = csv.writer(csv_file) # writer.writerow(['Navigation Tab','Tab', 'Number of Unique Users']) # for key in unique_users_per_tab: # writer.writerow([key,NAVIGATION_TABS[key] ,len(unique_users_per_tab[key])]) result = [] for key in unique_users_per_tab: result.append([key, NAVIGATION_TABS[key], len(unique_users_per_tab[key])]) output = CSV(result, ['Navigation Tab', 'Tab', 'Number of Unique Users'], output_file='number_of_unique_users_per_navigation_tab.csv') output.generate_csv()
# NOTE(review): this chunk begins at a `try:` that sits inside an enclosing
# block (presumably a loop over key/value pairs) not visible here.
# Visible behaviour: the value is resolved to a country code with
# geoip.country() and to a country name via country_code_to_country. A falsy
# key is replaced by 'anonymous' and always appended; otherwise a row is
# appended only the first time a (key, country) pair is seen.
# NOTE(review): the except clause is bare, and its fallback handles only the
# address 41.79.120.29 (mapped to 'SS'); any other failure is dropped
# without a trace.
try: country_code = geoip.country(value) country = country_code_to_country[country_code] if not key: key = 'anonymous' ip_to_country.append([key, value, country_code, country]) elif (key, country) not in country_set: country_set.add((key, country)) ip_to_country.append([key, value, country_code, country]) except: # IMPORTANT # The following code for an exception are hardcoded for those IPs which do have a mapping to a # country code but they were not available in GeoIP.dat (most probably because it was not updated) # People using this script can either report this code (under except) and or additional conditions # IP addresses which cannot be mapped to a country code stored in GeoIP.dat if value == '41.79.120.29': country = country_code_to_country['SS'] if not key: key = 'anonymous' ip_to_country.append( [key, value, 'SS', country_code_to_country['SS']]) elif (key, country) not in country_set: country_set.add((key, country)) ip_to_country.append( [key, value, 'SS', country_code_to_country['SS']]) output = CSV(ip_to_country, ['Username', 'IP Address', 'Country Code', 'Country'], output_file=db_name + '_ip_to_country.csv') output.generate_csv()
# NOTE(review): this chunk begins inside nested blocks (an if/else within a
# try) whose opening lines are not visible here.
# Visible behaviour: for each username the entry with the highest
# student_attempts is kept in `output`; afterwards one row is produced per
# (question, answer) pair of that entry.
# NOTE(review): every row holds four values (username, problem id, question,
# answer) but only three header names are passed to CSV -- confirm how CSV
# treats the mismatch.
# NOTE(review): `output` is first the per-user dict and is then rebound to
# the CSV writer.
# NOTE(review): the bare except reports every failure as "Key Error!!!".
student_answers_values[key] = mapped_values else: student_answers_values[key] = values if username not in output: output[username] = { 'problem_id': problem_id, 'student_answers': student_answers_values, 'attempts': student_attempts } else: if student_attempts > output[username]['attempts']: output[username]['problem_id'] = problem_id output[username]['student_answers'] = student_answers_values output[username]['attempts'] = student_attempts except: print "Key Error!!!", key, values, problem_id result = [] for key, values in output.iteritems(): username = key problem_id = values['problem_id'] for question, answer in values['student_answers'].iteritems(): result.append([username, problem_id, question, answer]) output = CSV(result, ['Username', 'Problem ID', 'Student Answers'], output_file=db_name + '_student_answers.csv') output.generate_csv() #with open(db_name + '_student_answers', 'w') as f: # json.dump(output, f)
from base_edx import EdXConnection
from generate_csv_report import CSV

connection = EdXConnection('certificates_generatedcertificate',
                           'auth_userprofile')
collection = connection.get_access_to_collection()

documents = collection['auth_userprofile'].find()

result = []
for document in documents:
    user_id = document['user_id']
    certificate = collection['certificates_generatedcertificate'].find_one(
        {'user_id': user_id})
    if certificate is None:
        # Users without a certificate record have no grade: leave them out
        continue
    try:
        result.append([
            user_id,
            document['name'],
            certificate['grade'],
            document['gender'],
            document['year_of_birth'],
            document['level_of_education'],
            document['country'],
            document['city'],
        ])
    except KeyError:
        # A record lacking one of the reported fields is skipped, as before.
        # Anything else (including Ctrl-C) is no longer swallowed.
        continue

# NOTE(review): the 'Username' column is filled from the profile's 'name'
# field -- confirm that the label is intended
output = CSV(result, [
    'User ID', 'Username', 'Final Grade', 'Gender', 'Year of Birth',
    'Level of Education', 'Country', 'City'
], output_file='atoc185x_user_info.csv')
output.generate_csv()
Since we will need to sort a very large number of documents, you should create a separate collection to aggregate all required documents in one collection and then extract results from the new collection. Command to run on the mongo shell to create new collection: db.tracking_atoc185x.aggregate([{$match : {$and : [{"event_type" : "speed_change_video"},{ "parent_data": { $exists: true } }]}}, {$sort : {"parent_data.chapter_display_name" : 1, "parent_data.sequential_display_name" : 1, "parent_data.vertical_display_name" : 1}}, {$out : "speed_change_video_data"}], {allowDiskUse : true}) Usage: python speed_change_video.py ''' from base_edx import EdXConnection from generate_csv_report import CSV connection = EdXConnection('speed_change_video_data') collection = connection.get_access_to_collection() cursor = collection['speed_change_video_data'].find() result = [[ item['username'], item['parent_data']['chapter_display_name'], item['parent_data']['sequential_display_name'], item['parent_data']['vertical_display_name'], item['event']['old_speed'], item['event']['new_speed'] ] for item in cursor] output = CSV(result, [ 'Username', 'Chapter Name', 'Sequential Name', 'Vertical Name', 'Old Speed', 'New Speed' ], output_file='speed_change.csv') output.generate_csv()