db.tracking_atoc185x.aggregate([{$match : {event_type : 'problem_check', 'event_source': 'server'}}, {$group : {_id : {"username" : "$username", "problem_id" : "$event.problem_id"}, attempts : {$push : "$event.success"}}}, {$out : "user_attempts_per_problem_id"}]) Then run this script on the above collection Usage: python activities_with_lower_completion.py ''' from collections import defaultdict from common.base_edx import EdXConnection from common.generate_csv_report import CSV # Connect to MongoDB and extra the tracking collection connection = EdXConnection('user_attempts_per_problem_id') collection = connection.get_access_to_collection() cursor = collection['user_attempts_per_problem_id'].find() result = defaultdict(lambda: defaultdict(int)) for index,document in enumerate(cursor): # If there is a correct attempts, accept as answered correctly, else accept #as incorrect only once per student per problem id if 'correct' in document['attempts']: result[document['_id']['problem_id']]['correct'] += 1 else: result[document['_id']['problem_id']]['incorrect'] += 1 csv_result = [[item, result[item]['correct'], result[item]['incorrect']] for item in result] output = CSV(csv_result, ['Problem Id', 'Correct Count', 'Incorrect Count'], output_file='activities_with_lower_completion.csv') output.generate_csv()
'''
This module calculates the number of forum threads and posts for a given
course stored in the MongoDB database

Usage:

python <path_to_script>

Prints two space-separated integers on one line: the total number of
documents in the forum collection, then the number of those that are
CommentThread documents.
'''
from common.base_edx import EdXConnection

connection = EdXConnection('forum')
collection = connection.get_access_to_collection()
forum = collection['forum']

# Number of documents with _type CommentThread.
# A CommentThread represents the first level of interaction: a post that
# opens a new thread, often a student question of some sort
number_of_comment_threads = forum.find({'_type': 'CommentThread'}).count()

# Every document in the forum collection, whatever its _type
number_of_posts = forum.find().count()

# Single-argument call form: emits exactly the same "<posts> <threads>" line
# as the old `print a, b` statement on Python 2, and is also valid Python 3
print('{0} {1}'.format(number_of_posts, number_of_comment_threads))
'''
This module gets the number of completers who did each activity

The result is written by MongoDB into the collection named on the command
line: one document per (chapter, sequential, vertical) with the number of
distinct completers who generated at least one tracking event there.

Usage:

python <path_to_script> <output_collection>
'''
import csv
from datetime import datetime
from collections import defaultdict
import sys

from common.base_edx import EdXConnection

# Fail with a readable message instead of an IndexError when the name of the
# output collection is missing
if len(sys.argv) < 2:
    sys.stderr.write(
        'No output collection given as a command line argument.\n'
        'Usage: python <path_to_script> <output_collection>\n')
    sys.exit(1)
output_collection = sys.argv[1]

connection = EdXConnection('tracking')
collection = connection.get_access_to_collection()

with open('csv_files/McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv', 'r') as csv_file:
    reader = csv.reader(csv_file)
    # Skip the header row so the column title is not treated as a username
    next(reader, None)
    # The username is the third column; ignore blank or short rows
    usernames = [row[2] for row in reader if len(row) > 2]

cursor = collection['tracking'].aggregate([
    # Keep only the events generated by the completers
    {'$match': {'username': {'$in': usernames}}},
    # Collect the distinct usernames seen in each activity
    {'$group': {
        '_id': {
            "chapter_name": "$parent_data.chapter_display_name",
            "sequential_name": "$parent_data.sequential_display_name",
            "vertical_name": "$parent_data.vertical_display_name",
        },
        'students': {'$addToSet': "$username"},
    }},
    # Flatten the sets and count one per distinct student per activity
    {'$unwind': "$students"},
    # The key used to be ' num_of_students' (leading space), which produced
    # an awkward field name in the output collection
    {'$group': {'_id': "$_id", 'num_of_students': {'$sum': 1}}},
    {'$out': output_collection},
])
db.tracking_atoc185x.aggregate([{$match : {event_type : 'problem_check', 'event_source': 'server'}}, {$group : {_id : {"username" : "$username", "problem_id" : "$event.problem_id"}, attempts : {$push : "$event.success"}}}, {$out : "user_attempts_per_problem_id"}]) Then run this script on the above collection Usage: python activities_with_lower_completion.py ''' from collections import defaultdict from common.base_edx import EdXConnection from common.generate_csv_report import CSV # Connect to MongoDB and extract the tracking collection connection = EdXConnection('user_attempts_per_problem_id') collection = connection.get_access_to_collection() cursor = collection['user_attempts_per_problem_id'].find() result = defaultdict(lambda: defaultdict(int)) for index, document in enumerate(cursor): # If there is a correct attempt, accept as answered correctly, else accept # as incorrect only once per student per problem id if 'correct' in document['attempts']: result[document['_id']['problem_id']]['correct'] += 1 else: result[document['_id']['problem_id']]['incorrect'] += 1 csv_result = [[item, result[item]['correct'], result[item]['incorrect']] for item in result] output = CSV(csv_result, ['Problem Id', 'Correct Count', 'Incorrect Count'],
''' This module gets all the events per user while watching videos. Since we will need to sort a very large number of documents, you should create a separate collection to aggregate all required documents in one collection and then extract results from the new collection. Command to run on the mongo shell to create new collection: db.tracking_atoc185x.aggregate([{$match : {$and : [{"event_type" : "seek_video"},{ "parent_data": { $exists: true } }]}}, {$sort : {"parent_data.chapter_display_name" : 1, "parent_data.sequential_display_name" : 1, "parent_data.vertical_display_name" : 1}}, {$out : "seek_video"}], {allowDiskUse : true}) ''' from common.base_edx import EdXConnection from common.generate_csv_report import CSV connection = EdXConnection('seek_video') collection = connection.get_access_to_collection() sort_parameters = [('parent_data.chapter_display_name', 1), ('parent_data.sequential_display_name', 1), ('parent_data.vertical_display_name', 1)] cursor = collection['seek_video'].find() result = [] for index, item in enumerate(cursor): if 'old_time' in item['event']: old_time = item['event']['old_time'] else: old_time = 0 result.append([ item['username'], item['parent_data']['chapter_display_name'], item['parent_data']['sequential_display_name'], item['parent_data']['vertical_display_name'], old_time, item['event']['new_time']
if len(sys.argv) < 3: usage_message = """ No problem id given as a command line argument. Please provide a problem_id Usage: python -m problem_ids.get_csv_report_by_problem_id <db_name> <problem_id> [--final_attempts] """ sys.stderr.write(usage_message) sys.exit(1) db_name = sys.argv[1] problem_id = sys.argv[2] final_attempts = True if len(sys.argv) == 4 else False connection = EdXConnection(db_name, 'problem_ids') collection = connection.get_access_to_collection() def _generate_name_from_problem_id(problem_id, display_name): '''Generate name of csv output file from problem id''' attempts_name = '_FinalAttempts' if final_attempts else '_AllAttempts' return ('_'.join(problem_id.split('/')[3:]) + '_' + ''.join(e for e in display_name if e.isalnum()) + attempts_name + '.csv') cursor = collection['problem_ids'].find({'event.problem_id': problem_id}) display_name = cursor[0]['module']['display_name'] one_record = cursor[0]['event'] problem_ids_keys = sorted(one_record['correct_map'].keys(),
to their hash ids and return a new csv_report Usage: python username_to_hash_id_reports.py db_name csv_report ''' import sys import csv from common.base_edx import EdXConnection from common.generate_csv_report import CSV db_name = sys.argv[1] # Change name of collection as required connection = EdXConnection(db_name, 'user_id_map') collection = connection.get_access_to_collection() with open(sys.argv[2]) as f: headers = next(f) reader = csv.reader(f) data = [row for row in reader] result = [] for row in data: username = row[0] if username.isdigit(): username = int(username) cursor = collection['user_id_map'].find_one({'username': username}) if cursor: hash_id = cursor['hash_id']
from common.generate_csv_report import CSV # If you have access to the grade report provided by edX, you can use the following # 7 lines of code to get all usernames with grades between 50% and 59% inclusive #with open('csv_files/grades_report.csv') as f: # reader = csv.reader(f) # header = reader.next() # usernames = [row[2] for row in reader if '0.5' <= row[3] <= '0.59'] #connection = EdXConnection('tracking') #collection = connection.get_access_to_collection() #cursor = collection['tracking'].aggregate([{'$match' : {'username' : {'$in' : usernames}, '$or': [{'event_type' : 'play_video'},{'event_type' : 'problem_check', 'event_source' : 'server'}]}},{'$group' : { '_id' : { "username" : "$username", "chapter_name" : "$parent_data.chapter_display_name" ,"sequential_name" : "$parent_data.sequential_display_name","vertical_name" : "$parent_data.vertical_display_name"}}}, {'$out' : 'students_50_to_59_events'}]) # Otherwise, you can extract the names of the students with grades between 50% and 59% # inclusive from the collection certificates_generatedcertificate using the # following lines of code connection = EdXConnection('tracking_atoc185x', 'auth_user', 'certificates_generatedcertificate') collection = connection.get_access_to_collection() # Get all user ids of students with grades between 50% and 59% inclusive from # the collection certificates_generatedcertificate user_ids = { document['user_id'] for document in collection['certificates_generatedcertificate'].find( {'$and': [{ 'grade': { '$gte': 0.5 } }, { 'grade': { '$lte': 0.59 }
Usage: python navigation_tabs_data_date.py ''' import csv from datetime import datetime from collections import defaultdict import sys from common.base_edx import EdXConnection from common.generate_csv_report import CSV connection = EdXConnection('tracking_atoc185x') collection = connection.get_access_to_collection() # Get all users who completed the course. If you do not have a CSV with list # of users who had completed the course, you will have to extract it from the # MongoDB database with open('csv_files/McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv', 'r') as csv_file: reader = csv.reader(csv_file) reader.next() usernames = [row[2] for row in reader] NAVIGATION_TABS = {'/courses/McGillX/ATOC185x/2T2014/info' : 'info', '/courses/McGillX/ATOC185x/2T2014/progress' : 'progress', '/courses/McGillX/ATOC185x/2T2014/109d5374b52040e2a8b737cf90c5618a/' : 'syllabus', '/courses/McGillX/ATOC185x/2T2014/441b2c519f5c464883e2ddceb26c5559/' : 'maps','/courses/McGillX/ATOC185x/2T2014/84f630e833eb4dbabe0a6c45c52bb443/' : 'scoreboard' , '/courses/McGillX/ATOC185x/2T2014/e75195cb39fa4e3890a613a1b3c04c7d/' : 'faq', 'courseware' : 'courseware', 'discussion': 'discussion', '/courses/McGillX/ATOC185x/2T2014/instructor' : 'instructor'} cursor = collection['tracking_atoc185x'].find({'username' : {'$in' : usernames},'event_type' : { '$regex' : '^/courses/McGillX/ATOC185x/2T2014/(info$|progress$|instructor$|109d5374b52040e2a8b737cf90c5618a/$|441b2c519f5c464883e2ddceb26c5559/$|84f630e833eb4dbabe0a6c45c52bb443/$|e75195cb39fa4e3890a613a1b3c04c7d/$|courseware|discussion)'}}) tab_events_per_date = defaultdict(int)
Since we will need to sort a very large number of documents, you should create a separate collection to aggregate all required documents in one collection and then extract results from the new collection. Command to run on the mongo shell to create new collection: db.tracking_atoc185x.aggregate([{$match : {$and : [{"event_type" : "speed_change_video"},{ "parent_data": { $exists: true } }]}}, {$sort : {"parent_data.chapter_display_name" : 1, "parent_data.sequential_display_name" : 1, "parent_data.vertical_display_name" : 1}}, {$out : "speed_change_video_data"}], {allowDiskUse : true}) Usage: python speed_change_video.py ''' from common.base_edx import EdXConnection from common.generate_csv_report import CSV connection = EdXConnection('speed_change_video_data') collection = connection.get_access_to_collection() cursor = collection['speed_change_video_data'].find() result = [[ item['username'], item['parent_data']['chapter_display_name'], item['parent_data']['sequential_display_name'], item['parent_data']['vertical_display_name'], item['event']['old_speed'], item['event']['new_speed'] ] for item in cursor] output = CSV(result, [ 'Username', 'Chapter Name', 'Sequential Name', 'Vertical Name', 'Old Speed', 'New Speed' ], output_file='speed_change.csv') output.generate_csv()
db.tracking.aggregate([{$match:{$and:[{'event_source':'browser'},{$or:[{'event_type':'play_video'},{'event_type':'speed_change_video'},{'event_type':'seq_goto'}, {'event_type':'seq_next'}, {'event_type':'seq_prev'}, {'event_type':'page_close'}, {'event_type':'play_video'},{'event_type':'pause_video'}, {'event_type':'seek_video'}, {'event_type':'pause_video'}]}]}}, {$project:{"username":1, "event_type":1, 'time':1,"event":1}},{ $sort: {'username':1,'time':1}},{$out: "video_watching"}],{allowDiskUse:true}) or db.runCommand({aggregate:'tracking',pipeline:[{$match:{$and:[{'event_source':'browser'},{$or:[{'event_type':'play_video'},{'event_type':'speed_change_video'},{'event_type':'seq_goto'}, {'event_type':'seq_next'}, {'event_type':'seq_prev'}, {'event_type':'page_close'}, {'event_type':'play_video'},{'event_type':'pause_video'}, {'event_type':'seek_video'}, {'event_type':'pause_video'}]}]}}, {$project:{"username":1, "event_type":1, 'time':1,"event":1}},{ $sort: {'username':1,'time':1}},{$out: "video_watching"}],allowDiskUse:true}) ''' import sys import csv import time from datetime import datetime from common.base_edx import EdXConnection from common.generate_csv_report import CSV db_name = sys.argv[1] eventCollection = 'video_watching' connection = EdXConnection(db_name, eventCollection) collection = connection.get_access_to_collection() students = collection[eventCollection].distinct('username') print print students print len(students) watch_durations = [] start_event_time = {'blank'} end_event_time = {'blank'} errors = [] count_errors = 0 for student in students: cursor = collection[eventCollection].find({'username': student})
''' This module gets the date of registration of all users who completed the course Usage: python date_of_registration_completers.py ''' import csv from datetime import datetime from common.base_edx import EdXConnection from common.generate_csv_report import CSV connection = EdXConnection('student_courseenrollment') collection = connection.get_access_to_collection() # Can replace csv file with any csv file that contains the list of usernames # who completed the course and achieved a certificate. Alternatively, one can # save that info in another collection in MongoDB and extract it from the collection with open('atoc185x/course_completers.csv') as csv_file: reader = csv.reader(csv_file) reader.next() users = {row[0]: row[1] for row in reader} result = [] student_courseenrollment = collection['student_courseenrollment'].find() seen = set() for document in student_courseenrollment: if str(document['user_id']) in users and document['user_id'] not in seen:
''' This module retrieves the first activity of all users who completed a course. The list of users who have completed the course was provided in a given csv file, McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv. When these users are extracted, we then check the database for all of these users and detect their first activity. First activity is defined by the event_type '/courses/McGillX/CHEM181x/1T2014/info' ''' import csv from datetime import datetime from collections import defaultdict from common.base_edx import EdXConnection # Connect to MongoDB and extract the tracking collection connection = EdXConnection('tracking', 'tracking_before_jan22') collection = connection.get_access_to_collection() # Retrieve users who have completed the course. This could be done in any way depending on what is provided with open('csv_files/McGillX_CHEM181x_1T2014_grade_report_2014-04-24-1030.csv', 'r') as csv_file: reader = csv.reader(csv_file) #usernames = [row[2] for row in reader] usernames = [row[2] for row in reader] # Retrieve the time of the first activity of all users who completed the course time_events = defaultdict(list) cursor = collection['tracking'].find( {'event_type': '/courses/McGillX/CHEM181x/1T2014/info'}) #cursor_before_jan_22 = collection['tracking_before_jan22'].find({'event_type' : '/courses/McGillX/CHEM181x/1T2014/info'}) with open('csv_files/first_activity_completers.csv', 'w') as csv_file:
''' This module gets the number of navigation events for each user while they are taking Test 1 ''' from collections import defaultdict import time from datetime import datetime from common.base_edx import EdXConnection from common.generate_csv_report import CSV connection = EdXConnection('format_tests', 'tracking') collection = connection.get_access_to_collection() cursor = collection['format_tests'].find( {'parent_data.chapter_display_name': 'Test 1'}) users_sessions = defaultdict(list) for index, item in enumerate(cursor): #print index, item['parent_data']['chapter_display_name'] users_sessions[(item['username'], item['session'])].append(item['time']) users_tests_events = defaultdict(int) for (username, session), times in users_sessions.iteritems(): end_time = datetime.strptime( max(times).split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") start_time = datetime.strptime( min(times).split('+')[0], "%Y-%m-%dT%H:%M:%S.%f") cursor = collection['tracking'].find({ 'username': username, 'session': session,
Usage: python -m problem_ids.create_problem_ids_collection.py <db_name> ''' from common.base_edx import EdXConnection import sys db_name = sys.argv[1] # The second argument in line 27 is the name of the new collection which will # contain the results of this script. Each new document will be inserted into # this new collection. The name of the resulting collection could be anything; # preferably relevant to the course connection = EdXConnection(db_name, 'test1', 'tracking', 'user_id_map', 'problem_ids') collection = connection.get_access_to_collection() # Drop problem_ids collection if exists collection['problem_ids'].drop() cursor = collection['tracking'].find({'event_type' : 'problem_check', 'event_source' : 'server'}) for document in cursor: doc_result = {} username = document['username'] if username.isdigit(): username = int(username) doc_result['username'] = username user_id_map = collection['user_id_map'].find_one({'username' : username}) if not user_id_map: print "Username {0} not found in collection user_id_map".format(username)