def quiz_sessions(metadata_path, log_path, cursor):
    """Reconstruct per-learner quiz sessions from daily edX event-log files
    and insert the resulting sessions into the quiz_sessions table.

    A quiz session groups consecutive quiz-related events of one learner;
    a gap of more than 30 minutes (or a non-quiz event) closes the session.

    NOTE(review): a second quiz_sessions() definition exists later in this
    file, and stray ''' markers around this region suggest one of the two
    versions is commented out — confirm which definition is actually live.

    metadata_path -- course metadata location, parsed by ExtractCourseInformation
    log_path      -- directory containing one event-log file per day
    cursor        -- DB cursor used for the parameterized INSERT statements
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    # Processing events data: event types that count as quiz/submission activity
    submission_event_collection = []
    # Problem check
    submission_event_collection.append("problem_check")
    # Server
    submission_event_collection.append("save_problem_check")
    submission_event_collection.append("problem_check_fail")
    submission_event_collection.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects Check
    # for a problem and it is graded successfully.
    submission_event_collection.append("problem_graded")
    # The server emits problem_rescore events when a problem is successfully rescored.
    submission_event_collection.append("problem_rescore")
    submission_event_collection.append("problem_rescore_fail")
    submission_event_collection.append("problem_reset")
    # event_source: server
    submission_event_collection.append("reset_problem")
    submission_event_collection.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    submission_event_collection.append("problem_save")
    # event_source: server
    submission_event_collection.append("save_problem_fail")
    submission_event_collection.append("save_problem_success")
    # Show answer
    submission_event_collection.append("problem_show")
    submission_event_collection.append("showanswer")
    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])
    log_files = os.listdir(log_path)
    child_parent_map = course_metadata_map["child_parent_map"]
    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    # NOTE(review): this local dict shadows the enclosing function name.
    quiz_sessions = {}
    # Walk the course day by day, matching each date against the log file names.
    while True:
        if current_date == end_next_date:
            break;
        for file in log_files:
            if current_date in file:
                print file
                # Carry over the per-learner events left unfinished on the
                # previous day so sessions can span day boundaries.
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                input_file = open(log_path + file,"r")
                lines = input_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])
                    if global_learner_id != "":
                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(global_learner_id)
                        # Keep only "YYYY-MM-DDTHH:MM:SS" (first 19 chars) and
                        # parse as a naive datetime.
                        event_time = jsonObject["time"]
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                        if learner_all_event_logs.has_key(course_learner_id):
                            learner_all_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type})
                        else:
                            learner_all_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type}]
                # For quiz session separation
                for course_learner_id in learner_all_event_logs.keys():
                    event_logs = learner_all_event_logs[course_learner_id]
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    session_id = ""
                    start_time = ""
                    end_time = ""
                    final_time = ""
                    for i in range(len(event_logs)):
                        if session_id == "":
                            # Not inside a session: a quiz-related event may open one.
                            if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in submission_event_collection:
                                event_type_array = event_logs[i]["event_type"].split("/")
                                # NOTE(review): question_id is only (re)assigned for the two
                                # URL-style event types below; for a bare submission event it
                                # retains its value from a previous iteration (or is unbound
                                # on the very first one) — confirm this is intended.
                                if "problem+block" in event_logs[i]["event_type"]:
                                    question_id = event_type_array[4]
                                if "_problem;_" in event_logs[i]["event_type"]:
                                    question_id = event_type_array[6].replace(";_", "/")
                                if question_id in child_parent_map.keys():
                                    parent_block_id = child_parent_map[question_id]
                                    session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]
                        else:
                            # Inside a session.
                            if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in submission_event_collection:
                                if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                                    # Gap above 30 minutes: close the current session
                                    # and try to open a new one from this event.
                                    if quiz_sessions.has_key(session_id):
                                        quiz_sessions[session_id]["time_array"].append({"start_time":start_time, "end_time":end_time})
                                    else:
                                        quiz_sessions[session_id] = {"course_learner_id":course_learner_id, "time_array":[{"start_time":start_time, "end_time":end_time}]}
                                    final_time = event_logs[i]["event_time"]
                                    if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in submission_event_collection:
                                        event_type_array = event_logs[i]["event_type"].split("/")
                                        if "problem+block" in event_logs[i]["event_type"]:
                                            question_id = event_type_array[4]
                                        if "_problem;_" in event_logs[i]["event_type"]:
                                            question_id = event_type_array[6].replace(";_", "/")
                                        if question_id in child_parent_map.keys():
                                            parent_block_id = child_parent_map[question_id]
                                            session_id = "quiz_session_" + parent_block_id + "_" +course_learner_id
                                            start_time = event_logs[i]["event_time"]
                                            end_time = event_logs[i]["event_time"]
                                        else:
                                            session_id = ""
                                            start_time = ""
                                            end_time = ""
                                else:
                                    # Quiz event within the 30-minute window: extend the session.
                                    end_time = event_logs[i]["event_time"]
                            else:
                                # Non-quiz event terminates the session.
                                if event_logs[i]["event_time"] <= end_time + datetime.timedelta(hours=0.5):
                                    end_time = event_logs[i]["event_time"]
                                if quiz_sessions.has_key(session_id):
                                    quiz_sessions[session_id]["time_array"].append({"start_time":start_time, "end_time":end_time})
                                else:
                                    quiz_sessions[session_id] = {"course_learner_id":course_learner_id, "time_array":[{"start_time":start_time, "end_time":end_time}]}
                                final_time = event_logs[i]["event_time"]
                                session_id = ""
                                start_time = ""
                                end_time = ""
                    if final_time != "":
                        # Keep events at/after the last closed session for tomorrow's pass.
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)
                        updated_learner_all_event_logs[course_learner_id] = new_logs
        current_date = getNextDay(current_date)
    # To compress the session event_logs: merge adjacent intervals whose gap
    # is at most 30 minutes.
    for session_id in quiz_sessions.keys():
        if len(quiz_sessions[session_id]["time_array"]) > 1:
            start_time = ""
            end_time = ""
            updated_time_array = []
            for i in range(len(quiz_sessions[session_id]["time_array"])):
                if i == 0:
                    start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
                    end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                else:
                    if quiz_sessions[session_id]["time_array"][i]["start_time"] > end_time + datetime.timedelta(hours=0.5):
                        updated_time_array.append({"start_time":start_time, "end_time":end_time})
                        start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
                        end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                        if i == len(quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({"start_time":start_time, "end_time":end_time})
                    else:
                        end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                        if i == len(quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({"start_time":start_time, "end_time":end_time})
            quiz_sessions[session_id]["time_array"] = updated_time_array
    # Build flat records; sessions of 5 seconds or less are discarded.
    quiz_session_record = []
    for session_id in quiz_sessions.keys():
        course_learner_id = quiz_sessions[session_id]["course_learner_id"]
        for i in range(len(quiz_sessions[session_id]["time_array"])):
            start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
            end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
            if start_time < end_time:
                # Duration in whole seconds.
                duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                final_session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                if duration > 5:
                    array = [final_session_id, course_learner_id, start_time, end_time, duration]
                    quiz_session_record.append(array)
    # Database version
    for array in quiz_session_record:
        session_id = array[0]
        course_learner_id = array[1]
        start_time = array[2]
        end_time = array[3]
        duration = process_null(array[4])
        sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, start_time, end_time, duration)
        cursor.execute(sql, data)
# NOTE(review): unbalanced triple-quote below — it appears to open/close a
# block comment whose pair is outside this region; confirm which code is live.
'''
def sessions(metadata_path, daily_log_path, remaining_session_log_path, cursor):
    """Incrementally separate general (site-wide) learner sessions from a single
    daily log file and upsert them into the sessions table.

    State spanning days is persisted in remaining_session_log_path as JSON
    (serialized with bson.json_util): events after the last closed session are
    written out, then re-read on the next day's invocation.

    metadata_path              -- course metadata, parsed by ExtractCourseInformation
    daily_log_path             -- path of ONE daily log file (its name contains the date)
    remaining_session_log_path -- JSON spill file carrying unfinished sessions between runs
    cursor                     -- DB cursor for the REPLACE statements
    """
    utc = pytz.UTC
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    learner_logs = {}
    remaining_learner_logs = {}
    # Read remaining event logs (state left over from the previous day, if any)
    if os.path.exists(remaining_session_log_path):
        remaining_input_file = open(remaining_session_log_path)
        learner_logs = json.loads(remaining_input_file.read(), object_hook=json_util.object_hook)
    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)
    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)
        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"][
                "user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue
        # For session separation
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = str(jsonObject["event_type"])
        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)
        event_time = jsonObject["time"]
        # Check whether the event record belongs to that day: the file name is
        # expected to contain the date, so mismatched records are dropped.
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue
        # Keep "YYYY-MM-DDTHH:MM:SS", parse, then mark as UTC-aware.
        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)
        if course_learner_id in course_learner_id_set:
            learner_logs[course_learner_id].append({
                "event_time": event_time,
                "event_type": event_type
            })
        else:
            learner_logs[course_learner_id] = [{
                "event_time": event_time,
                "event_type": event_type
            }]
            course_learner_id_set.add(course_learner_id)
    input_file.close()
    # For session separation
    for learner in learner_logs.keys():
        course_learner_id = learner
        event_logs = learner_logs[learner]
        # Sorting
        event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
        session_id = ""
        start_time = ""
        end_time = ""
        final_time = ""
        for i in range(len(event_logs)):
            if start_time == "":
                # Initialization
                start_time = event_logs[i]["event_time"]
                end_time = event_logs[i]["event_time"]
            else:
                if event_logs[i]["event_time"] > end_time + datetime.timedelta(
                        hours=0.5):
                    # Over 30 minutes idle: close the session (if longer than
                    # 5 seconds) and start a new one from the current event.
                    session_id = course_learner_id + "_" + str(
                        start_time) + "_" + str(end_time)
                    duration = (end_time - start_time).days * 24 * 60 * 60 + (
                        end_time - start_time).seconds
                    if duration > 5:
                        array = (session_id, course_learner_id, start_time,
                                 end_time, process_null(duration))
                        sql = "replace into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                        # NOTE(review): DB errors are silently swallowed here.
                        try:
                            cursor.execute(sql, array)
                        except Exception as e:
                            pass
                    final_time = event_logs[i]["event_time"]
                    # Re-initialization
                    session_id = ""
                    start_time = event_logs[i]["event_time"]
                    end_time = event_logs[i]["event_time"]
                else:
                    if event_logs[i]["event_type"] == "page_close":
                        # An explicit page_close ends the session immediately.
                        end_time = event_logs[i]["event_time"]
                        session_id = course_learner_id + "_" + str(
                            start_time) + "_" + str(end_time)
                        duration = (end_time - start_time).days * 24 * 60 * 60 + (
                            end_time - start_time).seconds
                        if duration > 5:
                            array = (session_id, course_learner_id, start_time,
                                     end_time, process_null(duration))
                            sql = "replace into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                            # NOTE(review): DB errors are silently swallowed here.
                            try:
                                cursor.execute(sql, array)
                            except Exception as e:
                                pass
                        # Re-initialization
                        session_id = ""
                        start_time = ""
                        end_time = ""
                        final_time = event_logs[i]["event_time"]
                    else:
                        end_time = event_logs[i]["event_time"]
        if final_time != "":
            # Events strictly after the last closed session roll over to the
            # next day's run.
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)
            remaining_learner_logs[course_learner_id] = new_logs
    # Output remaining logs; on the course's final day, delete the spill file
    # instead of rewriting it.
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_session_log_path, "w")
        output_file.write(
            json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_session_log_path)
def quiz_mode(metadata_path, log_path, cursor):
    """Populate the quiz_questions, submissions and assessments tables.

    First derives each quiz question's type/weight/due date from the course
    metadata, then scans the daily event logs for submission events and
    records one submissions row per event (plus an assessments row when the
    event carries grade information).

    metadata_path -- course metadata, parsed by ExtractCourseInformation
    log_path      -- directory containing one event-log file per day
    cursor        -- DB cursor used for the parameterized INSERT statements
    """
    # quiz_question_record = []
    # submissions = {}
    # assessments = {}
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    for question_id in quiz_question_map:
        question_due = ""
        question_weight = quiz_question_map[question_id]
        quiz_question_parent = course_metadata_map["child_parent_map"][question_id]
        if (question_due == "") and (quiz_question_parent in element_time_map_due):
            question_due = element_time_map_due[quiz_question_parent]
        # Climb the child->parent chain until an ancestor with a known block
        # type is found, picking up the first due date seen on the way.
        while not block_type_map.has_key(quiz_question_parent):
            quiz_question_parent = course_metadata_map["child_parent_map"][quiz_question_parent]
            if (question_due == "") and (quiz_question_parent in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]
        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)
        # array_quiz = [question_id, quiz_question_type, question_weight, question_due]
        # quiz_question_record.append(array_quiz)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        data = (question_id, quiz_question_type, question_weight, question_due)
        cursor.execute(sql, data)
    # Processing events data.
    # NOTE(review): only "problem_check" is active — every other submission
    # event type is disabled by the triple-quoted block below; confirm this
    # is intentional.
    submission_event_collection = []
    # Problem check
    submission_event_collection.append("problem_check")
    # Server
    '''
    submission_event_collection.append("save_problem_check")
    submission_event_collection.append("problem_check_fail")
    submission_event_collection.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects Check
    # for a problem and it is graded successfully.
    submission_event_collection.append("problem_graded")
    # The server emits problem_rescore events when a problem is successfully rescored.
    submission_event_collection.append("problem_rescore")
    submission_event_collection.append("problem_rescore_fail")
    submission_event_collection.append("problem_reset")
    # event_source: server
    submission_event_collection.append("reset_problem")
    submission_event_collection.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    submission_event_collection.append("problem_save")
    # event_source: server
    submission_event_collection.append("save_problem_fail")
    submission_event_collection.append("save_problem_success")
    # Show answer
    submission_event_collection.append("problem_show")
    submission_event_collection.append("showanswer")
    '''
    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])
    log_files = os.listdir(log_path)
    # Monotonic counter making submission_id unique across repeated submissions
    # of the same question by the same learner.
    submission_uni_index = 0
    while True:
        if current_date == end_next_date:
            break;
        for file in log_files:
            if current_date in file:
                print file
                input_file = open(log_path + file,"r")
                lines = input_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    if jsonObject["event_type"] in submission_event_collection:
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        global_learner_id = jsonObject["context"]["user_id"]
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)
                            question_id = ""
                            grade = ""
                            max_grade = ""
                            # Keep "YYYY-MM-DDTHH:MM:SS" and parse as naive datetime.
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                            if isinstance(jsonObject["event"], dict):
                                question_id = jsonObject["event"]["problem_id"]
                                # The fields "grade" and "max_grade" are specific to submission event "problem_check"
                                if jsonObject["event"].has_key("grade") and jsonObject["event"].has_key("max_grade"):
                                    grade = jsonObject["event"]["grade"]
                                    max_grade = jsonObject["event"]["max_grade"]
                            if question_id != "":
                                submission_id = course_learner_id + "_" + question_id + "_" + str(submission_uni_index)
                                submission_uni_index = submission_uni_index + 1
                                # For submissions
                                # array_submission = [submission_id, course_learner_id, question_id, event_time]
                                # submissions[submission_id] = array_submission
                                submission_timestamp = event_time
                                sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values (%s,%s,%s,%s)"
                                data = (submission_id, course_learner_id, question_id, submission_timestamp)
                                cursor.execute(sql, data)
                                # For assessments
                                if grade != "" and max_grade != "":
                                    # array_assessment = [submission_id, course_learner_id, max_grade, grade]
                                    # assessments[submission_id] = array_assessment
                                    assessment_id = submission_id
                                    sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values (%s,%s,%s,%s)"
                                    data = (assessment_id, course_learner_id, max_grade, grade)
                                    cursor.execute(sql, data)
        current_date = getNextDay(current_date)
    # submission_record = []
    # assessment_record = []
    # for submission_id in submissions.keys():
    #     submission_record.append(submissions[submission_id])
    # for assessment_id in assessments.keys():
    #     assessment_record.append(assessments[assessment_id])
    # Database version
    # Quiz_question table
    # for array in quiz_question_record:
    #     question_id = array[0]
    #     question_type = array[1]
    #     question_weight = array[2]
    #     question_due = array[3]
    #     sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values"
    #     sql += "('%s','%s','%s','%s');" % (question_id, question_type, question_weight, question_due)
    #     cursor.execute(sql)
    # Submissions table
    # for array in submission_record:
    #     submission_id = array[0]
    #     course_learner_id = array[1]
    #     question_id = array[2]
    #     submission_timestamp = array[3]
    #     sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values"
    #     sql += "('%s','%s','%s','%s');" % (submission_id, course_learner_id, question_id, submission_timestamp)
    #     cursor.execute(sql)
    # Submissions table
    # for array in assessment_record:
    #     assessment_id = array[0]
    #     course_learner_id = array[1]
    #     max_grade = array[2]
    #     grade = array[3]
    #     sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values"
    #     sql += "('%s','%s','%s','%s');" % (assessment_id, course_learner_id, max_grade, grade)
    #     cursor.execute(sql)
# NOTE(review): unbalanced triple-quote below — it appears to open/close a
# block comment whose pair is outside this region; confirm which code is live.
'''
def quiz_sessions(metadata_path, daily_log_path, remaining_forum_session_log_path, cursor):
    """Incrementally separate per-learner quiz sessions from a single daily
    log file and insert them into the quiz_sessions table.

    Cross-day state is persisted as JSON (bson.json_util) in the "remaining"
    spill file, which is re-read at the start of the next day's run.

    NOTE(review): the parameter name says "forum" but the file is used for
    quiz-session state — presumably a copy-paste from a forum_sessions
    routine; confirm the caller passes a quiz-specific path.

    metadata_path                    -- course metadata, parsed by ExtractCourseInformation
    daily_log_path                   -- path of ONE daily log file (name contains the date)
    remaining_forum_session_log_path -- JSON spill file carrying unfinished sessions between runs
    cursor                           -- DB cursor for the INSERT statements
    """
    utc = pytz.UTC
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    # Quiz-related events
    quiz_event_types = []
    # Problem check
    quiz_event_types.append("problem_check")
    # Server
    quiz_event_types.append("save_problem_check")
    quiz_event_types.append("problem_check_fail")
    quiz_event_types.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects Check
    # for a problem and it is graded successfully.
    quiz_event_types.append("problem_graded")
    # The server emits problem_rescore events when a problem is successfully rescored.
    quiz_event_types.append("problem_rescore")
    quiz_event_types.append("problem_rescore_fail")
    quiz_event_types.append("problem_reset")
    # event_source: server
    quiz_event_types.append("reset_problem")
    quiz_event_types.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    quiz_event_types.append("problem_save")
    # event_source: server
    quiz_event_types.append("save_problem_fail")
    quiz_event_types.append("save_problem_success")
    # Show answer
    quiz_event_types.append("problem_show")
    quiz_event_types.append("showanswer")
    quiz_event_types.append("edx.problem.hint.demandhint_displayed")
    quiz_event_types.append("edx.problem.hint.feedback_displayed")
    child_parent_map = course_metadata_map["child_parent_map"]
    learner_logs = {}
    remaining_learner_logs = {}
    # NOTE(review): this local dict shadows the enclosing function name.
    quiz_sessions = {}
    # Read remaining event logs (state left over from the previous day, if any)
    if os.path.exists(remaining_forum_session_log_path):
        remaining_input_file = open(remaining_forum_session_log_path)
        learner_logs = json.loads(remaining_input_file.read(), object_hook=json_util.object_hook)
    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)
    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)
        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"]["user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue
        # For quiz session separation
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = str(jsonObject["event_type"])
        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)
        event_time = jsonObject["time"]
        # Check whether the event record belongs to that day: the file name is
        # expected to contain the date, so mismatched records are dropped.
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue
        # Keep "YYYY-MM-DDTHH:MM:SS", parse, then mark as UTC-aware.
        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)
        if learner_logs.has_key(course_learner_id):
            learner_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type})
        else:
            learner_logs[course_learner_id] = [{"event_time":event_time,
                "event_type":event_type}]
    input_file.close()
    # For quiz session separation
    for learner in learner_logs.keys():
        course_learner_id = learner
        event_logs = learner_logs[learner]
        # Sorting
        event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
        session_id = ""
        start_time = ""
        end_time = ""
        final_time = ""
        for i in range(len(event_logs)):
            if session_id == "":
                # Not inside a session: only URL-style problem events can open one.
                if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"]:
                    event_type_array = event_logs[i]["event_type"].split("/")
                    if "problem+block" in event_logs[i]["event_type"]:
                        question_id = event_type_array[4]
                    if "_problem;_" in event_logs[i]["event_type"]:
                        question_id = event_type_array[6].replace(";_", "/")
                    if question_id in child_parent_map.keys():
                        parent_block_id = child_parent_map[question_id]
                        session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                        start_time = event_logs[i]["event_time"]
                        end_time = event_logs[i]["event_time"]
            else:
                # Inside a session.
                if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in quiz_event_types:
                    if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                        # Gap above 30 minutes: close the current session and
                        # try to open a new one from this event.
                        if quiz_sessions.has_key(session_id):
                            quiz_sessions[session_id]["time_array"].append({"start_time":start_time, "end_time":end_time})
                        else:
                            quiz_sessions[session_id] = {"course_learner_id":course_learner_id, "time_array":[{"start_time":start_time, "end_time":end_time}]}
                        final_time = event_logs[i]["event_time"]
                        if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in quiz_event_types:
                            event_type_array = event_logs[i]["event_type"].split("/")
                            # NOTE(review): question_id is only (re)assigned for the two
                            # URL-style event types; for a bare quiz event it retains its
                            # value from a previous iteration — confirm this is intended.
                            if "problem+block" in event_logs[i]["event_type"]:
                                question_id = event_type_array[4]
                            if "_problem;_" in event_logs[i]["event_type"]:
                                question_id = event_type_array[6].replace(";_", "/")
                            if question_id in child_parent_map.keys():
                                parent_block_id = child_parent_map[question_id]
                                session_id = "quiz_session_" + parent_block_id + "_" +course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                            else:
                                session_id = ""
                                start_time = ""
                                end_time = ""
                    else:
                        # Quiz event within the 30-minute window: extend the session.
                        end_time = event_logs[i]["event_time"]
                else:
                    # Non-quiz event terminates the session.
                    if event_logs[i]["event_time"] <= end_time + datetime.timedelta(hours=0.5):
                        end_time = event_logs[i]["event_time"]
                    if quiz_sessions.has_key(session_id):
                        quiz_sessions[session_id]["time_array"].append({"start_time":start_time, "end_time":end_time})
                    else:
                        quiz_sessions[session_id] = {"course_learner_id":course_learner_id, "time_array":[{"start_time":start_time, "end_time":end_time}]}
                    final_time = event_logs[i]["event_time"]
                    session_id = ""
                    start_time = ""
                    end_time = ""
        if final_time != "":
            # Events strictly after the last closed session roll over to the
            # next day's run.
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)
            remaining_learner_logs[course_learner_id] = new_logs
    # Output remaining logs; on the course's final day, delete the spill file
    # instead of rewriting it.
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_forum_session_log_path, "w")
        output_file.write(json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_forum_session_log_path)
    # To compress the session event_logs: merge adjacent intervals whose gap
    # is at most 30 minutes.
    for session_id in quiz_sessions.keys():
        if len(quiz_sessions[session_id]["time_array"]) > 1:
            start_time = ""
            end_time = ""
            updated_time_array = []
            for i in range(len(quiz_sessions[session_id]["time_array"])):
                if i == 0:
                    start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
                    end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                else:
                    if quiz_sessions[session_id]["time_array"][i]["start_time"] > end_time + datetime.timedelta(hours=0.5):
                        updated_time_array.append({"start_time":start_time, "end_time":end_time})
                        start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
                        end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                        if i == len(quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({"start_time":start_time,
                                "end_time":end_time})
                    else:
                        end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                        if i == len(quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({"start_time":start_time, "end_time":end_time})
            quiz_sessions[session_id]["time_array"] = updated_time_array
    # Write out sessions longer than 5 seconds; DB errors are silently ignored.
    for session_id in quiz_sessions.keys():
        course_learner_id = quiz_sessions[session_id]["course_learner_id"]
        for i in range(len(quiz_sessions[session_id]["time_array"])):
            start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
            end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
            if start_time < end_time:
                # Duration in whole seconds.
                duration = process_null((end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds)
                final_session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                if duration > 5:
                    array = (final_session_id, course_learner_id, start_time, end_time, duration)
                    sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                    # NOTE(review): DB errors are silently swallowed here.
                    try:
                        cursor.execute(sql, array)
                    except Exception as e:
                        pass
def learner_mode(metadata_path, course_code, cursor):
    """Load course structure and learner data from the metadata dumps into the
    courses, course_elements, quiz_questions, learner_index, course_learner,
    and learner_demographic tables.

    Input files are tab-separated edX database dumps found under
    metadata_path (student_courseenrollment, auth_user-, 
    certificates_generatedcertificate, auth_userprofile); the column indices
    used below assume the standard dump layout — TODO confirm against the
    actual export format.

    metadata_path -- directory with course metadata and TSV dump files
    course_code   -- course run code; used to decide whether to also run
                     forum_interaction_mongo for pre-2T2015 courses
    cursor        -- DB cursor used for the parameterized INSERT statements
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    array = (course_metadata_map["course_id"],
             course_metadata_map["course_name"],
             course_metadata_map["start_time"],
             course_metadata_map["end_time"])
    sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
    cursor.execute(sql, array)
    # Course_element table
    for element_id in course_metadata_map["element_time_map"].keys():
        element_start_time = course_metadata_map["element_time_map"][
            element_id]
        # Some contents released just one hour earlier than the hour of start time.
        # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00.
        # However, those 2nd week contents are count as 1st week.
        # In order to avoid above situation, I use date to replace datetime here.
        week = process_null(
            getDayDiff(course_metadata_map["start_time"].date(),
                       element_start_time.date()) / 7 + 1)
        array = (element_id, course_metadata_map["element_type_map"][element_id],
                 week, course_metadata_map["course_id"])
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)
    # Quiz_question table
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    for question_id in quiz_question_map:
        question_due = ""
        question_weight = quiz_question_map[question_id]
        quiz_question_parent = course_metadata_map["child_parent_map"][
            question_id]
        if (question_due == "") and (quiz_question_parent in element_time_map_due):
            question_due = element_time_map_due[quiz_question_parent]
        # Climb the child->parent chain until an ancestor with a known block
        # type is found, picking up the first due date seen on the way.
        while not block_type_map.has_key(quiz_question_parent):
            quiz_question_parent = course_metadata_map["child_parent_map"][
                quiz_question_parent]
            if (question_due == "") and (quiz_question_parent in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]
        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)
        array = (question_id, quiz_question_type, question_weight, question_due)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)
    files = os.listdir(metadata_path)
    # Learner_demographic table
    learner_mail_map = {}
    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}
    # Enrolled learners set
    enrolled_learner_set = set()
    course_id = ""
    # Processing student_courseenrollment data
    for file in files:
        if "student_courseenrollment" in file:
            input_file = open(str(metadata_path + file), "r")
            # Skip the header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                course_id = record[2]
                time = datetime.datetime.strptime(record[3], "%Y-%m-%d %H:%M:%S")
                course_learner_id = course_id + "_" + global_learner_id
                # Only learners who enrolled before the course end count as enrolled.
                if cmp_datetime(course_metadata_map["end_time"], time):
                    enrolled_learner_set.add(global_learner_id)
                array = (global_learner_id, course_id, course_learner_id)
                sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
                cursor.execute(sql, array)
                course_learner_map[global_learner_id] = course_learner_id
                learner_enrollment_time_map[global_learner_id] = time
            input_file.close()
    # Processing auth_user data (maps learner id -> email)
    for file in files:
        if "auth_user-" in file:
            input_file = open(str(metadata_path + file), "r")
            # Skip the header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                if record[0] in enrolled_learner_set:
                    learner_mail_map[record[0]] = record[4]
            input_file.close()
    # Processing certificates_generatedcertificate data
    # NOTE(review): these counters are only tallied, never used afterwards.
    num_uncertifiedLearners = 0
    num_certifiedLearners = 0
    for file in files:
        if "certificates_generatedcertificate" in file:
            input_file = open(str(metadata_path + file), "r")
            # Skip the header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                final_grade = process_null(record[3])
                enrollment_mode = record[14].replace("\n", "")
                certificate_status = record[7]
                register_time = ""
                if course_learner_map.has_key(global_learner_id):
                    register_time = learner_enrollment_time_map[
                        global_learner_id]
                register_time = process_null(register_time)
                if course_learner_map.has_key(global_learner_id):
                    num_certifiedLearners += 1
                    array = (course_learner_map[global_learner_id], final_grade,
                             enrollment_mode, certificate_status, register_time)
                    sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
                    cursor.execute(sql, array)
                else:
                    num_uncertifiedLearners += 1
            input_file.close()
    # Processing auth_userprofile data
    for file in files:
        if "auth_userprofile" in file:
            input_file = open(str(metadata_path + file), "r")
            # Skip the header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                gender = record[7]
                # NOTE(review): process_null is applied twice here — looks
                # redundant; confirm whether one call suffices.
                year_of_birth = process_null(process_null(record[9]))
                level_of_education = record[10]
                country = record[13]
                # course_id retains the last value seen while reading
                # student_courseenrollment above.
                course_learner_id = process_null(course_id + "_" + global_learner_id)
                if global_learner_id in enrolled_learner_set:
                    array = (course_learner_id, gender, year_of_birth,
                             level_of_education, country,
                             learner_mail_map[global_learner_id])
                    sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
                    cursor.execute(sql, array)
            input_file.close()
    # Generating forum_interaction records for courses starting before 1T2015
    if "1T2015" in course_code or "2014" in course_code or "2013" in course_code:
        forum_interaction_mongo(metadata_path, cursor)
def video_interaction(metadata_path, log_path, cursor):
    """Reconstruct per-learner video watching sessions from the daily edX
    tracking logs and insert one row per session into the
    video_interaction table.

    metadata_path -- directory holding the course metadata dumps
    log_path      -- directory holding one tracking-log file per day
    cursor        -- DB-API cursor used for the inserts
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    current_date = course_metadata_map["start_date"]
    # Process up to (but not including) the day after the course end date.
    end_next_date = getNextDay(course_metadata_map["end_date"])
    # interaction_id -> session attributes, accumulated over all daily logs.
    video_interaction_map = {}
    # Video-related event types
    video_event_types = []
    video_event_types.append("hide_transcript")
    video_event_types.append("edx.video.transcript.hidden")
    video_event_types.append("edx.video.closed_captions.hidden")
    video_event_types.append("edx.video.closed_captions.shown")
    video_event_types.append("load_video")
    video_event_types.append("edx.video.loaded")
    video_event_types.append("pause_video")
    video_event_types.append("edx.video.paused")
    video_event_types.append("play_video")
    video_event_types.append("edx.video.played")
    video_event_types.append("seek_video")
    video_event_types.append("edx.video.position.changed")
    video_event_types.append("show_transcript")
    video_event_types.append("edx.video.transcript.shown")
    video_event_types.append("speed_change_video")
    video_event_types.append("stop_video")
    video_event_types.append("edx.video.stopped")
    video_event_types.append("video_hide_cc_menu")
    video_event_types.append("edx.video.language_menu.hidden")
    video_event_types.append("video_show_cc_menu")
    video_event_types.append("edx.video.language_menu.shown")
    '''
    # Navigation-related event types
    navigation_event_types = []
    navigation_event_types.append("page_close")
    navigation_event_types.append("seq_goto")
    navigation_event_types.append("seq_next")
    navigation_event_types.append("seq_prev")
    '''
    # Events for the current day's pass; events that fall after the last
    # completed session are carried over to the next day's pass.
    learner_video_event_logs = {}
    updated_learner_video_event_logs = {}
    log_files = os.listdir(log_path)
    while True:
        if current_date == end_next_date:
            break;
        for file in log_files:
            # Daily log file names contain the date they cover.
            if current_date in file:
                print file
                # Start from the events carried over from the previous day.
                learner_video_event_logs.clear()
                learner_video_event_logs = updated_learner_video_event_logs.copy()
                updated_learner_video_event_logs.clear()
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_video_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                # NOTE(review): input_file is never closed in this function —
                # file-handle leak; compare forum_sessions, which closes its handle.
                input_file = open(log_path + file,"r")
                lines = input_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    if jsonObject["event_type"] in video_event_types:
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        global_learner_id = jsonObject["context"]["user_id"]
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)
                            video_id = ""
                            # Timestamps look like "2015-10-15T09:00:00+00:00";
                            # keep only the first 19 chars (drops the timezone).
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                            event_type = jsonObject["event_type"]
                            # For seek event
                            new_time = 0
                            old_time = 0
                            # For speed change event
                            new_speed = 0
                            old_speed = 0
                            # This sub-condition does not exist in log data
                            # if isinstance(jsonObject["event"], dict):
                            #     video_id = jsonObject["event"]["id"]
                            if isinstance(jsonObject["event"], unicode):
                                # The "event" field is itself a JSON string.
                                event_jsonObject = json.loads(jsonObject["event"])
                                video_id = event_jsonObject["id"]
                                # Rebuild the module id from its dash-mangled form,
                                # e.g. "x-y-z" -> "x://y/z".
                                video_id = video_id.replace("-", "://", 1)
                                video_id = video_id.replace("-", "/")
                                # For video seek event
                                if "new_time" in event_jsonObject and "old_time" in event_jsonObject:
                                    new_time = event_jsonObject["new_time"]
                                    old_time = event_jsonObject["old_time"]
                                # For video speed change event
                                if "new_speed" in event_jsonObject and "old_speed" in event_jsonObject:
                                    new_speed = event_jsonObject["new_speed"]
                                    old_speed = event_jsonObject["old_speed"]
                            # To record video seek event
                            if event_type in ["seek_video","edx.video.position.changed"]:
                                # Some logs carry null seek positions; skip those.
                                if new_time is not None and old_time is not None:
                                    if course_learner_id in course_learner_id_set:
                                        learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_time":new_time, "old_time":old_time})
                                    else:
                                        learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_time":new_time, "old_time":old_time}]
                                        course_learner_id_set.add(course_learner_id)
                                continue
                            # To record video speed change event
                            if event_type in ["speed_change_video"]:
                                if course_learner_id in course_learner_id_set:
                                    learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_speed":new_speed, "old_speed":old_speed})
                                else:
                                    learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_speed":new_speed, "old_speed":old_speed}]
                                    course_learner_id_set.add(course_learner_id)
                                continue
                            # Any other video event: record time/type/id only.
                            if course_learner_id in course_learner_id_set:
                                learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id})
                            else:
                                learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id}]
                                course_learner_id_set.add(course_learner_id)
                    # For non-video-related events
                    # (kept too: they act as session terminators below).
                    if jsonObject["event_type"] not in video_event_types:
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        global_learner_id = jsonObject["context"]["user_id"]
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                            event_type = jsonObject["event_type"]
                            if course_learner_id in course_learner_id_set:
                                learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type})
                            else:
                                learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type}]
                                course_learner_id_set.add(course_learner_id)
                # Second pass: walk each learner's chronologically sorted
                # events and stitch play...pause/stop/other spans into sessions.
                for course_learner_id in learner_video_event_logs.keys():
                    video_id = ""
                    event_logs = learner_video_event_logs[course_learner_id]
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    video_start_time = ""
                    final_time = ""
                    # For video seek event
                    times_forward_seek = 0
                    duration_forward_seek = 0
                    times_backward_seek = 0
                    duration_backward_seek = 0
                    # For video speed change event
                    speed_change_last_time = ""
                    times_speed_up = 0
                    times_speed_down = 0
                    # For video pause event
                    pause_check = False
                    pause_start_time = ""
                    duration_pause = 0
                    for log in event_logs:
                        if log["event_type"] in ["play_video", "edx.video.played"]:
                            video_start_time = log["event_time"]
                            video_id = log["video_id"]
                            if pause_check:
                                # A play after a pause: attribute the pause to the
                                # session that ended at pause_start_time.
                                duration_pause = (log["event_time"] - pause_start_time).seconds
                                video_interaction_id = course_learner_id + "_" + video_id + "_" + str(pause_start_time)
                                # Only count pauses between 2s and 10min as real pauses.
                                if duration_pause > 2 and duration_pause < 600:
                                    if video_interaction_id in video_interaction_map.keys():
                                        video_interaction_map[video_interaction_id]["times_pause"] = 1
                                        video_interaction_map[video_interaction_id]["duration_pause"] = duration_pause
                                pause_check = False
                            continue
                        if video_start_time != "":
                            # More than 30 minutes of silence: abandon the open
                            # session without recording it.
                            if log["event_time"] > video_start_time + datetime.timedelta(hours=0.5):
                                video_start_time = ""
                                video_id = ""
                                final_time = log["event_time"]
                            else:
                                # 0. Seek
                                if log["event_type"] in ["seek_video", "edx.video.position.changed"] and video_id == log["video_id"]:
                                    # Forward seek event
                                    if log["new_time"] > log["old_time"]:
                                        times_forward_seek += 1
                                        duration_forward_seek += log["new_time"] - log["old_time"]
                                    # Backward seek event
                                    if log["new_time"] < log["old_time"]:
                                        times_backward_seek += 1
                                        duration_backward_seek += log["old_time"] - log["new_time"]
                                    continue
                                # 1. Speed change
                                if log["event_type"] == "speed_change_video" and video_id == log["video_id"]:
                                    if speed_change_last_time == "":
                                        speed_change_last_time = log["event_time"]
                                        old_speed = log["old_speed"]
                                        new_speed = log["new_speed"]
                                        if old_speed < new_speed:
                                            times_speed_up += 1
                                        if old_speed > new_speed:
                                            times_speed_down += 1
                                    else:
                                        # Debounce: count repeated changes only if
                                        # more than 10s apart.
                                        if (log["event_time"] - speed_change_last_time).seconds > 10:
                                            old_speed = log["old_speed"]
                                            new_speed = log["new_speed"]
                                            if old_speed < new_speed:
                                                times_speed_up += 1
                                            if old_speed > new_speed:
                                                times_speed_down += 1
                                        speed_change_last_time = log["event_time"]
                                    continue
                                # 2. Pause/Stop situation
                                if log["event_type"] in ["pause_video", "edx.video.paused", "stop_video", "edx.video.stopped"] and video_id == log["video_id"]:
                                    watch_duration = (log["event_time"] - video_start_time).seconds
                                    video_end_time = log["event_time"]
                                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_end_time)
                                    # Sessions of 5 seconds or less are discarded.
                                    if watch_duration > 5:
                                        video_interaction_map[video_interaction_id] = {"course_learner_id":course_learner_id, "video_id":video_id, "type": "video", "watch_duration":watch_duration, "times_forward_seek":times_forward_seek, "duration_forward_seek":duration_forward_seek, "times_backward_seek":times_backward_seek, "duration_backward_seek":duration_backward_seek, "times_speed_up":times_speed_up, "times_speed_down":times_speed_down, "start_time":video_start_time, "end_time":video_end_time}
                                    if log["event_type"] in ["pause_video", "edx.video.paused"]:
                                        # Remember the pause so a following play can
                                        # attach times_pause/duration_pause above.
                                        pause_check = True
                                        pause_start_time = video_end_time
                                    # For video seek event
                                    times_forward_seek = 0
                                    duration_forward_seek = 0
                                    times_backward_seek = 0
                                    duration_backward_seek = 0
                                    # For video speed change event
                                    speed_change_last_time = ""
                                    times_speed_up = 0
                                    times_speed_down = 0
                                    # For video general information
                                    video_start_time =""
                                    video_id = ""
                                    final_time = log["event_time"]
                                    continue
                                # 3/4 Page changed/Session closed
                                if log["event_type"] not in video_event_types:
                                    video_end_time = log["event_time"]
                                    watch_duration = (video_end_time - video_start_time).seconds
                                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_end_time)
                                    if watch_duration > 5:
                                        video_interaction_map[video_interaction_id] = {"course_learner_id":course_learner_id, "video_id":video_id, "type": "video", "watch_duration":watch_duration, "times_forward_seek":times_forward_seek, "duration_forward_seek":duration_forward_seek, "times_backward_seek":times_backward_seek, "duration_backward_seek":duration_backward_seek, "times_speed_up":times_speed_up, "times_speed_down":times_speed_down, "start_time":video_start_time, "end_time":video_end_time}
                                    # For video seek event
                                    times_forward_seek = 0
                                    duration_forward_seek = 0
                                    times_backward_seek = 0
                                    duration_backward_seek = 0
                                    # For video speed change event
                                    speed_change_last_time = ""
                                    times_speed_up = 0
                                    times_speed_down = 0
                                    # For video general information
                                    video_start_time = ""
                                    video_id = ""
                                    final_time = log["event_time"]
                                    continue
                    if final_time != "":
                        # Carry events after the last completed session into the
                        # next day's pass.
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] > final_time:
                                new_logs.append(log)
                        updated_learner_video_event_logs[course_learner_id] = new_logs
        current_date = getNextDay(current_date)
    # Flatten the accumulated sessions into insert-ready rows.
    video_interaction_record = []
    for interaction_id in video_interaction_map.keys():
        video_interaction_id = interaction_id
        course_learner_id = video_interaction_map[interaction_id]["course_learner_id"]
        video_id = video_interaction_map[interaction_id]["video_id"]
        duration = video_interaction_map[interaction_id]["watch_duration"]
        times_forward_seek = video_interaction_map[interaction_id]["times_forward_seek"]
        duration_forward_seek = video_interaction_map[interaction_id]["duration_forward_seek"]
        times_backward_seek = video_interaction_map[interaction_id]["times_backward_seek"]
        duration_backward_seek = video_interaction_map[interaction_id]["duration_backward_seek"]
        times_speed_up = video_interaction_map[interaction_id]["times_speed_up"]
        times_speed_down = video_interaction_map[interaction_id]["times_speed_down"]
        start_time = video_interaction_map[interaction_id]["start_time"]
        end_time = video_interaction_map[interaction_id]["end_time"]
        # Pause fields exist only when a qualifying pause was attached above.
        if "times_pause" in video_interaction_map[interaction_id]:
            times_pause = video_interaction_map[interaction_id]["times_pause"]
            duration_pause = video_interaction_map[interaction_id]["duration_pause"]
        else:
            times_pause = 0
            duration_pause = 0
        array = [video_interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time]
        video_interaction_record.append(array)
    # Video_interaction table
    # Database version
    for array in video_interaction_record:
        interaction_id = array[0]
        course_learner_id = array[1]
        video_id = array[2]
        duration = process_null(array[3])
        times_forward_seek = process_null(array[4])
        duration_forward_seek = process_null(array[5])
        times_backward_seek = process_null(array[6])
        duration_backward_seek = process_null(array[7])
        times_speed_up = process_null(array[8])
        times_speed_down = process_null(array[9])
        times_pause = process_null(array[10])
        duration_pause = process_null(array[11])
        start_time = array[12]
        end_time = array[13]
        sql = "insert into video_interaction(interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        data = (interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time)
        cursor.execute(sql, data)
    # File version
    '''
def sessions(metadata_path, log_path, cursor): # Collect course information course_metadata_map = ExtractCourseInformation(metadata_path) current_date = course_metadata_map["start_date"] end_next_date = getNextDay(course_metadata_map["end_date"]) learner_all_event_logs = {} updated_learner_all_event_logs = {} session_record = [] log_files = os.listdir(log_path) while True: if current_date == end_next_date: break; for file in log_files: if current_date in file: print file learner_all_event_logs.clear() learner_all_event_logs = updated_learner_all_event_logs.copy() updated_learner_all_event_logs.clear() # Course_learner_id set course_learner_id_set = set() for course_learner_id in learner_all_event_logs.keys(): course_learner_id_set.add(course_learner_id) input_file = open(log_path + file,"r") lines = input_file.readlines() for line in lines: jsonObject = json.loads(line) # Some daily logs don't have the "user_id" value if "user_id" not in jsonObject["context"]: continue global_learner_id = jsonObject["context"]["user_id"] event_type = str(jsonObject["event_type"]) if global_learner_id != "": course_id = jsonObject["context"]["course_id"] course_learner_id = course_id + "_" + str(global_learner_id) event_time = jsonObject["time"] event_time = event_time[0:19] event_time = event_time.replace("T", " ") event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S") if course_learner_id in course_learner_id_set: learner_all_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type}) else: learner_all_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type}] course_learner_id_set.add(course_learner_id) for course_learner_id in learner_all_event_logs.keys(): event_logs = learner_all_event_logs[course_learner_id] # Sorting event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time')) session_id = "" start_time = "" end_time = "" final_time = "" for i in range(len(event_logs)): if start_time == "": # 
Initialization start_time = event_logs[i]["event_time"] end_time = event_logs[i]["event_time"] else: if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5): session_id = course_learner_id + "_" + str(start_time) + "_" + str(end_time) duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds if duration > 5: array = [session_id, course_learner_id, start_time, end_time, duration] session_record.append(array) final_time = event_logs[i]["event_time"] # Re-initialization session_id = "" start_time = event_logs[i]["event_time"] end_time = event_logs[i]["event_time"] else: if event_logs[i]["event_type"] == "page_close": end_time = event_logs[i]["event_time"] session_id = course_learner_id + "_" + str(start_time) + "_" + str(end_time) duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds if duration > 5: array = [session_id, course_learner_id, start_time, end_time, duration] session_record.append(array) # Re-initialization session_id = "" start_time = "" end_time = "" final_time = event_logs[i]["event_time"] else: end_time = event_logs[i]["event_time"] if final_time != "": new_logs = [] for log in event_logs: if log["event_time"] >= final_time: new_logs.append(log) updated_learner_all_event_logs[course_learner_id] = new_logs current_date = getNextDay(current_date) # Filter duplicated records updated_session_record = [] session_id_set = set() for array in session_record: session_id = array[0] if session_id not in session_id_set: session_id_set.add(session_id) updated_session_record.append(array) session_record = updated_session_record # Database version for array in session_record: session_id = array[0] course_learner_id = array[1] start_time = array[2] end_time = array[3] duration = process_null(array[4]) sql = "insert into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)" data = (session_id, course_learner_id, start_time, end_time, duration) 
cursor.execute(sql, data) # File version '''
def learner_mode(metadata_path, course_code, cursor):
    """Populate the course, course_elements, quiz_questions, learner_index,
    course_learner and learner_demographic tables from the course metadata
    dumps (tab-separated edX database exports).

    metadata_path -- directory holding the course metadata dumps
    course_code   -- course run code, e.g. "1T2015"; used to decide whether
                     forum data must be loaded from MongoDB
    cursor        -- DB-API cursor used for the inserts
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    array = (course_metadata_map["course_id"], course_metadata_map["course_name"], course_metadata_map["start_time"], course_metadata_map["end_time"])
    sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
    cursor.execute(sql, array)
    # Course_element table
    for element_id in course_metadata_map["element_time_map"].keys():
        element_start_time = course_metadata_map["element_time_map"][element_id]
        # Some contents released just one hour earlier than the hour of start time.
        # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00.
        # However, those 2nd week contents are count as 1st week.
        # In order to avoid above situation, I use date to replace datetime here.
        week = process_null(getDayDiff(course_metadata_map["start_time"].date(), element_start_time.date()) / 7 + 1)
        array = (element_id, course_metadata_map["element_type_map"][element_id], week, course_metadata_map["course_id"])
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)
    # Quiz_question table
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    for question_id in quiz_question_map:
        question_due = ""
        question_weight = quiz_question_map[question_id]
        quiz_question_parent = course_metadata_map["child_parent_map"][question_id]
        # Take the due date from the first ancestor that has one.
        if (question_due == "") and (quiz_question_parent in element_time_map_due):
            question_due = element_time_map_due[quiz_question_parent]
        # Climb the parent chain until an ancestor with a known block type
        # is found; it supplies the question type.
        while not block_type_map.has_key(quiz_question_parent):
            quiz_question_parent = course_metadata_map["child_parent_map"][quiz_question_parent]
            if (question_due == "") and (quiz_question_parent in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]
        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)
        array = (question_id, quiz_question_type, question_weight, question_due)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)
    files = os.listdir(metadata_path)
    # Learner_demographic table
    learner_mail_map = {}
    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}
    # Enrolled learners set
    enrolled_learner_set = set()
    course_id = ""
    # Processing student_courseenrollment data
    for file in files:
        if "student_courseenrollment" in file:
            input_file = open(str(metadata_path + file), "r")
            # Skip the TSV header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                course_id = record[2]
                time = datetime.datetime.strptime(record[3], "%Y-%m-%d %H:%M:%S")
                course_learner_id = course_id + "_" + global_learner_id
                # Only learners enrolled before the course end time count.
                if cmp_datetime(course_metadata_map["end_time"], time):
                    enrolled_learner_set.add(global_learner_id)
                    array = (global_learner_id, course_id, course_learner_id)
                    sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
                    cursor.execute(sql, array)
                    course_learner_map[global_learner_id] = course_learner_id
                    learner_enrollment_time_map[global_learner_id] = time
            input_file.close()
    # Processing auth_user data
    for file in files:
        if "auth_user-" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                # column 0: user id, column 4: email address
                if record[0] in enrolled_learner_set:
                    learner_mail_map[record[0]] = record[4]
            input_file.close()
    # Processing certificates_generatedcertificate data
    num_uncertifiedLearners = 0
    num_certifiedLearners = 0
    for file in files:
        if "certificates_generatedcertificate" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                final_grade = process_null(record[3])
                enrollment_mode = record[14].replace("\n", "")
                certificate_status = record[7]
                register_time = ""
                if course_learner_map.has_key(global_learner_id):
                    register_time = learner_enrollment_time_map[global_learner_id]
                register_time = process_null(register_time)
                if course_learner_map.has_key(global_learner_id):
                    num_certifiedLearners += 1
                    array = (course_learner_map[global_learner_id], final_grade, enrollment_mode, certificate_status, register_time)
                    sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
                    cursor.execute(sql, array)
                else:
                    num_uncertifiedLearners += 1
            input_file.close()
    # Processing auth_userprofile data
    for file in files:
        if "auth_userprofile" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                gender = record[7]
                # NOTE(review): process_null applied twice — the inner call
                # already suffices; harmless only if process_null is idempotent.
                year_of_birth = process_null(process_null(record[9]))
                level_of_education = record[10]
                country = record[13]
                # NOTE(review): course_id here is whatever value the last
                # enrollment row left behind — assumes a single-course dump.
                course_learner_id = process_null(course_id + "_" + global_learner_id)
                if global_learner_id in enrolled_learner_set:
                    # NOTE(review): raises KeyError if the learner is missing
                    # from the auth_user dump (learner_mail_map lookup).
                    array = (course_learner_id, gender, year_of_birth, level_of_education, country, learner_mail_map[global_learner_id])
                    sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
                    cursor.execute(sql, array)
            input_file.close()
    # Generating forum_interaction records for courses starting before 1T2015
    if "1T2015" in course_code or "2014" in course_code or "2013" in course_code:
        forum_interaction_mongo(metadata_path, cursor)
def forum_sessions(metadata_path, log_path, cursor):
    """Separate each learner's forum-related events into forum sessions and
    insert them into the forum_sessions table, together with the course
    element the session most plausibly relates to.

    metadata_path -- directory holding the course metadata dumps
    log_path      -- directory holding one tracking-log file per day
    cursor        -- DB-API cursor used for the inserts
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    start_date = course_metadata_map["start_date"]
    end_date = course_metadata_map["end_date"]
    current_date = start_date
    # Process up to (but not including) the day after the course end date.
    end_next_date = getNextDay(end_date)
    forum_event_types = []
    forum_event_types.append("edx.forum.comment.created")
    forum_event_types.append("edx.forum.response.created")
    forum_event_types.append("edx.forum.response.voted")
    forum_event_types.append("edx.forum.thread.created")
    forum_event_types.append("edx.forum.thread.voted")
    forum_event_types.append("edx.forum.searched")
    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    forum_sessions_record = []
    log_files = os.listdir(log_path)
    while True:
        if current_date == end_next_date:
            break;
        for log_file in log_files:
            # Daily log file names contain the date they cover.
            if current_date in log_file:
                print log_file
                # Start from the events carried over from the previous day.
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                # NOTE(review): the loop variable log_file (a file name) is
                # shadowed here by the open file handle — works, but fragile.
                log_file = open(log_path + log_file,"r")
                lines = log_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue
                    if jsonObject["context"]["user_id"] == "":
                        continue
                    # For forum session separation
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])
                    # Collapse all forum activity except searches into a single
                    # "forum_activity" label.
                    if "/discussion/" in event_type or event_type in forum_event_types:
                        if event_type != "edx.forum.searched":
                            event_type = "forum_activity"
                    if global_learner_id != "":
                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(global_learner_id)
                        # Timestamps look like "2015-10-15T09:00:00+00:00";
                        # keep only the first 19 chars (drops the timezone).
                        event_time = jsonObject["time"]
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                        # added for relevant elements
                        event_page = ""
                        if jsonObject.has_key("page"):
                            event_page = str(jsonObject["page"])
                        event_path = ""
                        if jsonObject.has_key("path"):
                            event_path = str(jsonObject["path"])
                        event_referer = ""
                        if jsonObject.has_key("referer"):
                            event_referer = str(jsonObject["referer"])
                        if course_learner_id in course_learner_id_set:
                            learner_all_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "page":event_page, "path":event_path, "referer":event_referer})
                        else:
                            learner_all_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "page":event_page, "path":event_path, "referer":event_referer}]
                            course_learner_id_set.add(course_learner_id)
                # For forum session separation
                for learner in learner_all_event_logs.keys():
                    course_learner_id = learner
                    event_logs = learner_all_event_logs[learner]
                    course_id = course_learner_id.split("_")[0]
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    session_id = ""
                    start_time = ""
                    end_time = ""
                    times_search = 0
                    final_time = ""
                    # represent the elements which just before the session.
                    session_rel_element_pre = ""
                    # represent the elements which is mentioned in the session.
                    session_rel_element_cur = ""
                    for i in range(len(event_logs)):
                        # Course element referenced by this event's page/path/referer.
                        rel_element_cur = courseElementsFinder(event_logs[i], course_id)
                        if session_id == "":
                            # No session open: only a forum event can start one.
                            if event_logs[i]["event_type"] in ["forum_activity", "edx.forum.searched"]:
                                # Initialization
                                session_id = "forum_session_" + course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                                if event_logs[i]["event_type"] == "edx.forum.searched":
                                    times_search += 1
                                # Added for relevant element id
                                session_rel_element_cur = rel_element_cur
                        else:
                            if event_logs[i]["event_type"] in ["forum_activity", "edx.forum.searched"]:
                                # A gap of more than 30 minutes closes the session
                                # and starts a new one with the current event.
                                if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                                    session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                                    duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                                    # Sessions of 5 seconds or less are discarded.
                                    if duration > 5:
                                        rel_element_id = ""
                                        # Prefer an element seen during the session;
                                        # fall back to the one seen just before it.
                                        if session_rel_element_cur != "":
                                            rel_element_id = session_rel_element_cur
                                        else:
                                            rel_element_id = session_rel_element_pre
                                        array = [session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id]
                                        forum_sessions_record.append(array)
                                    final_time = event_logs[i]["event_time"]
                                    # Re-initialization
                                    session_id = "forum_session_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]
                                    if event_logs[i]["event_type"] == "edx.forum.searched":
                                        times_search = 1
                                    # Added for relevant element id
                                    session_rel_element_cur = rel_element_cur
                                else:
                                    # Same session: extend it.
                                    end_time = event_logs[i]["event_time"]
                                    if event_logs[i]["event_type"] == "edx.forum.searched":
                                        times_search += 1
                                    if session_rel_element_cur == "":
                                        session_rel_element_cur = rel_element_cur
                            else:
                                # A non-forum event ends the session; if it falls
                                # within 30 minutes it also extends the end time.
                                if event_logs[i]["event_time"] <= end_time + datetime.timedelta(hours=0.5):
                                    end_time = event_logs[i]["event_time"]
                                session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                                duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                                if duration > 5:
                                    rel_element_id = ""
                                    if session_rel_element_cur != "":
                                        rel_element_id = session_rel_element_cur
                                    else:
                                        rel_element_id = session_rel_element_pre
                                    array = [session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id]
                                    forum_sessions_record.append(array)
                                final_time = event_logs[i]["event_time"]
                                # Re-initialization
                                session_id = ""
                                start_time = ""
                                end_time = ""
                                times_search = 0
                        # session_rel_element_pre is used for recording the element id
                        # of the most recent event logs before the session logs
                        if rel_element_cur != "":
                            session_rel_element_pre = rel_element_cur
                    if final_time != "":
                        # Carry events at/after the last session boundary into
                        # the next day's pass.
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)
                        updated_learner_all_event_logs[course_learner_id] = new_logs
                log_file.close()
        current_date = getNextDay(current_date)
    # Database version
    for array in forum_sessions_record:
        session_id = array[0]
        course_learner_id = array[1]
        times_search = process_null(array[2])
        start_time = array[3]
        end_time = array[4]
        duration = process_null(array[5])
        rel_element_id = array[6]
        sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id)
        cursor.execute(sql, data)
    # File version
    '''
def sessions(metadata_path, log_path, cursor):
    """Separate each learner's daily tracking-log events into general
    activity sessions (a gap of more than 30 minutes, or a page_close
    event, ends a session) and insert them into the sessions table.

    metadata_path -- directory holding the course metadata dumps
    log_path      -- directory holding one tracking-log file per day
    cursor        -- DB-API cursor used for the inserts

    NOTE(review): this is a duplicate (re-formatted) definition of an
    earlier sessions() in this file; whichever definition appears later in
    the module silently overrides the earlier one. Confirm which copy is
    intended and remove the other.
    NOTE(review): input_file is never closed — file-handle leak, one open
    handle per daily log file.
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    current_date = course_metadata_map["start_date"]
    # Process up to (but not including) the day after the course end date.
    end_next_date = getNextDay(course_metadata_map["end_date"])
    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    session_record = []
    log_files = os.listdir(log_path)
    while True:
        if current_date == end_next_date:
            break
        for file in log_files:
            # Daily log file names contain the date they cover.
            if current_date in file:
                print file
                # Start from the events carried over from the previous day.
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                input_file = open(log_path + file, "r")
                lines = input_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])
                    if global_learner_id != "":
                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(global_learner_id)
                        # Timestamps look like "2015-10-15T09:00:00+00:00";
                        # keep only the first 19 chars (drops the timezone).
                        event_time = jsonObject["time"]
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
                        if course_learner_id in course_learner_id_set:
                            learner_all_event_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type})
                        else:
                            learner_all_event_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type}]
                            course_learner_id_set.add(course_learner_id)
                # Walk each learner's chronologically sorted events and cut
                # them into sessions.
                for course_learner_id in learner_all_event_logs.keys():
                    event_logs = learner_all_event_logs[course_learner_id]
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    session_id = ""
                    start_time = ""
                    end_time = ""
                    final_time = ""
                    for i in range(len(event_logs)):
                        if start_time == "":
                            # Initialization
                            start_time = event_logs[i]["event_time"]
                            end_time = event_logs[i]["event_time"]
                        else:
                            # A gap of more than 30 minutes closes the session.
                            if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                                session_id = course_learner_id + "_" + str(start_time) + "_" + str(end_time)
                                duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                                # Sessions of 5 seconds or less are discarded.
                                if duration > 5:
                                    array = [session_id, course_learner_id, start_time, end_time, duration]
                                    session_record.append(array)
                                final_time = event_logs[i]["event_time"]
                                # Re-initialization: current event starts a new session.
                                session_id = ""
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                            else:
                                # An explicit page_close also ends the session.
                                if event_logs[i]["event_type"] == "page_close":
                                    end_time = event_logs[i]["event_time"]
                                    session_id = course_learner_id + "_" + str(start_time) + "_" + str(end_time)
                                    duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                                    if duration > 5:
                                        array = [session_id, course_learner_id, start_time, end_time, duration]
                                        session_record.append(array)
                                    # Re-initialization
                                    session_id = ""
                                    start_time = ""
                                    end_time = ""
                                    final_time = event_logs[i]["event_time"]
                                else:
                                    end_time = event_logs[i]["event_time"]
                    if final_time != "":
                        # Carry events at/after the last session boundary into
                        # the next day's pass.
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)
                        updated_learner_all_event_logs[course_learner_id] = new_logs
        current_date = getNextDay(current_date)
    # Filter duplicated records (carry-over can produce the same session twice).
    updated_session_record = []
    session_id_set = set()
    for array in session_record:
        session_id = array[0]
        if session_id not in session_id_set:
            session_id_set.add(session_id)
            updated_session_record.append(array)
    session_record = updated_session_record
    # Database version
    for array in session_record:
        session_id = array[0]
        course_learner_id = array[1]
        start_time = array[2]
        end_time = array[3]
        duration = process_null(array[4])
        sql = "insert into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, start_time, end_time, duration)
        cursor.execute(sql, data)
    # File version
    '''
def forum_sessions(metadata_path, log_path, cursor):
    """Extract forum sessions from the daily event logs and insert them into
    the forum_sessions table.

    A forum session is a run of forum-related events (forum_activity or
    edx.forum.searched) by one learner; a gap of more than 30 minutes, or a
    non-forum event within 30 minutes, closes the session.  Sessions shorter
    than 5 seconds are discarded.

    metadata_path -- directory holding the course metadata dumps
    log_path      -- directory holding the daily event-log files
    cursor        -- DB-API cursor used for the inserts (caller commits)
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    start_date = course_metadata_map["start_date"]
    end_date = course_metadata_map["end_date"]
    current_date = start_date
    end_next_date = getNextDay(end_date)
    # Forum-related event types emitted by the edX server
    forum_event_types = []
    forum_event_types.append("edx.forum.comment.created")
    forum_event_types.append("edx.forum.response.created")
    forum_event_types.append("edx.forum.response.voted")
    forum_event_types.append("edx.forum.thread.created")
    forum_event_types.append("edx.forum.thread.voted")
    forum_event_types.append("edx.forum.searched")
    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    forum_sessions_record = []
    log_files = os.listdir(log_path)
    # Walk the course one day at a time; events left over from the previous
    # day (after final_time) are carried forward in updated_learner_all_event_logs.
    while True:
        if current_date == end_next_date:
            break
        for log_file in log_files:
            if current_date in log_file:
                print log_file
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                # NOTE(review): the loop variable log_file is rebound to the
                # file handle here; harmless for the iteration but confusing.
                log_file = open(log_path + log_file, "r")
                lines = log_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue
                    if jsonObject["context"]["user_id"] == "":
                        continue
                    # For forum session separation
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])
                    if "/discussion/" in event_type or event_type in forum_event_types:
                        # Collapse everything except searches into one label so
                        # only searches are counted separately per session.
                        if event_type != "edx.forum.searched":
                            event_type = "forum_activity"
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)
                            event_time = jsonObject["time"]
                            # Keep only "YYYY-MM-DDTHH:MM:SS" and parse it
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
                            # added for relevant elements
                            event_page = ""
                            if jsonObject.has_key("page"):
                                event_page = str(jsonObject["page"])
                            event_path = ""
                            if jsonObject.has_key("path"):
                                event_path = str(jsonObject["path"])
                            event_referer = ""
                            if jsonObject.has_key("referer"):
                                event_referer = str(jsonObject["referer"])
                            if course_learner_id in course_learner_id_set:
                                learner_all_event_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type, "page": event_page, "path": event_path, "referer": event_referer})
                            else:
                                learner_all_event_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type, "page": event_page, "path": event_path, "referer": event_referer}]
                                course_learner_id_set.add(course_learner_id)
                # For forum session separation
                for learner in learner_all_event_logs.keys():
                    course_learner_id = learner
                    event_logs = learner_all_event_logs[learner]
                    # NOTE(review): assumes the course id contains no "_";
                    # verify against the course id format in use.
                    course_id = course_learner_id.split("_")[0]
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    session_id = ""
                    start_time = ""
                    end_time = ""
                    times_search = 0
                    final_time = ""
                    # represent the elements which just before the session.
                    session_rel_element_pre = ""
                    # represent the elements which is mentioned in the session.
                    session_rel_element_cur = ""
                    for i in range(len(event_logs)):
                        rel_element_cur = courseElementsFinder(event_logs[i], course_id)
                        if session_id == "":
                            if event_logs[i]["event_type"] in ["forum_activity", "edx.forum.searched"]:
                                # Initialization
                                session_id = "forum_session_" + course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                                if event_logs[i]["event_type"] == "edx.forum.searched":
                                    times_search += 1
                                # Added for relevant element id
                                session_rel_element_cur = rel_element_cur
                        else:
                            if event_logs[i]["event_type"] in ["forum_activity", "edx.forum.searched"]:
                                # A gap of more than 30 minutes closes the
                                # current session and opens a new one.
                                if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                                    session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                                    duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                                    if duration > 5:
                                        rel_element_id = ""
                                        # Prefer an element seen inside the
                                        # session, else the one just before it.
                                        if session_rel_element_cur != "":
                                            rel_element_id = session_rel_element_cur
                                        else:
                                            rel_element_id = session_rel_element_pre
                                        array = [session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id]
                                        forum_sessions_record.append(array)
                                    final_time = event_logs[i]["event_time"]
                                    # Re-initialization
                                    session_id = "forum_session_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]
                                    # NOTE(review): times_search is only reset
                                    # when the new session starts with a search;
                                    # otherwise the previous session's count
                                    # carries over — looks like a bug, confirm.
                                    if event_logs[i]["event_type"] == "edx.forum.searched":
                                        times_search = 1
                                    # Added for relevant element id
                                    session_rel_element_cur = rel_element_cur
                                else:
                                    end_time = event_logs[i]["event_time"]
                                    if event_logs[i]["event_type"] == "edx.forum.searched":
                                        times_search += 1
                                    if session_rel_element_cur == "":
                                        session_rel_element_cur = rel_element_cur
                            else:
                                # A non-forum event within 30 minutes means the
                                # learner moved on: close the session here.
                                if event_logs[i]["event_time"] <= end_time + datetime.timedelta(hours=0.5):
                                    end_time = event_logs[i]["event_time"]
                                    session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                                    duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                                    if duration > 5:
                                        rel_element_id = ""
                                        if session_rel_element_cur != "":
                                            rel_element_id = session_rel_element_cur
                                        else:
                                            rel_element_id = session_rel_element_pre
                                        array = [session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id]
                                        forum_sessions_record.append(array)
                                    final_time = event_logs[i]["event_time"]
                                    # Re-initialization
                                    session_id = ""
                                    start_time = ""
                                    end_time = ""
                                    times_search = 0
                        # session_rel_element_pre is used for recording the element id
                        # of the most recent event logs before the session logs
                        if rel_element_cur != "":
                            session_rel_element_pre = rel_element_cur
                    # Carry events at/after the last session boundary into the
                    # next day's processing.
                    if final_time != "":
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)
                        updated_learner_all_event_logs[course_learner_id] = new_logs
                log_file.close()
        current_date = getNextDay(current_date)
    # Filter duplicated records is not needed here; insert directly.
    # Database version
    for array in forum_sessions_record:
        session_id = array[0]
        course_learner_id = array[1]
        times_search = process_null(array[2])
        start_time = array[3]
        end_time = array[4]
        duration = process_null(array[5])
        rel_element_id = array[6]
        # Column name "relevent_element_id" is a schema typo kept on purpose.
        sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id)
        cursor.execute(sql, data)
    # File version
    '''
def learner_mode(metadata_path, cursor): course_record = [] course_element_record = [] learner_index_record = [] course_learner_record = [] learner_demographic_record = [] # Collect course information course_metadata_map = ExtractCourseInformation(metadata_path) course_record.append([ course_metadata_map["course_id"], course_metadata_map["course_name"], course_metadata_map["start_time"], course_metadata_map["end_time"] ]) # Course_element table for element_id in course_metadata_map["element_time_map"].keys(): element_start_time = course_metadata_map["element_time_map"][ element_id] # Some contents released just one hour earlier than the hour of start time. # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00. # However, those 2nd week contents are count as 1st week. # In order to avoid above situation, I use date to replace datetime here. week = getDayDiff(course_metadata_map["start_time"].date(), element_start_time.date()) / 7 + 1 array = [ element_id, course_metadata_map["element_type_map"][element_id], week, course_metadata_map["course_id"] ] course_element_record.append(array) files = os.listdir(metadata_path) # Learner_demographic table learner_mail_map = {} # Course_learner table course_learner_map = {} learner_enrollment_time_map = {} # Enrolled learners set enrolled_learner_set = set() course_id = "" # Processing student_courseenrollment data for file in files: if "student_courseenrollment" in file: input_file = open(metadata_path + file, "r") input_file.readline() lines = input_file.readlines() for line in lines: record = line.split("\t") global_learner_id = record[1] course_id = record[2] time = datetime.datetime.strptime(record[3], "%Y-%m-%d %H:%M:%S") course_learner_id = course_id + "_" + global_learner_id if cmp_datetime(course_metadata_map["end_time"], time): enrolled_learner_set.add(global_learner_id) array = [global_learner_id, course_id, course_learner_id] learner_index_record.append(array) 
course_learner_map[global_learner_id] = course_learner_id learner_enrollment_time_map[global_learner_id] = time input_file.close() print "The number of enrolled learners is: " + str( len(enrolled_learner_set)) + "\n" # Processing auth_user data for file in files: if "auth_user-" in file: input_file = open(metadata_path + file, "r") input_file.readline() lines = input_file.readlines() for line in lines: record = line.split("\t") if record[0] in enrolled_learner_set: learner_mail_map[record[0]] = record[4] input_file.close() # Processing certificates_generatedcertificate data num_uncertifiedLearners = 0 num_certifiedLearners = 0 for file in files: if "certificates_generatedcertificate" in file: input_file = open(metadata_path + file, "r") input_file.readline() lines = input_file.readlines() for line in lines: record = line.split("\t") global_learner_id = record[1] final_grade = record[3] enrollment_mode = record[14].replace("\n", "") certificate_status = record[7] register_time = "" if course_learner_map.has_key(global_learner_id): register_time = learner_enrollment_time_map[ global_learner_id] if course_learner_map.has_key(global_learner_id): num_certifiedLearners += 1 array = [ course_learner_map[global_learner_id], final_grade, enrollment_mode, certificate_status, register_time ] course_learner_record.append(array) else: num_uncertifiedLearners += 1 input_file.close() print "The number of uncertified & certified learners is: " + str( num_uncertifiedLearners) + "\t" + str( num_certifiedLearners) + "\n" # Processing auth_userprofile data for file in files: if "auth_userprofile" in file: input_file = open(metadata_path + file, "r") input_file.readline() lines = input_file.readlines() for line in lines: record = line.split("\t") global_learner_id = record[1] gender = record[7] year_of_birth = record[9] level_of_education = record[10] country = record[13] course_learner_id = course_id + "_" + global_learner_id if global_learner_id in enrolled_learner_set: array = [ 
course_learner_id, gender, year_of_birth, level_of_education, country, learner_mail_map[global_learner_id] ] learner_demographic_record.append(array) input_file.close() # Database version # Course table for array in course_record: course_id = course_metadata_map["course_id"] course_name = course_metadata_map["course_name"] start_time = course_metadata_map["start_time"] end_time = course_metadata_map["end_time"] sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)" data = (course_id, course_name, start_time, end_time) cursor.execute(sql, data) for array in course_element_record: element_id = array[0] element_type = array[1] week = process_null(array[2]) course_id = array[3] sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)" data = (element_id, element_type, week, course_id) cursor.execute(sql, data) # Learner_index table for array in learner_index_record: global_learner_id = array[0] course_id = array[1] course_learner_id = array[2] sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)" data = (global_learner_id, course_id, course_learner_id) cursor.execute(sql, data) # Course_learner table for array in course_learner_record: course_learner_id = array[0] final_grade = process_null(array[1]) enrollment_mode = array[2] certificate_status = array[3] register_time = process_null(array[4]) sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)" data = (course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) cursor.execute(sql, data) # Learner_demographic table for array in learner_demographic_record: course_learner_id = process_null(array[0]) gender = array[1] year_of_birth = process_null(process_null(array[2])) level_of_education = array[3] country = array[4] email = array[5] email = email.replace("\'", "") sql = "insert 
into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)" data = (course_learner_id, gender, year_of_birth, level_of_education, country, email) cursor.execute(sql, data) # File version '''
def quiz_sessions(metadata_path, log_path, cursor):
    """Extract quiz-working sessions from the daily event logs and insert
    them into the quiz_sessions table.

    A quiz session groups a learner's problem-related events that belong to
    the same parent block, closing on a gap of more than 30 minutes or on a
    non-quiz event within 30 minutes.  Per-day fragments are merged
    afterwards and sessions of 5 seconds or less are discarded.

    metadata_path -- directory holding the course metadata dumps
    log_path      -- directory holding the daily event-log files
    cursor        -- DB-API cursor used for the inserts (caller commits)
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    # Processing events data: server-side submission-related event types
    submission_event_collection = []
    # Problem check
    submission_event_collection.append("problem_check")
    # Server
    submission_event_collection.append("save_problem_check")
    submission_event_collection.append("problem_check_fail")
    submission_event_collection.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects Check for a problem and it is graded successfully.
    submission_event_collection.append("problem_graded")
    # The server emits problem_rescore events when a problem is successfully rescored.
    submission_event_collection.append("problem_rescore")
    submission_event_collection.append("problem_rescore_fail")
    submission_event_collection.append("problem_reset")
    # event_source: server
    submission_event_collection.append("reset_problem")
    submission_event_collection.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    submission_event_collection.append("problem_save")
    # event_source: server
    submission_event_collection.append("save_problem_fail")
    submission_event_collection.append("save_problem_success")
    # Show answer
    submission_event_collection.append("problem_show")
    submission_event_collection.append("showanswer")
    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])
    log_files = os.listdir(log_path)
    # Maps a question block to its parent (the quiz unit the session names).
    child_parent_map = course_metadata_map["child_parent_map"]
    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    # Shadows the function name locally; holds per-session time fragments.
    quiz_sessions = {}
    while True:
        if current_date == end_next_date:
            break
        for file in log_files:
            if current_date in file:
                print file
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                input_file = open(log_path + file, "r")
                lines = input_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])
                    if global_learner_id != "":
                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(global_learner_id)
                        event_time = jsonObject["time"]
                        # Keep only "YYYY-MM-DDTHH:MM:SS" and parse it
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
                        if learner_all_event_logs.has_key(course_learner_id):
                            learner_all_event_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type})
                        else:
                            learner_all_event_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type}]
                # For quiz session separation
                for course_learner_id in learner_all_event_logs.keys():
                    event_logs = learner_all_event_logs[course_learner_id]
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    session_id = ""
                    start_time = ""
                    end_time = ""
                    final_time = ""
                    for i in range(len(event_logs)):
                        if session_id == "":
                            # Quiz-related events are URL-style ("problem+block",
                            # "_problem;_") or explicit submission event types.
                            if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in submission_event_collection:
                                event_type_array = event_logs[i]["event_type"].split("/")
                                if "problem+block" in event_logs[i]["event_type"]:
                                    question_id = event_type_array[4]
                                if "_problem;_" in event_logs[i]["event_type"]:
                                    question_id = event_type_array[6].replace(";_", "/")
                                # NOTE(review): if the first matching event is a
                                # plain submission event, question_id is unset
                                # (UnboundLocalError) or stale — confirm.
                                if question_id in child_parent_map.keys():
                                    parent_block_id = child_parent_map[question_id]
                                    session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]
                        else:
                            if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in submission_event_collection:
                                # Gap over 30 minutes: close this fragment and
                                # try to start a new session from this event.
                                if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                                    if quiz_sessions.has_key(session_id):
                                        quiz_sessions[session_id]["time_array"].append({"start_time": start_time, "end_time": end_time})
                                    else:
                                        quiz_sessions[session_id] = {"course_learner_id": course_learner_id, "time_array": [{"start_time": start_time, "end_time": end_time}]}
                                    final_time = event_logs[i]["event_time"]
                                    if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in submission_event_collection:
                                        event_type_array = event_logs[i]["event_type"].split("/")
                                        if "problem+block" in event_logs[i]["event_type"]:
                                            question_id = event_type_array[4]
                                        if "_problem;_" in event_logs[i]["event_type"]:
                                            question_id = event_type_array[6].replace(";_", "/")
                                        if question_id in child_parent_map.keys():
                                            parent_block_id = child_parent_map[question_id]
                                            session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                            start_time = event_logs[i]["event_time"]
                                            end_time = event_logs[i]["event_time"]
                                        else:
                                            session_id = ""
                                            start_time = ""
                                            end_time = ""
                                else:
                                    end_time = event_logs[i]["event_time"]
                            else:
                                # Non-quiz event within 30 minutes closes the
                                # session fragment.
                                if event_logs[i]["event_time"] <= end_time + datetime.timedelta(hours=0.5):
                                    end_time = event_logs[i]["event_time"]
                                    if quiz_sessions.has_key(session_id):
                                        quiz_sessions[session_id]["time_array"].append({"start_time": start_time, "end_time": end_time})
                                    else:
                                        quiz_sessions[session_id] = {"course_learner_id": course_learner_id, "time_array": [{"start_time": start_time, "end_time": end_time}]}
                                    final_time = event_logs[i]["event_time"]
                                    session_id = ""
                                    start_time = ""
                                    end_time = ""
                    # Carry events at/after the last session boundary forward.
                    if final_time != "":
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)
                        updated_learner_all_event_logs[course_learner_id] = new_logs
        current_date = getNextDay(current_date)
    # To compress the session event_logs: merge fragments of the same session
    # whose gap is 30 minutes or less.
    for session_id in quiz_sessions.keys():
        if len(quiz_sessions[session_id]["time_array"]) > 1:
            start_time = ""
            end_time = ""
            updated_time_array = []
            for i in range(len(quiz_sessions[session_id]["time_array"])):
                if i == 0:
                    start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
                    end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                else:
                    if quiz_sessions[session_id]["time_array"][i]["start_time"] > end_time + datetime.timedelta(hours=0.5):
                        updated_time_array.append({"start_time": start_time, "end_time": end_time})
                        start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
                        end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                        # Flush the trailing fragment on the last element.
                        if i == len(quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({"start_time": start_time, "end_time": end_time})
                    else:
                        end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                        if i == len(quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({"start_time": start_time, "end_time": end_time})
            quiz_sessions[session_id]["time_array"] = updated_time_array
    quiz_session_record = []
    for session_id in quiz_sessions.keys():
        course_learner_id = quiz_sessions[session_id]["course_learner_id"]
        for i in range(len(quiz_sessions[session_id]["time_array"])):
            start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
            end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
            if start_time < end_time:
                duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                final_session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                if duration > 5:
                    array = [final_session_id, course_learner_id, start_time, end_time, duration]
                    quiz_session_record.append(array)
    # Database version
    for array in quiz_session_record:
        session_id = array[0]
        course_learner_id = array[1]
        start_time = array[2]
        end_time = array[3]
        duration = process_null(array[4])
        sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, start_time, end_time, duration)
        cursor.execute(sql, data)
    '''
def quiz_mode(metadata_path, log_path, cursor):
    """Populate the quiz_questions, submissions and assessments tables.

    Question metadata (type, weight, due date) comes from the course dumps;
    submissions/assessments come from "problem_check" events in the daily
    logs and are inserted row by row as they are parsed.

    metadata_path -- directory holding the course metadata dumps
    log_path      -- directory holding the daily event-log files
    cursor        -- DB-API cursor used for the inserts (caller commits)
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    for question_id in quiz_question_map:
        question_due = ""
        question_weight = quiz_question_map[question_id]
        # Walk up the block hierarchy until a typed ancestor is found,
        # picking up the first due date seen on the way.
        # NOTE(review): loops forever if an ancestor chain never reaches a
        # block in block_type_map — confirm the metadata is always complete.
        quiz_question_parent = course_metadata_map["child_parent_map"][question_id]
        if (question_due == "") and (quiz_question_parent in element_time_map_due):
            question_due = element_time_map_due[quiz_question_parent]
        while not block_type_map.has_key(quiz_question_parent):
            quiz_question_parent = course_metadata_map["child_parent_map"][quiz_question_parent]
            if (question_due == "") and (quiz_question_parent in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]
        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        data = (question_id, quiz_question_type, question_weight, question_due)
        cursor.execute(sql, data)
    # Processing events data.  Only "problem_check" is active here; the other
    # submission event types are deliberately disabled via the string block
    # below (they matter for session building, not for submissions).
    submission_event_collection = []
    # Problem check
    submission_event_collection.append("problem_check")
    # Server
    '''
    submission_event_collection.append("save_problem_check")
    submission_event_collection.append("problem_check_fail")
    submission_event_collection.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects Check for a problem and it is graded successfully.
    submission_event_collection.append("problem_graded")
    # The server emits problem_rescore events when a problem is successfully rescored.
    submission_event_collection.append("problem_rescore")
    submission_event_collection.append("problem_rescore_fail")
    submission_event_collection.append("problem_reset")
    # event_source: server
    submission_event_collection.append("reset_problem")
    submission_event_collection.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    submission_event_collection.append("problem_save")
    # event_source: server
    submission_event_collection.append("save_problem_fail")
    submission_event_collection.append("save_problem_success")
    # Show answer
    submission_event_collection.append("problem_show")
    submission_event_collection.append("showanswer")
    '''
    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])
    log_files = os.listdir(log_path)
    # Suffix that keeps submission ids unique across repeated attempts.
    submission_uni_index = 0
    while True:
        if current_date == end_next_date:
            break
        for file in log_files:
            if current_date in file:
                print file
                input_file = open(log_path + file, "r")
                lines = input_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    if jsonObject["event_type"] in submission_event_collection:
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        global_learner_id = jsonObject["context"]["user_id"]
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)
                            question_id = ""
                            grade = ""
                            max_grade = ""
                            event_time = jsonObject["time"]
                            # Keep only "YYYY-MM-DDTHH:MM:SS" and parse it
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
                            # Browser-side events carry a list payload; only
                            # the server-side dict payload has problem_id.
                            if isinstance(jsonObject["event"], dict):
                                question_id = jsonObject["event"]["problem_id"]
                                # The fields "grade" and "max_grade" are specific to submission event "problem_check"
                                if jsonObject["event"].has_key("grade") and jsonObject["event"].has_key("max_grade"):
                                    grade = jsonObject["event"]["grade"]
                                    max_grade = jsonObject["event"]["max_grade"]
                            if question_id != "":
                                submission_id = course_learner_id + "_" + question_id + "_" + str(submission_uni_index)
                                submission_uni_index = submission_uni_index + 1
                                # For submissions
                                submission_timestamp = event_time
                                sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values (%s,%s,%s,%s)"
                                data = (submission_id, course_learner_id, question_id, submission_timestamp)
                                cursor.execute(sql, data)
                                # For assessments (graded submissions only)
                                if grade != "" and max_grade != "":
                                    assessment_id = submission_id
                                    sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values (%s,%s,%s,%s)"
                                    data = (assessment_id, course_learner_id, max_grade, grade)
                                    cursor.execute(sql, data)
        current_date = getNextDay(current_date)
    # The legacy in-memory record lists and file-output variant that used to
    # follow here were already commented out and have been removed.
    '''
def forum_sessions(metadata_path, daily_log_path, remaining_forum_session_log_path, cursor):
    """Extract forum sessions from ONE daily log file and insert them into
    the forum_sessions table, carrying unfinished sessions between days.

    Events after the last closed session are spilled to
    remaining_forum_session_log_path as JSON (bson json_util encoding) and
    reloaded on the next day's run.  On the course's last day the spill file
    is deleted instead of rewritten.

    metadata_path                    -- directory with the course metadata dumps
    daily_log_path                   -- path of a single day's event-log file
                                        (its name must contain the YYYY-MM-DD date)
    remaining_forum_session_log_path -- path of the JSON spill file
    cursor                           -- DB-API cursor used for the inserts
    """
    utc = pytz.UTC
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    # Forum-related events
    forum_event_types = []
    forum_event_types.append("edx.forum.comment.created")
    forum_event_types.append("edx.forum.response.created")
    forum_event_types.append("edx.forum.response.voted")
    forum_event_types.append("edx.forum.thread.created")
    forum_event_types.append("edx.forum.thread.voted")
    forum_event_types.append("edx.forum.searched")
    learner_logs = {}
    remaining_learner_logs = {}
    # Read remaining event logs left over from the previous day
    if os.path.exists(remaining_forum_session_log_path):
        remaining_input_file = open(remaining_forum_session_log_path)
        learner_logs = json.loads(remaining_input_file.read(), object_hook=json_util.object_hook)
        # Fix: the spill-file handle was previously never closed.
        remaining_input_file.close()
    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)
    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)
        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"]["user_id"] == "" or jsonObject["context"]["user_id"] is None:
            continue
        # For forum session separation
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = str(jsonObject["event_type"])
        if "/discussion/" in event_type or event_type in forum_event_types:
            # Collapse everything except searches into one label so only
            # searches are counted separately per session.
            if event_type != "edx.forum.searched":
                event_type = "forum_activity"
            course_id = jsonObject["context"]["course_id"]
            course_learner_id = course_id + "_" + str(global_learner_id)
            event_time = jsonObject["time"]
            # Check whether the event record belongs to that day
            log_date = event_time[0:10]
            if log_date not in daily_log_path:
                # print "Log not belonging to the day...\t" + log_date
                continue
            event_time = event_time[0:19]
            event_time = event_time.replace("T", " ")
            event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
            # Spilled timestamps round-trip as tz-aware, so tag fresh ones too.
            event_time = event_time.replace(tzinfo=utc)
            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type})
            else:
                learner_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type}]
                course_learner_id_set.add(course_learner_id)
    input_file.close()
    # For forum session separation
    for learner in learner_logs.keys():
        course_learner_id = learner
        event_logs = learner_logs[learner]
        # Sorting
        event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
        session_id = ""
        start_time = ""
        end_time = ""
        times_search = 0
        final_time = ""
        for i in range(len(event_logs)):
            if session_id == "":
                if event_logs[i]["event_type"] in ["forum_activity", "edx.forum.searched"]:
                    # Initialization
                    session_id = "forum_session_" + course_learner_id
                    start_time = event_logs[i]["event_time"]
                    end_time = event_logs[i]["event_time"]
                    if event_logs[i]["event_type"] == "edx.forum.searched":
                        times_search += 1
            else:
                if event_logs[i]["event_type"] in ["forum_activity", "edx.forum.searched"]:
                    # A gap over 30 minutes closes the session and opens a new one.
                    if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                        session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                        duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                        duration = process_null(duration)
                        if duration > 5:
                            # Fix: pass process_null(times_search) into the row
                            # instead of clobbering the running counter, which
                            # could leave it non-numeric for later increments.
                            array = (session_id, course_learner_id, process_null(times_search), start_time, end_time, duration, "")
                            sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"
                            # Best-effort insert: duplicates across reruns are
                            # silently skipped on purpose.
                            try:
                                cursor.execute(sql, array)
                            except Exception:
                                pass
                        final_time = event_logs[i]["event_time"]
                        # Re-initialization
                        session_id = "forum_session_" + course_learner_id
                        start_time = event_logs[i]["event_time"]
                        end_time = event_logs[i]["event_time"]
                        # Fix: reset the search counter for the new session;
                        # previously the old count carried over unless the new
                        # session happened to start with a search.
                        if event_logs[i]["event_type"] == "edx.forum.searched":
                            times_search = 1
                        else:
                            times_search = 0
                    else:
                        end_time = event_logs[i]["event_time"]
                        if event_logs[i]["event_type"] == "edx.forum.searched":
                            times_search += 1
                else:
                    # A non-forum event within 30 minutes means the learner
                    # moved on: close the session here.
                    if event_logs[i]["event_time"] <= end_time + datetime.timedelta(hours=0.5):
                        end_time = event_logs[i]["event_time"]
                        session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                        duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                        duration = process_null(duration)
                        if duration > 5:
                            array = (session_id, course_learner_id, process_null(times_search), start_time, end_time, duration, "")
                            sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"
                            try:
                                cursor.execute(sql, array)
                            except Exception:
                                pass
                        final_time = event_logs[i]["event_time"]
                        # Re-initialization
                        session_id = ""
                        start_time = ""
                        end_time = ""
                        times_search = 0
        # Events strictly after the last session boundary are carried to the
        # next day's run.
        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)
            remaining_learner_logs[course_learner_id] = new_logs
    # Output remaining logs, except on the course's last day
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_forum_session_log_path, "w")
        output_file.write(json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        # NOTE(review): raises OSError if the spill file was never created —
        # confirm the last-day run always follows an earlier run.
        os.remove(remaining_forum_session_log_path)
def learner_mode(metadata_path, cursor):
    """Load course, element, learner-index, course-learner and demographic
    tables into the database from the course's metadata dump files.

    metadata_path -- directory containing the edX SQL dump files
                     (student_courseenrollment, auth_user-, certificates_
                     generatedcertificate, auth_userprofile, plus the course
                     structure files read by ExtractCourseInformation).
    cursor        -- open DB-API cursor used for the INSERT statements;
                     the caller owns commit/close.
    """
    # In-memory staging lists; each becomes one table's INSERTs below.
    course_record = []
    course_element_record = []
    learner_index_record = []
    course_learner_record = []
    learner_demographic_record = []
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    course_record.append([course_metadata_map["course_id"],
                          course_metadata_map["course_name"],
                          course_metadata_map["start_time"],
                          course_metadata_map["end_time"]])
    # Course_element table
    for element_id in course_metadata_map["element_time_map"].keys():
        element_start_time = course_metadata_map["element_time_map"][element_id]
        # Some contents released just one hour earlier than the hour of start time.
        # For example, start time is 2015-10-15 09:00:00, while 2nd week contents'
        # release time is 2015-10-22 08:00:00. However, those 2nd week contents are
        # counted as 1st week. In order to avoid the above situation, the date
        # (not the full datetime) is used here.
        # NOTE: Python 2 integer division — day difference // 7, then 1-based week.
        week = getDayDiff(course_metadata_map["start_time"].date(), element_start_time.date()) / 7 + 1
        array = [element_id, course_metadata_map["element_type_map"][element_id], week, course_metadata_map["course_id"]]
        course_element_record.append(array)
    files = os.listdir(metadata_path)
    # Learner_demographic table
    learner_mail_map = {}
    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}
    # Enrolled learners set
    enrolled_learner_set = set()
    # course_id is reused later by the auth_userprofile pass; it keeps the
    # value from the last enrollment record processed below.
    course_id = ""
    # Processing student_courseenrollment data
    for file in files:
        if "student_courseenrollment" in file:
            input_file = open(metadata_path + file, "r")
            # Skip the TSV header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                course_id = record[2]
                time = datetime.datetime.strptime(record[3], "%Y-%m-%d %H:%M:%S")
                course_learner_id = course_id + "_" + global_learner_id
                # Only keep learners whose enrollment passes the cmp_datetime
                # test against the course end time (presumably: enrolled
                # before the course ended — TODO confirm cmp_datetime semantics).
                if cmp_datetime(course_metadata_map["end_time"], time):
                    enrolled_learner_set.add(global_learner_id)
                    array = [global_learner_id, course_id, course_learner_id]
                    learner_index_record.append(array)
                    course_learner_map[global_learner_id] = course_learner_id
                    learner_enrollment_time_map[global_learner_id] = time
            input_file.close()
    print "The number of enrolled learners is: " + str(len(enrolled_learner_set)) + "\n"
    # Processing auth_user data (learner id -> e-mail address)
    for file in files:
        if "auth_user-" in file:
            input_file = open(metadata_path + file, "r")
            # Skip the TSV header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                if record[0] in enrolled_learner_set:
                    learner_mail_map[record[0]] = record[4]
            input_file.close()
    # Processing certificates_generatedcertificate data
    num_uncertifiedLearners = 0
    num_certifiedLearners = 0
    for file in files:
        if "certificates_generatedcertificate" in file:
            input_file = open(metadata_path + file, "r")
            # Skip the TSV header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                final_grade = record[3]
                enrollment_mode = record[14].replace("\n", "")
                certificate_status = record[7]
                register_time = ""
                if course_learner_map.has_key(global_learner_id):
                    register_time = learner_enrollment_time_map[global_learner_id]
                # NOTE(review): this repeats the has_key check just above;
                # the two branches could be merged.
                if course_learner_map.has_key(global_learner_id):
                    num_certifiedLearners += 1
                    array = [course_learner_map[global_learner_id], final_grade, enrollment_mode, certificate_status, register_time]
                    course_learner_record.append(array)
                else:
                    num_uncertifiedLearners += 1
            input_file.close()
    print "The number of uncertified & certified learners is: " + str(num_uncertifiedLearners) + "\t" + str(num_certifiedLearners) + "\n"
    # Processing auth_userprofile data
    for file in files:
        if "auth_userprofile" in file:
            input_file = open(metadata_path + file, "r")
            # Skip the TSV header line.
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                gender = record[7]
                year_of_birth = record[9]
                level_of_education = record[10]
                country = record[13]
                # course_id here is whatever the enrollment pass last set —
                # assumes the dump covers a single course. TODO confirm.
                course_learner_id = course_id + "_" + global_learner_id
                if global_learner_id in enrolled_learner_set:
                    array = [course_learner_id, gender, year_of_birth, level_of_education, country, learner_mail_map[global_learner_id]]
                    learner_demographic_record.append(array)
            input_file.close()
    # Database version
    # Course table
    # NOTE(review): the loop variable `array` is unused; values are taken
    # straight from course_metadata_map (course_record holds one row).
    for array in course_record:
        course_id = course_metadata_map["course_id"]
        course_name = course_metadata_map["course_name"]
        start_time = course_metadata_map["start_time"]
        end_time = course_metadata_map["end_time"]
        sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
        data = (course_id, course_name, start_time, end_time)
        cursor.execute(sql, data)
    # Course_elements table
    for array in course_element_record:
        element_id = array[0]
        element_type = array[1]
        week = process_null(array[2])
        course_id = array[3]
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        data = (element_id, element_type, week, course_id)
        cursor.execute(sql, data)
    # Learner_index table
    for array in learner_index_record:
        global_learner_id = array[0]
        course_id = array[1]
        course_learner_id = array[2]
        sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
        data = (global_learner_id, course_id, course_learner_id)
        cursor.execute(sql, data)
    # Course_learner table
    for array in course_learner_record:
        course_learner_id = array[0]
        final_grade = process_null(array[1])
        enrollment_mode = array[2]
        certificate_status = array[3]
        register_time = process_null(array[4])
        sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
        data = (course_learner_id, final_grade, enrollment_mode, certificate_status, register_time)
        cursor.execute(sql, data)
    # Learner_demographic table
    for array in learner_demographic_record:
        course_learner_id = process_null(array[0])
        gender = array[1]
        # NOTE(review): process_null applied twice — the inner call already
        # normalizes; one application would suffice.
        year_of_birth = process_null(process_null(array[2]))
        level_of_education = array[3]
        country = array[4]
        email = array[5]
        # Strip single quotes from e-mail before insertion.
        email = email.replace("\'", "")
        sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
        data = (course_learner_id, gender, year_of_birth, level_of_education, country, email)
        cursor.execute(sql, data)
    # File version
    '''
def quiz_sessions(metadata_path, daily_log_path, remaining_forum_session_log_path, cursor):
    """Extract per-learner quiz sessions from one day's tracking log and
    insert them into the quiz_sessions table.

    metadata_path -- directory with the course metadata files.
    daily_log_path -- path of a single daily tracking-log file; its name is
                      expected to contain the log date (YYYY-MM-DD).
    remaining_forum_session_log_path -- JSON spill file carrying events left
                      over from the previous day's run; read at start, rewritten
                      at the end (or deleted once the course end date is reached).
                      NOTE(review): the "forum" in this name looks like a
                      copy-paste from the forum-session code — confirm the
                      caller passes a quiz-specific spill path.
    cursor -- open DB-API cursor for the INSERTs.

    A session groups consecutive quiz-related events on the same parent block;
    a gap of more than 30 minutes closes the current session.
    """
    utc = pytz.UTC
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    # Quiz-related events
    quiz_event_types = []
    # Problem check
    quiz_event_types.append("problem_check")
    # Server
    quiz_event_types.append("save_problem_check")
    quiz_event_types.append("problem_check_fail")
    quiz_event_types.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects Check
    # for a problem and it is graded successfully.
    quiz_event_types.append("problem_graded")
    # The server emits problem_rescore events when a problem is successfully rescored.
    quiz_event_types.append("problem_rescore")
    quiz_event_types.append("problem_rescore_fail")
    quiz_event_types.append("problem_reset")
    # event_source: server
    quiz_event_types.append("reset_problem")
    quiz_event_types.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    quiz_event_types.append("problem_save")
    # event_source: server
    quiz_event_types.append("save_problem_fail")
    quiz_event_types.append("save_problem_success")
    # Show answer
    quiz_event_types.append("problem_show")
    quiz_event_types.append("showanswer")
    quiz_event_types.append("edx.problem.hint.demandhint_displayed")
    quiz_event_types.append("edx.problem.hint.feedback_displayed")
    # Maps a child block id to its parent block id (used as the session key).
    child_parent_map = course_metadata_map["child_parent_map"]
    learner_logs = {}
    remaining_learner_logs = {}
    # NOTE(review): this local dict shadows the function's own name.
    quiz_sessions = {}
    # Read remaining event logs spilled by the previous day's run.
    if os.path.exists(remaining_forum_session_log_path):
        # NOTE(review): remaining_input_file is never closed.
        remaining_input_file = open(remaining_forum_session_log_path)
        learner_logs = json.loads(remaining_input_file.read(), object_hook=json_util.object_hook)
    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)
    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)
        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"]["user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue
        # For quiz session separation
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = str(jsonObject["event_type"])
        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)
        event_time = jsonObject["time"]
        # Check whether the event record belongs to that day by matching the
        # date prefix of the timestamp against the log file's path.
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue
        # Normalize "YYYY-MM-DDTHH:MM:SS..." -> aware UTC datetime.
        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)
        if learner_logs.has_key(course_learner_id):
            learner_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type})
        else:
            learner_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type}]
    input_file.close()
    # For quiz session separation
    for learner in learner_logs.keys():
        course_learner_id = learner
        event_logs = learner_logs[learner]
        # Sorting chronologically before the session sweep.
        event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
        session_id = ""
        start_time = ""
        end_time = ""
        final_time = ""
        for i in range(len(event_logs)):
            if session_id == "":
                # No open session: only a problem-URL event ("problem+block"
                # in new-style ids, "_problem;_" in old-style) can start one.
                if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"]:
                    event_type_array = event_logs[i]["event_type"].split("/")
                    if "problem+block" in event_logs[i]["event_type"]:
                        question_id = event_type_array[4]
                    if "_problem;_" in event_logs[i]["event_type"]:
                        question_id = event_type_array[6].replace(";_", "/")
                    if question_id in child_parent_map.keys():
                        parent_block_id = child_parent_map[question_id]
                        session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                        start_time = event_logs[i]["event_time"]
                        end_time = event_logs[i]["event_time"]
            else:
                # A session is open.
                if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in quiz_event_types:
                    # Quiz-related event: gap > 30 min closes the session.
                    if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                        if quiz_sessions.has_key(session_id):
                            quiz_sessions[session_id]["time_array"].append({"start_time": start_time, "end_time": end_time})
                        else:
                            quiz_sessions[session_id] = {"course_learner_id": course_learner_id, "time_array": [{"start_time": start_time, "end_time": end_time}]}
                        final_time = event_logs[i]["event_time"]
                        # Re-initialization: try to open a new session on this event.
                        if "problem+block" in event_logs[i]["event_type"] or "_problem;_" in event_logs[i]["event_type"] or event_logs[i]["event_type"] in quiz_event_types:
                            event_type_array = event_logs[i]["event_type"].split("/")
                            if "problem+block" in event_logs[i]["event_type"]:
                                question_id = event_type_array[4]
                            if "_problem;_" in event_logs[i]["event_type"]:
                                question_id = event_type_array[6].replace(";_", "/")
                            # NOTE(review): for a plain quiz_event_types event
                            # (no problem URL), question_id keeps the value from
                            # a previous iteration — confirm this is intended.
                            if question_id in child_parent_map.keys():
                                parent_block_id = child_parent_map[question_id]
                                session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                            else:
                                session_id = ""
                                start_time = ""
                                end_time = ""
                    else:
                        # Still within the 30-minute window: extend the session.
                        end_time = event_logs[i]["event_time"]
                else:
                    # Non-quiz event: close the current session, extending it
                    # first if the event falls within the 30-minute window.
                    if event_logs[i]["event_time"] <= end_time + datetime.timedelta(hours=0.5):
                        end_time = event_logs[i]["event_time"]
                    if quiz_sessions.has_key(session_id):
                        quiz_sessions[session_id]["time_array"].append({"start_time": start_time, "end_time": end_time})
                    else:
                        quiz_sessions[session_id] = {"course_learner_id": course_learner_id, "time_array": [{"start_time": start_time, "end_time": end_time}]}
                    final_time = event_logs[i]["event_time"]
                    session_id = ""
                    start_time = ""
                    end_time = ""
        # Events after the last closed session spill over to the next day's run.
        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)
            remaining_learner_logs[course_learner_id] = new_logs
    # Output remaining logs (or delete the spill file on the course's last day).
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_forum_session_log_path, "w")
        output_file.write(json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_forum_session_log_path)
    # To compress the session event_logs: merge time ranges of the same
    # session that are within 30 minutes of each other.
    for session_id in quiz_sessions.keys():
        if len(quiz_sessions[session_id]["time_array"]) > 1:
            start_time = ""
            end_time = ""
            updated_time_array = []
            for i in range(len(quiz_sessions[session_id]["time_array"])):
                if i == 0:
                    start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
                    end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                else:
                    if quiz_sessions[session_id]["time_array"][i]["start_time"] > end_time + datetime.timedelta(hours=0.5):
                        # Gap too large: flush the accumulated range and start a new one.
                        updated_time_array.append({"start_time": start_time, "end_time": end_time})
                        start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
                        end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                        if i == len(quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({"start_time": start_time, "end_time": end_time})
                    else:
                        # Ranges overlap/adjoin: extend the accumulated range.
                        end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
                        if i == len(quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({"start_time": start_time, "end_time": end_time})
            quiz_sessions[session_id]["time_array"] = updated_time_array
    # Insert each compressed time range (> 5 seconds) as one quiz_sessions row.
    for session_id in quiz_sessions.keys():
        course_learner_id = quiz_sessions[session_id]["course_learner_id"]
        for i in range(len(quiz_sessions[session_id]["time_array"])):
            start_time = quiz_sessions[session_id]["time_array"][i]["start_time"]
            end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
            if start_time < end_time:
                duration = process_null((end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds)
                final_session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                if duration > 5:
                    array = (final_session_id, course_learner_id, start_time, end_time, duration)
                    sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                    try:
                        cursor.execute(sql, array)
                    except Exception as e:
                        # NOTE(review): insert failures (e.g. duplicate keys)
                        # are silently ignored; consider at least logging e.
                        pass
def video_interaction(metadata_path, daily_log_path, remaining_video_interaction_log_path, cursor):
    """Extract per-learner video watching interactions (play/pause/seek/
    speed-change/stop) from one day's tracking log and insert them into the
    video_interaction table.

    metadata_path -- directory with the course metadata files.
    daily_log_path -- path of a single daily tracking-log file; its name is
                      expected to contain the log date (YYYY-MM-DD).
    remaining_video_interaction_log_path -- JSON spill file carrying events
                      left over from the previous day's run; read at start,
                      rewritten at the end (deleted once the course end date
                      is reached).
    cursor -- open DB-API cursor for the INSERTs.

    An interaction runs from a play event to the next pause/stop (or to the
    first non-video event); a gap of more than 30 minutes abandons the
    current interaction.
    """
    utc = pytz.UTC
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    # interaction_id -> aggregated metrics for one watch segment.
    video_interaction_map = {}
    # Video-related event types (browser and server variants).
    video_event_types = []
    video_event_types.append("hide_transcript")
    video_event_types.append("edx.video.transcript.hidden")
    video_event_types.append("edx.video.closed_captions.hidden")
    video_event_types.append("edx.video.closed_captions.shown")
    video_event_types.append("load_video")
    video_event_types.append("edx.video.loaded")
    video_event_types.append("pause_video")
    video_event_types.append("edx.video.paused")
    video_event_types.append("play_video")
    video_event_types.append("edx.video.played")
    video_event_types.append("seek_video")
    video_event_types.append("edx.video.position.changed")
    video_event_types.append("show_transcript")
    video_event_types.append("edx.video.transcript.shown")
    video_event_types.append("speed_change_video")
    video_event_types.append("stop_video")
    video_event_types.append("edx.video.stopped")
    video_event_types.append("video_hide_cc_menu")
    video_event_types.append("edx.video.language_menu.hidden")
    video_event_types.append("video_show_cc_menu")
    video_event_types.append("edx.video.language_menu.shown")
    learner_logs = {}
    remaining_learner_logs = {}
    # Read remaining event logs spilled by the previous day's run.
    if os.path.exists(remaining_video_interaction_log_path):
        # NOTE(review): remaining_input_file is never closed.
        remaining_input_file = open(remaining_video_interaction_log_path)
        learner_logs = json.loads(remaining_input_file.read(), object_hook=json_util.object_hook)
    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)
    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)
        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"]["user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = jsonObject["event_type"]
        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)
        event_time = jsonObject["time"]
        # Check whether the event record belongs to that day by matching the
        # date prefix of the timestamp against the log file's path.
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue
        # Normalize "YYYY-MM-DDTHH:MM:SS..." -> aware UTC datetime.
        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)
        # For video-related events
        if event_type in video_event_types:
            video_id = ""
            # For seek event
            new_time = 0
            old_time = 0
            # For speed change event
            new_speed = 0
            old_speed = 0
            # Browser events carry the payload as a JSON *string*.
            if isinstance(jsonObject["event"], unicode):
                event_jsonObject = json.loads(jsonObject["event"])
                video_id = event_jsonObject["id"]
                # Rebuild the block id from its dash-mangled form,
                # e.g. "i4x-org-course-video-hash" -> "i4x://org/course/video/hash".
                video_id = video_id.replace("-", "://", 1)
                video_id = video_id.replace("-", "/")
                # For video seek event
                if "new_time" in event_jsonObject and "old_time" in event_jsonObject:
                    new_time = event_jsonObject["new_time"]
                    old_time = event_jsonObject["old_time"]
                # For video speed change event
                if "new_speed" in event_jsonObject and "old_speed" in event_jsonObject:
                    new_speed = event_jsonObject["new_speed"]
                    old_speed = event_jsonObject["old_speed"]
            # To record video seek event
            if event_type in ["seek_video", "edx.video.position.changed"]:
                if new_time is not None and old_time is not None:
                    if course_learner_id in course_learner_id_set:
                        learner_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type, "video_id": video_id, "new_time": new_time, "old_time": old_time})
                    else:
                        learner_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type, "video_id": video_id, "new_time": new_time, "old_time": old_time}]
                        course_learner_id_set.add(course_learner_id)
                continue
            # To record video speed change event
            if event_type in ["speed_change_video"]:
                if course_learner_id in course_learner_id_set:
                    learner_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type, "video_id": video_id, "new_speed": new_speed, "old_speed": old_speed})
                else:
                    learner_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type, "video_id": video_id, "new_speed": new_speed, "old_speed": old_speed}]
                    course_learner_id_set.add(course_learner_id)
                continue
            # Any other video event: record with the video id only.
            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type, "video_id": video_id})
            else:
                learner_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type, "video_id": video_id}]
                course_learner_id_set.add(course_learner_id)
        # For non-video-related events (kept so they can terminate a watch
        # segment in the extraction phase below).
        if event_type not in video_event_types:
            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({"event_time": event_time, "event_type": event_type})
            else:
                learner_logs[course_learner_id] = [{"event_time": event_time, "event_type": event_type}]
                course_learner_id_set.add(course_learner_id)
    input_file.close()
    # For video interaction extraction
    for learner in learner_logs.keys():
        course_learner_id = learner
        event_logs = learner_logs[learner]
        # Sorting chronologically before the sweep.
        event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
        video_id = ""
        video_start_time = ""
        final_time = ""
        # For video seek event
        times_forward_seek = 0
        duration_forward_seek = 0
        times_backward_seek = 0
        duration_backward_seek = 0
        # For video speed change event
        speed_change_last_time = ""
        times_speed_up = 0
        times_speed_down = 0
        # For video pause event
        pause_check = False
        pause_start_time = ""
        duration_pause = 0
        for i in range(len(event_logs)):
            # Play starts (or restarts) a watch segment; a play after a pause
            # also finalizes the pending pause duration.
            if event_logs[i]["event_type"] in ["play_video", "edx.video.played"]:
                video_start_time = event_logs[i]["event_time"]
                video_id = event_logs[i]["video_id"]
                if pause_check:
                    duration_pause = (event_logs[i]["event_time"] - pause_start_time).days * 24 * 60 * 60 + (event_logs[i]["event_time"] - pause_start_time).seconds
                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(pause_start_time)
                    # Only count pauses between 2 s and 10 min.
                    if duration_pause > 2 and duration_pause < 600:
                        if video_interaction_id in video_interaction_map.keys():
                            # NOTE(review): times_pause is set to 1, not
                            # incremented — confirm multiple pauses per
                            # segment are not expected.
                            video_interaction_map[video_interaction_id]["times_pause"] = 1
                            video_interaction_map[video_interaction_id]["duration_pause"] = duration_pause
                    pause_check = False
                continue
            if video_start_time != "":
                # A gap of more than 30 minutes abandons the current segment.
                if event_logs[i]["event_time"] > video_start_time + datetime.timedelta(hours=0.5):
                    video_start_time = ""
                    video_id = ""
                    final_time = event_logs[i]["event_time"]
                else:
                    # 0. Seek
                    if event_logs[i]["event_type"] in ["seek_video", "edx.video.position.changed"] and video_id == event_logs[i]["video_id"]:
                        # Forward seek event
                        if event_logs[i]["new_time"] > event_logs[i]["old_time"]:
                            times_forward_seek += 1
                            duration_forward_seek += event_logs[i]["new_time"] - event_logs[i]["old_time"]
                        # Backward seek event
                        if event_logs[i]["new_time"] < event_logs[i]["old_time"]:
                            times_backward_seek += 1
                            duration_backward_seek += event_logs[i]["old_time"] - event_logs[i]["new_time"]
                        continue
                    # 1. Speed change (debounced: changes within 10 s of the
                    # previous one are not counted).
                    if event_logs[i]["event_type"] == "speed_change_video" and video_id == event_logs[i]["video_id"]:
                        if speed_change_last_time == "":
                            speed_change_last_time = event_logs[i]["event_time"]
                            old_speed = event_logs[i]["old_speed"]
                            new_speed = event_logs[i]["new_speed"]
                            if old_speed < new_speed:
                                times_speed_up += 1
                            if old_speed > new_speed:
                                times_speed_down += 1
                        else:
                            if (event_logs[i]["event_time"] - speed_change_last_time).seconds > 10:
                                old_speed = event_logs[i]["old_speed"]
                                new_speed = event_logs[i]["new_speed"]
                                if old_speed < new_speed:
                                    times_speed_up += 1
                                if old_speed > new_speed:
                                    times_speed_down += 1
                            speed_change_last_time = event_logs[i]["event_time"]
                        continue
                    # 2. Pause/Stop situation: close the watch segment.
                    if event_logs[i]["event_type"] in ["pause_video", "edx.video.paused", "stop_video", "edx.video.stopped"] and video_id == event_logs[i]["video_id"]:
                        watch_duration = (event_logs[i]["event_time"] - video_start_time).seconds
                        video_end_time = event_logs[i]["event_time"]
                        video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_start_time) + "_" + str(video_end_time)
                        # Only keep segments longer than 5 seconds.
                        if watch_duration > 5:
                            video_interaction_map[video_interaction_id] = {"course_learner_id": course_learner_id, "video_id": video_id, "type": "video", "watch_duration": watch_duration, "times_forward_seek": times_forward_seek, "duration_forward_seek": duration_forward_seek, "times_backward_seek": times_backward_seek, "duration_backward_seek": duration_backward_seek, "times_speed_up": times_speed_up, "times_speed_down": times_speed_down, "start_time": video_start_time, "end_time": video_end_time}
                        # A pause (not a stop) arms the pause-duration tracking
                        # finalized by the next play event.
                        if event_logs[i]["event_type"] in ["pause_video", "edx.video.paused"]:
                            pause_check = True
                            pause_start_time = video_end_time
                        # For video seek event
                        times_forward_seek = 0
                        duration_forward_seek = 0
                        times_backward_seek = 0
                        duration_backward_seek = 0
                        # For video speed change event
                        speed_change_last_time = ""
                        times_speed_up = 0
                        times_speed_down = 0
                        # For video general information
                        video_start_time = ""
                        video_id = ""
                        final_time = event_logs[i]["event_time"]
                        continue
                    # 3/4 Page changed/Session closed: a non-video event also
                    # closes the watch segment.
                    if event_logs[i]["event_type"] not in video_event_types:
                        video_end_time = event_logs[i]["event_time"]
                        watch_duration = (video_end_time - video_start_time).seconds
                        video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_start_time) + "_" + str(video_end_time)
                        if watch_duration > 5:
                            video_interaction_map[video_interaction_id] = {"course_learner_id": course_learner_id, "video_id": video_id, "type": "video", "watch_duration": watch_duration, "times_forward_seek": times_forward_seek, "duration_forward_seek": duration_forward_seek, "times_backward_seek": times_backward_seek, "duration_backward_seek": duration_backward_seek, "times_speed_up": times_speed_up, "times_speed_down": times_speed_down, "start_time": video_start_time, "end_time": video_end_time}
                        # For video seek event
                        times_forward_seek = 0
                        duration_forward_seek = 0
                        times_backward_seek = 0
                        duration_backward_seek = 0
                        # For video speed change event
                        speed_change_last_time = ""
                        times_speed_up = 0
                        times_speed_down = 0
                        # For video general information
                        video_start_time = ""
                        video_id = ""
                        final_time = event_logs[i]["event_time"]
                        continue
        # Events after the last closed segment spill over to the next day's run.
        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)
            remaining_learner_logs[course_learner_id] = new_logs
    # Output remaining logs (or delete the spill file on the course's last day).
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_video_interaction_log_path, "w")
        output_file.write(json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_video_interaction_log_path)
    # Insert one video_interaction row per extracted segment.
    for interaction_id in video_interaction_map.keys():
        video_interaction_id = interaction_id
        course_learner_id = video_interaction_map[interaction_id]["course_learner_id"]
        video_id = video_interaction_map[interaction_id]["video_id"]
        duration = process_null(video_interaction_map[interaction_id]["watch_duration"])
        times_forward_seek = process_null(video_interaction_map[interaction_id]["times_forward_seek"])
        duration_forward_seek = process_null(video_interaction_map[interaction_id]["duration_forward_seek"])
        times_backward_seek = process_null(video_interaction_map[interaction_id]["times_backward_seek"])
        duration_backward_seek = process_null(video_interaction_map[interaction_id]["duration_backward_seek"])
        times_speed_up = process_null(video_interaction_map[interaction_id]["times_speed_up"])
        times_speed_down = process_null(video_interaction_map[interaction_id]["times_speed_down"])
        start_time = video_interaction_map[interaction_id]["start_time"]
        end_time = video_interaction_map[interaction_id]["end_time"]
        # Pause metrics are only present when a pause was finalized by a
        # subsequent play event; default to 0 otherwise.
        if "times_pause" in video_interaction_map[interaction_id]:
            times_pause = process_null(video_interaction_map[interaction_id]["times_pause"])
            duration_pause = process_null(video_interaction_map[interaction_id]["duration_pause"])
        else:
            times_pause = 0
            duration_pause = 0
        array = [video_interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time]
        sql = "insert into video_interaction(interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute(sql, array)
        except Exception as e:
            # NOTE(review): insert failures (e.g. duplicate keys) are
            # silently ignored; consider at least logging e.
            pass