def RemoveCourseRecords(course_log_path, course_code, mode, cursor):
    """Delete one course's records from the database (and its temp files).

    Depending on *mode*:
      - "log":      clears the per-learner event tables and removes the
                    temporary (non-directory) files left in the course folder.
      - "metadata": clears course / element / learner metadata tables.
      - "survey":   clears the survey description and response tables.

    course_log_path -- root directory holding per-course folders
    course_code     -- course folder name under course_log_path
    mode            -- one of "log", "metadata", "survey"
    cursor          -- open DB-API cursor (MySQL-style %s placeholders)
    """
    metadata_path = str(course_log_path + course_code + "/metadata/")
    course_metadata_map = ExtractCourseInformation(metadata_path)
    course_id = course_metadata_map["course_id"]

    # Collect every course_learner_id registered for this course.
    # Values are passed as bind parameters instead of being concatenated
    # into the SQL text, to avoid injection / quoting problems.
    course_learner_id_set = set()
    cursor.execute(
        "SELECT learner_index.course_learner_id FROM learner_index "
        "WHERE learner_index.course_id = %s;", (course_id,))
    for result in cursor.fetchall():
        course_learner_id_set.add(result[0])

    if mode == "log":
        # Table names come from this fixed whitelist, so interpolating them
        # into the statement text is safe; learner ids are always bound.
        tables = [
            "video_interaction", "submissions", "assessments",
            "quiz_sessions", "forum_interaction", "forum_sessions",
            "sessions"
        ]
        for table in tables:
            sql = ("delete from " + table + " where " + table +
                   ".course_learner_id = %s;")
            for course_learner_id in course_learner_id_set:
                cursor.execute(sql, (course_learner_id,))
        # Remove all of the temporary files (non-directories) left directly
        # under the course folder.
        course_dir = str(course_log_path + course_code + "/")
        for file in os.listdir(course_dir):
            path = str(course_dir + file)
            if not os.path.isdir(path):
                os.remove(path)

    if mode == "metadata":
        # Quiz_questions rows are keyed by element ids belonging to this
        # course, hence the multi-table delete joined on course_elements.
        cursor.execute(
            "delete quiz_questions from quiz_questions, course_elements "
            "where course_elements.course_id = %s "
            "and course_elements.element_id = quiz_questions.question_id;",
            (course_id,))
        for table in ["courses", "course_elements"]:
            sql = ("delete from " + table + " where " + table +
                   ".course_id = %s;")
            cursor.execute(sql, (course_id,))
        for table in ["course_learner", "learner_demographic",
                      "learner_index"]:
            sql = ("delete from " + table + " where " + table +
                   ".course_learner_id = %s;")
            for course_learner_id in course_learner_id_set:
                cursor.execute(sql, (course_learner_id,))

    if mode == "survey":
        # Survey_descriptions table
        cursor.execute(
            "delete from survey_descriptions "
            "where survey_descriptions.course_id = %s;", (course_id,))
        # Survey_responses table
        for course_learner_id in course_learner_id_set:
            cursor.execute(
                "delete from survey_responses "
                "where survey_responses.course_learner_id = %s;",
                (course_learner_id,))
def quiz_sessions(metadata_path, daily_log_path, remaining_forum_session_log_path, cursor):
    """Extract per-learner quiz sessions from one day of edX tracking logs.

    Reads *daily_log_path* line by line, keeps quiz-related events, splits
    each learner's events into sessions separated by gaps of more than 30
    minutes (or by a non-quiz event), merges overlapping/adjacent intervals,
    and inserts sessions longer than 5 seconds into the quiz_sessions table.

    Events occurring after the last closed session are written back to
    *remaining_forum_session_log_path* so the next day's run can continue
    them; once the course end date is reached that file is removed instead.

    metadata_path                    -- course metadata directory
    daily_log_path                   -- path of one day's tracking-log file
    remaining_forum_session_log_path -- JSON carry-over file between days
    cursor                           -- open DB-API cursor
    """
    utc = pytz.UTC
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    # Quiz-related events
    quiz_event_types = []
    # Problem check
    quiz_event_types.append("problem_check")
    # Server
    quiz_event_types.append("save_problem_check")
    quiz_event_types.append("problem_check_fail")
    quiz_event_types.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects
    # Check for a problem and it is graded successfully.
    quiz_event_types.append("problem_graded")
    # The server emits problem_rescore events when a problem is
    # successfully rescored.
    quiz_event_types.append("problem_rescore")
    quiz_event_types.append("problem_rescore_fail")
    quiz_event_types.append("problem_reset")
    # event_source: server
    quiz_event_types.append("reset_problem")
    quiz_event_types.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    quiz_event_types.append("problem_save")
    # event_source: server
    quiz_event_types.append("save_problem_fail")
    quiz_event_types.append("save_problem_success")
    # Show answer
    quiz_event_types.append("problem_show")
    quiz_event_types.append("showanswer")
    quiz_event_types.append("edx.problem.hint.demandhint_displayed")
    quiz_event_types.append("edx.problem.hint.feedback_displayed")
    child_parent_map = course_metadata_map["child_parent_map"]
    learner_logs = {}
    remaining_learner_logs = {}
    # NOTE(review): this local dict shadows the function name quiz_sessions
    # inside this scope.
    quiz_sessions = {}
    # Read remaining event logs carried over from the previous day's run.
    if os.path.exists(remaining_forum_session_log_path):
        remaining_input_file = open(remaining_forum_session_log_path)
        learner_logs = json.loads(remaining_input_file.read(),
                                  object_hook=json_util.object_hook)
    # Course_learner_id set
    # NOTE(review): this set is built but never used below (the membership
    # test uses learner_logs.has_key instead).
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)
    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)
        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"][
                "user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue
        # For quiz session separation
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = str(jsonObject["event_type"])
        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)
        event_time = jsonObject["time"]
        # Check whether the event record belongs to that day: the daily log
        # file name is expected to contain the YYYY-MM-DD date.
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue
        # Normalize the ISO timestamp (drop fractional seconds/offset) and
        # parse it as a UTC-aware datetime.
        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time,
                                                "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)
        if learner_logs.has_key(course_learner_id):
            learner_logs[course_learner_id].append({
                "event_time": event_time,
                "event_type": event_type
            })
        else:
            learner_logs[course_learner_id] = [{
                "event_time": event_time,
                "event_type": event_type
            }]
    input_file.close()
    # For quiz session separation: walk each learner's chronologically
    # sorted events with a small state machine (session_id == "" means no
    # session is currently open).
    for learner in learner_logs.keys():
        course_learner_id = learner
        event_logs = learner_logs[learner]
        # Sorting (Python 2 cmp= keyword; cmp_datetime is a project helper)
        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))
        session_id = ""
        start_time = ""
        end_time = ""
        final_time = ""
        for i in range(len(event_logs)):
            if session_id == "":
                # No open session: only a problem-block URL event can open
                # one (the question id is extracted from the URL path).
                if "problem+block" in event_logs[i][
                        "event_type"] or "_problem;_" in event_logs[i][
                            "event_type"]:
                    event_type_array = event_logs[i]["event_type"].split("/")
                    if "problem+block" in event_logs[i]["event_type"]:
                        question_id = event_type_array[4]
                    if "_problem;_" in event_logs[i]["event_type"]:
                        question_id = event_type_array[6].replace(";_", "/")
                    if question_id in child_parent_map.keys():
                        parent_block_id = child_parent_map[question_id]
                        session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                        start_time = event_logs[i]["event_time"]
                        end_time = event_logs[i]["event_time"]
            else:
                if "problem+block" in event_logs[i][
                        "event_type"] or "_problem;_" in event_logs[i][
                            "event_type"] or event_logs[i][
                                "event_type"] in quiz_event_types:
                    # Quiz event while a session is open.
                    if event_logs[i][
                            "event_time"] > end_time + datetime.timedelta(
                                hours=0.5):
                        # Gap > 30 minutes: close the current session ...
                        if quiz_sessions.has_key(session_id):
                            quiz_sessions[session_id]["time_array"].append({
                                "start_time": start_time,
                                "end_time": end_time
                            })
                        else:
                            quiz_sessions[session_id] = {
                                "course_learner_id": course_learner_id,
                                "time_array": [{
                                    "start_time": start_time,
                                    "end_time": end_time
                                }]
                            }
                        final_time = event_logs[i]["event_time"]
                        # ... and try to open a new one from this event.
                        # NOTE(review): this inner condition is always true
                        # here, and for bare quiz_event_types (no URL) the
                        # question_id below keeps its value from a previous
                        # URL event — confirm this carry-over is intended.
                        if "problem+block" in event_logs[i][
                                "event_type"] or "_problem;_" in event_logs[i][
                                    "event_type"] or event_logs[i][
                                        "event_type"] in quiz_event_types:
                            event_type_array = event_logs[i][
                                "event_type"].split("/")
                            if "problem+block" in event_logs[i]["event_type"]:
                                question_id = event_type_array[4]
                            if "_problem;_" in event_logs[i]["event_type"]:
                                question_id = event_type_array[6].replace(
                                    ";_", "/")
                            if question_id in child_parent_map.keys():
                                parent_block_id = child_parent_map[question_id]
                                session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                            else:
                                session_id = ""
                                start_time = ""
                                end_time = ""
                    else:
                        # Within 30 minutes: extend the open session.
                        end_time = event_logs[i]["event_time"]
                else:
                    # Non-quiz event: it terminates the open session.
                    # Extend end_time only if the event is within 30 minutes.
                    if event_logs[i][
                            "event_time"] <= end_time + datetime.timedelta(
                                hours=0.5):
                        end_time = event_logs[i]["event_time"]
                    if quiz_sessions.has_key(session_id):
                        quiz_sessions[session_id]["time_array"].append({
                            "start_time": start_time,
                            "end_time": end_time
                        })
                    else:
                        quiz_sessions[session_id] = {
                            "course_learner_id": course_learner_id,
                            "time_array": [{
                                "start_time": start_time,
                                "end_time": end_time
                            }]
                        }
                    final_time = event_logs[i]["event_time"]
                    session_id = ""
                    start_time = ""
                    end_time = ""
        # Events after the last closed session are carried over to the next
        # day's run.
        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)
            remaining_learner_logs[course_learner_id] = new_logs
    # Output remaining logs, unless this day is the course end date.
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_forum_session_log_path, "w")
        output_file.write(
            json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_forum_session_log_path)
    # To compress the session event_logs: merge intervals of the same
    # session that are less than 30 minutes apart.
    for session_id in quiz_sessions.keys():
        if len(quiz_sessions[session_id]["time_array"]) > 1:
            start_time = ""
            end_time = ""
            updated_time_array = []
            for i in range(len(quiz_sessions[session_id]["time_array"])):
                if i == 0:
                    start_time = quiz_sessions[session_id]["time_array"][i][
                        "start_time"]
                    end_time = quiz_sessions[session_id]["time_array"][i][
                        "end_time"]
                else:
                    if quiz_sessions[session_id]["time_array"][i][
                            "start_time"] > end_time + datetime.timedelta(
                                hours=0.5):
                        # Gap: flush the accumulated interval and restart.
                        updated_time_array.append({
                            "start_time": start_time,
                            "end_time": end_time
                        })
                        start_time = quiz_sessions[session_id]["time_array"][
                            i]["start_time"]
                        end_time = quiz_sessions[session_id]["time_array"][i][
                            "end_time"]
                        if i == len(
                                quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({
                                "start_time": start_time,
                                "end_time": end_time
                            })
                    else:
                        # Overlap/adjacent: absorb into the current interval.
                        end_time = quiz_sessions[session_id]["time_array"][i][
                            "end_time"]
                        if i == len(
                                quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({
                                "start_time": start_time,
                                "end_time": end_time
                            })
            quiz_sessions[session_id]["time_array"] = updated_time_array
    # Persist sessions longer than 5 seconds; duplicate-key failures are
    # deliberately ignored (best-effort insert).
    for session_id in quiz_sessions.keys():
        course_learner_id = quiz_sessions[session_id]["course_learner_id"]
        for i in range(len(quiz_sessions[session_id]["time_array"])):
            start_time = quiz_sessions[session_id]["time_array"][i][
                "start_time"]
            end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
            if start_time < end_time:
                duration = process_null((end_time - start_time).days * 24 *
                                        60 * 60 +
                                        (end_time - start_time).seconds)
                final_session_id = session_id + "_" + str(
                    start_time) + "_" + str(end_time)
                if duration > 5:
                    array = (final_session_id, course_learner_id, start_time,
                             end_time, duration)
                    sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                    try:
                        cursor.execute(sql, array)
                    except Exception as e:
                        pass
def survey_mode(metadata_path, survey_path, cursor, pre_id_index, post_id_index): description_records = [] response_records = [] # Collect course information course_metadata_map = ExtractCourseInformation(metadata_path) course_id = course_metadata_map["course_id"] files = os.listdir(survey_path) learner_id_map = {} # Processing ID information for file in files: if "anon-ids.csv" in file: id_file = open(survey_path + file, "r") id_reader = csv.reader(id_file) id_reader.next() for row in id_reader: global_id = row[0].replace("\"","") anonymized_id = row[1].replace("\"","") learner_id_map[anonymized_id] = global_id # Processing Pre-survey information for file in files: if "pre-survey" in file: pre_file = open(survey_path + file, "r") pre_reader = csv.reader(pre_file) question_id_row = pre_reader.next() question_description_row = pre_reader.next() for i in range(len(list(question_id_row))): question_id = course_id + "_pre_" + question_id_row[i].replace("\"","") question_description = question_description_row[i].replace("\'", "\\'") array = [question_id, course_id, "pre", question_description] description_records.append(array) for row in pre_reader: learner_id = row[pre_id_index] print learner_id if learner_id in learner_id_map.keys(): course_learner_id = course_id + "_" + learner_id_map[learner_id] for i in range(len(list(question_id_row))): question_id = course_id + "_pre_" + question_id_row[i].replace("\"","") response_id = course_learner_id + "_" + "pre" + "_" + question_id_row[i].replace("\"","") answer = row[i] array = [response_id, course_learner_id, question_id, answer] response_records.append(array) pre_file.close() # Processing Post-survey information for file in files: if "post-survey" in file: post_file = open(survey_path + file, "r") post_reader = csv.reader(post_file) question_id_row = post_reader.next() question_description_row = post_reader.next() for i in range(len(list(question_id_row))): question_id = course_id + "_post_" + 
question_id_row[i].replace("\"","") question_description = question_description_row[i].replace("\'", "\\'") array = [question_id, course_id, "post", question_description] description_records.append(array) for row in post_reader: learner_id = row[post_id_index] if learner_id in learner_id_map.keys(): course_learner_id = course_id + "_" + learner_id_map[learner_id] for i in range(len(list(question_id_row))): question_id = course_id + "_post_" + question_id_row[i].replace("\"","") response_id = course_learner_id + "_post_" + question_id_row[i].replace("\"","") answer = row[i] array = [response_id, course_learner_id, question_id, answer] response_records.append(array) post_file.close() # Database version for array in description_records: question_id = array[0] course_id = array[1] question_type = array[2] question_description = array[3] sql = "insert into survey_descriptions (question_id, course_id, question_type, question_description) values (%s,%s,%s,%s)" data = (question_id, course_id, question_type, question_description) cursor.execute(sql, data) for array in response_records: response_id = array[0] course_learner_id = array[1] question_id = array[2] answer = array[3] sql = "insert into survey_responses (response_id, course_learner_id, question_id, answer) values (%s,%s,%s,%s)" data = (response_id, course_learner_id, question_id, answer) cursor.execute(sql, data) '''
def sessions(metadata_path, daily_log_path, remaining_session_log_path, cursor):
    """Split one day of edX tracking logs into general browsing sessions.

    Groups each learner's events into sessions: a gap of more than 30
    minutes, or a "page_close" event, ends the current session. Sessions
    longer than 5 seconds are written to the sessions table ("replace into",
    so re-runs overwrite). Events after the last closed session are carried
    over to *remaining_session_log_path* for the next day's run; once the
    course end date is reached that file is removed instead.

    metadata_path              -- course metadata directory
    daily_log_path             -- path of one day's tracking-log file
    remaining_session_log_path -- JSON carry-over file between days
    cursor                     -- open DB-API cursor
    """
    utc = pytz.UTC
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    learner_logs = {}
    remaining_learner_logs = {}
    # Read remaining event logs carried over from the previous day's run.
    if os.path.exists(remaining_session_log_path):
        remaining_input_file = open(remaining_session_log_path)
        learner_logs = json.loads(remaining_input_file.read(),
                                  object_hook=json_util.object_hook)
    # Course_learner_id set: tracks which learners already have a log list.
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)
    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)
        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"][
                "user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue
        # For session separation
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = str(jsonObject["event_type"])
        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)
        event_time = jsonObject["time"]
        # Check whether the event record belongs to that day: the daily log
        # file name is expected to contain the YYYY-MM-DD date.
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue
        # Normalize the ISO timestamp and parse as UTC-aware datetime.
        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time,
                                                "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)
        if course_learner_id in course_learner_id_set:
            learner_logs[course_learner_id].append({
                "event_time": event_time,
                "event_type": event_type
            })
        else:
            learner_logs[course_learner_id] = [{
                "event_time": event_time,
                "event_type": event_type
            }]
            course_learner_id_set.add(course_learner_id)
    input_file.close()
    # For session separation: walk each learner's chronologically sorted
    # events; start_time == "" means no session is currently open.
    for learner in learner_logs.keys():
        course_learner_id = learner
        event_logs = learner_logs[learner]
        # Sorting (Python 2 cmp= keyword; cmp_datetime is a project helper)
        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))
        session_id = ""
        start_time = ""
        end_time = ""
        final_time = ""
        for i in range(len(event_logs)):
            if start_time == "":
                # Initialization: open a session at the first event.
                start_time = event_logs[i]["event_time"]
                end_time = event_logs[i]["event_time"]
            else:
                if event_logs[i]["event_time"] > end_time + datetime.timedelta(
                        hours=0.5):
                    # Gap > 30 minutes: close the current session and store
                    # it if longer than 5 seconds; failures are ignored
                    # (best-effort insert).
                    session_id = course_learner_id + "_" + str(
                        start_time) + "_" + str(end_time)
                    duration = (end_time - start_time).days * 24 * 60 * 60 + (
                        end_time - start_time).seconds
                    if duration > 5:
                        array = (session_id, course_learner_id, start_time,
                                 end_time, process_null(duration))
                        sql = "replace into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                        try:
                            cursor.execute(sql, array)
                        except Exception as e:
                            pass
                    final_time = event_logs[i]["event_time"]
                    # Re-initialization: open a new session at this event.
                    session_id = ""
                    start_time = event_logs[i]["event_time"]
                    end_time = event_logs[i]["event_time"]
                else:
                    if event_logs[i]["event_type"] == "page_close":
                        # Explicit page close ends the session immediately.
                        end_time = event_logs[i]["event_time"]
                        session_id = course_learner_id + "_" + str(
                            start_time) + "_" + str(end_time)
                        duration = (end_time - start_time).days * 24 * 60 * 60 + (
                            end_time - start_time).seconds
                        if duration > 5:
                            array = (session_id, course_learner_id, start_time,
                                     end_time, process_null(duration))
                            sql = "replace into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
                            try:
                                cursor.execute(sql, array)
                            except Exception as e:
                                pass
                        # Re-initialization (no session open afterwards).
                        session_id = ""
                        start_time = ""
                        end_time = ""
                        final_time = event_logs[i]["event_time"]
                    else:
                        # Same session: just extend its end time.
                        end_time = event_logs[i]["event_time"]
        # Events after the last closed session are carried over.
        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)
            remaining_learner_logs[course_learner_id] = new_logs
    # Output remaining logs, unless this day is the course end date.
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_session_log_path, "w")
        output_file.write(
            json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_session_log_path)
def learner_mode(metadata_path, course_code, cursor):
    """Load course structure and learner metadata into the database.

    Fills, in order: courses, course_elements, quiz_questions,
    learner_index, course_learner, and learner_demographic, from the
    tab-separated SQL dump files found in *metadata_path*. For courses
    run in or before 1T2015, forum posts are additionally imported from
    the .mongo dumps via forum_interaction_mongo.

    metadata_path -- course metadata directory (dump files + course xml)
    course_code   -- course folder/run name (used to detect old runs)
    cursor        -- open DB-API cursor
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    array = (course_metadata_map["course_id"],
             course_metadata_map["course_name"],
             course_metadata_map["start_time"],
             course_metadata_map["end_time"])
    sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
    cursor.execute(sql, array)
    # Course_element table
    for element_id in course_metadata_map["element_time_map"].keys():
        element_start_time = course_metadata_map["element_time_map"][
            element_id]
        # Some contents released just one hour earlier than the hour of start time.
        # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00.
        # However, those 2nd week contents are counted as 1st week.
        # In order to avoid the above situation, dates (not datetimes) are
        # compared here. (Python 2 integer division gives the week bucket.)
        week = process_null(
            getDayDiff(course_metadata_map["start_time"].date(),
                       element_start_time.date()) / 7 + 1)
        array = (element_id,
                 course_metadata_map["element_type_map"][element_id], week,
                 course_metadata_map["course_id"])
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)
    # Quiz_question table: walk up the child->parent chain until a block
    # with a known type is found; pick up the first due date seen on the way.
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    for question_id in quiz_question_map:
        question_due = ""
        question_weight = quiz_question_map[question_id]
        quiz_question_parent = course_metadata_map["child_parent_map"][
            question_id]
        if (question_due == "") and (quiz_question_parent
                                     in element_time_map_due):
            question_due = element_time_map_due[quiz_question_parent]
        while not block_type_map.has_key(quiz_question_parent):
            quiz_question_parent = course_metadata_map["child_parent_map"][
                quiz_question_parent]
            if (question_due == "") and (quiz_question_parent
                                         in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]
        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)
        array = (question_id, quiz_question_type, question_weight,
                 question_due)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)
    files = os.listdir(metadata_path)
    # Learner_demographic table
    learner_mail_map = {}
    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}
    # Enrolled learners set
    enrolled_learner_set = set()
    # NOTE(review): course_id keeps the value of the LAST enrollment record
    # processed below; it is later reused when building demographic rows.
    course_id = ""
    # Processing student_courseenrollment data (tab-separated dump;
    # columns: [1]=user id, [2]=course id, [3]=enrollment timestamp)
    for file in files:
        if "student_courseenrollment" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                course_id = record[2]
                time = datetime.datetime.strptime(record[3],
                                                  "%Y-%m-%d %H:%M:%S")
                course_learner_id = course_id + "_" + global_learner_id
                # Only learners enrolled before the course end count as
                # enrolled (cmp_datetime is a project helper).
                if cmp_datetime(course_metadata_map["end_time"], time):
                    enrolled_learner_set.add(global_learner_id)
                array = (global_learner_id, course_id, course_learner_id)
                sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
                cursor.execute(sql, array)
                course_learner_map[global_learner_id] = course_learner_id
                learner_enrollment_time_map[global_learner_id] = time
            input_file.close()
    # Processing auth_user data (column [4] is the email address)
    for file in files:
        if "auth_user-" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                if record[0] in enrolled_learner_set:
                    learner_mail_map[record[0]] = record[4]
            input_file.close()
    # Processing certificates_generatedcertificate data
    # NOTE(review): despite the names, these counters distinguish learners
    # with/without an enrollment record, not certified vs uncertified.
    num_uncertifiedLearners = 0
    num_certifiedLearners = 0
    for file in files:
        if "certificates_generatedcertificate" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                final_grade = process_null(record[3])
                enrollment_mode = record[14].replace("\n", "")
                certificate_status = record[7]
                register_time = ""
                if course_learner_map.has_key(global_learner_id):
                    register_time = learner_enrollment_time_map[
                        global_learner_id]
                register_time = process_null(register_time)
                if course_learner_map.has_key(global_learner_id):
                    num_certifiedLearners += 1
                    array = (course_learner_map[global_learner_id],
                             final_grade, enrollment_mode, certificate_status,
                             register_time)
                    sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
                    cursor.execute(sql, array)
                else:
                    num_uncertifiedLearners += 1
            input_file.close()
    # Processing auth_userprofile data (columns: [7]=gender, [9]=year of
    # birth, [10]=education level, [13]=country)
    for file in files:
        if "auth_userprofile" in file:
            input_file = open(str(metadata_path + file), "r")
            input_file.readline()
            lines = input_file.readlines()
            for line in lines:
                record = line.split("\t")
                global_learner_id = record[1]
                gender = record[7]
                # NOTE(review): process_null is applied twice here — the
                # inner call looks redundant; confirm before changing.
                year_of_birth = process_null(process_null(record[9]))
                level_of_education = record[10]
                country = record[13]
                course_learner_id = process_null(course_id + "_" +
                                                 global_learner_id)
                if global_learner_id in enrolled_learner_set:
                    array = (course_learner_id, gender, year_of_birth,
                             level_of_education, country,
                             learner_mail_map[global_learner_id])
                    sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
                    cursor.execute(sql, array)
            input_file.close()
    # Generating forum_interaction records for courses starting before 1T2015
    if "1T2015" in course_code or "2014" in course_code or "2013" in course_code:
        forum_interaction_mongo(metadata_path, cursor)
def forum_interaction(metadata_path, daily_log_path, cursor):
    """Scan one day of tracking logs and upsert forum posts.

    Keeps only forum creation events (thread / response / comment),
    normalizes title and body text, and writes each post occurring before
    the course end time into forum_interaction with "replace into" so
    re-runs overwrite. Insert failures are silently ignored (best-effort).

    metadata_path  -- course metadata directory
    daily_log_path -- path of one day's tracking-log file (name contains
                      the YYYY-MM-DD date)
    cursor         -- open DB-API cursor
    """
    course_metadata_map = ExtractCourseInformation(metadata_path)
    # Forum-related events we care about.
    forum_event_types = [
        "edx.forum.thread.created",
        "edx.forum.response.created",
        "edx.forum.comment.created",
    ]
    log_file = open(daily_log_path, "r")
    for line in log_file:
        record = json.loads(line)
        event_type = record["event_type"]
        if event_type not in forum_event_types:
            continue
        # Skip records without a usable user_id.
        context = record["context"]
        if "user_id" not in context or context["user_id"] == "" or context["user_id"] == None:
            continue
        post_timestamp = record["time"]
        # Ignore events whose date does not match this day's file.
        log_date = post_timestamp[0:10]
        if log_date not in daily_log_path:
            print("Log not belonging to the day...\t" + log_date)
            continue
        event = record["event"]
        post_id = event["id"]
        course_learner_id = context["course_id"] + "_" + str(context["user_id"])
        # Classify the post and pull the type-specific fields.
        post_type = ""
        post_title = ""
        post_parent_id = ""
        post_thread_id = ""
        if event_type == "edx.forum.thread.created":
            post_type = "Thread_" + event["thread_type"]
            post_title = event["title"]
        elif event_type == "edx.forum.response.created":
            post_type = "Response"
            post_thread_id = event["discussion"]["id"]
        elif event_type == "edx.forum.comment.created":
            post_type = "Comment"
            post_parent_id = event["response"]["id"]
        post_content = event["body"]
        # Normalize the ISO timestamp (drop fractional seconds) to datetime.
        post_timestamp = datetime.datetime.strptime(
            post_timestamp[0:19].replace("T", " "), "%Y-%m-%d %H:%M:%S")
        # Pre-processing title & content: flatten newlines and escape
        # backslashes/quotes for the SQL layer, then clean the encoding.
        for old, new in (("\n", " "), ("\\", "\\\\"), ("\'", "\\'")):
            post_title = post_title.replace(old, new)
            post_content = post_content.replace(old, new)
        post_title = cleanUnicode(post_title)
        post_content = cleanUnicode(post_content)
        if post_timestamp < course_metadata_map["end_time"]:
            values = [
                post_id, course_learner_id, post_type, post_title,
                post_content, post_timestamp, post_parent_id, post_thread_id
            ]
            sql = "replace into forum_interaction(post_id, course_learner_id, post_type, post_title, post_content, post_timestamp, post_parent_id, post_thread_id) values (%s,%s,%s,%s,%s,%s,%s,%s);"
            try:
                cursor.execute(sql, values)
            except Exception:
                pass
    log_file.close()
def forum_interaction_mongo(metadata_path, cursor):
    """Import forum posts from the .mongo dump files in *metadata_path*.

    Each line of a dump is one MongoDB document. Posts are classified
    (thread / comment / comment-reply), their timestamps normalized to
    datetime, title and body sanitized, and rows occurring before the
    course end time written to forum_interaction via "replace into".

    metadata_path -- course metadata directory containing *.mongo dumps
    cursor        -- open DB-API cursor
    """
    course_metadata_map = ExtractCourseInformation(metadata_path)
    for dump_name in os.listdir(metadata_path):
        if ".mongo" not in dump_name:
            continue
        forum_file = open(str(metadata_path + dump_name), "r")
        for line in forum_file:
            doc = json.loads(line)
            post_id = doc["_id"]["$oid"]
            course_learner_id = doc["course_id"] + "_" + doc["author_id"]
            post_type = doc["_type"]
            if post_type == "CommentThread":
                post_type += "_" + doc["thread_type"]
            # A comment with a parent is a reply to another comment.
            if "parent_id" in doc and doc["parent_id"] != "":
                post_type = "Comment_Reply"
            post_title = doc["title"] if "title" in doc else ""
            post_content = doc["body"]
            post_timestamp = doc["created_at"]["$date"]
            # Millisecond epoch integer -> datetime (exact int type check
            # kept on purpose: Python 2 longs fall through unchanged).
            if type(post_timestamp) == type(100):
                post_timestamp = strftime("%Y-%m-%d %H:%M:%S",
                                          gmtime(post_timestamp / 1000))
                post_timestamp = datetime.datetime.strptime(
                    post_timestamp, "%Y-%m-%d %H:%M:%S")
            # ISO-8601 string -> datetime (Python 2 unicode type).
            if isinstance(post_timestamp, unicode):
                post_timestamp = datetime.datetime.strptime(
                    post_timestamp[0:19].replace("T", " "),
                    "%Y-%m-%d %H:%M:%S")
            post_parent_id = doc["parent_id"]["$oid"] if "parent_id" in doc else ""
            post_thread_id = doc["comment_thread_id"]["$oid"] if "comment_thread_id" in doc else ""
            # Flatten newlines and escape backslashes/quotes for SQL.
            for old, new in (("\n", " "), ("\\", "\\\\"), ("\'", "\\'")):
                post_title = post_title.replace(old, new)
                post_content = post_content.replace(old, new)
            if post_timestamp < course_metadata_map["end_time"]:
                values = [
                    post_id, course_learner_id, post_type, post_title,
                    post_content, post_timestamp, post_parent_id,
                    post_thread_id
                ]
                sql = "replace into forum_interaction(post_id, course_learner_id, post_type, post_title, post_content, post_timestamp, post_parent_id, post_thread_id) values (%s,%s,%s,%s,%s,%s,%s,%s);"
                cursor.execute(sql, values)
        forum_file.close()
def forum_interaction(metadata_path, cursor):
    """Collect forum posts from .mongo dumps, then bulk-insert them.

    Two-phase variant of the mongo importer: phase one parses every
    *.mongo dump into forum_interaction_records; phase two inserts the
    collected rows into forum_interaction.

    NOTE(review): this appears to sit inside a triple-quoted (disabled)
    legacy section and duplicates the name of the daily-log
    forum_interaction defined earlier — confirm which one is live.

    metadata_path -- course metadata directory containing *.mongo dumps
    cursor        -- open DB-API cursor
    """
    forum_interaction_records = []
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    # Processing forum data
    files = os.listdir(metadata_path)
    for file in files:
        if ".mongo" in file:
            forum_file = open(metadata_path + file, "r")
            for line in forum_file:
                jsonObject = json.loads(line)
                post_id = jsonObject["_id"]["$oid"]
                course_learner_id = jsonObject["course_id"] + "_" + jsonObject[
                    "author_id"]
                post_type = jsonObject["_type"]
                if post_type == "CommentThread":
                    post_type += "_" + jsonObject["thread_type"]
                # A comment with a parent is a reply to another comment.
                if "parent_id" in jsonObject and jsonObject["parent_id"] != "":
                    post_type = "Comment_Reply"
                post_title = ""
                if "title" in jsonObject:
                    post_title = jsonObject["title"]
                post_content = jsonObject["body"]
                post_timestamp = jsonObject["created_at"]["$date"]
                # Millisecond epoch integer -> datetime.
                if type(post_timestamp) == type(100):
                    post_timestamp = strftime("%Y-%m-%d %H:%M:%S",
                                              gmtime(post_timestamp / 1000))
                    post_timestamp = datetime.datetime.strptime(
                        post_timestamp, "%Y-%m-%d %H:%M:%S")
                # ISO-8601 string -> datetime (Python 2 unicode type).
                if isinstance(post_timestamp, unicode):
                    post_timestamp = post_timestamp[0:19]
                    post_timestamp = post_timestamp.replace("T", " ")
                    post_timestamp = datetime.datetime.strptime(
                        post_timestamp, "%Y-%m-%d %H:%M:%S")
                post_parent_id = ""
                if "parent_id" in jsonObject:
                    post_parent_id = jsonObject["parent_id"]["$oid"]
                post_thread_id = ""
                if "comment_thread_id" in jsonObject:
                    post_thread_id = jsonObject["comment_thread_id"]["$oid"]
                # Flatten newlines and escape backslashes/quotes for SQL.
                post_title = post_title.replace("\n", " ")
                post_title = post_title.replace("\\", "\\\\")
                post_title = post_title.replace("\'", "\\'")
                post_content = post_content.replace("\n", " ")
                post_content = post_content.replace("\\", "\\\\")
                post_content = post_content.replace("\'", "\\'")
                if post_timestamp < course_metadata_map["end_time"]:
                    array = [
                        post_id, course_learner_id, post_type, post_title,
                        post_content, post_timestamp, post_parent_id,
                        post_thread_id
                    ]
                    forum_interaction_records.append(array)
            forum_file.close()
    # Database version
    for array in forum_interaction_records:
        post_id = array[0]
        course_learner_id = array[1]
        post_type = array[2]
        post_title = cleanUnicode(array[3])
        post_content = cleanUnicode(array[4])
        post_timestamp = array[5]
        post_parent_id = array[6]
        post_thread_id = array[7]
        sql = "insert into forum_interaction(post_id, course_learner_id, post_type, post_title, post_content, post_timestamp, post_parent_id, post_thread_id) values (%s,%s,%s,%s,%s,%s,%s,%s);"
        data = (post_id, course_learner_id, post_type, post_title,
                post_content, post_timestamp, post_parent_id, post_thread_id)
        cursor.execute(sql, data)
    # File version
'''
def forum_sessions(metadata_path, log_path, cursor):
    """Split each learner's forum-related events from the daily logs into
    forum sessions (a gap of more than half an hour starts a new session)
    and insert the sessions into the forum_sessions table.

    metadata_path -- directory with the course metadata files
    log_path      -- directory with the daily event-log files
    cursor        -- open database cursor used for the inserts
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    start_date = course_metadata_map["start_date"]
    end_date = course_metadata_map["end_date"]
    current_date = start_date
    end_next_date = getNextDay(end_date)
    # Event types counted as forum activity (besides /discussion/ URLs)
    forum_event_types = []
    forum_event_types.append("edx.forum.comment.created")
    forum_event_types.append("edx.forum.response.created")
    forum_event_types.append("edx.forum.response.voted")
    forum_event_types.append("edx.forum.thread.created")
    forum_event_types.append("edx.forum.thread.voted")
    forum_event_types.append("edx.forum.searched")
    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    forum_sessions_record = []
    log_files = os.listdir(log_path)
    # Walk the course day by day; events at or after each learner's
    # final_time are carried over so sessions can span day boundaries
    while True:
        if current_date == end_next_date:
            break
        for log_file in log_files:
            if current_date in log_file:
                print log_file
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                # NOTE: the loop variable is rebound here from file name to
                # file object; it is closed at the end of this iteration
                log_file = open(log_path + log_file, "r")
                lines = log_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue
                    if jsonObject["context"]["user_id"] == "":
                        continue
                    # For forum session separation
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])
                    # Only forum-related events are collected
                    if "/discussion/" in event_type or event_type in forum_event_types:
                        # Everything except searches is folded into one type
                        if event_type != "edx.forum.searched":
                            event_type = "forum_activity"
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(
                                global_learner_id)
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(
                                event_time, "%Y-%m-%d %H:%M:%S")
                            # added for relevant elements
                            event_page = ""
                            if jsonObject.has_key("page"):
                                event_page = str(jsonObject["page"])
                            event_path = ""
                            if jsonObject.has_key("path"):
                                event_path = str(jsonObject["path"])
                            event_referer = ""
                            if jsonObject.has_key("referer"):
                                event_referer = str(jsonObject["referer"])
                            if course_learner_id in course_learner_id_set:
                                learner_all_event_logs[
                                    course_learner_id].append({
                                        "event_time": event_time,
                                        "event_type": event_type,
                                        "page": event_page,
                                        "path": event_path,
                                        "referer": event_referer
                                    })
                            else:
                                learner_all_event_logs[course_learner_id] = [{
                                    "event_time": event_time,
                                    "event_type": event_type,
                                    "page": event_page,
                                    "path": event_path,
                                    "referer": event_referer
                                }]
                                course_learner_id_set.add(course_learner_id)
                # For forum session separation
                for learner in learner_all_event_logs.keys():
                    course_learner_id = learner
                    event_logs = learner_all_event_logs[learner]
                    # NOTE(review): a course id containing "_" would be
                    # truncated here -- confirm the id format
                    course_id = course_learner_id.split("_")[0]
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime,
                                    key=operator.itemgetter('event_time'))
                    session_id = ""
                    start_time = ""
                    end_time = ""
                    times_search = 0
                    final_time = ""
                    # represent the elements which just before the session.
                    session_rel_element_pre = ""
                    # represent the elements which is mentioned in the session.
                    session_rel_element_cur = ""
                    for i in range(len(event_logs)):
                        rel_element_cur = courseElementsFinder(
                            event_logs[i], course_id)
                        if session_id == "":
                            if event_logs[i]["event_type"] in [
                                    "forum_activity", "edx.forum.searched"
                            ]:
                                # Initialization
                                session_id = "forum_session_" + course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                                if event_logs[i][
                                        "event_type"] == "edx.forum.searched":
                                    times_search += 1
                                # Added for relevant element id
                                session_rel_element_cur = rel_element_cur
                        else:
                            if event_logs[i]["event_type"] in [
                                    "forum_activity", "edx.forum.searched"
                            ]:
                                # A gap of more than half an hour closes the
                                # current session and starts a new one
                                if event_logs[i][
                                        "event_time"] > end_time + datetime.timedelta(
                                            hours=0.5):
                                    session_id = session_id + "_" + str(
                                        start_time) + "_" + str(end_time)
                                    duration = (end_time - start_time
                                                ).days * 24 * 60 * 60 + (
                                                    end_time -
                                                    start_time).seconds
                                    # Sessions of 5 seconds or less are dropped
                                    if duration > 5:
                                        rel_element_id = ""
                                        if session_rel_element_cur != "":
                                            rel_element_id = session_rel_element_cur
                                        else:
                                            rel_element_id = session_rel_element_pre
                                        array = [
                                            session_id, course_learner_id,
                                            times_search, start_time,
                                            end_time, duration, rel_element_id
                                        ]
                                        forum_sessions_record.append(array)
                                    final_time = event_logs[i]["event_time"]
                                    # Re-initialization
                                    session_id = "forum_session_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]
                                    # NOTE(review): times_search is only reset
                                    # when the new session starts with a
                                    # search; otherwise the previous session's
                                    # count carries over -- confirm intended
                                    if event_logs[i][
                                            "event_type"] == "edx.forum.searched":
                                        times_search = 1
                                    # Added for relevant element id
                                    session_rel_element_cur = rel_element_cur
                                else:
                                    end_time = event_logs[i]["event_time"]
                                    if event_logs[i][
                                            "event_type"] == "edx.forum.searched":
                                        times_search += 1
                                    if session_rel_element_cur == "":
                                        session_rel_element_cur = rel_element_cur
                            else:
                                # NOTE(review): only forum events are collected
                                # above, so this non-forum branch appears
                                # unreachable here; nesting reconstructed --
                                # confirm against the original formatting
                                if event_logs[i][
                                        "event_time"] <= end_time + datetime.timedelta(
                                            hours=0.5):
                                    end_time = event_logs[i]["event_time"]
                                session_id = session_id + "_" + str(
                                    start_time) + "_" + str(end_time)
                                duration = (end_time - start_time
                                            ).days * 24 * 60 * 60 + (
                                                end_time - start_time).seconds
                                if duration > 5:
                                    rel_element_id = ""
                                    if session_rel_element_cur != "":
                                        rel_element_id = session_rel_element_cur
                                    else:
                                        rel_element_id = session_rel_element_pre
                                    array = [
                                        session_id, course_learner_id,
                                        times_search, start_time, end_time,
                                        duration, rel_element_id
                                    ]
                                    forum_sessions_record.append(array)
                                final_time = event_logs[i]["event_time"]
                                # Re-initialization
                                session_id = ""
                                start_time = ""
                                end_time = ""
                                times_search = 0
                        # session_rel_element_pre is used for recording the element id
                        # of the most recent event logs before the session logs
                        if rel_element_cur != "":
                            session_rel_element_pre = rel_element_cur
                    # Carry the tail of the logs over to the next day
                    if final_time != "":
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)
                        updated_learner_all_event_logs[
                            course_learner_id] = new_logs
                log_file.close()
        current_date = getNextDay(current_date)
    # Database version
    # NOTE: the misspelt column name "relevent_element_id" matches the schema
    for array in forum_sessions_record:
        session_id = array[0]
        course_learner_id = array[1]
        times_search = process_null(array[2])
        start_time = array[3]
        end_time = array[4]
        duration = process_null(array[5])
        rel_element_id = array[6]
        sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, times_search, start_time,
                end_time, duration, rel_element_id)
        cursor.execute(sql, data)
    # File version
    '''
def sessions(metadata_path, log_path, cursor): # Collect course information course_metadata_map = ExtractCourseInformation(metadata_path) current_date = course_metadata_map["start_date"] end_next_date = getNextDay(course_metadata_map["end_date"]) learner_all_event_logs = {} updated_learner_all_event_logs = {} session_record = [] log_files = os.listdir(log_path) while True: if current_date == end_next_date: break for file in log_files: if current_date in file: print file learner_all_event_logs.clear() learner_all_event_logs = updated_learner_all_event_logs.copy() updated_learner_all_event_logs.clear() # Course_learner_id set course_learner_id_set = set() for course_learner_id in learner_all_event_logs.keys(): course_learner_id_set.add(course_learner_id) input_file = open(log_path + file, "r") lines = input_file.readlines() for line in lines: jsonObject = json.loads(line) # Some daily logs don't have the "user_id" value if "user_id" not in jsonObject["context"]: continue global_learner_id = jsonObject["context"]["user_id"] event_type = str(jsonObject["event_type"]) if global_learner_id != "": course_id = jsonObject["context"]["course_id"] course_learner_id = course_id + "_" + str( global_learner_id) event_time = jsonObject["time"] event_time = event_time[0:19] event_time = event_time.replace("T", " ") event_time = datetime.datetime.strptime( event_time, "%Y-%m-%d %H:%M:%S") if course_learner_id in course_learner_id_set: learner_all_event_logs[course_learner_id].append({ "event_time": event_time, "event_type": event_type }) else: learner_all_event_logs[course_learner_id] = [{ "event_time": event_time, "event_type": event_type }] course_learner_id_set.add(course_learner_id) for course_learner_id in learner_all_event_logs.keys(): event_logs = learner_all_event_logs[course_learner_id] # Sorting event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time')) session_id = "" start_time = "" end_time = "" final_time = "" for i in range(len(event_logs)): if 
start_time == "": # Initialization start_time = event_logs[i]["event_time"] end_time = event_logs[i]["event_time"] else: if event_logs[i][ "event_time"] > end_time + datetime.timedelta( hours=0.5): session_id = course_learner_id + "_" + str( start_time) + "_" + str(end_time) duration = (end_time - start_time).days * 24 * 60 * 60 + ( end_time - start_time).seconds if duration > 5: array = [ session_id, course_learner_id, start_time, end_time, duration ] session_record.append(array) final_time = event_logs[i]["event_time"] # Re-initialization session_id = "" start_time = event_logs[i]["event_time"] end_time = event_logs[i]["event_time"] else: if event_logs[i]["event_type"] == "page_close": end_time = event_logs[i]["event_time"] session_id = course_learner_id + "_" + str( start_time) + "_" + str(end_time) duration = (end_time - start_time ).days * 24 * 60 * 60 + ( end_time - start_time).seconds if duration > 5: array = [ session_id, course_learner_id, start_time, end_time, duration ] session_record.append(array) # Re-initialization session_id = "" start_time = "" end_time = "" final_time = event_logs[i]["event_time"] else: end_time = event_logs[i]["event_time"] if final_time != "": new_logs = [] for log in event_logs: if log["event_time"] >= final_time: new_logs.append(log) updated_learner_all_event_logs[ course_learner_id] = new_logs current_date = getNextDay(current_date) # Filter duplicated records updated_session_record = [] session_id_set = set() for array in session_record: session_id = array[0] if session_id not in session_id_set: session_id_set.add(session_id) updated_session_record.append(array) session_record = updated_session_record # Database version for array in session_record: session_id = array[0] course_learner_id = array[1] start_time = array[2] end_time = array[3] duration = process_null(array[4]) sql = "insert into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)" data = (session_id, course_learner_id, 
start_time, end_time, duration) cursor.execute(sql, data) # File version '''
def learner_mode(metadata_path, cursor): course_record = [] course_element_record = [] learner_index_record = [] course_learner_record = [] learner_demographic_record = [] # Collect course information course_metadata_map = ExtractCourseInformation(metadata_path) course_record.append([ course_metadata_map["course_id"], course_metadata_map["course_name"], course_metadata_map["start_time"], course_metadata_map["end_time"] ]) # Course_element table for element_id in course_metadata_map["element_time_map"].keys(): element_start_time = course_metadata_map["element_time_map"][ element_id] # Some contents released just one hour earlier than the hour of start time. # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00. # However, those 2nd week contents are count as 1st week. # In order to avoid above situation, I use date to replace datetime here. week = getDayDiff(course_metadata_map["start_time"].date(), element_start_time.date()) / 7 + 1 array = [ element_id, course_metadata_map["element_type_map"][element_id], week, course_metadata_map["course_id"] ] course_element_record.append(array) files = os.listdir(metadata_path) # Learner_demographic table learner_mail_map = {} # Course_learner table course_learner_map = {} learner_enrollment_time_map = {} # Enrolled learners set enrolled_learner_set = set() course_id = "" # Processing student_courseenrollment data for file in files: if "student_courseenrollment" in file: input_file = open(metadata_path + file, "r") input_file.readline() lines = input_file.readlines() for line in lines: record = line.split("\t") global_learner_id = record[1] course_id = record[2] time = datetime.datetime.strptime(record[3], "%Y-%m-%d %H:%M:%S") course_learner_id = course_id + "_" + global_learner_id if cmp_datetime(course_metadata_map["end_time"], time): enrolled_learner_set.add(global_learner_id) array = [global_learner_id, course_id, course_learner_id] learner_index_record.append(array) 
course_learner_map[global_learner_id] = course_learner_id learner_enrollment_time_map[global_learner_id] = time input_file.close() print "The number of enrolled learners is: " + str( len(enrolled_learner_set)) + "\n" # Processing auth_user data for file in files: if "auth_user-" in file: input_file = open(metadata_path + file, "r") input_file.readline() lines = input_file.readlines() for line in lines: record = line.split("\t") if record[0] in enrolled_learner_set: learner_mail_map[record[0]] = record[4] input_file.close() # Processing certificates_generatedcertificate data num_uncertifiedLearners = 0 num_certifiedLearners = 0 for file in files: if "certificates_generatedcertificate" in file: input_file = open(metadata_path + file, "r") input_file.readline() lines = input_file.readlines() for line in lines: record = line.split("\t") global_learner_id = record[1] final_grade = record[3] enrollment_mode = record[14].replace("\n", "") certificate_status = record[7] register_time = "" if course_learner_map.has_key(global_learner_id): register_time = learner_enrollment_time_map[ global_learner_id] if course_learner_map.has_key(global_learner_id): num_certifiedLearners += 1 array = [ course_learner_map[global_learner_id], final_grade, enrollment_mode, certificate_status, register_time ] course_learner_record.append(array) else: num_uncertifiedLearners += 1 input_file.close() print "The number of uncertified & certified learners is: " + str( num_uncertifiedLearners) + "\t" + str( num_certifiedLearners) + "\n" # Processing auth_userprofile data for file in files: if "auth_userprofile" in file: input_file = open(metadata_path + file, "r") input_file.readline() lines = input_file.readlines() for line in lines: record = line.split("\t") global_learner_id = record[1] gender = record[7] year_of_birth = record[9] level_of_education = record[10] country = record[13] course_learner_id = course_id + "_" + global_learner_id if global_learner_id in enrolled_learner_set: array = [ 
course_learner_id, gender, year_of_birth, level_of_education, country, learner_mail_map[global_learner_id] ] learner_demographic_record.append(array) input_file.close() # Database version # Course table for array in course_record: course_id = course_metadata_map["course_id"] course_name = course_metadata_map["course_name"] start_time = course_metadata_map["start_time"] end_time = course_metadata_map["end_time"] sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)" data = (course_id, course_name, start_time, end_time) cursor.execute(sql, data) for array in course_element_record: element_id = array[0] element_type = array[1] week = process_null(array[2]) course_id = array[3] sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)" data = (element_id, element_type, week, course_id) cursor.execute(sql, data) # Learner_index table for array in learner_index_record: global_learner_id = array[0] course_id = array[1] course_learner_id = array[2] sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)" data = (global_learner_id, course_id, course_learner_id) cursor.execute(sql, data) # Course_learner table for array in course_learner_record: course_learner_id = array[0] final_grade = process_null(array[1]) enrollment_mode = array[2] certificate_status = array[3] register_time = process_null(array[4]) sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)" data = (course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) cursor.execute(sql, data) # Learner_demographic table for array in learner_demographic_record: course_learner_id = process_null(array[0]) gender = array[1] year_of_birth = process_null(process_null(array[2])) level_of_education = array[3] country = array[4] email = array[5] email = email.replace("\'", "") sql = "insert 
into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)" data = (course_learner_id, gender, year_of_birth, level_of_education, country, email) cursor.execute(sql, data) # File version '''
def quiz_sessions(metadata_path, log_path, cursor):
    """Group each learner's quiz-related events into quiz sessions (one
    session id per quiz parent block and learner, split on gaps of more than
    half an hour), merge adjacent intervals, and insert the sessions into
    the quiz_sessions table.

    metadata_path -- directory with the course metadata files
    log_path      -- directory with the daily event-log files
    cursor        -- open database cursor used for the inserts
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    # Processing events data
    submission_event_collection = []
    # Problem check
    submission_event_collection.append("problem_check")
    # Server
    submission_event_collection.append("save_problem_check")
    submission_event_collection.append("problem_check_fail")
    submission_event_collection.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects Check for a problem and it is graded success- fully.
    submission_event_collection.append("problem_graded")
    # The server emits problem_rescore events when a problem is successfully rescored.
    submission_event_collection.append("problem_rescore")
    submission_event_collection.append("problem_rescore_fail")
    submission_event_collection.append("problem_reset")
    # event_source: serve
    submission_event_collection.append("reset_problem")
    submission_event_collection.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    submission_event_collection.append("problem_save")
    # event_source: server
    submission_event_collection.append("save_problem_fail")
    submission_event_collection.append("save_problem_success")
    # Show answer
    submission_event_collection.append("problem_show")
    submission_event_collection.append("showanswer")
    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])
    log_files = os.listdir(log_path)
    child_parent_map = course_metadata_map["child_parent_map"]
    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    # NOTE: this local dict shadows the enclosing function's name
    quiz_sessions = {}
    while True:
        if current_date == end_next_date:
            break
        for file in log_files:
            if current_date in file:
                print file
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()
                # Course_learner_id set
                # NOTE(review): this set is built but never consulted below;
                # membership is checked with has_key() instead
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                # NOTE(review): this file handle is never closed -- leak
                input_file = open(log_path + file, "r")
                lines = input_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])
                    if global_learner_id != "":
                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(
                            global_learner_id)
                        event_time = jsonObject["time"]
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(
                            event_time, "%Y-%m-%d %H:%M:%S")
                        # All events are collected; the quiz filter is applied
                        # in the separation pass below
                        if learner_all_event_logs.has_key(course_learner_id):
                            learner_all_event_logs[course_learner_id].append({
                                "event_time": event_time,
                                "event_type": event_type
                            })
                        else:
                            learner_all_event_logs[course_learner_id] = [{
                                "event_time": event_time,
                                "event_type": event_type
                            }]
                # For quiz session separation
                for course_learner_id in learner_all_event_logs.keys():
                    event_logs = learner_all_event_logs[course_learner_id]
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime,
                                    key=operator.itemgetter('event_time'))
                    session_id = ""
                    start_time = ""
                    end_time = ""
                    final_time = ""
                    for i in range(len(event_logs)):
                        if session_id == "":
                            # Quiz-related events: problem URLs (old and new
                            # style) or explicit submission event types
                            if "problem+block" in event_logs[i][
                                    "event_type"] or "_problem;_" in event_logs[
                                        i]["event_type"] or event_logs[i][
                                            "event_type"] in submission_event_collection:
                                event_type_array = event_logs[i][
                                    "event_type"].split("/")
                                if "problem+block" in event_logs[i][
                                        "event_type"]:
                                    question_id = event_type_array[4]
                                if "_problem;_" in event_logs[i]["event_type"]:
                                    question_id = event_type_array[6].replace(
                                        ";_", "/")
                                # NOTE(review): when the event type carries no
                                # URL (e.g. a plain "problem_check"),
                                # question_id keeps the value of an earlier
                                # iteration, and is unbound on the very first
                                # such event -- confirm upstream logs
                                if question_id in child_parent_map.keys():
                                    parent_block_id = child_parent_map[
                                        question_id]
                                    session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]
                        else:
                            if "problem+block" in event_logs[i][
                                    "event_type"] or "_problem;_" in event_logs[
                                        i]["event_type"] or event_logs[i][
                                            "event_type"] in submission_event_collection:
                                # A gap of more than half an hour stores the
                                # current interval and starts a new session
                                if event_logs[i][
                                        "event_time"] > end_time + datetime.timedelta(
                                            hours=0.5):
                                    if quiz_sessions.has_key(session_id):
                                        quiz_sessions[session_id][
                                            "time_array"].append({
                                                "start_time": start_time,
                                                "end_time": end_time
                                            })
                                    else:
                                        quiz_sessions[session_id] = {
                                            "course_learner_id":
                                            course_learner_id,
                                            "time_array": [{
                                                "start_time": start_time,
                                                "end_time": end_time
                                            }]
                                        }
                                    final_time = event_logs[i]["event_time"]
                                    # (this inner check repeats the outer
                                    # condition and is always true here)
                                    if "problem+block" in event_logs[i][
                                            "event_type"] or "_problem;_" in event_logs[
                                                i]["event_type"] or event_logs[
                                                    i]["event_type"] in submission_event_collection:
                                        event_type_array = event_logs[i][
                                            "event_type"].split("/")
                                        if "problem+block" in event_logs[i][
                                                "event_type"]:
                                            question_id = event_type_array[4]
                                        if "_problem;_" in event_logs[i][
                                                "event_type"]:
                                            question_id = event_type_array[
                                                6].replace(";_", "/")
                                        if question_id in child_parent_map.keys(
                                        ):
                                            parent_block_id = child_parent_map[
                                                question_id]
                                            session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                            start_time = event_logs[i][
                                                "event_time"]
                                            end_time = event_logs[i][
                                                "event_time"]
                                        else:
                                            session_id = ""
                                            start_time = ""
                                            end_time = ""
                                else:
                                    end_time = event_logs[i]["event_time"]
                            else:
                                # A non-quiz event ends the current session;
                                # end_time is extended to it when it falls
                                # within the half-hour window
                                # NOTE(review): nesting reconstructed from
                                # collapsed source -- confirm against the
                                # original formatting
                                if event_logs[i][
                                        "event_time"] <= end_time + datetime.timedelta(
                                            hours=0.5):
                                    end_time = event_logs[i]["event_time"]
                                if quiz_sessions.has_key(session_id):
                                    quiz_sessions[session_id][
                                        "time_array"].append({
                                            "start_time": start_time,
                                            "end_time": end_time
                                        })
                                else:
                                    quiz_sessions[session_id] = {
                                        "course_learner_id":
                                        course_learner_id,
                                        "time_array": [{
                                            "start_time": start_time,
                                            "end_time": end_time
                                        }]
                                    }
                                final_time = event_logs[i]["event_time"]
                                session_id = ""
                                start_time = ""
                                end_time = ""
                    # Carry the tail of the logs over to the next day
                    if final_time != "":
                        new_logs = []
                        for log in event_logs:
                            if log["event_time"] >= final_time:
                                new_logs.append(log)
                        updated_learner_all_event_logs[
                            course_learner_id] = new_logs
        current_date = getNextDay(current_date)
    # To compress the session event_logs
    # (merge intervals of the same session that start within half an hour of
    # the previous interval's end)
    for session_id in quiz_sessions.keys():
        if len(quiz_sessions[session_id]["time_array"]) > 1:
            start_time = ""
            end_time = ""
            updated_time_array = []
            for i in range(len(quiz_sessions[session_id]["time_array"])):
                if i == 0:
                    start_time = quiz_sessions[session_id]["time_array"][i][
                        "start_time"]
                    end_time = quiz_sessions[session_id]["time_array"][i][
                        "end_time"]
                else:
                    if quiz_sessions[session_id]["time_array"][i][
                            "start_time"] > end_time + datetime.timedelta(
                                hours=0.5):
                        updated_time_array.append({
                            "start_time": start_time,
                            "end_time": end_time
                        })
                        start_time = quiz_sessions[session_id]["time_array"][
                            i]["start_time"]
                        end_time = quiz_sessions[session_id]["time_array"][i][
                            "end_time"]
                        if i == len(
                                quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({
                                "start_time": start_time,
                                "end_time": end_time
                            })
                    else:
                        end_time = quiz_sessions[session_id]["time_array"][i][
                            "end_time"]
                        if i == len(
                                quiz_sessions[session_id]["time_array"]) - 1:
                            updated_time_array.append({
                                "start_time": start_time,
                                "end_time": end_time
                            })
            quiz_sessions[session_id]["time_array"] = updated_time_array
    quiz_session_record = []
    for session_id in quiz_sessions.keys():
        course_learner_id = quiz_sessions[session_id]["course_learner_id"]
        for i in range(len(quiz_sessions[session_id]["time_array"])):
            start_time = quiz_sessions[session_id]["time_array"][i][
                "start_time"]
            end_time = quiz_sessions[session_id]["time_array"][i]["end_time"]
            if start_time < end_time:
                duration = (end_time - start_time).days * 24 * 60 * 60 + (
                    end_time - start_time).seconds
                final_session_id = session_id + "_" + str(
                    start_time) + "_" + str(end_time)
                # Sessions of 5 seconds or less are dropped
                if duration > 5:
                    array = [
                        final_session_id, course_learner_id, start_time,
                        end_time, duration
                    ]
                    quiz_session_record.append(array)
    # Database version
    for array in quiz_session_record:
        session_id = array[0]
        course_learner_id = array[1]
        start_time = array[2]
        end_time = array[3]
        duration = process_null(array[4])
        sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, start_time, end_time, duration)
        cursor.execute(sql, data)
    '''
def quiz_mode(metadata_path, log_path, cursor):
    """Insert quiz questions (from the course metadata) and learners'
    submissions and assessments (from "problem_check" events in the daily
    logs) into the quiz_questions, submissions and assessments tables.

    metadata_path -- directory with the course metadata files
    log_path      -- directory with the daily event-log files
    cursor        -- open database cursor used for the inserts
    """
    # quiz_question_record = []
    # submissions = {}
    # assessments = {}
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    for question_id in quiz_question_map:
        question_due = ""
        question_weight = quiz_question_map[question_id]
        quiz_question_parent = course_metadata_map["child_parent_map"][
            question_id]
        if (question_due == "") and (quiz_question_parent
                                     in element_time_map_due):
            question_due = element_time_map_due[quiz_question_parent]
        # Walk up the parent chain until an element with a known block type
        # is found, picking up the first due date seen along the way
        while not block_type_map.has_key(quiz_question_parent):
            quiz_question_parent = course_metadata_map["child_parent_map"][
                quiz_question_parent]
            if (question_due == "") and (quiz_question_parent
                                         in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]
        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)
        # array_quiz = [question_id, quiz_question_type, question_weight, question_due]
        # quiz_question_record.append(array_quiz)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        data = (question_id, quiz_question_type, question_weight,
                question_due)
        cursor.execute(sql, data)
    # Processing events data
    submission_event_collection = []
    # Problem check
    submission_event_collection.append("problem_check")
    # Server
    # (the event types in the string below are deliberately disabled -- only
    # "problem_check" counts as a submission here, unlike quiz_sessions)
    '''
    submission_event_collection.append("save_problem_check")
    submission_event_collection.append("problem_check_fail")
    submission_event_collection.append("save_problem_check_fail")
    # The server emits a problem_graded event each time a user selects Check for a problem and it is graded success- fully.
    submission_event_collection.append("problem_graded")
    # The server emits problem_rescore events when a problem is successfully rescored.
    submission_event_collection.append("problem_rescore")
    submission_event_collection.append("problem_rescore_fail")
    submission_event_collection.append("problem_reset")
    # event_source: serve
    submission_event_collection.append("reset_problem")
    submission_event_collection.append("reset_problem_fail")
    # The server emits problem_save events after a user saves a problem.
    submission_event_collection.append("problem_save")
    # event_source: server
    submission_event_collection.append("save_problem_fail")
    submission_event_collection.append("save_problem_success")
    # Show answer
    submission_event_collection.append("problem_show")
    submission_event_collection.append("showanswer")
    '''
    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])
    log_files = os.listdir(log_path)
    # Monotonically increasing suffix that keeps submission ids unique
    submission_uni_index = 0
    while True:
        if current_date == end_next_date:
            break
        for file in log_files:
            if current_date in file:
                print file
                # NOTE(review): this file handle is never closed -- leak
                input_file = open(log_path + file, "r")
                lines = input_file.readlines()
                for line in lines:
                    jsonObject = json.loads(line)
                    if jsonObject["event_type"] in submission_event_collection:
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        global_learner_id = jsonObject["context"]["user_id"]
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(
                                global_learner_id)
                            question_id = ""
                            grade = ""
                            max_grade = ""
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(
                                event_time, "%Y-%m-%d %H:%M:%S")
                            if isinstance(jsonObject["event"], dict):
                                question_id = jsonObject["event"]["problem_id"]
                                # The fields "grade" and "max_grade" are specific to submission event "problem_check"
                                if jsonObject["event"].has_key(
                                        "grade"
                                ) and jsonObject["event"].has_key("max_grade"):
                                    grade = jsonObject["event"]["grade"]
                                    max_grade = jsonObject["event"][
                                        "max_grade"]
                            if question_id != "":
                                submission_id = course_learner_id + "_" + question_id + "_" + str(
                                    submission_uni_index)
                                submission_uni_index = submission_uni_index + 1
                                # For submissions
                                # array_submission = [submission_id, course_learner_id, question_id, event_time]
                                # submissions[submission_id] = array_submission
                                submission_timestamp = event_time
                                sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values (%s,%s,%s,%s)"
                                data = (submission_id, course_learner_id,
                                        question_id, submission_timestamp)
                                cursor.execute(sql, data)
                                # For assessments (only events that carried a
                                # grade and a max_grade)
                                if grade != "" and max_grade != "":
                                    # array_assessment = [submission_id, course_learner_id, max_grade, grade]
                                    # assessments[submission_id] = array_assessment
                                    assessment_id = submission_id
                                    sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values (%s,%s,%s,%s)"
                                    data = (assessment_id, course_learner_id,
                                            max_grade, grade)
                                    cursor.execute(sql, data)
        current_date = getNextDay(current_date)
    # submission_record = []
    # assessment_record = []
    # for submission_id in submissions.keys():
    #     submission_record.append(submissions[submission_id])
    # for assessment_id in assessments.keys():
    #     assessment_record.append(assessments[assessment_id])
    # Database version
    # Quiz_question table
    # for array in quiz_question_record:
    #     question_id = array[0]
    #     question_type = array[1]
    #     question_weight = array[2]
    #     question_due = array[3]
    #     sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values"
    #     sql += "('%s','%s','%s','%s');" % (question_id, question_type, question_weight, question_due)
    #     cursor.execute(sql)
    # Submissions table
    # for array in submission_record:
    #     submission_id = array[0]
    #     course_learner_id = array[1]
    #     question_id = array[2]
    #     submission_timestamp = array[3]
    #     sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values"
    #     sql += "('%s','%s','%s','%s');" % (submission_id, course_learner_id, question_id, submission_timestamp)
    #     cursor.execute(sql)
    # Submissions table
    # for array in assessment_record:
    #     assessment_id = array[0]
    #     course_learner_id = array[1]
    #     max_grade = array[2]
    #     grade = array[3]
    #     sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values"
    #     sql += "('%s','%s','%s','%s');" % (assessment_id, course_learner_id, max_grade, grade)
    #     cursor.execute(sql)
    '''
def video_interaction(metadata_path, log_path, cursor): # Collect course information course_metadata_map = ExtractCourseInformation(metadata_path) current_date = course_metadata_map["start_date"] end_next_date = getNextDay(course_metadata_map["end_date"]) video_interaction_map = {} # Video-related event types video_event_types = [] video_event_types.append("hide_transcript") video_event_types.append("edx.video.transcript.hidden") video_event_types.append("edx.video.closed_captions.hidden") video_event_types.append("edx.video.closed_captions.shown") video_event_types.append("load_video") video_event_types.append("edx.video.loaded") video_event_types.append("pause_video") video_event_types.append("edx.video.paused") video_event_types.append("play_video") video_event_types.append("edx.video.played") video_event_types.append("seek_video") video_event_types.append("edx.video.position.changed") video_event_types.append("show_transcript") video_event_types.append("edx.video.transcript.shown") video_event_types.append("speed_change_video") video_event_types.append("stop_video") video_event_types.append("edx.video.stopped") video_event_types.append("video_hide_cc_menu") video_event_types.append("edx.video.language_menu.hidden") video_event_types.append("video_show_cc_menu") video_event_types.append("edx.video.language_menu.shown") ''' # Navigation-related event types navigation_event_types = [] navigation_event_types.append("page_close") navigation_event_types.append("seq_goto") navigation_event_types.append("seq_next") navigation_event_types.append("seq_prev") ''' learner_video_event_logs = {} updated_learner_video_event_logs = {} log_files = os.listdir(log_path) while True: if current_date == end_next_date: break; for file in log_files: if current_date in file: print file learner_video_event_logs.clear() learner_video_event_logs = updated_learner_video_event_logs.copy() updated_learner_video_event_logs.clear() # Course_learner_id set course_learner_id_set = set() for 
course_learner_id in learner_video_event_logs.keys(): course_learner_id_set.add(course_learner_id) input_file = open(log_path + file,"r") lines = input_file.readlines() for line in lines: jsonObject = json.loads(line) if jsonObject["event_type"] in video_event_types: # Some daily logs don't have the "user_id" value if "user_id" not in jsonObject["context"]: continue global_learner_id = jsonObject["context"]["user_id"] if global_learner_id != "": course_id = jsonObject["context"]["course_id"] course_learner_id = course_id + "_" + str(global_learner_id) video_id = "" event_time = jsonObject["time"] event_time = event_time[0:19] event_time = event_time.replace("T", " ") event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S") event_type = jsonObject["event_type"] # For seek event new_time = 0 old_time = 0 # For speed change event new_speed = 0 old_speed = 0 # This sub-condition does not exist in log data # if isinstance(jsonObject["event"], dict): # video_id = jsonObject["event"]["id"] if isinstance(jsonObject["event"], unicode): event_jsonObject = json.loads(jsonObject["event"]) video_id = event_jsonObject["id"] video_id = video_id.replace("-", "://", 1) video_id = video_id.replace("-", "/") # For video seek event if "new_time" in event_jsonObject and "old_time" in event_jsonObject: new_time = event_jsonObject["new_time"] old_time = event_jsonObject["old_time"] # For video speed change event if "new_speed" in event_jsonObject and "old_speed" in event_jsonObject: new_speed = event_jsonObject["new_speed"] old_speed = event_jsonObject["old_speed"] # To record video seek event if event_type in ["seek_video","edx.video.position.changed"]: if new_time is not None and old_time is not None: if course_learner_id in course_learner_id_set: learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_time":new_time, "old_time":old_time}) else: learner_video_event_logs[course_learner_id] = 
[{"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_time":new_time, "old_time":old_time}] course_learner_id_set.add(course_learner_id) continue # To record video speed change event if event_type in ["speed_change_video"]: if course_learner_id in course_learner_id_set: learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_speed":new_speed, "old_speed":old_speed}) else: learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_speed":new_speed, "old_speed":old_speed}] course_learner_id_set.add(course_learner_id) continue if course_learner_id in course_learner_id_set: learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id}) else: learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id}] course_learner_id_set.add(course_learner_id) # For non-video-related events if jsonObject["event_type"] not in video_event_types: # Some daily logs don't have the "user_id" value if "user_id" not in jsonObject["context"]: continue global_learner_id = jsonObject["context"]["user_id"] if global_learner_id != "": course_id = jsonObject["context"]["course_id"] course_learner_id = course_id + "_" + str(global_learner_id) event_time = jsonObject["time"] event_time = event_time[0:19] event_time = event_time.replace("T", " ") event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S") event_type = jsonObject["event_type"] if course_learner_id in course_learner_id_set: learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type}) else: learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type}] course_learner_id_set.add(course_learner_id) for course_learner_id in learner_video_event_logs.keys(): video_id = "" 
event_logs = learner_video_event_logs[course_learner_id] # Sorting event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time')) video_start_time = "" final_time = "" # For video seek event times_forward_seek = 0 duration_forward_seek = 0 times_backward_seek = 0 duration_backward_seek = 0 # For video speed change event speed_change_last_time = "" times_speed_up = 0 times_speed_down = 0 # For video pause event pause_check = False pause_start_time = "" duration_pause = 0 for log in event_logs: if log["event_type"] in ["play_video", "edx.video.played"]: video_start_time = log["event_time"] video_id = log["video_id"] if pause_check: duration_pause = (log["event_time"] - pause_start_time).seconds video_interaction_id = course_learner_id + "_" + video_id + "_" + str(pause_start_time) if duration_pause > 2 and duration_pause < 600: if video_interaction_id in video_interaction_map.keys(): video_interaction_map[video_interaction_id]["times_pause"] = 1 video_interaction_map[video_interaction_id]["duration_pause"] = duration_pause pause_check = False continue if video_start_time != "": if log["event_time"] > video_start_time + datetime.timedelta(hours=0.5): video_start_time = "" video_id = "" final_time = log["event_time"] else: # 0. Seek if log["event_type"] in ["seek_video", "edx.video.position.changed"] and video_id == log["video_id"]: # Forward seek event if log["new_time"] > log["old_time"]: times_forward_seek += 1 duration_forward_seek += log["new_time"] - log["old_time"] # Backward seek event if log["new_time"] < log["old_time"]: times_backward_seek += 1 duration_backward_seek += log["old_time"] - log["new_time"] continue # 1. 
Speed change if log["event_type"] == "speed_change_video" and video_id == log["video_id"]: if speed_change_last_time == "": speed_change_last_time = log["event_time"] old_speed = log["old_speed"] new_speed = log["new_speed"] if old_speed < new_speed: times_speed_up += 1 if old_speed > new_speed: times_speed_down += 1 else: if (log["event_time"] - speed_change_last_time).seconds > 10: old_speed = log["old_speed"] new_speed = log["new_speed"] if old_speed < new_speed: times_speed_up += 1 if old_speed > new_speed: times_speed_down += 1 speed_change_last_time = log["event_time"] continue # 2. Pause/Stop situation if log["event_type"] in ["pause_video", "edx.video.paused", "stop_video", "edx.video.stopped"] and video_id == log["video_id"]: watch_duration = (log["event_time"] - video_start_time).seconds video_end_time = log["event_time"] video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_end_time) if watch_duration > 5: video_interaction_map[video_interaction_id] = {"course_learner_id":course_learner_id, "video_id":video_id, "type": "video", "watch_duration":watch_duration, "times_forward_seek":times_forward_seek, "duration_forward_seek":duration_forward_seek, "times_backward_seek":times_backward_seek, "duration_backward_seek":duration_backward_seek, "times_speed_up":times_speed_up, "times_speed_down":times_speed_down, "start_time":video_start_time, "end_time":video_end_time} if log["event_type"] in ["pause_video", "edx.video.paused"]: pause_check = True pause_start_time = video_end_time # For video seek event times_forward_seek = 0 duration_forward_seek = 0 times_backward_seek = 0 duration_backward_seek = 0 # For video speed change event speed_change_last_time = "" times_speed_up = 0 times_speed_down = 0 # For video general information video_start_time ="" video_id = "" final_time = log["event_time"] continue # 3/4 Page changed/Session closed if log["event_type"] not in video_event_types: video_end_time = log["event_time"] watch_duration = 
(video_end_time - video_start_time).seconds video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_end_time) if watch_duration > 5: video_interaction_map[video_interaction_id] = {"course_learner_id":course_learner_id, "video_id":video_id, "type": "video", "watch_duration":watch_duration, "times_forward_seek":times_forward_seek, "duration_forward_seek":duration_forward_seek, "times_backward_seek":times_backward_seek, "duration_backward_seek":duration_backward_seek, "times_speed_up":times_speed_up, "times_speed_down":times_speed_down, "start_time":video_start_time, "end_time":video_end_time} # For video seek event times_forward_seek = 0 duration_forward_seek = 0 times_backward_seek = 0 duration_backward_seek = 0 # For video speed change event speed_change_last_time = "" times_speed_up = 0 times_speed_down = 0 # For video general information video_start_time = "" video_id = "" final_time = log["event_time"] continue if final_time != "": new_logs = [] for log in event_logs: if log["event_time"] > final_time: new_logs.append(log) updated_learner_video_event_logs[course_learner_id] = new_logs current_date = getNextDay(current_date) video_interaction_record = [] for interaction_id in video_interaction_map.keys(): video_interaction_id = interaction_id course_learner_id = video_interaction_map[interaction_id]["course_learner_id"] video_id = video_interaction_map[interaction_id]["video_id"] duration = video_interaction_map[interaction_id]["watch_duration"] times_forward_seek = video_interaction_map[interaction_id]["times_forward_seek"] duration_forward_seek = video_interaction_map[interaction_id]["duration_forward_seek"] times_backward_seek = video_interaction_map[interaction_id]["times_backward_seek"] duration_backward_seek = video_interaction_map[interaction_id]["duration_backward_seek"] times_speed_up = video_interaction_map[interaction_id]["times_speed_up"] times_speed_down = video_interaction_map[interaction_id]["times_speed_down"] start_time = 
video_interaction_map[interaction_id]["start_time"] end_time = video_interaction_map[interaction_id]["end_time"] if "times_pause" in video_interaction_map[interaction_id]: times_pause = video_interaction_map[interaction_id]["times_pause"] duration_pause = video_interaction_map[interaction_id]["duration_pause"] else: times_pause = 0 duration_pause = 0 array = [video_interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time] video_interaction_record.append(array) # Video_interaction table # Database version for array in video_interaction_record: interaction_id = array[0] course_learner_id = array[1] video_id = array[2] duration = process_null(array[3]) times_forward_seek = process_null(array[4]) duration_forward_seek = process_null(array[5]) times_backward_seek = process_null(array[6]) duration_backward_seek = process_null(array[7]) times_speed_up = process_null(array[8]) times_speed_down = process_null(array[9]) times_pause = process_null(array[10]) duration_pause = process_null(array[11]) start_time = array[12] end_time = array[13] sql = "insert into video_interaction(interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" data = (interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) cursor.execute(sql, data) # File version '''
def video_interaction(metadata_path, daily_log_path,
                      remaining_video_interaction_log_path, cursor):
    """Reconstruct per-learner video watching sessions from one day's edX
    tracking log and insert them into the ``video_interaction`` table.

    metadata_path -- course metadata directory, read via
        ExtractCourseInformation (project helper defined elsewhere in this
        file).
    daily_log_path -- path to the decompressed tracking-log file for a single
        day; the date embedded in the file NAME is used to filter out records
        whose timestamp belongs to a different day.
    remaining_video_interaction_log_path -- JSON file carrying events that
        were still "open" (after the last completed session) at the end of
        the previous day's run; rewritten at the end of this call.
    cursor -- MySQL cursor used for the INSERTs (commit is the caller's
        responsibility).
    """
    utc = pytz.UTC
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    # interaction_id -> attribute dict of one reconstructed watching session
    video_interaction_map = {}
    # Video-related event types
    video_event_types = []
    video_event_types.append("hide_transcript")
    video_event_types.append("edx.video.transcript.hidden")
    video_event_types.append("edx.video.closed_captions.hidden")
    video_event_types.append("edx.video.closed_captions.shown")
    video_event_types.append("load_video")
    video_event_types.append("edx.video.loaded")
    video_event_types.append("pause_video")
    video_event_types.append("edx.video.paused")
    video_event_types.append("play_video")
    video_event_types.append("edx.video.played")
    video_event_types.append("seek_video")
    video_event_types.append("edx.video.position.changed")
    video_event_types.append("show_transcript")
    video_event_types.append("edx.video.transcript.shown")
    video_event_types.append("speed_change_video")
    video_event_types.append("stop_video")
    video_event_types.append("edx.video.stopped")
    video_event_types.append("video_hide_cc_menu")
    video_event_types.append("edx.video.language_menu.hidden")
    video_event_types.append("video_show_cc_menu")
    video_event_types.append("edx.video.language_menu.shown")
    # course_learner_id -> list of today's events plus yesterday's carry-over
    learner_logs = {}
    # course_learner_id -> events after the last completed session; persisted
    # for tomorrow's run
    remaining_learner_logs = {}
    # Read remaining event logs
    # NOTE(review): remaining_input_file is never closed -- leaks one file
    # handle per call; a with-block would fix this.
    if os.path.exists(remaining_video_interaction_log_path):
        remaining_input_file = open(remaining_video_interaction_log_path)
        learner_logs = json.loads(remaining_input_file.read(),
                                  object_hook=json_util.object_hook)
    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)
    input_file = open(daily_log_path, "r")
    for line in input_file:
        jsonObject = json.loads(line)
        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"][
                "user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue
        global_learner_id = jsonObject["context"]["user_id"]
        event_type = jsonObject["event_type"]
        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)
        event_time = jsonObject["time"]
        # Check whether the event record belongs to that day
        # (the daily log file name contains the YYYY-MM-DD date string)
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue
        # Normalize the ISO timestamp to a tz-aware datetime (UTC)
        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time,
                                                "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)
        # For video-related events
        if event_type in video_event_types:
            video_id = ""
            # For seek event
            new_time = 0
            old_time = 0
            # For speed change event
            new_speed = 0
            old_speed = 0
            # The "event" payload arrives as a JSON-encoded string in these
            # logs; decode it to pull out the video id and seek/speed fields
            if isinstance(jsonObject["event"], unicode):
                event_jsonObject = json.loads(jsonObject["event"])
                video_id = event_jsonObject["id"]
                # Rebuild the flattened module id into its URL-like form
                video_id = video_id.replace("-", "://", 1)
                video_id = video_id.replace("-", "/")
                # For video seek event
                if "new_time" in event_jsonObject and "old_time" in event_jsonObject:
                    new_time = event_jsonObject["new_time"]
                    old_time = event_jsonObject["old_time"]
                # For video speed change event
                if "new_speed" in event_jsonObject and "old_speed" in event_jsonObject:
                    new_speed = event_jsonObject["new_speed"]
                    old_speed = event_jsonObject["old_speed"]
            # To record video seek event
            if event_type in ["seek_video", "edx.video.position.changed"]:
                if new_time is not None and old_time is not None:
                    if course_learner_id in course_learner_id_set:
                        learner_logs[course_learner_id].append({
                            "event_time": event_time,
                            "event_type": event_type,
                            "video_id": video_id,
                            "new_time": new_time,
                            "old_time": old_time
                        })
                    else:
                        learner_logs[course_learner_id] = [{
                            "event_time": event_time,
                            "event_type": event_type,
                            "video_id": video_id,
                            "new_time": new_time,
                            "old_time": old_time
                        }]
                        course_learner_id_set.add(course_learner_id)
                continue
            # To record video speed change event
            if event_type in ["speed_change_video"]:
                if course_learner_id in course_learner_id_set:
                    learner_logs[course_learner_id].append({
                        "event_time": event_time,
                        "event_type": event_type,
                        "video_id": video_id,
                        "new_speed": new_speed,
                        "old_speed": old_speed
                    })
                else:
                    learner_logs[course_learner_id] = [{
                        "event_time": event_time,
                        "event_type": event_type,
                        "video_id": video_id,
                        "new_speed": new_speed,
                        "old_speed": old_speed
                    }]
                    course_learner_id_set.add(course_learner_id)
                continue
            # Any other video event: keep time/type/id only
            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({
                    "event_time": event_time,
                    "event_type": event_type,
                    "video_id": video_id
                })
            else:
                learner_logs[course_learner_id] = [{
                    "event_time": event_time,
                    "event_type": event_type,
                    "video_id": video_id
                }]
                course_learner_id_set.add(course_learner_id)
        # For non-video-related events
        # (kept because they terminate watching sessions in the pass below)
        if event_type not in video_event_types:
            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({
                    "event_time": event_time,
                    "event_type": event_type
                })
            else:
                learner_logs[course_learner_id] = [{
                    "event_time": event_time,
                    "event_type": event_type
                }]
                course_learner_id_set.add(course_learner_id)
    input_file.close()
    # For video interaction extraction
    for learner in learner_logs.keys():
        course_learner_id = learner
        event_logs = learner_logs[learner]
        # Sorting
        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))
        video_id = ""
        video_start_time = ""    # "" means no session currently open
        final_time = ""          # end time of the last completed session
        # For video seek event
        times_forward_seek = 0
        duration_forward_seek = 0
        times_backward_seek = 0
        duration_backward_seek = 0
        # For video speed change event
        speed_change_last_time = ""
        times_speed_up = 0
        times_speed_down = 0
        # For video pause event
        pause_check = False
        pause_start_time = ""
        duration_pause = 0
        for i in range(len(event_logs)):
            # A play event opens a session; it may also close out a
            # preceding pause (pause duration is attributed to the session
            # that ended with that pause).
            if event_logs[i]["event_type"] in [
                    "play_video", "edx.video.played"
            ]:
                video_start_time = event_logs[i]["event_time"]
                video_id = event_logs[i]["video_id"]
                if pause_check:
                    # Full pause length in seconds (days term covers pauses
                    # spanning midnight)
                    duration_pause = (
                        event_logs[i]["event_time"] -
                        pause_start_time).days * 24 * 60 * 60 + (
                            event_logs[i]["event_time"] -
                            pause_start_time).seconds
                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                        pause_start_time)
                    # Only count "real" pauses: longer than 2s, shorter
                    # than 10 minutes
                    if duration_pause > 2 and duration_pause < 600:
                        if video_interaction_id in video_interaction_map.keys():
                            video_interaction_map[video_interaction_id][
                                "times_pause"] = 1
                            video_interaction_map[video_interaction_id][
                                "duration_pause"] = duration_pause
                    pause_check = False
                continue
            if video_start_time != "":
                # Sessions longer than 30 minutes are considered stale and
                # discarded (e.g. browser left open)
                if event_logs[i][
                        "event_time"] > video_start_time + datetime.timedelta(
                            hours=0.5):
                    video_start_time = ""
                    video_id = ""
                    final_time = event_logs[i]["event_time"]
                else:
                    # 0. Seek
                    if event_logs[i]["event_type"] in [
                            "seek_video", "edx.video.position.changed"
                    ] and video_id == event_logs[i]["video_id"]:
                        # Forward seek event
                        if event_logs[i]["new_time"] > event_logs[i][
                                "old_time"]:
                            times_forward_seek += 1
                            duration_forward_seek += event_logs[i][
                                "new_time"] - event_logs[i]["old_time"]
                        # Backward seek event
                        if event_logs[i]["new_time"] < event_logs[i][
                                "old_time"]:
                            times_backward_seek += 1
                            duration_backward_seek += event_logs[i][
                                "old_time"] - event_logs[i]["new_time"]
                        continue
                    # 1. Speed change
                    # Changes within 10s of each other are collapsed into one
                    if event_logs[i][
                            "event_type"] == "speed_change_video" and video_id == event_logs[
                                i]["video_id"]:
                        if speed_change_last_time == "":
                            speed_change_last_time = event_logs[i][
                                "event_time"]
                            old_speed = event_logs[i]["old_speed"]
                            new_speed = event_logs[i]["new_speed"]
                            if old_speed < new_speed:
                                times_speed_up += 1
                            if old_speed > new_speed:
                                times_speed_down += 1
                        else:
                            if (event_logs[i]["event_time"] -
                                    speed_change_last_time).seconds > 10:
                                old_speed = event_logs[i]["old_speed"]
                                new_speed = event_logs[i]["new_speed"]
                                if old_speed < new_speed:
                                    times_speed_up += 1
                                if old_speed > new_speed:
                                    times_speed_down += 1
                            speed_change_last_time = event_logs[i][
                                "event_time"]
                        continue
                    # 2. Pause/Stop situation
                    if event_logs[i]["event_type"] in [
                            "pause_video", "edx.video.paused", "stop_video",
                            "edx.video.stopped"
                    ] and video_id == event_logs[i]["video_id"]:
                        watch_duration = (event_logs[i]["event_time"] -
                                          video_start_time).seconds
                        video_end_time = event_logs[i]["event_time"]
                        video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                            video_start_time) + "_" + str(video_end_time)
                        # Sessions of 5 seconds or less are discarded
                        if watch_duration > 5:
                            video_interaction_map[video_interaction_id] = {
                                "course_learner_id": course_learner_id,
                                "video_id": video_id,
                                "type": "video",
                                "watch_duration": watch_duration,
                                "times_forward_seek": times_forward_seek,
                                "duration_forward_seek":
                                duration_forward_seek,
                                "times_backward_seek": times_backward_seek,
                                "duration_backward_seek":
                                duration_backward_seek,
                                "times_speed_up": times_speed_up,
                                "times_speed_down": times_speed_down,
                                "start_time": video_start_time,
                                "end_time": video_end_time
                            }
                        if event_logs[i]["event_type"] in [
                                "pause_video", "edx.video.paused"
                        ]:
                            pause_check = True
                            pause_start_time = video_end_time
                        # For video seek event
                        times_forward_seek = 0
                        duration_forward_seek = 0
                        times_backward_seek = 0
                        duration_backward_seek = 0
                        # For video speed change event
                        speed_change_last_time = ""
                        times_speed_up = 0
                        times_speed_down = 0
                        # For video general information
                        video_start_time = ""
                        video_id = ""
                        final_time = event_logs[i]["event_time"]
                        continue
                    # 3/4 Page changed/Session closed
                    # (any non-video event terminates the open session)
                    if event_logs[i]["event_type"] not in video_event_types:
                        video_end_time = event_logs[i]["event_time"]
                        watch_duration = (video_end_time -
                                          video_start_time).seconds
                        video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                            video_start_time) + "_" + str(video_end_time)
                        if watch_duration > 5:
                            video_interaction_map[video_interaction_id] = {
                                "course_learner_id": course_learner_id,
                                "video_id": video_id,
                                "type": "video",
                                "watch_duration": watch_duration,
                                "times_forward_seek": times_forward_seek,
                                "duration_forward_seek":
                                duration_forward_seek,
                                "times_backward_seek": times_backward_seek,
                                "duration_backward_seek":
                                duration_backward_seek,
                                "times_speed_up": times_speed_up,
                                "times_speed_down": times_speed_down,
                                "start_time": video_start_time,
                                "end_time": video_end_time
                            }
                        # For video seek event
                        times_forward_seek = 0
                        duration_forward_seek = 0
                        times_backward_seek = 0
                        duration_backward_seek = 0
                        # For video speed change event
                        speed_change_last_time = ""
                        times_speed_up = 0
                        times_speed_down = 0
                        # For video general information
                        video_start_time = ""
                        video_id = ""
                        final_time = event_logs[i]["event_time"]
                        continue
        # Carry events after the last completed session into tomorrow's run
        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)
            remaining_learner_logs[course_learner_id] = new_logs
    # Output remaining logs
    # While the course is still running, persist the carry-over; on the
    # course's final day, delete the carry-over file instead
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_video_interaction_log_path, "w")
        output_file.write(
            json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_video_interaction_log_path)
    # Flush every reconstructed session into the video_interaction table
    for interaction_id in video_interaction_map.keys():
        video_interaction_id = interaction_id
        course_learner_id = video_interaction_map[interaction_id][
            "course_learner_id"]
        video_id = video_interaction_map[interaction_id]["video_id"]
        duration = process_null(
            video_interaction_map[interaction_id]["watch_duration"])
        times_forward_seek = process_null(
            video_interaction_map[interaction_id]["times_forward_seek"])
        duration_forward_seek = process_null(
            video_interaction_map[interaction_id]["duration_forward_seek"])
        times_backward_seek = process_null(
            video_interaction_map[interaction_id]["times_backward_seek"])
        duration_backward_seek = process_null(
            video_interaction_map[interaction_id]["duration_backward_seek"])
        times_speed_up = process_null(
            video_interaction_map[interaction_id]["times_speed_up"])
        times_speed_down = process_null(
            video_interaction_map[interaction_id]["times_speed_down"])
        start_time = video_interaction_map[interaction_id]["start_time"]
        end_time = video_interaction_map[interaction_id]["end_time"]
        if "times_pause" in video_interaction_map[interaction_id]:
            times_pause = process_null(
                video_interaction_map[interaction_id]["times_pause"])
            duration_pause = process_null(
                video_interaction_map[interaction_id]["duration_pause"])
        else:
            times_pause = 0
            duration_pause = 0
        array = [
            video_interaction_id, course_learner_id, video_id, duration,
            times_forward_seek, duration_forward_seek, times_backward_seek,
            duration_backward_seek, times_speed_up, times_speed_down,
            times_pause, duration_pause, start_time, end_time
        ]
        sql = "insert into video_interaction(interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # NOTE(review): the bare pass silently swallows ALL insert errors
        # (presumably meant to tolerate duplicate-key errors on reruns);
        # consider catching mysql.connector.Error and logging it.
        try:
            cursor.execute(sql, array)
        except Exception as e:
            pass
def main(argv):
    """Entry point: drive the whole per-course, per-day ETL pipeline.

    argv[0] is the path to an INI config file with a [data] section
    (course paths, update lists) and a [mysqld] section (DB credentials).
    For each course folder it (re)builds the database tables day by day,
    tracking progress in a per-course "course_processing_tracker" file so
    interrupted runs can resume.
    """
    # Read configs
    config = ConfigParser.ConfigParser()
    config.read(argv[0])
    # All the configs are read as string
    course_log_path = config.get("data", "path")
    remove_filtered_logs = config.get("data", "remove_filtered_logs")
    log_update_list = json.loads(config.get("data", "log_update_list"))
    metadata_update_list = json.loads(
        config.get("data", "metadata_update_list"))
    survey_update_map = json.loads(config.get("data", "survey_update_map"))
    user = config.get("mysqld", "user")
    password = config.get("mysqld", "password")
    host = config.get("mysqld", "host")
    database = config.get("mysqld", "database")
    # Database
    connection = mysql.connector.connect(user=user,
                                         password=password,
                                         host=host,
                                         database=database,
                                         charset='utf8mb4')
    cursor = connection.cursor()
    # Delete relevant records before updating the database
    print "Removing log records..."
    for course_code in log_update_list:
        print str("\t" + course_code)
        RemoveCourseRecords(course_log_path, course_code, "log", cursor)
    print "Removing metadata records..."
    for course_code in metadata_update_list:
        print str("\t" + course_code)
        RemoveCourseRecords(course_log_path, course_code, "metadata", cursor)
    print "Removing survey records..."
    for course_code in survey_update_map.keys():
        print str("\t" + course_code)
        RemoveCourseRecords(course_log_path, course_code, "survey", cursor)
    print
    folders = os.listdir(course_log_path)
    for folder in folders:
        # Every folder except the shared daily_logs dump is a course folder
        if folder != "daily_logs":
            # Only for Mac OS
            if folder == ".DS_Store":
                continue
            course_code = folder
            print "Processing\t" + course_code
            # A file named "course_processing_tracker" (JSON format) is created
            # for each course to keep track of the processing files
            tracker_path = str(course_log_path + course_code +
                               "/course_processing_tracker")
            if not os.path.exists(tracker_path):
                output_file = open(tracker_path, "w")
                tracker_map = {}
                # This value is used to keep track of the processing status for the course' daily log files,
                # i.e., "False" (not finished yet) and "True" (finished)
                tracker_map["status"] = False
                tracker_map["processed_dates"] = []
                tracker_map["num_processed_dates"] = 0
                output_file.write(json.dumps(tracker_map))
                output_file.close()
            # Read the "course_processing_tracker" file
            input_file = open(tracker_path, "r")
            tracker_map = json.loads(input_file.read())
            input_file.close()
            metadata_path = str(course_log_path + course_code + "/metadata/")
            # Determine whether the course_structure file is present
            mark = False
            files = os.listdir(metadata_path)
            for file in files:
                if "course_structure" in file:
                    mark = True
                    break
            if not mark:
                print "The course structure file is missing.\n"
                continue
            # Learner mode
            if course_code in metadata_update_list:
                print "Learner Mode processing..."
                learner_mode(metadata_path, course_code, cursor)
            # Survey mode
            survey_path = str(course_log_path + course_code + "/surveys/")
            if course_code in survey_update_map.keys():
                print "Survey Mode processing..."
                # survey_update_map values are [pre_index, post_index] pairs
                pre_id_index = int(survey_update_map[course_code][0])
                post_id_index = int(survey_update_map[course_code][1])
                survey_mode(metadata_path, survey_path, cursor, pre_id_index,
                            post_id_index)
            # Course already fully processed: nothing more to do
            if tracker_map["status"]:
                print
                continue
            # Retrieve the start/end date of the course
            course_metadata_map = ExtractCourseInformation(metadata_path)
            course_id = course_metadata_map["course_id"]
            start_date = course_metadata_map["start_date"]
            end_date = course_metadata_map["end_date"]
            current_date = start_date
            # Walk every course day, skipping dates already in the tracker
            while current_date <= end_date:
                current_date_string = str(current_date)[0:10]
                if current_date_string not in tracker_map["processed_dates"]:
                    daily_log_file = str("delftx-edx-events-" +
                                         current_date_string + ".log.gz")
                    if os.path.exists(
                            str(course_log_path + "/daily_logs/" +
                                daily_log_file)):
                        print daily_log_file
                        # Decompress log files
                        unzip_file_path = str(course_log_path + course_code +
                                              "/unzip_daily_logs/")
                        if not os.path.exists(unzip_file_path):
                            os.mkdir(unzip_file_path)
                        output_path = str(unzip_file_path +
                                          daily_log_file[0:-3])
                        # Filter the day's global log down to this course's
                        # records (skipped if a filtered file already exists)
                        if not os.path.exists(output_path):
                            output_file = open(output_path, 'w')
                            with gzip.open(
                                    str(course_log_path + "/daily_logs/" +
                                        daily_log_file), 'r') as f:
                                for line in f:
                                    jsonObject = json.loads(line)
                                    if course_id in jsonObject["context"][
                                            "course_id"]:
                                        output_file.write(line)
                            output_file.close()
                        daily_log_path = output_path
                        # Video_interaction table
                        # print "1.\t Video_interaction table processing..."
                        remaining_video_interaction_log_path = course_log_path + course_code + "/remaining_video_interaction_logs"
                        video_interaction(metadata_path, daily_log_path,
                                          remaining_video_interaction_log_path,
                                          cursor)
                        # Quiz mode
                        # print "2.\t Quiz mode processing..."
                        quiz_mode(daily_log_path, cursor)
                        # Quiz_sessions table
                        # print "3.\t Quiz_sessions table processing..."
                        remaining_quiz_session_log_path = course_log_path + course_code + "/remaining_quiz_session_logs"
                        quiz_sessions(metadata_path, daily_log_path,
                                      remaining_quiz_session_log_path, cursor)
                        # Forum_interaction table
                        # print "4.\t Forum_interaction table processing..."
                        forum_interaction(metadata_path, daily_log_path,
                                          cursor)
                        # Forum_sessions table
                        # print "5.\t Forum_sessions table processing..."
                        remaining_forum_session_log_path = course_log_path + course_code + "/remaining_forum_session_logs"
                        forum_sessions(metadata_path, daily_log_path,
                                       remaining_forum_session_log_path,
                                       cursor)
                        # Sessions table
                        # print "6.\t Sessions table processing..."
                        remaining_session_log_path = course_log_path + course_code + "/remaining_session_logs"
                        sessions(metadata_path, daily_log_path,
                                 remaining_session_log_path, cursor)
                    # NOTE(review): the date is marked processed whether or
                    # not the day's log file existed -- TODO confirm this is
                    # the intended resume semantics.
                    tracker_map["processed_dates"].append(current_date_string)
                current_date = getNextDay(current_date)
            # Mark the course finished once every day has been processed
            if len(tracker_map["processed_dates"]) == getDayDiff(
                    start_date, end_date) + 1:
                tracker_map["status"] = True
            # Persist the tracker only when progress was actually made
            if tracker_map["num_processed_dates"] != len(
                    tracker_map["processed_dates"]):
                tracker_map["num_processed_dates"] = len(
                    tracker_map["processed_dates"])
                output_file = open(tracker_path, "w")
                output_file.write(json.dumps(tracker_map))
                output_file.close()
    # Delete the decompressed files
    # NOTE(review): this removes files from the shared daily_logs directory
    # (the .gz sources), not the per-course unzip_daily_logs folder -- verify
    # the "remove_filtered_logs" flag is really meant to delete the originals.
    if remove_filtered_logs == "1":
        log_files = os.listdir(str(course_log_path + "/daily_logs/"))
        for log_file in log_files:
            os.remove(str(course_log_path + "/daily_logs/" + log_file))
    print