Example #1
0
# Event types that count as quiz activity even when the event_type itself is
# not a problem URL.
_QUIZ_SUBMISSION_EVENTS = frozenset([
    # Problem check
    "problem_check",
    "save_problem_check",
    "problem_check_fail",
    "save_problem_check_fail",
    # Emitted each time a user selects Check for a problem and it is graded
    # successfully.
    "problem_graded",
    # Emitted when a problem is (un)successfully rescored.
    "problem_rescore",
    "problem_rescore_fail",
    # Problem reset
    "problem_reset",
    "reset_problem",
    "reset_problem_fail",
    # Emitted after a user saves a problem.
    "problem_save",
    "save_problem_fail",
    "save_problem_success",
    # Show answer
    "problem_show",
    "showanswer",
])


def _is_quiz_event(event_type):
    """Return True if *event_type* is a problem URL or a submission event."""
    return ("problem+block" in event_type
            or "_problem;_" in event_type
            or event_type in _QUIZ_SUBMISSION_EVENTS)


def _quiz_question_id(event_type):
    """Extract the question id embedded in a problem URL, or return None.

    Bare submission events (e.g. "problem_check") carry no question id in the
    event type, so None is returned for them instead of reusing whatever id
    the previous event happened to produce.
    """
    parts = event_type.split("/")
    question_id = None
    if "problem+block" in event_type and len(parts) > 4:
        question_id = parts[4]
    # Checked second on purpose: this form wins when both markers are present.
    if "_problem;_" in event_type and len(parts) > 6:
        question_id = parts[6].replace(";_", "/")
    return question_id


def _record_quiz_session(sessions_by_id, session_id, course_learner_id,
                         start_time, end_time):
    """Append one (start, end) interval to the entry for *session_id*."""
    interval = {"start_time": start_time, "end_time": end_time}
    if session_id in sessions_by_id:
        sessions_by_id[session_id]["time_array"].append(interval)
    else:
        sessions_by_id[session_id] = {
            "course_learner_id": course_learner_id,
            "time_array": [interval],
        }


def _merge_close_intervals(time_array, max_gap):
    """Merge consecutive intervals whose gap does not exceed *max_gap*.

    *time_array* must be non-empty; intervals are processed in list order.
    """
    merged = []
    start_time = time_array[0]["start_time"]
    end_time = time_array[0]["end_time"]
    for interval in time_array[1:]:
        if interval["start_time"] > end_time + max_gap:
            merged.append({"start_time": start_time, "end_time": end_time})
            start_time = interval["start_time"]
        end_time = interval["end_time"]
    merged.append({"start_time": start_time, "end_time": end_time})
    return merged


def _split_quiz_sessions(event_logs, course_learner_id, child_parent_map,
                         sessions_by_id, max_gap):
    """Walk one learner's sorted events and record the closed quiz sessions.

    A session opens on a quiz event whose question id is a key of
    *child_parent_map*, is extended by later events within *max_gap*, and is
    closed either by a quiz event after a longer gap or by any non-quiz event.
    A session still open when the events run out is not recorded here.

    Returns the time of the last event that closed a session, or "" when no
    session was closed.
    """
    session_id = ""
    start_time = ""
    end_time = ""
    final_time = ""

    for log in event_logs:
        event_type = log["event_type"]
        event_time = log["event_time"]
        quiz_event = _is_quiz_event(event_type)

        if session_id == "":
            # No open session: only an identifiable quiz event can open one.
            if quiz_event:
                question_id = _quiz_question_id(event_type)
                if question_id in child_parent_map:
                    session_id = ("quiz_session_" + child_parent_map[question_id]
                                  + "_" + course_learner_id)
                    start_time = event_time
                    end_time = event_time
            continue

        if quiz_event:
            if event_time <= end_time + max_gap:
                end_time = event_time
                continue

            # Gap too long: close the current session and try to open a new
            # one on this very event.
            _record_quiz_session(sessions_by_id, session_id, course_learner_id,
                                 start_time, end_time)
            final_time = event_time

            question_id = _quiz_question_id(event_type)
            if question_id in child_parent_map:
                session_id = ("quiz_session_" + child_parent_map[question_id]
                              + "_" + course_learner_id)
                start_time = event_time
                end_time = event_time
            else:
                session_id = ""
                start_time = ""
                end_time = ""
        else:
            # A non-quiz event ends the session; it still extends the end time
            # when it happened soon enough after the last quiz activity.
            if event_time <= end_time + max_gap:
                end_time = event_time

            _record_quiz_session(sessions_by_id, session_id, course_learner_id,
                                 start_time, end_time)
            final_time = event_time

            session_id = ""
            start_time = ""
            end_time = ""

    return final_time


def quiz_sessions(metadata_path, log_path, cursor):
    """Derive quiz sessions from the daily logs and insert them into the DB.

    For every day from the course start date up to and including the end date,
    each file in *log_path* whose name contains that date is parsed (one JSON
    event per line), events are grouped per learner and split into quiz
    sessions. Intervals of the same session id that are at most 30 minutes
    apart are merged, and every resulting interval longer than 5 seconds is
    inserted into the ``quiz_sessions`` table.

    Args:
        metadata_path: passed to ``ExtractCourseInformation``; the returned
            mapping must provide "start_date", "end_date" and
            "child_parent_map".
        log_path: directory holding the daily log files.
        cursor: DB-API style cursor used for the inserts (``%s`` paramstyle).

    Notes:
        * Events whose ``user_id`` is missing, empty or None are ignored.
        * The sort relies on ``cmp_datetime`` and the ``cmp=`` argument, which
          only exists in Python 2.
    """
    course_metadata_map = ExtractCourseInformation(metadata_path)
    child_parent_map = course_metadata_map["child_parent_map"]

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    log_files = os.listdir(log_path)
    max_gap = datetime.timedelta(hours=0.5)

    sessions_by_id = {}
    # Events kept from the previous file because they happened at or after
    # the last session boundary of their learner.
    carried_over_logs = {}

    while current_date != end_next_date:

        for file_name in log_files:
            if current_date not in file_name:
                continue

            print(file_name)

            learner_logs = carried_over_logs
            carried_over_logs = {}

            # os.path.join works whether or not log_path ends with a separator.
            with open(os.path.join(log_path, file_name), "r") as input_file:
                for line in input_file:
                    if not line.strip():
                        continue

                    json_object = json.loads(line)
                    context = json_object["context"]

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in context:
                        continue

                    global_learner_id = context["user_id"]
                    event_type = str(json_object["event_type"])

                    if global_learner_id is None or global_learner_id == "":
                        continue

                    course_learner_id = (context["course_id"] + "_"
                                         + str(global_learner_id))

                    # Keep only "YYYY-MM-DDTHH:MM:SS"; fractions and the
                    # timezone suffix are dropped.
                    event_time = json_object["time"][0:19].replace("T", " ")
                    event_time = datetime.datetime.strptime(
                        event_time, "%Y-%m-%d %H:%M:%S")

                    learner_logs.setdefault(course_learner_id, []).append(
                        {"event_time": event_time, "event_type": event_type})

            # For quiz session separation
            for course_learner_id, event_logs in learner_logs.items():
                event_logs.sort(cmp=cmp_datetime,
                                key=operator.itemgetter("event_time"))

                final_time = _split_quiz_sessions(
                    event_logs, course_learner_id, child_parent_map,
                    sessions_by_id, max_gap)

                if final_time != "":
                    carried_over_logs[course_learner_id] = [
                        log for log in event_logs
                        if log["event_time"] >= final_time
                    ]

        current_date = getNextDay(current_date)

    sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"

    for session_id, session in sessions_by_id.items():
        # To compress the session event_logs
        if len(session["time_array"]) > 1:
            session["time_array"] = _merge_close_intervals(
                session["time_array"], max_gap)

        course_learner_id = session["course_learner_id"]

        for interval in session["time_array"]:
            start_time = interval["start_time"]
            end_time = interval["end_time"]
            if start_time >= end_time:
                continue

            elapsed = end_time - start_time
            duration = elapsed.days * 24 * 60 * 60 + elapsed.seconds
            if duration <= 5:
                continue

            final_session_id = (session_id + "_" + str(start_time)
                                + "_" + str(end_time))
            data = (final_session_id, course_learner_id, start_time, end_time,
                    process_null(duration))
            cursor.execute(sql, data)
    ''' 
Example #2
0
def _store_session(cursor, course_learner_id, start_time, end_time):
    """Write one session row if it lasted longer than five seconds.

    The insert stays best-effort (a failing row must not abort the whole day),
    but the failure is logged instead of being silently discarded.
    """
    import logging

    elapsed = end_time - start_time
    duration = elapsed.days * 24 * 60 * 60 + elapsed.seconds
    if duration <= 5:
        return

    session_id = course_learner_id + "_" + str(start_time) + "_" + str(end_time)
    array = (session_id, course_learner_id, start_time, end_time,
             process_null(duration))
    sql = "replace into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
    try:
        cursor.execute(sql, array)
    except Exception:
        logging.getLogger(__name__).warning(
            "Could not store session %s", session_id, exc_info=True)


def sessions(metadata_path, daily_log_path, remaining_session_log_path,
             cursor):
    """Split one daily log into learner sessions and store them.

    Events left over from the previous run are loaded from
    *remaining_session_log_path* (if it exists) and combined with the events
    of *daily_log_path* (one JSON event per line). Per learner, the sorted
    events are cut into sessions: a session ends when the next event is more
    than 30 minutes after the previous one, or on a "page_close" event.
    Sessions longer than 5 seconds are written with ``replace into sessions``.

    Events newer than the last session boundary are written back to
    *remaining_session_log_path*, unless the daily log is the one for the
    course end date, in which case that file is removed.

    Args:
        metadata_path: passed to ``ExtractCourseInformation``; the returned
            mapping must provide "end_date".
        daily_log_path: path of the daily log file; its name must contain the
            date (YYYY-MM-DD) of the events it holds, events of other dates
            are skipped.
        remaining_session_log_path: JSON file carrying unfinished events from
            one run to the next.
        cursor: DB-API style cursor (``%s`` paramstyle).

    Notes:
        * The sort relies on ``cmp_datetime`` and the ``cmp=`` argument, which
          only exists in Python 2.
        * NOTE(review): leftovers are selected with a strict ``>`` against the
          boundary time, so the event that opened a new session after a gap is
          not carried over, and learners without any closed session carry
          nothing over. Left unchanged - confirm whether that is intended.
    """
    utc = pytz.UTC
    max_gap = datetime.timedelta(hours=0.5)

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]

    learner_logs = {}
    remaining_learner_logs = {}

    # Read remaining event logs
    if os.path.exists(remaining_session_log_path):
        with open(remaining_session_log_path) as remaining_input_file:
            learner_logs = json.loads(remaining_input_file.read(),
                                      object_hook=json_util.object_hook)

    with open(daily_log_path, "r") as input_file:
        for line in input_file:
            if not line.strip():
                continue

            json_object = json.loads(line)
            context = json_object["context"]

            # Skip records without user_id
            global_learner_id = context.get("user_id")
            if global_learner_id is None or global_learner_id == "":
                continue

            event_type = str(json_object["event_type"])
            course_learner_id = (context["course_id"] + "_"
                                 + str(global_learner_id))

            event_time = json_object["time"]

            # Check whether the event record belongs to that day
            if event_time[0:10] not in daily_log_path:
                continue

            # Keep only "YYYY-MM-DDTHH:MM:SS" and mark the result as UTC.
            event_time = event_time[0:19].replace("T", " ")
            event_time = datetime.datetime.strptime(event_time,
                                                    "%Y-%m-%d %H:%M:%S")
            event_time = event_time.replace(tzinfo=utc)

            learner_logs.setdefault(course_learner_id, []).append({
                "event_time": event_time,
                "event_type": event_type
            })

    # For session separation
    for course_learner_id, event_logs in learner_logs.items():

        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))

        start_time = ""
        end_time = ""
        final_time = ""

        for log in event_logs:
            event_time = log["event_time"]

            if start_time == "":
                # Initialization
                start_time = event_time
                end_time = event_time

            elif event_time > end_time + max_gap:
                # Inactivity gap: close the session, start a new one here.
                _store_session(cursor, course_learner_id, start_time, end_time)
                final_time = event_time
                start_time = event_time
                end_time = event_time

            elif log["event_type"] == "page_close":
                # Explicit end of the session, page_close included.
                _store_session(cursor, course_learner_id, start_time,
                               event_time)
                final_time = event_time
                start_time = ""
                end_time = ""

            else:
                end_time = event_time

        if final_time != "":
            remaining_learner_logs[course_learner_id] = [
                log for log in event_logs if log["event_time"] > final_time
            ]

    # Output remaining logs
    if str(end_date)[0:10] not in daily_log_path:
        with open(remaining_session_log_path, "w") as output_file:
            output_file.write(
                json.dumps(remaining_learner_logs, default=json_util.default))
    elif os.path.exists(remaining_session_log_path):
        # The file is absent when no earlier run wrote leftovers.
        os.remove(remaining_session_log_path)
Example #3
0
def quiz_mode(metadata_path, log_path, cursor):
    """Fill the quiz_questions, submissions and assessments tables.

    First, one ``quiz_questions`` row is inserted per entry of the course's
    "quiz_question_map": the question's ancestors are followed through
    "child_parent_map" until one is found in "block_type_map" (its value
    becomes the question type); the first due date found on the way in
    "element_time_map_due" becomes the question's due date.

    Then, for every day from the course start date up to and including the end
    date, each file in *log_path* whose name contains that date is read (one
    JSON event per line). Every "problem_check" event whose ``event`` field is
    a dict with a non-empty "problem_id" yields a ``submissions`` row and,
    when both "grade" and "max_grade" are present, an ``assessments`` row.

    Args:
        metadata_path: passed to ``ExtractCourseInformation``.
        log_path: directory holding the daily log files.
        cursor: DB-API style cursor used for the inserts (``%s`` paramstyle).

    Notes:
        Events whose ``user_id`` is missing, empty or None are ignored.
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    child_parent_map = course_metadata_map["child_parent_map"]

    question_sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"

    for question_id in quiz_question_map:

        question_weight = quiz_question_map[question_id]
        question_due = ""

        # Climb the ancestors until one has a known block type, remembering
        # the first due date met on the way (the typed ancestor included).
        quiz_question_parent = child_parent_map[question_id]
        while True:
            if question_due == "" and quiz_question_parent in element_time_map_due:
                question_due = element_time_map_due[quiz_question_parent]
            if quiz_question_parent in block_type_map:
                break
            quiz_question_parent = child_parent_map[quiz_question_parent]

        quiz_question_type = block_type_map[quiz_question_parent]
        data = (question_id, quiz_question_type, question_weight,
                process_null(question_due))
        cursor.execute(question_sql, data)

    # Processing events data.
    # Only the server-side "problem_check" event counts as a submission; the
    # other problem events (save, reset, rescore, show answer, ...) are
    # deliberately not included.
    submission_event_collection = ("problem_check",)

    submission_sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values (%s,%s,%s,%s)"
    assessment_sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values (%s,%s,%s,%s)"

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    log_files = os.listdir(log_path)

    # Running counter that keeps submission ids unique.
    submission_uni_index = 0

    while current_date != end_next_date:

        for file_name in log_files:
            if current_date not in file_name:
                continue

            print(file_name)

            # os.path.join works whether or not log_path ends with a separator.
            with open(os.path.join(log_path, file_name), "r") as input_file:
                for line in input_file:
                    if not line.strip():
                        continue

                    json_object = json.loads(line)

                    if json_object["event_type"] not in submission_event_collection:
                        continue

                    context = json_object["context"]

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in context:
                        continue

                    global_learner_id = context["user_id"]
                    if global_learner_id is None or global_learner_id == "":
                        continue

                    course_learner_id = (context["course_id"] + "_"
                                         + str(global_learner_id))

                    # Keep only "YYYY-MM-DDTHH:MM:SS"; fractions and the
                    # timezone suffix are dropped.
                    event_time = json_object["time"][0:19].replace("T", " ")
                    event_time = datetime.datetime.strptime(
                        event_time, "%Y-%m-%d %H:%M:%S")

                    question_id = ""
                    grade = ""
                    max_grade = ""

                    event = json_object["event"]
                    if isinstance(event, dict):
                        question_id = event.get("problem_id", "")

                        # The fields "grade" and "max_grade" are specific to
                        # submission event "problem_check"
                        if "grade" in event and "max_grade" in event:
                            grade = event["grade"]
                            max_grade = event["max_grade"]

                    if question_id == "":
                        continue

                    submission_id = (course_learner_id + "_" + question_id
                                     + "_" + str(submission_uni_index))
                    submission_uni_index = submission_uni_index + 1

                    # For submissions
                    data = (submission_id, course_learner_id, question_id,
                            event_time)
                    cursor.execute(submission_sql, data)

                    # For assessments (the assessment id is the submission id)
                    if grade != "" and max_grade != "":
                        data = (submission_id, course_learner_id, max_grade,
                                grade)
                        cursor.execute(assessment_sql, data)

        current_date = getNextDay(current_date)

    ''' 
def quiz_sessions(metadata_path, daily_log_path, remaining_forum_session_log_path, cursor):
    """Derive quiz sessions from one daily log file and insert them via ``cursor``.

    Steps:
      1. Load course metadata with ``ExtractCourseInformation(metadata_path)``;
         its ``"end_date"`` and ``"child_parent_map"`` entries are used.
      2. If ``remaining_forum_session_log_path`` exists, load the per-learner
         events carried over from a previous run.
      3. Read ``daily_log_path`` (one JSON object per line) and append every
         event that has a ``user_id`` and whose date (first 10 characters of
         ``"time"``) occurs in ``daily_log_path``.
      4. Per learner, walk the sorted events.  A session opens at an event
         whose ``event_type`` contains ``"problem+block"`` or ``"_problem;_"``
         and whose question id is a key of ``child_parent_map``.  It is
         extended by quiz events at most 30 minutes after the previous one,
         and closed by a larger gap or by any non-quiz event.
      5. Merge intervals of the same session id that are at most 30 minutes
         apart, then insert every interval with ``start < end`` and a
         duration greater than 5 into the ``quiz_sessions`` table.
      6. Unless the course end date occurs in ``daily_log_path``, write the
         events that follow each learner's last closed session back to
         ``remaining_forum_session_log_path``; otherwise delete that file.

    Args:
        metadata_path: passed unchanged to ``ExtractCourseInformation``.
        daily_log_path: path of the daily log; also used as the string the
            event date and the course end date are searched in.
        remaining_forum_session_log_path: path of the JSON carry-over file.
        cursor: DB cursor exposing ``execute(sql, params)``.
    """
    import logging

    utc = pytz.UTC

    # Quiz events further apart than this do not extend the same interval.
    session_gap = datetime.timedelta(hours=0.5)

    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    child_parent_map = course_metadata_map["child_parent_map"]

    # Quiz-related events
    quiz_event_types = frozenset([
        # Problem check
        "problem_check",     # Server
        "save_problem_check",
        "problem_check_fail",
        "save_problem_check_fail",
        # The server emits a problem_graded event each time a user selects
        # Check for a problem and it is graded successfully.
        "problem_graded",
        # The server emits problem_rescore events when a problem is
        # successfully rescored.
        "problem_rescore",
        "problem_rescore_fail",
        "problem_reset",     # event_source: server
        "reset_problem",
        "reset_problem_fail",
        # The server emits problem_save events after a user saves a problem.
        "problem_save",      # event_source: server
        "save_problem_fail",
        "save_problem_success",
        # Show answer
        "problem_show",
        "showanswer",
        "edx.problem.hint.demandhint_displayed",
        "edx.problem.hint.feedback_displayed",
    ])

    sessions = {}
    remaining_learner_logs = {}

    def question_id_from(event_type, previous_question_id):
        # Pull the question id out of a path-like event_type.  When neither
        # marker is present (bare server events such as "problem_check") the
        # previously seen id is returned unchanged.
        question_id = previous_question_id
        parts = event_type.split("/")
        if "problem+block" in event_type:
            question_id = parts[4]
        if "_problem;_" in event_type:
            question_id = parts[6].replace(";_", "/")
        return question_id

    def record_interval(session_id, course_learner_id, start_time, end_time):
        # Append one closed [start_time, end_time] interval to its session.
        entry = sessions.setdefault(
            session_id,
            {"course_learner_id": course_learner_id, "time_array": []})
        entry["time_array"].append({"start_time": start_time, "end_time": end_time})

    # Read remaining event logs (events carried over from a previous run)
    learner_logs = {}
    if os.path.exists(remaining_forum_session_log_path):
        with open(remaining_forum_session_log_path) as remaining_input_file:
            learner_logs = json.loads(remaining_input_file.read(),
                                      object_hook=json_util.object_hook)

    with open(daily_log_path, "r") as input_file:
        for line in input_file:

            jsonObject = json.loads(line)
            context = jsonObject["context"]

            # Skip records without user_id
            if "user_id" not in context or context["user_id"] == "" or context["user_id"] is None:
                continue

            # For quiz session separation
            global_learner_id = context["user_id"]
            event_type = str(jsonObject["event_type"])
            course_learner_id = context["course_id"] + "_" + str(global_learner_id)

            event_time = jsonObject["time"]

            # Check whether the event record belongs to that day
            if event_time[0:10] not in daily_log_path:
                continue

            event_time = datetime.datetime.strptime(
                event_time[0:19].replace("T", " "), "%Y-%m-%d %H:%M:%S")
            event_time = event_time.replace(tzinfo=utc)

            learner_logs.setdefault(course_learner_id, []).append(
                {"event_time": event_time, "event_type": event_type})

    # For quiz session separation
    for course_learner_id, event_logs in learner_logs.items():

        # Sorting
        event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))

        # "" marks "no session currently open".
        session_id = ""
        start_time = ""
        end_time = ""

        # Time of the event that last closed a session for this learner.
        final_time = ""
        question_id = ""

        for log in event_logs:
            event_type = log["event_type"]
            event_time = log["event_time"]
            names_problem = "problem+block" in event_type or "_problem;_" in event_type

            if session_id == "":
                # No open session: only an event naming a known question opens one.
                if names_problem:
                    question_id = question_id_from(event_type, question_id)
                    if question_id in child_parent_map:
                        session_id = "quiz_session_" + child_parent_map[question_id] + "_" + course_learner_id
                        start_time = event_time
                        end_time = event_time

            elif names_problem or event_type in quiz_event_types:
                if event_time > end_time + session_gap:
                    # Gap too large: close the open interval, then try to
                    # open a new session at this event.
                    record_interval(session_id, course_learner_id, start_time, end_time)
                    final_time = event_time

                    # NOTE(review): for events without a question id in their
                    # event_type, the id seen earlier is reused here.
                    question_id = question_id_from(event_type, question_id)
                    if question_id in child_parent_map:
                        session_id = "quiz_session_" + child_parent_map[question_id] + "_" + course_learner_id
                        start_time = event_time
                        end_time = event_time
                    else:
                        session_id = ""
                        start_time = ""
                        end_time = ""
                else:
                    end_time = event_time

            else:
                # A non-quiz event closes the session; it still extends the
                # interval when it falls within the gap.
                if event_time <= end_time + session_gap:
                    end_time = event_time

                record_interval(session_id, course_learner_id, start_time, end_time)
                final_time = event_time

                session_id = ""
                start_time = ""
                end_time = ""

        # NOTE(review): learners for whom no session was closed get no entry
        # here, so none of their events are carried over -- confirm intended.
        if final_time != "":
            remaining_learner_logs[course_learner_id] = [
                log for log in event_logs if log["event_time"] > final_time]

    # Output remaining logs
    if str(end_date)[0:10] not in daily_log_path:
        with open(remaining_forum_session_log_path, "w") as output_file:
            output_file.write(json.dumps(remaining_learner_logs, default=json_util.default))
    elif os.path.exists(remaining_forum_session_log_path):
        # The carry-over file may never have been written; only remove it
        # when it is actually there.
        os.remove(remaining_forum_session_log_path)

    # To compress the session event_logs: merge intervals of one session id
    # that lie within the gap of each other.
    for session in sessions.values():
        time_array = session["time_array"]
        if len(time_array) > 1:
            merged = []
            start_time = time_array[0]["start_time"]
            end_time = time_array[0]["end_time"]
            for interval in time_array[1:]:
                if interval["start_time"] > end_time + session_gap:
                    merged.append({"start_time": start_time, "end_time": end_time})
                    start_time = interval["start_time"]
                end_time = interval["end_time"]
            merged.append({"start_time": start_time, "end_time": end_time})
            session["time_array"] = merged

    sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
    for session_id, session in sessions.items():
        course_learner_id = session["course_learner_id"]
        for interval in session["time_array"]:
            start_time = interval["start_time"]
            end_time = interval["end_time"]
            if not start_time < end_time:
                continue

            elapsed = end_time - start_time
            duration = process_null(elapsed.days * 24 * 60 * 60 + elapsed.seconds)
            if duration > 5:
                final_session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                array = (final_session_id, course_learner_id, start_time, end_time, duration)
                try:
                    cursor.execute(sql, array)
                except Exception as e:
                    # Best effort: one failed insert must not abort the day's
                    # processing, but it should not vanish silently either.
                    logging.getLogger(__name__).warning(
                        "quiz_sessions insert failed for %s: %s", final_session_id, e)
Example #5
0
def learner_mode(metadata_path, course_code, cursor):
    """Load course metadata and learner records into the database via ``cursor``.

    Course information comes from ``ExtractCourseInformation(metadata_path)``;
    learner information comes from the tab-separated files in
    ``metadata_path`` whose names contain ``student_courseenrollment``,
    ``auth_user-``, ``certificates_generatedcertificate`` and
    ``auth_userprofile``.  The first line of each such file is discarded
    (presumably a header row).

    Rows are inserted into ``courses``, ``course_elements``,
    ``quiz_questions``, ``learner_index``, ``course_learner`` and
    ``learner_demographic``.  Finally ``forum_interaction_mongo`` is called
    when ``course_code`` contains ``"1T2015"``, ``"2014"`` or ``"2013"``.

    Args:
        metadata_path: directory holding the metadata files.  File names are
            appended to it directly, so it is assumed to end with a path
            separator -- TODO confirm against callers.
        course_code: course identifier string, only inspected for the
            substrings listed above.
        cursor: DB cursor exposing ``execute(sql, params)``.
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    course_start_time = course_metadata_map["start_time"]
    course_end_time = course_metadata_map["end_time"]

    array = (course_metadata_map["course_id"],
             course_metadata_map["course_name"],
             course_start_time,
             course_end_time)
    sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
    cursor.execute(sql, array)

    # Course_element table
    element_type_map = course_metadata_map["element_type_map"]
    sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
    for element_id, element_start_time in course_metadata_map["element_time_map"].items():

        # Some contents are released one hour earlier than the hour of the
        # start time.  For example, the start time is 2015-10-15 09:00:00
        # while the 2nd week contents are released at 2015-10-22 08:00:00;
        # with full datetimes those contents would count as 1st week.
        # Comparing dates instead of datetimes avoids that.
        week = process_null(
            getDayDiff(course_start_time.date(),
                       element_start_time.date()) / 7 + 1)

        array = (element_id, element_type_map[element_id], week,
                 course_metadata_map["course_id"])
        cursor.execute(sql, array)

    # Quiz_question table
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    child_parent_map = course_metadata_map["child_parent_map"]

    sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
    for question_id in quiz_question_map:

        question_weight = quiz_question_map[question_id]
        quiz_question_parent = child_parent_map[question_id]

        question_due = ""
        if quiz_question_parent in element_time_map_due:
            question_due = element_time_map_due[quiz_question_parent]

        # Climb the ancestors until one has a block type; the first non-empty
        # due time found on the way up wins.
        while quiz_question_parent not in block_type_map:
            quiz_question_parent = child_parent_map[quiz_question_parent]
            if question_due == "" and quiz_question_parent in element_time_map_due:
                question_due = element_time_map_due[quiz_question_parent]

        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)

        array = (question_id, quiz_question_type, question_weight,
                 question_due)
        cursor.execute(sql, array)

    files = os.listdir(metadata_path)

    # Learner_demographic table
    learner_mail_map = {}

    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}

    # Enrolled learners set
    enrolled_learner_set = set()

    # Keeps the course id of the last enrollment row read; reused below when
    # building ids for the demographic table.
    course_id = ""

    # Processing student_courseenrollment data
    sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
    for file_name in files:
        if "student_courseenrollment" not in file_name:
            continue
        with open(str(metadata_path + file_name), "r") as input_file:
            next(input_file, None)  # discard the first line
            for line in input_file:
                record = line.split("\t")
                global_learner_id = record[1]
                course_id = record[2]
                enrollment_time = datetime.datetime.strptime(
                    record[3], "%Y-%m-%d %H:%M:%S")
                course_learner_id = course_id + "_" + global_learner_id

                # Only learners for whom cmp_datetime(end_time, enrollment)
                # is truthy are kept (presumably: enrolled before the course
                # ended -- verify against cmp_datetime).
                if cmp_datetime(course_end_time, enrollment_time):

                    enrolled_learner_set.add(global_learner_id)

                    array = (global_learner_id, course_id, course_learner_id)
                    cursor.execute(sql, array)

                    course_learner_map[global_learner_id] = course_learner_id
                    learner_enrollment_time_map[global_learner_id] = enrollment_time

    # Processing auth_user data
    for file_name in files:
        if "auth_user-" not in file_name:
            continue
        with open(str(metadata_path + file_name), "r") as input_file:
            next(input_file, None)  # discard the first line
            for line in input_file:
                record = line.split("\t")
                if record[0] in enrolled_learner_set:
                    learner_mail_map[record[0]] = record[4]

    # Processing certificates_generatedcertificate data
    sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
    for file_name in files:
        if "certificates_generatedcertificate" not in file_name:
            continue
        with open(str(metadata_path + file_name), "r") as input_file:
            next(input_file, None)  # discard the first line
            for line in input_file:
                record = line.split("\t")
                global_learner_id = record[1]
                final_grade = process_null(record[3])
                enrollment_mode = record[14].replace("\n", "")
                certificate_status = record[7]

                # Rows for learners that were not indexed above are ignored.
                if global_learner_id in course_learner_map:
                    register_time = process_null(
                        learner_enrollment_time_map[global_learner_id])
                    array = (course_learner_map[global_learner_id],
                             final_grade, enrollment_mode, certificate_status,
                             register_time)
                    cursor.execute(sql, array)

    # Processing auth_userprofile data
    sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
    for file_name in files:
        if "auth_userprofile" not in file_name:
            continue
        with open(str(metadata_path + file_name), "r") as input_file:
            next(input_file, None)  # discard the first line
            for line in input_file:
                record = line.split("\t")
                global_learner_id = record[1]
                gender = record[7]
                # NOTE(review): process_null is applied twice here, as in the
                # original code -- likely redundant, confirm before removing.
                year_of_birth = process_null(process_null(record[9]))
                level_of_education = record[10]
                country = record[13]

                course_learner_id = process_null(course_id + "_" +
                                                 global_learner_id)

                if global_learner_id in enrolled_learner_set:
                    # NOTE(review): raises KeyError when an enrolled learner
                    # has no row in the auth_user file.
                    array = (course_learner_id, gender, year_of_birth,
                             level_of_education, country,
                             learner_mail_map[global_learner_id])
                    cursor.execute(sql, array)

    # Generating forum_interaction records for courses whose code contains
    # 1T2015, 2014 or 2013
    if any(tag in course_code for tag in ("1T2015", "2014", "2013")):
        forum_interaction_mongo(metadata_path, cursor)
Example #6
0
def video_interaction(metadata_path, log_path, cursor):
    
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    
    current_date = course_metadata_map["start_date"]   
    end_next_date = getNextDay(course_metadata_map["end_date"])
    
    video_interaction_map = {}
    
    # Video-related event types
    video_event_types = []

    video_event_types.append("hide_transcript")
    video_event_types.append("edx.video.transcript.hidden")
    
    video_event_types.append("edx.video.closed_captions.hidden")
    video_event_types.append("edx.video.closed_captions.shown")
    
    video_event_types.append("load_video")
    video_event_types.append("edx.video.loaded")
    
    video_event_types.append("pause_video")
    video_event_types.append("edx.video.paused")
    
    video_event_types.append("play_video")
    video_event_types.append("edx.video.played")
    
    video_event_types.append("seek_video")
    video_event_types.append("edx.video.position.changed")
    
    video_event_types.append("show_transcript")
    video_event_types.append("edx.video.transcript.shown")
    
    video_event_types.append("speed_change_video")
    
    video_event_types.append("stop_video")
    video_event_types.append("edx.video.stopped")
    
    video_event_types.append("video_hide_cc_menu")
    video_event_types.append("edx.video.language_menu.hidden")
    
    video_event_types.append("video_show_cc_menu")
    video_event_types.append("edx.video.language_menu.shown")
    
    '''
    # Navigation-related event types
    navigation_event_types = []
    navigation_event_types.append("page_close")
    navigation_event_types.append("seq_goto")
    navigation_event_types.append("seq_next")
    navigation_event_types.append("seq_prev")
    '''
    
    learner_video_event_logs = {}
    updated_learner_video_event_logs = {}
    
    log_files = os.listdir(log_path)
    
    while True:
        
        if current_date == end_next_date:
            break;
        
        for file in log_files:           
            if current_date in file:
                
                print file

                learner_video_event_logs.clear()
                learner_video_event_logs = updated_learner_video_event_logs.copy()
                updated_learner_video_event_logs.clear()
                
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_video_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)
                
                input_file = open(log_path + file,"r")
                lines = input_file.readlines()
                        
                for line in lines:
                    
                    jsonObject = json.loads(line)
                    
                    if jsonObject["event_type"] in video_event_types:
                        
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        
                        global_learner_id = jsonObject["context"]["user_id"]
                        
                        if global_learner_id != "":
                            
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)
                            
                            video_id = ""
                        
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                        
                            event_type = jsonObject["event_type"]
                        
                            # For seek event
                            new_time = 0
                            old_time = 0
                        
                            # For speed change event
                            new_speed = 0
                            old_speed = 0
                        
                            # This sub-condition does not exist in log data
                            # if isinstance(jsonObject["event"], dict):
                            #     video_id = jsonObject["event"]["id"]
                        
                            if isinstance(jsonObject["event"], unicode):
                                event_jsonObject = json.loads(jsonObject["event"])
                                video_id = event_jsonObject["id"]
                                
                                video_id = video_id.replace("-", "://", 1)
                                video_id = video_id.replace("-", "/")
                            
                                # For video seek event
                                if "new_time" in event_jsonObject and "old_time" in event_jsonObject:
                                    new_time = event_jsonObject["new_time"]
                                    old_time = event_jsonObject["old_time"]                                                                      
                                                                                
                                # For video speed change event           
                                if "new_speed" in event_jsonObject and "old_speed" in event_jsonObject:
                                    new_speed = event_jsonObject["new_speed"]
                                    old_speed = event_jsonObject["old_speed"]
                        
                            # To record video seek event                
                            if event_type in ["seek_video","edx.video.position.changed"]:
                                if new_time is not None and old_time is not None:
                                    if course_learner_id in course_learner_id_set:
                                        learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_time":new_time, "old_time":old_time})
                                    else:
                                        learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_time":new_time, "old_time":old_time}]
                                        course_learner_id_set.add(course_learner_id)
                                continue
                        
                            # To record video speed change event                
                            if event_type in ["speed_change_video"]:
                                if course_learner_id in course_learner_id_set:
                                    learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_speed":new_speed, "old_speed":old_speed})
                                else:
                                    learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id, "new_speed":new_speed, "old_speed":old_speed}]
                                    course_learner_id_set.add(course_learner_id)
                                continue                                                                      
                         
                            if course_learner_id in course_learner_id_set:
                                learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "video_id":video_id})
                            else:
                                learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "video_id":video_id}]
                                course_learner_id_set.add(course_learner_id)
                    
                    # For non-video-related events                                    
                    if jsonObject["event_type"] not in video_event_types:
                        
                        # Some daily logs don't have the "user_id" value
                        if "user_id" not in jsonObject["context"]:
                            continue
                        
                        global_learner_id = jsonObject["context"]["user_id"]
                        
                        if global_learner_id != "":
                            course_id = jsonObject["context"]["course_id"]
                            course_learner_id = course_id + "_" + str(global_learner_id)                                  
                        
                            event_time = jsonObject["time"]
                            event_time = event_time[0:19]
                            event_time = event_time.replace("T", " ")
                            event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")
                        
                            event_type = jsonObject["event_type"]                  
                                                      
                            if course_learner_id in course_learner_id_set:
                                learner_video_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type})
                            else:
                                learner_video_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type}]
                                course_learner_id_set.add(course_learner_id)
                                  
                for course_learner_id in learner_video_event_logs.keys():
                    
                    video_id = ""
                    
                    event_logs = learner_video_event_logs[course_learner_id]
                    
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    
                    video_start_time = ""
                    final_time = ""
                    
                    # For video seek event
                    times_forward_seek = 0
                    duration_forward_seek = 0
                    times_backward_seek = 0
                    duration_backward_seek = 0
                    
                    # For video speed change event
                    speed_change_last_time = ""
                    times_speed_up = 0
                    times_speed_down = 0               
                    
                    # For video pause event                   
                    pause_check = False
                    pause_start_time = ""
                    duration_pause = 0                    
                                      
                    for log in event_logs:
                        
                        if log["event_type"] in ["play_video", "edx.video.played"]:
                            
                            video_start_time = log["event_time"]
                            video_id = log["video_id"]

                            if pause_check:
                                
                                duration_pause = (log["event_time"] - pause_start_time).seconds
                                video_interaction_id = course_learner_id + "_" + video_id + "_" + str(pause_start_time)
                                
                                if duration_pause > 2 and duration_pause < 600:
                                    if video_interaction_id in video_interaction_map.keys():
                                        video_interaction_map[video_interaction_id]["times_pause"] = 1                                        
                                        video_interaction_map[video_interaction_id]["duration_pause"] = duration_pause
                                
                                pause_check = False
                                                        
                            continue 
                        
                        if video_start_time != "":                                                    
                           
                            if log["event_time"] > video_start_time + datetime.timedelta(hours=0.5):
                                
                                video_start_time = ""
                                video_id = ""
                                final_time = log["event_time"]
                                
                            else:                               
                                
                                # 0. Seek
                                if log["event_type"] in ["seek_video", "edx.video.position.changed"] and video_id == log["video_id"]:                                                                       
                                    # Forward seek event
                                    if log["new_time"] > log["old_time"]:
                                        times_forward_seek += 1
                                        duration_forward_seek += log["new_time"] - log["old_time"]
                                    # Backward seek event                                    
                                    if log["new_time"] < log["old_time"]:
                                        times_backward_seek += 1
                                        duration_backward_seek += log["old_time"] - log["new_time"]
                                    continue
                                
                                # 1. Speed change
                                if log["event_type"] == "speed_change_video" and video_id == log["video_id"]:
                                    if speed_change_last_time == "":
                                        speed_change_last_time = log["event_time"]
                                        old_speed = log["old_speed"]
                                        new_speed = log["new_speed"]                                        
                                        if old_speed < new_speed:
                                            times_speed_up += 1
                                        if old_speed > new_speed:
                                            times_speed_down += 1
                                    else:
                                        if (log["event_time"] - speed_change_last_time).seconds > 10:
                                            old_speed = log["old_speed"]
                                            new_speed = log["new_speed"]                                        
                                            if old_speed < new_speed:
                                                times_speed_up += 1
                                            if old_speed > new_speed:
                                                times_speed_down += 1
                                        speed_change_last_time = log["event_time"]
                                    continue
                                
                                # 2. Pause/Stop situation
                                if log["event_type"] in ["pause_video", "edx.video.paused", "stop_video", "edx.video.stopped"] and video_id == log["video_id"]:                                    
                                    
                                    watch_duration = (log["event_time"] - video_start_time).seconds
                                    
                                    video_end_time = log["event_time"]
                                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_end_time)
                                 
                                    if watch_duration > 5:                                        
                                        video_interaction_map[video_interaction_id] = {"course_learner_id":course_learner_id, "video_id":video_id, "type": "video", "watch_duration":watch_duration,
                                                                        "times_forward_seek":times_forward_seek, "duration_forward_seek":duration_forward_seek, 
                                                                        "times_backward_seek":times_backward_seek, "duration_backward_seek":duration_backward_seek,
                                                                        "times_speed_up":times_speed_up, "times_speed_down":times_speed_down,
                                                                        "start_time":video_start_time, "end_time":video_end_time}

                                    if log["event_type"] in ["pause_video", "edx.video.paused"]:
                                        pause_check = True
                                        pause_start_time = video_end_time
                                    
                                    # For video seek event
                                    times_forward_seek = 0
                                    duration_forward_seek = 0
                                    times_backward_seek = 0
                                    duration_backward_seek = 0
                                    
                                    # For video speed change event
                                    speed_change_last_time = ""
                                    times_speed_up = 0
                                    times_speed_down = 0
                                    
                                    # For video general information                                  
                                    video_start_time =""
                                    video_id = ""
                                    final_time = log["event_time"]
                                    
                                    continue
                                    
                                # 3/4  Page changed/Session closed
                                if log["event_type"] not in video_event_types:
                                    
                                    video_end_time = log["event_time"]
                                    watch_duration = (video_end_time - video_start_time).seconds                
                                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(video_end_time)
                                
                                    if watch_duration > 5:                                        
                                        video_interaction_map[video_interaction_id] = {"course_learner_id":course_learner_id, "video_id":video_id, "type": "video", "watch_duration":watch_duration,
                                                                        "times_forward_seek":times_forward_seek, "duration_forward_seek":duration_forward_seek, 
                                                                        "times_backward_seek":times_backward_seek, "duration_backward_seek":duration_backward_seek,
                                                                        "times_speed_up":times_speed_up, "times_speed_down":times_speed_down,
                                                                        "start_time":video_start_time, "end_time":video_end_time}
                                    
                                    # For video seek event
                                    times_forward_seek = 0
                                    duration_forward_seek = 0
                                    times_backward_seek = 0
                                    duration_backward_seek = 0
                                    
                                    # For video speed change event
                                    speed_change_last_time = ""
                                    times_speed_up = 0
                                    times_speed_down = 0
                                    
                                    # For video general information
                                    video_start_time = ""                                    
                                    video_id = ""
                                    final_time = log["event_time"]
                                    
                                    continue
                        
                    if final_time != "":
                        new_logs = []                
                        for log in event_logs:                 
                            if log["event_time"] > final_time:
                                new_logs.append(log)
                                
                        updated_learner_video_event_logs[course_learner_id] = new_logs                
                     
        current_date = getNextDay(current_date)
        
    video_interaction_record = []
    
    for interaction_id in video_interaction_map.keys():
        video_interaction_id = interaction_id
        course_learner_id = video_interaction_map[interaction_id]["course_learner_id"]
        video_id = video_interaction_map[interaction_id]["video_id"]
        duration = video_interaction_map[interaction_id]["watch_duration"]
        times_forward_seek = video_interaction_map[interaction_id]["times_forward_seek"]
        duration_forward_seek = video_interaction_map[interaction_id]["duration_forward_seek"]
        times_backward_seek = video_interaction_map[interaction_id]["times_backward_seek"]
        duration_backward_seek = video_interaction_map[interaction_id]["duration_backward_seek"]
        times_speed_up = video_interaction_map[interaction_id]["times_speed_up"]
        times_speed_down = video_interaction_map[interaction_id]["times_speed_down"]
        start_time = video_interaction_map[interaction_id]["start_time"]
        end_time = video_interaction_map[interaction_id]["end_time"]
        
        if "times_pause" in video_interaction_map[interaction_id]:
            times_pause = video_interaction_map[interaction_id]["times_pause"]
            duration_pause = video_interaction_map[interaction_id]["duration_pause"]
        else:
            times_pause = 0
            duration_pause = 0
            
        array = [video_interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time]
        video_interaction_record.append(array)
    
    # Video_interaction table
    # Database version
    for array in video_interaction_record:
        interaction_id = array[0]
        course_learner_id = array[1]
        video_id = array[2]
        duration = process_null(array[3])
        times_forward_seek = process_null(array[4])
        duration_forward_seek = process_null(array[5])
        times_backward_seek = process_null(array[6])
        duration_backward_seek = process_null(array[7])
        times_speed_up = process_null(array[8])
        times_speed_down = process_null(array[9])
        times_pause = process_null(array[10])
        duration_pause = process_null(array[11])
        start_time = array[12]
        end_time = array[13]
        sql = "insert into video_interaction(interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        data = (interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time)
        cursor.execute(sql, data)
        
    # File version
    '''
def sessions(metadata_path, log_path, cursor):
    """Split every learner's event stream into sessions and insert them.

    The daily log files found in ``log_path`` are walked one course day at a
    time, from the course ``start_date`` up to and including its
    ``end_date``.  For each learner the events are ordered by time and cut
    into sessions:

    * an event arriving more than 30 minutes after the previous one closes
      the running session and opens a new one starting at that event;
    * a ``page_close`` event closes the running session at that event.

    Sessions lasting 5 seconds or less are dropped.  Only the events at or
    after the last session boundary of a day are carried over to the next
    processed file; a learner without any boundary in a file carries nothing
    over.  Sessions with an already-seen id are stored once (first
    occurrence wins).

    Args:
        metadata_path: Handed to ``ExtractCourseInformation`` to obtain the
            course ``start_date`` / ``end_date``.
        log_path: Directory containing the daily log files (one JSON event
            per line).  A file is processed for a day when the day string
            occurs in its file name.
        cursor: Database cursor; ``cursor.execute(sql, params)`` is called
            once per session row inserted into the ``sessions`` table.
    """

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    # Longest silence allowed between two events of the same session.
    session_gap = datetime.timedelta(hours=0.5)

    session_record = []
    seen_session_ids = set()

    def record_session(course_learner_id, start_time, end_time):
        # Keep the session only if it is longer than 5 seconds and its id
        # has not been recorded before.
        elapsed = end_time - start_time
        duration = elapsed.days * 24 * 60 * 60 + elapsed.seconds
        if duration <= 5:
            return
        session_id = course_learner_id + "_" + str(start_time) + "_" + str(end_time)
        if session_id in seen_session_ids:
            return
        seen_session_ids.add(session_id)
        session_record.append([session_id, course_learner_id, start_time, end_time, duration])

    # Events kept from the previously processed file, keyed by learner.
    carried_event_logs = {}

    log_files = os.listdir(log_path)

    while current_date != end_next_date:

        for log_file_name in log_files:

            if current_date not in log_file_name:
                continue

            # Single-argument call form: prints the same on Python 2 and 3.
            print(log_file_name)

            # Start from the carried-over events and reset the carry-over.
            learner_all_event_logs = carried_event_logs
            carried_event_logs = {}

            # The with-block guarantees the log file is closed again, and
            # iterating the handle avoids loading the whole file at once.
            with open(os.path.join(log_path, log_file_name), "r") as input_file:

                for line in input_file:

                    jsonObject = json.loads(line)

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])

                    if global_learner_id == "":
                        continue

                    course_id = jsonObject["context"]["course_id"]
                    course_learner_id = course_id + "_" + str(global_learner_id)

                    # Keep "YYYY-MM-DDTHH:MM:SS" only and parse it.
                    event_time = jsonObject["time"][0:19].replace("T", " ")
                    event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")

                    learner_all_event_logs.setdefault(course_learner_id, []).append(
                        {"event_time": event_time, "event_type": event_type})

            for course_learner_id, event_logs in learner_all_event_logs.items():

                # Sorting (the cmp= keyword is only accepted by Python 2)
                event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))

                start_time = None
                end_time = None

                # Time of the last session boundary seen in this pass.
                final_time = None

                for log in event_logs:

                    event_time = log["event_time"]

                    if start_time is None:
                        # Initialization
                        start_time = event_time
                        end_time = event_time
                        continue

                    if event_time > end_time + session_gap:
                        # Gap too long: close the running session and start
                        # a new one at this event.
                        record_session(course_learner_id, start_time, end_time)
                        final_time = event_time
                        start_time = event_time
                        end_time = event_time

                    elif log["event_type"] == "page_close":
                        # Explicit close: the session ends at this event and
                        # the next event starts from scratch.
                        record_session(course_learner_id, start_time, event_time)
                        final_time = event_time
                        start_time = None
                        end_time = None

                    else:
                        end_time = event_time

                if final_time is not None:
                    carried_event_logs[course_learner_id] = [
                        log for log in event_logs if log["event_time"] >= final_time]

        current_date = getNextDay(current_date)

    # Database version
    sql = "insert into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
    for session_id, course_learner_id, start_time, end_time, duration in session_record:
        data = (session_id, course_learner_id, start_time, end_time, process_null(duration))
        cursor.execute(sql, data)
        
            
    # File version
    '''
def learner_mode(metadata_path, course_code, cursor):
    """Fill the course, course-element, quiz-question and learner tables.

    Course structure comes from ``ExtractCourseInformation(metadata_path)``;
    learner data is read from the tab-separated dump files in
    ``metadata_path`` whose names contain ``student_courseenrollment``,
    ``auth_user-``, ``certificates_generatedcertificate`` and
    ``auth_userprofile``.  The first line of each dump file is skipped.
    Rows are inserted into ``courses``, ``course_elements``,
    ``quiz_questions``, ``learner_index``, ``course_learner`` and
    ``learner_demographic``.

    Args:
        metadata_path: Directory with the course metadata and dump files.
        course_code: Course code string; when it contains ``1T2015``,
            ``2014`` or ``2013`` the forum interactions are additionally
            generated through ``forum_interaction_mongo``.
        cursor: Database cursor; ``cursor.execute(sql, params)`` is called
            once per inserted row.
    """

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    array = (course_metadata_map["course_id"], course_metadata_map["course_name"], course_metadata_map["start_time"], course_metadata_map["end_time"])
    sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
    cursor.execute(sql, array)

    # Course_element table
    course_start_date = course_metadata_map["start_time"].date()
    for element_id, element_start_time in course_metadata_map["element_time_map"].items():

        # Some contents released just one hour earlier than the hour of start time.
        # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00.
        # However, those 2nd week contents are count as 1st week.
        # In order to avoid above situation, I use date to replace datetime here.
        # Floor division keeps the week a whole number on Python 3 as well
        # (assumes getDayDiff returns whole days -- TODO confirm).
        week = process_null(getDayDiff(course_start_date, element_start_time.date()) // 7 + 1)

        array = (element_id, course_metadata_map["element_type_map"][element_id], week, course_metadata_map["course_id"])
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)

    # Quiz_question table
    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    child_parent_map = course_metadata_map["child_parent_map"]

    for question_id in quiz_question_map:

        question_weight = quiz_question_map[question_id]
        question_due = ""

        # Climb the parent chain until an ancestor with a known block type
        # is reached; the first due time met on the way is used.
        quiz_question_parent = child_parent_map[question_id]
        while True:
            if (question_due == "") and (quiz_question_parent in element_time_map_due):
                question_due = element_time_map_due[quiz_question_parent]
            if quiz_question_parent in block_type_map:
                break
            quiz_question_parent = child_parent_map[quiz_question_parent]

        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)

        array = (question_id, quiz_question_type, question_weight, question_due)
        sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"
        cursor.execute(sql, array)

    files = os.listdir(metadata_path)

    def iter_records(name_marker):
        # Yield the tab-split rows (header line skipped) of every file in
        # metadata_path whose name contains name_marker.  The with-block
        # closes each file even when a row fails to process.
        for file_name in files:
            if name_marker not in file_name:
                continue
            with open(os.path.join(metadata_path, file_name), "r") as input_file:
                input_file.readline()
                for line in input_file:
                    yield line.split("\t")

    # Learner_demographic table
    learner_mail_map = {}

    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}

    # Enrolled learners set
    enrolled_learner_set = set()

    course_id = ""

    # Processing student_courseenrollment data
    for record in iter_records("student_courseenrollment"):
        global_learner_id = record[1]
        course_id = record[2]
        enrollment_time = datetime.datetime.strptime(record[3], "%Y-%m-%d %H:%M:%S")
        course_learner_id = course_id + "_" + global_learner_id

        # NOTE(review): the result of cmp_datetime is used as a truth value
        # here, while elsewhere in this file it is passed as cmp= to
        # list.sort. If it is a three-way comparator, every enrollment whose
        # time differs from the course end time passes -- confirm the intent.
        if cmp_datetime(course_metadata_map["end_time"], enrollment_time):

            enrolled_learner_set.add(global_learner_id)

            array = (global_learner_id, course_id, course_learner_id)
            sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
            cursor.execute(sql, array)

            course_learner_map[global_learner_id] = course_learner_id
            learner_enrollment_time_map[global_learner_id] = enrollment_time

    # Processing auth_user data
    for record in iter_records("auth_user-"):
        if record[0] in enrolled_learner_set:
            learner_mail_map[record[0]] = record[4]

    # Processing certificates_generatedcertificate data
    for record in iter_records("certificates_generatedcertificate"):
        global_learner_id = record[1]
        final_grade = process_null(record[3])
        enrollment_mode = record[14].replace("\n", "")
        certificate_status = record[7]

        # Only learners kept from the enrollment data get a course_learner row.
        if global_learner_id not in course_learner_map:
            continue

        register_time = process_null(learner_enrollment_time_map[global_learner_id])

        array = (course_learner_map[global_learner_id], final_grade, enrollment_mode, certificate_status, register_time)
        sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
        cursor.execute(sql, array)

    # Processing auth_userprofile data
    for record in iter_records("auth_userprofile"):
        global_learner_id = record[1]
        gender = record[7]
        # NOTE(review): process_null is applied twice, as in the original
        # code; presumably one call is enough -- confirm before simplifying.
        year_of_birth = process_null(process_null(record[9]))
        level_of_education = record[10]
        country = record[13]

        # course_id is the value from the last enrollment row read above.
        course_learner_id = process_null(course_id + "_" + global_learner_id)

        if global_learner_id in enrolled_learner_set:
            array = (course_learner_id, gender, year_of_birth, level_of_education, country, learner_mail_map[global_learner_id])
            sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql, array)

    # Generating forum_interaction records for course codes containing 1T2015, 2014 or 2013
    if "1T2015" in course_code or "2014" in course_code or "2013" in course_code:
        forum_interaction_mongo(metadata_path, cursor)
Example #9
0
def forum_sessions(metadata_path, log_path, cursor):
    
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    
    start_date = course_metadata_map["start_date"]
    end_date = course_metadata_map["end_date"]

    current_date = start_date   
    end_next_date = getNextDay(end_date)
    
    forum_event_types = []
    forum_event_types.append("edx.forum.comment.created")
    forum_event_types.append("edx.forum.response.created")
    forum_event_types.append("edx.forum.response.voted")
    forum_event_types.append("edx.forum.thread.created")
    forum_event_types.append("edx.forum.thread.voted")
    forum_event_types.append("edx.forum.searched")
        
    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    
    forum_sessions_record = []
    
    log_files = os.listdir(log_path)
    
    while True:
        
        if current_date == end_next_date:
            break;
        
        for log_file in log_files:
            
            if current_date in log_file:                
                
                print log_file 
                learner_all_event_logs.clear()
                learner_all_event_logs = updated_learner_all_event_logs.copy()
                updated_learner_all_event_logs.clear()
                
                # Course_learner_id set
                course_learner_id_set = set()
                for course_learner_id in learner_all_event_logs.keys():
                    course_learner_id_set.add(course_learner_id)

                log_file = open(log_path + log_file,"r")
                lines = log_file.readlines()

                for line in lines:
                    
                    jsonObject = json.loads(line)
                    
                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    if jsonObject["context"]["user_id"] == "":
                        continue
                    
                    # For forum session separation
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])
                    
                    if "/discussion/" in event_type or event_type in forum_event_types:
                        if event_type != "edx.forum.searched":
                            event_type = "forum_activity"
                                            
                    if global_learner_id != "":
                        
                        course_id = jsonObject["context"]["course_id"]
                        course_learner_id = course_id + "_" + str(global_learner_id)
                        
                        event_time = jsonObject["time"]
                        event_time = event_time[0:19]
                        event_time = event_time.replace("T", " ")
                        event_time = datetime.datetime.strptime(event_time,"%Y-%m-%d %H:%M:%S")

                        # added for relevant elements
                        event_page = ""
                        if jsonObject.has_key("page"):
                            event_page = str(jsonObject["page"])
                        
                        event_path = ""
                        if jsonObject.has_key("path"):
                            event_path = str(jsonObject["path"])
                        
                        event_referer = ""
                        if jsonObject.has_key("referer"):
                            event_referer = str(jsonObject["referer"])
                                               
                        if course_learner_id in course_learner_id_set:
                            learner_all_event_logs[course_learner_id].append({"event_time":event_time, "event_type":event_type, "page":event_page, "path":event_path, "referer":event_referer})
                        else:
                            learner_all_event_logs[course_learner_id] = [{"event_time":event_time, "event_type":event_type, "page":event_page, "path":event_path, "referer":event_referer}]
                            course_learner_id_set.add(course_learner_id)
                            
                # For forum session separation
                for learner in learner_all_event_logs.keys():
                    
                    course_learner_id = learner                    
                    event_logs = learner_all_event_logs[learner]
                    course_id = course_learner_id.split("_")[0]
                    
                    # Sorting
                    event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))
                    
                    session_id = ""
                    start_time = ""
                    end_time = ""                    
                    times_search = 0
                    
                    final_time = ""

                    # represent the elements which just before the session.
                    session_rel_element_pre = ""
                    # represent the elements which is mentioned in the session.
                    session_rel_element_cur = ""
                    
                    for i in range(len(event_logs)):

                        rel_element_cur = courseElementsFinder(event_logs[i], course_id)

                        if session_id == "":                            
                            
                            if event_logs[i]["event_type"] in ["forum_activity", "edx.forum.searched"]:
                                # Initialization
                                session_id = "forum_session_" + course_learner_id
                                start_time = event_logs[i]["event_time"]
                                end_time = event_logs[i]["event_time"]
                                if event_logs[i]["event_type"] == "edx.forum.searched":
                                    times_search += 1
                                # Added for relevant element id
                                session_rel_element_cur = rel_element_cur                                                        
                        else:
                            
                            if event_logs[i]["event_type"] in ["forum_activity", "edx.forum.searched"]:

                                if event_logs[i]["event_time"] > end_time + datetime.timedelta(hours=0.5):
                                    
                                    session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                                    duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                                    
                                    if duration > 5:
                                        rel_element_id = ""
                                        if session_rel_element_cur != "":
                                            rel_element_id = session_rel_element_cur
                                        else:
                                            rel_element_id = session_rel_element_pre
                                        array = [session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id]
                                        forum_sessions_record.append(array)
                                    
                                    final_time = event_logs[i]["event_time"]
                                    
                                    # Re-initialization
                                    session_id = "forum_session_" + course_learner_id
                                    start_time = event_logs[i]["event_time"]
                                    end_time = event_logs[i]["event_time"]
                                    if event_logs[i]["event_type"] == "edx.forum.searched":
                                        times_search = 1
                                    # Added for relevant element id
                                    session_rel_element_cur = rel_element_cur
                                        
                                else:
                                    
                                    end_time = event_logs[i]["event_time"]
                                    if event_logs[i]["event_type"] == "edx.forum.searched":
                                        times_search += 1
                                    if session_rel_element_cur == "":
                                        session_rel_element_cur = rel_element_cur
                                                        
                            else:
                                
                                if event_logs[i]["event_time"] <= end_time + datetime.timedelta(hours=0.5):
                                    end_time = event_logs[i]["event_time"]

                                session_id = session_id + "_" + str(start_time) + "_" + str(end_time)
                                duration = (end_time - start_time).days * 24 * 60 * 60 + (end_time - start_time).seconds
                                
                                if duration > 5:
                                    rel_element_id = ""
                                    if session_rel_element_cur != "":
                                        rel_element_id = session_rel_element_cur
                                    else:
                                        rel_element_id = session_rel_element_pre
                                    array = [session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id]
                                    forum_sessions_record.append(array)
                                    
                                final_time = event_logs[i]["event_time"]
                                    
                                # Re-initialization
                                session_id = ""
                                start_time = ""
                                end_time = ""
                                times_search = 0

                        # session_rel_element_pre is used for recording the element id 
                        # of the most recent event logs before the session logs
                        if rel_element_cur != "":
                            session_rel_element_pre = rel_element_cur
  
                    if final_time != "":
                        new_logs = []                
                        for log in event_logs:                 
                            if log["event_time"] >= final_time:
                                new_logs.append(log)
                                
                        updated_learner_all_event_logs[course_learner_id] = new_logs
                
                log_file.close()
                
        current_date = getNextDay(current_date)
    
    # Database version
    for array in forum_sessions_record:
        session_id = array[0]
        course_learner_id = array[1]
        times_search = process_null(array[2])
        start_time = array[3]
        end_time = array[4]
        duration = process_null(array[5])
        rel_element_id = array[6]
        sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, times_search, start_time, end_time, duration, rel_element_id)
        cursor.execute(sql, data)
            
    # File version
    '''
Example #10
0
def sessions(metadata_path, log_path, cursor):
    """Split every learner's event stream into sessions and insert them.

    Log files found in ``log_path`` whose name contains the current date are
    processed day by day, from the course ``start_date`` up to and including
    the course ``end_date``. For each learner the events are sorted by time
    and cut into sessions:

    * a gap of more than half an hour between two consecutive events closes
      the running session and starts a new one at the later event;
    * a ``page_close`` event inside that half-hour window closes the running
      session at the ``page_close`` time.

    Only sessions lasting more than 5 seconds are kept. Events from the last
    session boundary onwards are carried over to the next processed file so a
    session may span two files; duplicates produced by that overlap are
    removed by session id before inserting.

    Args:
        metadata_path: Passed to ``ExtractCourseInformation``; the returned
            map must provide ``start_date`` and ``end_date``.
        log_path: Directory prefix of the log files. It is concatenated
            directly with the file name, so it must end with a separator.
        cursor: Database cursor; ``cursor.execute(sql, data)`` is called once
            per session to insert into the ``sessions`` table.
    """

    def keep_session(course_learner_id, start, end):
        # Store the session [start, end] unless it is 5 seconds or shorter.
        duration = (end - start).days * 24 * 60 * 60 + (end - start).seconds
        if duration > 5:
            session_id = course_learner_id + "_" + str(start) + "_" + str(end)
            session_record.append(
                [session_id, course_learner_id, start, end, duration])

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    # Largest allowed pause between two events of the same session.
    session_gap = datetime.timedelta(hours=0.5)

    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}
    session_record = []

    log_files = os.listdir(log_path)

    while current_date != end_next_date:

        for log_name in log_files:

            if current_date not in log_name:
                continue

            print(log_name)

            # Start from the events carried over from the previous file.
            learner_all_event_logs = updated_learner_all_event_logs.copy()
            updated_learner_all_event_logs.clear()

            # The context manager guarantees the handle is released; the
            # previous implementation never closed it.
            with open(log_path + log_name, "r") as input_file:
                for line in input_file:

                    jsonObject = json.loads(line)

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])

                    if global_learner_id == "":
                        continue

                    course_id = jsonObject["context"]["course_id"]
                    course_learner_id = course_id + "_" + str(
                        global_learner_id)

                    # Keep "YYYY-MM-DDTHH:MM:SS" only and parse it.
                    event_time = jsonObject["time"][0:19].replace("T", " ")
                    event_time = datetime.datetime.strptime(
                        event_time, "%Y-%m-%d %H:%M:%S")

                    learner_all_event_logs.setdefault(
                        course_learner_id, []).append({
                            "event_time": event_time,
                            "event_type": event_type,
                        })

            for course_learner_id in learner_all_event_logs.keys():

                event_logs = learner_all_event_logs[course_learner_id]

                # Sorting
                event_logs.sort(cmp=cmp_datetime,
                                key=operator.itemgetter('event_time'))

                start_time = ""
                end_time = ""

                # Time of the last session boundary seen in this file.
                final_time = ""

                for event in event_logs:
                    event_time = event["event_time"]

                    if start_time == "":
                        # Initialization
                        start_time = event_time
                        end_time = event_time

                    elif event_time > end_time + session_gap:
                        # Too long a pause: close the running session and
                        # open a new one at this event.
                        keep_session(course_learner_id, start_time, end_time)
                        final_time = event_time
                        start_time = event_time
                        end_time = event_time

                    elif event["event_type"] == "page_close":
                        # Explicit end of the session.
                        end_time = event_time
                        keep_session(course_learner_id, start_time, end_time)
                        start_time = ""
                        end_time = ""
                        final_time = event_time

                    else:
                        end_time = event_time

                if final_time != "":
                    # Carry the still-open tail over to the next file.
                    updated_learner_all_event_logs[course_learner_id] = [
                        log for log in event_logs
                        if log["event_time"] >= final_time
                    ]

        current_date = getNextDay(current_date)

    # Filter duplicated records
    updated_session_record = []
    session_id_set = set()
    for array in session_record:
        session_id = array[0]
        if session_id not in session_id_set:
            session_id_set.add(session_id)
            updated_session_record.append(array)

    session_record = updated_session_record

    # Database version
    for array in session_record:
        session_id = array[0]
        course_learner_id = array[1]
        start_time = array[2]
        end_time = array[3]
        duration = process_null(array[4])
        sql = "insert into sessions(session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, start_time, end_time, duration)
        cursor.execute(sql, data)

    # File version
    '''
Example #11
0
def forum_sessions(metadata_path, log_path, cursor):
    """Extract forum sessions per learner and insert them into the database.

    Log files in ``log_path`` whose name contains the current date are
    processed day by day, from the course ``start_date`` up to and including
    the course ``end_date``. Events whose type contains ``/discussion/`` or is
    one of the known ``edx.forum.*`` types are treated as forum events
    (searches keep their own type so they can be counted).

    For every learner the events are sorted by time. A forum session starts
    at a forum event and ends either

    * when the next forum event comes more than half an hour after the last
      one (a new session starts at that event), or
    * at the first non-forum event (which extends the session end if it is
      within the half-hour window).

    Only sessions longer than 5 seconds are stored. Each stored session also
    carries the number of searches and a related course element id: the
    first non-empty element found inside the session, otherwise the most
    recent one seen before it.

    Args:
        metadata_path: Passed to ``ExtractCourseInformation``; the returned
            map must provide ``start_date`` and ``end_date``.
        log_path: Directory prefix of the log files. It is concatenated
            directly with the file name, so it must end with a separator.
        cursor: Database cursor; ``cursor.execute(sql, data)`` is called once
            per session to insert into the ``forum_sessions`` table.
    """

    def keep_session(session_id, course_learner_id, times_search, start, end,
                     rel_element_cur, rel_element_pre):
        # Store the finished session unless it is 5 seconds or shorter.
        duration = (end - start).days * 24 * 60 * 60 + (end - start).seconds
        if duration > 5:
            # Prefer an element seen inside the session over the one
            # preceding it.
            if rel_element_cur != "":
                rel_element_id = rel_element_cur
            else:
                rel_element_id = rel_element_pre
            forum_sessions_record.append([
                session_id + "_" + str(start) + "_" + str(end),
                course_learner_id, times_search, start, end, duration,
                rel_element_id
            ])

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    forum_event_types = [
        "edx.forum.comment.created",
        "edx.forum.response.created",
        "edx.forum.response.voted",
        "edx.forum.thread.created",
        "edx.forum.thread.voted",
        "edx.forum.searched",
    ]
    session_event_types = ["forum_activity", "edx.forum.searched"]

    # Largest allowed pause between two forum events of the same session.
    session_gap = datetime.timedelta(hours=0.5)

    learner_all_event_logs = {}
    updated_learner_all_event_logs = {}

    forum_sessions_record = []

    log_files = os.listdir(log_path)

    while current_date != end_next_date:

        for log_name in log_files:

            if current_date not in log_name:
                continue

            print(log_name)

            # Start from the events carried over from the previous file.
            learner_all_event_logs = updated_learner_all_event_logs.copy()
            updated_learner_all_event_logs.clear()

            with open(log_path + log_name, "r") as log_file:
                for line in log_file:

                    jsonObject = json.loads(line)

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    if jsonObject["context"]["user_id"] == "":
                        continue

                    # For forum session separation
                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])

                    if "/discussion/" in event_type or event_type in forum_event_types:
                        if event_type != "edx.forum.searched":
                            event_type = "forum_activity"

                    course_id = jsonObject["context"]["course_id"]
                    course_learner_id = course_id + "_" + str(
                        global_learner_id)

                    # Keep "YYYY-MM-DDTHH:MM:SS" only and parse it.
                    event_time = jsonObject["time"][0:19].replace("T", " ")
                    event_time = datetime.datetime.strptime(
                        event_time, "%Y-%m-%d %H:%M:%S")

                    # added for relevant elements
                    event_page = ""
                    if "page" in jsonObject:
                        event_page = str(jsonObject["page"])

                    event_path = ""
                    if "path" in jsonObject:
                        event_path = str(jsonObject["path"])

                    event_referer = ""
                    if "referer" in jsonObject:
                        event_referer = str(jsonObject["referer"])

                    learner_all_event_logs.setdefault(
                        course_learner_id, []).append({
                            "event_time": event_time,
                            "event_type": event_type,
                            "page": event_page,
                            "path": event_path,
                            "referer": event_referer,
                        })

            # For forum session separation
            for course_learner_id in learner_all_event_logs.keys():

                event_logs = learner_all_event_logs[course_learner_id]
                course_id = course_learner_id.split("_")[0]

                # Sorting
                event_logs.sort(cmp=cmp_datetime,
                                key=operator.itemgetter('event_time'))

                session_id = ""
                start_time = ""
                end_time = ""
                times_search = 0

                # Time of the last session boundary seen in this file.
                final_time = ""

                # represent the elements which just before the session.
                session_rel_element_pre = ""
                # represent the elements which is mentioned in the session.
                session_rel_element_cur = ""

                for event in event_logs:

                    rel_element_cur = courseElementsFinder(event, course_id)

                    event_time = event["event_time"]
                    is_forum_event = event["event_type"] in session_event_types
                    is_search = event["event_type"] == "edx.forum.searched"

                    if session_id == "":

                        if is_forum_event:
                            # Initialization
                            session_id = "forum_session_" + course_learner_id
                            start_time = event_time
                            end_time = event_time
                            if is_search:
                                times_search += 1
                            # Added for relevant element id
                            session_rel_element_cur = rel_element_cur

                    elif is_forum_event:

                        if event_time > end_time + session_gap:

                            keep_session(session_id, course_learner_id,
                                         times_search, start_time, end_time,
                                         session_rel_element_cur,
                                         session_rel_element_pre)

                            final_time = event_time

                            # Re-initialization
                            session_id = "forum_session_" + course_learner_id
                            start_time = event_time
                            end_time = event_time
                            # Always restart the counter: previously it was
                            # only overwritten for search events, so a new
                            # session opened by another forum event kept the
                            # search count of the session before it.
                            times_search = 1 if is_search else 0
                            # Added for relevant element id
                            session_rel_element_cur = rel_element_cur

                        else:

                            end_time = event_time
                            if is_search:
                                times_search += 1
                            if session_rel_element_cur == "":
                                session_rel_element_cur = rel_element_cur

                    else:
                        # A non-forum event ends the running forum session.
                        if event_time <= end_time + session_gap:
                            end_time = event_time

                        keep_session(session_id, course_learner_id,
                                     times_search, start_time, end_time,
                                     session_rel_element_cur,
                                     session_rel_element_pre)

                        final_time = event_time

                        # Re-initialization
                        session_id = ""
                        start_time = ""
                        end_time = ""
                        times_search = 0

                    # session_rel_element_pre is used for recording the element id
                    # of the most recent event logs before the session logs
                    if rel_element_cur != "":
                        session_rel_element_pre = rel_element_cur

                if final_time != "":
                    # Carry the still-open tail over to the next file.
                    updated_learner_all_event_logs[course_learner_id] = [
                        log for log in event_logs
                        if log["event_time"] >= final_time
                    ]

        current_date = getNextDay(current_date)

    # Database version
    for array in forum_sessions_record:
        session_id = array[0]
        course_learner_id = array[1]
        times_search = process_null(array[2])
        start_time = array[3]
        end_time = array[4]
        duration = process_null(array[5])
        rel_element_id = array[6]
        sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"
        data = (session_id, course_learner_id, times_search, start_time,
                end_time, duration, rel_element_id)
        cursor.execute(sql, data)

    # File version
    '''
Example #12
0
def learner_mode(metadata_path, cursor):
    """Load course, course element and learner metadata into the database.

    Reads the course description through ``ExtractCourseInformation`` and the
    tab-separated dumps found in ``metadata_path`` (files whose name contains
    ``student_courseenrollment``, ``auth_user-``,
    ``certificates_generatedcertificate`` or ``auth_userprofile``), then
    inserts rows into the ``courses``, ``course_elements``, ``learner_index``,
    ``course_learner`` and ``learner_demographic`` tables.

    Args:
        metadata_path: Directory prefix of the metadata files. It is
            concatenated directly with the file name, so it must end with a
            separator. Also passed to ``ExtractCourseInformation``.
        cursor: Database cursor; ``cursor.execute(sql, data)`` is called once
            per inserted row.

    Raises:
        KeyError: If an enrolled learner appears in the user profile dump but
            has no entry in the ``auth_user-`` dump.
    """

    course_record = []
    course_element_record = []
    learner_index_record = []
    course_learner_record = []
    learner_demographic_record = []

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    course_record.append([
        course_metadata_map["course_id"], course_metadata_map["course_name"],
        course_metadata_map["start_time"], course_metadata_map["end_time"]
    ])

    # Course_element table
    element_time_map = course_metadata_map["element_time_map"]
    for element_id in element_time_map.keys():

        element_start_time = element_time_map[element_id]
        # Some contents released just one hour earlier than the hour of start time.
        # For example, start time is 2015-10-15 09:00:00, while 2nd week contents' release time is 2015-10-22 08:00:00.
        # However, those 2nd week contents are count as 1st week.
        # In order to avoid above situation, I use date to replace datetime here.
        # NOTE(review): relies on Python 2 integer division, assuming
        # getDayDiff returns an int -- confirm before porting.
        week = getDayDiff(course_metadata_map["start_time"].date(),
                          element_start_time.date()) / 7 + 1

        course_element_record.append([
            element_id, course_metadata_map["element_type_map"][element_id],
            week, course_metadata_map["course_id"]
        ])

    files = os.listdir(metadata_path)

    # Learner_demographic table
    learner_mail_map = {}

    # Course_learner table
    course_learner_map = {}
    learner_enrollment_time_map = {}

    # Enrolled learners set
    enrolled_learner_set = set()

    course_id = ""

    # Processing student_courseenrollment data
    for file_name in files:
        if "student_courseenrollment" not in file_name:
            continue

        with open(metadata_path + file_name, "r") as input_file:
            # The first line is skipped (presumably a header row).
            input_file.readline()

            for line in input_file.readlines():
                record = line.split("\t")
                global_learner_id = record[1]
                course_id = record[2]
                enrollment_time = datetime.datetime.strptime(
                    record[3], "%Y-%m-%d %H:%M:%S")
                course_learner_id = course_id + "_" + global_learner_id

                # NOTE(review): only the truthiness of cmp_datetime() is
                # used here; if it is a three-way comparison this accepts
                # every enrollment whose time differs from end_time --
                # confirm the intended condition.
                if cmp_datetime(course_metadata_map["end_time"],
                                enrollment_time):
                    enrolled_learner_set.add(global_learner_id)

                    learner_index_record.append(
                        [global_learner_id, course_id, course_learner_id])

                    course_learner_map[global_learner_id] = course_learner_id
                    learner_enrollment_time_map[
                        global_learner_id] = enrollment_time

        print("The number of enrolled learners is: " + str(
            len(enrolled_learner_set)) + "\n")

    # Processing auth_user data
    for file_name in files:
        if "auth_user-" not in file_name:
            continue

        with open(metadata_path + file_name, "r") as input_file:
            input_file.readline()
            for line in input_file.readlines():
                record = line.split("\t")
                if record[0] in enrolled_learner_set:
                    learner_mail_map[record[0]] = record[4]

    # Processing certificates_generatedcertificate data
    num_uncertifiedLearners = 0
    num_certifiedLearners = 0
    for file_name in files:
        if "certificates_generatedcertificate" not in file_name:
            continue

        with open(metadata_path + file_name, "r") as input_file:
            input_file.readline()

            for line in input_file.readlines():
                record = line.split("\t")
                global_learner_id = record[1]
                final_grade = record[3]
                enrollment_mode = record[14].replace("\n", "")
                certificate_status = record[7]

                # Only learners found in the enrollment dump are stored;
                # everybody else is merely counted.
                if global_learner_id in course_learner_map:
                    num_certifiedLearners += 1
                    course_learner_record.append([
                        course_learner_map[global_learner_id], final_grade,
                        enrollment_mode, certificate_status,
                        learner_enrollment_time_map[global_learner_id]
                    ])
                else:
                    num_uncertifiedLearners += 1

        print("The number of uncertified & certified learners is: " + str(
            num_uncertifiedLearners) + "\t" + str(
                num_certifiedLearners) + "\n")

    # Processing auth_userprofile data
    for file_name in files:
        if "auth_userprofile" not in file_name:
            continue

        with open(metadata_path + file_name, "r") as input_file:
            input_file.readline()

            for line in input_file.readlines():
                record = line.split("\t")
                global_learner_id = record[1]
                gender = record[7]
                year_of_birth = record[9]
                level_of_education = record[10]
                country = record[13]

                # course_id is the one read last from the enrollment dump
                # (empty string if no enrollment row was read).
                course_learner_id = course_id + "_" + global_learner_id

                if global_learner_id in enrolled_learner_set:
                    learner_demographic_record.append([
                        course_learner_id, gender, year_of_birth,
                        level_of_education, country,
                        learner_mail_map[global_learner_id]
                    ])

    # Database version
    # Course table
    for array in course_record:
        course_id, course_name, start_time, end_time = array
        sql = "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)"
        data = (course_id, course_name, start_time, end_time)
        cursor.execute(sql, data)

    for array in course_element_record:
        element_id = array[0]
        element_type = array[1]
        week = process_null(array[2])
        course_id = array[3]
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        data = (element_id, element_type, week, course_id)
        cursor.execute(sql, data)

    # Learner_index table
    for array in learner_index_record:
        global_learner_id = array[0]
        course_id = array[1]
        course_learner_id = array[2]
        sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
        data = (global_learner_id, course_id, course_learner_id)
        cursor.execute(sql, data)

    # Course_learner table
    for array in course_learner_record:
        course_learner_id = array[0]
        final_grade = process_null(array[1])
        enrollment_mode = array[2]
        certificate_status = array[3]
        register_time = process_null(array[4])
        sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
        data = (course_learner_id, final_grade, enrollment_mode,
                certificate_status, register_time)
        cursor.execute(sql, data)

    # Learner_demographic table
    for array in learner_demographic_record:
        course_learner_id = process_null(array[0])
        gender = array[1]
        # process_null is deliberately still applied twice, as before.
        year_of_birth = process_null(process_null(array[2]))
        level_of_education = array[3]
        country = array[4]
        email = array[5]
        email = email.replace("\'", "")
        sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
        data = (course_learner_id, gender, year_of_birth, level_of_education,
                country, email)
        cursor.execute(sql, data)

    # File version
    '''
Example #13
0
def quiz_sessions(metadata_path, log_path, cursor):
    """Derive quiz sessions from the daily event logs and insert them.

    For every day from the course ``start_date`` up to and including its
    ``end_date``, each file in ``log_path`` whose name contains that date is
    read line by line as JSON events.  Per learner, consecutive quiz-related
    events are grouped into sessions keyed by the parent block of the
    question; a session is closed when a non-quiz event arrives or when the
    next quiz event is more than 30 minutes after the previous one.
    Sessions longer than 5 seconds are written to the ``quiz_sessions`` table.

    NOTE: this code targets Python 2 (``list.sort(cmp=...)``).

    Args:
        metadata_path: passed to ``ExtractCourseInformation``.
        log_path: directory prefix of the daily log files; it is concatenated
            directly with each file name, so it must end with a separator.
        cursor: DB cursor exposing ``execute(sql, params)``.
    """

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    child_parent_map = course_metadata_map["child_parent_map"]

    # Problem-related event types that count as quiz activity.
    submission_event_collection = frozenset([
        # Problem check
        "problem_check",
        "save_problem_check",
        "problem_check_fail",
        "save_problem_check_fail",
        # Emitted each time a problem is checked and graded successfully
        "problem_graded",
        # Emitted when a problem is (un)successfully rescored
        "problem_rescore",
        "problem_rescore_fail",
        # Reset
        "problem_reset",
        "reset_problem",
        "reset_problem_fail",
        # Emitted after a user saves a problem
        "problem_save",
        "save_problem_fail",
        "save_problem_success",
        # Show answer
        "problem_show",
        "showanswer",
    ])

    # Quiz events further apart than this belong to different sessions.
    session_gap = datetime.timedelta(hours=0.5)

    def is_quiz_event(event_type):
        # True when the event type points at a problem block or is one of
        # the known problem events.
        return ("problem+block" in event_type
                or "_problem;_" in event_type
                or event_type in submission_event_collection)

    def resolve_question_id(event_type, previous_question_id):
        # The question id is embedded in URL-style event types; events
        # without one (e.g. a bare "problem_check") keep the previous id.
        parts = event_type.split("/")
        question_id = previous_question_id
        if "problem+block" in event_type:
            question_id = parts[4]
        if "_problem;_" in event_type:
            question_id = parts[6].replace(";_", "/")
        return question_id

    # session_id -> {"course_learner_id": ..., "time_array": [intervals]}
    sessions = {}

    def record_interval(session_id, course_learner_id, start_time, end_time):
        # Append one closed [start_time, end_time] interval to the session.
        interval = {"start_time": start_time, "end_time": end_time}
        if session_id in sessions:
            sessions[session_id]["time_array"].append(interval)
        else:
            sessions[session_id] = {
                "course_learner_id": course_learner_id,
                "time_array": [interval]
            }

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    log_files = os.listdir(log_path)

    learner_all_event_logs = {}
    # Events at or after each learner's last closed session, carried over to
    # the next processed file so a session spanning two files is not cut.
    updated_learner_all_event_logs = {}

    while current_date != end_next_date:

        for file in log_files:
            if current_date not in file:
                continue

            print(file)

            # Start from the events carried over from the previous file.
            learner_all_event_logs = updated_learner_all_event_logs
            updated_learner_all_event_logs = {}

            # Stream the file and make sure the handle is closed afterwards.
            with open(log_path + file, "r") as input_file:
                for line in input_file:

                    jsonObject = json.loads(line)

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    global_learner_id = jsonObject["context"]["user_id"]
                    event_type = str(jsonObject["event_type"])

                    if global_learner_id == "":
                        continue

                    course_id = jsonObject["context"]["course_id"]
                    course_learner_id = course_id + "_" + str(
                        global_learner_id)

                    # Keep "YYYY-MM-DDTHH:MM:SS" and drop fractions/timezone.
                    event_time = jsonObject["time"][0:19].replace("T", " ")
                    event_time = datetime.datetime.strptime(
                        event_time, "%Y-%m-%d %H:%M:%S")

                    learner_all_event_logs.setdefault(
                        course_learner_id, []).append({
                            "event_time": event_time,
                            "event_type": event_type
                        })

            # For quiz session separation
            for course_learner_id in learner_all_event_logs:

                event_logs = learner_all_event_logs[course_learner_id]

                # Sorting (cmp= is Python 2 only; cmp_datetime is defined
                # elsewhere in this file)
                event_logs.sort(cmp=cmp_datetime,
                                key=operator.itemgetter('event_time'))

                session_id = ""
                start_time = ""
                end_time = ""

                # Time of the last event that closed a session ("" if none).
                final_time = ""

                # Reset per learner so an event without an embedded question
                # id never inherits another learner's question (and never
                # hits an unbound variable on the very first event).
                question_id = ""

                for log in event_logs:
                    event_type = log["event_type"]
                    event_time = log["event_time"]
                    quiz_event = is_quiz_event(event_type)

                    if session_id == "":
                        # No open session: only a quiz event on a known
                        # question can open one.
                        if quiz_event:
                            question_id = resolve_question_id(
                                event_type, question_id)
                            if question_id in child_parent_map:
                                parent_block_id = child_parent_map[question_id]
                                session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                start_time = event_time
                                end_time = event_time

                    elif quiz_event:
                        if event_time > end_time + session_gap:
                            # Gap too large: close the open session and try
                            # to open a new one from this event.
                            record_interval(session_id, course_learner_id,
                                            start_time, end_time)
                            final_time = event_time

                            question_id = resolve_question_id(
                                event_type, question_id)
                            if question_id in child_parent_map:
                                parent_block_id = child_parent_map[question_id]
                                session_id = "quiz_session_" + parent_block_id + "_" + course_learner_id
                                start_time = event_time
                                end_time = event_time
                            else:
                                session_id = ""
                                start_time = ""
                                end_time = ""
                        else:
                            end_time = event_time

                    else:
                        # A non-quiz event ends the session; it still extends
                        # the end time when it falls inside the gap window.
                        if event_time <= end_time + session_gap:
                            end_time = event_time

                        record_interval(session_id, course_learner_id,
                                        start_time, end_time)
                        final_time = event_time

                        session_id = ""
                        start_time = ""
                        end_time = ""

                if final_time != "":
                    updated_learner_all_event_logs[course_learner_id] = [
                        log for log in event_logs
                        if log["event_time"] >= final_time
                    ]

        current_date = getNextDay(current_date)

    # To compress the session event_logs: merge intervals of the same session
    # that are no more than the session gap apart.
    for session_id in sessions:
        time_array = sessions[session_id]["time_array"]
        if len(time_array) <= 1:
            continue

        updated_time_array = []
        start_time = time_array[0]["start_time"]
        end_time = time_array[0]["end_time"]

        for interval in time_array[1:]:
            if interval["start_time"] > end_time + session_gap:
                updated_time_array.append({
                    "start_time": start_time,
                    "end_time": end_time
                })
                start_time = interval["start_time"]
            end_time = interval["end_time"]

        updated_time_array.append({
            "start_time": start_time,
            "end_time": end_time
        })

        sessions[session_id]["time_array"] = updated_time_array

    quiz_session_record = []

    for session_id in sessions:
        course_learner_id = sessions[session_id]["course_learner_id"]
        for interval in sessions[session_id]["time_array"]:

            start_time = interval["start_time"]
            end_time = interval["end_time"]
            if start_time < end_time:
                elapsed = end_time - start_time
                duration = elapsed.days * 24 * 60 * 60 + elapsed.seconds
                final_session_id = session_id + "_" + str(
                    start_time) + "_" + str(end_time)

                # Sessions of 5 seconds or less are discarded.
                if duration > 5:
                    quiz_session_record.append([
                        final_session_id, course_learner_id, start_time,
                        end_time, duration
                    ])

    # Database version
    sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
    for array in quiz_session_record:
        session_id = array[0]
        course_learner_id = array[1]
        start_time = array[2]
        end_time = array[3]
        duration = process_null(array[4])
        data = (session_id, course_learner_id, start_time, end_time, duration)
        cursor.execute(sql, data)
    ''' 
Example #14
0
def quiz_mode(metadata_path, log_path, cursor):
    """Populate the quiz_questions, submissions and assessments tables.

    First, every question in the course metadata is inserted into
    ``quiz_questions`` with its weight, the type of its closest ancestor
    found in ``block_type_map`` and the first non-empty due date found while
    walking up the parent chain.  Then, for every day from the course
    ``start_date`` up to and including its ``end_date``, each file in
    ``log_path`` whose name contains that date is scanned for
    ``problem_check`` events; each one with a problem id becomes a row in
    ``submissions`` and, when it carries both ``grade`` and ``max_grade``,
    a row in ``assessments``.

    Args:
        metadata_path: passed to ``ExtractCourseInformation``.
        log_path: directory prefix of the daily log files; it is concatenated
            directly with each file name, so it must end with a separator.
        cursor: DB cursor exposing ``execute(sql, params)``.
    """

    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)

    quiz_question_map = course_metadata_map["quiz_question_map"]
    block_type_map = course_metadata_map["block_type_map"]
    element_time_map_due = course_metadata_map["element_time_map_due"]
    child_parent_map = course_metadata_map["child_parent_map"]

    question_sql = "insert into quiz_questions(question_id, question_type, question_weight, question_due) values (%s,%s,%s,%s)"

    for question_id in quiz_question_map:

        question_weight = quiz_question_map[question_id]

        quiz_question_parent = child_parent_map[question_id]
        question_due = element_time_map_due.get(quiz_question_parent, "")

        # Climb the hierarchy until an ancestor with a known block type is
        # reached, picking up the first non-empty due date on the way.
        while quiz_question_parent not in block_type_map:
            quiz_question_parent = child_parent_map[quiz_question_parent]
            if question_due == "" and quiz_question_parent in element_time_map_due:
                question_due = element_time_map_due[quiz_question_parent]

        quiz_question_type = block_type_map[quiz_question_parent]
        question_due = process_null(question_due)
        data = (question_id, quiz_question_type, question_weight, question_due)
        cursor.execute(question_sql, data)

    # Processing events data.
    # Only "problem_check" is treated as a submission here; the other
    # problem events (save/reset/rescore/show answer, ...) are not.
    submission_event_collection = ("problem_check", )

    submission_sql = "insert into submissions(submission_id, course_learner_id, question_id, submission_timestamp) values (%s,%s,%s,%s)"
    assessment_sql = "insert into assessments(assessment_id, course_learner_id, max_grade, grade) values (%s,%s,%s,%s)"

    current_date = course_metadata_map["start_date"]
    end_next_date = getNextDay(course_metadata_map["end_date"])

    log_files = os.listdir(log_path)

    # Running counter that keeps submission ids unique across all files.
    submission_uni_index = 0

    while current_date != end_next_date:

        for file in log_files:
            if current_date not in file:
                continue

            print(file)

            # Stream the file and make sure the handle is closed afterwards.
            with open(log_path + file, "r") as input_file:
                for line in input_file:

                    jsonObject = json.loads(line)

                    if jsonObject["event_type"] not in submission_event_collection:
                        continue

                    # Some daily logs don't have the "user_id" value
                    if "user_id" not in jsonObject["context"]:
                        continue

                    global_learner_id = jsonObject["context"]["user_id"]
                    if global_learner_id == "":
                        continue

                    course_id = jsonObject["context"]["course_id"]
                    course_learner_id = course_id + "_" + str(
                        global_learner_id)

                    # Keep "YYYY-MM-DDTHH:MM:SS" and drop fractions/timezone.
                    event_time = jsonObject["time"][0:19].replace("T", " ")
                    event_time = datetime.datetime.strptime(
                        event_time, "%Y-%m-%d %H:%M:%S")

                    question_id = ""
                    grade = ""
                    max_grade = ""

                    # Only dict payloads carry a problem id; other payload
                    # shapes leave question_id empty and are skipped below.
                    event = jsonObject["event"]
                    if isinstance(event, dict):
                        question_id = event["problem_id"]

                        # The fields "grade" and "max_grade" are specific to
                        # submission event "problem_check"
                        if "grade" in event and "max_grade" in event:
                            grade = event["grade"]
                            max_grade = event["max_grade"]

                    if question_id == "":
                        continue

                    submission_id = course_learner_id + "_" + question_id + "_" + str(
                        submission_uni_index)
                    submission_uni_index = submission_uni_index + 1

                    # For submissions
                    data = (submission_id, course_learner_id, question_id,
                            event_time)
                    cursor.execute(submission_sql, data)

                    # For assessments (the assessment id is the submission id)
                    if grade != "" and max_grade != "":
                        data = (submission_id, course_learner_id, max_grade,
                                grade)
                        cursor.execute(assessment_sql, data)

        current_date = getNextDay(current_date)
    ''' 
def forum_sessions(metadata_path, daily_log_path, remaining_forum_session_log_path, cursor):
    """Derive forum sessions from one daily log file and insert them.

    Events left over from the previous run are loaded from
    ``remaining_forum_session_log_path`` (if it exists) and merged with the
    events of ``daily_log_path``.  Per learner, consecutive forum events are
    grouped into a session; a session is closed when a non-forum event
    arrives or when the next forum event is more than 30 minutes after the
    previous one.  Sessions longer than 5 seconds are inserted into the
    ``forum_sessions`` table together with the number of forum searches.
    Events after each learner's last closed session are written back to
    ``remaining_forum_session_log_path`` for the next run, unless the daily
    log is the one of the course end date, in which case that file is removed.

    NOTE: this code targets Python 2 (``list.sort(cmp=...)``).

    Args:
        metadata_path: passed to ``ExtractCourseInformation``.
        daily_log_path: path of the daily log; events whose date does not
            appear in this path are ignored.
        remaining_forum_session_log_path: JSON file carrying unfinished
            events between runs.
        cursor: DB cursor exposing ``execute(sql, params)``.
    """
    # Function-scope import: only needed to report failed inserts.
    import logging

    utc = pytz.UTC

    # Forum events further apart than this belong to different sessions.
    session_gap = datetime.timedelta(hours=0.5)

    insert_sql = "insert into forum_sessions (session_id, course_learner_id, times_search, start_time, end_time, duration, relevent_element_id) values (%s,%s,%s,%s,%s,%s,%s)"

    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]

    # Forum-related events
    forum_event_types = (
        "edx.forum.comment.created",
        "edx.forum.response.created",
        "edx.forum.response.voted",
        "edx.forum.thread.created",
        "edx.forum.thread.voted",
        "edx.forum.searched",
    )

    # Normalised event types that open or extend a forum session.
    session_event_types = ("forum_activity", "edx.forum.searched")

    learner_logs = {}
    remaining_learner_logs = {}

    # Read remaining event logs
    if os.path.exists(remaining_forum_session_log_path):
        with open(remaining_forum_session_log_path) as remaining_input_file:
            learner_logs = json.loads(remaining_input_file.read(), object_hook=json_util.object_hook)

    with open(daily_log_path, "r") as input_file:
        for line in input_file:

            jsonObject = json.loads(line)
            context = jsonObject["context"]

            # Skip records without user_id
            if "user_id" not in context or context["user_id"] == "" or context["user_id"] is None:
                continue

            global_learner_id = context["user_id"]
            event_type = str(jsonObject["event_type"])

            # Collapse every forum event except searches into one type, so
            # that searches can still be counted separately.
            if "/discussion/" in event_type or event_type in forum_event_types:
                if event_type != "edx.forum.searched":
                    event_type = "forum_activity"

            course_id = context["course_id"]
            course_learner_id = course_id + "_" + str(global_learner_id)

            event_time = jsonObject["time"]

            # Check whether the event record belongs to that day
            if event_time[0:10] not in daily_log_path:
                continue

            # Keep "YYYY-MM-DDTHH:MM:SS", drop fractions, and mark as UTC.
            event_time = event_time[0:19].replace("T", " ")
            event_time = datetime.datetime.strptime(event_time, "%Y-%m-%d %H:%M:%S")
            event_time = event_time.replace(tzinfo=utc)

            learner_logs.setdefault(course_learner_id, []).append(
                {"event_time": event_time, "event_type": event_type})

    def store_session(course_learner_id, start_time, end_time, search_count):
        # Insert one closed session if it lasted more than 5 seconds.
        session_id = "forum_session_" + course_learner_id + "_" + str(start_time) + "_" + str(end_time)
        elapsed = end_time - start_time
        duration = elapsed.days * 24 * 60 * 60 + elapsed.seconds

        # Values are passed through process_null (defined elsewhere in this
        # file) before being stored; the running counter is left untouched.
        db_times_search = process_null(search_count)
        duration = process_null(duration)

        if duration > 5:
            row = (session_id, course_learner_id, db_times_search, start_time, end_time, duration, "")
            try:
                cursor.execute(insert_sql, row)
            except Exception as e:
                # Best effort: a failed insert must not abort the whole
                # run, but it should not disappear silently either.
                logging.warning("Could not insert forum session %s: %s", session_id, e)

    # For forum session separation
    for course_learner_id in learner_logs:

        event_logs = learner_logs[course_learner_id]

        # Sorting (cmp= is Python 2 only; cmp_datetime is defined elsewhere
        # in this file)
        event_logs.sort(cmp=cmp_datetime, key=operator.itemgetter('event_time'))

        session_open = False
        start_time = ""
        end_time = ""
        times_search = 0

        # Time of the last event that closed a session ("" if none).
        final_time = ""

        for log in event_logs:
            event_type = log["event_type"]
            event_time = log["event_time"]
            forum_event = event_type in session_event_types
            searched = event_type == "edx.forum.searched"

            if not session_open:
                if forum_event:
                    # Initialization
                    session_open = True
                    start_time = event_time
                    end_time = event_time
                    if searched:
                        times_search += 1

            elif forum_event:
                if event_time > end_time + session_gap:
                    # Gap too large: close the open session ...
                    store_session(course_learner_id, start_time, end_time, times_search)
                    final_time = event_time

                    # ... and start a new one from this event.  The search
                    # counter restarts so the previous session's searches
                    # are not counted again.
                    start_time = event_time
                    end_time = event_time
                    times_search = 1 if searched else 0
                else:
                    end_time = event_time
                    if searched:
                        times_search += 1

            else:
                # A non-forum event ends the session; it still extends the
                # end time when it falls inside the gap window.
                if event_time <= end_time + session_gap:
                    end_time = event_time

                store_session(course_learner_id, start_time, end_time, times_search)
                final_time = event_time

                # Re-initialization
                session_open = False
                start_time = ""
                end_time = ""
                times_search = 0

        # NOTE(review): learners without any closed session are not carried
        # over, and the comparison is strict, so the event that set
        # final_time is itself dropped - confirm this is intended.
        if final_time != "":
            remaining_learner_logs[course_learner_id] = [
                log for log in event_logs if log["event_time"] > final_time]

    # Output remaining logs
    if str(end_date)[0:10] not in daily_log_path:
        with open(remaining_forum_session_log_path, "w") as output_file:
            output_file.write(json.dumps(remaining_learner_logs, default=json_util.default))
    elif os.path.exists(remaining_forum_session_log_path):
        # Last day of the course: nothing left to carry over.  The file may
        # not exist if no earlier run left any events behind.
        os.remove(remaining_forum_session_log_path)
        
        
def _read_tsv_records(path):
    """Return the tab-split fields of every line in *path* except the first.

    The first line is discarded (presumably a column header). The file is
    closed before returning, even when reading fails part-way.
    """
    with open(path, "r") as input_file:
        next(input_file, None)
        return [line.split("\t") for line in input_file]


def learner_mode(metadata_path, cursor):
    """Load course, element and learner records into the database.

    Reads the course description through ExtractCourseInformation and the
    tab-separated files in *metadata_path* whose names contain
    "student_courseenrollment", "auth_user-",
    "certificates_generatedcertificate" or "auth_userprofile", then inserts
    rows into the courses, course_elements, learner_index, course_learner and
    learner_demographic tables. All files are parsed before the first insert.

    Args:
        metadata_path: Directory that holds the metadata files.
        cursor: Database cursor; every row is written with
            ``cursor.execute(sql, params)``.
    """
    # Collect course information
    course_metadata_map = ExtractCourseInformation(metadata_path)
    metadata_course_id = course_metadata_map["course_id"]
    course_name = course_metadata_map["course_name"]
    course_start_time = course_metadata_map["start_time"]
    course_end_time = course_metadata_map["end_time"]

    # Course_element table
    course_element_record = []
    for element_id, element_start_time in course_metadata_map["element_time_map"].items():
        # Some contents are released one hour earlier than the hour of the
        # course start time (e.g. start 2015-10-15 09:00:00, 2nd week release
        # 2015-10-22 08:00:00) and would be counted as 1st week. Comparing
        # dates instead of datetimes avoids that.
        # NOTE(review): "/" is kept from the original; it yields a whole week
        # number only under Python 2 integer division with an int day
        # difference - confirm what getDayDiff returns.
        week = getDayDiff(course_start_time.date(), element_start_time.date()) / 7 + 1
        course_element_record.append([
            element_id,
            course_metadata_map["element_type_map"][element_id],
            week,
            metadata_course_id,
        ])

    files = os.listdir(metadata_path)

    learner_index_record = []
    course_learner_record = []
    learner_demographic_record = []

    learner_mail_map = {}             # global_learner_id -> email
    course_learner_map = {}           # global_learner_id -> course_learner_id
    learner_enrollment_time_map = {}  # global_learner_id -> enrollment time
    enrolled_learner_set = set()

    # Processing student_courseenrollment data
    for file_name in files:
        if "student_courseenrollment" not in file_name:
            continue
        for record in _read_tsv_records(os.path.join(metadata_path, file_name)):
            global_learner_id = record[1]
            enrollment_course_id = record[2]
            enrollment_time = datetime.datetime.strptime(record[3], "%Y-%m-%d %H:%M:%S")
            course_learner_id = enrollment_course_id + "_" + global_learner_id

            # NOTE(review): cmp_datetime's result is used as a truth value. It
            # is passed as sort(cmp=...) elsewhere in this file; if it is a
            # -1/0/1 comparator, this keeps every learner whose enrollment
            # time merely differs from the course end time - confirm the
            # intended filter.
            if cmp_datetime(course_end_time, enrollment_time):
                enrolled_learner_set.add(global_learner_id)
                learner_index_record.append(
                    [global_learner_id, enrollment_course_id, course_learner_id])
                course_learner_map[global_learner_id] = course_learner_id
                learner_enrollment_time_map[global_learner_id] = enrollment_time

        # Single parenthesised argument: prints the same on Python 2 and 3.
        print("The number of enrolled learners is: " + str(len(enrolled_learner_set)) + "\n")

    # Processing auth_user data
    for file_name in files:
        if "auth_user-" not in file_name:
            continue
        for record in _read_tsv_records(os.path.join(metadata_path, file_name)):
            if record[0] in enrolled_learner_set:
                learner_mail_map[record[0]] = record[4]

    # Processing certificates_generatedcertificate data
    num_uncertifiedLearners = 0
    num_certifiedLearners = 0
    for file_name in files:
        if "certificates_generatedcertificate" not in file_name:
            continue
        for record in _read_tsv_records(os.path.join(metadata_path, file_name)):
            global_learner_id = record[1]
            final_grade = record[3]
            enrollment_mode = record[14].replace("\n", "")
            certificate_status = record[7]

            if global_learner_id in course_learner_map:
                num_certifiedLearners += 1
                course_learner_record.append([
                    course_learner_map[global_learner_id],
                    final_grade,
                    enrollment_mode,
                    certificate_status,
                    learner_enrollment_time_map[global_learner_id],
                ])
            else:
                num_uncertifiedLearners += 1

        print("The number of uncertified & certified learners is: " + str(num_uncertifiedLearners) + "\t" + str(num_certifiedLearners) + "\n")

    # Processing auth_userprofile data
    for file_name in files:
        if "auth_userprofile" not in file_name:
            continue
        for record in _read_tsv_records(os.path.join(metadata_path, file_name)):
            global_learner_id = record[1]
            gender = record[7]
            year_of_birth = record[9]
            level_of_education = record[10]
            country = record[13]

            if global_learner_id in enrolled_learner_set:
                learner_demographic_record.append([
                    # Reuse the id stored for this learner at enrollment so it
                    # always matches learner_index, instead of rebuilding it
                    # from whichever course id was read last.
                    course_learner_map[global_learner_id],
                    gender,
                    year_of_birth,
                    level_of_education,
                    country,
                    # An enrolled learner without an auth_user row gets an
                    # empty email rather than aborting the whole import.
                    learner_mail_map.get(global_learner_id, ""),
                ])

    # Database version
    # Course table
    cursor.execute(
        "insert into courses(course_id, course_name, start_time, end_time) values (%s,%s,%s,%s)",
        (metadata_course_id, course_name, course_start_time, course_end_time))

    # Course_element table
    for element_id, element_type, week, element_course_id in course_element_record:
        sql = "insert into course_elements(element_id, element_type, week, course_id) values (%s,%s,%s,%s)"
        cursor.execute(sql, (element_id, element_type, process_null(week), element_course_id))

    # Learner_index table
    for global_learner_id, index_course_id, course_learner_id in learner_index_record:
        sql = "insert into learner_index(global_learner_id, course_id, course_learner_id) values (%s,%s,%s)"
        cursor.execute(sql, (global_learner_id, index_course_id, course_learner_id))

    # Course_learner table
    for course_learner_id, final_grade, enrollment_mode, certificate_status, register_time in course_learner_record:
        sql = "insert into course_learner(course_learner_id, final_grade, enrollment_mode, certificate_status, register_time) values (%s,%s,%s,%s,%s)"
        data = (course_learner_id, process_null(final_grade), enrollment_mode,
                certificate_status, process_null(register_time))
        cursor.execute(sql, data)

    # Learner_demographic table
    for course_learner_id, gender, year_of_birth, level_of_education, country, email in learner_demographic_record:
        # NOTE(review): the nested process_null call is kept from the
        # original; presumably redundant - confirm process_null is idempotent
        # before collapsing it.
        year_of_birth = process_null(process_null(year_of_birth))
        email = email.replace("\'", "")
        sql = "insert into learner_demographic(course_learner_id, gender, year_of_birth, level_of_education, country, email) values (%s,%s,%s,%s,%s,%s)"
        data = (process_null(course_learner_id), gender, year_of_birth,
                level_of_education, country, email)
        cursor.execute(sql, data)
    
    # File version
    '''
Example #17
0
def _quiz_question_id(event_type, previous_question_id):
    """Return the question id embedded in a problem URL event type.

    When *event_type* contains neither "problem+block" nor "_problem;_" the
    *previous_question_id* is returned unchanged, so a plain quiz event keeps
    pointing at the question that opened the current session.
    """
    parts = event_type.split("/")
    question_id = previous_question_id
    if "problem+block" in event_type:
        question_id = parts[4]
    if "_problem;_" in event_type:
        question_id = parts[6].replace(";_", "/")
    return question_id


def _record_quiz_span(sessions, session_id, course_learner_id, start_time,
                      end_time):
    """Append one (start_time, end_time) span to *sessions[session_id]*."""
    span = {"start_time": start_time, "end_time": end_time}
    if session_id in sessions:
        sessions[session_id]["time_array"].append(span)
    else:
        sessions[session_id] = {
            "course_learner_id": course_learner_id,
            "time_array": [span]
        }


def _merge_quiz_spans(time_array, max_gap):
    """Merge consecutive spans whose start is within *max_gap* of the running end.

    Spans are taken in list order; the running end is always replaced by the
    end of the span just absorbed.
    """
    start_time = time_array[0]["start_time"]
    end_time = time_array[0]["end_time"]
    merged = []
    for span in time_array[1:]:
        if span["start_time"] > end_time + max_gap:
            merged.append({"start_time": start_time, "end_time": end_time})
            start_time = span["start_time"]
        end_time = span["end_time"]
    merged.append({"start_time": start_time, "end_time": end_time})
    return merged


def quiz_sessions(metadata_path, daily_log_path,
                  remaining_forum_session_log_path, cursor):
    """Extract quiz sessions from one daily log and insert them into quiz_sessions.

    Events carried over in *remaining_forum_session_log_path* are combined
    with the events of *daily_log_path*, grouped per learner and sorted by
    time. A session opens on a problem URL event whose question id is a key of
    the course's child_parent_map, is extended by further problem/quiz events
    at most half an hour after the previous one, and is closed by any other
    event or by a longer gap. Spans of the same session id that lie within
    half an hour of each other are merged, and every span longer than five
    seconds is inserted.

    Args:
        metadata_path: Passed to ExtractCourseInformation.
        daily_log_path: Path of the daily log (one JSON object per line); an
            event is kept only if the date prefix of its "time" field occurs
            in this path.
        remaining_forum_session_log_path: File used to carry unprocessed
            events over to the next run. It is rewritten unless the course end
            date occurs in *daily_log_path*, in which case it is removed.
        cursor: Database cursor used for the inserts.
    """
    utc = pytz.UTC
    session_gap = datetime.timedelta(hours=0.5)

    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]
    child_parent_map = course_metadata_map["child_parent_map"]

    # Quiz-related events
    quiz_event_types = frozenset([
        # Problem check
        "problem_check",  # Server
        "save_problem_check",
        "problem_check_fail",
        "save_problem_check_fail",
        # The server emits a problem_graded event each time a user selects
        # Check for a problem and it is graded successfully.
        "problem_graded",
        # The server emits problem_rescore events when a problem is
        # successfully rescored.
        "problem_rescore",
        "problem_rescore_fail",
        "problem_reset",  # event_source: server
        "reset_problem",
        "reset_problem_fail",
        # The server emits problem_save events after a user saves a problem.
        "problem_save",  # event_source: server
        "save_problem_fail",
        "save_problem_success",
        # Show answer
        "problem_show",
        "showanswer",
        "edx.problem.hint.demandhint_displayed",
        "edx.problem.hint.feedback_displayed",
    ])

    learner_logs = {}
    remaining_learner_logs = {}
    sessions = {}

    # Read remaining event logs
    if os.path.exists(remaining_forum_session_log_path):
        with open(remaining_forum_session_log_path) as remaining_input_file:
            learner_logs = json.loads(remaining_input_file.read(),
                                      object_hook=json_util.object_hook)

    with open(daily_log_path, "r") as input_file:
        for line in input_file:
            json_object = json.loads(line)
            context = json_object["context"]

            # Skip records without user_id
            global_learner_id = context.get("user_id")
            if global_learner_id is None or global_learner_id == "":
                continue

            event_type = str(json_object["event_type"])
            course_learner_id = context["course_id"] + "_" + str(
                global_learner_id)

            # Check whether the event record belongs to that day
            raw_time = json_object["time"]
            if raw_time[0:10] not in daily_log_path:
                continue

            event_time = datetime.datetime.strptime(
                raw_time[0:19].replace("T", " "), "%Y-%m-%d %H:%M:%S")
            event_time = event_time.replace(tzinfo=utc)

            learner_logs.setdefault(course_learner_id, []).append({
                "event_time": event_time,
                "event_type": event_type
            })

    # For quiz session separation
    question_id = None
    for course_learner_id, event_logs in learner_logs.items():

        # Sorting (Python 2 comparator-based sort, kept from the original)
        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))

        session_id = ""
        start_time = ""
        end_time = ""
        # Time of the event that most recently closed a session; only later
        # events are carried over to the next run.
        final_time = ""

        for event in event_logs:
            event_type = event["event_type"]
            event_time = event["event_time"]
            is_problem_url = ("problem+block" in event_type
                              or "_problem;_" in event_type)

            if session_id == "":
                # No open session: only a problem URL event can open one.
                if is_problem_url:
                    question_id = _quiz_question_id(event_type, question_id)
                    if question_id in child_parent_map:
                        session_id = ("quiz_session_" +
                                      child_parent_map[question_id] + "_" +
                                      course_learner_id)
                        start_time = event_time
                        end_time = event_time
                continue

            if is_problem_url or event_type in quiz_event_types:
                if event_time <= end_time + session_gap:
                    end_time = event_time
                    continue

                # Gap too long: close the open session and try to start a new
                # one from this event.
                _record_quiz_span(sessions, session_id, course_learner_id,
                                  start_time, end_time)
                final_time = event_time

                question_id = _quiz_question_id(event_type, question_id)
                if question_id in child_parent_map:
                    session_id = ("quiz_session_" +
                                  child_parent_map[question_id] + "_" +
                                  course_learner_id)
                    start_time = event_time
                    end_time = event_time
                else:
                    session_id = ""
                    start_time = ""
                    end_time = ""
            else:
                # A non-quiz event closes the session; it still extends the
                # end time when it falls inside the allowed gap.
                if event_time <= end_time + session_gap:
                    end_time = event_time

                _record_quiz_span(sessions, session_id, course_learner_id,
                                  start_time, end_time)
                final_time = event_time

                session_id = ""
                start_time = ""
                end_time = ""

        if final_time != "":
            remaining_learner_logs[course_learner_id] = [
                log for log in event_logs if log["event_time"] > final_time
            ]

    # Output remaining logs
    if str(end_date)[0:10] not in daily_log_path:
        with open(remaining_forum_session_log_path, "w") as output_file:
            output_file.write(
                json.dumps(remaining_learner_logs, default=json_util.default))
    elif os.path.exists(remaining_forum_session_log_path):
        # On the final day there may be no leftover file to delete; an
        # unguarded remove would raise before any session is stored.
        os.remove(remaining_forum_session_log_path)

    # To compress the session event_logs
    for session in sessions.values():
        if len(session["time_array"]) > 1:
            session["time_array"] = _merge_quiz_spans(session["time_array"],
                                                      session_gap)

    sql = "insert into quiz_sessions (session_id, course_learner_id, start_time, end_time, duration) values (%s,%s,%s,%s,%s)"
    for session_id, session in sessions.items():
        course_learner_id = session["course_learner_id"]
        for span in session["time_array"]:
            start_time = span["start_time"]
            end_time = span["end_time"]
            if not start_time < end_time:
                continue

            elapsed = end_time - start_time
            duration = process_null(elapsed.days * 24 * 60 * 60 +
                                    elapsed.seconds)
            final_session_id = session_id + "_" + str(start_time) + "_" + str(
                end_time)
            if duration > 5:
                array = (final_session_id, course_learner_id, start_time,
                         end_time, duration)
                try:
                    cursor.execute(sql, array)
                except Exception:
                    # Best-effort insert kept from the original: a failing row
                    # is skipped so the remaining sessions are still stored.
                    pass
Example #18
0
def video_interaction(metadata_path, daily_log_path,
                      remaining_video_interaction_log_path, cursor):

    utc = pytz.UTC

    course_metadata_map = ExtractCourseInformation(metadata_path)
    end_date = course_metadata_map["end_date"]

    video_interaction_map = {}

    # Video-related event types
    video_event_types = []
    video_event_types.append("hide_transcript")
    video_event_types.append("edx.video.transcript.hidden")
    video_event_types.append("edx.video.closed_captions.hidden")
    video_event_types.append("edx.video.closed_captions.shown")
    video_event_types.append("load_video")
    video_event_types.append("edx.video.loaded")
    video_event_types.append("pause_video")
    video_event_types.append("edx.video.paused")
    video_event_types.append("play_video")
    video_event_types.append("edx.video.played")
    video_event_types.append("seek_video")
    video_event_types.append("edx.video.position.changed")
    video_event_types.append("show_transcript")
    video_event_types.append("edx.video.transcript.shown")
    video_event_types.append("speed_change_video")
    video_event_types.append("stop_video")
    video_event_types.append("edx.video.stopped")
    video_event_types.append("video_hide_cc_menu")
    video_event_types.append("edx.video.language_menu.hidden")
    video_event_types.append("video_show_cc_menu")
    video_event_types.append("edx.video.language_menu.shown")

    learner_logs = {}
    remaining_learner_logs = {}

    # Read remaining event logs
    if os.path.exists(remaining_video_interaction_log_path):
        remaining_input_file = open(remaining_video_interaction_log_path)
        learner_logs = json.loads(remaining_input_file.read(),
                                  object_hook=json_util.object_hook)

    # Course_learner_id set
    course_learner_id_set = set()
    for course_learner_id in learner_logs.keys():
        course_learner_id_set.add(course_learner_id)

    input_file = open(daily_log_path, "r")
    for line in input_file:

        jsonObject = json.loads(line)

        # Skip records without user_id
        if "user_id" not in jsonObject["context"] or jsonObject["context"][
                "user_id"] == "" or jsonObject["context"]["user_id"] == None:
            continue

        global_learner_id = jsonObject["context"]["user_id"]
        event_type = jsonObject["event_type"]

        course_id = jsonObject["context"]["course_id"]
        course_learner_id = course_id + "_" + str(global_learner_id)

        event_time = jsonObject["time"]

        # Check whether the event record belongs to that day
        log_date = event_time[0:10]
        if log_date not in daily_log_path:
            # print "Log not belonging to the day...\t" + log_date
            continue

        event_time = event_time[0:19]
        event_time = event_time.replace("T", " ")
        event_time = datetime.datetime.strptime(event_time,
                                                "%Y-%m-%d %H:%M:%S")
        event_time = event_time.replace(tzinfo=utc)

        # For video-related events
        if event_type in video_event_types:

            video_id = ""

            # For seek event
            new_time = 0
            old_time = 0

            # For speed change event
            new_speed = 0
            old_speed = 0

            if isinstance(jsonObject["event"], unicode):
                event_jsonObject = json.loads(jsonObject["event"])
                video_id = event_jsonObject["id"]

                video_id = video_id.replace("-", "://", 1)
                video_id = video_id.replace("-", "/")

                # For video seek event
                if "new_time" in event_jsonObject and "old_time" in event_jsonObject:
                    new_time = event_jsonObject["new_time"]
                    old_time = event_jsonObject["old_time"]

                # For video speed change event
                if "new_speed" in event_jsonObject and "old_speed" in event_jsonObject:
                    new_speed = event_jsonObject["new_speed"]
                    old_speed = event_jsonObject["old_speed"]

            # To record video seek event
            if event_type in ["seek_video", "edx.video.position.changed"]:
                if new_time is not None and old_time is not None:
                    if course_learner_id in course_learner_id_set:
                        learner_logs[course_learner_id].append({
                            "event_time":
                            event_time,
                            "event_type":
                            event_type,
                            "video_id":
                            video_id,
                            "new_time":
                            new_time,
                            "old_time":
                            old_time
                        })
                    else:
                        learner_logs[course_learner_id] = [{
                            "event_time": event_time,
                            "event_type": event_type,
                            "video_id": video_id,
                            "new_time": new_time,
                            "old_time": old_time
                        }]
                        course_learner_id_set.add(course_learner_id)
                continue

            # To record video speed change event
            if event_type in ["speed_change_video"]:
                if course_learner_id in course_learner_id_set:
                    learner_logs[course_learner_id].append({
                        "event_time":
                        event_time,
                        "event_type":
                        event_type,
                        "video_id":
                        video_id,
                        "new_speed":
                        new_speed,
                        "old_speed":
                        old_speed
                    })
                else:
                    learner_logs[course_learner_id] = [{
                        "event_time": event_time,
                        "event_type": event_type,
                        "video_id": video_id,
                        "new_speed": new_speed,
                        "old_speed": old_speed
                    }]
                    course_learner_id_set.add(course_learner_id)
                continue

            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({
                    "event_time": event_time,
                    "event_type": event_type,
                    "video_id": video_id
                })
            else:
                learner_logs[course_learner_id] = [{
                    "event_time": event_time,
                    "event_type": event_type,
                    "video_id": video_id
                }]
                course_learner_id_set.add(course_learner_id)

        # For non-video-related events
        if event_type not in video_event_types:
            if course_learner_id in course_learner_id_set:
                learner_logs[course_learner_id].append({
                    "event_time": event_time,
                    "event_type": event_type
                })
            else:
                learner_logs[course_learner_id] = [{
                    "event_time": event_time,
                    "event_type": event_type
                }]
                course_learner_id_set.add(course_learner_id)

    input_file.close()

    # For video interaction extraction
    for learner in learner_logs.keys():

        course_learner_id = learner
        event_logs = learner_logs[learner]

        # Sorting
        event_logs.sort(cmp=cmp_datetime,
                        key=operator.itemgetter('event_time'))

        video_id = ""
        video_start_time = ""
        final_time = ""

        # For video seek event
        times_forward_seek = 0
        duration_forward_seek = 0
        times_backward_seek = 0
        duration_backward_seek = 0

        # For video speed change event
        speed_change_last_time = ""
        times_speed_up = 0
        times_speed_down = 0

        # For video pause event
        pause_check = False
        pause_start_time = ""
        duration_pause = 0

        for i in range(len(event_logs)):

            if event_logs[i]["event_type"] in [
                    "play_video", "edx.video.played"
            ]:
                video_start_time = event_logs[i]["event_time"]
                video_id = event_logs[i]["video_id"]

                if pause_check:
                    duration_pause = (event_logs[i]["event_time"] -
                                      pause_start_time).days * 24 * 60 * 60 + (
                                          event_logs[i]["event_time"] -
                                          pause_start_time).seconds
                    video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                        pause_start_time)
                    if duration_pause > 2 and duration_pause < 600:
                        if video_interaction_id in video_interaction_map.keys(
                        ):
                            video_interaction_map[video_interaction_id][
                                "times_pause"] = 1
                            video_interaction_map[video_interaction_id][
                                "duration_pause"] = duration_pause
                    pause_check = False
                continue

            if video_start_time != "":

                if event_logs[i][
                        "event_time"] > video_start_time + datetime.timedelta(
                            hours=0.5):

                    video_start_time = ""
                    video_id = ""
                    final_time = event_logs[i]["event_time"]

                else:

                    # 0. Seek
                    if event_logs[i]["event_type"] in [
                            "seek_video", "edx.video.position.changed"
                    ] and video_id == event_logs[i]["video_id"]:
                        # Forward seek event
                        if event_logs[i]["new_time"] > event_logs[i][
                                "old_time"]:
                            times_forward_seek += 1
                            duration_forward_seek += event_logs[i][
                                "new_time"] - event_logs[i]["old_time"]
                        # Backward seek event
                        if event_logs[i]["new_time"] < event_logs[i][
                                "old_time"]:
                            times_backward_seek += 1
                            duration_backward_seek += event_logs[i][
                                "old_time"] - event_logs[i]["new_time"]
                        continue

                    # 1. Speed change
                    if event_logs[i][
                            "event_type"] == "speed_change_video" and video_id == event_logs[
                                i]["video_id"]:
                        if speed_change_last_time == "":
                            speed_change_last_time = event_logs[i][
                                "event_time"]
                            old_speed = event_logs[i]["old_speed"]
                            new_speed = event_logs[i]["new_speed"]
                            if old_speed < new_speed:
                                times_speed_up += 1
                            if old_speed > new_speed:
                                times_speed_down += 1
                        else:
                            if (event_logs[i]["event_time"] -
                                    speed_change_last_time).seconds > 10:
                                old_speed = event_logs[i]["old_speed"]
                                new_speed = event_logs[i]["new_speed"]
                                if old_speed < new_speed:
                                    times_speed_up += 1
                                if old_speed > new_speed:
                                    times_speed_down += 1
                            speed_change_last_time = event_logs[i][
                                "event_time"]
                        continue

                    # 2. Pause/Stop situation
                    if event_logs[i]["event_type"] in [
                            "pause_video", "edx.video.paused", "stop_video",
                            "edx.video.stopped"
                    ] and video_id == event_logs[i]["video_id"]:

                        watch_duration = (event_logs[i]["event_time"] -
                                          video_start_time).seconds

                        video_end_time = event_logs[i]["event_time"]
                        video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                            video_start_time) + "_" + str(video_end_time)

                        if watch_duration > 5:
                            video_interaction_map[video_interaction_id] = {
                                "course_learner_id": course_learner_id,
                                "video_id": video_id,
                                "type": "video",
                                "watch_duration": watch_duration,
                                "times_forward_seek": times_forward_seek,
                                "duration_forward_seek": duration_forward_seek,
                                "times_backward_seek": times_backward_seek,
                                "duration_backward_seek":
                                duration_backward_seek,
                                "times_speed_up": times_speed_up,
                                "times_speed_down": times_speed_down,
                                "start_time": video_start_time,
                                "end_time": video_end_time
                            }

                        if event_logs[i]["event_type"] in [
                                "pause_video", "edx.video.paused"
                        ]:
                            pause_check = True
                            pause_start_time = video_end_time

                        # For video seek event
                        times_forward_seek = 0
                        duration_forward_seek = 0
                        times_backward_seek = 0
                        duration_backward_seek = 0

                        # For video speed change event
                        speed_change_last_time = ""
                        times_speed_up = 0
                        times_speed_down = 0

                        # For video general information
                        video_start_time = ""
                        video_id = ""
                        final_time = event_logs[i]["event_time"]

                        continue

                    # 3/4  Page changed/Session closed
                    if event_logs[i]["event_type"] not in video_event_types:

                        video_end_time = event_logs[i]["event_time"]
                        watch_duration = (video_end_time -
                                          video_start_time).seconds
                        video_interaction_id = course_learner_id + "_" + video_id + "_" + str(
                            video_start_time) + "_" + str(video_end_time)

                        if watch_duration > 5:
                            video_interaction_map[video_interaction_id] = {
                                "course_learner_id": course_learner_id,
                                "video_id": video_id,
                                "type": "video",
                                "watch_duration": watch_duration,
                                "times_forward_seek": times_forward_seek,
                                "duration_forward_seek": duration_forward_seek,
                                "times_backward_seek": times_backward_seek,
                                "duration_backward_seek":
                                duration_backward_seek,
                                "times_speed_up": times_speed_up,
                                "times_speed_down": times_speed_down,
                                "start_time": video_start_time,
                                "end_time": video_end_time
                            }

                        # For video seek event
                        times_forward_seek = 0
                        duration_forward_seek = 0
                        times_backward_seek = 0
                        duration_backward_seek = 0

                        # For video speed change event
                        speed_change_last_time = ""
                        times_speed_up = 0
                        times_speed_down = 0

                        # For video general information
                        video_start_time = ""
                        video_id = ""
                        final_time = event_logs[i]["event_time"]

                        continue

        if final_time != "":
            new_logs = []
            for log in event_logs:
                if log["event_time"] > final_time:
                    new_logs.append(log)

            remaining_learner_logs[course_learner_id] = new_logs

    # Output remaining logs
    if str(end_date)[0:10] not in daily_log_path:
        output_file = open(remaining_video_interaction_log_path, "w")
        output_file.write(
            json.dumps(remaining_learner_logs, default=json_util.default))
        output_file.close()
    else:
        os.remove(remaining_video_interaction_log_path)

    for interaction_id in video_interaction_map.keys():
        video_interaction_id = interaction_id
        course_learner_id = video_interaction_map[interaction_id][
            "course_learner_id"]
        video_id = video_interaction_map[interaction_id]["video_id"]
        duration = process_null(
            video_interaction_map[interaction_id]["watch_duration"])
        times_forward_seek = process_null(
            video_interaction_map[interaction_id]["times_forward_seek"])
        duration_forward_seek = process_null(
            video_interaction_map[interaction_id]["duration_forward_seek"])
        times_backward_seek = process_null(
            video_interaction_map[interaction_id]["times_backward_seek"])
        duration_backward_seek = process_null(
            video_interaction_map[interaction_id]["duration_backward_seek"])
        times_speed_up = process_null(
            video_interaction_map[interaction_id]["times_speed_up"])
        times_speed_down = process_null(
            video_interaction_map[interaction_id]["times_speed_down"])
        start_time = video_interaction_map[interaction_id]["start_time"]
        end_time = video_interaction_map[interaction_id]["end_time"]

        if "times_pause" in video_interaction_map[interaction_id]:
            times_pause = process_null(
                video_interaction_map[interaction_id]["times_pause"])
            duration_pause = process_null(
                video_interaction_map[interaction_id]["duration_pause"])
        else:
            times_pause = 0
            duration_pause = 0

        array = [
            video_interaction_id, course_learner_id, video_id, duration,
            times_forward_seek, duration_forward_seek, times_backward_seek,
            duration_backward_seek, times_speed_up, times_speed_down,
            times_pause, duration_pause, start_time, end_time
        ]
        sql = "insert into video_interaction(interaction_id, course_learner_id, video_id, duration, times_forward_seek, duration_forward_seek, times_backward_seek, duration_backward_seek, times_speed_up, times_speed_down, times_pause, duration_pause, start_time, end_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute(sql, array)
        except Exception as e:
            pass