def process_forum_entries():
    """
    Collect per-user forum posting statistics for every course.

    input file: "forum_entries_courses.csv"
        header: service,course_id,kurs,semester,description,user,name,nid,id,
                parent_id,date,subject_length,text_length

    returns a dict with:
        (kurs,semester) -> "username" -> (written count [int],
                                          avg subject length [float],
                                          median text length [int])

    Usernames rejected by is_valid_matrikel_nummer are left out.
    """
    # Column positions within a CSV row (see header above).
    kurs_idx, semester_idx, user_idx = 2, 3, 5
    subject_len_idx, text_len_idx = 11, 12

    def summarise_rows(rows):
        # One row per written entry, so the row count is the written count.
        subject_lengths = [int(row[subject_len_idx]) for row in rows]
        text_lengths = [int(row[text_len_idx]) for row in rows]
        return (len(rows), average(subject_lengths), median(text_lengths))

    def handle_forum_entries(grouped):
        # grouped: username -> rows written by that user in one course.
        return dict(
            (username, summarise_rows(grouped[username]))
            for username in grouped
            if is_valid_matrikel_nummer(username)
        )

    rows = load_lines("forum_entries_courses.csv")
    rows_by_course = groupby(rows, [kurs_idx, semester_idx])
    return groupby(rows_by_course, [user_idx], handle_forum_entries)
def process_entries_read(kurs_mapping):
    """
    Count how many forum entries each user has read, per course.

    input: kurs_mapping - dict with: forum_course_id -> (kurs,semester)

    input file: "forum_readlist.csv"
        header: service,course_id,username,nid,id

    returns a dict with
        (kurs,semester) -> "username" -> read count [integer]

    Usernames rejected by is_valid_matrikel_nummer are left out.  Every
    course_id found in the file is looked up in kurs_mapping with plain
    indexing, so with a dict argument an unknown course_id raises KeyError.
    """
    # Column positions within a CSV row (see header above).
    course_id_idx = 1
    username_idx = 2

    def calculate_read(grouped):
        # grouped: username -> read-list rows of that user in one course.
        counts = {}
        for username in grouped:
            if is_valid_matrikel_nummer(username):
                counts[username] = len(grouped[username])
        return counts

    rows = load_lines("forum_readlist.csv")
    rows_by_course = groupby(rows, [course_id_idx])
    reads_by_course = groupby(rows_by_course, [username_idx], calculate_read)

    # Re-key the per-course result from course_id to (kurs,semester).
    return dict(
        (kurs_mapping[course_id], reads_by_course[course_id])
        for course_id in rows_by_course
    )
def process_median_feedback_length():
    """
    Compute the median feedback comment length per user and course.

    input file: "abgabe_feedback_courses.csv"
        header: service,course_id,kurs,semester,description,user_id,task,
                subtask,author,comment_length

    returns a dict with
        (kurs,semester) -> "username" -> median feedback [float]

    Usernames rejected by is_valid_matrikel_nummer are left out.
    """
    # Column positions within a CSV row (see header above).
    kurs_idx, semester_idx, user_idx = 2, 3, 5
    comment_len_idx = 9

    def calculate_feedback(grouped):
        # grouped: username -> feedback rows of that user in one course.
        return dict(
            (
                username,
                median([int(row[comment_len_idx]) for row in grouped[username]]),
            )
            for username in grouped
            if is_valid_matrikel_nummer(username)
        )

    rows = load_lines("abgabe_feedback_courses.csv")
    rows_by_course = groupby(rows, [kurs_idx, semester_idx])
    return groupby(rows_by_course, [user_idx], calculate_feedback)
def process_avg_score():
    """
    Compute assessment score statistics per user and course.

    input file: "abgabe_assessment_results_courses.csv"
        header: service,course_id,kurs,semester,description,user_id,
                result_id,result_value

    returns a dict with
        (kurs,semester) -> "username" -> (quality score [float],
                                          avg score [float],
                                          number of scores [int])
    where
        quality score: sum of the scores divided by the highest score,
                       or 0 when the highest score is 0

    Only the first row of every result_id contributes a score.  Usernames
    rejected by is_valid_matrikel_nummer are left out.
    """
    # Column positions within a CSV row (see header above).
    kurs_idx, semester_idx, user_idx = 2, 3, 5
    result_id_idx, result_value_idx = 6, 7

    def score_summary(rows):
        rows_by_result = groupby(rows, [result_id_idx])
        # A result_id may occur in several rows; take the first one only.
        scores = [
            float(result_rows[0][result_value_idx])
            for result_rows in rows_by_result.values()
        ]
        mean_score = average(scores)
        # One score per result_id, so this equals the number of result ids.
        score_count = len(scores)
        best_score = max(scores)
        # Guard the division: a best score of 0 yields a quality score of 0.
        if best_score == 0:
            quality = 0
        else:
            quality = float(sum(scores)) / float(best_score)
        return (quality, mean_score, score_count)

    def calculate_avg_score(grouped):
        # grouped: username -> assessment rows of that user in one course.
        return dict(
            (username, score_summary(grouped[username]))
            for username in grouped
            if is_valid_matrikel_nummer(username)
        )

    rows = load_lines("abgabe_assessment_results_courses.csv")
    rows_by_course = groupby(rows, [kurs_idx, semester_idx])
    return groupby(rows_by_course, [user_idx], calculate_avg_score)
def process_plus_count():
    """
    Count the pluses every user received, per course.

    input file: "abgabe_assessment_pluses_courses.csv"
        header: service,course_id,kurs,semester,description,user_id,plus_date

    returns a dict with
        (kurs,semester) -> "username" -> plus count [integer]

    Usernames rejected by is_valid_matrikel_nummer are left out.
    """
    # Column positions within a CSV row (see header above).
    kurs_idx, semester_idx, user_idx = 2, 3, 5

    def count_pluses(d):
        # d: username -> plus rows of that user in one course.
        counts = {}
        for username in d:
            if not is_valid_matrikel_nummer(username):
                continue
            counts[username] = len(d[username])
        return counts

    rows = load_lines("abgabe_assessment_pluses_courses.csv")
    rows_by_course = groupby(rows, [kurs_idx, semester_idx])
    return groupby(rows_by_course, [user_idx], count_pluses)
def process_kurs_mapping():
    """
    Build the lookup from forum course id to (kurs,semester).

    input file: "forum_entries_courses.csv"
        header: service,course_id,kurs,semester,description,user,name,nid,id,
                parent_id,date,subject_length,text_length

    returns a dict with:
        (course_id) -> (kurs,semester)

    The course_id is read from the first row of every (kurs,semester) group.
    """
    # Column positions within a CSV row (see header above).
    course_id_idx, kurs_idx, semester_idx = 1, 2, 3

    rows = load_lines("forum_entries_courses.csv")
    rows_by_course = groupby(rows, [kurs_idx, semester_idx])

    mapping = {}
    for kurs_semester in rows_by_course:
        first_row = rows_by_course[kurs_semester][0]
        mapping[first_row[course_id_idx]] = kurs_semester
    return mapping
def calculate_avg_score(grouped):
    """
    Compute score statistics for every valid user of one group.

    input: grouped - mapping of "username" -> rows of that user

    returns a dict with
        "username" -> (quality score, avg score, number of scores)
    where
        quality score: sum of the scores divided by the highest score,
                       or 0 when the highest score is 0

    Only the first row of every result id contributes a score.  Usernames
    rejected by is_valid_matrikel_nummer are left out.

    NOTE(review): RESULT_ID_COL and RESULT_SCORE_COL are not defined inside
    this function; it assumes they are available in the enclosing scope --
    TODO confirm.
    """
    stats = {}
    for username in grouped:
        if is_valid_matrikel_nummer(username):
            rows_by_result = groupby(grouped[username], [RESULT_ID_COL])
            # A result id may occur in several rows; take the first one only.
            scores = []
            for result_rows in rows_by_result.values():
                scores.append(float(result_rows[0][RESULT_SCORE_COL]))
            mean_score = average(scores)
            # One score per result id, so this equals the number of ids.
            score_count = len(scores)
            best_score = max(scores)
            # Guard the division: a best score of 0 gives a quality of 0.
            quality = 0 if best_score == 0 \
                else float(sum(scores)) / float(best_score)
            stats[username] = (quality, mean_score, score_count)
    return stats
from common import transpose, first, groupby, cat
import unittest

if __name__ == '__main__':
    # Smoke checks for the helpers imported from `common`; they run in order
    # before unittest's own runner is started.

    # transpose: the rows of a 2x3 grid are expected back as its columns.
    columns = tuple(transpose(((1, 2, 3), (4, 5, 6))))
    assert columns == ((1, 4), (2, 5), (3, 6))

    # first: same leading element for a string and for a list of its chars.
    assert first('abc') == first(['a', 'b', 'c']) == 'a'

    # cat: the list of strings is expected back as one concatenated string.
    assert cat(['a', 'b', 'c']) == 'abc'

    # groupby with key=len: words are expected bucketed by their length.
    words = ['test', 'one', 'two', 'three', 'four']
    words_by_length = groupby(words, key=len)
    assert words_by_length == {
        3: ['one', 'two'],
        4: ['test', 'four'],
        5: ['three'],
    }

    unittest.main()