Example #1
0
    def write_dropout_time(click_path, lect_path, outpath_hour):
        """Write a CSV of last-access times for users who skipped the later lectures.

        Valid lectures are ordered by ``open_time``; the lectures from index
        ``(n + 1) // 2`` onward form the "second half".  Every user who has at
        least one lecture page hit in the clickstream, but none on a
        second-half lecture, is written out together with their largest
        ``timestamp / 1000`` value.  Users with no recognizable lecture URL at
        all are not written.

        Args:
            click_path: forwarded to ``Clickstream.logs``.
            lect_path: forwarded to ``Lecture.lectures``.
            outpath_hour: path of the CSV file to create (opened with 'w',
                so an existing file is overwritten).
        """
        lectures = [lect for lect in Lecture.lectures(lect_path)
                    if Lecture.is_valid_lecture(lect)]
        lectures.sort(key=lambda lect: int(lect['open_time']))
        # Floor division keeps the slice index an int regardless of the
        # division mode in effect; with an odd count the middle lecture
        # belongs to the first half.
        first_late_idx = (len(lectures) + 1) // 2
        # NOTE(review): ids captured from URLs below are strings; this
        # comparison assumes lecture 'id' values are strings too -- confirm.
        second_half_lects = set(lect['id'] for lect in lectures[first_late_idx:])

        user_lects = defaultdict(set)
        user_last_access = defaultdict(int)
        for log in Clickstream.logs(click_path):
            username = log['username']
            # presumably milliseconds -> seconds; verify against the log format
            access_time = log['timestamp']/1000
            if access_time > user_last_access[username]:
                user_last_access[username] = access_time

            # Lecture ids appear either as a query parameter or a path segment.
            m = re.search('lecture_id=([\\d]+)', log['page_url'])
            if m is None:
                m = re.search('lecture/([\\d]+)', log['page_url'])
            if m is None:
                continue
            user_lects[username].add(m.group(1))

        dropout_time = dict()
        for username, lects in user_lects.items():
            if not (lects & second_half_lects):
                dropout_time[username] = user_last_access[username]

        # The context manager closes the file even if a write fails.
        with open(outpath_hour, 'w') as outfile:
            out_csv = csv.writer(outfile)
            out_csv.writerow(['username', 'dropout_time'])
            out_csv.writerows([username, time] for username, time in dropout_time.items())
# Load per-row demographic indicator flags from the CSV at demo_path.
# The first line of the file is skipped and the second line is used as the
# header.  Columns demo_start..demo_end (inclusive) are the indicator columns:
# demo_header holds their cleaned labels and demo_map maps row[2] to a list of
# 1/0 flags (1 only where the cell is exactly "TRUE").
demo_start, demo_end = (20, 56)
demo_header = []
demo_map = dict()
# The context manager ensures the input file is closed once it has been read.
with open(demo_path) as demo_file:
    demo_csv = csv.reader(demo_file)
    next(demo_csv)  # discard the first line; column names are on the second
    raw_demo_header = next(demo_csv)
    for h in raw_demo_header[demo_start:(demo_end + 1)]:
        # Drop everything up to and including the last "-" in the label.
        demo_header.append(re.sub(".*-", "", h).strip())
    print(demo_header)
    for row in demo_csv:
        # presumably row[2] is the user identifier -- verify against the CSV
        demo_map[row[2]] = [1 if r == "TRUE" else 0 for r in row[demo_start:(demo_end + 1)]]


# Build two lookups from the submission tables read via db_path:
#   action_cnt: action label ("L<item_id>" / "Q<item_id>") -> number of submissions
#   user_seqs:  session_user_id -> list of (submission_time, action label, score)
action_cnt = defaultdict(int)
user_seqs = defaultdict(list)

for lect in Lecture.lecture_submissions(db_path):
    lect_action = "L" + lect['item_id']
    # include both 'view' and 'download'; lectures carry the string "0" as score
    lect_event = (lect['submission_time'], lect_action, "0")
    user_seqs[lect['session_user_id']].append(lect_event)
    action_cnt[lect_action] += 1

for quiz in Quiz.quiz_submissions(db_path):
    quiz_action = "Q" + quiz['item_id']
    # A "NULL" raw score is recorded as integer 0; any other value is kept as-is.
    # NOTE(review): lectures use the string "0" but this uses int 0 -- confirm intended.
    if quiz['raw_score'] == "NULL":
        quiz_score = 0
    else:
        quiz_score = quiz['raw_score']
    user_seqs[quiz['session_user_id']].append((quiz['submission_time'], quiz_action, quiz_score))
    action_cnt[quiz_action] += 1


# quiz_submission = dict()
# for log in Clickstream.logs(in_path):
#     action = re.sub("/$","", re.sub("https://class.coursera.org/algebra-001/", "", log["page_url"]))
#     
#     if not re.search("^(lecture|signature|quiz)", action): continue
#     action_refined = None
#