def gen_user_event_last_time():
    """Per-user last-activity offsets, one column per course/event pair.

    Each (username, course_id + 'x' + event) group of the log contributes the
    ``utils.to_seconds`` value of its latest ``time`` minus the
    ``utils.to_seconds`` value of the smallest ``time`` in the whole log.
    The values are pivoted to one row per username, left-merged onto the
    enrollment table on ``username``, and every missing cell becomes -1.

    Returns:
        dict: ``{'X': numpy array}`` holding the pivoted feature columns of
        the merged enrollment table.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()
    origin = utils.to_seconds(logs['time'].min())
    logs['course_id_x_event'] = logs['course_id'] + 'x' + logs['event']
    logs = logs.sort('time')

    records = []
    for (username, pair), group in logs.groupby(
            ['username', 'course_id_x_event']):
        latest = sorted(group['time'].tolist())[-1]
        records.append({
            'username': username,
            'course_id_x_event': pair,
            'last_time': utils.to_seconds(latest) - origin,
        })

    pivoted = pd.DataFrame(records).pivot_table(
        values='last_time',
        index='username',
        columns='course_id_x_event',
    ).reset_index()

    # Swap the course/event labels for positional integer column names.
    n_features = len(pivoted.columns) - 1
    feature_cols = list(range(n_features))
    pivoted.columns = ['username'] + feature_cols

    merged = enrollments.merge(pivoted, how='left', on='username')
    merged.fillna(-1, inplace=True)
    return {'X': np.array(merged[feature_cols])}
def gen_course_user_source_first_time():
    """Per-user first-activity offsets, one column per (course, source) pair.

    For every (username, source, course_id) group of the log, the smallest
    ``time`` is converted with ``utils.toordinal`` and expressed relative to
    the ``utils.toordinal`` value of the smallest ``time`` in the whole log.
    The offsets are pivoted to one row per username with
    (course_id, source) columns, left-merged onto the enrollment table on
    ``username``, and missing cells are filled with -1.

    Returns:
        dict: ``{'X': numpy array}`` with the pivoted feature columns of the
        merged enrollment table.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log()
    min_date = utils.toordinal(df['time'].min())

    # No whole-frame sort is needed: each group is reduced with min(), which
    # does not depend on row order.
    feat = []
    for (username, source, course_id), rows in df.groupby(
            ['username', 'source', 'course_id']):
        first_time = utils.toordinal(min(rows['time'].tolist()))
        feat.append({
            'username': username,
            'source': source,
            'course_id': course_id,
            'first_time': first_time - min_date,
        })

    feat = pd.DataFrame(feat)
    featp = feat.pivot_table(
        values='first_time',
        index='username',
        columns=['course_id', 'source'],
    ).reset_index()

    # Flatten the (course_id, source) column labels to positional integers.
    col_sz = len(featp.columns) - 1
    feature_cols = list(range(col_sz))
    featp.columns = ['username'] + feature_cols

    enr_df = enr_df.merge(featp, how='left', on='username')
    enr_df.fillna(-1, inplace=True)
    return {
        'X': np.array(enr_df[feature_cols]),
    }
def gen_course_user_prob_last_time_fine():
    """Per-user last 'problem' event offsets, one column per course.

    Only log rows whose ``event`` is 'problem' are used.  For every
    (username, course_id) group the largest ``time`` is converted with
    ``utils.to_hours`` and expressed relative to the ``utils.to_hours``
    value of the smallest ``time`` in the unfiltered log.  The offsets are
    pivoted to one row per username, left-merged onto the enrollment table
    on ``username``, and missing cells are filled with -1.

    Returns:
        dict: ``{'X': numpy array}`` with one feature column per course that
        appears in the pivot.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log()
    # The reference point is taken before filtering, i.e. over all events.
    min_date = utils.to_hours(df['time'].min())

    # No whole-frame sort is needed: each group is reduced with max().
    df = df[df['event'] == 'problem']
    feat = []
    for (username, course_id), rows in df.groupby(['username', 'course_id']):
        last_time = utils.to_hours(max(rows['time'].tolist()))
        feat.append({
            'username': username,
            'course_id': course_id,
            'last_time': last_time - min_date,
        })

    feat = pd.DataFrame(feat)
    featp = feat.pivot_table(
        values='last_time',
        index='username',
        columns='course_id',
    ).reset_index()

    # Derive the column count from the pivot rather than assuming a fixed
    # number of courses, so the relabeling cannot fail with a length mismatch
    # when the set of courses with 'problem' events changes.
    col_sz = len(featp.columns) - 1
    feature_cols = list(range(col_sz))
    featp.columns = ['username'] + feature_cols

    enr_df = enr_df.merge(featp, how='left', on='username')
    enr_df.fillna(-1, inplace=True)
    return {
        'X': np.array(enr_df[feature_cols]),
    }
def gen_prob_time_by_username_fine():
    """First and last 'problem' event offsets per username.

    Original author's note: same as "time_feat.gen_time_by_username.npz" in
    initial_analysis.

    Both offsets are ``utils.to_seconds`` values relative to the
    ``utils.to_seconds`` value of the smallest ``time`` in the unfiltered
    log.  Enrollment rows whose user has no 'problem' event get -1.

    Returns:
        dict: ``{'first': ..., 'last': ...}``, each the result of
        ``utils.reshape`` on the corresponding enrollment column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()
    origin = utils.to_seconds(logs['time'].min())

    problem_logs = logs[logs['event'] == 'problem']
    problem_logs = problem_logs.sort('time')

    records = []
    for username, group in problem_logs.groupby('username'):
        stamps = sorted(group['time'].tolist())
        earliest = utils.to_seconds(stamps[0])
        latest = utils.to_seconds(stamps[-1])
        records.append({
            'username': username,
            'first_time': earliest - origin,
            'last_time': latest - origin,
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='username')
    for column in ('first_time', 'last_time'):
        merged[column] = merged[column].fillna(-1)

    return {
        'first': utils.reshape(merged['first_time']),
        'last': utils.reshape(merged['last_time']),
    }
def gen_user_active_time_ratio():
    """Share of each user's log rows falling in each hour of the day.

    Produces 24 columns per enrollment row (keyed by username).  Hours are
    parsed from the ``time`` strings with the '%Y-%m-%dT%H:%M:%S' format.
    Each row is divided by its own sum; rows that sum to zero (users without
    any log rows after the left merge) end up as all zeros.

    Returns:
        dict: ``{'X': float32 numpy array with 24 columns}``.
    """
    df = utils.load_enroll()
    log_df = utils.load_log()

    by_user = log_df.groupby('username')
    total = len(by_user)
    rows = []
    for i, (username, user_logs) in enumerate(by_user):
        if i % 1000 == 0:
            l.info("{0} of {1}".format(i, total))
        # Start from zero for every hour so absent hours still get a column.
        counts = dict.fromkeys(range(24), 0)
        counts.update(Counter(
            datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S').hour
            for stamp in user_logs['time']
        ))
        counts['username'] = username
        rows.append(counts)

    hour_df = pd.DataFrame(rows)
    df = df.merge(hour_df, how='left', on='username').fillna(0)

    # Row-normalise; 0/0 yields NaN, which is mapped back to 0.
    counts_matrix = np.array(df[list(range(24))]).astype(np.float32)
    totals = counts_matrix.sum(axis=1)
    X = counts_matrix / totals[:, np.newaxis]
    X[np.isnan(X)] = 0.0
    return {'X': X}
def gen_season_user_last_time():
    """Per-user last-activity offsets, one column per season.

    The log is left-merged with "data/input/ref_course_code.csv" on
    ``course_id`` to obtain a ``season`` column.  For every
    (username, season) group the largest ``time`` is converted with
    ``utils.toordinal`` and expressed relative to the ``utils.toordinal``
    value of the smallest ``time`` in the log.  The offsets are pivoted to
    one row per username, left-merged onto the enrollment table on
    ``username``, and missing cells are filled with -1.

    Returns:
        dict: ``{'X': numpy array}`` with one feature column per season that
        appears in the pivot.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log()
    df = pd.merge(df,
                  pd.read_csv("data/input/ref_course_code.csv"),
                  how='left',
                  on='course_id')
    min_date = utils.toordinal(df['time'].min())

    # No whole-frame sort is needed: each group is reduced with max().
    feat = []
    for (username, season), rows in df.groupby(['username', 'season']):
        last_time = utils.toordinal(max(rows['time'].tolist()))
        feat.append({
            'username': username,
            'season': season,
            'last_time': last_time - min_date,
        })

    feat = pd.DataFrame(feat)
    featp = feat.pivot_table(
        values='last_time',
        index='username',
        columns='season',
    ).reset_index()

    # Derive the column count from the pivot rather than assuming exactly two
    # seasons, so the relabeling cannot fail with a length mismatch.
    col_sz = len(featp.columns) - 1
    feature_cols = list(range(col_sz))
    featp.columns = ['username'] + feature_cols

    enr_df = enr_df.merge(featp, how='left', on='username')
    enr_df.fillna(-1, inplace=True)
    return {
        'X': np.array(enr_df[feature_cols]),
    }
def gen_multiple_server_access():
    """Count, per enrollment, the objects hit more than once by server 'problem' rows.

    Within each enrollment's log rows, only rows with ``source == 'server'``
    and ``event == 'problem'`` are considered; the feature is the number of
    distinct ``object`` values that occur more than once among them.
    Enrollments without any log rows get -1.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``multi`` column.
    """
    df = utils.load_enroll()
    log_df = utils.load_log()

    by_enrollment = log_df.groupby('enrollment_id')
    total = len(by_enrollment)
    records = []
    for i, (enrollment_id, rows) in enumerate(by_enrollment):
        if i % 1000 == 0:
            l.info("{0} of {1}".format(i, total))
        is_server_problem = (
            (rows['source'] == 'server') & (rows['event'] == 'problem'))
        hits = Counter(rows[is_server_problem]['object'])
        repeated = sum(1 for n in hits.values() if n > 1)
        records.append({'enrollment_id': enrollment_id, 'multi': repeated})

    feat_df = pd.DataFrame(records)
    df = df.merge(feat_df, how='left', on='enrollment_id').fillna(-1)
    return {'X': utils.reshape(df['multi'])}
def gen_prob_time_by_enrollment_fine():
    """First and last 'problem' event offsets per enrollment, relative to course start.

    Original author's note: same as "time_feat.gen_first_time.npz" in
    initial_analysis.

    The reference point for an enrollment is the ``utils.to_seconds`` value
    of the smallest ``time`` logged (over all events) for the course taken
    from the enrollment's first 'problem' row.  Enrollments without
    'problem' rows get -1 for both features.

    Returns:
        dict: ``{'first': ..., 'last': ...}``, each the result of
        ``utils.reshape`` on the corresponding enrollment column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()

    earliest = logs.groupby('course_id').agg({'time': 'min'}).reset_index()
    course_start = {
        course_row['course_id']: utils.to_seconds(course_row['time'])
        for _, course_row in earliest.iterrows()
    }

    logs = logs.sort('time')
    problem_logs = logs[logs['event'] == 'problem']

    records = []
    for enrollment_id, group in problem_logs.groupby('enrollment_id'):
        stamps = sorted(group['time'].tolist())
        course_id = group['course_id'].tolist()[0]
        earliest_seen = utils.to_seconds(stamps[0])
        latest_seen = utils.to_seconds(stamps[-1])
        start = course_start[course_id]
        records.append({
            'enrollment_id': enrollment_id,
            'first_time': earliest_seen - start,
            'last_time': latest_seen - start,
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='enrollment_id')
    for column in ('first_time', 'last_time'):
        merged[column] = merged[column].fillna(-1)

    return {
        'first': utils.reshape(merged['first_time']),
        'last': utils.reshape(merged['last_time']),
    }
def gen_course_user_active_hours():
    """Per-user count of distinct active hours, one column per course.

    For every (username, course_id) group of the log, the ``time`` strings
    (format '%Y-%m-%dT%H:%M:%S') are truncated to the hour and the number of
    distinct hour buckets is counted.  The counts are pivoted to one row per
    username, left-merged onto the enrollment table on ``username``, and
    missing cells are filled with 0.

    Returns:
        dict: ``{'X': numpy array}`` with one feature column per course that
        appears in the pivot.
    """
    def hour_key(stamp):
        # Collapse a timestamp string to its year-month-day-hour bucket.
        parsed = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        return parsed.strftime('%Y%m%d%H')

    enr_df = utils.load_enroll()
    df = utils.load_log()

    # No whole-frame sort is needed: a distinct count ignores row order.
    feat = []
    for (username, course_id), rows in df.groupby(['username', 'course_id']):
        feat.append({
            'username': username,
            'course_id': course_id,
            'uniq_hour': len(rows['time'].apply(hour_key).unique()),
        })

    feat = pd.DataFrame(feat)
    featp = feat.pivot_table(
        values='uniq_hour',
        index='username',
        columns='course_id',
    ).reset_index()

    # Derive the column count from the pivot rather than assuming a fixed
    # number of courses, so the relabeling cannot fail with a length mismatch.
    col_sz = len(featp.columns) - 1
    feature_cols = list(range(col_sz))
    featp.columns = ['username'] + feature_cols

    enr_df = enr_df.merge(featp, how='left', on='username')
    enr_df.fillna(0, inplace=True)
    return {
        'X': np.array(enr_df[feature_cols]),
    }
def gen_last_source():
    """One-hot encode the ``source`` of each enrollment's last log row.

    "Last" is whatever pandas' ``'last'`` aggregation picks, i.e. it follows
    the row order returned by ``utils.load_log`` (presumably chronological
    -- confirm against the loader).  Enrollments without log rows get -1
    before encoding.

    Returns:
        dict: ``{'X': utils.onehot_encode(...)}`` of the ``source`` column.
    """
    df = utils.load_enroll()
    log_df = utils.load_log()
    # The former `log_sz` group count was never used; dropping it saves a
    # full groupby pass over the log.
    last_source = log_df.groupby('enrollment_id').agg({
        'source': 'last'
    }).reset_index()
    df = df.merge(last_source, how='left', on='enrollment_id').fillna(-1)
    return {'X': utils.onehot_encode(df['source'])}
def gen_last_category():
    """One-hot encode the ``category`` of each enrollment's last log row.

    The log comes from ``utils.load_log_with_obj_attrib``.  "Last" is
    whatever pandas' ``'last'`` aggregation picks, i.e. it follows the row
    order returned by that loader (presumably chronological -- confirm).
    Enrollments without log rows get -1 before encoding.

    Returns:
        dict: ``{'X': utils.onehot_encode(...)}`` of the ``category`` column.
    """
    df = utils.load_enroll()
    log_df = utils.load_log_with_obj_attrib()
    # The former `log_sz` group count was never used; dropping it saves a
    # full groupby pass over the log.
    last_category = log_df.groupby('enrollment_id').agg({
        'category': 'last'
    }).reset_index()
    df = df.merge(last_category, how='left', on='enrollment_id').fillna(-1)
    return {'X': utils.onehot_encode(df['category'])}
def gen_user_loguniq():
    """Number of distinct ``object`` values per username.

    The per-user counts are left-merged onto the enrollment table on
    ``username``; users without log rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``evuniq`` column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()

    records = [
        {'username': username, 'evuniq': len(user_logs['object'].unique())}
        for username, user_logs in logs.groupby('username')
    ]

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='username').fillna(0)
    return {'X': utils.reshape(merged['evuniq'])}
def gen_user_uniq_course():
    """Number of distinct courses each user appears in within the log.

    The per-user counts are left-merged onto the enrollment table on
    ``username``; users without log rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``course_uniq`` column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()

    per_user = logs[['username', 'course_id']].groupby('username')
    course_counts = per_user.agg({
        'course_id': lambda x: len(x.unique())
    })
    course_counts = course_counts.rename(columns={
        'course_id': 'course_uniq'
    })
    course_counts = course_counts.reset_index()

    merged = enrollments.merge(
        course_counts, how='left', on='username').fillna(0)
    return {'X': utils.reshape(merged['course_uniq'])}
def gen_loglen():
    """Number of log rows per enrollment.

    The counts are left-merged onto the enrollment table on
    ``enrollment_id``; enrollments without log rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``log_count`` column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()

    per_enrollment = logs[['enrollment_id']].groupby('enrollment_id')
    counts = per_enrollment.agg({
        'enrollment_id': 'count'
    })
    # Rename before reset_index so the count column does not collide with
    # the group key that reset_index re-inserts.
    counts = counts.rename(columns={
        'enrollment_id': 'log_count'
    })
    counts = counts.reset_index()

    merged = enrollments.merge(
        counts, how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['log_count'])}
def gen_enrollment_order():
    """Rank of each enrollment within its course, ordered by ``enrollment_id``.

    Within every course the enrollments are sorted by ``enrollment_id`` and
    numbered from 0.  The ranks are merged back onto the enrollment table on
    ``enrollment_id``.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``order`` column.
    """
    enrollments = utils.load_enroll()

    pieces = []
    for _, course_rows in enrollments.groupby(['course_id']):
        ordered_ids = course_rows.sort('enrollment_id').enrollment_id.tolist()
        ranks = np.arange(len(ordered_ids))
        pieces.append(pd.DataFrame({
            'enrollment_id': ordered_ids,
            'order': ranks
        }))

    order_df = pd.concat(pieces)
    merged = enrollments.merge(order_df, how='left', on='enrollment_id')
    return {'X': utils.reshape(merged['order'])}
def gen_uniq_event_source():
    """Number of distinct source+event combinations per enrollment.

    ``source`` and ``event`` are concatenated into one key and the distinct
    keys are counted per ``enrollment_id``.  Enrollments without log rows
    get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``sz`` column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()
    logs['source_event'] = logs['source'] + logs['event']

    records = [
        {'enrollment_id': enrollment_id,
         'sz': len(rows['source_event'].unique())}
        for enrollment_id, rows in logs.groupby('enrollment_id')
    ]

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['sz'])}
def gen_page_close_obj_topfreq():
    """Per enrollment, the ``freq`` statistic of ``object`` over 'page_close' rows.

    The value is the ``freq`` entry of ``Series.describe()`` on the
    ``object`` column (for non-numeric data pandas reports there the count
    of the most frequent value; assumes ``object`` is non-numeric -- TODO
    confirm).  Enrollments without 'page_close' rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``sz`` column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()
    page_close_logs = logs[logs['event'] == 'page_close']

    records = []
    for enrollment_id, rows in page_close_logs.groupby('enrollment_id'):
        summary = rows['object'].describe()
        records.append({
            'enrollment_id': enrollment_id,
            'sz': summary['freq'],
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['sz'])}
def gen_prob_loglen():
    """Number of 'problem' log rows per enrollment.

    Enrollments without 'problem' rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``sz`` column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()
    problem_logs = logs[logs['event'] == 'problem']

    records = [
        {'enrollment_id': enrollment_id, 'sz': len(rows)}
        for enrollment_id, rows in problem_logs.groupby('enrollment_id')
    ]

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['sz'])}
def gen_prob_first_last_in_judgement_time():
    """First/last 'problem' times of a user inside each course's evaluation period.

    Only 'problem' rows are used.  For every course an evaluation period is
    obtained from ``utils.to_evaluation_period(max time of the course,
    days=1)``; the rows falling inside that period are selected by ``time``
    only, so they may belong to any course.  For each (username, course_id)
    pair present in the log, the user's smallest and largest ``time`` inside
    the course's period are converted with ``utils.to_seconds``; -1 is used
    when the user has no rows in the period.  Pairs missing after the left
    merge onto the enrollment table are filled with -1 as well.

    Returns:
        dict: ``{'first_time': ..., 'last_time': ...}``, each the result of
        ``utils.reshape`` on the corresponding enrollment column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()
    logs = logs[logs['event'] == 'problem']

    latest_by_course = logs.groupby('course_id').agg(
        {'time': 'max'}).reset_index()
    periods = {}
    for _, course_row in latest_by_course.iterrows():
        periods[course_row['course_id']] = utils.to_evaluation_period(
            course_row['time'], days=1)

    # Rows inside each course's period (filtered on time, not on course).
    window_logs = {}
    for course_id in periods.keys():
        in_window = (
            (logs['time'] >= periods[course_id]['begin']) &
            (logs['time'] <= periods[course_id]['end'])
        )
        window_logs[course_id] = logs[in_window]

    logs = logs.sort('time')
    records = []
    pairs = logs.groupby(['username', 'course_id'])
    for i, ((username, course_id), _) in enumerate(pairs):
        if i % 100 == 0:
            l.info("{0} of 200k".format(i))

        window = window_logs[course_id]
        user_rows = window[(window['username'] == username)]

        if len(user_rows) == 0:
            first_time = -1
            last_time = -1
        else:
            first_time = utils.to_seconds(user_rows['time'].min())
            last_time = utils.to_seconds(user_rows['time'].max())

        records.append({
            'username': username,
            'course_id': course_id,
            'last_time': last_time,
            'first_time': first_time,
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on=['username', 'course_id'])
    merged.fillna(-1, inplace=True)
    return {
        'first_time': utils.reshape(merged['first_time']),
        'last_time': utils.reshape(merged['last_time']),
    }
def gen_userhour():
    """Number of distinct active hours per username.

    ``time`` strings (format '%Y-%m-%dT%H:%M:%S') are truncated to the hour
    and the distinct hour buckets are counted per user.  Users without log
    rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``user_uniq_hour`` column.
    """
    def hour_key(stamp):
        # Collapse a timestamp string to its year-month-day-hour bucket.
        parsed = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        return parsed.strftime('%Y%m%d%H')

    enrollments = utils.load_enroll()
    logs = utils.load_log()

    records = []
    for username, user_logs in logs.groupby('username'):
        buckets = user_logs['time'].apply(hour_key).unique()
        records.append({
            'username': username,
            'user_uniq_hour': len(buckets),
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='username').fillna(0)
    return {'X': utils.reshape(merged['user_uniq_hour'])}
def gen_base():
    """Assemble the training targets and the train/test enrollment-id split.

    The enrollment table is left-merged with the truth file
    (``utils.TRUTH_TRAIN``, read without a header as
    ``enrollment_id, target``).  The first ``train_sz`` rows of the merged
    table are treated as training rows, where ``train_sz`` is the row count
    of ``utils.ENROLL_TRAIN``; this assumes ``utils.load_enroll`` lists the
    training enrollments first -- TODO confirm.

    Returns:
        dict: ``'y'`` (targets of the training rows), ``'id_train'`` and
        ``'id_test'`` (enrollment ids before/after the split), each sliced
        from a ``utils.reshape`` result.

    Raises:
        AssertionError: if the training size is not 120542 or the merged
            table does not have 200904 rows.
    """
    enrollments = utils.load_enroll()
    train_sz = len(pd.read_csv(utils.ENROLL_TRAIN))
    truth = pd.read_csv(utils.TRUTH_TRAIN,
                        names=['enrollment_id', 'target'])
    labelled = enrollments.merge(truth, how='left', on='enrollment_id')

    # Guard against running on a dataset of unexpected size.
    assert train_sz == 120542
    assert len(labelled) == 200904

    y = utils.reshape(labelled['target'])[:train_sz]
    id_train = utils.reshape(labelled['enrollment_id'])[:train_sz]
    id_test = utils.reshape(labelled['enrollment_id'])[train_sz:]
    return {
        'y': y,
        'id_train': id_train,
        'id_test': id_test,
    }
def gen_loguniq():
    """Number of distinct ``object`` values per enrollment.

    The counts are left-merged onto the enrollment table on
    ``enrollment_id``; enrollments without log rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``uniq_object`` column.
    """
    logs = utils.load_log()
    records = [
        {'enrollment_id': enrollment_id,
         'uniq_object': len(rows['object'].unique())}
        for enrollment_id, rows in logs.groupby('enrollment_id')
    ]
    counts = pd.DataFrame(records)

    # Attach the counts to every enrollment row.
    enrollments = utils.load_enroll()
    merged = enrollments.merge(
        counts, how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['uniq_object'])}
def gen_prob_loghour():
    """Number of distinct hours with 'problem' events per enrollment.

    ``time`` strings (format '%Y-%m-%dT%H:%M:%S') of 'problem' rows are
    truncated to the hour and the distinct hour buckets are counted per
    ``enrollment_id``.  The intermediate column is named ``uniq_days`` even
    though the buckets are hours.  Enrollments without 'problem' rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``uniq_days`` column.
    """
    def hour_key(stamp):
        # Collapse a timestamp string to its year-month-day-hour bucket.
        parsed = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        return parsed.strftime('%Y%m%d%H')

    enrollments = utils.load_enroll()
    logs = utils.load_log()
    problem_logs = logs[logs['event'] == 'problem']

    records = []
    for enrollment_id, rows in problem_logs.groupby('enrollment_id'):
        buckets = rows['time'].apply(hour_key).unique()
        records.append({
            'enrollment_id': enrollment_id,
            'uniq_days': len(buckets),
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['uniq_days'])}
def gen_uniq_source_event_obj():
    """Number of distinct (source, event, object) triples per enrollment.

    The counts are left-merged onto the enrollment table on
    ``enrollment_id``; enrollments without log rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``sz`` column.
    """
    logs = utils.load_log()
    key_columns = ['source', 'event', 'object']
    records = [
        {'enrollment_id': enrollment_id,
         'sz': len(rows[key_columns].drop_duplicates())}
        for enrollment_id, rows in logs.groupby('enrollment_id')
    ]
    counts = pd.DataFrame(records)

    # Attach the counts to every enrollment row.
    enrollments = utils.load_enroll()
    merged = enrollments.merge(
        counts, how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['sz'])}
def gen_course_user_loguniq_in_judgement_time():
    """Distinct-object counts per course for a user inside a course's evaluation period.

    For every course an evaluation period is obtained from
    ``utils.to_evaluation_period(max time of the course)``; the log rows
    inside that period are selected by ``time`` only, so they may belong to
    any course.  For each (username, course_id) pair in the log, the user's
    rows inside the pair's course period are grouped by ``course_id`` and
    the distinct ``object`` values are counted, giving one feature column
    per course id that occurs.  Missing cells are filled with 0 after the
    left merge onto the enrollment table.

    Returns:
        dict: ``{'X': numpy array}`` with every feature column except
        ``username`` and ``course_id``.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()

    latest_by_course = logs.groupby('course_id').agg(
        {'time': 'max'}).reset_index()
    periods = {}
    for _, course_row in latest_by_course.iterrows():
        periods[course_row['course_id']] = utils.to_evaluation_period(
            course_row['time'])

    l.info("# Preparing extracted logs for each courses")
    window_logs = {}
    for course_id in periods.keys():
        in_window = (
            (logs['time'] >= periods[course_id]['begin']) &
            (logs['time'] <= periods[course_id]['end'])
        )
        window_logs[course_id] = logs[in_window]

    logs = logs.sort('time')
    records = []
    pairs = logs.groupby(['username', 'course_id'])
    for i, ((username, course_id), _) in enumerate(pairs):
        if i % 100 == 0:
            l.info("{0} of 200k".format(i))

        window = window_logs[course_id]
        user_rows = window[(window['username'] == username)]
        per_course = user_rows.groupby('course_id').agg({
            'object': lambda series_x: len(series_x.unique()),
        }).reset_index().rename(columns={'object': 'count'})

        # Course-id keys go in first; the identifying keys are added last.
        if len(per_course) == 0:
            record = {}
        else:
            record = dict(zip(per_course['course_id'], per_course['count']))
        record['username'] = username
        record['course_id'] = course_id
        records.append(record)

    feat = pd.DataFrame(records)
    merged = enrollments.merge(
        feat, how='left', on=['username', 'course_id'])
    merged.fillna(0, inplace=True)

    identifying = ['username', 'course_id']
    available_feat = [
        name for name in feat.columns if name not in identifying
    ]
    return {'X': np.array(merged[available_feat])}
def gen_active_hours_in_judgement_time():
    """Distinct active hours of a user inside each course's evaluation period.

    For every course an evaluation period is obtained from
    ``utils.to_evaluation_period(max time of the course, days=1)``; the log
    rows inside that period are selected by ``time`` only, so they may
    belong to any course.  For each (username, course_id) pair in the log,
    the user's ``time`` strings inside the period are truncated to the hour
    and the distinct buckets are counted.  Pairs missing after the left
    merge onto the enrollment table are filled with -1.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``uniq_hour`` column.
    """
    def hour_key(stamp):
        # Collapse a timestamp string to its year-month-day-hour bucket.
        parsed = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        return parsed.strftime('%Y%m%d%H')

    enrollments = utils.load_enroll()
    logs = utils.load_log()

    latest_by_course = logs.groupby('course_id').agg(
        {'time': 'max'}).reset_index()
    periods = {}
    for _, course_row in latest_by_course.iterrows():
        periods[course_row['course_id']] = utils.to_evaluation_period(
            course_row['time'], days=1)

    # Rows inside each course's period (filtered on time, not on course).
    window_logs = {}
    for course_id in periods.keys():
        in_window = (
            (logs['time'] >= periods[course_id]['begin']) &
            (logs['time'] <= periods[course_id]['end'])
        )
        window_logs[course_id] = logs[in_window]

    logs = logs.sort('time')
    records = []
    pairs = logs.groupby(['username', 'course_id'])
    for i, ((username, course_id), _) in enumerate(pairs):
        if i % 100 == 0:
            l.info("{0} of 200k".format(i))

        window = window_logs[course_id]
        user_rows = window[(window['username'] == username)]
        records.append({
            'username': username,
            'course_id': course_id,
            'uniq_hour': len(user_rows['time'].apply(hour_key).unique()),
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on=['username', 'course_id'])
    merged.fillna(-1, inplace=True)
    return {'X': utils.reshape(merged['uniq_hour'])}
def gen_log_cha_obj_uniq():
    """Number of distinct ``object`` values among 'chapter' events per enrollment.

    Enrollments that have log rows but no 'chapter' rows get a count of 0
    from the loop; enrollments without any log rows get 0 from the fill
    after the left merge.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``evuniq`` column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()

    by_enrollment = logs.groupby('enrollment_id')
    total = len(by_enrollment)
    records = []
    for i, (enrollment_id, rows) in enumerate(by_enrollment):
        if i % 1000 == 0:
            l.info("{0} of {1}".format(i, total))
        chapter_rows = rows[rows['event'] == 'chapter']
        records.append({
            'enrollment_id': enrollment_id,
            'evuniq': len(chapter_rows['object'].unique()),
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['evuniq'])}
def gen_username_uniq_discuss():
    """Number of distinct ``object`` values among 'discussion' events per username.

    Users that have log rows but no 'discussion' rows get a count of 0 from
    the loop; users without any log rows get 0 from the fill after the left
    merge.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``user_discuss`` column.
    """
    enrollments = utils.load_enroll()
    logs = utils.load_log()

    by_user = logs.groupby('username')
    total = len(by_user)
    records = []
    for i, (username, rows) in enumerate(by_user):
        if i % 1000 == 0:
            l.info("{0} of {1}".format(i, total))
        discussion_rows = rows[rows['event'] == 'discussion']
        records.append({
            'username': username,
            'user_discuss': len(discussion_rows['object'].unique()),
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='username').fillna(0)
    return {'X': utils.reshape(merged['user_discuss'])}
def gen_loghourtime():
    """Number of distinct active hours per enrollment.

    ``time`` strings (format '%Y-%m-%dT%H:%M:%S') are truncated to the hour
    and the distinct hour buckets are counted per ``enrollment_id``.
    Enrollments without log rows get 0.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``uniq_hour`` column.
    """
    def hour_key(stamp):
        # Collapse a timestamp string to its year-month-day-hour bucket.
        parsed = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        return parsed.strftime('%Y%m%d%H')

    enrollments = utils.load_enroll()
    logs = utils.load_log()

    by_enrollment = logs.groupby('enrollment_id')
    total = len(by_enrollment)
    records = []
    for i, (enrollment_id, rows) in enumerate(by_enrollment):
        if i % 1000 == 0:
            l.info("{0} of {1}".format(i, total))
        buckets = rows['time'].apply(hour_key).unique()
        records.append({
            'enrollment_id': enrollment_id,
            'uniq_hour': len(buckets),
        })

    merged = enrollments.merge(
        pd.DataFrame(records), how='left', on='enrollment_id').fillna(0)
    return {'X': utils.reshape(merged['uniq_hour'])}
def gen_loglen_in_judgement_time():
    """Number of a user's log rows inside each course's evaluation period.

    For every course an evaluation period is obtained from
    ``utils.to_evaluation_period(max time of the course, days=1)``; the log
    rows inside that period are selected by ``time`` only, so they may
    belong to any course.  For each (username, course_id) pair in the log,
    the feature is the number of the user's rows inside the pair's course
    period.  Pairs missing after the left merge onto the enrollment table
    are filled with -1.

    Returns:
        dict: ``{'X': utils.reshape(...)}`` of the ``n_activities`` column.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log()

    df_by_course = df.groupby('course_id').agg({'time': 'max'}).reset_index()
    course_evaluation_period = {
        row['course_id']: utils.to_evaluation_period(row['time'], days=1)
        for _, row in df_by_course.iterrows()
    }

    # Rows inside each course's period (filtered on time, not on course).
    course_df = {}
    for course_id, period in course_evaluation_period.items():
        in_window = (
            (df['time'] >= period['begin']) & (df['time'] <= period['end']))
        course_df[course_id] = df[in_window]

    # No whole-frame sort is needed: the loop only counts rows, and the
    # per-iteration debug print of the count has been removed (progress is
    # still reported through the logger).
    feat = []
    pairs = df.groupby(['username', 'course_id'])
    for i, ((username, course_id), _) in enumerate(pairs):
        if i % 100 == 0:
            l.info("{0} of 200k".format(i))

        window = course_df[course_id]
        n_activities = len(window[window['username'] == username])
        feat.append({
            'username': username,
            'course_id': course_id,
            'n_activities': n_activities,
        })

    feat = pd.DataFrame(feat)
    enr_df = enr_df.merge(feat, how='left', on=['username', 'course_id'])
    enr_df.fillna(-1, inplace=True)
    return {'X': utils.reshape(enr_df['n_activities'])}