def gen_course_user_first_time():
    """Build per-enrollment features from each user's first activity per course.

    For every (username, course_id) pair in the log, the earliest ``time`` is
    converted with ``utils.toordinal`` and expressed as an offset from the
    converted earliest ``time`` of the whole log.  The offsets are pivoted
    into one row per username and one column per course, then left-joined
    onto the enrollment table by ``username``.

    Returns:
        dict: ``{'X': ndarray}`` with one row per enrollment-table row and one
        column per course found in the log.  Missing values (users absent
        from the log, or user/course pairs without events) are -1.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log()
    min_date = utils.toordinal(df['time'].min())

    # Only the earliest timestamp of each group is needed, so the log is not
    # pre-sorted: ``DataFrame.sort`` no longer exists in current pandas, and
    # a per-group ``min`` yields the same value a sort would.
    feat = pd.DataFrame([
        {
            'username': username,
            'course_id': course_id,
            'first_time': (
                utils.toordinal(min(group['time'].tolist())) - min_date
            ),
        }
        for (username, course_id), group
        in df.groupby(['username', 'course_id'])
    ])
    featp = feat.pivot_table(
        values='first_time', index='username', columns='course_id',
    ).reset_index()

    # Derive the number of course columns from the pivot instead of assuming
    # exactly 39 courses; with any other count the rename would fail with a
    # length mismatch.
    course_cols = list(range(len(featp.columns) - 1))
    featp.columns = ['username'] + course_cols

    enr_df = enr_df.merge(featp, how='left', on='username')
    enr_df.fillna(-1, inplace=True)
    return {'X': np.array(enr_df[course_cols])}
def gen_time_by_username():
    """Compute each enrolled user's first and last activity offsets.

    For every username in the log, the earliest and latest ``time`` values are
    converted with ``utils.toordinal`` and expressed as offsets from the
    converted earliest ``time`` of the whole log.  The result is left-joined
    onto the enrollment table by ``username``.

    Returns:
        dict: ``{'first': ..., 'last': ...}`` where each value is
        ``utils.reshape`` applied to the corresponding enrollment-table
        column.  Users that never appear in the log get -1 in both.
    """
    # same as "time_feat.gen_time_by_username.npz" in initial_analysis
    enr_df = utils.load_enroll()
    df = utils.load_log()
    min_date = utils.toordinal(df['time'].min())

    # The log is not pre-sorted: ``DataFrame.sort`` no longer exists in
    # current pandas, and only the per-user extremes are needed.
    feat = []
    for username, group in df.groupby('username'):
        times = group['time'].tolist()
        feat.append({
            'username': username,
            'first_time': utils.toordinal(min(times)) - min_date,
            'last_time': utils.toordinal(max(times)) - min_date,
        })
    feat = pd.DataFrame(feat)

    enr_df = enr_df.merge(feat, how='left', on='username')
    enr_df['first_time'] = enr_df['first_time'].fillna(-1)
    enr_df['last_time'] = enr_df['last_time'].fillna(-1)
    return {
        'first': utils.reshape(enr_df['first_time']),
        'last': utils.reshape(enr_df['last_time']),
    }
def gen_event_last_time():
    """Build per-enrollment features from each user's last time per event type.

    For every (username, event) pair in the attribute-augmented log, the
    latest ``time`` is converted with ``utils.toordinal`` and expressed as an
    offset from the converted earliest ``time`` of the whole log.  The offsets
    are pivoted into one row per username and one column per event value,
    then left-joined onto the enrollment table by ``username``.

    Returns:
        dict: ``{'X': ndarray}`` with one row per enrollment-table row and one
        column per distinct ``event`` value.  Missing values (users absent
        from the log, or user/event pairs that never occurred) are -1.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log_with_obj_attrib()
    min_date = utils.toordinal(df['time'].min())

    # Only the latest timestamp of each group is needed, so the log is not
    # pre-sorted: ``DataFrame.sort`` no longer exists in current pandas, and
    # a per-group ``max`` yields the same value a sort would.
    feat = pd.DataFrame([
        {
            'username': username,
            'event': event,
            'last_time': (
                utils.toordinal(max(group['time'].tolist())) - min_date
            ),
        }
        for (username, event), group in df.groupby(['username', 'event'])
    ])
    featp = feat.pivot_table(
        values='last_time', index='username', columns='event',
    ).reset_index()

    # Replace the event labels with positional integer column names.
    event_cols = list(range(len(featp.columns) - 1))
    featp.columns = ['username'] + event_cols

    enr_df = enr_df.merge(featp, how='left', on='username')
    enr_df.fillna(-1, inplace=True)
    return {
        'X': np.array(enr_df[event_cols]),
    }
def gen_time_by_enrollment():
    """Compute each enrollment's first and last activity offsets in its course.

    For every ``enrollment_id`` in the log, the earliest and latest ``time``
    values are converted with ``utils.toordinal`` and expressed as offsets
    from the converted earliest ``time`` logged for that enrollment's course.
    The result is left-joined onto the enrollment table by ``enrollment_id``.

    Returns:
        dict: ``{'first': ..., 'last': ...}`` where each value is
        ``utils.reshape`` applied to the corresponding enrollment-table
        column.  Enrollments that never appear in the log get -1 in both.
    """
    # same as "time_feat.gen_first_time.npz" in initial_analysis
    enr_df = utils.load_enroll()
    df = utils.load_log()

    # Earliest logged time of every course, already converted to an ordinal.
    course_min_time = {
        course_id: utils.toordinal(earliest)
        for course_id, earliest
        in df.groupby('course_id')['time'].min().items()
    }

    # ``DataFrame.sort`` was removed from pandas; ``sort_values`` is its
    # replacement.  groupby keeps the row order inside each group, so after
    # this sort the first/last rows of a group are its earliest/latest events.
    df = df.sort_values('time')

    feat = []
    for enrollment_id, group in df.groupby('enrollment_id'):
        times = group['time'].tolist()
        # The course is read from the enrollment's earliest event.
        course_id = group['course_id'].tolist()[0]
        min_time = course_min_time[course_id]
        feat.append({
            'enrollment_id': enrollment_id,
            'first_time': utils.toordinal(times[0]) - min_time,
            'last_time': utils.toordinal(times[-1]) - min_time,
        })
    feat = pd.DataFrame(feat)

    enr_df = enr_df.merge(feat, how='left', on='enrollment_id')
    enr_df['first_time'] = enr_df['first_time'].fillna(-1)
    enr_df['last_time'] = enr_df['last_time'].fillna(-1)
    return {
        'first': utils.reshape(enr_df['first_time']),
        'last': utils.reshape(enr_df['last_time']),
    }