Ejemplo n.º 1
0
def gen_course_user_loguniq_in_judgement_time():
    """Count distinct log objects per course inside each judgement window.

    For every course, a judgement window is obtained by passing the course's
    latest log ``time`` to ``utils.to_evaluation_period``; the result is read
    through its ``'begin'`` and ``'end'`` entries.  For every
    ``(username, course_id)`` pair present in the log, the user's rows whose
    ``time`` lies inside that course's window are grouped by the ``course_id``
    they carry and the number of distinct ``object`` values is counted per
    group.  Every course id encountered this way becomes one feature column.

    Returns:
        dict: ``{'X': array}`` where ``array`` is ``np.array`` over the
        feature columns after a left join onto the enrollment table on
        ``username`` and ``course_id``.  Missing values are filled with 0.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log()

    df_by_course = df.groupby('course_id').agg({'time': 'max'}).reset_index()
    course_evaluation_period = {
        row['course_id']: utils.to_evaluation_period(row['time'])
        for _, row in df_by_course.iterrows()
    }

    l.info("# Preparing extracted logs for each courses")
    # NOTE: a window frame is selected by time only, so it contains the rows
    # of every course logged during that course's window, not just its own.
    course_df = {}
    for course_id, period in course_evaluation_period.items():
        in_window = ((df['time'] >= period['begin'])
                     & (df['time'] <= period['end']))
        course_df[course_id] = df[in_window]

    # No time sort is needed before grouping: groupby orders the groups by
    # key, and only the keys (not the group frames) are used below.
    grouped = df.groupby(['username', 'course_id'])
    n_groups = grouped.ngroups

    feat = []
    for i, ((username, course_id), _) in enumerate(grouped):
        if i % 100 == 0:
            l.info("{0} of {1}".format(i, n_groups))

        window_df = course_df[course_id]
        d = window_df[window_df['username'] == username]
        if len(d) == 0:
            # The user has no activity inside this course's window.
            elem_dict = {}
        else:
            # len(unique()) rather than nunique(): missing objects still
            # count as one distinct value.
            d_agg = d.groupby('course_id').agg({
                'object': lambda series_x: len(series_x.unique()),
            }).reset_index().rename(columns={'object': 'count'})
            elem_dict = dict(zip(d_agg['course_id'], d_agg['count']))
        elem_dict['username'] = username
        elem_dict['course_id'] = course_id
        feat.append(elem_dict)

    feat = pd.DataFrame(feat)
    enr_df = enr_df.merge(feat, how='left', on=['username', 'course_id'])
    enr_df.fillna(0, inplace=True)
    available_feat = [
        name for name in feat.columns if name not in ['username', 'course_id']
    ]
    return {'X': np.array(enr_df[available_feat])}
Ejemplo n.º 2
0
def gen_first_last_in_judgement_time():
    """Extract the first and last log time inside each judgement window.

    For every course, a judgement window is obtained by passing the course's
    latest log ``time`` to ``utils.to_evaluation_period``; the result is read
    through its ``'begin'`` and ``'end'`` entries.  For every
    ``(username, course_id)`` pair present in the log, the minimum and maximum
    ``time`` of the user's rows inside that course's window are converted
    with ``utils.to_seconds``.

    Returns:
        dict: ``'first_time'`` and ``'last_time'``, each the matching column
        of the enrollment table (after a left join on ``username`` and
        ``course_id``) passed through ``utils.reshape``.  -1 marks pairs with
        no rows inside the window as well as enrollments absent from the log.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log()

    df_by_course = df.groupby('course_id').agg({'time': 'max'}).reset_index()
    course_evaluation_period = {
        row['course_id']: utils.to_evaluation_period(row['time'])
        for _, row in df_by_course.iterrows()
    }

    # NOTE: a window frame is selected by time only, so it contains the rows
    # of every course logged during that course's window, not just its own.
    course_df = {}
    for course_id, period in course_evaluation_period.items():
        in_window = ((df['time'] >= period['begin'])
                     & (df['time'] <= period['end']))
        course_df[course_id] = df[in_window]

    # No time sort is needed before grouping: groupby orders the groups by
    # key, and only the keys (not the group frames) are used below.
    grouped = df.groupby(['username', 'course_id'])
    # Progress is counted in groups, so the total must be the group count
    # rather than the number of log rows.
    n_groups = grouped.ngroups

    feat = []
    for i, ((username, course_id), _) in enumerate(grouped):
        if i % 100 == 0:
            l.info("{0} of {1}".format(i, n_groups))

        window_df = course_df[course_id]
        d = window_df[window_df['username'] == username]
        if len(d) == 0:
            # The user has no activity inside this course's window.
            first_time = last_time = -1
        else:
            first_time = utils.to_seconds(d['time'].min())
            last_time = utils.to_seconds(d['time'].max())

        feat.append({
            'username': username,
            'course_id': course_id,
            'last_time': last_time,
            'first_time': first_time,
        })

    feat = pd.DataFrame(feat)
    enr_df = enr_df.merge(feat, how='left', on=['username', 'course_id'])
    enr_df.fillna(-1, inplace=True)

    return {
        'first_time': utils.reshape(enr_df['first_time']),
        'last_time': utils.reshape(enr_df['last_time']),
    }
Ejemplo n.º 3
0
def gen_active_hours_in_judgement_time():
    """Count the distinct active hours inside each judgement window.

    For every course, a judgement window is obtained by passing the course's
    latest log ``time`` to ``utils.to_evaluation_period`` with ``days=1``; the
    result is read through its ``'begin'`` and ``'end'`` entries.  For every
    ``(username, course_id)`` pair present in the log, the ``time`` values of
    the user's rows inside that course's window are parsed as
    ``'%Y-%m-%dT%H:%M:%S'`` strings, truncated to the hour, and the number of
    distinct hours is counted.

    Returns:
        dict: ``{'X': ...}`` holding the ``uniq_hour`` column of the
        enrollment table (after a left join on ``username`` and
        ``course_id``) passed through ``utils.reshape``.  Enrollments absent
        from the log get -1; pairs with no rows inside the window get 0.
    """
    enr_df = utils.load_enroll()
    df = utils.load_log()

    df_by_course = df.groupby('course_id').agg({'time': 'max'}).reset_index()
    course_evaluation_period = {
        row['course_id']: utils.to_evaluation_period(row['time'], days=1)
        for _, row in df_by_course.iterrows()
    }

    # Extract the log rows falling inside each course's window.
    # NOTE: a window frame is selected by time only, so it contains the rows
    # of every course logged during that course's window, not just its own.
    course_df = {}
    for course_id, period in course_evaluation_period.items():
        in_window = ((df['time'] >= period['begin'])
                     & (df['time'] <= period['end']))
        course_df[course_id] = df[in_window]

    # No time sort is needed before grouping: groupby orders the groups by
    # key, and only the keys (not the group frames) are used below.
    grouped = df.groupby(['username', 'course_id'])
    n_groups = grouped.ngroups

    feat = []
    for i, ((username, course_id), _) in enumerate(grouped):
        if i % 100 == 0:
            l.info("{0} of {1}".format(i, n_groups))

        window_df = course_df[course_id]
        d = window_df[window_df['username'] == username]
        # Reduce every timestamp to a year-month-day-hour key.
        hour_keys = d['time'].apply(
            lambda x: datetime.datetime.strptime(
                x, '%Y-%m-%dT%H:%M:%S').strftime('%Y%m%d%H'))
        uniq_hour = len(hour_keys.unique())

        feat.append({
            'username': username,
            'course_id': course_id,
            'uniq_hour': uniq_hour,
        })

    feat = pd.DataFrame(feat)
    enr_df = enr_df.merge(feat, how='left', on=['username', 'course_id'])
    enr_df.fillna(-1, inplace=True)

    return {'X': utils.reshape(enr_df['uniq_hour'])}
Ejemplo n.º 4
0
def gen_proobjuniq_object_in_judgement_time():
    """Count distinct problem objects inside each judgement window.

    Only log rows whose ``event`` equals ``'problem'`` are considered.  For
    every course, a judgement window is obtained by passing the course's
    latest such ``time`` to ``utils.to_evaluation_period`` with ``days=1``;
    the result is read through its ``'begin'`` and ``'end'`` entries.  For
    every ``(username, course_id)`` pair present in the filtered log, the
    number of distinct ``object`` values among the user's rows inside that
    course's window is counted.

    Returns:
        dict: ``{'X': ...}`` holding the ``n_activities`` column of the
        enrollment table (after a left join on ``username`` and
        ``course_id``) passed through ``utils.reshape``.  Enrollments without
        any problem event get -1; pairs with no rows inside the window get 0.
    """
    enr_df = utils.load_enroll()

    df = utils.load_log()
    df = df[df['event'] == 'problem']

    # NOTE(review): the latest time per course is taken after the 'problem'
    # filter, so each window is anchored on the last problem event rather
    # than on the last log row of any kind -- confirm this is intended.
    df_by_course = df.groupby('course_id').agg({'time': 'max'}).reset_index()
    course_evaluation_period = {
        row['course_id']: utils.to_evaluation_period(row['time'], days=1)
        for _, row in df_by_course.iterrows()
    }

    # NOTE: a window frame is selected by time only, so it contains the rows
    # of every course logged during that course's window, not just its own.
    course_df = {}
    for course_id, period in course_evaluation_period.items():
        in_window = ((df['time'] >= period['begin'])
                     & (df['time'] <= period['end']))
        course_df[course_id] = df[in_window]

    # No time sort is needed before grouping: groupby orders the groups by
    # key, and only the keys (not the group frames) are used below.
    grouped = df.groupby(['username', 'course_id'])
    n_groups = grouped.ngroups

    feat = []
    for i, ((username, course_id), _) in enumerate(grouped):
        if i % 100 == 0:
            l.info("{0} of {1}".format(i, n_groups))

        window_df = course_df[course_id]
        user_rows = window_df[window_df['username'] == username]
        n_activities = len(user_rows['object'].unique())

        feat.append({
            'username': username,
            'course_id': course_id,
            'n_activities': n_activities,
        })

    feat = pd.DataFrame(feat)
    enr_df = enr_df.merge(feat, how='left', on=['username', 'course_id'])
    enr_df.fillna(-1, inplace=True)

    return {'X': utils.reshape(enr_df['n_activities'])}