import logging
import os

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.datasets import dump_svmlight_file

# encode_categorical_features is a project-level helper (not shown here);
# a hedged sketch of its assumed behavior appears after the first example.


def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file):

    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)
    n_trn = trn.shape[0]

    trn.time = pd.to_datetime(trn.time)
    tst.time = pd.to_datetime(tst.time)

    df = pd.concat([trn, tst], axis=0)

    last_date = df[['course_id', 'time']].groupby('course_id',
                                                  as_index=False).max()
    last_date.columns = ['course_id', 'last_date']

    df = pd.merge(df, last_date, on='course_id', how='left')

    df['days_before_last_date'] = (df.last_date - df.time).dt.days
    df['weeks_before_last_date'] = df.days_before_last_date // 7
    df.loc[df.weeks_before_last_date == 4, 'weeks_before_last_date'] = 3
    df['last_month'] = df.last_date.dt.month

    df.drop(['time', 'last_date'], axis=1, inplace=True)
    df.set_index('enrollment_id', inplace=True)

    X = encode_categorical_features(df, n=n_trn, min_obs=10)
    X = X.tocsr()

    dump_svmlight_file(X[:n_trn], trn.enrollment_id.values, train_feature_file,
                       zero_based=False)
    dump_svmlight_file(X[n_trn:], tst.enrollment_id.values, test_feature_file,
                       zero_based=False)
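
All of these examples depend on a project helper, encode_categorical_features, whose implementation is not shown here. A minimal sketch of its assumed contract follows: category frequencies are counted on the first n (training) rows, categories seen fewer than min_obs times are collapsed into one rare bucket, missing values optionally get their own indicator (nan_as_var), and the result is a sparse one-hot matrix that the callers convert with .tocsr().

import numpy as np
import pandas as pd
from scipy import sparse


def encode_categorical_features(df, n=None, min_obs=10, nan_as_var=False):
    # Hypothetical sketch, not the repository's implementation.
    n = df.shape[0] if n is None else n
    blocks = []
    for col in df.columns:
        s = df[col]
        if nan_as_var:
            # give missing values their own category
            s = s.fillna('__nan__')
        # count categories on the training rows only
        counts = s.iloc[:n].value_counts()
        keep = set(counts[counts >= min_obs].index)
        # collapse rare categories; remaining NaNs stay NaN and become
        # all-zero rows in the dummy matrix
        s = s.where(s.isin(keep) | s.isnull(), other='__rare__')
        dummies = pd.get_dummies(s.astype(object), prefix=str(col))
        blocks.append(sparse.csr_matrix(dummies.values.astype(np.float64)))
    return sparse.hstack(blocks)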
def generate_feature(train_file, label_file, test_file, feature_dir,
                     feature_name):
    # Load data files
    logging.info('Loading training and test data')
    trn = pd.read_csv(train_file, index_col=0)
    tst = pd.read_csv(test_file, index_col=0)
    label = pd.read_csv(label_file, index_col=0)
    n_trn = trn.shape[0]
    n_tst = tst.shape[0]

    logging.info('Combining training and test data')
    df = pd.concat([trn, tst], ignore_index=True)

    cols = list(df.columns)
    num_cols = [x for x in cols if x[0] == 'n']
    cnt_cols = [x for x in cols if x[0] == 'o']
    cat_cols = [x for x in cols if x[0] == 'c' or x[0] == 'r']

    # no transformation for numerical variables
    logging.info('Imputing missing values in numerical columns by 0')
    X_num = df[num_cols].values
    X_num[np.isnan(X_num)] = 0.

    # log(1 + x) transformation for ordinal variables
    X_cnt = df[cnt_cols].values
    X_cnt[np.isnan(X_cnt)] = 0.
    X_cnt = np.log1p(X_cnt)

    # One-Hot-Encoding for categorical variables
    logging.info('One-hot-encoding categorical columns')

    X_col = encode_categorical_features(df[cat_cols],
                                        min_obs=3,
                                        n=n_trn,
                                        nan_as_var=False)
    X = sparse.hstack((X_num, X_cnt, X_col))
    X = X.tocsr()

    logging.info('Saving features into {}'.format(feature_dir))
    for i in range(label.shape[1]):
        train_feature_file = os.path.join(
            feature_dir, '{}.trn{:02d}.sps'.format(feature_name, i))
        test_feature_file = os.path.join(
            feature_dir, '{}.tst{:02d}.sps'.format(feature_name, i))

        dump_svmlight_file(X[:n_trn],
                           label.iloc[:, i],
                           train_feature_file,
                           zero_based=False)
        dump_svmlight_file(X[n_trn:],
                           np.zeros((n_tst, )),
                           test_feature_file,
                           zero_based=False)
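
The .sps files written above are standard svmlight/libsvm format, so they can be read back with scikit-learn. A hedged usage sketch with hypothetical file names (pass zero_based=False to match the dumps, and n_features so the train and test matrices align):

from sklearn.datasets import load_svmlight_file

X_trn, y_trn = load_svmlight_file('feature.trn00.sps', zero_based=False)
X_tst, _ = load_svmlight_file('feature.tst00.sps',
                              n_features=X_trn.shape[1], zero_based=False)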
def generate_feature(train_file, label_file, test_file, feature_dir,
                     feature_name):
    # Load data files
    logging.info('Loading training and test data')
    trn = pd.read_csv(train_file, index_col=0)
    tst = pd.read_csv(test_file, index_col=0)
    label = pd.read_csv(label_file, index_col=0)
    n_trn = trn.shape[0]
    n_tst = tst.shape[0]

    logging.info('Combining training and test data')
    df = pd.concat([trn, tst], ignore_index=True)

    cols = list(df.columns)
    num_cols = [x for x in cols if x[0] == 'n']
    cnt_cols = [x for x in cols if x[0] == 'o']
    cat_cols = [x for x in cols if x[0] == 'c' or x[0] == 'r']

    # no transformation for numerical variables
    logging.info('Imputing missing values in numerical columns by 0')
    X_num = df[num_cols].values
    X_num[np.isnan(X_num)] = 0.

    # log(1 + x) transformation for ordinal variables
    X_cnt = df[cnt_cols].values
    X_cnt[np.isnan(X_cnt)] = 0.
    X_cnt = np.log1p(X_cnt)

    # One-Hot-Encoding for categorical variables
    logging.info('One-hot-encoding categorical columns')

    X_col = encode_categorical_features(df[cat_cols], min_obs=10, n=n_trn,
                                        nan_as_var=False)
    X = sparse.hstack((X_num, X_cnt, X_col))
    X = X.tocsr()

    logging.info('Saving features into {}'.format(feature_dir))
    for i in range(label.shape[1]):
        train_feature_file = os.path.join(feature_dir, '{}.trn{:02d}.sps'.format(feature_name, i))
        test_feature_file = os.path.join(feature_dir, '{}.tst{:02d}.sps'.format(feature_name, i))

        dump_svmlight_file(X[:n_trn], label.iloc[:, i], train_feature_file,
                           zero_based=False)
        dump_svmlight_file(X[n_trn:], np.zeros((n_tst,)), test_feature_file,
                           zero_based=False)
Example #4
def generate_feature(train_file, test_file, object_file, train_feature_file,
                     test_feature_file):

    logging.info('loading input data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)
    obj = pd.read_csv(object_file, header=None)
    obj.columns = ['course_id', 'object', 'category', 'children', 'start']

    n_trn = trn.shape[0]

    trn.time = pd.to_datetime(trn.time)
    tst.time = pd.to_datetime(tst.time)

    df = pd.concat([trn, tst], axis=0)

    # get last dates of courses
    last_date = df[['course_id', 'time']].groupby('course_id',
                                                  as_index=False).max()
    last_date.columns = ['course_id', 'last_date']

    # extract object information
    obj.children.fillna('', inplace=True)
    obj['n_children'] = obj.children.apply(
        lambda x: int(np.log2(1 + len(x.split()))))
    obj.start.replace('null', '1999-01-01 00:00:00', inplace=True)
    obj.start = pd.to_datetime(obj.start)
    obj = pd.merge(obj, last_date, on='course_id', how='left')

    obj['obj_days_before_last_date'] = (obj.last_date - obj.start).dt.days
    obj.loc[obj.obj_days_before_last_date > 30,
            'obj_days_before_last_date'] = 30

    # merge log data with last coursedate and object information
    df = pd.merge(df, last_date, on='course_id', how='left')
    df = pd.merge(
        df,
        obj[['object', 'category', 'n_children', 'obj_days_before_last_date']],
        on='object',
        how='left')

    df['days_before_last_date'] = (df.last_date - df.time).dt.days
    df['weeks_before_last_date'] = df.days_before_last_date // 7
    df.loc[df.weeks_before_last_date == 4, 'weeks_before_last_date'] = 3
    df['last_month'] = df.last_date.dt.month

    df['obj_10_days_after_last_date'] = df.obj_days_before_last_date.apply(
        lambda x: 1 if -10 <= x < 0 else 0)
    df.obj_days_before_last_date = df.obj_days_before_last_date.apply(
        lambda x: np.sign(x) * int(np.log2(1 + np.abs(x)))
        if not pd.isnull(x) else x)

    df.drop(['time', 'last_date'], axis=1, inplace=True)
    df.set_index('enrollment_id', inplace=True)

    X = encode_categorical_features(df, n=n_trn, min_obs=100, nan_as_var=True)
    X = X.tocsr()

    dump_svmlight_file(X[:n_trn],
                       trn.enrollment_id.values,
                       train_feature_file,
                       zero_based=False)
    dump_svmlight_file(X[n_trn:],
                       tst.enrollment_id.values,
                       test_feature_file,
                       zero_based=False)
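
The signed-log2 bucketing applied to obj_days_before_last_date above compresses day offsets into coarse integer bins while keeping the sign (before vs. after the last course date). A quick self-contained illustration of the same transform:

import numpy as np

def signed_log2_bin(x):
    # same expression as in the .apply() above
    return int(np.sign(x)) * int(np.log2(1 + abs(x)))

print([signed_log2_bin(d) for d in (-10, -3, 0, 1, 7, 30)])
# -> [-3, -2, 0, 1, 3, 4]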
Example #5
def generate_feature(train_file, test_file, object_file, user_feature_file,
                     course_feature_file, train_feature_file,
                     test_feature_file):

    logging.info('loading input data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)
    obj = pd.read_csv(object_file, header=None)
    obj.columns = ['course_id', 'object', 'category', 'children', 'start']

    n_trn = trn.shape[0]

    trn.time = pd.to_datetime(trn.time)
    tst.time = pd.to_datetime(tst.time)

    df = pd.concat([trn, tst], axis=0)
    df['count'] = 1

    # get last dates of courses
    last_date = df[['course_id', 'time']].groupby('course_id',
                                                  as_index=False).max()
    last_date.columns = ['course_id', 'last_date']

    # extract object information
    obj.children.fillna('', inplace=True)
    obj['n_children'] = obj.children.apply(
        lambda x: int(np.log2(1 + len(x.split()))))
    obj.start.replace('null', '2013-10-27 00:00:00', inplace=True)
    obj.start = pd.to_datetime(obj.start)
    obj = pd.merge(obj, last_date, on='course_id', how='left')

    obj['obj_days_before_last_date'] = (obj.last_date - obj.start).dt.days
    obj.loc[obj.obj_days_before_last_date > 30,
            'obj_days_before_last_date'] = 32
    obj.loc[obj.obj_days_before_last_date < -10,
            'obj_days_before_last_date'] = -16

    # merge log data with last coursedate and object information
    df = pd.merge(df, last_date, on='course_id', how='left')
    df = pd.merge(
        df,
        obj[['object', 'category', 'n_children', 'obj_days_before_last_date']],
        on='object',
        how='left')

    df['days_before_last_date'] = (df.last_date - df.time).dt.days
    df['weeks_before_last_date'] = df.days_before_last_date // 7
    df.loc[df.weeks_before_last_date == 4, 'weeks_before_last_date'] = 3
    df['last_month'] = df.last_date.dt.month

    df['days_after_obj_date'] = (df.obj_days_before_last_date -
                                 df.days_before_last_date)
    df.loc[df.days_after_obj_date < 0, 'days_after_obj_date'] = -1

    df.days_after_obj_date = df.days_after_obj_date.apply(
        lambda x: np.sign(x) * int(np.log2(1 + np.abs(x)))
        if not pd.isnull(x) else x)

    df['obj_10_days_after_last_date'] = df.obj_days_before_last_date.apply(
        lambda x: 1 if -10 <= x < 0 else 0)
    df.obj_days_before_last_date = df.obj_days_before_last_date.apply(
        lambda x: np.sign(x) * int(np.log2(1 + np.abs(x)))
        if not pd.isnull(x) else x)

    eid = df.drop(['time', 'last_date'], axis=1)
    eid.set_index('enrollment_id', inplace=True)

    X = encode_categorical_features(eid, n=n_trn, min_obs=10, nan_as_var=True)
    X = X.tocsr()

    dump_svmlight_file(X[:n_trn],
                       trn.enrollment_id.values,
                       train_feature_file,
                       zero_based=False)
    dump_svmlight_file(X[n_trn:],
                       tst.enrollment_id.values,
                       test_feature_file,
                       zero_based=False)

    cid = df.drop(['enrollment_id', 'time', 'last_date'], axis=1)
    cid.set_index('course_id', inplace=True)

    X = encode_categorical_features(cid, n=n_trn, min_obs=10, nan_as_var=True)
    X = X.tocsr()
    with open(course_feature_file, 'w') as f:
        for i in range(X.shape[0]):
            x = X[i].toarray().flatten()
            idx = np.where(x != 0)[0]
            features = ' '.join(['{}:{}'.format(j + 1, x[j]) for j in idx])
            f.write('{} {}\n'.format(cid.index.values[i], features))

    uid = df.drop(['enrollment_id', 'time', 'last_date'], axis=1)
    uid.set_index('username', inplace=True)

    X = encode_categorical_features(uid, n=n_trn, min_obs=10, nan_as_var=True)
    X = X.tocsr()
    with open(user_feature_file, 'w') as f:
        for i in range(X.shape[0]):
            x = X[i].toarray().flatten()
            idx = np.where(x != 0)[0]
            features = ' '.join(['{}:{}'.format(j + 1, x[j]) for j in idx])
            f.write('{} {}\n'.format(uid.index.values[i], features))
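
The course- and user-level files written above use an id-keyed, svmlight-like format (one id followed by 1-based index:value pairs per line). A hedged sketch, not part of the original code, of reading one of them back into a dict:

def load_id_feature_file(path):
    features = {}
    with open(path) as f:
        for line in f:
            tokens = line.rstrip('\n').split(' ')
            key, pairs = tokens[0], tokens[1:]
            features[key] = {int(j): float(v)
                             for j, v in (p.split(':') for p in pairs if p)}
    return features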