def target_encoding(train: pd.DataFrame, test: pd.DataFrame,
                    encode_col: List[str], target_col: str,
                    cv) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Append target-encoded versions of ``encode_col`` to train and test.

    A ``TargetEncoder`` built with ``cv`` is run with ``fit_transform`` on
    the training columns and with ``transform`` on the test columns. The
    resulting columns are renamed to ``te_<original name>`` and concatenated
    column-wise onto the corresponding input frame.

    Args:
        train (pd.DataFrame): training data; must contain ``encode_col``
            and ``target_col``
        test (pd.DataFrame): test data; must contain ``encode_col``
        encode_col (List[str]): names of the columns to encode
        target_col (str): name of the target column in ``train``
        cv: CV object handed to ``TargetEncoder(cv=...)``

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: train and test, each with the
        additional ``te_*`` columns
    """
    # Silences every warning for the rest of the process, not just here.
    warnings.simplefilter('ignore')

    def _with_te_prefix(encoded):
        # Rename in place so the encoded columns cannot clash with the
        # originals once both live in the same frame.
        encoded.columns = ['te_' + name for name in encoded.columns]
        return encoded

    encoder = TargetEncoder(cv=cv)

    encoded_train = _with_te_prefix(
        encoder.fit_transform(train[encode_col], train[target_col]))
    train = pd.concat([train, encoded_train], axis=1)

    encoded_test = _with_te_prefix(encoder.transform(test[encode_col]))
    test = pd.concat([test, encoded_test], axis=1)

    return train, test
Esempio n. 2
0
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    """Target-encode all raw columns and save train/test feature files.

    Args:
        train_file: path of the training CSV; read with ``ID_COL`` as the
            index and expected to contain ``TARGET_COL``
        test_file: path of the test CSV; read with ``ID_COL`` as the index
        train_feature_file: destination passed to ``save_data`` for the
            training features and target values
        test_feature_file: destination passed to ``save_data`` for the
            test features (saved without a target)
        feature_map_file: path of the text file that receives one
            ``<index>\\t<column>\\tq`` line per output column
    """
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    y = trn[TARGET_COL]
    n_trn = trn.shape[0]

    trn.drop(TARGET_COL, axis=1, inplace=True)
    # The message used to be a plain (non-f) string, so the placeholder text
    # itself was logged instead of the number of columns.
    logging.info('categorical: %d', trn.shape[1])

    # Stack train on top of test; the first n_trn rows are the training rows.
    df = pd.concat([trn, tst], axis=0)

    logging.info('label encoding categorical variables')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    # NOTE(review): the encoder is fitted on trn and then applied to the
    # stacked train+test frame with transform() rather than using
    # fit_transform() for the training rows - confirm this is intended.
    te.fit(trn, y)
    df = te.transform(df)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(df.columns):
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(df.values[:n_trn, ], y.values, train_feature_file)
    save_data(df.values[n_trn:, ], None, test_feature_file)
Esempio n. 3
0
def target_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_col,
                    target_col, cv):
    """Return the target-encoded ``encode_col`` of train and test, stacked.

    A ``TargetEncoder`` built with ``cv`` is run with ``fit_transform`` on
    ``train[encode_col]`` / ``train[target_col]`` and with ``transform`` on
    ``test[encode_col]``. Both results get their columns renamed to
    ``te_<original name>`` and are concatenated row-wise (train rows first)
    with a fresh 0..n-1 index. Only the encoded columns are returned.
    """
    # Silences every warning for the rest of the process, not just here.
    warnings.simplefilter('ignore')
    encoder = TargetEncoder(cv=cv)

    encoded_train = encoder.fit_transform(train[encode_col],
                                          train[target_col])
    encoded_train.columns = ['te_' + name for name in encoded_train.columns]

    encoded_test = encoder.transform(test[encode_col])
    encoded_test.columns = ['te_' + name for name in encoded_test.columns]

    stacked = pd.concat([encoded_train, encoded_test])
    return stacked.reset_index(drop=True)
Esempio n. 4
0
def target_encoding(train: pd.DataFrame, test: pd.DataFrame,
                    col_definition: dict, option: dict):
    """Append target-encoded columns to train and test.

    col_definition: dict with keys 'encode_col' (columns to encode) and
        'target_col' (target column in train)
    option: dict with key 'cv' (CV object handed to TargetEncoder)

    Returns the pair (train, test), each extended column-wise with the
    encoded columns renamed to ``te_<original name>``.
    """
    # Silences every warning for the rest of the process, not just here.
    warnings.simplefilter('ignore')

    encoder = TargetEncoder(cv=option['cv'])

    columns_to_encode = col_definition['encode_col']
    target_name = col_definition['target_col']

    encoded_train = encoder.fit_transform(train[columns_to_encode],
                                          train[target_name])
    encoded_train.columns = ['te_' + name for name in encoded_train.columns]
    train = pd.concat([train, encoded_train], axis=1)

    encoded_test = encoder.transform(test[col_definition['encode_col']])
    encoded_test.columns = ['te_' + name for name in encoded_test.columns]
    test = pd.concat([test, encoded_test], axis=1)

    return train, test
Esempio n. 5
0
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    """Target-encode every feature column and save train/test features.

    Every column except ``ID_COL`` and ``TARGET_COL`` is treated as a
    feature. The training features are encoded with ``fit_transform`` and
    the test features with ``transform`` of the same fitted encoder.

    Args:
        train_file: path of the training CSV; expected to contain
            ``TARGET_COL``
        test_file: path of the test CSV
        train_feature_file: destination passed to ``save_data`` for the
            training features and target values
        test_feature_file: destination passed to ``save_data`` for the
            test features (saved without a target)
        feature_map_file: path of the text file that receives one
            ``<index>\\t<feature>\\tq`` line per feature
    """
    logging.info('loading raw data')
    trn = pd.read_csv(train_file)
    tst = pd.read_csv(test_file)

    y = trn[TARGET_COL]

    features = [x for x in trn.columns if x not in (ID_COL, TARGET_COL)]

    logging.info('target encoding')
    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn releases raise ValueError when random_state is given
    # without it.
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    te = TargetEncoder(cv=cv)
    trn[features] = te.fit_transform(trn[features], y)
    tst[features] = te.transform(tst[features])

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(features):
            # Target encodings are real-valued, so tag them as quantitative
            # ('q') instead of 'int'.
            f.write('{}\t{}\tq\n'.format(i, col))

    logging.info('saving features')
    save_data(trn[features].values, y.values, train_feature_file)
    save_data(tst[features].values, None, test_feature_file)
Esempio n. 6
0
def generate_feature(train_file, test_file, train_feature_file,
                     test_feature_file, feature_map_file):
    """Build dummy + target-encoded features and save them for train/test.

    The ``bin_0``..``bin_4`` columns are replaced by dummy columns (with a
    missing-value indicator); the ``ord_*``, ``nom_*``, ``day`` and
    ``month`` columns are overwritten with their target encoding.

    Args:
        train_file: path of the training CSV; read with ``ID_COL`` as the
            index and expected to contain ``TARGET_COL``
        test_file: path of the test CSV; read with ``ID_COL`` as the index
        train_feature_file: destination passed to ``save_data`` for the
            training features and target values
        test_feature_file: destination passed to ``save_data`` for the
            test features (saved without a target)
        feature_map_file: path of the text file that receives one
            ``<index>\\t<column>\\tq`` line per output column
    """
    logging.info('loading raw data')
    trn = pd.read_csv(train_file, index_col=ID_COL)
    tst = pd.read_csv(test_file, index_col=ID_COL)

    logging.info('label encoding categorical variables')

    y = trn.loc[:, TARGET_COL]
    n_trn = len(trn)
    trn = trn.drop(columns=TARGET_COL)
    # Train rows come first, so the first n_trn rows of df are training rows.
    df = pd.concat([trn, tst])

    # Column groups, generated from their numeric suffixes.
    binary_cols = ['bin_{}'.format(i) for i in range(5)]
    nominal_cols = ['nom_{}'.format(i) for i in range(5)]
    hex_cols = ['nom_{}'.format(i) for i in range(5, 10)]
    ordinal_cols = ['ord_{}'.format(i) for i in range(6)]
    cyclical_cols = ['day', 'month']

    logging.info("Dummy encode: bin 0 to 4")
    # bin_0..bin_2 are cast to object so that get_dummies treats them as
    # categorical and emits missing-value indicators for them as well.
    first_three_bins = binary_cols[:3]
    df[first_three_bins] = df[first_three_bins].astype(object)

    bin_dummies = pd.get_dummies(df[binary_cols], dummy_na=True)
    df = pd.concat([df.drop(columns=binary_cols), bin_dummies], axis=1)

    logging.info("Target encoding: nom 0 to 9 and cyclical features")
    target_enc_cols = [*ordinal_cols, *nominal_cols, *hex_cols,
                       *cyclical_cols]
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)
    encoder = TargetEncoder(cv=folds)
    # Fit on the training rows only, then encode train and test together.
    encoder.fit(trn.loc[:, target_enc_cols], y)
    df.loc[:, target_enc_cols] = encoder.transform(df.loc[:, target_enc_cols])

    # Two earlier experiments are disabled in favour of the target encoding
    # above: hand-written label maps for ord_0..ord_5 with median imputation,
    # and dummy encoding of the cyclical day/month columns.

    with open(feature_map_file, 'w') as fmap:
        for position, column in enumerate(df.columns):
            fmap.write('{}\t{}\tq\n'.format(position, column))

    logging.info('saving features')
    save_data(df.values[:n_trn], y.values, train_feature_file)

    save_data(df.values[n_trn:], None, test_feature_file)
Esempio n. 7
0
def test_TargetEncoder(generate_data):
    """Check that TargetEncoder keeps one output column per input column.

    Runs on the frame returned by the ``generate_data`` fixture, using the
    non-target columns with fewer than 100 distinct values. Three usages are
    covered: no CV, CV through ``fit_transform``, and CV through separate
    ``fit`` and ``transform`` calls.
    """
    df = generate_data()
    cat_cols = [col for col in df.columns
                if col != TARGET_COL and df[col].nunique() < 100]
    n_expected = len(cat_cols)

    # 1) No cross-validation.
    encoder = TargetEncoder()
    encoded = encoder.fit_transform(df[cat_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(encoded.head()))
    assert encoded.shape[1] == n_expected

    folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)

    # 2) Cross-validation, fitting and transforming in one call.
    encoder = TargetEncoder(cv=folds)
    encoded = encoder.fit_transform(df[cat_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(encoded.head()))
    assert encoded.shape[1] == n_expected

    # 3) Cross-validation, fitting and transforming in separate calls.
    encoder = TargetEncoder(cv=folds)
    encoder.fit(df[cat_cols], df[TARGET_COL])
    encoded = encoder.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(
        encoded.head()))
    assert encoded.shape[1] == n_expected
Esempio n. 8
0
def test_TargetEncoder():
    """Check that TargetEncoder keeps one output column per input column.

    Uses an ``N_OBS`` x ``N_FEATURE`` frame of random integers in
    ``[0, N_CATEGORY)`` with a uniform random target. Three usages are
    covered: no CV, CV through ``fit_transform``, and CV through separate
    ``fit`` and ``transform`` calls.
    """
    column_names = ['c{}'.format(x) for x in range(N_FEATURE)]
    random_categories = np.random.randint(0, N_CATEGORY,
                                          size=(N_OBS, N_FEATURE))
    df = pd.DataFrame(random_categories, columns=column_names)
    # Capture the feature columns before the target column is appended.
    feature_cols = df.columns
    df[TARGET_COL] = np.random.rand(N_OBS)
    n_expected = len(feature_cols)

    # 1) No cross-validation.
    encoder = TargetEncoder()
    encoded = encoder.fit_transform(df[feature_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(encoded.head()))
    assert encoded.shape[1] == n_expected

    folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)

    # 2) Cross-validation, fitting and transforming in one call.
    encoder = TargetEncoder(cv=folds)
    encoded = encoder.fit_transform(df[feature_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(encoded.head()))
    assert encoded.shape[1] == n_expected

    # 3) Cross-validation, fitting and transforming in separate calls.
    encoder = TargetEncoder(cv=folds)
    encoder.fit(df[feature_cols], df[TARGET_COL])
    encoded = encoder.transform(df[feature_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(
        encoded.head()))
    assert encoded.shape[1] == n_expected