Code example #1
def data_prepare():
    # load the processed frames: age bucketed into groups, no one-hot
    # encoding, numeric features scaled
    df_train, df_test = base_data_process.eda(age2group=True, one_hot=False, scale=True)
    # map the raw label values to integer indices in place
    base_data_process.label2index(df_train, LABEL)
    label = df_train[LABEL]
    df_train.drop(columns=[LABEL], inplace=True)
    df_train.drop(columns=[ID], inplace=True)
    label_one_hot = pd.get_dummies(label)
    feats = [f for f in df_train.columns if f not in category_list]
    log.info('feats are {}'.format(feats))
    category_encode_size_map = {}
    for c in category_list:
        if c not in df_train.columns:
            log.warning('{} not in df'.format(c))
            continue
        n_classes = df_train[c].nunique(dropna=False)
        category_encode_size_map[c] = n_classes
        log.info('{} has {} classes'.format(c, n_classes))

    # Alternative kept for reference: integer-encode each categorical column
    # with a LabelEncoder fitted on train and test together, so the test set
    # cannot contain categories the encoder has never seen.
    # for c in category_list:
    #     if c not in df_train.columns:
    #         continue
    #     le = preprocessing.LabelEncoder()
    #     le.fit(pd.concat([df_train[c], df_test[c]], axis=0))
    #     df_train[c] = le.transform(df_train[c])
    #     df_test[c] = le.transform(df_test[c])
    #     category_encode_size_map[c] = len(le.classes_)
    #     log.info('{} has {} classes, origin classes are {}'.format(
    #         c, len(le.classes_), le.classes_))

    return df_train, df_test, label, label_one_hot, feats, category_encode_size_map
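

# A toy, self-contained illustration of the two encodings built above:
# pd.get_dummies one-hot encodes the integer label, and nunique() gives the
# per-column cardinality map (data and names here are illustrative only):
def _demo_encodings():
    import pandas as pd
    label = pd.Series([0, 2, 1, 0], name='current_service')
    print(pd.get_dummies(label).shape)  # (4, 3): one column per class
    toy = pd.DataFrame({'service_type': [1, 4, 3, 1], 'gender': [0, 1, 1, 0]})
    size_map = {c: toy[c].nunique() for c in ['service_type', 'gender']}
    print(size_map)  # {'service_type': 3, 'gender': 2}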
Code example #2
    return base_util.pickle_load('../../origin_data/label2index.pkl')


def batch_yield(df, batch_size):
    if batch_size == -1:
        batch_size = len(df)
    # shuffle: sampling the full frame without replacement reorders the rows
    df = df.sample(frac=1)
    # only full batches are yielded; remainder rows are dropped
    total_batch = len(df) // batch_size

    for i in range(total_batch):
        data = df.iloc[i * batch_size:(i + 1) * batch_size, :]
        labels = data[LABEL]
        # drop without inplace: `data` is a slice of df, and an inplace drop
        # on a slice triggers pandas' SettingWithCopyWarning
        data = data.drop(columns=[LABEL])
        yield data, labels
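

# Minimal usage sketch for batch_yield (hypothetical demo frame, not from the
# original project; relies on the module-level LABEL constant as above):
def _demo_batch_yield():
    demo = pd.DataFrame({'f1': range(10), LABEL: [0, 1] * 5})
    for batch_x, batch_y in batch_yield(demo, batch_size=4):
        # two full batches of 4 rows; the trailing 2 rows are dropped
        print(batch_x.shape, batch_y.shape)  # -> (4, 1) (4,)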


def save_result(ids, labels, submit_path):
    # build the submission frame with the final column names directly
    df_test = pd.DataFrame({ID: ids, 'predict': labels})
    print('====shape df_test====', df_test.shape)
    df_test.to_csv(submit_path, index=False)
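

# Hypothetical call showing the expected inputs of save_result: positionally
# aligned ids and predicted labels, written out as columns [ID, 'predict']:
def _demo_save_result():
    save_result([1, 2, 3], ['a', 'b', 'a'], 'demo_submit.csv')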


if __name__ == '__main__':
    df_train, df_test = base_data_process.eda(age2group=True, one_hot=True)
    data_prepare(df_train, df_test)
Code example #3
        df['predict'] = index2label(y_pre)
        df.to_csv('result{}.csv'.format(name), index=False)


# Plot the mean feature importances averaged over CV folds
def display_importances(feature_importance_df_):
    # keep the 40 features with the highest mean importance across folds
    cols = feature_importance_df_[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(by="importance",
                                             ascending=False)[:40].index
    best_features = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance",
                                               ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')
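

# Toy demonstration of display_importances (illustrative data only): it
# expects one row per (fold, feature) pair with 'feature'/'importance' columns.
def _demo_display_importances():
    demo = pd.DataFrame({
        'feature': ['f1', 'f2', 'f1', 'f2'],   # two features over two folds
        'importance': [10, 30, 14, 26],
    })
    display_importances(demo)  # writes lgbm_importances01.png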


if __name__ == '__main__':
    if not os.path.exists('origin_data_save'):
        os.mkdir('origin_data_save')
    with timer('data process'):
        df_train, df_test = eda()
        label2index(df_train, LABEL)
    with timer('model process'):
        model(df_train, df_test, num_folds=5, num_boost_round=10000)
Code example #4
        feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance",
                                               ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig(name)


if __name__ == '__main__':
    if not os.path.exists('origin_data_save'):
        os.mkdir('origin_data_save')
    with timer('data process'):
        df_train, df_test = eda(age2group=True, one_hot=False, scale=False)
        # hand-picked column indices from the last-layer feature dump
        # (columns are named 'col_<i>' in last_layer_100.csv)
        cols_index_to_use = [
            59, 90, 98, 9, 33, 79, 63, 7, 44, 19, 47, 74, 38, 66
        ]
        cols_to_use = ['col_{}'.format(i) for i in cols_index_to_use]

        ll_df = pd.read_csv('../../origin_data/last_layer_100.csv',
                            index_col=False,
                            header=0)
        ll_df_test = pd.read_csv('../../origin_data/last_layer_100_test.csv',
                                 index_col=False,
                                 header=0)

        ll_df = ll_df[cols_to_use]
        ll_df_test = ll_df_test[cols_to_use]
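
        # A plausible next step (an assumption; the continuation is not shown
        # in this fragment) is to append these last-layer activations to the
        # tabular features by row position, e.g.:
        # df_train = pd.concat([df_train.reset_index(drop=True),
        #                       ll_df.reset_index(drop=True)], axis=1)
        # df_test = pd.concat([df_test.reset_index(drop=True),
        #                      ll_df_test.reset_index(drop=True)], axis=1)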
Code example #5
        11,
        'verbose': -1
    }

    trials = Trials()

    with timer('optimization'):
        # Run optimization
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    trials=trials,
                    max_evals=config_dict['max_evals'])

    print('-' * 100)
    log.warning(best)

    with open('model_trials.pkl', mode='wb') as mt:
        pickle.dump(trials, mt)


config_dict = {'train': pd.DataFrame(), 'max_evals': 1000}

if __name__ == '__main__':
    df_train, df_test = eda(age2group=True, one_hot=False)

    # expose the training frame to the objective via the module-level config
    config_dict['train'] = df_train.iloc[:, :]
    label2index(df_train, 'current_service')
    optimization()
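

# The `objective` and `space` referenced above are defined outside this
# fragment. Below is a minimal, self-contained hyperopt sketch of the same
# pattern (illustrative names and ranges, not the project's actual search space):
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

demo_space = {
    'num_leaves': hp.quniform('num_leaves', 16, 256, 16),
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
}

def demo_objective(params):
    # train/evaluate a model with `params` here; return the loss to minimize
    loss = (params['learning_rate'] - 0.05) ** 2  # stand-in for a CV loss
    return {'loss': loss, 'status': STATUS_OK}

demo_best = fmin(fn=demo_objective, space=demo_space, algo=tpe.suggest,
                 trials=Trials(), max_evals=10)
print(demo_best)  # best parameter values found by TPE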