Code example #1
def data_prepare():
    df_train, df_test = base_data_process.eda(age2group=True, one_hot=False, scale=True)
    base_data_process.label2index(df_train, LABEL)
    label = df_train[LABEL]
    df_train.drop(columns=[LABEL], inplace=True)
    df_train.drop(columns=[ID], inplace=True)
    label_one_hot = pd.get_dummies(label)
    feats = [f for f in df_train.columns if f not in category_list]
    log.info('feats are {}'.format(feats))
    category_encode_size_map = {}
    for c in category_list:
        if c not in df_train.columns:
            log.warning('{} not in df'.format(c))
            continue
        n_classes = len(df_train[c].unique())
        category_encode_size_map[c] = n_classes
        log.info('{} has {} classes'.format(c, n_classes))

    # Alternative approach, kept for reference: integer-encode the
    # categorical columns in place with a LabelEncoder fitted on the
    # combined train + test values.
    # category_encode_size_map = {}
    # for c in category_list:
    #     if c not in df_train.columns:
    #         continue
    #     le = preprocessing.LabelEncoder()
    #     le.fit(pd.concat([df_train[c], df_test[c]], axis=0))
    #     df_train[c] = le.transform(df_train[c])
    #     df_test[c] = le.transform(df_test[c])
    #     category_encode_size_map[c] = len(le.classes_)
    #     log.info('{} has {} classes, origin classes are {}'.format(
    #         c, len(le.classes_), le.classes_))

    return df_train, df_test, label, label_one_hot, feats, category_encode_size_map
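
A hypothetical follow-up use of the returned category_encode_size_map: sizing one embedding per categorical column. The width heuristic below is an assumption, not something this code prescribes.

df_train, df_test, label, label_one_hot, feats, cat_size_map = data_prepare()

# Rule of thumb: embedding width grows with cardinality, capped at 50.
emb_dims = {col: min(50, (n_classes + 1) // 2)
            for col, n_classes in cat_size_map.items()}
log.info('embedding dims: {}'.format(emb_dims))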
Code example #2
def data_prepare(df_train, df_test):
    conti_list = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'contract_time',
                  'former_complaint_fee', 'former_complaint_num', 'last_month_traffic', 'local_caller_time',
                  'local_trafffic_month', 'month_traffic', 'online_time', 'pay_num', 'pay_times',
                  'service1_caller_time', 'service2_caller_time', 'pay_num_per_time', 'll']

    normalize_process(df_train, df_test, conti_list)
    # label 2 index
    base_data_process.label2index(df_train, LABEL)

    log.info('current path: {}'.format(os.getcwd()))
    with timer('save train data'):
        df_train.to_csv('../../origin_data/train_modified.csv', index=False)
    with timer('save test data'):
        df_test.to_csv('../../origin_data/test_modified.csv', index=False)
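
Neither timer nor normalize_process is defined in these snippets. A minimal sketch of compatible helpers, assuming timer only logs elapsed wall-clock time and normalize_process standard-scales each continuous column with statistics fitted on train only:

import time
from contextlib import contextmanager

from sklearn import preprocessing


@contextmanager
def timer(name):
    # Log how long the wrapped block took.
    start = time.time()
    yield
    print('{} done in {:.1f}s'.format(name, time.time() - start))


def normalize_process(df_train, df_test, conti_list):
    # Fit the scaler on train only so no test-set statistics leak in.
    for c in conti_list:
        scaler = preprocessing.StandardScaler()
        df_train[c] = scaler.fit_transform(df_train[[c]]).ravel()
        df_test[c] = scaler.transform(df_test[[c]]).ravel()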
Code example #3
def data_prepare(df_train, df_test):
    conti_list = [
        '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
        'contract_time', 'former_complaint_fee', 'former_complaint_num',
        'last_month_traffic', 'local_caller_time', 'local_trafffic_month',
        'month_traffic', 'online_time', 'pay_num', 'pay_times',
        'service1_caller_time', 'service2_caller_time', 'pay_num_per_time',
        'll'
    ]

    normalize_process(df_train, df_test, conti_list)
    # label 2 index
    base_data_process.label2index(df_train, LABEL)

    base_util.pickle_dump(
        (base_data_process.encode_map, base_data_process.decode_list),
        '../../origin_data/label2index.pkl')

    with timer('save train data'):
        df_train.to_csv('../../origin_data/train_modified.csv', index=False)
    with timer('save test data'):
        df_test.to_csv('../../origin_data/test_modified.csv', index=False)
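
base_util's matching load helper is not shown, so this sketch restores the dumped mapping with plain pickle; the tuple order follows the pickle_dump call above.

import pickle

with open('../../origin_data/label2index.pkl', 'rb') as f:
    encode_map, decode_list = pickle.load(f)

# decode_list presumably maps a class index back to the original label,
# i.e. the inverse direction of encode_map.
log.info('index 0 decodes to {}'.format(decode_list[0]))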
Code example #4
        df['predict'] = index2label(y_pre)
        df.to_csv('result{}.csv'.format(name), index=False)


# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = (feature_importance_df_[["feature", "importance"]]
            .groupby("feature").mean()
            .sort_values(by="importance", ascending=False)[:40].index)
    best_features = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature",
                data=best_features.sort_values(by="importance",
                                               ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')


if __name__ == '__main__':
    if not os.path.exists('origin_data_save'):
        os.mkdir('origin_data_save')
    with timer('data process'):
        df_train, df_test = eda()
        label2index(df_train, LABEL)
    with timer('model process'):
        model(df_train, df_test, num_folds=5, num_boost_round=10000)
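
display_importances expects one row per (feature, fold) pair, typically filled from LightGBM's Booster.feature_importance() inside the cross-validation loop of model(). A self-contained smoke test with synthetic values (the numbers below are made up):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({
    'feature': ['online_time', 'pay_num', 'month_traffic'] * 5,
    'importance': rng.random(15),
    'fold': np.repeat(np.arange(1, 6), 3),
})
display_importances(demo)  # writes lgbm_importances01.png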
Code example #5
        11,
        'verbose': -1
    }

    trials = Trials()

    with timer('optimization'):
        # Run optimization
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    trials=trials,
                    max_evals=config_dict['max_evals'])

    print('-' * 100)
    log.warning(best)

    with open('model_trials.pkl', mode='wb') as mt:
        pickle.dump(trials, mt)


config_dict = {'train': pd.DataFrame(), 'max_evals': 1000}

if __name__ == '__main__':
    df_train, df_test = eda(age2group=True, one_hot=False)

    config_dict['train'] = df_train.iloc[:, :]
    label2index(df_train, 'current_service')
    optimization()
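
The space and objective passed to fmin() are not shown in this snippet. A minimal sketch of compatible definitions, assuming a LightGBM cross-validation inside the objective; the search ranges and CV settings are guesses, and the 11-class count is taken from the params fragment above.

from hyperopt import STATUS_OK, hp
import lightgbm as lgb

space = {
    'num_leaves': hp.quniform('num_leaves', 16, 256, 16),
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
}


def objective(params):
    train = config_dict['train']
    data = lgb.Dataset(train.drop(columns=['current_service']),
                       label=train['current_service'])
    cv_params = {
        'objective': 'multiclass',
        'num_class': 11,
        'num_leaves': int(params['num_leaves']),
        'learning_rate': params['learning_rate'],
        'verbose': -1,
    }
    cv = lgb.cv(cv_params, data, num_boost_round=200, nfold=3,
                metrics='multi_logloss')
    # fmin minimizes the returned loss; use the best mean CV score.
    # (The result-dict key name varies across LightGBM versions.)
    return {'loss': min(next(iter(cv.values()))), 'status': STATUS_OK}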