Example #1
def data_prepare(df_train, df_test):
    conti_list = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee', 'contract_time',
                  'former_complaint_fee', 'former_complaint_num', 'last_month_traffic', 'local_caller_time',
                  'local_trafffic_month', 'month_traffic', 'online_time', 'pay_num', 'pay_times',
                  'service1_caller_time', 'service2_caller_time', 'pay_num_per_time', 'll']

    normalize_process(df_train, df_test, conti_list)
    # map string labels to integer indices
    base_data_process.label2index(df_train, LABEL)

    log.info('current path: {}'.format(os.getcwd()))
    with timer('save train data'):
        df_train.to_csv('../../origin_data/train_modified.csv', index=False)
    with timer('save test data'):
        df_test.to_csv('../../origin_data/test_modified.csv', index=False)
def cross_validation(train,
                     params,
                     ID_COLUMN_NAME,
                     LABEL_COLUMN_NAME,
                     N_FOLD=5):
    '''Run stratified K-fold cross validation with LightGBM.

    :return: mean macro-F1 score across the folds
    '''
    NUM_BOOST_ROUND = 1000
    EARLY_STOPPING_ROUNDS = 50

    # Cross validation model
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=1001)
    feats = [
        f for f in train.columns
        if f not in [LABEL_COLUMN_NAME, ID_COLUMN_NAME]
    ]
    f1_scores = []
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL_COLUMN_NAME])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)
        with timer('cross validation-fold {} train model'.format(i_fold)):
            log.info('params is {}'.format(params))
            clf = lgb.train(num_boost_round=NUM_BOOST_ROUND,
                            params=params,
                            verbose_eval=10,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS)
        with timer('cross validation-fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            # row-wise argmax over the class probabilities
            y_pre = np.argmax(v_data, axis=1)
        f1_scores.append(f1_score(dvalid.label, y_pre, average='macro'))
    # average macro-F1 across all folds instead of returning after fold 0
    return np.mean(f1_scores)
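For reference, a minimal driver for cross_validation might look like the sketch below. The CSV path and parameter values are illustrative assumptions; only the 'user_id' and 'current_service' column names come from the call in objective() further down.

# Hypothetical usage sketch for cross_validation; the path and parameter
# values are assumptions, not part of the original snippet.
import pandas as pd

train = pd.read_csv('../../origin_data/train_modified.csv')
params = {
    'objective': 'multiclass',
    'num_class': 15,
    'learning_rate': 0.1,
    'num_leaves': 80,
    'verbose': -1
}
f1 = cross_validation(train, params, 'user_id', 'current_service', N_FOLD=5)
print('mean macro-F1 over folds: {:.4f}'.format(f1))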
Example #3
def write2file(col_id, pre_label, name=None):
    with timer('write result {}'.format(name)):
        y_pre = one_hot2label_index(pre_label)
        df = pd.DataFrame()
        df[ID] = col_id
        df['predict'] = index2label(y_pre)
        df.to_csv('result{}.csv'.format(name), index=False)
def data_prepare(df_train, df_test):
    conti_list = [
        '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
        'contract_time', 'former_complaint_fee', 'former_complaint_num',
        'last_month_traffic', 'local_caller_time', 'local_trafffic_month',
        'month_traffic', 'online_time', 'pay_num', 'pay_times',
        'service1_caller_time', 'service2_caller_time', 'pay_num_per_time',
        'll'
    ]

    normalize_process(df_train, df_test, conti_list)
    # map string labels to integer indices
    base_data_process.label2index(df_train, LABEL)

    base_util.pickle_dump(
        (base_data_process.encode_map, base_data_process.decode_list),
        '../../origin_data/label2index.pkl')

    with timer('save train data'):
        df_train.to_csv('../../origin_data/train_modified.csv', index=False)
    with timer('save test data'):
        df_test.to_csv('../../origin_data/test_modified.csv', index=False)
def optimization():
    space = {
        'learning_rate': 0.1,
        'boosting_type': hp.choice('boosting_type', ['gbdt']),
        'num_leaves': hp.choice('num_leaves',
                                [15, 20, 30, 50, 65, 80, 100, 150, 400]),
        'bin_construct_sample_cnt': hp.choice(
            'bin_construct_sample_cnt',
            [10000, 20000, 60000, 100000, 200000]),
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 20, 500, 10),
        'reg_alpha': hp.choice('reg_alpha', [0, 0.001, 0.01, 0.1, 0.2]),
        'reg_lambda': hp.choice('reg_lambda', [0, 0.001, 0.01, 0.1, 0.2]),
        'feature_fraction': hp.uniform('feature_fraction', 0.8, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.8, 1.0),
        'bagging_freq': hp.choice('bagging_freq', [0, 2, 6, 10, 16]),
        'is_unbalance': hp.choice('is_unbalance', [True, False]),
        'num_threads': 40,
        'objective': 'multiclass',
        'num_class': 15,
        'verbose': -1
    }

    trials = Trials()

    with timer('optimization'):
        # Run optimization
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    trials=trials,
                    max_evals=config_dict['max_evals'])

    print('-' * 100)
    log.warn(best)

    with open('model_trials.pkl', mode='wb') as mt:
        pickle.dump(trials, mt)
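Because optimization() pickles the hyperopt Trials object, the search can be inspected offline. A minimal sketch of reloading it and pulling the best run; trials.results holds the dicts returned by objective():

# Minimal sketch: reload the pickled Trials and find the lowest-loss run.
import pickle

with open('model_trials.pkl', mode='rb') as mt:
    trials = pickle.load(mt)

# Each entry in trials.results is the dict returned by objective(),
# so the lowest 'loss' carries the winning hyperparameter set.
best_trial = min(trials.results, key=lambda r: r['loss'])
print(best_trial['hyperparameters'], best_trial['loss'])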
Example #6
def write_result(file_name, ids, labels, label_type='label_index'):
    # TODO: finish the data-writing part
    '''Write prediction ids and labels to a CSV file.

    :param file_name: output CSV path
    :param ids: id column values
    :param labels: predictions (one-hot matrix, label indices, or raw labels)
    :param label_type: 'one_hot', 'label_index', or anything else for raw labels
    :return: None
    '''
    load_label2index()
    df_test = pd.DataFrame()
    df_test[ID] = ids

    if label_type == 'one_hot':
        labels = one_hot2label_index(labels)
    if label_type in ['label_index', 'one_hot']:
        labels = [decode_list[label] for label in labels]
    df_test[LABEL] = labels
    df_test.columns = [ID, 'predict']
    print('====shape df_test====', df_test.shape)
    with timer('write result to {}'.format(file_name)):
        df_test.to_csv(file_name, index=False)
def objective(hyperparameters):
    # Keep track of evals
    global ITERATION

    ITERATION += 1

    # Make sure parameters that need to be integers are integers
    for parameter_name in [
            'num_leaves', 'bin_construct_sample_cnt', 'bagging_freq',
            'min_data_in_leaf'
    ]:
        hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

    with timer('run lgb') as ti:
        # Perform n_folds cross validation
        f1 = cross_validation(config_dict['train'], hyperparameters, 'user_id',
                              'current_service')
        loss = 1 - f1**2  # the tracked score is macro-F1 squared, so minimize its complement

        run_time = ti.get_delay_t0()

    # Append this run's result to the csv log
    with open('hyperparameters.csv', 'a') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, hyperparameters, ITERATION, run_time, 1 - loss])

    log.info('iteration-{} f1:{} loss:{} train_time:{}'.format(
        ITERATION, f1, loss, run_time))
    # Dictionary with information for evaluation
    return {
        'loss': loss,
        'hyperparameters': hyperparameters,
        'iteration': ITERATION,
        'train_time': run_time,
        'status': STATUS_OK
    }
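one_hot2label_index is called in write2file and model() but is not part of these excerpts; a plausible minimal implementation, assuming it receives an (n_samples, n_classes) matrix of class probabilities:

import numpy as np

def one_hot2label_index(pre_label):
    # Row-wise argmax: map an (n_samples, n_classes) probability matrix
    # to a flat array of predicted class indices.
    return np.argmax(pre_label, axis=1)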
Example #8
def model(train,
          test,
          num_folds=5,
          stratified=True,
          num_boost_round=1000,
          save_path='origin_data_save'):
    LABEL_SIZE = train[LABEL].nunique()  # number of distinct target classes

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train.shape, test.shape))

    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    sub_preds = np.zeros(shape=(test.shape[0], LABEL_SIZE))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train.columns if f not in [LABEL, ID]]
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)

        params = {
            'bagging_fraction': 0.94795171020152,
            'bagging_freq': 6,
            'bin_construct_sample_cnt': 200000,
            'boosting_type': 'gbdt',
            'feature_fraction': 0.9953235660931046,
            'is_unbalance': False,
            'learning_rate': 0.005,
            'min_data_in_leaf': 30,
            'num_class': LABEL_SIZE,  # use the class count computed from the labels
            'num_leaves': 80,
            'num_threads': 40,
            'objective': 'multiclass',
            'reg_alpha': 0.001,
            'reg_lambda': 0.1,
            'verbose': -1
        }
        with timer('fold {} train model'.format(i_fold)):
            clf = lgb.train(num_boost_round=num_boost_round,
                            params=params,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=50)
            clf.save_model(
                (save_path + '/model{}_{}.txt').format(i_fold,
                                                       int(time.time())))
        with timer('fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            y_pre = one_hot2label_index(v_data)
            # accumulate fold-averaged test predictions
            sub_preds += clf.predict(test[feats]) / num_folds
            write2file(test[ID], sub_preds, i_fold)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type='gain')
        fold_importance_df["fold"] = i_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        f1 = f1_score(dvalid.label, y_pre, average='macro')
        log.warn('Fold {} f1 : {} score {}'.format(i_fold + 1, f1, f1**2))
        del clf, dtrain, dvalid
        gc.collect()
    display_importances(feature_importance_df)
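The label2index / index2label helpers (and the encode_map / decode_list pair pickled in data_prepare) come from base_data_process and are not shown in these excerpts; a sketch of a compatible pair, labeled hypothetical:

# Hypothetical sketch of the label <-> index helpers assumed above;
# the real versions live in base_data_process.
encode_map = {}
decode_list = []

def label2index(df, label_col):
    # Replace string labels with integer indices, in place.
    global encode_map, decode_list
    decode_list = sorted(df[label_col].unique())
    encode_map = {label: i for i, label in enumerate(decode_list)}
    df[label_col] = df[label_col].map(encode_map)

def index2label(indices):
    # Map predicted class indices back to the original label values.
    return [decode_list[i] for i in indices]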
Example #9


# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = (feature_importance_df_[["feature", "importance"]]
            .groupby("feature").mean()
            .sort_values(by="importance", ascending=False)[:40].index)
    best_features = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance",
                                               ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')


if __name__ == '__main__':
    if not os.path.exists('origin_data_save'):
        os.mkdir('origin_data_save')
    with timer('data process'):
        df_train, df_test = eda()
        label2index(df_train, LABEL)
    with timer('model process'):
        model(df_train, df_test, num_folds=5, num_boost_round=10000)
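Finally, the timer context manager used throughout is repo-internal; below is a compatible sketch (including the get_delay_t0 accessor that objective() relies on), offered as an assumption rather than the original implementation.

import time
from contextlib import contextmanager

class _Timer:
    def __init__(self):
        self.t0 = time.time()

    def get_delay_t0(self):
        # Seconds elapsed since the timed block was entered.
        return time.time() - self.t0

@contextmanager
def timer(name):
    # Yield a timer handle so callers can read the elapsed time,
    # then log the block's total duration on exit.
    ti = _Timer()
    yield ti
    print('{} done in {:.1f}s'.format(name, ti.get_delay_t0()))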