Example #1
def func(x, X, y, start):

    score = numpy.dot(X, x)
    score = score / score.max()  # max-normalize (alternative: sigmoid(score))
    thresh, score = mcc_optimize(score, y)
    if 1:  # numpy.random.random() < 0.1:
        logger.info('  thresh: %s, score: %s, rest:%s' %
                    (thresh, score, time.time() - start))

    if time.time() - start > 1200:  # stop after the 20-minute budget
        logger.info('END  thresh: %s, score: %s' % (thresh, score))
        with open('weight_n.pkl', 'wb') as f:
            pickle.dump(x, f, -1)
        return 'aaa'
    return -score
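The optimizer that calls func is not shown in this snippet. As a hedged sketch only, an objective of this shape could be driven by a derivative-free search such as scipy's Nelder-Mead over the blend weights x; the dummy X and y below stand in for the stacked predictions and labels, and a real run would also have to handle the 'aaa' sentinel returned once the 20-minute budget expires.

import time
import numpy
from scipy.optimize import minimize

rng = numpy.random.RandomState(0)
X = rng.rand(1000, 5)                     # stacked model predictions (dummy)
y = rng.randint(0, 2, 1000)               # binary labels (dummy)
x0 = numpy.ones(X.shape[1]) / X.shape[1]  # start from an equal-weight blend
res = minimize(func, x0, args=(X, y, time.time()), method='Nelder-Mead')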
Example #2
def func(x, model, X, y, start):

    W = Parallel(n_jobs=-1, verbose=0, backend="threading")(
        delayed(parallel_helper)(e, 'predict_proba', X, check_input=False)
        for e in model.estimators_)
    W = numpy.array([w[:, 1] for w in W]).T
    # W = numpy.array([m.predict_proba(X)[:, 1] for m in model.estimators_]).T
    score = numpy.dot(W, x)
    score = score / score.max()  # max-normalize (alternative: sigmoid(score))
    thresh, score = mcc_optimize(score, y)
    if 1:  # numpy.random.random() < 0.1:
        logger.info('  thresh: %s, score: %s, rest:%s' %
                    (thresh, score, time.time() - start))

    if time.time() - start > 1200:
        logger.info('END  thresh: %s, score: %s' % (thresh, score))
        with open('weight.pkl', 'wb') as f:
            pickle.dump(x, f, -1)
        return 'aaa'
    return -score
Example #3
            ans = numpy.array(ans).T
            insample_ans = numpy.array(insample_ans).T
            if all_ans is None:
                all_ans = ans
                all_target = target[test_idx]
                all_ids = ids.ix[test_idx].values
            else:
                all_ans = numpy.r_[all_ans, ans]
                all_target = numpy.r_[all_target, target[test_idx]]
                all_ids = numpy.r_[all_ids, ids.ix[test_idx]]

            model = XGBClassifier(seed=0)
            model.fit(ans, target[test_idx])
            pred = model.predict_proba(ans)[:, 1]
            logger.info('model thresh: %s, score: %s' %
                        mcc_optimize(pred, target[test_idx]))
            pred = ans.max(axis=1)
            logger.info('max thresh: %s, score: %s' %
                        mcc_optimize(pred, target[test_idx]))
            pred = ans.min(axis=1)
            logger.info('min thresh: %s, score: %s' %
                        mcc_optimize(pred, target[test_idx]))

            logger.info('mean thresh: %s, score: %s' %
                        mcc_optimize(ans.mean(axis=1), target[test_idx]))

            for j in range(ans.shape[1]):
                score = roc_auc_score(target[test_idx], ans[:, j])
                logger.info('score: %s' % score)
                logger.info('model thresh: %s, score: %s' %
                            mcc_optimize(ans[:, j], target[test_idx]))
Example #4

@jit
def mcc_scoring2(y_pred_prb, y):
    list_thresh = numpy.arange(1, 100) / 100
    max_score = -1
    idx = None
    for thresh in list_thresh:
        y_pred = numpy.where(y_pred_prb >= thresh, 1, 0)
        score = mcc(y, y_pred)
        if score > max_score:
            max_score = score
            idx = thresh
    return idx, max_score


if __name__ == '__main__':
    logger.info('load start')
    data = pandas.read_csv('stack_1_pred.csv')
    target = data[TARGET_COLUMN_NAME].values
    pred = data['pred'].values

    logger.info('load end')
    logger.info('shape %s %s' % data.shape)
    logger.info('shape %s' % target.shape)
    logger.info('pos num: %s, pos rate: %s' %
                (sum(target), float(sum(target)) / target.shape[0]))

    thresh, score = mcc_optimize(pred, target)
    logger.info('model:%s, thresh: %s, total score: %s' % (0, thresh, score))
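mcc_scoring2 above scans a fixed grid of 99 thresholds. The mcc_optimize helper that these examples import from utils is not shown; a minimal stand-in with the same (threshold, best score) return shape, assuming binary 0/1 labels, could instead evaluate every distinct predicted probability:

import numpy
from sklearn.metrics import matthews_corrcoef


def mcc_optimize_sketch(y_pred_prb, y):
    # hypothetical stand-in for utils.mcc_optimize: return the threshold
    # that maximizes Matthews correlation together with that best score
    best_thresh, best_score = 0.5, -1.0
    for thresh in numpy.unique(y_pred_prb):
        y_pred = (y_pred_prb >= thresh).astype(int)
        score = matthews_corrcoef(y, y_pred)
        if score > best_score:
            best_thresh, best_score = thresh, score
    return best_thresh, best_score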
Example #5
                                train_dmatrix,
                                evals=[(test_dmatrix, 'eval')],
                                feval=evalmcc_xgb_min,
                                num_boost_round=924,
                                early_stopping_rounds=924,
                                verbose_eval=True)

                avg_ntree += booster.best_ntree_limit
                ans = booster.predict(test_dmatrix,
                                      ntree_limit=booster.best_ntree_limit)
                tree_limit = booster.best_ntree_limit
                score = roc_auc_score(target.ix[test_idx].values, ans)
                logger.info('score: %s' % score)
                logger.info('tree: %s' % tree_limit)
                logger.info('model thresh: %s, score: %s' %
                            mcc_optimize(ans, target.ix[test_idx].values))
                logger.info('train_end')
                if all_ans is None:
                    all_ans = ans
                    all_target = target[test_idx]
                    all_ids = data.ix[test_idx].index.values.astype(int)
                else:
                    all_ans = numpy.r_[all_ans, ans]
                    all_target = numpy.r_[all_target, target[test_idx]]
                    all_ids = numpy.r_[
                        all_ids, data.ix[test_idx].index.values.astype(int)]

                ans = booster.predict(test_dmatrix,
                                      ntree_limit=booster.best_iteration - 10)
                logger.info('model thresh: %s, score: %s' %
                            mcc_optimize(ans, target.ix[test_idx].values))
Example #6
                                     n_folds=n_iter,
                                     shuffle=True,
                                     random_state=ep)
            avg_cost = 0.
            logger.info('epoch: %s' % ep)
            for i, (_, batch_idx) in enumerate(batchs):
                logger.info(' batch: %s/%s' % (i + 1, n_iter))
                batch_xs = train_data[batch_idx]
                batch_ys = train_target[batch_idx]
                _, cost = sess.run([train_step, loss],
                                   feed_dict={
                                       x: batch_xs,
                                       y_: batch_ys.reshape(-1, 1)
                                   })
                #train_step.run({x: batch_xs, y_: batch_ys.reshape(-1, 1)})
                avg_cost += cost / n_iter
            logger.info('loss: %s' % avg_cost)
            pred = y.eval({x: test_data}, session=sess).reshape((1, -1))[0]
            print(pred)
            score = mcc_optimize(pred, test_target)
            logger.info('score: %s %s' % score)
            score = roc_auc_score(test_target, pred)
            logger.info('auc: %s' % score)

        pred = y.eval({x: test_data}, session=sess).reshape((1, -1))[0]

        score = mcc_optimize(pred, test_target)
        logger.info('score: %s %s' % score)
        score = roc_auc_score(test_target, pred)
        logger.info('auc: %s' % score)
Example #7
        pred /= sum(params.values())

        return pred.clip(0, 1)

    def predict(params):
        # use_preds = [pred for i, pred in enumerate(list_preds) if params[i]]
        use_preds = [params[path] * load_test(path) for path in list_dir]
        # pred = np.mean(use_preds, axis=0)
        pred = np.sum(use_preds, axis=0)
        pred /= sum(params.values())
        return pred.clip(0, 1)

    trials = Trials()
    min_params = optimize(list_dir, score_func, trials)
    logger.info(f'min params: {min_params}')
    preds = pred_func({i: min_params[path] for i, path in enumerate(list_dir)})

    best_proba, sc = mcc_optimize(preds, y_train)
    logger.warning('search: %s' % sc)

    list_test = [load_test(path) for path in list_dir]
    p_test = predict(min_params)

    ids = np.loadtxt('ids.npy')

    sub = pd.DataFrame()
    sub['Id'] = ids.astype(int)
    sub['Response'] = (p_test >= best_proba).astype(int)  # 0/1 at the MCC-optimal threshold
    sub.to_csv(DIR + 'submit_ens.csv', index=False)
    logger.info('exit')
Example #8
            logger.info('train_end')
            ans = numpy.array(ans).T
            insample_ans = numpy.array(insample_ans).T
            if all_ans is None:
                all_ans = ans
                all_target = target[test_idx]
                all_ids = ids.ix[test_idx].values
            else:
                all_ans = numpy.r_[all_ans, ans]
                all_target = numpy.r_[all_target, target[test_idx]]
                all_ids = numpy.r_[all_ids, ids.ix[test_idx]]

            model = XGBClassifier(seed=0)
            model.fit(ans, target[test_idx])
            pred = model.predict_proba(ans)[:, 1]
            logger.info('model thresh: %s, score: %s' %
                        mcc_optimize(pred, target[test_idx]))
            pred = ans.max(axis=1)
            logger.info('max thresh: %s, score: %s' %
                        mcc_optimize(pred, target[test_idx]))
            pred = ans.min(axis=1)
            logger.info('min thresh: %s, score: %s' %
                        mcc_optimize(pred, target[test_idx]))
            score = roc_auc_score(target[test_idx], ans[:, -1])
            logger.info('mean thresh: %s, score: %s' %
                        mcc_optimize(ans.mean(axis=1), target[test_idx]))
            logger.info('all thresh: %s, score: %s' %
                        mcc_optimize(ans[:, -1], target[test_idx]))
            logger.info('score: %s' % score)
            score = roc_auc_score(target[test_idx], pred)
            logger.info('INSAMPLE score: %s' % score)
            pred = model.predict_proba(insample_ans)[:, 1]  # ans.max(axis=1)
            score = roc_auc_score(target[train_idx], pred)
            logger.info('INSAMPLE train score: %s' % score)

            list_estimator.append(model)
Example #9
        logger.info('%s/%s param: %s' % (i + 1, len(pg), params))
        pred_proba_all = []
        y_true = []
        for train_idx, test_idx in cv:
            model = NMOpt()

            model.fit(data[train_idx], target[train_idx])

            # pred_proba = data[test_idx, -1]
            pred_proba = model.predict_proba(data[test_idx])[:, 1]
            pred_proba_all = numpy.r_[pred_proba_all, pred_proba]
            y_true = numpy.r_[y_true, target[test_idx]]
            score = roc_auc_score(target[test_idx], pred_proba)
            # logger.info('    score: %s' % score)
            list_score.append(score)
            thresh, score = mcc_optimize(pred_proba, target[test_idx])
            logger.info('    thresh: %s, score: %s' % (thresh, score))
        score = numpy.mean(list_score)
        thresh, score = mcc_optimize(pred_proba_all, y_true)
        max_score = max(max_score, score)
        logger.info('thresh: %s, total score: %s, max_score: %s' %
                    (thresh, score, max_score))
        if max_score == score:
            best_param = params
            best_thresh = thresh
    logger.info('best_thresh: %s, total max score: %s' %
                (best_thresh, max_score))
    # model = XGBClassifier(seed=0)
    # model = LogisticRegression(n_jobs=-1, class_weight='balanced')
    # model.set_params(**best_param)
    model = NMOpt()
Example #10
            logger.info('train_end')
            if all_ans is None:
                all_ans = ans
                all_target = target[test_idx]
                all_ids = data.ix[test_idx].index.values.astype(int)
            else:
                all_ans = numpy.r_[all_ans, ans]
                all_target = numpy.r_[all_target, target[test_idx]]
                all_ids = numpy.r_[all_ids,
                                   data.ix[test_idx].index.values.astype(int)]

            score = roc_auc_score(target[test_idx], ans)
            logger.info('score: %s' % score)
            logger.info('model thresh: %s, score: %s' %
                        mcc_optimize(ans, target[test_idx]))

        logger.info('cv model thresh: %s, score: %s' %
                    mcc_optimize(all_ans, all_target))

    for i in ['']:
        logger.info('model: %s' % i)
        cols = [col for col in feature_column if 'L%s' % i in col]
        logger.info('model xg: %s' % i)
        model = XGBClassifier(seed=0)
        model.set_params(**params)
        model.fit(data[cols], target)

    ids = pandas.read_csv('stack_1_id_1.csv')['0'].values
    _data = pandas.read_csv('stack_1_data_1.csv')
    logger.info('old data %s %s' % _data.shape)
Example #11
    for params in ParameterGrid(all_params):
        logger.info('param: %s' % (params))
        for train_idx, test_idx in list(cv)[:1]:
            with gzip.open('train_fm.svm', 'wb') as f:
                dump_svmlight_file(data[train_idx], target[train_idx], f)
            del output
            gc.collect()
            with gzip.open('test_svm.svm', 'wb') as f:
                dump_svmlight_file(data[test_idx], target[test_idx], f)

            model = TFFMClassifier(
                order=2,
                rank=10,
                optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
                n_epochs=50,
                batch_size=100000,
                init_std=0.001,
                reg=0.001,
                input_type='sparse')
            """
            model = FMClassification()
            """
            model.fit(data[train_idx], target[train_idx], show_progress=True)
            ans = model.predict_proba(data[test_idx])[:, 1]

            score = roc_auc_score(target[test_idx], ans)
            logger.info('score: %s' % score)
            logger.info('all thresh: %s, score: %s' %
                        mcc_optimize(ans, target[test_idx]))
            score = roc_auc_score(target[test_idx], ans)
Example #12
                gc.collect()
                model.set_params(**params)
                if 1:
                    model.fit(data.ix[train_idx, cols], target[train_idx])
                else:
                    model.fit(data.ix[train_idx, cols],
                              target[train_idx],
                              eval_set=[(data.ix[test_idx,
                                                 cols], target[test_idx])],
                              eval_metric=evalmcc_xgb_min,
                              early_stopping_rounds=1000,
                              verbose=True)

                ans = model.predict_proba(data.ix[test_idx, cols])[:, 1]
                score = roc_auc_score(target[test_idx], ans)
                thresh, mcc = mcc_optimize(ans, target[test_idx])
                logger.info('auc: %s thresh: %s, score: %s' %
                            (score, thresh, mcc))
                """
                for t in range(1, 101):
                    ans = model.predict_proba(data.ix[test_idx, cols], ntree_limit=t)[:, 1]
                    score = roc_auc_score(target[test_idx], ans)
                    logger.info('    score: %s' % score)
                    logger.info('    model thresh: %s, score: %s' % mcc_optimize(ans, target[test_idx]))
                """
            logger.info('train_end')
            if all_ans is None:
                all_ans = ans
                all_target = target[test_idx]
                all_ids = ids.ix[test_idx].values
            else:
Example #13
TEST_DATA = os.path.join(DATA_DIR, 'test_simple_join.csv.gz')
TARGET_COLUMN_NAME = u'Response'

from utils import mcc_optimize, evalmcc_xgb_min
from feature import LIST_FEATURE_COLUMN_NAME
log_fmt = '%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s '
logging.basicConfig(format=log_fmt, datefmt='%Y-%m-%d/%H:%M:%S', level='INFO')
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    logger.info('load start')
    target = pandas.read_csv('stack_1_target_1.csv')['0'].values
    data = pandas.read_csv('stack_1_data_1.csv').values
    logger.info('load end')
    logger.info('shape %s %s' % data.shape)
    logger.info('shape %s' % target.shape)
    logger.info('pos num: %s, pos rate: %s' %
                (sum(target), float(sum(target)) / target.shape[0]))

    with open('list_xgb_model.pkl', 'rb') as f:
        list_model = pickle.load(f)

    for i in range(data.shape[1]):
        thresh, score = mcc_optimize(data[:, i], target)
        auc_score = roc_auc_score(target, data[:, i])
        print('"%s"' % list_model[i].__repr__(),
              auc_score,
              thresh,
              score,
              sep=',')
Example #14
                    except AttributeError:
                        ans.append(
                            model.predict_proba(data.ix[test_idx, cols])[:, 1])

            ans = numpy.array(ans).T
            if all_ans is None:
                all_ans = ans
                all_target = target[test_idx]
                all_ids = data.ix[test_idx].index.values.astype(int)
            else:
                all_ans = numpy.r_[all_ans, ans]
                all_target = numpy.r_[all_target, target[test_idx]]
                all_ids = numpy.r_[all_ids,
                                   data.ix[test_idx].index.values.astype(int)]

            for j in range(ans.shape[1]):
                score = roc_auc_score(target[test_idx], ans[:, j])
                logger.info('score: %s' % score)
                logger.info('model thresh: %s, score: %s' %
                            mcc_optimize(ans[:, j], target[test_idx].values))

    pandas.DataFrame(all_ans).to_csv('stack_1_data_1.csv.gz',
                                     index=False,
                                     compression='gzip')
    pandas.DataFrame(all_target).to_csv('stack_1_target_1.csv.gz',
                                        index=False,
                                        compression='gzip')
    pandas.DataFrame(all_ids).to_csv('stack_1_id_1.csv.gz',
                                     index=False,
                                     compression='gzip')
Example #15
def loss_func(y, pred):
    # negative of the best achievable MCC, so it can be minimized directly
    best_proba, best_mcc = mcc_optimize(pred, y)
    return -best_mcc
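loss_func already has the "smaller is better" shape a minimizer expects. As a hedged sketch, it could drive a hyperopt search over a single blend weight; preds_a, preds_b and y_train below are placeholder arrays, not names taken from these scripts.

from hyperopt import Trials, fmin, hp, tpe


def objective(params):
    # blend two prediction vectors and score the blend by negative best MCC
    blend = params['w'] * preds_a + (1.0 - params['w']) * preds_b
    return loss_func(y_train, blend.clip(0, 1))


trials = Trials()
best = fmin(objective, space={'w': hp.uniform('w', 0, 1)},
            algo=tpe.suggest, max_evals=50, trials=trials)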
Example #16
            insample_ans = numpy.array(insample_ans).T
            if all_ans is None:
                all_ans = ans
                all_target = target.ix[test_idx].values
                all_ids = data.ix[test_idx].index.values.astype(int)
            else:
                all_ans = numpy.r_[all_ans, ans]
                all_target = numpy.r_[all_target, target.ix[test_idx].values]
                all_ids = numpy.r_[all_ids,
                                   data.ix[test_idx].index.values.astype(int)]

            model = XGBClassifier(seed=0)
            model.fit(ans, target.ix[test_idx])
            pred = model.predict_proba(ans)[:, 1]
            logger.info('model thresh: %s, score: %s' %
                        mcc_optimize(pred, target.ix[test_idx].values))
            pred = ans.max(axis=1)
            logger.info('max thresh: %s, score: %s' %
                        mcc_optimize(pred, target.ix[test_idx].values))
            pred = ans.min(axis=1)
            logger.info('min thresh: %s, score: %s' %
                        mcc_optimize(pred, target.ix[test_idx].values))

            logger.info(
                'mean thresh: %s, score: %s' %
                mcc_optimize(ans.mean(axis=1), target.ix[test_idx].values))

            for j in range(ans.shape[1]):
                score = roc_auc_score(target.ix[test_idx].values, ans[:, j])
                logger.info('score: %s' % score)
                logger.info(
Example #17
def train():

    df = pd.concat([
        pd.read_feather('train_0713.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_feat_agg.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_feat_agg_sec.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_hash_cnt.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_hash_cnt_nos38.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_hash_cnt_sec.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_date_min.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_num_pass_sec.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_diff.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_time_mean.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_time_mean_norm.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_magic.ftr', nthreads=8)[[
            'magic1', 'magic2', 'magic3', 'magic4'
        ]].astype(DTYPE),
    ],
                   axis=1)

    # drop columns that had zero importance in earlier runs
    for result_dir in ['result_0715_allfeat', 'result_0715_magic',
                       'result_0715_sec', 'result_0716_sec_hash',
                       'result_0716_num_sec', 'result_0716_rate001',
                       'result_0717_s38', 'result_0717_diff',
                       'result_0717_time_mean', 'result_0718_time_mean']:
        df_cols = pd.read_csv(f'{result_dir}/feature_importances.csv')
        drop_cols = df_cols[df_cols['imp'] == 0]['col'].values
        df.drop(drop_cols, axis=1, errors='ignore', inplace=True)

    logger.info(f'load 1 {df.shape}')

    y_train = df['Response'].values
    df.drop(['Response', 'Id'], axis=1, errors='ignore', inplace=True)

    logger.info(f'load dropcols {df.shape}')
    gc.collect()
    x_train = df.values  # sparse.csc_matrix(df.values, dtype=DTYPE)

    usecols = df.columns.values.tolist()

    del df
    gc.collect()

    logger.info('train data size {}'.format(x_train.shape))
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)

    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)

    # {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_bin': 511, 'max_depth': -1, 'metric': 'None', 'min_child_weight': 10, 'min_split_gain': 0, 'num_leaves': 31, 'objective': 'binary', 'reg_alpha': 1, 'scale_pos_weight': 1, 'seed': 114, 'subsample': 0.99, 'subsample_freq': 1, 'verbose': -1, 'xgboost_dart_mode': True}
    all_params = {
        'min_child_weight': [10],
        'subsample': [0.99],
        'subsample_freq': [1],
        'seed': [114],
        'colsample_bytree': [0.7],
        'learning_rate': [0.01],
        'max_depth': [-1],
        'min_split_gain': [0],
        'reg_alpha': [1],
        'max_bin': [511],
        'num_leaves': [31],
        'objective': ['binary'],
        'scale_pos_weight': [1],
        'verbose': [-1],
        'boosting_type': ['gbdt'],
        'metric': ["None"],
        'xgboost_dart_mode': [True],
        # 'device': ['gpu'],
    }

    use_score = 0
    min_score = (100, 100, 100)
    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        all_pred = np.zeros(y_train.shape[0])
        for train, test in cv.split(x_train, y_train):
            cnt += 1
            trn_x = x_train[
                train]  # [[i for i in range(x_train.shape[0]) if train[i]]]
            val_x = x_train[
                test]  # [[i for i in range(x_train.shape[0]) if test[i]]]
            trn_y = y_train[train]
            val_y = y_train[test]
            train_data = lgb.Dataset(
                trn_x,  # .values.astype(np.float32),
                label=trn_y,
                feature_name=usecols)
            test_data = lgb.Dataset(
                val_x,  # .values.astype(np.float32),
                label=val_y,
                feature_name=usecols)
            del trn_x
            gc.collect()
            clf = lgb.train(
                params,
                train_data,
                100000,  # params['n_estimators'],
                early_stopping_rounds=500,
                valid_sets=[test_data],
                feval=cst_metric_xgb,
                # callbacks=[callback],
                verbose_eval=10)
            pred = clf.predict(val_x).clip(0, 1)

            all_pred[test] = pred
            best_proba, best_mcc = mcc_optimize(pred, val_y)
            _score = -best_mcc
            _score2 = log_loss(val_y, pred)

            logger.info('   _score: %s' % _score)
            logger.info('   _best_proba: %s' % best_proba)
            logger.info('   _score2: %s' % _score2)

            list_score.append(_score)
            list_score2.append(_score2)

            if clf.best_iteration != 0:
                list_best_iter.append(clf.best_iteration)
            else:
                list_best_iter.append(params['n_estimators'])

            with open(DIR + 'train_cv_pred_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(pred, f, -1)
            with open(DIR + 'model_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(clf, f, -1)
            gc.collect()
            break
        with open(DIR + 'train_cv_tmp.pkl', 'wb') as f:
            pickle.dump(all_pred, f, -1)

        logger.info('trees: {}'.format(list_best_iter))
        # trees = np.mean(list_best_iter, dtype=int)
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2),
                  np.max(list_score2))

        logger.info('param: %s' % (params))
        logger.info('cv: {}'.format(list_score))
        logger.info('cv2: {}'.format(list_score2))

        logger.info('loss: {} (avg min max {})'.format(score[use_score],
                                                       score))
        logger.info('qwk: {} (avg min max {})'.format(score2[use_score],
                                                      score2))

        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params
        logger.info('best score: {} {}'.format(min_score[use_score],
                                               min_score))

        logger.info('best params: {}'.format(min_params))

    imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    imp['col'] = usecols
    n_features = imp.shape[0]
    imp = imp.sort_values('imp', ascending=False)
    imp.to_csv(DIR + 'feature_importances_0.csv')
    logger.info('imp use {} {}'.format(imp[imp.imp > 0].shape, n_features))

    del val_x
    del trn_y
    del val_y
    del train_data
    del test_data
    gc.collect()

    trees = np.mean(list_best_iter)

    logger.info('all data size {}'.format(x_train.shape))

    train_data = lgb.Dataset(x_train, label=y_train, feature_name=usecols)
    del x_train
    gc.collect()
    logger.info('train start')
    clf = lgb.train(
        min_params,
        train_data,
        int(trees * 1.1),
        feval=cst_metric_xgb,
        # valid_sets=[train_data],
        verbose_eval=10,
        callbacks=[callback])
    logger.info('train end')
    with open(DIR + 'model.pkl', 'wb') as f:
        pickle.dump(clf, f, -1)
    # del x_train
    gc.collect()

    logger.info('save end')
    return best_proba
Example #18
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.1],
        'min_child_weight': [1],
        'subsample': [1],
        'reg_alpha': [0, 0.1, 0.01],
        'colsample_bytree': [1],
        'scale_pos_weight': [1]
    }
    _all_params = {'C': [10**i for i in range(-3, 2)], 'penalty': ['l2']}
    cv = StratifiedKFold(target, n_folds=5, shuffle=True, random_state=0)
    list_score = []
    max_score = -100
    best_thresh = None
    pg = list(ParameterGrid(all_params))
    for i in range(data.shape[1]):
        thresh, score = mcc_optimize(data[:, i], target)
        logger.info('model:%s, thresh: %s, total score: %s, max_score: %s' %
                    (i, thresh, score, max_score))

    for i, params in enumerate(pg):
        logger.info('%s/%s param: %s' % (i + 1, len(pg), params))
        pred_proba_all = []
        y_true = []
        for train_idx, test_idx in cv:
            model = XGBClassifier(seed=0)
            #model = LogisticRegression(n_jobs=-1, class_weight='balanced')
            model.set_params(**params)

            model.fit(data[train_idx],
                      target[train_idx],
                      eval_metric=evalmcc_xgb_min,
Example #19
                """
                booster = train(params, train_dmatrix,
                        evals=[(test_dmatrix, 'eval')],
                        feval=evalmcc_xgb_min,
                        num_boost_round=700,
                        early_stopping_rounds=200,
                        verbose_eval=True)
                """
                avg_ntree += booster.best_ntree_limit
                ans = booster.predict(test_dmatrix,
                                      ntree_limit=booster.best_ntree_limit)
                tree_limit = booster.best_ntree_limit
                score = roc_auc_score(target.ix[test_idx].values, ans)
                logger.info('score: %s' % score)
                logger.info('tree: %s' % tree_limit)
                logger.info('model thresh: %s, score: %s' %
                            mcc_optimize(ans, target.ix[test_idx].values))
                logger.info('train_end')
                if all_ans is None:
                    all_ans = ans
                    all_target = target[test_idx]
                    all_ids = data.ix[test_idx].index.values.astype(int)
                else:
                    all_ans = numpy.r_[all_ans, ans]
                    all_target = numpy.r_[all_target, target[test_idx]]
                    all_ids = numpy.r_[all_ids, data.ix[test_idx].index.values.astype(int)]

                ans = booster.predict(test_dmatrix,
                                      ntree_limit=booster.best_iteration - 10)
                logger.info('model thresh: %s, score: %s' %
                            mcc_optimize(ans, target.ix[test_idx].values))

                ans = booster.predict(test_dmatrix,
Example #20
def cst_metric_xgb(pred, dtrain):
    # LightGBM custom eval: return (name, value, is_higher_better)
    label = dtrain.get_label().astype(int)
    best_proba, best_mcc = mcc_optimize(pred, label)
    return 'mcc', best_mcc, True
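cst_metric_xgb follows the (name, value, is_higher_better) convention for a custom evaluation function, and Examples #17 and #21 pass it to lgb.train as feval so early stopping tracks MCC instead of a built-in metric. A minimal wiring sketch on dummy data, assuming mcc_optimize is importable as in the snippets above:

import numpy as np
import lightgbm as lgb

X = np.random.rand(2000, 10)
y = np.random.randint(0, 2, 2000)
train_data = lgb.Dataset(X[:1500], label=y[:1500])
valid_data = lgb.Dataset(X[1500:], label=y[1500:])
clf = lgb.train({'objective': 'binary', 'metric': 'None', 'verbose': -1},
                train_data,
                num_boost_round=200,
                valid_sets=[valid_data],
                feval=cst_metric_xgb,
                early_stopping_rounds=20,
                verbose_eval=False)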
Example #21
def train(x_train):

    #y_train = pd.read_feather('train_0713.ftr')['Response'].values
    #np.savetxt('y_train.npy', y_train)
    y_train = np.loadtxt('y_train.npy')

    usecols = x_train.columns.values.tolist()

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)

    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)
    for _, test in cv.split(x_train, y_train):
        x_train = x_train.iloc[test].values
        y_train = y_train[test]
        np.savetxt('index.npy', test)
        break

    all_params = {
        'boosting_type': 'gbdt',
        'colsample_bytree': 0.8,
        'learning_rate': 0.01,
        'max_bin': 255,
        'max_depth': -1,
        'metric': 'None',
        'min_child_weight': 50,
        'min_split_gain': 0.01,
        'num_leaves': 15,
        'objective': 'xentropy',
        'reg_alpha': 0,
        'scale_pos_weight': 1,
        'seed': 114514,
        'subsample': 1,
        'subsample_freq': 0,
        'verbose': -1
    }
    """
    all_params = {'min_child_weight': [80],
                  'subsample': [1],
                  'subsample_freq': [0],
                  'seed': [114514],
                  'colsample_bytree': [0.8],
                  'learning_rate': [0.01],
                  'max_depth': [4],
                  'min_split_gain': [0.01],
                  'reg_alpha': [0.001],
                  'reg_lambda': [0.1],
                  'max_bin': [255],
                  'num_leaves': [15],
                  'objective': ['xentropy'],
                  'scale_pos_weight': [1],
                  'verbose': [-1],
                  'boosting_type': ['gbdt'],
                  'metric': ['rmse'],
                  # 'skip_drop': [0.7],
                  }
    """
    all_params = {k: [v] for k, v in all_params.items()}
    use_score = 0
    min_score = (100, 100, 100)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=871)
    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        list_thresh = []
        all_pred = np.zeros(y_train.shape[0])
        for train, test in cv.split(x_train, y_train):
            cnt += 1
            trn_x = x_train[train]
            val_x = x_train[test]
            trn_y = y_train[train]
            val_y = y_train[test]

            train_data = lgb.Dataset(trn_x, label=trn_y, feature_name=usecols)
            test_data = lgb.Dataset(val_x, label=val_y, feature_name=usecols)
            del trn_x
            gc.collect()
            clf = lgb.train(
                params,
                train_data,
                100000,  # params['n_estimators'],
                early_stopping_rounds=100,
                valid_sets=[test_data],
                feval=cst_metric_xgb,
                # callbacks=[callback],
                verbose_eval=10)
            pred = clf.predict(val_x).clip(0, 1)

            all_pred[test] = pred

            best_thresh, _score = mcc_optimize(pred, val_y)
            _score2 = _score  # - roc_auc_score(val_y, pred)

            logger.info('   _score: %s' % _score)
            logger.info('   _thresh: %s' % best_thresh)
            logger.info('   _score2: %s' % _score2)
            list_thresh.append(best_thresh)
            list_score.append(_score)
            list_score2.append(_score2)

            if clf.best_iteration != 0:
                list_best_iter.append(clf.best_iteration)
            else:
                list_best_iter.append(params['n_estimators'])

            with open(DIR + 'train_cv_pred_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(pred, f, -1)
            with open(DIR + 'model_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(clf, f, -1)
            gc.collect()
        with open(DIR + 'train_cv_tmp.pkl', 'wb') as f:
            pickle.dump(all_pred, f, -1)

        logger.info('trees: {}'.format(list_best_iter))
        # trees = np.mean(list_best_iter, dtype=int)
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2),
                  np.max(list_score2))

        logger.info('param: %s' % (params))
        logger.info('cv: {}'.format(list_score))
        logger.info('thresh: {}'.format(list_thresh))
        logger.info('cv2: {}'.format(list_score2))

        logger.info('loss: {} (avg min max {})'.format(score[use_score],
                                                       score))
        logger.info('all loss: {}'.format(
            np.sqrt(mean_squared_error(y_train, all_pred))))
        logger.info('qwk: {} (avg min max {})'.format(score2[use_score],
                                                      score2))

        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params
        logger.info('best score: {} {}'.format(min_score[use_score],
                                               min_score))

        logger.info('best params: {}'.format(min_params))

    imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    imp['col'] = usecols
    n_features = imp.shape[0]
    imp = imp.sort_values('imp', ascending=False)
    imp.to_csv(DIR + 'feature_importances_0.csv')
    logger.info('imp use {} {}'.format(imp[imp.imp > 0].shape, n_features))

    del val_x
    del trn_y
    del val_y
    del train_data
    del test_data
    gc.collect()

    trees = np.mean(list_best_iter)

    logger.info('all data size {}'.format(x_train.shape))

    train_data = lgb.Dataset(x_train, label=y_train, feature_name=usecols)
    del x_train
    gc.collect()
    logger.info('train start')
    clf = lgb.train(min_params,
                    train_data,
                    int(trees * 1.1),
                    valid_sets=[train_data],
                    verbose_eval=10)
    logger.info('train end')
    with open(DIR + 'model.pkl', 'wb') as f:
        pickle.dump(clf, f, -1)
    with open(DIR + 'list_thresh', 'wb') as f:
        pickle.dump(list_thresh, f, -1)
    # del x_train
    gc.collect()

    logger.info('save end')
Example #22
                                                                             1]

            logger.info('train_end')
            """
            if all_ans is None:
                all_ans = ans
                all_target = target[test_idx]
                all_ids = ids.ix[test_idx].values
            else:
                all_ans = numpy.r_[all_ans, ans]
                all_target = numpy.r_[all_target, target[test_idx]]
                all_ids = numpy.r_[all_ids, ids.ix[test_idx]]
            """
            score_auc = roc_auc_score(target[test_idx], ans)
            logger.info('score: %s' % score_auc)
            thresh, score = mcc_optimize(ans, target[test_idx])
            logger.info('model thresh: %s, score: %s' % (thresh, score))

        if add_col is None:
            base_score = score
            base_score_auc = score_auc
            continue

        if score > base_score:
            logger.info('col: %s, mcc is good %s' %
                        (add_col, score - base_score))
            feature_column_use.append(add_col)
            pandas.DataFrame(feature_column_use).to_csv(
                'feature_column_use_mcc.csv')

        if score_auc > base_score_auc:
Example #23
            model.fit(data[train_idx], target[train_idx],
                      eval_metric=evalmcc_xgb_min,
                      verbose=False)

            #pred_proba = data[test_idx, -1]
            pred_proba = model.predict_proba(data[test_idx])[:, 1]
            pred_proba_all = numpy.r_[pred_proba_all, pred_proba]
            y_true = numpy.r_[y_true, target[test_idx]]
            score = roc_auc_score(target[test_idx], pred_proba)
            #logger.info('    score: %s' % score)
            #thresh, score = mcc_scoring(model, data[test_idx], target[test_idx])
            list_score.append(score)
            #logger.info('    thresh: %s' % thresh)
        score = numpy.mean(list_score)
        thresh, score = mcc_optimize(pred_proba_all, y_true)
        max_score = max(max_score, score)
        logger.info('thresh: %s, total score: %s, max_score: %s' %
                    (thresh, score, max_score))
        if max_score == score:
            best_param = params
            best_thresh = thresh
    logger.info('best_thresh: %s, total max score: %s' %
                (best_thresh, max_score))
    model = XGBClassifier(seed=0)
    #model = LogisticRegression(n_jobs=-1, class_weight='balanced')
    model.set_params(**best_param)
    model.fit(data[train_idx], target[train_idx],
              eval_metric=evalmcc_xgb_min,
              verbose=False)

    with open('stack_model_1.pkl', 'wb') as f:
        pickle.dump(model, f, -1)