Example #1
def drop_feature(X_train,
                 y_train,
                 X_test='',
                 coverage_threshold=0.1,
                 ks_threshold=0.05):

    if not coverage_threshold and not ks_threshold:
        return (X_train, X_test)

    sample_num = len(X_train)
    for col in X_train.columns:

        # Restrict to the non-missing rows of this column
        nan_index = pd.isnull(X_train[col])
        x_col = X_train[col][~nan_index]
        y_col = y_train[~nan_index]

        # Coverage, number of distinct values, and max KS against the label
        # (KS is skipped and set to 1 when there are too few distinct values)
        coverage_ratio = len(x_col) / float(sample_num)
        class_num = len(set(x_col))
        ks_value = max(ks.ks_analysis(
            x_col.values, y_col.values)['ks_value']) if class_num > 2 else 1

        # Drop (nearly) constant, poorly covered, or uninformative columns
        if any([
                class_num < 2, coverage_ratio < coverage_threshold,
                ks_value < ks_threshold
        ]):
            X_train = X_train.drop([col], axis=1)
            if len(X_test): X_test = X_test.drop([col], axis=1)

    return (X_train, X_test)
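
A minimal usage sketch for drop_feature: the toy frame, the column names, and the assumption that drop_feature and the project-internal ks helper are importable are mine, not part of the original code.

import numpy as np
import pandas as pd

# Toy data: one informative column, one constant column (class_num < 2),
# and one mostly-missing column (coverage 0.05, below the 0.1 threshold).
rng = np.random.RandomState(0)
X_train = pd.DataFrame({
    'f_good': rng.rand(100),
    'f_const': np.ones(100),
    'f_sparse': [np.nan] * 95 + [1.0, 2.0, 1.0, 2.0, 1.0],
})
y_train = (X_train['f_good'] > 0.5).astype(int)

X_train_kept, _ = drop_feature(X_train, y_train,
                               coverage_threshold=0.1, ks_threshold=0.05)
print(X_train_kept.columns.tolist())  # 'f_const' and 'f_sparse' are dropped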
Example #2
    def ks_analysises(self):

        # Run ks_analysis over all features to evaluate their effectiveness
        print('start ks_analysis...')
        print('[total|done|todo]')

        dfks = pd.DataFrame()
        features_num = len(self.__features)
        for i, col in enumerate(self.__features):
            try:
                dfcol = ks.ks_analysis(self.__dfdata[col].values,
                                       self.__dfdata['label'].values)
                ks_value = max(dfcol['ks_value'])
                dfcol.index = [[col] * len(dfcol), [ks_value] * len(dfcol),
                               range(len(dfcol))]
                dfks = pd.concat([dfks, dfcol])
                dfks.index.names = ['feature', 'ks', 'seq']
            except Exception:
                # Skip features that ks_analysis cannot handle
                pass
            if np.mod(i + 1, 100) == 0:
                print('[{}|{}|{}]'.format(features_num, i + 1,
                                          features_num - i - 1))
        print('[{}|{}|{}]'.format(features_num, i + 1, features_num - i - 1))
        dfks = dfks.sort_index(axis=0,
                               level=[1, 0, 2],
                               ascending=[False, True, True])
        return (dfks)
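
A hedged consumption sketch: `model` is a hypothetical instance of the enclosing class. The returned frame carries a (feature, ks, seq) MultiIndex, so per-feature KS values can be read straight off the index.

# `model` is a hypothetical instance of the enclosing class
dfks = model.ks_analysises()

# One row per (feature, ks, seq); the KS value is repeated across a feature's
# rows, so dropping duplicates of the first two index levels gives a ranking.
feature_ks = (dfks.reset_index()[['feature', 'ks']]
                  .drop_duplicates()
                  .sort_values('ks', ascending=False))
print(feature_ks.head(10))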
Example #3
    def test(self, clf, dftest=pd.DataFrame()):

        info = "\nstart test model ... "
        print(info)
        self.report_info = self.report_info + info + '\n'

        # If a new dftest is passed in, the data needs to be preprocessed again
        if len(dftest) > 0:

            print('preprocessing test data...\n')

            # Suppress printed output during data preprocessing
            stdout = sys.stdout
            sys.stdout = open(os.devnull, 'w')

            X_train, y_train, X_test, y_test = self.preprocess_data(
                self.dftrain, dftest)

            # Restore printed output
            sys.stdout = stdout

            # Preprocessed train and test sets
            self.X_train, self.y_train = X_train, y_train
            self.X_test, self.y_test = X_test, y_test

        y_test_hat = clf.predict_proba(self.X_test)[:, -1]
        dfks_test = ks.ks_analysis(y_test_hat, np.ravel(self.y_test))
        ks_test = max(dfks_test['ks_value'])
        auc_test = metrics.roc_auc_score(np.ravel(self.y_test), y_test_hat)

        info = 'test: ks = {} \t auc = {} '.format(ks_test, auc_test) + '\n'
        prettyks = ks.print_ks(y_test_hat, np.ravel(self.y_test))
        info = info + str(prettyks) + '\n'
        print(info)
        self.report_info = self.report_info + info + '\n'
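
The method above silences preprocessing chatter by swapping sys.stdout for os.devnull and restoring it afterwards. A minimal self-contained sketch of the same idea using the standard library's contextlib.redirect_stdout (an alternative shown for clarity, not the author's code), which also restores stdout if the suppressed block raises:

import contextlib
import os

# Everything printed inside the with-block is discarded; sys.stdout is
# restored automatically when the block exits, even on an exception.
with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
    print('preprocessing chatter that nobody needs to see')
print('visible again')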
Example #4
    def test(self, bst, dftest=pd.DataFrame()):

        info = "\nstart test xgboost model ... \n"
        print(info)
        self.report_info = self.report_info + info + '\n'

        # If a new dftest is passed in, the data needs to be preprocessed again
        if len(dftest) > 0:

            print('preprocessing test data...')

            # Suppress printed output during data preprocessing
            stdout = sys.stdout
            sys.stdout = open(os.devnull, 'w')

            X_train, y_train, X_test, y_test = self.preprocess_data(
                self.dftrain, dftest)

            # Restore printed output
            sys.stdout = stdout

            # Preprocessed test set
            self.X_test, self.y_test = X_test, y_test
            self.dtest = xgb.DMatrix(self.X_test, self.y_test['label'])

        y_test_hat = bst.predict(self.dtest)
        dfks_test = ks.ks_analysis(y_test_hat, np.ravel(self.y_test))
        ks_test = max(dfks_test['ks_value'])
        auc_test = auc(np.ravel(self.y_test), y_test_hat)

        info = 'test: ks = {} \t auc = {} '.format(ks_test, auc_test) + '\n'
        prettyks = ks.print_ks(y_test_hat, np.ravel(self.y_test))
        info = info + str(prettyks) + '\n'
        print(info)
        self.report_info = self.report_info + info + '\n'
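
For orientation, the two numbers reported here can be approximated with standard libraries; a hedged sketch in which scipy's two-sample KS on the class-conditional score distributions stands in for the project-internal ks.ks_analysis (which bins the scores first), and sklearn's roc_auc_score stands in for the repo's auc helper:

import numpy as np
from scipy.stats import ks_2samp
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1, 0, 1])
scores = np.array([0.10, 0.30, 0.80, 0.70, 0.20, 0.90])

# KS: largest gap between the score CDFs of the two classes
ks_stat = ks_2samp(scores[y_true == 1], scores[y_true == 0]).statistic
auc_val = roc_auc_score(y_true, scores)
print(ks_stat, auc_val)  # 1.0 1.0 for this perfectly separated toy example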
Example #5
    def ks_analysis(self, data, label):
        """
        'feature_interval',   # value interval (bin)
        'order_num',          # number of orders
        'order_ratio',        # share of orders
        'overdue_num',        # number of overdue orders
        'overdue_ratio',      # share of overdue orders
        'normal_num',         # number of normal orders
        'normal_ratio',       # share of normal orders
        'overdue_cum_ratio',  # cumulative share of overdue orders
        'normal_cum_ratio',   # cumulative share of normal orders
        'ks_value'            # KS statistic
        """
        return (ks.ks_analysis(data, label))
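
A hedged illustration of how the columns listed in the docstring relate to one another, assuming ks.ks_analysis bins the feature, accumulates per-class order ratios, and reports the gap between the two cumulative curves as ks_value (the numbers below are made up):

import pandas as pd

dfks = pd.DataFrame({
    'overdue_cum_ratio': [0.40, 0.70, 0.90, 1.00],
    'normal_cum_ratio':  [0.10, 0.30, 0.60, 1.00],
})
dfks['ks_value'] = (dfks['overdue_cum_ratio'] - dfks['normal_cum_ratio']).abs()
print(max(dfks['ks_value']))  # 0.4 -- the quantity read off as max(dfks['ks_value'])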
Example #6
def fill_nan(X_train, y_train, X_test='', method='infer'):

    X_train_new, X_test_new = pd.DataFrame(), pd.DataFrame()
    method_dict = {'mean': np.mean, 'median': np.median, 'most': most_value}

    if method in ['0', '-1']:
        for col in X_train.columns:
            nan_index = pd.isnull(X_train[col])
            X_train_new[col] = X_train[col].fillna(value=int(method))
            if len(X_test):
                X_test_new[col] = X_test[col].fillna(value=int(method))
            if any(nan_index):
                X_train_new[col + '_isnan'] = nan_index.astype('int').values
                if len(X_test):
                    X_test_new[col + '_isnan'] = pd.isnull(
                        X_test[col]).astype('int').values

    elif method in ['mean', 'median', 'most']:
        for col in X_train.columns:
            nan_index = pd.isnull(X_train[col])

            # Compute the fill value according to the chosen strategy
            func = method_dict[method]
            fill_value = func(X_train[col].dropna().values)
            X_train_new[col] = X_train[col].fillna(value=fill_value)
            if len(X_test):
                X_test_new[col] = X_test[col].fillna(value=fill_value)
            if any(nan_index):
                X_train_new[col + '_isnan'] = nan_index.astype('int').values
                if len(X_test):
                    X_test_new[col + '_isnan'] = pd.isnull(
                        X_test[col]).astype('int').values

    elif method == 'infer':
        for col in X_train.columns:
            nan_index = pd.isnull(X_train[col])
            x_col = X_train[col][~nan_index]
            y_col = y_train[~nan_index]

            if not any(nan_index):
                X_train_new[col] = X_train[col]
                if len(X_test): X_test_new[col] = X_test[col]
                continue

            nan_overdue_ratio = (lambda x: sum(x) / float(len(x)))(
                y_train[nan_index].values)
            dfks = ks.ks_analysis(X_train[col].values, y_train.values)

            # Find the bin whose overdue ratio is closest to that of the missing samples
            g = np.abs(dfks['overdue_ratio'].values -
                       nan_overdue_ratio).argmin()

            # Locate the corresponding value interval
            str_interval = dfks['feature_interval'][g]

            p, q = [
                float(x)
                for x in re.sub(r'\[|\]|\)', '', str_interval).split(',')
            ]
            if ')' in str_interval:
                l = [x for x in x_col if x >= p and x < q]
            else:
                l = [x for x in x_col if x >= p and x <= q]

            # Compute the mean of the feature within that interval
            fill_value = np.mean(l)

            X_train_new[col] = X_train[col].fillna(value=fill_value)
            if len(X_test):
                X_test_new[col] = X_test[col].fillna(value=fill_value)
            if any(nan_index):
                X_train_new[col + '_isnan'] = nan_index.astype('int').values
                if len(X_test):
                    X_test_new[col + '_isnan'] = pd.isnull(
                        X_test[col]).astype('int').values

    # If a column of X_test has missing values but the corresponding X_train column does not, fall back to median filling
    for col in X_test_new.columns:
        nan_index = pd.isnull(X_test_new[col])
        if not any(nan_index): continue
        fill_value = np.median(X_train_new[col].values)
        X_test_new[col] = X_test_new[col].fillna(value=fill_value)
    return (X_train_new, X_test_new)
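
A minimal usage sketch for fill_nan with the 'median' strategy (the toy frames are mine; fill_nan and its helpers such as most_value and the ks module are assumed importable from the project):

import numpy as np
import pandas as pd

X_train = pd.DataFrame({'amount': [1.0, 2.0, np.nan, 4.0]})
X_test  = pd.DataFrame({'amount': [np.nan, 3.0]})
y_train = pd.Series([0, 1, 0, 1])

# 'amount' is filled with the training median (2.0) in both frames, and an
# extra 'amount_isnan' indicator column marks which rows were missing.
X_train_new, X_test_new = fill_nan(X_train, y_train, X_test, method='median')
print(X_train_new)
print(X_test_new)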
Example #7
    def train(self, clf, cv=5, model_idx=5):

        if cv:
            #skf = StratifiedKFold(n_splits = cv,shuffle=True)

            k, ks_mean_train, auc_mean_train, ks_mean_validate, auc_mean_validate = 0, 0, 0, 0, 0

            models = {}

            #for train_index,validate_index in skf.split(self.X_train,np.ravel(self.y_train)):
            for train_index, validate_index in stratified_kfold(
                    self.X_train, np.ravel(self.y_train), nfolds=cv):

                k = k + 1
                nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                info = '\n{}: k = {}'.format(nowtime, k)
                print(info)
                self.report_info = self.report_info + info + '\n'

                X_train_k, y_train_k = self.X_train.iloc[
                    train_index, :], self.y_train.iloc[train_index, :]
                X_validate_k, y_validate_k = self.X_train.iloc[
                    validate_index, :], self.y_train.iloc[validate_index, :]

                clf.fit(X_train_k, np.ravel(y_train_k))
                predict_train_k = clf.predict_proba(X_train_k)[:, -1]
                predict_validate_k = clf.predict_proba(X_validate_k)[:, -1]

                dfks_train = ks.ks_analysis(predict_train_k, y_train_k.values)
                dfks_validate = ks.ks_analysis(predict_validate_k,
                                               y_validate_k.values)

                ks_train, ks_validate = max(dfks_train['ks_value']), max(
                    dfks_validate['ks_value'])

                auc_validate = metrics.roc_auc_score(np.ravel(y_validate_k),
                                                     predict_validate_k)
                auc_train = metrics.roc_auc_score(np.ravel(y_train_k),
                                                  predict_train_k)

                ks_mean_train = ks_mean_train + ks_train
                auc_mean_train = auc_mean_train + auc_train
                ks_mean_validate = ks_mean_validate + ks_validate
                auc_mean_validate = auc_mean_validate + auc_validate

                info = '\ntrain: ks = {} \t auc = {} '.format(
                    ks_train, auc_train)
                prettyks = ks.print_ks(predict_train_k, y_train_k.values)
                info = info + '\n' + str(prettyks) + '\n'
                info = info + '\nvalidate: ks = {} \t auc = {}'.format(
                    ks_validate, auc_validate) + '\n'
                prettyks = ks.print_ks(predict_validate_k, y_validate_k.values)
                info = info + str(prettyks) + '\n'
                print(info)
                self.report_info = self.report_info + info

                # NOTE: clf is refit in place each fold, so every dict entry
                # references the same (last-fitted) estimator object
                models[k] = clf

            ks_mean_train = ks_mean_train / float(k)
            auc_mean_train = auc_mean_train / float(k)
            ks_mean_validate = ks_mean_validate / float(k)
            auc_mean_validate = auc_mean_validate / float(k)

            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n================================================================================ %s\n' % nowtime
            info = info + 'train : ks mean {:.5f} ; auc mean {:.5f}'.format(
                ks_mean_train, auc_mean_train) + '\n'
            info = info + 'validate : ks mean {:.5f} ; auc mean {:.5f}'.format(
                ks_mean_validate, auc_mean_validate) + '\n'
            print(info)
            self.report_info = self.report_info + info

            clf = models[model_idx]

        # When cv = 0 or cv = None, skip the cross-validation logic
        else:

            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n================================================================================ %s\n' % nowtime
            print(info)
            self.report_info = self.report_info + info

            clf.fit(self.X_train, np.ravel(self.y_train))
            predict_train = clf.predict_proba(self.X_train)[:, -1]
            dfks_train = ks.ks_analysis(predict_train, self.y_train.values)
            ks_train = max(dfks_train['ks_value'])
            auc_train = metrics.roc_auc_score(np.ravel(self.y_train),
                                              predict_train)

            info = '\ntrain: ks = {} \t auc = {} '.format(ks_train,
                                                          auc_train) + '\n'
            prettyks = ks.print_ks(predict_train, self.y_train.values)
            info = info + str(prettyks) + '\n'
            print(info)
            self.report_info = self.report_info + info

        return (clf)
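
Hypothetical driver code for the method above: `model` names an instance of the enclosing class with preprocessed X_train / y_train already attached; the classifier and its settings are illustrative only.

from sklearn.linear_model import LogisticRegression

# With cv=5 the classifier is refit on each stratified fold and fold-level
# KS / AUC are logged; with cv=0 it is fit once on the full training set.
clf = LogisticRegression(max_iter=1000)
clf = model.train(clf, cv=5, model_idx=5)
model.test(clf)           # evaluate on the stored X_test / y_test (Example #3)
print(model.report_info)  # accumulated text report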
Example #8
    def train(self,
              cv=5,
              model_idx=5,
              params_dict=params_dict,
              n_jobs=4,
              verbose_eval=20):

        info = "start train xgboost model ..."
        print(info)
        self.report_info = self.report_info + info + '\n'

        params_dict_copy = params_dict.copy()
        params_dict_copy.update({'nthread': n_jobs})

        if cv:

            k, ks_mean_train, auc_mean_train, ks_mean_validate, auc_mean_validate = 0, 0, 0, 0, 0

            models = {}

            for train_index, validate_index in stratified_kfold(
                    self.X_train, np.ravel(self.y_train), nfolds=cv):

                k = k + 1
                nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print(
                    '\n================================================================================ %s\n'
                    % nowtime)
                info = 'k = {}'.format(k)
                print(info)
                self.report_info = self.report_info + info + '\n'

                X_train_k, y_train_k = self.X_train.iloc[
                    train_index, :], self.y_train.iloc[train_index, :]
                X_validate_k, y_validate_k = self.X_train.iloc[
                    validate_index, :], self.y_train.iloc[validate_index, :]

                dtrain_k = xgb.DMatrix(X_train_k, y_train_k['label'])
                dvalid_k = xgb.DMatrix(X_validate_k, y_validate_k['label'])

                bst, _ = train_xgb(params_dict_copy, dtrain_k, dvalid_k, None,
                                   verbose_eval)
                predict_train_k = bst.predict(dtrain_k)
                predict_validate_k = bst.predict(dvalid_k)

                dfks_train = ks.ks_analysis(predict_train_k,
                                            dtrain_k.get_label())
                dfks_validate = ks.ks_analysis(predict_validate_k,
                                               dvalid_k.get_label())

                ks_train, ks_validate = max(dfks_train['ks_value']), max(
                    dfks_validate['ks_value'])

                auc_train = auc(dtrain_k.get_label(), predict_train_k)
                auc_validate = auc(dvalid_k.get_label(), predict_validate_k)

                ks_mean_train = ks_mean_train + ks_train
                auc_mean_train = auc_mean_train + auc_train
                ks_mean_validate = ks_mean_validate + ks_validate
                auc_mean_validate = auc_mean_validate + auc_validate

                info = '\ntrain: ks = {} \t auc = {} '.format(
                    ks_train, auc_train)
                prettyks = ks.print_ks(predict_train_k, dtrain_k.get_label())
                info = info + '\n' + str(prettyks) + '\n'
                info = info + '\nvalidate: ks = {} \t auc = {}'.format(
                    ks_validate, auc_validate) + '\n'
                prettyks = ks.print_ks(predict_validate_k,
                                       dvalid_k.get_label())
                info = info + str(prettyks) + '\n'
                print(info)
                self.report_info = self.report_info + info

                models[k] = bst

            ks_mean_train = ks_mean_train / float(k)
            auc_mean_train = auc_mean_train / float(k)
            ks_mean_validate = ks_mean_validate / float(k)
            auc_mean_validate = auc_mean_validate / float(k)

            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n================================================================================ %s\n' % nowtime
            info = info + 'train : ks mean {:.5f} ; auc mean {:.5f}'.format(
                ks_mean_train, auc_mean_train) + '\n'
            info = info + 'validate : ks mean {:.5f} ; auc mean {:.5f}'.format(
                ks_mean_validate, auc_mean_validate) + '\n'
            print(info)
            self.report_info = self.report_info + info

            bst = models[model_idx]

        # When cv = 0 or cv = None, skip the cross-validation logic
        else:

            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info = '\n================================================================================ %s\n' % nowtime
            print(info)
            self.report_info = self.report_info + info

            bst, _ = train_xgb(params_dict_copy, self.dtrain, None, None,
                               verbose_eval)
            predict_train = bst.predict(self.dtrain)
            dfks_train = ks.ks_analysis(predict_train, self.y_train.values)
            ks_train = max(dfks_train['ks_value'])
            auc_train = auc(self.dtrain.get_label(), predict_train)

            info = '\ntrain: ks = {} \t auc = {} '.format(ks_train,
                                                          auc_train) + '\n'
            prettyks = ks.print_ks(predict_train, self.y_train.values)
            info = info + str(prettyks) + '\n'
            print(info)
            self.report_info = self.report_info + info

        # Compute feature importances from the trained booster
        feature_scores = bst.get_score()
        dfimportance = pd.DataFrame({
            'feature': list(feature_scores.keys()),
            'importance': list(feature_scores.values())
        })
        try:
            dfimportance = dfimportance.sort_values('importance',
                                                    ascending=False)
        except AttributeError as err:
            dfimportance = dfimportance.sort('importance', ascending=False)

        dfimportance.index = range(len(dfimportance))

        self.dfimportance = dfimportance

        return (bst)
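
Hypothetical driver code for the xgboost variant: `xgb_model` names an instance of the enclosing class with X_train / y_train / dtrain already prepared and a module-level params_dict in scope; the call values are illustrative only.

# Train with 5-fold CV, keep the booster from fold 5, then evaluate.
bst = xgb_model.train(cv=5, model_idx=5, n_jobs=4, verbose_eval=20)
xgb_model.test(bst)                      # KS / AUC on the stored test set (Example #4)
print(xgb_model.dfimportance.head(10))   # importances from bst.get_score()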