Example #1
def make_cat_features(df, filekey):
    mkdir_func(f'../features/{filekey}')
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]
    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    #========================================================================
    # Categorical Feature Encode
    #========================================================================
    # Factorize
    logger.info("Factorize Start!!")
    for col in categorical_features:
        train[f"lbl_{col}@"], indexer = pd.factorize(train[col])
        test[f"lbl_{col}@"] = indexer.get_indexer(test[col])

    # Count Encoding
    logger.info("Count Encoding Start!!")
    for col in categorical_features:
        train = cnt_encoding(train, col, ignore_list=ignore_list)
        test = cnt_encoding(test, col, ignore_list=ignore_list)

    #========================================================================
    # Categorical Feature Save
    #========================================================================
    logger.info("Saving Features...")
    for col in train.columns:
        if col.count('@'):
            result_train = train[col].values
            result_test = test[col].values
            logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
            utils.to_pkl_gzip(obj=result_train, path=f'../features/{filekey}/train_{col}')
            utils.to_pkl_gzip(obj=result_test, path=f'../features/{filekey}/test_{col}')
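Example #1 leans on two repo helpers that are not shown. A minimal sketch of what they plausibly look like, assuming get_categorical_features selects object-dtype columns and cnt_encoding appends a frequency column carrying the '@' marker used when saving (the 'cntec' prefix is taken from Examples #12 and #16; the rest is an assumption, not the repo's actual code):

import pandas as pd

def get_categorical_features(df, ignore_list=[]):
    # Object-dtype columns, minus anything explicitly ignored (assumed behavior).
    return [col for col in df.columns
            if df[col].dtype == 'object' and col not in ignore_list]

def cnt_encoding(df, col, ignore_list=[]):
    # Map each category to its frequency; '@' flags the column for saving (assumed).
    if col in ignore_list:
        return df
    df[f'cntec_{col}@'] = df[col].map(df[col].value_counts())
    return df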
Example #2
def clean_app(app):
    logger.info(f'''
    #==============================================================================
    # APPLICATION CLEANSING
    #=============================================================================='''
                )

    revo = 'Revolving loans'
    drop_list = [
        col for col in app.columns if col.count('is_train')
        or col.count('is_test') or col.count('valid_no')
    ]
    app.drop(drop_list, axis=1, inplace=True)

    app['AMT_INCOME_TOTAL'] = app['AMT_INCOME_TOTAL'].where(
        app['AMT_INCOME_TOTAL'] < 1000000, 1000000)
    app['CODE_GENDER'].replace('XNA', 'F', inplace=True)

    cat_cols = get_categorical_features(df=app, ignore_list=[])
    for col in cat_cols:
        app[col].fillna('XNA', inplace=True)

    ' revo '
    amt_list = ['AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE']
    for col in amt_list:
        app[f'revo_{col}'] = app[col].where(app['NAME_CONTRACT_TYPE'] == revo,
                                            np.nan)

    utils.to_df_pickle(df=app,
                       path='../input',
                       fname='clean_application_train_test')
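The Series.where(cond, other) calls above keep values where the condition holds and substitute other elsewhere, so the AMT_INCOME_TOTAL line caps income at 1,000,000. A toy illustration:

import pandas as pd

s = pd.Series([500_000, 2_000_000, 750_000])
print(s.where(s < 1_000_000, 1_000_000).tolist())  # [500000, 1000000, 750000]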
Example #3
def make_num_features(df, filekey):
    mkdir_func(f'../features/{filekey}')

    #  if filekey.count('bur'):
    df = interact_feature(df, filekey)

    #========================================================================
    # Carve out numeric features per categorical value
    #========================================================================
    num_list = get_numeric_features(df=df, ignore_list=ignore_list)
    cat_list = get_categorical_features(df=df, ignore_list=[])

    #  few_list = []
    #  for cat in tqdm(cat_list):
    #      for val in tqdm(df[cat].drop_duplicates()):
    #          length = len(df[df[cat]==val])
    #          if length < len(df)*0.002:
    #              few_list.append(val)
    #              continue
    #          for num in num_list:
    #          #  pararell_process(, num_list)
    #              df[f'{num}_{cat}-{val}@'] = df[num].where(df[cat]==val, np.nan)
    #              df[f'{num}_{cat}-fewlist@'] = df[num].where(df[cat].isin(few_list), np.nan)

    logger.info(f'{filekey} SET SHAPE : {df.shape}')

    #========================================================================
    # Train/Test Split & Feature Save
    #========================================================================
    train = df[~df[target].isnull()]
    test = df[df[target].isnull()]

    categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

    #========================================================================
    # Numeric Feature Save
    #========================================================================
    for col in train.columns:
        if col in categorical_features:
            continue
        result_train = train[col].values
        result_test = test[col].values
        logger.info(f"COL: {col} | LENGTH: {len(result_train)}")
        utils.to_pkl_gzip(obj=result_train, path=f'../features/{filekey}/train_{col}')
        if col != target:
            utils.to_pkl_gzip(obj=result_test, path=f'../features/{filekey}/test_{col}')
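A hedged sketch of how these one-array-per-column files could be reassembled into a frame, assuming the repo's utils.read_pkl_gzip (used in Example #13) is the inverse of to_pkl_gzip and that file names follow the train_{col} pattern above:

import glob
import os
import pandas as pd

def load_feature_dir(filekey, split='train'):
    # Rebuild a DataFrame from the per-column gzip pickles (assumed layout).
    frame = {}
    for path in glob.glob(f'../features/{filekey}/{split}_*'):
        col = os.path.basename(path).replace(f'{split}_', '', 1)
        col = col[:-3] if col.endswith('.gz') else col
        frame[col] = utils.read_pkl_gzip(path)
    return pd.DataFrame(frame)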
Example #4
def get_feature_set(feat_path='../features/all_features/*.gz',
                    feat_key='',
                    is_debug=False,
                    is_cat_encode=True):
    feat_path_list = glob.glob(feat_path)

    path_list = []
    for path in feat_path_list:
        filename = re.search(r'/([^/.]*).gz', path).group(1)
        if path.count(feat_key) and feat_key[:4] == filename[:4]:
            path_list.append(path)

    train, test = ml_utils.get_train_test(feat_path_list=path_list,
                                          target=target)
    print(train.shape, test.shape)

    if is_debug:
        train = train.head(10000)
        test = test.head(500)

    if is_cat_encode:
        #========================================================================
        # Categorical Encode
        cat_cols = utils.get_categorical_features(df=train,
                                                  ignore_list=ignore_list)
        print(f"Categorical: {cat_cols}")

        # Fit LabelEncoder
        for col in cat_cols:
            # Impute with the most frequent category
            max_freq = list(train[col].value_counts().index)[0]
            train[col].fillna(max_freq, inplace=True)
            test[col].fillna(max_freq, inplace=True)
            le = LabelEncoder().fit(
                pd.concat([train[col], test[col]],
                          axis=0).value_counts().index.tolist())
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])
        #========================================================================

    return train, test
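Typical invocation, assuming features were saved under ../features/all_features/ and that feat_key shares its first four characters with the target file names ('101_' is a hypothetical key, not one from the repo):

train, test = get_feature_set(feat_key='101_', is_debug=True)
print(train.shape, test.shape)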
Example #5
def multi_level_agg(df, prefix):
    # =======================================================================
    # Replace combinations of multiple categorical columns with aggregated values
    # =======================================================================
    method_list = ['mean']
    num_list = ['EXT_SOURCE_2']
    cat_list = get_categorical_features(df=df, ignore_list=ignore_list)
    cat_combi = combinations(cat_list, 2)
    #  amt_list = [col for col in num_list if col.count('AMT_')]
    #  days_list = [col for col in num_list if col.count('DAYS_')]

    # Serial (non-parallel) processing
    for com in cat_combi:
        for num in num_list:
            for method in method_list:
                base = df[[key, target] + list(com)].drop_duplicates()
                tmp = df[list(com)+[num]]
                tmp_result = base_aggregation(
                    df=tmp, level=list(com), method=method, prefix=prefix, feature=num)
                result = base.merge(tmp_result, on=list(com), how='left')

                for col in result.columns:
                    if not(col.count('@')) or col in ignore_list:
                        continue

                    train_feat = result[result[target]>=0][col].values
                    test_feat = result[result[target].isnull()][col].values
                    col = col.replace('[', '_').replace(']', '_').replace(' ', '').replace(',', '_')
                    train_file_path = f"../features/1_first_valid/train_{col}"
                    test_file_path = f"../features/1_first_valid/test_{col}"

                    utils.to_pkl_gzip(obj=train_feat, path=train_file_path)
                    utils.to_pkl_gzip(obj=test_feat, path=test_file_path)

                    logger.info(f'''
                    #========================================================================
                    # COMPLETE MAKE FEATURE : {train_file_path}
                    #========================================================================''')
                del result, tmp_result
                gc.collect()
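base_aggregation is imported from elsewhere in the repo. A minimal sketch consistent with how Examples #5 and #7 call it (grouped aggregate named with prefix and the '@' marker); the naming scheme is assumed, not confirmed:

import pandas as pd

def base_aggregation(df, level, feature, method, prefix='', drop=False):
    # Aggregate `feature` at the granularity `level`; '@' flags the new column.
    # `drop` is accepted for signature parity with Example #12; unused here.
    level = level if isinstance(level, list) else [level]
    agg = df.groupby(level)[feature].agg(method).reset_index()
    agg.rename(columns={feature: f"{prefix}{feature}_{method}@"}, inplace=True)
    return agg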
Example #6
def get_dataset(is_debug=False,
                is_cat_encode=True,
                feat_path='../features/4_winner/*.gz',
                base=[]):
    feat_path_list = glob.glob(feat_path)
    #  feat_path_list += glob.glob('../features/5_tmp/*.gz')

    train, test = ml_utils.get_train_test(feat_path_list=feat_path_list,
                                          target=target,
                                          base=base)
    print(train.shape, test.shape)

    #  if is_debug:
    #      train = train.head(10000)
    #      test = test.head(500)

    if is_cat_encode:
        #========================================================================
        # Categorical Encode
        cat_cols = utils.get_categorical_features(df=train,
                                                  ignore_list=ignore_list)
        print(f"Categorical: {cat_cols}")

        # Fit LabelEncoder
        for col in cat_cols:
            # Impute with the most frequent category
            max_freq = list(train[col].value_counts().index)[0]
            train[col].fillna(max_freq, inplace=True)
            test[col].fillna(max_freq, inplace=True)
            le = LabelEncoder().fit(
                pd.concat([train[col], test[col]],
                          axis=0).value_counts().index.tolist())
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])
        #========================================================================

    print(train.shape, test.shape)

    return train, test
Example #7
def single_level_agg(df, prefix):
    # =======================================================================
    # Replace a single categorical column with aggregated values
    # =======================================================================
    method_list = ['mean', 'var']
    num_list = ['EXT_SOURCE_2']
    cat_list = get_categorical_features(df=df, ignore_list=ignore_list)
    #  amt_list = [col for col in num_list if col.count('AMT_')]
    #  days_list = [col for col in num_list if col.count('DAYS_')]

    # Serial (non-parallel) processing
    for cat in cat_list:
        if len(df[cat].unique())<=3:
            continue
        for num in num_list:
            for method in method_list:
                base = df[[key, cat, target]].drop_duplicates()
                tmp = df[[cat, num]]
                tmp_result = base_aggregation(
                    df=tmp, level=cat, method=method, prefix=prefix, feature=num)
                result = base.merge(tmp_result, on=cat, how='left')

                for col in result.columns:
                    if not(col.count('@')) or col in ignore_list:
                        continue

                    train_file_path = f"../features/1_first_valid/train_{col}"
                    test_file_path = f"../features/1_first_valid/test_{col}"

                    utils.to_pkl_gzip(obj=result[result[target]>=0][col].values, path=train_file_path)
                    utils.to_pkl_gzip(obj=result[result[target].isnull()][col].values, path=test_file_path)

                    logger.info(f'''
                    #========================================================================
                    # COMPLETE MAKE FEATURE : {train_file_path}
                    #========================================================================''')
                del result, tmp_result
                gc.collect()
Example #8
def clean_app(app):
    logger.info(f'''
    #==============================================================================
    # APPLICATION
    #=============================================================================='''
                )

    app['CODE_GENDER'].replace('XNA', 'F', inplace=True)

    cat_cols = get_categorical_features(df=app, ignore_list=[])
    for col in cat_cols:
        app[col].fillna('XNA', inplace=True)

    ' revo '
    #  revo = 'Revolving loans'
    #  amt_list = ['AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE']
    #  for col in amt_list:
    #      app[f'revo_{col}'] = app[col].where(app[f'NAME_CONTRACT_TYPE']==revo, np.nan)
    #      app[col] = app[col].where(app[f'NAME_CONTRACT_TYPE']!=revo, np.nan)

    utils.to_df_pkl(df=app,
                    path='../input',
                    fname='clean_application_train_test')
Example #9
comment = sys.argv[1]

if sys.argv[2].count('f'):
    train, test = MS_utils.get_feature_set(feat_key=sys.argv[2],
                                           base_path=base_path)
else:
    train, test = MS_utils.get_dataset(base=base)
print(train.shape, test.shape)

if is_debug:
    train = train.head(10000)
    test = test.head(5000)

#========================================================================
# Categorical Encode
cat_cols = utils.get_categorical_features(df=train, ignore_list=ignore_list)
print(f"Categorical: {cat_cols}")

# Fit LabelEncoder
for col in cat_cols:
    # Impute with the most frequent category
    max_freq = list(train[col].value_counts().index)[0]
    train[col].fillna(max_freq, inplace=True)
    test[col].fillna(max_freq, inplace=True)
    le = LabelEncoder().fit(
        pd.concat([train[col], test[col]],
                  axis=0).value_counts().index.tolist())
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
#========================================================================
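This impute-then-encode block appears verbatim in Examples #4 and #6 as well; a hedged refactor into one helper (same logic, hypothetical name):

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def label_encode_categoricals(train, test, cat_cols):
    for col in cat_cols:
        # Impute with the most frequent training category.
        max_freq = train[col].value_counts().index[0]
        train[col].fillna(max_freq, inplace=True)
        test[col].fillna(max_freq, inplace=True)
        # Fit on the union of train and test values to avoid unseen labels.
        le = LabelEncoder().fit(pd.concat([train[col], test[col]], axis=0))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    return train, test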
Example #10
def Regressor(model_type, x_train, x_val, y_train, y_val, x_test,
    params={}, seed=1208, get_score='rmse', get_model=False,
    early_stopping_rounds=100, num_boost_round=10000):

    if str(type(x_train)).count('DataFrame'):
        use_cols = x_train.columns
    else:
        use_cols = np.arange(x_train.shape[1]) + 1

    if model_type=='linear':
        estimator = LinearRegression(**params)
    elif model_type=='ridge':
        estimator = Ridge(**params)
    elif model_type=='lasso':
        estimator = Lasso(**params)
    elif model_type=='rmf':
        params['n_jobs'] = -1
        params['n_estimators'] = 10000
        estimator = RandomForestRegressor(**params)
    elif model_type=='lgb':
        if len(params.keys())==0:
            metric = 'auc'
            params['n_jobs'] = 32
            params['metric'] = metric
            params['num_leaves'] = 31
            params['colsample_bytree'] = 0.3
            params['lambda_l2'] = 1.0
            params['learning_rate'] = 0.01
        params['objective'] = 'regression'
        params['metric'] = 'mse'

    #========================================================================
    # Fitting
    if model_type!='lgb':
        estimator.fit(x_train, y_train)
    else:
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_val = lgb.Dataset(data=x_val, label=y_val)

        cat_cols = utils.get_categorical_features(df=x_train)

        estimator = lgb.train(
            params=params,
            train_set=lgb_train,
            valid_sets=lgb_val,
            early_stopping_rounds=early_stopping_rounds,
            num_boost_round=num_boost_round,
            categorical_feature=cat_cols,
            verbose_eval=200,
        )

    #========================================================================

    #========================================================================
    # Prediction
    oof_pred = estimator.predict(x_val)
    if len(x_test):
        test_pred = estimator.predict(x_test)
    else:
        test_pred = []
    #========================================================================

    #========================================================================
    # Scoring
    if get_score=='auc':
        score = roc_auc_score(y_val, oof_pred)
    else:
        score = np.sqrt(mean_squared_error(y_val, oof_pred))
        r2    = r2_score(y_val, oof_pred)
        print(f"""
        # R2 Score: {r2}
        """)
    # Model   : {model_type}
    # feature : {x_train.shape, x_val.shape}
    #========================================================================

    if model_type=='lgb':
        feim = get_tree_importance(estimator=estimator, use_cols=use_cols)
        feim.sort_values(by='importance', ascending=False, inplace=True)
    elif model_type=='lasso' or model_type=='ridge':
        feim = pd.Series(estimator.coef_, index=use_cols, name='coef')
        feim.sort_values(ascending=False, inplace=True)
    else:
        # Keep the return signature intact for models without importances.
        feim = None

    if get_model:
        return score, oof_pred, test_pred, feim, estimator
    else:
        return score, oof_pred, test_pred, feim, 0
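A sketch of calling Regressor on a synthetic split, exercising the LightGBM branch with its default parameters (toy data and a reused validation set; real usage would pass proper CV folds and a held-out x_test, and assumes the repo's utils and lgb imports are available):

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(1208)
X = pd.DataFrame(rng.rand(1000, 5), columns=[f'f{i}' for i in range(5)])
y = X.sum(axis=1) + rng.normal(scale=0.1, size=1000)
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

score, oof_pred, test_pred, feim, _ = Regressor(
    model_type='lgb', x_train=x_train, x_val=x_val,
    y_train=y_train, y_val=y_val, x_test=x_val)
print(f"RMSE: {score:.4f}")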
Example #11
def data_check(logger,
               df,
               target,
               test=False,
               dummie=0,
               exclude_category=False,
               ignore_list=[]):
    '''
    Explain:
        Check the data for problems before training.
        Encode or drop any categorical features found.
    Args:
    Return:
    '''
    logger.info(f'''
#==============================================================================
# DATA CHECK START
#=============================================================================='''
                )
    categorical_list = get_categorical_features(df, ignore_list=ignore_list)
    dt_list = get_datetime_features(df, ignore_list=ignore_list)
    logger.info(f'''
#==============================================================================
# CATEGORICAL FEATURE: {categorical_list}
# LENGTH: {len(categorical_list)}
# DUMMIE: {dummie}
#==============================================================================
    ''')

    #========================================================================
    # Columns that should be numeric sometimes arrive as object dtype
    #========================================================================
    #  for cat in categorical_list:
    #      try:
    #          df[cat] = df[cat].astype('int64')
    #          categorical_list.remove(cat)
    #      except ValueError:
    #          pass
    #========================================================================
    # Drop datetime columns
    #========================================================================
    for dt in dt_list:
        df.drop(dt, axis=1, inplace=True)

    ' Label-encode columns with more than 100 unique values '
    label_list = []
    for cat in categorical_list[:]:
        if len(df[cat].drop_duplicates()) > 100:
            label_list.append(cat)
            categorical_list.remove(cat)
    df = factorize_categoricals(df, label_list)

    if exclude_category:
        for cat in categorical_list:
            df.drop(cat, axis=1, inplace=True)
            move_feature(feature_name=cat)
        categorical_list = []
    elif dummie == 1:
        df = get_dummies(df, categorical_list)
        categorical_list = []
    elif dummie == 0:
        df = factorize_categoricals(df, categorical_list)
        categorical_list = []

    logger.info(f'df SHAPE: {df.shape}')

    ' Exclude columns whose unique count is 1 in the test set '
    drop_list = []
    if test:
        for col in df.columns:
            length = df[col].nunique()
            if length <= 1 and col not in ignore_list:
                logger.info(f'''
    ***********WARNING************* LENGTH {length} COLUMN: {col}''')
                move_feature(feature_name=col)
                if col != target:
                    drop_list.append(col)

    logger.info(f'''
#==============================================================================
# DATA CHECK END
#=============================================================================='''
                )

    return df, drop_list
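factorize_categoricals is another repo helper, also used in Example #15. A minimal sketch consistent with its use here (in-place integer codes per column) — an assumption, not the repo's actual implementation:

import pandas as pd

def factorize_categoricals(df, cat_cols):
    for col in cat_cols:
        # Dense integer codes per category; missing values become -1.
        df[col], _ = pd.factorize(df[col])
    return df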
Example #12
def main():

    '''
    BASE AGGREGATION
    Basic aggregation of a single column at the granularity given by level
    '''
    if agg_code == 'base':

        # =======================================================================
        # Prepare the list of columns to aggregate
        # =======================================================================
        num_list = get_numeric_features(df=df, ignore=ignore_list)

        # =======================================================================
        # Start aggregation
        # =======================================================================
        arg_list = []
        for num in num_list:
            for method in method_list:
                arg_list.append((df, key, num, method, prefix, '', base))
        ' Encode per value of each categorical column in the dataset '
        call_list = pararell_process(pararell_wrapper(base_aggregation), arg_list)
        result = pd.concat(call_list, axis=1)

        for col in result.columns:
            if not(col.count('@')) or col in ignore_list:
                continue
            print(col)
            #  utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)
        sys.exit()


        #  for num in num_list:
        #      for method in method_list:
        #          tmp_result = base_aggregation(df=df, level=key, method=method, prefix=prefix, feature=num, drop=True)
        #          result = base.merge(tmp_result, on=key, how='left')
        #          for col in result.columns:
        #              if not(col.count('@')) or col in ignore_list:
        #                  continue
        #              utils.to_pickle(
        #                  path=f"{dir}/{col}.fp", obj=result[col].values)
                #  make_npy(result=result, ignore_list=ignore_features, logger=logger)

    elif agg_code == 'caliculate':

        '''
        CALCULATION
        Create new features by arithmetic on pairs of columns
        '''
        f1_list = []
        f2_list = []
        used_list = []
        for f1 in f1_list:
            for f2 in f2_list:
                ' Do not compute the same feature combination twice '
                if f1 == f2:
                    continue
                if sorted([f1, f2]) in used_list:
                    continue
                used_list.append(sorted([f1, f2]))

                if diff:
                    df = diff_feature(df=df, first=f1, second=f2)
                elif div:
                    df = division_feature(df=df, first=f1, second=f2)
                elif pro:
                    df = product_feature(df=df, first=f1, second=f2)

        for col in df.columns:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'cnt':
        '''
        COUNT ENCODING
        Aggregate at the granularity of level and count cnt_val, duplicates included
        '''
        cat_list = get_categorical_features(df=df, ignore=ignore_list)

        for category_col in cat_list:
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        cnt_cols = [col for col in df.columns if col.count('cntec')]
        for col in cnt_cols:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'category':
        arg_list = []
        ' Categorical columns '
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)

        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    arg_list.append((base, df, key, cat, value,
                                     method, ignore_list, prefix))

        ' Encode per value of each categorical column in the dataset '
        pararell_process(pararell_wrapper(select_category_value_agg), arg_list)
        #  select_category_value_agg(base, df=df, key=key, category_col=cat, value=value, method, ignore_list, prefix)

    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))

    elif agg_code == 'dummie':

        ' One-hot encode the categorical columns of the dataset and take their means '
        cat_list = get_categorical_features(data, ignore_features)
        df = get_dummies(df=df, cat_list=cat_list)
Example #13
# ===========================================================================
# DATA LOAD
# ===========================================================================
base = utils.read_df_pkl(path='../input/base_app*')
fname = 'app'
prefix = f'{feat_no}{fname}_'
df = utils.read_df_pkl(path=f'../input/clean_{fname}*.p')

train = df[~df[target].isnull()]
test = df[df[target].isnull()]

neighbor = '110_app_neighbor81@'
train[neighbor] = utils.read_pkl_gzip(f'../input/train_{neighbor}.gz')
test[neighbor] = utils.read_pkl_gzip(f'../input/test_{neighbor}.gz')
#  combi = [neighbor, cat]
cat_list = get_categorical_features(df=df, ignore_list=ignore_list)

#========================================================================
# TARGET ENCODING
#========================================================================
for cat in cat_list:
    combi = cat
    feat_train, feat_test = target_encoding(logger=logger,
                                            train=train,
                                            test=test,
                                            key=key,
                                            level=combi,
                                            target=target,
                                            fold_type='stratified',
                                            group_col_name='',
                                            prefix='',
Example #14
# Global Variable
from info_home_credit import hcdr_key_cols
key, target, ignore_list = hcdr_key_cols()
#========================================================================

app = utils.read_df_pkl(path='../input/clean_app*.p')[[key, target]]

filekey='bureau'
filepath = f'../input/clean_{filekey}*.p'
df = utils.read_df_pkl(path=filepath)
df = df.merge(app, on=key, how='inner')

train = df[~df[target].isnull()]
test = df[df[target].isnull()]

categorical_features = get_categorical_features(df=train, ignore_list=ignore_list)

mkdir_func(f'../features/{filekey}')

#========================================================================
# Numeric Feature Save
#========================================================================
for col in train.columns:
    if col in categorical_features:
        continue

    utils.to_pkl_gzip(obj=train[col].values, path=f'../features/{filekey}/train_{col}')
    if col != target:
        utils.to_pkl_gzip(obj=test[col].values, path=f'../features/{filekey}/test_{col}')

#========================================================================
# Categorical Feature Encode
Example #15
def impute_regression(base, level, dataset, value, prefix=''):
    '''
    Explain:
    Args:
        base : DF used at the end to merge the features back and align the index
        level: granularity at which base is merged
    Return:
    '''

    logger.info(f'\nimpute feature: {value}')

    ' If the target contains negatives, shift so the distribution minimum is 0 (for the log transform) '
    values = dataset[value].values
    min_val = values[~np.isnan(values)].min()
    if min_val < 0:
        dataset[value] = dataset[value].values + min_val * -1

    dataset[target] = dataset[target].map(lambda x: None if x == -1 else x)

    ' Rows where the column being imputed is null are Test; rows with a value are Train '
    ' Not sure how to use "is None", so detect nulls this way '
    dataset['is_train'] = dataset[value].map(lambda x: 1
                                             if np.abs(x) >= 0 else 0)

    ' Convert any categorical variables to integers for now '
    categorical = get_categorical_features(dataset, [])
    dataset = factorize_categoricals(dataset, categorical)

    train = dataset.query('is_train==1')
    test = dataset.query('is_train==0')

    ' Bail out if the column has no nulls '
    if len(train) == 0 or len(test) == 0:
        return

    #  train.drop(['is_train', 'is_test'], axis=1, inplace=True)
    #  test.drop(['is_train', 'is_test'], axis=1, inplace=True)
    train.drop(['is_train'], axis=1, inplace=True)
    test.drop(['is_train'], axis=1, inplace=True)

    train[target] = train[target].fillna(-1)

    ' When the target is all -1 '
    if len(train[target].drop_duplicates()) == 1:
        train[f'bin10_{value}'] = pd.qcut(x=train[value],
                                          q=2,
                                          duplicates='drop')
        train = factorize_categoricals(train, [f'bin10_{value}'])
        validation = set_validation(train,
                                    target=f'bin10_{value}',
                                    unique_id=unique_id,
                                    val_col=val_col)
        train.drop(f'bin10_{value}', axis=1, inplace=True)
    else:
        validation = set_validation(train,
                                    target,
                                    unique_id=unique_id,
                                    val_col=val_col)

    train = train.merge(validation, on=unique_id, how='left')

    train[val_col] = train[val_col].fillna(-1)

    ' Remove features from the dataset that are likely noise for the feature being imputed '
    for col in train.columns:
        #  if col==target or (col.count('impute') and not(col.count('EXT'))):
        if col == target:
            logger.info(f'extract feature: {col}')
            train.drop(col, axis=1, inplace=True)

    #  x, y = train_test_split(train, test_size=0.2)
    #  x['valid_no'] = 0
    #  y['valid_no'] = 1
    #  valid_no = 1
    #  train = pd.concat([x, y], axis=0)

    logger.info(f'train shape: {train.shape}')

    ' The return value is the array of predictions for test '
    impute_value, cv_score = cross_prediction(
        logger=logger,
        train=train,
        test=test,
        target=value,
        #  categorical_feature=categorical,
        val_col=val_col)
    if cv_score < 0.25:
        return 0
    ' If the target contained negatives, undo the shift applied for the log transform '
    if min_val < 0:
        train[value] = train[value].values + min_val
        impute_value = impute_value + min_val

    ' Restore unique_id as a column so index order lines up when joining back to the dataset '
    train.reset_index(inplace=True)
    test.reset_index(inplace=True)
    train = train[level + [value]]
    test = test[level + [value]]
    test[value] = impute_value
    result = pd.concat([train, test], axis=0)

    result = base.merge(result, on=level, how='left')
    print(result.shape)
    print(result.head())
    print(result.tail())

    #  result.set_index(unique_id, inplace=True)
    #  print(result.loc[check_id, :])
    #  print(result.query('is_test==1').head(10))
    #  sys.exit()

    np.save(f'../features/1_first_valid/{prefix}{value}_impute',
            result[value].values)

    return cv_score
Example #16
def main():

    path = f'../input/{sys.argv[1]}*'
    df = utils.read_df_pickle(path=path)
    prefix = sys.argv[2]
    '''
    BASE AGGREGATION
    Basic aggregation of a single column at the granularity given by level
    '''
    if agg_code == 'base':
        one_base_agg(df=df, prefix=prefix)
    elif agg_code == 'caliculate':
        df = two_calicurate(df=df)
        if prefix != 'app_':
            one_base_agg(df=df, prefix=prefix)
        else:
            for col in df.columns:
                utils.to_pickle(path=f"{dir}/{prefix}{col}.fp",
                                obj=df[col].values)

    elif agg_code == 'cnt':
        '''
        COUNT ENCODING
        Aggregate at the granularity of level and count cnt_val, duplicates included
        '''
        cat_list = get_categorical_features(df=df, ignore=ignore_list)

        for category_col in cat_list:
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        cnt_cols = [col for col in df.columns if col.count('cntec')]
        for col in cnt_cols:
            if exclude_feature(col, df[col].values): continue
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'category':

        ' Aggregate per value within each categorical column '
        arg_list = []
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)
        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    select_category_value_agg(base,
                                              df=df,
                                              key=key,
                                              category_col=cat,
                                              value=value,
                                              method=method,
                                              ignore_list=ignore_list,
                                              prefix=prefix)
                    #  arg_list.append(base, df, key, cat, value, method, ignore_list, prefix)

        #  pararell_process(select_cat_wrapper, arg_list)

    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))

    elif agg_code == 'dummie':

        ' One-hot encode the categorical columns of the dataset and take their means '
        cat_list = get_categorical_features(data, ignore_features)
        df = get_dummies(df=df, cat_list=cat_list)