Exemple #1
0
print('add over60 category', customer_tb['age_rank'].cat.categories)
# Aggregate the data: collapse the 60.0, 70.0 and 80.0 bins into the single
# 'over_60' value.
# NOTE(review): assigning 'over_60' to a categorical column requires that
# category to exist already - presumably it is added before this point
# (the print above suggests so); confirm against the preceding code.
customer_tb.loc[
    customer_tb['age_rank'].isin([60.0, 70.0, 80.0]), 'age_rank'
] = 'over_60'
print('convert data aggreageted', customer_tb)
# Delete the master data (categories) that are no longer used.
# The result is assigned back instead of passing inplace=True: that keyword
# was deprecated and later removed from pandas, while reassignment works on
# every pandas version.
customer_tb['age_rank'] = \
    customer_tb['age_rank'].cat.remove_unused_categories()

# combine category ################
# Build one categorical column from the age decade and the sex, formatted
# as '<decade>_<sex>' (the decade is the age floored to a multiple of 10).
# The row values are read by column label rather than by integer position:
# integer keys on a label-indexed Series rely on a positional fallback that
# pandas has deprecated.
customer_tb['sex_and_age'] = pd.Categorical(
    customer_tb[['sex', 'age']].apply(
        lambda row: '{}_{}'.format(
            int(np.floor(row['age'] / 10) * 10), row['sex']),
        axis=1,
    )
)
print('combine category \n', customer_tb['sex_and_age'])

# generate numerical category ################
from preprocess.load_data.data_loader import load_production
production = load_production()
# Number of faults for each type. Only rows whose fault_flg is truthy are
# counted, so a type without any faulty row may have no entry in the result.
fault_cnt_per_type = (
    production
    .query('fault_flg')
    .groupby('type')['fault_flg']
    .count()
)
print('a number of fault for each type \n', fault_cnt_per_type)
# Number of products for each type.
type_cnt = production.groupby('type')['fault_flg'].count()

# Fault rate of each row's type with the row itself left out: the row's own
# flag is subtracted from the type's fault count and 1 from the type's
# product count.
# - Values are read by column label; integer keys on a label-indexed row rely
#   on a deprecated positional fallback.
# - .get(..., 0) is used because a type with no faulty row is missing from
#   fault_cnt_per_type, and plain indexing would raise KeyError for it.
# - A type with a single product makes the denominator 0.
production['type_fault_rate'] = production[['type', 'fault_flg']].apply(
    lambda row: (
        (fault_cnt_per_type.get(row['type'], 0) - int(row['fault_flg']))
        / (type_cnt[row['type']] - 1)
    ),
    axis=1,
)
# NOTE(review): the label below says "products" but the value printed is the
# type_fault_rate column; the runtime string is intentionally left unchanged.
print('a number of products for each type \n', production['type_fault_rate'])

# complete by knn ################
Exemple #2
0
from preprocess.load_data.data_loader import load_production
production_tb = load_production()

# The book's code starts from the line below
# Import SMOTE from the library
from imblearn.over_sampling import SMOTE

# Configure SMOTE
# sampling_strategy (named `ratio` in old imbalanced-learn releases; that
# keyword has since been removed) sets how far the minority class is grown
# relative to the majority class
# ('auto' grows it to the same count; 0.5 grows it up to 50%)
# k_neighbors is SMOTE's k parameter
# random_state is the seed of the random number generator
sm = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=71)

# Run the oversampling
# fit_resample is the current name of the method formerly called fit_sample,
# which has been removed from imbalanced-learn.
blance_data, balance_target = sm.fit_resample(
    production_tb[['length', 'thickness']],
    production_tb['fault_flg'],
)
from preprocess.load_data.data_loader import load_production
production = load_production()

# The book's code starts from the line below
# Number of faults per product type. Only rows whose fault_flg is truthy are
# counted, so a type without any faulty row may have no entry in the result.
fault_cnt_per_type = (
    production
    .query('fault_flg')
    .groupby('type')['fault_flg']
    .count()
)

# Number of products manufactured per product type
type_cnt = production.groupby('type')['fault_flg'].count()

# Fault rate of each row's type with the row itself left out: the row's own
# flag is subtracted from the type's fault count and 1 from the type's
# product count.
# - Values are read by column label; integer keys on a label-indexed row rely
#   on a deprecated positional fallback.
# - .get(..., 0) is used because a type with no faulty row is missing from
#   fault_cnt_per_type, and plain indexing would raise KeyError for it.
# - A type with a single product makes the denominator 0.
production['type_fault_rate'] = production[['type', 'fault_flg']].apply(
    lambda row: (
        (fault_cnt_per_type.get(row['type'], 0) - int(row['fault_flg']))
        / (type_cnt[row['type']] - 1)
    ),
    axis=1,
)