# NOTE(review): this section relies on `customer_tb`, `pd` and `np` being
# defined earlier in the file, and presumably on the 'over_60' category having
# already been added to `age_rank` -- confirm against the preceding code.
print('add over60 category', customer_tb['age_rank'].cat.categories)

# Aggregate the 60/70/80 age ranks into the single 'over_60' category.
customer_tb.loc[
    customer_tb['age_rank'].isin([60.0, 70.0, 80.0]), 'age_rank'
] = 'over_60'
print('convert data aggreageted', customer_tb)

# Drop categories that are no longer used.  The result is assigned back
# instead of passing `inplace=True`, because current pandas releases no
# longer accept that keyword on the `.cat` accessor methods.
customer_tb['age_rank'] = customer_tb['age_rank'].cat.remove_unused_categories()

# combine category ################
# Build "<age decade>_<sex>" labels (e.g. an age of 25 becomes "20_<sex>").
# Iterating over the two named columns avoids positional `row[0]` / `row[1]`
# access on a label-indexed row, which pandas has deprecated.
customer_tb['sex_and_age'] = pd.Categorical([
    '{}_{}'.format(int(np.floor(age / 10) * 10), sex)
    for sex, age in zip(customer_tb['sex'], customer_tb['age'])
])
print('combine category \n', customer_tb['sex_and_age'])

# generate numerical category ################
from preprocess.load_data.data_loader import load_production
production = load_production()

# Number of faulty products for each type.  Types without any fault are
# absent from this Series because the query filters their rows out.
fault_cnt_per_type = (
    production
    .query('fault_flg')
    .groupby('type')['fault_flg']
    .count()
)
print('a number of fault for each type \n', fault_cnt_per_type)

# Number of products for each type.
type_cnt = production.groupby('type')['fault_flg'].count()

# Leave-one-out fault rate: for every row, the fault rate of its type computed
# over all *other* rows of that type (the row's own flag is subtracted from
# the numerator and 1 from the denominator).
# `.get(..., 0)` covers types that have no faults at all; a plain lookup
# would raise KeyError for them.
production['type_fault_rate'] = [
    (fault_cnt_per_type.get(product_type, 0) - int(is_fault))
    / (type_cnt[product_type] - 1)
    for product_type, is_fault
    in zip(production['type'], production['fault_flg'])
]
print('a number of products for each type \n', production['type_fault_rate'])

# complete by knn ################
from preprocess.load_data.data_loader import load_production
production_tb = load_production()

# The book's example starts on the line below.
# Load the SMOTE class from the imbalanced-learn library.
from imblearn.over_sampling import SMOTE

# SMOTE settings:
# - The sampling ratio decides how far the minority class is grown relative to
#   the majority class ('auto' grows it to the same size; 0.5 would grow it
#   to 50% of the majority class).  The keyword was called `ratio` in older
#   imbalanced-learn releases and `sampling_strategy` in newer ones; 'auto' is
#   the default under both names, so it is left implicit here to keep the
#   call valid for either API.
# - k_neighbors is SMOTE's k parameter.
# - random_state is the seed of the random number generator.
sm = SMOTE(k_neighbors=5, random_state=71)

# Run the oversampling.  Newer imbalanced-learn exposes `fit_resample`, older
# releases only `fit_sample`; use whichever this installation provides.
_resample = sm.fit_resample if hasattr(sm, 'fit_resample') else sm.fit_sample

# NOTE: `blance_data` (sic) keeps its original spelling because other code may
# refer to this module-level name.
blance_data, balance_target = _resample(
    production_tb[['length', 'thickness']],
    production_tb['fault_flg'],
)
from preprocess.load_data.data_loader import load_production
production = load_production()

# The book's example starts on the line below.
# Number of faulty products for each product type.  Types without any fault
# are absent from this Series because the query filters their rows out.
fault_cnt_per_type = (
    production
    .query('fault_flg')
    .groupby('type')['fault_flg']
    .count()
)

# Number of manufactured products for each product type.
type_cnt = production.groupby('type')['fault_flg'].count()

# Leave-one-out fault rate: for every row, the fault rate of its type computed
# over all *other* rows of that type (the row's own flag is subtracted from
# the numerator and 1 from the denominator).
# - `.get(..., 0)` covers types that have no faults at all; a plain lookup
#   would raise KeyError for them.
# - Iterating over the named columns avoids positional `row[0]` / `row[1]`
#   access on a label-indexed row, which pandas has deprecated.
production['type_fault_rate'] = [
    (fault_cnt_per_type.get(product_type, 0) - int(is_fault))
    / (type_cnt[product_type] - 1)
    for product_type, is_fault
    in zip(production['type'], production['fault_flg'])
]