import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Project-local helpers (FileIO, CategoryEncode, CountRecord, ExtractColumns,
# Binning, Scaler, Test, DropNaN, DrawChart2, ...) are assumed to be
# importable from the surrounding package; they are not shown in this section.


class PreprocClassify:
    def __init__(self, id_path, con_path, char_type):
        self.file_io = FileIO()
        self.encode = CategoryEncode()
        self.count_rec = CountRecord()
        self.extract_col = ExtractColumns()
        self.bin = Binning()
        self.ss = Scaler()
        # Open the input files
        self.id = self.file_io.open_file_as_pandas(id_path, char_type)
        self.con = self.file_io.open_file_as_pandas(con_path, char_type)

    def make_class_data(self, out_path):
        '''Derive the target class and merge it into the analysis file.'''
        # Drop rows with sales <= 0
        #org_df = self.con.drop(self.con[self.con['売上'] <= 0].index)
        # Extract the target-variable column
        cust_attr_col_list = []       # initialize the list of extracted columns
        cust_attr_tg_list = ['売上']  # target column to extract
        cust_con_col = self.extract_col.extract(
            self.con, self.con['顧客ID'], extract_col=cust_attr_tg_list)
        # Drop the customer-ID column, which is no longer needed
        cust_con_col = cust_con_col.drop(['顧客ID'], axis=1)
        # Fill missing values with zero
        cust_con_col = cust_con_col.fillna(0)
        # Standardize the extracted target column (mean 0, variance 1)
        std_cust_con_col = self.ss.sl_standard_scaler(cust_con_col, data_type='float')
        # Classify each standardized value as below or above the mean
        type_bins = [-1, 0, 1]        # range (-1, 1), split at 0
        type_bin_label_list = [0, 1]  # below 0: low, above 0: high
        type_col = self.bin.list_divide(
            std_cust_con_col['売上'], type_bins, type_bin_label_list)
        # Build the classification data
        type_df = pd.DataFrame(data=type_col,
                               index=std_cust_con_col.index)  # wrap the binned array in a DataFrame
        type_df.columns = ['クラス']  # rename the column
        # Merge the class labels into the existing analysis data
        type_df = pd.concat([self.id, type_df], axis=1)             # attach the ID column
        con = pd.merge(self.con, type_df, on='顧客ID', how='left')  # join onto the existing DataFrame
        # Write out
        self.file_io.export_csv_from_pandas(con, out_path)
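
# A minimal usage sketch for PreprocClassify; the paths and encoding below are
# hypothetical, and the project-local helpers above are assumed importable.
def _preproc_classify_demo():
    preproc = PreprocClassify(id_path='./data/in/id.csv',
                              con_path='./data/out/con.csv',
                              char_type='utf-8')
    preproc.make_class_data('./data/out/class.csv')
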
class ExtractReserve:
    def __init__(self, in_path, in_char, out_path, out_char, reg_type_path):
        self.count_rec = CountRecord()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.out_path = out_path
        self.out_char = out_char
        self.reg_type_path = reg_type_path

    def extract(self):
        # Open the input file
        file = self.file_io.open_file_as_pandas(self.in_path, self.in_char)
        # Aggregation 1: count records keyed on customer ID, status,
        # and nomination type
        status = self.count_rec.group_size(
            file, index_col='顧客ID', aggregate_col=['顧客ID', '状況', '指名区分'])
        # Aggregation 2: keep one registration-type row per customer ID
        register_type = self.count_rec.drop_duplicates(
            file, index_col='顧客ID', keep_list=['顧客ID', '登録区分'])
        # Write out
        self.file_io.export_csv_from_pandas(status, self.out_path)
        self.file_io.export_csv_from_pandas(register_type, self.reg_type_path)
        # Reopen the output to attach headers
        out_file = self.file_io.open_file_as_pandas(self.out_path, self.out_char)
        # Attach headers
        out_file.columns = ['顧客ID', '状況', '指名区分', '予約回数']
        # Write out again
        self.file_io.export_csv_from_pandas(out_file, self.out_path)
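
# For reference: a plausible plain-pandas equivalent of the two aggregations
# above, assuming CountRecord.group_size wraps groupby(...).size() and
# drop_duplicates keeps the first row per customer. This is a sketch of the
# intent, not the project's actual implementation.
def _reserve_sketch(df):
    status = (df.groupby(['顧客ID', '状況', '指名区分'])
                .size().reset_index(name='予約回数'))
    register_type = df[['顧客ID', '登録区分']].drop_duplicates(subset='顧客ID')
    return status, register_type
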
class ExtractCancel:
    def __init__(self, in_path, in_char, out_path, out_char):
        self.count_rec = CountRecord()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.out_path = out_path
        self.out_char = out_char

    def extract(self):
        # Open the input file
        file = self.file_io.open_file_as_pandas(self.in_path, self.in_char)
        # Aggregate: count cancellation records per customer
        vc = self.count_rec.count_record(file, '顧客ID')
        # Write out
        self.file_io.export_csv_from_pandas(vc, self.out_path)
        # Reopen the output to attach headers
        out_file = self.file_io.open_file_as_pandas(self.out_path, self.out_char)
        # Attach headers
        out_file.columns = ['顧客ID', 'キャンセル回数']
        # Write out again
        self.file_io.export_csv_from_pandas(out_file, self.out_path)
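
# For reference: count_record above is presumably a per-customer row count;
# a plain-pandas sketch of the same aggregation (an assumption about
# CountRecord's internals, not the project's actual implementation):
def _cancel_count_sketch(df):
    return df['顧客ID'].value_counts().rename('キャンセル回数')
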
class ExtractLog:
    def __init__(self, in_path, in_char, stay_time_path, out_char,
                 pv_sum_path, session_path):
        self.count_rec = CountRecord()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.stay_time_path = stay_time_path
        self.out_char = out_char
        self.pv_sum_path = pv_sum_path
        self.session_path = session_path

    def extract(self):
        # Open the input file
        file = self.file_io.open_file_as_pandas(self.in_path, self.in_char)
        # Drop unneeded columns
        file = file.drop(['IPアドレス', 'メソッド', 'パス', 'HTTPバージョン',
                          'ファイル名', 'レスポンスバイト数', 'リファラ',
                          'ユーザーエージェント', 'レスポンスタイム'], axis=1)
        # Parse the timestamp column as datetime
        file['アクセス日時_unix'] = pd.to_datetime(file['アクセス日時'])
        # Gap (in seconds) to the next access; note that .dt.seconds is the
        # seconds component of the timedelta, so gaps of a day or more wrap
        file['アクセス間隔'] = (file['アクセス日時_unix'].shift(-1)
                                - file['アクセス日時_unix']).dt.seconds
        # Whether the next row belongs to the same customer
        file['顧客ID同一当否'] = (file['顧客ID'].shift(-1) == file['顧客ID'])
        # Zero the access gap where the customer IDs differ
        file.loc[~file['顧客ID同一当否'], 'アクセス間隔'] = 0
        # Flag accesses that belong to the same session
        file.loc[file['顧客ID同一当否'], 'セッションフラグ'] = 1
        # Total stay time per customer
        stay_time = self.count_rec.group_sum(file, index_col='顧客ID',
                                             aggregate_col='アクセス間隔')
        # Total page views per customer
        pv_sum = self.count_rec.count_record(file, '顧客ID')
        # Session count per customer
        same_session = self.count_rec.group_sum(file, index_col='顧客ID',
                                                aggregate_col='セッションフラグ')
        # Write out
        #self.file_io.export_csv_from_pandas(file, './data/out/log.csv')
        self.file_io.export_csv_from_pandas(stay_time, self.stay_time_path)
        self.file_io.export_csv_from_pandas(pv_sum, self.pv_sum_path)
        self.file_io.export_csv_from_pandas(same_session, self.session_path)
        # Reopen the outputs to attach headers
        out_file1 = self.file_io.open_file_as_pandas(self.stay_time_path, self.out_char)
        out_file2 = self.file_io.open_file_as_pandas(self.pv_sum_path, self.out_char)
        out_file3 = self.file_io.open_file_as_pandas(self.session_path, self.out_char)
        # Attach headers
        out_file1.columns = ['顧客ID', '滞在時間']
        out_file2.columns = ['顧客ID', '閲覧ページ総数']
        out_file3.columns = ['顧客ID', '閲覧ページ数/セッション']
        # Write out again
        self.file_io.export_csv_from_pandas(out_file1, self.stay_time_path)
        self.file_io.export_csv_from_pandas(out_file2, self.pv_sum_path)
        self.file_io.export_csv_from_pandas(out_file3, self.session_path)
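
# A small worked example of the shift(-1) session logic above, on hypothetical
# data. Note that .dt.seconds is the seconds component of the timedelta, so
# gaps longer than a day wrap around; .dt.total_seconds() would avoid that.
def _session_gap_demo():
    log = pd.DataFrame({
        '顧客ID': [1, 1, 2],
        'アクセス日時': ['2019-01-01 10:00:00',
                         '2019-01-01 10:00:30',
                         '2019-01-01 11:00:00'],
    })
    log['アクセス日時_unix'] = pd.to_datetime(log['アクセス日時'])
    log['アクセス間隔'] = (log['アクセス日時_unix'].shift(-1)
                           - log['アクセス日時_unix']).dt.seconds
    log['顧客ID同一当否'] = log['顧客ID'].shift(-1) == log['顧客ID']
    log.loc[~log['顧客ID同一当否'], 'アクセス間隔'] = 0
    # Row 0 keeps its 30-second gap; rows at customer boundaries (and the
    # final row) are zeroed.
    return log
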
class LinRegression:
    def __init__(self):
        self.lr = LinearRegression()
        self.file_io = FileIO()
        #self.pca = PCAProcess()
        #self.chart = DrawChart()
        self.test = Test()
        #self.individual = IndividualTest()
        #self.sc = StandardScaler()
        #self.ms = MinMaxScaler()
        self.ss = Scaler()
        self.drop_na = DropNaN()

    def regression(self, in_path, out_path):
        # Open the input files
        org_df = self.file_io.open_file_as_pandas(in_path, 'utf-8')
        feat_shop = self.file_io.open_file_as_pandas('./data/out/feat_shop.csv', 'utf-8')
        feat_pref = self.file_io.open_file_as_pandas('./data/out/feat_pref.csv', 'utf-8')
        '''
        # Target variable
        org_df['支払合計'] = org_df['現金外支払合計'] + org_df['現金支払合計']
        # Drop explanatory variables that are no longer needed
        org_df = org_df.drop(['現金外支払合計', '現金支払合計'], axis=1)
        # Drop rows whose target is zero or below
        org_df = org_df.drop(org_df[org_df['支払合計'] == 0].index)
        # Drop columns with too many missing values
        #org_df = org_df.drop(['売上単価'], axis=1)
        # Drop rows whose target is missing
        org_df = org_df.dropna(subset=['支払合計'])
        '''
        # Add shop / prefecture features
        #org_df = pd.merge(org_df, feat_shop, on='顧客ID', how='left')
        org_df = pd.merge(org_df, feat_pref, on='顧客ID', how='left')
        org_df = org_df.drop(['Unnamed: 0_x', 'Unnamed: 0_y'], axis=1)
        org_df = org_df[org_df.columns.drop(list(org_df.filter(regex='Unnamed:')))]
        # Drop rows with sales <= 0
        org_df = org_df.drop(org_df[org_df['売上'] <= 0].index)
        # Drop unneeded columns
        #org_df = org_df.drop(['Unnamed: 0', '顧客ID'], axis=1)
        org_df = org_df.drop(['顧客ID'], axis=1)
        #org_df = org_df[org_df.columns.drop(list(org_df.filter(regex='Unnamed:')))]
        #org_df = org_df.columns.drop(org_df.columns.str.contains('Unnamed:'))
        # Drop columns that are at least 70% missing
        #org_df = self.drop_na.drop_na_col(org_df, len(org_df), 0.7)
        #print('\n rows of org_df is:')
        #print(len(org_df))
        #print(type(len(org_df)))
        # Fill missing values with zero
        org_df = org_df.fillna(0)
        # Target Y and explanatory variables X
        Y = org_df['売上']
        #Y = org_df['スコア']
        #X = org_df.drop(['支払合計'], axis=1)
        X = org_df.drop(['売上単価', '数量', '売上'], axis=1)
        #X = org_df.drop(['商品コード', '売上単価', '数量', '売上', '明細ID', 'スコア'], axis=1)
        X = X.drop(['キャンセル回数', 'コンタクト回数', '問い合わせ回数'], axis=1)
        #X = X.drop(['治療送客回数_あり', '治療送客回数_なし', '院長挨拶回数_あり', '院長挨拶回数_なし', '紹介カード受渡回数_あり', '紹介カード受渡回数_なし', '携帯TEL_有', '携帯メール_有', '性別_女', '性別_男', '自宅TEL_有', 'PCメール_有'], axis=1)
        #X = X.drop(['職業_学生', '職業_会社員', '職業_主婦', '職業_自営業', '職業_その他', '職業_パート・アルバイト'], axis=1)
        X = X.drop(['登録区分_HP', '登録区分_店舗', '登録区分_CC'], axis=1)
        X = X.drop(['生年月日', '滞在時間', '閲覧ページ総数', '閲覧ページ数/セッション'], axis=1)
        X = X.drop(['治療送客回数_空欄', '指名回数_空欄', 'コース受諾回数_空欄',
                    '紹介カード受渡回数_空欄', '院長挨拶回数_空欄', '性別_空欄',
                    '携帯TEL_空欄', '自宅TEL_空欄', '携帯メール_空欄',
                    'PCメール_空欄', '職業_空欄', '登録区分_空欄'], axis=1)
        X = X[X.columns.drop(list(org_df.filter(regex='_nan')))]
        X = X[X.columns.drop(list(org_df.filter(regex='_なし')))]
        #X = X[X.columns.drop(list(org_df.filter(regex='_空欄')))]
        X = X[X.columns.drop(list(org_df.filter(regex='_無')))]
        X = X[X.columns.drop(list(org_df.filter(regex='_削除')))]
        X = X[X.columns.drop(list(org_df.filter(regex='施術時間')))]
        X = X[X.columns.drop(list(org_df.filter(regex='性別_男')))]
        X = X[X.columns.drop(list(org_df.filter(regex='性別_女')))]
        X = X[X.columns.drop(list(org_df.filter(regex='携帯TEL_有')))]
        X = X[X.columns.drop(list(org_df.filter(regex='治療送客回数_あり')))]
        X = X[X.columns.drop(list(org_df.filter(regex='紹介カード受渡回数_あり')))]
        X = X[X.columns.drop(list(org_df.filter(regex='町域_')))]  # barely affects the result, so dropped
        # Standardize
        std_X = self.ss.standard_scaler(X, axis=1, data_type='float')
        #std_Y = pd.DataFrame(self.sc.fit_transform(Y))
        #std_Y.columns = Y.columns
        #std_X = pd.DataFrame(self.sc.fit_transform(X))
        #std_X.columns = X.columns
        # Normalize
        #norm_Y = pd.DataFrame(self.ms.fit_transform(Y))
        #norm_Y.columns = Y.columns
        #norm_X = pd.DataFrame(self.ms.fit_transform(X))
        #norm_X.columns = X.columns
        #self.file_io.export_csv_from_pandas(X, './data/out/X.csv')
        # Split into training and test data (30% held out for testing)
        X_train, X_test, Y_train, Y_test = self.test.make_train_test_data(std_X, Y, 0.3)
        #X_train, X_test, Y_train, Y_test = self.test.make_train_test_data(X, Y, 0.3)
        print(X_train.head())
        print("--- X_train's shape ---\n {}\n".format(X_train.shape))
        print(X_test.head())
        print("--- X_test's shape ---\n {}\n".format(X_test.shape))
        print(Y_train.head())
        print("--- Y_train's shape ---\n {}\n".format(Y_train.shape))
        print(Y_test.head())
        print("--- Y_test's shape ---\n {}\n".format(Y_test.shape))
        # Fit the multiple linear regression
        self.lr.fit(X_train, Y_train)
        # Partial regression coefficients
        print(pd.DataFrame({'Name': X.columns,
                            'Coefficients': self.lr.coef_}).sort_values(by='Coefficients'))
        # Intercept
        print(self.lr.intercept_)
        # Build a DataFrame of the coefficients
        org_pd = pd.DataFrame({'Name': X.columns, 'Coefficients': self.lr.coef_})
        # Write out
        self.file_io.export_csv_from_pandas(org_pd, out_path)
        # Report accuracy on the training data
        print(" --- train score ---\n {}\n".format(self.lr.score(X_train, Y_train)))
        # ... and on the test data
        print(" --- test score ---\n {}\n".format(self.lr.score(X_test, Y_test)))
        return self.lr.score(X_train, Y_train), self.lr.score(X_test, Y_test)
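
# Test.make_train_test_data is not shown in this section. The sketch below is
# a plausible equivalent (the name TestSketch is hypothetical, chosen to avoid
# shadowing the real Test class), assuming it wraps sklearn's train_test_split:
from sklearn.model_selection import train_test_split

class TestSketch:
    def make_train_test_data(self, X, Y, test_ratio):
        # random_state is fixed here only for reproducibility (an assumption)
        return train_test_split(X, Y, test_size=test_ratio, random_state=0)
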
class ExtractSalesSp:
    def __init__(self, in_path, in_char, payment_path, out_char,
                 cust_attr_path, target_attr_path, average_attr_path):
        self.count_rec = CountRecord()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.payment_path = payment_path
        self.out_char = out_char
        self.cust_attr_path = cust_attr_path
        self.target_attr_path = target_attr_path
        self.average_attr_path = average_attr_path

    def extract(self):
        # Open the input file
        file = self.file_io.open_file_as_pandas(self.in_path, self.in_char)
        # Customer-attribute preprocessing: keep only the sales rows
        # (detail code 1), folding per-product rows into the sale
        sales_file = file.query('明細コード == 1')
        # Aggregate payment information per customer ID
        cust_payment = self.count_rec.group_sum(
            sales_file, index_col='顧客ID', aggregate_col=['顧客ID', '施術時間'])
        # Aggregate attribute information per customer ID
        ex_id = sales_file['顧客ID']
        ex_nominate = sales_file['指名回数']
        ex_course = sales_file['コース受諾回数']
        ex_card = sales_file['紹介カード受渡回数']
        ex_reception = sales_file['治療送客回数']
        ex_director = sales_file['院長挨拶回数']
        # Additional customer attributes
        #ex_branch = sales_file['店舗']
        #ex_accosiate = sales_file['担当者']
        # Merge
        cust_attr = pd.concat([ex_id, ex_nominate], axis=1)
        cust_attr = pd.concat([cust_attr, ex_course], axis=1)
        cust_attr = pd.concat([cust_attr, ex_card], axis=1)
        cust_attr = pd.concat([cust_attr, ex_reception], axis=1)
        cust_attr = pd.concat([cust_attr, ex_director], axis=1)
        cust_attr = pd.concat([cust_attr, cust_payment], axis=1)
        #cust_attr = self.cont_rec.group_size(sales_file, index_col='顧客ID', keep_list=['顧客ID', '指名回数', 'コース受託回数', '紹介カード受渡回数', '治療送客回数', '院長挨拶回数'])
        # Aggregate per-product attribute information per customer ID
        ex_id_product = file['顧客ID']
        ex_product_code = file['商品コード']
        ex_price_product = file['売上単価']
        ex_amount_product = file['数量']
        # Merge
        product_attr = pd.concat([ex_id_product, ex_product_code], axis=1)
        product_attr = pd.concat([product_attr, ex_price_product], axis=1)
        product_attr = pd.concat([product_attr, ex_amount_product], axis=1)
        # Add a sales column
        product_attr['売上'] = file['売上単価'] * file['数量']
        # Add a column that serves as the per-product detail ID
        product_attr['明細ID'] = file['伝票コード'] * 10 + file['明細コード']
        # Initialize the score column
        product_attr['スコア'] = 0
        # Assign scores per product code
        product_attr.loc[product_attr['商品コード'] == '1A1501', 'スコア'] = 5
        product_attr.loc[product_attr['商品コード'] == '1B2201', 'スコア'] = 4
        product_attr.loc[product_attr['商品コード'] == '1A1601', 'スコア'] = 3
        product_attr.loc[product_attr['商品コード'] == '200071', 'スコア'] = 2
        product_attr.loc[product_attr['商品コード'] == '200006', 'スコア'] = 1
        product_attr['スコア'] = product_attr['スコア'] * product_attr['数量']
        # Drop unneeded rows
        #product_attr = product_attr[(product_attr['商品コード'] == '1A1501') | (product_attr['商品コード'] == '1B2201') | (product_attr['商品コード'] == '1A1601') | (product_attr['商品コード'] == '200071') | (product_attr['商品コード'] == '200006')]
        # Write out
        self.file_io.export_csv_from_pandas(cust_payment, self.payment_path)
        self.file_io.export_csv_from_pandas(cust_attr, self.cust_attr_path)
        #self.file_io.export_csv_from_pandas(target_attr, self.target_attr_path)
        self.file_io.export_csv_from_pandas(product_attr, self.average_attr_path)
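
# The per-code .loc assignments above can be written more compactly with a
# mapping. This refactoring sketch uses the same codes and weights as the
# source and does not change behavior:
def _score_sketch(product_attr):
    score_map = {'1A1501': 5, '1B2201': 4, '1A1601': 3, '200071': 2, '200006': 1}
    product_attr['スコア'] = (product_attr['商品コード'].map(score_map).fillna(0)
                              * product_attr['数量'])
    return product_attr
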
class IndividualTest:
    def __init__(self):
        self.test = Test()
        self.file_io = FileIO()
        self.lr = LinearRegression(normalize=True)
        self.br = BayesianRidge()
        #self.svr_lin = SVR(kernel='linear', C=1e5)
        self.svr_poly = SVR(kernel='poly', C=1e5, degree=2)
        self.svr_rbf = SVR(kernel='rbf', C=5e4, gamma='scale')
        self.svr_sig = SVR(kernel='sigmoid', C=1e3)
        #self.gridsearch = GridSearchCV(SVR(kernel='rbf'), scoring="r2", return_train_score=True)
        self.sc = StandardScaler()
        self.ms = MinMaxScaler()
        self.chart = DrawChart2()

    def lin_reg(self, X, Y, train_test_ratio, col_list, out_path):
        # Create an empty DataFrame for the results
        df = pd.DataFrame(index=['coefficient', 'intercept',
                                 'train_score', 'test_score'],
                          columns=[])
        #print(df.head())
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            # Split into training and test data (30% held out for testing)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            # Simple regression, one column at a time
            self.lr.fit(s_X_train, s_Y_train)
            # Regression coefficient
            coef = self.lr.coef_
            # Intercept
            intercept = self.lr.intercept_
            # Training score
            train_score = self.lr.score(s_X_train, s_Y_train)
            # Test score
            test_score = self.lr.score(s_X_test, s_Y_test)
            # Append to the results DataFrame
            df[col] = [coef, intercept, train_score, test_score]
            # Regression curve
            lin_pred = self.lr.predict(s_X_test)
            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'go-')
            plt.show()
            #if col in ['売上単価', 'コース受諾回数_なし', '数量', '施術時間', '指名回数_あり', '治療送客回数_あり', '治療送客回数_なし']:
                # Draw the chart
                #self.chart.draw(self.lr, s_X_test, s_Y_test, col, 'score is {}'.format(test_score))
        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)

    def bayesian_reg(self, X, Y, train_test_ratio, col_list, out_path):
        # Create an empty DataFrame for the results
        df = pd.DataFrame(index=['coefficient', 'intercept',
                                 'train_score', 'test_score'],
                          columns=[])
        #print(df.head())
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            # Split into training and test data (30% held out for testing)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            # Simple regression, one column at a time
            self.br.fit(s_X_train, s_Y_train)
            # Regression coefficient
            coef = self.br.coef_
            # Intercept
            intercept = self.br.intercept_
            # Training score
            train_score = self.br.score(s_X_train, s_Y_train)
            # Test score
            test_score = self.br.score(s_X_test, s_Y_test)
            # Append to the results DataFrame
            df[col] = [coef, intercept, train_score, test_score]
            if col in ['売上単価', 'コース受諾回数_なし', '数量', '施術時間',
                       '指名回数_あり', '治療送客回数_あり', '治療送客回数_なし']:
                # Draw the chart
                self.chart.draw(self.br, s_X_test, s_Y_test, col,
                                'score is {}'.format(test_score))
        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_rbf_reg(self, X, Y, train_test_ratio, col_list, out_path):
        # Create an empty DataFrame for the results
        df = pd.DataFrame(index=['coefficient', 'support_vector', 'intercept',
                                 'train_score', 'test_score'],
                          columns=[])
        #print(df.head())
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            # Split into training and test data (30% held out for testing)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            # Regression, one column at a time
            #self.svr_lin.fit(s_X_train, s_Y_train)
            #self.svr_poly.fit(s_X_train, s_Y_train)
            self.svr_rbf.fit(s_X_train, s_Y_train)
            #self.gridsearch.fit(s_X_train, s_Y_train)
            # Dual coefficients
            coef = self.svr_rbf.dual_coef_
            # Support vectors
            support_vec = self.svr_rbf.support_vectors_
            # Intercept
            intercept = self.svr_rbf.intercept_
            # Scores
            train_score = self.svr_rbf.score(s_X_train, s_Y_train)
            test_score = self.svr_rbf.score(s_X_test, s_Y_test)
            # Append to the results DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]
            #lin_pred = self.svr_lin.predict(s_X_test)
            #poly_pred = self.svr_poly.predict(s_X_test)
            rbf_pred = self.svr_rbf.predict(s_X_test)
            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
            plt.show()
            if col in ['生年月日']:
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'ro-')
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, poly_pred, 'yo-')
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
                plt.show()
        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_poly_reg(self, X, Y, train_test_ratio, col_list, out_path):
        # Create an empty DataFrame for the results
        df = pd.DataFrame(index=['coefficient', 'support_vector', 'intercept',
                                 'train_score', 'test_score'],
                          columns=[])
        #print(df.head())
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            # Split into training and test data (30% held out for testing)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            # Regression, one column at a time
            self.svr_poly.fit(s_X_train, s_Y_train)
            # Dual coefficients
            coef = self.svr_poly.dual_coef_
            # Support vectors
            support_vec = self.svr_poly.support_vectors_
            # Intercept
            intercept = self.svr_poly.intercept_
            # Scores
            train_score = self.svr_poly.score(s_X_train, s_Y_train)
            test_score = self.svr_poly.score(s_X_test, s_Y_test)
            # Append to the results DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]
            #lin_pred = self.svr_lin.predict(s_X_test)
            poly_pred = self.svr_poly.predict(s_X_test)
            if col in ['生年月日']:
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'ro-')
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, poly_pred, 'yo-')
                plt.show()
        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_sig_reg(self, X, Y, train_test_ratio, col_list, out_path):
        # Create an empty DataFrame for the results
        df = pd.DataFrame(index=['coefficient', 'support_vector', 'intercept',
                                 'train_score', 'test_score'],
                          columns=[])
        #print(df.head())
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            # Split into training and test data (30% held out for testing)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            # Regression, one column at a time
            self.svr_sig.fit(s_X_train, s_Y_train)
            # Dual coefficients
            coef = self.svr_sig.dual_coef_
            # Support vectors
            support_vec = self.svr_sig.support_vectors_
            # Intercept
            intercept = self.svr_sig.intercept_
            # Scores
            train_score = self.svr_sig.score(s_X_train, s_Y_train)
            test_score = self.svr_sig.score(s_X_test, s_Y_test)
            # Append to the results DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]
            sig_pred = self.svr_sig.predict(s_X_test)
            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, sig_pred, 'go-')
            plt.show()
            if col in ['生年月日', '閲覧ページ総数', '閲覧ページ数/セッション', '滞在時間']:
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, sig_pred, 'go-')
                plt.show()
        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)
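
# The commented-out GridSearchCV in __init__ was never wired up; a minimal
# working version under the same scoring settings. The parameter grid below
# is an assumption for illustration, not from the source:
from sklearn.model_selection import GridSearchCV

def _svr_gridsearch_sketch(X_train, Y_train):
    param_grid = {'C': [1e3, 1e4, 5e4], 'gamma': ['scale', 0.01, 0.001]}
    gs = GridSearchCV(SVR(kernel='rbf'), param_grid,
                      scoring='r2', return_train_score=True)
    gs.fit(X_train, Y_train)
    return gs.best_params_, gs.best_score_
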
class ConcatCsvs:
    def __init__(self, id_path, cust_payment_path, cust_attr_path,
                 target_attr_path, average_attr_path, cust_path, cancel_path,
                 contact_path, cti_path, register_type_path, status_path,
                 stay_time_path, pv_sum_path, session_path, shop_path,
                 pref_path, char_type):
        self.file_io = FileIO()
        self.encode = CategoryEncode()
        self.count_rec = CountRecord()
        self.extract_col = ExtractColumns()
        self.bin = Binning()
        # Open the input files
        self.id = self.file_io.open_file_as_pandas(id_path, char_type)
        self.cust_payment = self.file_io.open_file_as_pandas(cust_payment_path, char_type)
        self.cust_attr = self.file_io.open_file_as_pandas(cust_attr_path, char_type)
        self.target_attr = self.file_io.open_file_as_pandas(target_attr_path, char_type)
        self.average_attr = self.file_io.open_file_as_pandas(average_attr_path, char_type)
        self.cust = self.file_io.open_file_as_pandas(cust_path, char_type)
        self.cancel = self.file_io.open_file_as_pandas(cancel_path, char_type)
        self.contact = self.file_io.open_file_as_pandas(contact_path, char_type)
        self.cti = self.file_io.open_file_as_pandas(cti_path, char_type)
        self.register_type = self.file_io.open_file_as_pandas(register_type_path, char_type)
        self.status = self.file_io.open_file_as_pandas(status_path, char_type)
        self.stay_time = self.file_io.open_file_as_pandas(stay_time_path, char_type)
        self.pv_sum = self.file_io.open_file_as_pandas(pv_sum_path, char_type)
        self.session = self.file_io.open_file_as_pandas(session_path, char_type)
        self.shop = self.file_io.open_file_as_pandas(shop_path, char_type)
        self.pref = self.file_io.open_file_as_pandas(pref_path, char_type)

    def concat(self, out_path, out_path2):
        # Feature extraction
        # cust_payment: no categorical data
        # --- check ---
        #print("--- cust_payment shape ---\n {}\n".format(self.cust_payment.shape))
        #print(self.cust_payment.head())

        # cust_attr
        cust_attr_col_list = []
        cust_attr_tg_list = ['指名回数', 'コース受諾回数', '紹介カード受渡回数',
                             '治療送客回数', '院長挨拶回数']
        # Extract the categorical columns
        cust_attr_category_col = self.extract_col.extract(
            self.cust_attr, self.cust_attr['顧客ID'],
            extract_col=cust_attr_tg_list)
        # Extract the non-categorical columns
        cust_attr_non_category_col = self.extract_col.exclude(
            self.cust_attr, exclude_col=cust_attr_tg_list)
        # Feature extraction
        org_cust_attr = self.encode.transform_feature(
            cust_attr_category_col, aggregate_col=cust_attr_tg_list)
        org_cust_attr = org_cust_attr.fillna(0)
        #org_cust_attr = org_cust_attr.drop('Unnamed: 0', axis=1)
        # Attach labels (the for-else appends the ID label after the loop ends)
        for col in cust_attr_tg_list:
            cust_attr_col_list += self.encode.transform_label(self.cust_attr[col], col)
        else:
            cust_attr_col_list += ['顧客ID']
        # Set the labels
        org_cust_attr.columns = cust_attr_col_list
        # Aggregate
        feat_cust_attr = self.count_rec.group_sum(
            org_cust_attr, index_col='顧客ID', aggregate_col=cust_attr_col_list)
        # Join the categorical and non-categorical columns
        feat_cust_attr = pd.merge(feat_cust_attr, cust_attr_non_category_col,
                                  on='顧客ID', how='left')
        feat_cust_attr = feat_cust_attr.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_cust_attr shape ---\n {}\n".format(feat_cust_attr.shape))
        #print(feat_cust_attr.head())
        #self.file_io.export_csv_from_pandas(feat_cust_attr, './data/out/mid_feat_cust_attr.csv')

        # product_attr
        '''
        product_attr_col_list = []
        product_attr_tg_list = ['商品コード']
        # Extract the categorical columns
        product_attr_category_col = self.extract_col.extract(self.target_attr, self.target_attr['明細ID'], extract_col=product_attr_tg_list)
        # Extract the non-categorical columns by removing the categorical ones
        product_attr_non_category_col = self.extract_col.exclude(self.target_attr, exclude_col=product_attr_tg_list)
        # Feature extraction
        org_product_attr = self.encode.transform_feature(product_attr_category_col, aggregate_col=product_attr_tg_list)
        org_product_attr = org_product_attr.fillna(0)
        #org_product_attr = org_product_attr.drop('Unnamed: 0', axis=1)
        #print(org_product_attr)
        # Attach labels
        for col in product_attr_tg_list:
            product_attr_col_list += self.encode.transform_label(self.target_attr[col], col)
        else:
            product_attr_col_list += ['明細ID']
        # Set the labels
        org_product_attr.columns = product_attr_col_list
        # Join the categorical and non-categorical columns
        feat_product_attr = pd.merge(org_product_attr, product_attr_non_category_col, on='明細ID', how='left')
        feat_product_attr = feat_product_attr.drop('Unnamed: 0', axis=1)
        '''
        # product_attr
        feat_product_attr = self.average_attr
        # --- check ---
        #print("--- feat_product_attr shape ---\n {}\n".format(feat_cust_attr.shape))
        #print(feat_product_attr.head())
        #self.file_io.export_csv_from_pandas(feat_product_attr, './data/out/mid_feat_product_attr.csv')

        # cust
        cust_col_list = []
        cust_tg_list = ['性別', '携帯TEL', '自宅TEL', '携帯メール', 'PCメール', '職業']
        # Drop outliers (birthdates containing '*')
        new_cust = self.cust.drop(self.cust[self.cust['生年月日'].str.contains(
            r'\*', na=True)].index)
        # Convert birthdate to age, then bin into decades
        today = int(pd.to_datetime('today').strftime('%Y%m%d'))
        new_cust['生年月日'] = pd.to_datetime(
            new_cust['生年月日']).dt.strftime('%Y%m%d').astype(np.int64)
        new_cust['生年月日'] = ((today - new_cust['生年月日']) / 10000).astype(np.int64)
        new_cust['生年月日'] = self.bin.list_divide(new_cust['生年月日'],
                                                    [0, 10, 20, 30, 40, 50],
                                                    ['10', '20', '30', '40', '50'])
        # Extract the categorical columns
        cust_category_col = self.extract_col.extract(new_cust, new_cust['顧客ID'],
                                                     extract_col=cust_tg_list)
        # Extract the non-categorical columns
        cust_non_category_col = self.extract_col.exclude(new_cust,
                                                         exclude_col=cust_tg_list)
        # Feature extraction
        feat_cust = self.encode.transform_feature(cust_category_col,
                                                  aggregate_col=cust_tg_list)
        feat_cust = feat_cust.fillna(0)
        #feat_cust = feat_cust.drop('Unnamed: 0', axis=1)
        feat_cust = feat_cust[feat_cust.columns.drop(
            list(feat_cust.filter(regex='Unnamed:')))]
        # Attach labels
        for col in cust_tg_list:
            cust_col_list += self.encode.transform_label(new_cust[col], col)
        else:
            cust_col_list += ['顧客ID']
        # Set the labels
        feat_cust.columns = cust_col_list
        # Join the categorical and non-categorical columns
        feat_cust = pd.merge(feat_cust, cust_non_category_col,
                             on='顧客ID', how='left')
        #feat_cust = feat_cust.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_cust shape ---\n {}\n".format(feat_cust.shape))
        #print(feat_cust.head())
        #self.file_io.export_csv_from_pandas(feat_cust, './data/out/mid_feat_cust.csv')

        # shop
        shop_col_list = []
        shop_tg_list = ['担当店舗']
        # Extract the categorical columns
        shop_category_col = self.extract_col.extract(self.shop, self.shop['顧客ID'],
                                                     extract_col=shop_tg_list)
        # Feature extraction
        feat_shop = self.encode.transform_feature(shop_category_col,
                                                  aggregate_col=shop_tg_list)
        feat_shop = feat_shop.fillna(0)
        #feat_shop = feat_cust.drop('Unnamed: 0', axis=1)
        feat_shop = feat_shop[feat_shop.columns.drop(
            list(feat_shop.filter(regex='Unnamed:')))]
        # Attach labels
        for col in shop_tg_list:
            shop_col_list += self.encode.transform_label(self.shop[col], col)
        else:
            shop_col_list += ['顧客ID']
        # Set the labels
        feat_shop.columns = shop_col_list
        #feat_shop = feat_shop.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_shop shape ---\n {}\n".format(feat_shop.shape))
        #print(feat_shop.head())
        #self.file_io.export_csv_from_pandas(feat_shop, './data/out/mid_feat_shop.csv')

        # pref
        pref_col_list = []
        pref_tg_list = ['町域']
        new_pref = self.pref.drop(self.pref[self.pref['町域'] == 0].index)
        # Extract the categorical columns
        pref_category_col = self.extract_col.extract(new_pref, new_pref['顧客ID'],
                                                     extract_col=pref_tg_list)
        # Feature extraction
        feat_pref = self.encode.transform_feature(pref_category_col,
                                                  aggregate_col=pref_tg_list)
        feat_pref = feat_pref.fillna(0)
        #feat_pref = feat_cust.drop('Unnamed: 0', axis=1)
        feat_pref = feat_pref[feat_pref.columns.drop(
            list(feat_pref.filter(regex='Unnamed:')))]
        # Attach labels
        for col in pref_tg_list:
            pref_col_list += self.encode.transform_label(self.pref[col], col)
        else:
            pref_col_list += ['顧客ID']
        # Set the labels
        feat_pref.columns = pref_col_list
        #feat_pref = feat_pref.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_pref shape ---\n {}\n".format(feat_pref.shape))
        #print(feat_pref.head())
        #self.file_io.export_csv_from_pandas(feat_pref, './data/out/mid_feat_pref.csv')

        # cancel: no categorical data
        # --- check ---
        #print("--- cancel shape ---\n {}\n".format(cancel.shape))
        #print(cancel.head())

        # contact: no categorical data
        # --- check ---
        #print("--- contact shape ---\n {}\n".format(contact.shape))
        #print(contact.head())

        # cti: no categorical data
        # --- check ---
        #print("--- cti shape ---\n {}\n".format(cti.shape))
        #print(cti.head())

        # stay_time: bin into quantiles
        new_stay_time = self.stay_time
        new_stay_time['滞在時間'] = self.bin.quant_divide(
            new_stay_time['滞在時間'], 6, ['1', '2', '3', '4', '5'])
        bin_stay_time = new_stay_time.drop('Unnamed: 0', axis=1)

        # pv_sum: bin into quantiles
        new_pv_sum = self.pv_sum
        new_pv_sum['閲覧ページ総数'] = self.bin.quant_divide(
            new_pv_sum['閲覧ページ総数'], 11,
            ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
        bin_pv_sum = new_pv_sum.drop('Unnamed: 0', axis=1)

        # session: bin into quantiles
        new_session = self.session
        new_session['閲覧ページ数/セッション'] = self.bin.quant_divide(
            new_session['閲覧ページ数/セッション'], 6, ['1', '2', '3', '4', '5'])
        bin_session = new_session.drop('Unnamed: 0', axis=1)

        # register_type
        reg_col_list = []
        reg_tg_list = ['登録区分']
        # Extract the categorical columns
        reg_category_col = self.extract_col.extract(self.register_type,
                                                    self.register_type['顧客ID'],
                                                    extract_col=reg_tg_list)
        # Extract the non-categorical columns
        reg_non_category_col = self.extract_col.exclude(self.register_type,
                                                        exclude_col=reg_tg_list)
        # Feature extraction
        feat_register_type = self.encode.transform_feature(
            reg_category_col, aggregate_col=reg_tg_list)
        feat_register_type = feat_register_type.fillna(0)
        #feat_register_type = feat_register_type.drop('Unnamed: 0', axis=1)
        # Attach labels
        for col in reg_tg_list:
            reg_col_list += self.encode.transform_label(self.register_type[col], col)
        else:
            reg_col_list += ['顧客ID']
        # Set the labels
        feat_register_type.columns = reg_col_list
        # Join the categorical and non-categorical columns
        feat_register_type = pd.merge(feat_register_type, reg_non_category_col,
                                      on='顧客ID', how='left')
        feat_register_type = feat_register_type.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_register_type shape ---\n {}\n".format(feat_register_type.shape))
        #print(feat_register_type.head())
        #self.file_io.export_csv_from_pandas(feat_register_type, './data/out/mid_feat_register_type.csv')

        # status
        stat_col_list = []
        stat_tg_list = ['状況', '指名区分']
        # Extract the categorical columns
        stat_category_col = self.extract_col.extract(self.status,
                                                     self.status['顧客ID'],
                                                     extract_col=stat_tg_list)
        # Extract the non-categorical columns
        stat_non_category_col = self.extract_col.exclude(self.status,
                                                         exclude_col=stat_tg_list)
        # Feature extraction
        feat_status = self.encode.transform_feature(stat_category_col,
                                                    aggregate_col=stat_tg_list)
        feat_status = feat_status.fillna(0)
        #feat_status = feat_status.drop('Unnamed: 0', axis=1)
        # Attach labels
        for col in stat_tg_list:
            stat_col_list += self.encode.transform_label(self.status[col], col)
        else:
            stat_col_list += ['顧客ID']
        # Set the labels
        feat_status.columns = stat_col_list
        # Join the categorical and non-categorical columns
        feat_status = pd.merge(feat_status, stat_non_category_col,
                               on='顧客ID', how='left')
        feat_status = feat_status.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_status shape ---\n {}\n".format(feat_status.shape))
        #print(feat_status.head())
        #self.file_io.export_csv_from_pandas(feat_status, './data/out/mid_feat_status.csv')

        # Join everything together
        con_file = pd.merge(feat_product_attr, self.cust_payment, on='顧客ID', how='left')
        #print("1.1: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, self.cancel, on='顧客ID', how='left')
        #print("1.2: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, self.contact, on='顧客ID', how='left')
        #print("1.3: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, self.cti, on='顧客ID', how='left')
        #print("1.4: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, bin_stay_time, on='顧客ID', how='left')
        #print("1.5: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, bin_pv_sum, on='顧客ID', how='left')
        #print("1.6: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, bin_session, on='顧客ID', how='left')
        #print("1.7: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, feat_cust_attr, on='顧客ID', how='left')
        #print("1.8: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, feat_cust, on='顧客ID', how='left')
        #print("1.9: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, feat_register_type, on='顧客ID', how='left')
        #print("1.10: shape is {}".format(con_file.shape))
        #con_file = pd.merge(con_file, feat_status, on='顧客ID', how='left')
        #print("1.11: shape is {}".format(con_file.shape))
        '''con_file = pd.concat([
            self.cust_payment, feat_cust_attr, feat_cust, self.cancel,
            self.contact, self.cti, feat_register_type, feat_status,
            self.stay_time, self.pv_sum, self.session],
            axis=1, join_axes=['顧客ID'])'''
        # --- check ---
        #print("--- con_file shape ---\n {}\n".format(con_file.shape))
        #print(con_file.head())

        # Join the per-product file
        con_product_file = pd.merge(self.id, self.cust_payment, on='顧客ID', how='left')
        con_product_file = pd.merge(con_product_file, feat_product_attr,
                                    on='顧客ID', how='left')
        #print("2.1: shape is {}".format(con_file.shape))
        # Drop duplicate rows, if any
        con_file = con_file.drop_duplicates()
        con_product_file = con_product_file.drop_duplicates()
        con_product_file = con_product_file.drop(['施術時間', '売上単価', '数量'], axis=1)
        # Write out
        self.file_io.export_csv_from_pandas(con_file, out_path)
        self.file_io.export_csv_from_pandas(con_product_file, out_path2)
        self.file_io.export_csv_from_pandas(feat_shop, './data/out/feat_shop.csv')
        self.file_io.export_csv_from_pandas(feat_pref, './data/out/feat_pref.csv')
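
# For reference: the repeated "extract categorical columns, transform_feature,
# transform_label" pattern above behaves like one-hot encoding. A plausible
# plain-pandas equivalent (an assumption about CategoryEncode's internals,
# with hypothetical data):
def _one_hot_sketch():
    cust = pd.DataFrame({'顧客ID': [1, 2],
                         '性別': ['男', '女'],
                         '職業': ['会社員', '主婦']})
    feat = pd.get_dummies(cust, columns=['性別', '職業'])
    # Produces columns such as 性別_男, 性別_女, 職業_会社員, 職業_主婦,
    # matching the labels referenced elsewhere (e.g. '性別_男', '職業_会社員').
    return feat
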