Example #1
 def __init__(self, in_path, in_char, out_path, out_char):
     self.count_rec = CountRecord()
     self.file_io = FileIO()
     self.in_path = in_path
     self.in_char = in_char
     self.out_path = out_path
     self.out_char = out_char
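These fragments all construct a shared FileIO helper whose definition is not shown. As a rough sketch of what the call sites across these examples imply (open_file_as_pandas, export_csv_from_pandas, open_file, close_file), it might look like the following; the body of each method is an assumption, not the original implementation:

import pandas as pd

class FileIO:
    '''Hypothetical I/O helper; signatures inferred from the call sites.'''

    def open_file_as_pandas(self, path, char_type):
        # Read a CSV into a DataFrame with the given encoding
        return pd.read_csv(path, encoding=char_type)

    def export_csv_from_pandas(self, df, path):
        # Write a DataFrame out as CSV; writing the index too would explain
        # the 'Unnamed: 0' columns the later examples keep dropping
        df.to_csv(path)

    def open_file(self, path, mode, char_type):
        # Thin wrapper over the built-in open()
        return open(path, mode, encoding=char_type)

    def close_file(self, file_obj):
        file_obj.close()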
Example #2
 def __init__(self, in_path, in_char, stay_time_path, out_char, pv_sum_path, session_path):
     self.count_rec = CountRecord()
     self.file_io = FileIO()
     self.in_path = in_path
     self.in_char = in_char
     self.stay_time_path = stay_time_path
     self.out_char = out_char
     self.pv_sum_path = pv_sum_path
     self.session_path = session_path
Example #3
 def __init__(self, in_path, in_char, out_path, out_char, id_path, shop_path, pref_path):
     self.pref_code = FindPrefectureCode()
     self.file_io = FileIO()
     self.in_path = in_path
     self.in_char = in_char
     self.out_path = out_path
     self.out_char = out_char
     self.id_path = id_path
     self.shop_path = shop_path
     self.pref_path = pref_path
Example #4
 def __init__(self):
     self.lr = LinearRegression()
     self.file_io = FileIO()
     #self.pca = PCAProcess()
     #self.chart = DrawChart()
     self.test = Test()
     self.individual = IndividualTest()
     self.sc = StandardScaler()
     self.ms = MinMaxScaler()
     self.drop_na = DropNaN()
Example #5
 def __init__(self, in_path, in_char, payment_path, out_char,
              cust_attr_path, target_attr_path, average_attr_path):
     self.count_rec = CountRecord()
     self.file_io = FileIO()
     self.in_path = in_path
     self.in_char = in_char
     self.payment_path = payment_path
     self.out_char = out_char
     self.cust_attr_path = cust_attr_path
     self.target_attr_path = target_attr_path
     self.average_attr_path = average_attr_path
Example #6
    def __init__(self, id_path, con_path, char_type):

        self.file_io = FileIO()
        self.encode = CategoryEncode()
        self.count_rec = CountRecord()
        self.extract_col = ExtractColumns()
        self.bin = Binning()
        self.ss = Scaler()
        # Open the input files
        self.id = self.file_io.open_file_as_pandas(id_path, char_type)
        self.con = self.file_io.open_file_as_pandas(con_path, char_type)
Example #7
class PreprocClassify:
    def __init__(self, id_path, con_path, char_type):

        self.file_io = FileIO()
        self.encode = CategoryEncode()
        self.count_rec = CountRecord()
        self.extract_col = ExtractColumns()
        self.bin = Binning()
        self.ss = Scaler()
        # Open the input files
        self.id = self.file_io.open_file_as_pandas(id_path, char_type)
        self.con = self.file_io.open_file_as_pandas(con_path, char_type)

    def make_class_data(self, out_path):
        '''Derive the class label from the target variable and merge it into the analysis file'''

        # Drop rows with sales ('売上') <= 0
        #org_df = self.con.drop(self.con[self.con['売上']<=0].index)

        # Extract the target-variable column
        cust_attr_col_list = []  # initialize the list of extracted columns
        cust_attr_tg_list = ['売上']  # add the target column (sales) to the extraction list
        cust_con_col = self.extract_col.extract(self.con,
                                                self.con['顧客ID'],
                                                extract_col=cust_attr_tg_list)

        # Drop the no-longer-needed customer ID column
        cust_con_col = cust_con_col.drop(['顧客ID'], axis=1)

        # Fill missing values with zero
        cust_con_col = cust_con_col.fillna(0)

        # Standardize the extracted target column (mean 0, variance 1)
        std_cust_con_col = self.ss.sl_standard_scaler(cust_con_col,
                                                      data_type='float')

        # Label the standardized target as below or above the mean
        type_bins = [-1, 0, 1]  # bin edges: range (-1, 1), split at 0
        type_bin_label_list = [0, 1]  # below 0: low, above 0: high
        type_col = self.bin.list_divide(std_cust_con_col['売上'], type_bins,
                                        type_bin_label_list)  # generate the class labels
        type_df = pd.DataFrame(
            data=type_col,
            index=std_cust_con_col.index)  # convert the label array (numpy) to a DataFrame
        type_df.columns = ['クラス']  # rename the DataFrame column

        # Merge the class labels into the existing analysis data
        type_df = pd.concat([self.id, type_df], axis=1)  # prepend the ID column to type_df
        con = pd.merge(self.con, type_df, on='顧客ID',
                       how='left')  # left-join type_df onto the existing DataFrame

        # Write out the result
        self.file_io.export_csv_from_pandas(con, out_path)
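Example #7 is the only fragment above that pairs the constructor with its worker method, so a usage sketch is short; the paths and encoding below are placeholders, not values from the original project:

# Hypothetical driver for PreprocClassify.
preproc = PreprocClassify('./data/in/id.csv', './data/in/con.csv', 'utf-8')
preproc.make_class_data('./data/out/classified.csv')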
Example #8
 def __init__(self):
     self.test = Test()
     self.file_io = FileIO()
     self.lr = LinearRegression(normalize=True)
     self.br = BayesianRidge()
     #self.svr_lin = SVR(kernel='linear', C=1e5)
     self.svr_poly = SVR(kernel='poly', C=1e5, degree=2)
     self.svr_rbf = SVR(kernel='rbf', C=5e4, gamma='scale')
     self.svr_sig = SVR(kernel='sigmoid', C=1e3)
     #self.gridsearch = GridSearchCV(SVR(kernel='rbf'), scoring="r2", return_train_score=True)
     self.sc = StandardScaler()
     self.ms = MinMaxScaler()
     self.chart = DrawChart2()
Example #9
class ExtractReserve:
    def __init__(self, in_path, in_char, out_path, out_char, reg_type_path):
        self.count_rec = CountRecord()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.out_path = out_path
        self.out_char = out_char
        self.reg_type_path = reg_type_path

    def extract(self):
        # Open the input file
        file = self.file_io.open_file_as_pandas(self.in_path, self.in_char)
        # Aggregation 1:
        # count records keyed on customer ID, status and nomination category
        status = self.count_rec.group_size(
            file, index_col='顧客ID', aggregate_col=['顧客ID', '状況', '指名区分'])
        # Aggregation 2:
        # extract the registration category per customer ID
        register_type = self.count_rec.drop_duplicates(
            file, index_col='顧客ID', keep_list=['顧客ID', '登録区分'])
        # Write out
        self.file_io.export_csv_from_pandas(status, self.out_path)
        self.file_io.export_csv_from_pandas(register_type, self.reg_type_path)

        # Re-open the file to attach a header
        out_file = self.file_io.open_file_as_pandas(self.out_path,
                                                    self.out_char)
        # Attach the header
        out_file.columns = ['顧客ID', '状況', '指名区分', '予約回数']
        # Write out again with the header
        self.file_io.export_csv_from_pandas(out_file, self.out_path)
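ExtractReserve, like many of these classes, delegates aggregation to a CountRecord helper. A minimal sketch of it, assuming plain pandas groupby semantics (signatures inferred from the call sites here and in the later examples; the real implementation may differ):

import pandas as pd

class CountRecord:
    '''Hypothetical aggregation helper; signatures inferred from call sites.'''

    def count_record(self, df, col):
        # Record count per value of col (e.g. reservations per customer ID)
        return df.groupby(col).size().to_frame()

    def group_size(self, df, index_col, aggregate_col):
        # Record count per unique combination of the aggregate columns;
        # index_col is kept only for signature compatibility
        return df.groupby(aggregate_col).size().to_frame()

    def group_sum(self, df, index_col, aggregate_col):
        # Sum of aggregate_col per index_col
        return df.groupby(index_col)[aggregate_col].sum()

    def drop_duplicates(self, df, index_col, keep_list):
        # First occurrence per index_col, restricted to keep_list columns
        return df[keep_list].drop_duplicates(subset=index_col)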
Example #10
    def __init__(self):
        #self.lr = LinearRegression()
        self.file_io = FileIO()
        #self.pca = PCAProcess()
        #self.chart = DrawChart()
        self.test = Test()
        self.individual = IndividualTest()
        self.sc = StandardScaler()
        self.ms = MinMaxScaler()
        self.drop_na = DropNaN()

        self.droplist = []
        with open('droplist.txt') as f:
            self.droplist = [s.strip() for s in f.readlines()]
Example #11
    def __init__(
        self,
        id_path,
        cust_payment_path,
        cust_attr_path,
        target_attr_path,
        average_attr_path,
        cust_path,
        cancel_path,
        contact_path,
        cti_path,
        register_type_path,
        status_path,
        stay_time_path,
        pv_sum_path,
        session_path,
        shop_path,
        pref_path,
        char_type):

        self.file_io = FileIO()
        self.encode = CategoryEncode()
        self.count_rec = CountRecord()
        self.extract_col = ExtractColumns()
        self.bin = Binning()
        # Open input files
        self.id = self.file_io.open_file_as_pandas(id_path, char_type)
        self.cust_payment = self.file_io.open_file_as_pandas(cust_payment_path, char_type)
        self.cust_attr = self.file_io.open_file_as_pandas(cust_attr_path, char_type)
        self.target_attr = self.file_io.open_file_as_pandas(target_attr_path, char_type)
        self.average_attr = self.file_io.open_file_as_pandas(average_attr_path, char_type)
        self.cust = self.file_io.open_file_as_pandas(cust_path, char_type)
        self.cancel = self.file_io.open_file_as_pandas(cancel_path, char_type)
        self.contact = self.file_io.open_file_as_pandas(contact_path, char_type)
        self.cti = self.file_io.open_file_as_pandas(cti_path, char_type)
        self.register_type = self.file_io.open_file_as_pandas(register_type_path, char_type)
        self.status = self.file_io.open_file_as_pandas(status_path, char_type)
        self.stay_time = self.file_io.open_file_as_pandas(stay_time_path, char_type)
        self.pv_sum = self.file_io.open_file_as_pandas(pv_sum_path, char_type)
        self.session = self.file_io.open_file_as_pandas(session_path, char_type)
        self.shop = self.file_io.open_file_as_pandas(shop_path, char_type)
        self.pref = self.file_io.open_file_as_pandas(pref_path, char_type)
Example #12
class ExtractCancel:

    def __init__(self, in_path, in_char, out_path, out_char):
        self.count_rec = CountRecord()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.out_path = out_path
        self.out_char = out_char

    def extract(self):
        # Open the input file
        file = self.file_io.open_file_as_pandas(self.in_path, self.in_char)
        # Count cancellations per customer ID
        vc = self.count_rec.count_record(file, '顧客ID')
        # Write out
        self.file_io.export_csv_from_pandas(vc, self.out_path)

        # Re-open the file to attach a header
        out_file = self.file_io.open_file_as_pandas(self.out_path, self.out_char)
        # Attach the header
        out_file.columns = ['顧客ID', 'キャンセル回数']
        # Write out again with the header
        self.file_io.export_csv_from_pandas(out_file, self.out_path)
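A usage sketch for ExtractCancel; the file paths and encodings below are placeholders:

# Hypothetical driver: count cancellations per customer and attach a header.
cancel = ExtractCancel('./data/in/cancel.csv', 'shift_jis',
                       './data/out/cancel_count.csv', 'utf-8')
cancel.extract()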
Example #13
class ConcatCsvs:
    def __init__(self, id_path, cust_payment_path, cust_attr_path,
                 target_attr_path, average_attr_path, cust_path, cancel_path,
                 contact_path, cti_path, register_type_path, status_path,
                 stay_time_path, pv_sum_path, session_path, shop_path,
                 pref_path, char_type):

        self.file_io = FileIO()
        self.encode = CategoryEncode()
        self.count_rec = CountRecord()
        self.extract_col = ExtractColumns()
        self.bin = Binning()
        # Open input files
        self.id = self.file_io.open_file_as_pandas(id_path, char_type)
        self.cust_payment = self.file_io.open_file_as_pandas(
            cust_payment_path, char_type)
        self.cust_attr = self.file_io.open_file_as_pandas(
            cust_attr_path, char_type)
        self.target_attr = self.file_io.open_file_as_pandas(
            target_attr_path, char_type)
        self.average_attr = self.file_io.open_file_as_pandas(
            average_attr_path, char_type)
        self.cust = self.file_io.open_file_as_pandas(cust_path, char_type)
        self.cancel = self.file_io.open_file_as_pandas(cancel_path, char_type)
        self.contact = self.file_io.open_file_as_pandas(
            contact_path, char_type)
        self.cti = self.file_io.open_file_as_pandas(cti_path, char_type)
        self.register_type = self.file_io.open_file_as_pandas(
            register_type_path, char_type)
        self.status = self.file_io.open_file_as_pandas(status_path, char_type)
        self.stay_time = self.file_io.open_file_as_pandas(
            stay_time_path, char_type)
        self.pv_sum = self.file_io.open_file_as_pandas(pv_sum_path, char_type)
        self.session = self.file_io.open_file_as_pandas(
            session_path, char_type)
        self.shop = self.file_io.open_file_as_pandas(shop_path, char_type)
        self.pref = self.file_io.open_file_as_pandas(pref_path, char_type)

    def concat(self, out_path, out_path2):
        # Feature extraction

        # cust_payment
        # no categorical data
        # --- check ---
        #print("--- cust_payment shape ---\n {}\n".format(self.cust_payment.shape))
        #print(self.cust_payment.head())

        # cust_attr
        cust_attr_col_list = []
        cust_attr_tg_list = [
            '指名回数', 'コース受諾回数', '紹介カード受渡回数', '治療送客回数', '院長挨拶回数'
        ]
        # Extract categorical columns
        cust_attr_category_col = self.extract_col.extract(
            self.cust_attr,
            self.cust_attr['顧客ID'],
            extract_col=cust_attr_tg_list)
        # Extract non-categorical columns
        cust_attr_non_category_col = self.extract_col.exclude(
            self.cust_attr, exclude_col=cust_attr_tg_list)
        # Encode features
        org_cust_attr = self.encode.transform_feature(
            cust_attr_category_col, aggregate_col=cust_attr_tg_list)
        org_cust_attr = org_cust_attr.fillna(0)
        #org_cust_attr = org_cust_attr.drop('Unnamed: 0', axis=1)
        # Build the encoded label list
        for col in cust_attr_tg_list:
            cust_attr_col_list += self.encode.transform_label(
                self.cust_attr[col], col)
        cust_attr_col_list += ['顧客ID']
        # Assign labels
        org_cust_attr.columns = cust_attr_col_list
        # Aggregate per customer
        feat_cust_attr = self.count_rec.group_sum(
            org_cust_attr, index_col='顧客ID', aggregate_col=cust_attr_col_list)
        # Merge categorical and non-categorical columns
        feat_cust_attr = pd.merge(feat_cust_attr,
                                  cust_attr_non_category_col,
                                  on='顧客ID',
                                  how='left')
        feat_cust_attr = feat_cust_attr.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_cust_attr shape ---\n {}\n".format(feat_cust_attr.shape))
        #print(feat_cust_attr.head())
        #self.file_io.export_csv_from_pandas(feat_cust_attr, './data/out/mid_feat_cust_attr.csv')

        # product_attr
        '''
        product_attr_col_list = []
        product_attr_tg_list = ['商品コード']
        # Extract categorical columns
        product_attr_category_col = self.extract_col.extract(self.target_attr, self.target_attr['明細ID'], extract_col=product_attr_tg_list)
        # Extract non-categorical columns by removing the categorical ones from the original dataset
        product_attr_non_category_col = self.extract_col.exclude(self.target_attr, exclude_col=product_attr_tg_list)
        # Encode features
        org_product_attr = self.encode.transform_feature(product_attr_category_col, aggregate_col=product_attr_tg_list)
        org_product_attr = org_product_attr.fillna(0)
        #org_product_attr = org_product_attr.drop('Unnamed: 0', axis=1)
        #print(org_product_attr)
        # Build the encoded label list
        for col in product_attr_tg_list:
            product_attr_col_list += self.encode.transform_label(self.target_attr[col], col)
        product_attr_col_list += ['明細ID']
        # Assign labels
        org_product_attr.columns = product_attr_col_list
        # Merge categorical and non-categorical columns
        feat_product_attr = pd.merge(org_product_attr, product_attr_non_category_col, on='明細ID', how='left')
        feat_product_attr = feat_product_attr.drop('Unnamed: 0', axis=1)
        '''
        # product_attr
        feat_product_attr = self.average_attr
        # --- check ---
        #print("--- feat_product_attr shape ---\n {}\n".format(feat_cust_attr.shape))
        #print(feat_product_attr.head())
        #self.file_io.export_csv_from_pandas(feat_product_attr, './data/out/mid_feat_product_attr.csv')

        # cust
        cust_col_list = []
        cust_tg_list = ['性別', '携帯TEL', '自宅TEL', '携帯メール', 'PCメール', '職業']
        # Drop outlier rows whose birthdate contains '*'
        new_cust = self.cust.drop(self.cust[self.cust['生年月日'].str.contains(
            r'\*', na=True)].index)
        # Convert birthdate to an approximate age in years via YYYYMMDD arithmetic
        today = int(pd.to_datetime('today').strftime('%Y%m%d'))
        new_cust['生年月日'] = pd.to_datetime(
            new_cust['生年月日']).dt.strftime('%Y%m%d').astype(np.int64)
        new_cust['生年月日'] = ((today - new_cust['生年月日']) / 10000).astype(
            np.int64)
        # Bin ages into decades (10s through 50s)
        new_cust['生年月日'] = self.bin.list_divide(new_cust['生年月日'],
                                                [0, 10, 20, 30, 40, 50],
                                                ['10', '20', '30', '40', '50'])
        # Extract categorical columns
        cust_category_col = self.extract_col.extract(new_cust,
                                                     new_cust['顧客ID'],
                                                     extract_col=cust_tg_list)
        # Extract non-categorical columns
        cust_non_category_col = self.extract_col.exclude(
            new_cust, exclude_col=cust_tg_list)
        # Encode features
        feat_cust = self.encode.transform_feature(cust_category_col,
                                                  aggregate_col=cust_tg_list)
        feat_cust = feat_cust.fillna(0)
        #feat_cust = feat_cust.drop('Unnamed: 0', axis=1)
        feat_cust = feat_cust[feat_cust.columns.drop(
            list(feat_cust.filter(regex='Unnamed:')))]
        # Build the encoded label list
        for col in cust_tg_list:
            cust_col_list += self.encode.transform_label(new_cust[col], col)
        cust_col_list += ['顧客ID']
        # Assign labels
        feat_cust.columns = cust_col_list
        # Merge categorical and non-categorical columns
        feat_cust = pd.merge(feat_cust,
                             cust_non_category_col,
                             on='顧客ID',
                             how='left')
        #feat_cust = feat_cust.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_cust shape ---\n {}\n".format(feat_cust.shape))
        #print(feat_cust.head())
        #self.file_io.export_csv_from_pandas(feat_cust, './data/out/mid_feat_cust.csv')

        # shop
        shop_col_list = []
        shop_tg_list = ['担当店舗']
        # Extract categorical columns
        shop_category_col = self.extract_col.extract(self.shop,
                                                     self.shop['顧客ID'],
                                                     extract_col=shop_tg_list)
        # Encode features
        feat_shop = self.encode.transform_feature(shop_category_col,
                                                  aggregate_col=shop_tg_list)
        feat_shop = feat_shop.fillna(0)
        #feat_shop = feat_cust.drop('Unnamed: 0', axis=1)
        feat_shop = feat_shop[feat_shop.columns.drop(
            list(feat_shop.filter(regex='Unnamed:')))]
        # Build the encoded label list
        for col in shop_tg_list:
            shop_col_list += self.encode.transform_label(self.shop[col], col)
        shop_col_list += ['顧客ID']
        # Assign labels
        feat_shop.columns = shop_col_list
        #feat_shop = feat_shop.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_shop shape ---\n {}\n".format(feat_shop.shape))
        #print(feat_shop.head())
        #self.file_io.export_csv_from_pandas(feat_shop, './data/out/mid_feat_shop.csv')

        # pref
        pref_col_list = []
        pref_tg_list = ['町域']
        # Drop rows whose town area ('町域') is 0
        new_pref = self.pref.drop(self.pref[self.pref['町域'] == 0].index)
        # Extract categorical columns
        pref_category_col = self.extract_col.extract(new_pref,
                                                     new_pref['顧客ID'],
                                                     extract_col=pref_tg_list)
        # Encode features
        feat_pref = self.encode.transform_feature(pref_category_col,
                                                  aggregate_col=pref_tg_list)
        feat_pref = feat_pref.fillna(0)
        #feat_pref = feat_cust.drop('Unnamed: 0', axis=1)
        feat_pref = feat_pref[feat_pref.columns.drop(
            list(feat_pref.filter(regex='Unnamed:')))]
        # Build the encoded label list
        for col in pref_tg_list:
            pref_col_list += self.encode.transform_label(self.pref[col], col)
        pref_col_list += ['顧客ID']
        # Assign labels
        feat_pref.columns = pref_col_list
        #feat_pref = feat_pref.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_pref shape ---\n {}\n".format(feat_pref.shape))
        #print(feat_pref.head())
        #self.file_io.export_csv_from_pandas(feat_pref, './data/out/mid_feat_pref.csv')

        # cancel
        # no categorical data
        # --- check ---
        #print("--- cancel shape ---\n {}\n".format(cancel.shape))
        #print(cancel.head())

        # contact
        # no categorical data
        # --- check ---
        #print("--- contact shape ---\n {}\n".format(contact.shape))
        #print(contact.head())

        # cti
        # no categorical data
        # --- check ---
        #print("--- cti shape ---\n {}\n".format(cti.shape))
        #print(cti.head())

        # stay_time
        new_stay_time = self.stay_time
        new_stay_time['滞在時間'] = self.bin.quant_divide(
            new_stay_time['滞在時間'], 6, ['1', '2', '3', '4', '5'])
        bin_stay_time = new_stay_time.drop('Unnamed: 0', axis=1)

        # pv_sum
        new_pv_sum = self.pv_sum
        new_pv_sum['閲覧ページ総数'] = self.bin.quant_divide(
            new_pv_sum['閲覧ページ総数'], 11,
            ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
        bin_pv_sum = new_pv_sum.drop('Unnamed: 0', axis=1)

        # session
        new_session = self.session
        new_session['閲覧ページ数/セッション'] = self.bin.quant_divide(
            new_session['閲覧ページ数/セッション'], 6, ['1', '2', '3', '4', '5'])
        bin_session = new_session.drop('Unnamed: 0', axis=1)

        # register_type
        reg_col_list = []
        reg_tg_list = ['登録区分']
        # Extract categorical columns
        reg_category_col = self.extract_col.extract(self.register_type,
                                                    self.register_type['顧客ID'],
                                                    extract_col=reg_tg_list)
        # Extract non-categorical columns
        reg_non_category_col = self.extract_col.exclude(
            self.register_type, exclude_col=reg_tg_list)
        # Encode features
        feat_register_type = self.encode.transform_feature(
            reg_category_col, aggregate_col=reg_tg_list)
        feat_register_type = feat_register_type.fillna(0)
        #feat_register_type = feat_register_type.drop('Unnamed: 0', axis=1)
        # Build the encoded label list
        for col in reg_tg_list:
            reg_col_list += self.encode.transform_label(
                self.register_type[col], col)
        reg_col_list += ['顧客ID']
        # Assign labels
        feat_register_type.columns = reg_col_list
        # Merge categorical and non-categorical columns
        feat_register_type = pd.merge(feat_register_type,
                                      reg_non_category_col,
                                      on='顧客ID',
                                      how='left')
        feat_register_type = feat_register_type.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_register_type shape ---\n {}\n".format(feat_register_type.shape))
        #print(feat_register_type.head())
        #self.file_io.export_csv_from_pandas(feat_register_type, './data/out/mid_feat_register_type.csv')

        # status
        stat_col_list = []
        stat_tg_list = ['状況', '指名区分']
        # Extract categorical columns
        stat_category_col = self.extract_col.extract(self.status,
                                                     self.status['顧客ID'],
                                                     extract_col=stat_tg_list)
        # Extract non-categorical columns
        stat_non_category_col = self.extract_col.exclude(
            self.status, exclude_col=stat_tg_list)
        # Encode features
        feat_status = self.encode.transform_feature(stat_category_col,
                                                    aggregate_col=stat_tg_list)
        feat_status = feat_status.fillna(0)
        #feat_status = feat_status.drop('Unnamed: 0', axis=1)
        # Build the encoded label list
        for col in stat_tg_list:
            stat_col_list += self.encode.transform_label(self.status[col], col)
        stat_col_list += ['顧客ID']
        # Assign labels
        feat_status.columns = stat_col_list
        # Merge categorical and non-categorical columns
        feat_status = pd.merge(feat_status,
                               stat_non_category_col,
                               on='顧客ID',
                               how='left')
        feat_status = feat_status.drop('Unnamed: 0', axis=1)
        # --- check ---
        #print("--- feat_status shape ---\n {}\n".format(feat_status.shape))
        #print(feat_status.head())
        #self.file_io.export_csv_from_pandas(feat_status, './data/out/mid_feat_status.csv')

        # Merge the feature tables into one analysis file
        con_file = pd.merge(feat_product_attr,
                            self.cust_payment,
                            on='顧客ID',
                            how='left')
        #print("1.1: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, self.cancel, on='顧客ID', how='left')
        #print("1.2: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, self.contact, on='顧客ID', how='left')
        #print("1.3: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, self.cti, on='顧客ID', how='left')
        #print("1.4: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, bin_stay_time, on='顧客ID', how='left')
        #print("1.5: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, bin_pv_sum, on='顧客ID', how='left')
        #print("1.6: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, bin_session, on='顧客ID', how='left')
        #print("1.7: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, feat_cust_attr, on='顧客ID', how='left')
        #print("1.8: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file, feat_cust, on='顧客ID', how='left')
        #print("1.9: shape is {}".format(con_file.shape))
        con_file = pd.merge(con_file,
                            feat_register_type,
                            on='顧客ID',
                            how='left')
        #print("1.10: shape is {}".format(con_file.shape))
        #con_file = pd.merge(con_file, feat_status, on='顧客ID',how='left')
        #print("1.11: shape is {}".format(con_file.shape))
        '''con_file = pd.concat([
            self.cust_payment,
            feat_cust_attr,
            feat_cust,
            self.cancel,
            self.contact,
            self.cti,
            feat_register_type,
            feat_status,
            self.stay_time,
            self.pv_sum,
            self.session], axis=1, join_axes=['顧客ID'])'''
        # --- check ---
        #print("--- con_file shape ---\n {}\n".format(con_file.shape))
        #print(con_file.head())

        # Merge for the per-product file
        con_product_file = pd.merge(self.id,
                                    self.cust_payment,
                                    on='顧客ID',
                                    how='left')
        con_product_file = pd.merge(con_product_file,
                                    feat_product_attr,
                                    on='顧客ID',
                                    how='left')
        #print("2.1: shape is {}".format(con_file.shape))

        # Drop duplicate rows if any
        con_file = con_file.drop_duplicates()
        con_product_file = con_product_file.drop_duplicates()
        con_product_file = con_product_file.drop(['施術時間', '売上単価', '数量'],
                                                 axis=1)

        # Write out
        self.file_io.export_csv_from_pandas(con_file, out_path)
        self.file_io.export_csv_from_pandas(con_product_file, out_path2)
        self.file_io.export_csv_from_pandas(feat_shop,
                                            './data/out/feat_shop.csv')
        self.file_io.export_csv_from_pandas(feat_pref,
                                            './data/out/feat_pref.csv')
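ConcatCsvs leans on a Binning helper with two entry points: list_divide (fixed edges, used above to bin ages into decades) and quant_divide (quantile bins, used for stay time, page views and sessions). The call sites pass n+1 edges or quantile points for n labels, which matches pd.cut and pd.qcut; a minimal sketch under that assumption:

import numpy as np
import pandas as pd

class Binning:
    '''Hypothetical binning helper; signatures inferred from call sites.'''

    def list_divide(self, series, bins, labels):
        # Fixed-edge binning; len(bins) == len(labels) + 1.
        # Values outside the outermost edges come back as NaN.
        return pd.cut(series, bins=bins, labels=labels)

    def quant_divide(self, series, num_edges, labels):
        # Quantile binning; num_edges quantile points -> num_edges - 1 bins.
        # Assumes the computed quantile edges are unique.
        quantiles = np.linspace(0, 1, num_edges)
        return pd.qcut(series, q=quantiles, labels=labels)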
Example #14
class ExtractLog:

    def __init__(self, in_path, in_char, stay_time_path, out_char, pv_sum_path, session_path):
        self.count_rec = CountRecord()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.stay_time_path = stay_time_path
        self.out_char = out_char
        self.pv_sum_path = pv_sum_path
        self.session_path = session_path

    def extract(self):
        # Open the input file
        file = self.file_io.open_file_as_pandas(self.in_path, self.in_char)

        # Drop unneeded columns
        file = file.drop(['IPアドレス', 'メソッド', 'パス', 'HTTPバージョン', 'ファイル名', 'レスポンスバイト数', 'リファラ', 'ユーザーエージェント', 'レスポンスタイム'], axis=1)
        # Parse the timestamp column as datetime
        file['アクセス日時_unix'] = pd.to_datetime(file['アクセス日時'])
        # Gap to the next access in seconds
        # (.dt.seconds keeps only the seconds component of the timedelta)
        file['アクセス間隔'] = (file['アクセス日時_unix'].shift(-1) - file['アクセス日時_unix']).dt.seconds
        # Flag rows whose next row belongs to the same customer
        file['顧客ID同一当否'] = (file['顧客ID'].shift(-1) == file['顧客ID'])
        # Zero the gap where the customer ID changes
        file.loc[~file['顧客ID同一当否'], 'アクセス間隔'] = 0
        # Flag accesses that continue the same session
        file.loc[file['顧客ID同一当否'], 'セッションフラグ'] = 1

        # Total stay time per customer
        stay_time = self.count_rec.group_sum(file, index_col='顧客ID', aggregate_col='アクセス間隔')
        # Total pages viewed (record count per customer)
        pv_sum = self.count_rec.count_record(file, '顧客ID')
        # Session count per customer
        same_session = self.count_rec.group_sum(file, index_col='顧客ID', aggregate_col='セッションフラグ')

        # Write out
        #self.file_io.export_csv_from_pandas(file, './data/out/log.csv')
        self.file_io.export_csv_from_pandas(stay_time, self.stay_time_path)
        self.file_io.export_csv_from_pandas(pv_sum, self.pv_sum_path)
        self.file_io.export_csv_from_pandas(same_session, self.session_path)

        # Re-open the files to attach headers
        out_file1 = self.file_io.open_file_as_pandas(self.stay_time_path, self.out_char)
        out_file2 = self.file_io.open_file_as_pandas(self.pv_sum_path, self.out_char)
        out_file3 = self.file_io.open_file_as_pandas(self.session_path, self.out_char)
        # Attach headers
        out_file1.columns = ['顧客ID', '滞在時間']
        out_file2.columns = ['顧客ID', '閲覧ページ総数']
        out_file3.columns = ['顧客ID', '閲覧ページ数/セッション']
        # Write out again with headers
        self.file_io.export_csv_from_pandas(out_file1, self.stay_time_path)
        self.file_io.export_csv_from_pandas(out_file2, self.pv_sum_path)
        self.file_io.export_csv_from_pandas(out_file3, self.session_path)
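The shift(-1) arithmetic above only measures within-customer gaps if the log is already sorted by customer ID and access time. If the upstream export does not guarantee that ordering, a pre-sort along these lines (a sketch, using the column names from extract) would be needed before computing アクセス間隔:

# Hypothetical pre-sort so shift(-1) compares consecutive rows of the
# same customer in time order.
file = file.sort_values(['顧客ID', 'アクセス日時_unix'])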
Example #15
class IndividualTest:
    def __init__(self):
        self.test = Test()
        self.file_io = FileIO()
        self.lr = LinearRegression(normalize=True)
        self.br = BayesianRidge()
        #self.svr_lin = SVR(kernel='linear', C=1e5)
        self.svr_poly = SVR(kernel='poly', C=1e5, degree=2)
        self.svr_rbf = SVR(kernel='rbf', C=5e4, gamma='scale')
        self.svr_sig = SVR(kernel='sigmoid', C=1e3)
        #self.gridsearch = GridSearchCV(SVR(kernel='rbf'), scoring="r2", return_train_score=True)
        self.sc = StandardScaler()
        self.ms = MinMaxScaler()
        self.chart = DrawChart2()

    def lin_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty results DataFrame
        df = pd.DataFrame(
            index=['coefficient', 'intercept', 'train_score', 'test_score'],
            columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (e.g. 30% test)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Simple regression on this column
            self.lr.fit(s_X_train, s_Y_train)

            # Regression coefficient
            coef = self.lr.coef_

            # Intercept
            intercept = self.lr.intercept_

            # Training score
            train_score = self.lr.score(s_X_train, s_Y_train)

            # Test score
            test_score = self.lr.score(s_X_test, s_Y_test)

            # Append to the results DataFrame
            df[col] = [coef, intercept, train_score, test_score]

            # Regression curve
            lin_pred = self.lr.predict(s_X_test)

            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'go-')
            plt.show()

            #if col in ['売上単価','コース受諾回数_なし','数量','施術時間','指名回数_あり','治療送客回数_あり','治療送客回数_なし']:
            # Draw chart
            #self.chart.draw(self.lr, s_X_test, s_Y_test, col, 'score is {}'.format(test_score))

        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)

    def bayesian_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty results DataFrame
        df = pd.DataFrame(
            index=['coefficient', 'intercept', 'train_score', 'test_score'],
            columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (e.g. 30% test)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Simple regression on this column
            self.br.fit(s_X_train, s_Y_train)

            # Regression coefficient
            coef = self.br.coef_

            # Intercept
            intercept = self.br.intercept_

            # Training score
            train_score = self.br.score(s_X_train, s_Y_train)

            # Test score
            test_score = self.br.score(s_X_test, s_Y_test)

            # Append to the results DataFrame
            df[col] = [coef, intercept, train_score, test_score]

            if col in [
                    '売上単価', 'コース受諾回数_なし', '数量', '施術時間', '指名回数_あり', '治療送客回数_あり',
                    '治療送客回数_なし'
            ]:
                # Draw chart
                self.chart.draw(self.br, s_X_test, s_Y_test, col,
                                'score is {}'.format(test_score))

        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_rbf_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty results DataFrame
        df = pd.DataFrame(index=[
            'coefficient', 'support_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (e.g. 30% test)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Regression on this column
            #self.svr_lin.fit(s_X_train, s_Y_train)
            #self.svr_poly.fit(s_X_train, s_Y_train)
            self.svr_rbf.fit(s_X_train, s_Y_train)
            #self.gridsearch.fit(s_X_train, s_Y_train)

            # Dual coefficients
            coef = self.svr_rbf.dual_coef_

            # Support vectors
            support_vec = self.svr_rbf.support_vectors_

            # Intercept
            intercept = self.svr_rbf.intercept_

            # R^2 scores
            train_score = self.svr_rbf.score(s_X_train, s_Y_train)
            test_score = self.svr_rbf.score(s_X_test, s_Y_test)

            # Append to the results DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]

            #lin_pred = self.svr_lin.predict(s_X_test)
            #poly_pred = self.svr_poly.predict(s_X_test)
            rbf_pred = self.svr_rbf.predict(s_X_test)

            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
            plt.show()

            if col in ['生年月日']:
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'ro-')
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, poly_pred, 'yo-')
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
                plt.show()

        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_poly_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty results DataFrame
        df = pd.DataFrame(index=[
            'coefficient', 'support_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (e.g. 30% test)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Regression on this column
            self.svr_poly.fit(s_X_train, s_Y_train)

            # Dual coefficients
            coef = self.svr_poly.dual_coef_

            # Support vectors
            support_vec = self.svr_poly.support_vectors_

            # Intercept
            intercept = self.svr_poly.intercept_

            # R^2 scores
            train_score = self.svr_poly.score(s_X_train, s_Y_train)
            test_score = self.svr_poly.score(s_X_test, s_Y_test)

            # Append to the results DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]

            #lin_pred = self.svr_lin.predict(s_X_test)
            poly_pred = self.svr_poly.predict(s_X_test)

            if col in ['生年月日']:
                #plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'ro-')
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, poly_pred, 'yo-')
                plt.show()

        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_sig_reg(self, X, Y, train_test_ratio, col_list, out_path):

        # Create an empty results DataFrame
        df = pd.DataFrame(index=[
            'coefficient', 'support_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        #print(df.head())

        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y

            # Split into training and test data (e.g. 30% test)
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)

            # Regression on this column
            self.svr_sig.fit(s_X_train, s_Y_train)

            # Dual coefficients
            coef = self.svr_sig.dual_coef_

            # Support vectors
            support_vec = self.svr_sig.support_vectors_

            # Intercept
            intercept = self.svr_sig.intercept_

            # R^2 scores
            train_score = self.svr_sig.score(s_X_train, s_Y_train)
            test_score = self.svr_sig.score(s_X_test, s_Y_test)

            # Append to the results DataFrame
            df[col] = [coef, support_vec, intercept, train_score, test_score]

            sig_pred = self.svr_sig.predict(s_X_test)

            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, sig_pred, 'go-')
            plt.show()

            if col in ['生年月日', '閲覧ページ総数', '閲覧ページ数/セッション', '滞在時間']:
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, sig_pred, 'go-')
                plt.show()

        # Write out to CSV
        self.file_io.export_csv_from_pandas(df, out_path)
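A usage sketch for IndividualTest with synthetic data; the column names and paths are placeholders, and the class still expects the Test, FileIO and DrawChart2 helpers from this codebase to be importable:

import numpy as np
import pandas as pd

# Hypothetical driver: per-column simple regressions with a 30% test split.
rng = np.random.default_rng(0)
X = pd.DataFrame({'x1': rng.normal(size=100), 'x2': rng.normal(size=100)})
Y = pd.Series(2.0 * X['x1'] + rng.normal(scale=0.1, size=100))

ind = IndividualTest()
ind.lin_reg(X, Y, 0.3, X.columns, './data/out/lin_reg_ind.csv')
ind.bayesian_reg(X, Y, 0.3, X.columns, './data/out/bayes_reg_ind.csv')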
Example #16
class ExtractSalesSp:
    def __init__(self, in_path, in_char, payment_path, out_char,
                 cust_attr_path, target_attr_path, average_attr_path):
        self.count_rec = CountRecord()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.payment_path = payment_path
        self.out_char = out_char
        self.cust_attr_path = cust_attr_path
        self.target_attr_path = target_attr_path
        self.average_attr_path = average_attr_path

    def extract(self):
        # Open the input file
        file = self.file_io.open_file_as_pandas(self.in_path, self.in_char)

        # Customer-attribute preprocessing: keep only the sales rows
        # (明細コード == 1) so per-item rows fold into their sale
        sales_file = file.query('明細コード == 1')

        # Aggregate payment information per customer ID
        cust_payment = self.count_rec.group_sum(sales_file,
                                                index_col='顧客ID',
                                                aggregate_col=['顧客ID', '施術時間'])

        # Collect attribute information per customer ID
        ex_id = sales_file['顧客ID']
        ex_nominate = sales_file['指名回数']
        ex_course = sales_file['コース受諾回数']
        ex_card = sales_file['紹介カード受渡回数']
        ex_reception = sales_file['治療送客回数']
        ex_director = sales_file['院長挨拶回数']
        # Additional customer attributes
        #ex_branch = sales_file['店舗']
        #ex_accosiate = sales_file['担当者']

        # Merge
        cust_attr = pd.concat([ex_id, ex_nominate, ex_course, ex_card,
                               ex_reception, ex_director, cust_payment], axis=1)
        #cust_attr = self.cont_rec.group_size(sales_file, index_col='顧客ID', keep_list=['顧客ID','指名回数','コース受託回数','紹介カード受渡回数','治療送客回数','院長挨拶回数'])

        # Aggregation 2.2: collect per-item product attributes per customer ID
        ex_id_product = file['顧客ID']
        ex_product_code = file['商品コード']
        ex_price_product = file['売上単価']
        ex_amount_product = file['数量']
        # Merge
        product_attr = pd.concat([ex_id_product, ex_product_code,
                                  ex_price_product, ex_amount_product], axis=1)
        # Add a sales column (unit price * quantity)
        product_attr['売上'] = file['売上単価'] * file['数量']
        # Add a line-item ID column
        product_attr['明細ID'] = file['伝票コード'] * 10 + file['明細コード']
        # Initialize the score column
        product_attr['スコア'] = 0
        # Assign scores by product code
        product_attr.loc[product_attr['商品コード'] == '1A1501', 'スコア'] = 5
        product_attr.loc[product_attr['商品コード'] == '1B2201', 'スコア'] = 4
        product_attr.loc[product_attr['商品コード'] == '1A1601', 'スコア'] = 3
        product_attr.loc[product_attr['商品コード'] == '200071', 'スコア'] = 2
        product_attr.loc[product_attr['商品コード'] == '200006', 'スコア'] = 1
        product_attr['スコア'] = product_attr['スコア'] * product_attr['数量']
        # Drop unneeded rows
        #product_attr = product_attr[(product_attr['商品コード']=='1A1501')|(product_attr['商品コード']=='1B2201')|(product_attr['商品コード']=='1A1601')|(product_attr['商品コード']=='200071')|(product_attr['商品コード']=='200006')]

        # Write out
        self.file_io.export_csv_from_pandas(cust_payment, self.payment_path)
        self.file_io.export_csv_from_pandas(cust_attr, self.cust_attr_path)
        #self.file_io.export_csv_from_pandas(target_attr, self.target_attr_path)
        self.file_io.export_csv_from_pandas(product_attr,
                                            self.average_attr_path)
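A usage sketch for ExtractSalesSp; every path and encoding below is a placeholder:

# Hypothetical driver: split the raw sales file into payment,
# customer-attribute and product-attribute CSVs.
sales = ExtractSalesSp('./data/in/sales.csv', 'shift_jis',
                       './data/out/payment.csv', 'utf-8',
                       './data/out/cust_attr.csv',
                       './data/out/target_attr.csv',
                       './data/out/average_attr.csv')
sales.extract()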
Example #17
class ExtractCust:
    def __init__(self, in_path, in_char, out_path, out_char, id_path, shop_path, pref_path):
        self.pref_code = FindPrefectureCode()
        self.file_io = FileIO()
        self.in_path = in_path
        self.in_char = in_char
        self.out_path = out_path
        self.out_char = out_char
        self.id_path = id_path
        self.shop_path = shop_path
        self.pref_path = pref_path

    def extract(self):
        # Open the source file to be transformed
        file = self.file_io.open_file(self.in_path, "r", self.in_char)
        # Open the output files
        out_file = self.file_io.open_file(self.out_path, "w", self.out_char)
        id_file = self.file_io.open_file(self.id_path, "w", self.out_char)
        shop_file = self.file_io.open_file(self.shop_path, "w", self.out_char)
        pref_file = self.file_io.open_file(self.pref_path, "w", self.out_char)

        # Write the output file's header
        #out_file.write("顧客ID,担当店舗,生年月日,性別,携帯TEL,自宅TEL,携帯メール,PCメール,町域,職業\n")
        out_file.write("顧客ID,生年月日,性別,携帯TEL,自宅TEL,携帯メール,PCメール,職業\n")
        # Write the ID file's header
        id_file.write("顧客ID\n")
        # Write the shop file's header
        shop_file.write("顧客ID,担当店舗\n")
        # Write the pref file's header
        pref_file.write("顧客ID,町域\n")

        # Skip the source file's header with readline
        file.readline()
        # Read all record lines with readlines
        lines = file.readlines()

        # Process one record at a time
        for line in lines:
            # Strip the newline
            line = line.replace("\n", "")
            # Split on commas into a list
            line = line.split(",")
            # Build the output CSV row and fill in the converted values
            row = "{},{},{},{},{},{},{},{}\n".format(
                line[3], #id
                #line[10], #shop
                line[15].replace("-",""), #birth
                line[16], #sex
                line[17], #mobile-num
                line[18], #tel-num
                line[19], #mobile-mail
                line[20], #pc-mail
                #self.pref_code.find_prefecture(line[22]), #address
                line[30], #job
                )
            id_row ="{}\n".format(
                line[3], #id
                )
            shop_row ="{},{}\n".format(
                line[3], #id
                line[10], #shop
            )
            pref_row ="{},{}\n".format(
                line[3], #id
                self.pref_code.find_prefecture(line[22]), #address
            )
            # Write to the output files
            out_file.write(row)
            id_file.write(id_row)
            shop_file.write(shop_row)
            pref_file.write(pref_row)

        # Close all files
        self.file_io.close_file(file)
        self.file_io.close_file(out_file)
        self.file_io.close_file(id_file)
        self.file_io.close_file(shop_file)
        self.file_io.close_file(pref_file)
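ExtractCust splits each record with a bare str.split(","), which breaks on any field that contains a quoted comma. If the source data can contain such fields, Python's csv module is the safer parse; a sketch of the read side only, with in_path and in_char standing in for the instance attributes:

import csv

# Hypothetical variant of the read loop using csv.reader, which honors
# quoted fields; the indices match those used in extract() above.
with open(in_path, newline='', encoding=in_char) as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for line in reader:
        cust_id = line[3]
        birth = line[15].replace('-', '')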
Example #18
class LinRegressionInd:

    def __init__(self):
        #self.lr = LinearRegression()
        self.file_io = FileIO()
        #self.pca = PCAProcess()
        #self.chart = DrawChart()
        self.test = Test()
        self.individual = IndividualTest()
        self.sc = StandardScaler()
        self.ms = MinMaxScaler()
        self.drop_na = DropNaN()

        self.droplist = []
        with open('droplist.txt') as f:
            self.droplist = [s.strip() for s in f.readlines()]

    def regression(self, in_path, out_path):
        # Open the input file
        org_df = self.file_io.open_file_as_pandas(in_path, "utf-8")

        '''
        # Target variable
        org_df['支払合計'] = org_df['現金外支払合計'] + org_df['現金支払合計']
        # Drop unneeded explanatory variables
        org_df = org_df.drop(['現金外支払合計', '現金支払合計'],axis=1)
        # Drop sales-related explanatory variables
        org_df = org_df.drop(self.droplist,axis=1)
        # Set a lower bound on the target
        org_df = org_df.drop(org_df[org_df['支払合計']<=0].index)
        # Set an upper bound on the target
        org_df = org_df.drop(org_df[org_df['支払合計']>=40000].index)
        '''
        # Set a lower age bound
        org_df = org_df.drop(org_df[org_df['生年月日']<=20].index)
        # Set an upper age bound
        org_df = org_df.drop(org_df[org_df['生年月日']>=50].index)
        # Set a lower bound on total page views
        org_df = org_df.drop(org_df[org_df['閲覧ページ総数']<=0].index)
        # Set an upper bound on total page views
        org_df = org_df.drop(org_df[org_df['閲覧ページ総数']>=100].index)
        # Drop rows with score == 0
        #org_df = org_df.drop(org_df[org_df['スコア']<=0].index)
        '''
        # Drop columns with too many missing values
        #org_df = org_df.drop(['売上単価'],axis=1)
        # Drop rows where the target is missing
        org_df = org_df.dropna(subset=['支払合計'])
        '''
        # Drop unneeded columns
        #org_df = org_df.drop(['Unnamed: 0', '顧客ID'], axis=1)
        org_df = org_df.drop(['顧客ID'],axis=1)
        org_df = org_df[org_df.columns.drop(list(org_df.filter(regex='Unnamed:')))]
        # Drop columns that are 70% or more missing
        #org_df = self.drop_na.drop_na_col(org_df, len(org_df), 0.7)
        #print('\n rows of org_df is:')
        #print(len(org_df))
        #print(type(len(org_df)))
        # Fill missing values with zero
        #org_df = org_df.fillna(0)

        # Target variable Y
        Y = org_df['売上']
        #Y = org_df['支払合計']
        #Y = org_df['スコア']

        # Split into equal-sized bins
        #bin_Y = pd.cut(org_Y, 2, labels=False)
        #print(bin_Y)

        # Explanatory variables X
        #X = org_df.drop(['支払合計'],axis=1)
        X = org_df.drop(['売上単価','数量','売上'],axis=1)
        # Drop attribute columns
        X = X.drop(['キャンセル回数','コンタクト回数','問い合わせ回数'],axis=1)
        X = X[X.columns.drop(list(org_df.filter(regex='施術時間')))]
        X = X[X.columns.drop(list(org_df.filter(regex='指名回数')))]
        #X = X[X.columns.drop(list(org_df.filter(regex='コース受諾回数')))]
        X = X[X.columns.drop(list(org_df.filter(regex='紹介カード受渡回数')))]
        X = X[X.columns.drop(list(org_df.filter(regex='治療送客回数')))]
        X = X[X.columns.drop(list(org_df.filter(regex='院長挨拶回数')))]
        X = X[X.columns.drop(list(org_df.filter(regex='性別')))]
        X = X[X.columns.drop(list(org_df.filter(regex='携帯TEL')))]
        X = X[X.columns.drop(list(org_df.filter(regex='自宅TEL')))]
        X = X[X.columns.drop(list(org_df.filter(regex='携帯メール')))]
        X = X[X.columns.drop(list(org_df.filter(regex='PCメール')))]
        X = X[X.columns.drop(list(org_df.filter(regex='職業')))]
        X = X[X.columns.drop(list(org_df.filter(regex='登録区分')))]


        # Fill missing values with zero
        Y = Y.fillna(0)
        X = X.fillna(0)

        # Run the per-column individual tests
        self.individual.lin_reg(X, Y, 0.3, X.columns, out_path)
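A usage sketch for LinRegressionInd; the constructor reads droplist.txt (one column name per line) from the working directory, and the paths below are placeholders:

# Hypothetical driver: per-column regressions over the merged analysis file.
reg = LinRegressionInd()
reg.regression('./data/out/merged.csv', './data/out/lin_reg_ind.csv')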
Example #19
class LinRegression:

    def __init__(self):
        self.lr = LinearRegression()
        self.file_io = FileIO()
        #self.pca = PCAProcess()
        #self.chart = DrawChart()
        self.test = Test()
        #self.individual = IndividualTest()
        #self.sc = StandardScaler()
        #self.ms = MinMaxScaler()
        self.ss = Scaler()
        self.drop_na = DropNaN()

    def regression(self, in_path, out_path):
        # Open the input files
        org_df = self.file_io.open_file_as_pandas(in_path, "utf-8")
        feat_shop = self.file_io.open_file_as_pandas('./data/out/feat_shop.csv', 'utf-8')
        feat_pref = self.file_io.open_file_as_pandas('./data/out/feat_pref.csv', 'utf-8')

        '''
        # Target variable
        org_df['支払合計'] = org_df['現金外支払合計'] + org_df['現金支払合計']
        # Drop unneeded explanatory variables
        org_df = org_df.drop(['現金外支払合計', '現金支払合計'],axis=1)
        # Drop rows where the target is zero
        org_df = org_df.drop(org_df[org_df['支払合計']==0].index)
        # Drop columns with too many missing values
        #org_df = org_df.drop(['売上単価'],axis=1)
        # Drop rows where the target is missing
        org_df = org_df.dropna(subset=['支払合計'])
        '''
        # Add shop / prefecture features
        #org_df = pd.merge(org_df, feat_shop, on='顧客ID',how='left')
        org_df = pd.merge(org_df, feat_pref, on='顧客ID',how='left')
        org_df = org_df.drop(['Unnamed: 0_x','Unnamed: 0_y'],axis=1)
        org_df = org_df[org_df.columns.drop(list(org_df.filter(regex='Unnamed:')))]
        # Drop rows with sales <= 0
        org_df = org_df.drop(org_df[org_df['売上']<=0].index)
        # Drop unneeded columns
        #org_df = org_df.drop(['Unnamed: 0', '顧客ID'], axis=1)
        org_df = org_df.drop(['顧客ID'],axis=1)
        #org_df = org_df[org_df.columns.drop(list(org_df.filter(regex='Unnamed:')))]
        #org_df = org_df.columns.drop(org_df.columns.str.contains('Unnamed:'))
        # Drop columns that are 70% or more missing
        #org_df = self.drop_na.drop_na_col(org_df, len(org_df), 0.7)
        #print('\n rows of org_df is:')
        #print(len(org_df))
        #print(type(len(org_df)))
        # Fill missing values with zero
        org_df = org_df.fillna(0)

        # Target variable Y and explanatory variables X
        Y = org_df['売上']
        #Y = org_df['スコア']
        #X = org_df.drop(['支払合計'],axis=1)
        X = org_df.drop(['売上単価','数量','売上'],axis=1)
        #X = org_df.drop(['商品コード','売上単価','数量','売上','明細ID','スコア'],axis=1)
        X = X.drop(['キャンセル回数','コンタクト回数','問い合わせ回数'],axis=1)
        #X = X.drop(['治療送客回数_あり','治療送客回数_なし','院長挨拶回数_あり','院長挨拶回数_なし','紹介カード受渡回数_あり','紹介カード受渡回数_なし','携帯TEL_有','携帯メール_有','性別_女','性別_男','自宅TEL_有','PCメール_有'],axis=1)
        #X = X.drop(['職業_学生','職業_会社員','職業_主婦','職業_自営業','職業_その他','職業_パート・アルバイト'],axis=1)
        X = X.drop(['登録区分_HP','登録区分_店舗','登録区分_CC'],axis=1)
        X = X.drop(['生年月日','滞在時間','閲覧ページ総数','閲覧ページ数/セッション'],axis=1)
        X = X.drop(['治療送客回数_空欄','指名回数_空欄','コース受諾回数_空欄','紹介カード受渡回数_空欄','院長挨拶回数_空欄','性別_空欄','携帯TEL_空欄','自宅TEL_空欄','携帯メール_空欄','PCメール_空欄','職業_空欄','登録区分_空欄'],axis=1)
        X = X[X.columns.drop(list(org_df.filter(regex='_nan')))]
        X = X[X.columns.drop(list(org_df.filter(regex='_なし')))]
        #X = X[X.columns.drop(list(org_df.filter(regex='_空欄')))]
        X = X[X.columns.drop(list(org_df.filter(regex='_無')))]
        X = X[X.columns.drop(list(org_df.filter(regex='_削除')))]
        X = X[X.columns.drop(list(org_df.filter(regex='施術時間')))]
        X = X[X.columns.drop(list(org_df.filter(regex='性別_男')))]
        X = X[X.columns.drop(list(org_df.filter(regex='性別_女')))]
        X = X[X.columns.drop(list(org_df.filter(regex='携帯TEL_有')))]
        X = X[X.columns.drop(list(org_df.filter(regex='治療送客回数_あり')))]
        X = X[X.columns.drop(list(org_df.filter(regex='紹介カード受渡回数_あり')))]
        X = X[X.columns.drop(list(org_df.filter(regex='町域_')))] # dropped: contributes almost nothing to the result

        # Standardize (mean 0, variance 1)
        std_X = self.ss.standard_scaler(X,axis=1,data_type='float')
        #std_Y = pd.DataFrame(self.sc.fit_transform(Y))
        #std_Y.columns = Y.columns
        #std_X = pd.DataFrame(self.sc.fit_transform(X))
        #std_X.columns = X.columns

        # Normalize (min-max)
        #norm_Y = pd.DataFrame(self.ms.fit_transform(Y))
        #norm_Y.columns = Y.columns
        #norm_X = pd.DataFrame(self.ms.fit_transform(X))
        #norm_X.columns = X.columns
        #self.file_io.export_csv_from_pandas(X, './data/out/X.csv')

        # Split into training and test data (30% test)
        X_train, X_test, Y_train, Y_test = self.test.make_train_test_data(std_X, Y, 0.3)
        #X_train, X_test, Y_train, Y_test = self.test.make_train_test_data(X, Y, 0.3)
        print(X_train.head())
        print("--- X_train's shape ---\n {}\n".format(X_train.shape))
        print(X_test.head())
        print("--- X_test's shape ---\n {}\n".format(X_test.shape))
        print(Y_train.head())
        print("--- Y_train's shape ---\n {}\n".format(Y_train.shape))
        print(Y_test.head())
        print("--- Y_test's shape ---\n {}\n".format(Y_test.shape))


        # Fit the multiple regression
        self.lr.fit(X_train, Y_train)
        # Regression coefficients
        print(pd.DataFrame({"Name":X.columns,
                            "Coefficients":self.lr.coef_}).sort_values(by='Coefficients') )
        # Intercept
        print(self.lr.intercept_)

        # Build a DataFrame of coefficients
        org_pd = pd.DataFrame({"Name":X.columns,
                            "Coefficients":self.lr.coef_})
        # Write out
        self.file_io.export_csv_from_pandas(org_pd, out_path)

        # Compute accuracy
        # training data
        print(" --- train score ---\n {}\n".format(self.lr.score(X_train,Y_train)))
        # test data
        print(" --- test score ---\n {}\n".format(self.lr.score(X_test,Y_test)))

        return self.lr.score(X_train,Y_train), self.lr.score(X_test,Y_test)
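Since LinRegression.regression returns the train and test R^2 scores, a caller can capture and compare them; a usage sketch with placeholder paths:

# Hypothetical driver: fit the multiple regression and keep the R^2 scores.
reg = LinRegression()
train_score, test_score = reg.regression('./data/out/merged.csv',
                                         './data/out/coefficients.csv')
print('train R^2: {:.3f}, test R^2: {:.3f}'.format(train_score, test_score))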