Example #1
0
# Group rows by marital status and by whether the work area is Tokyo or Osaka.
class partner_capital_group(Feature):
    """Categorical feature combining ``partner`` with a Tokyo/Osaka flag.

    The value stored in the column named after this class is:

    * 0 -- ``partner`` is 0, ``area`` is neither Tokyo nor Osaka
    * 1 -- ``partner`` is 0, ``area`` is Tokyo or Osaka
    * 2 -- ``partner`` is 1, ``area`` is neither Tokyo nor Osaka
    * 3 -- ``partner`` is 1, ``area`` is Tokyo or Osaka

    Any other combination (whatever ``str(partner)`` is not exactly ``'0'``
    or ``'1'``) is not in the lookup table and therefore becomes NaN.
    """

    def create_features(self):
        """Fill ``self.train`` / ``self.test`` with the encoded group column.

        Reads the module-level ``train`` and ``test`` frames and
        ``len_train`` (the number of rows in ``train``), which the script
        entry point is expected to define before features are generated.
        """
        name = self.__class__.__name__
        whole = pd.concat([train, test], axis=0)

        # One vectorised membership test replaces two row-wise ``apply``
        # passes; an area can be Tokyo or Osaka but never both, so the flag
        # is the same 0/1 value the sum of the two indicators used to give.
        is_capital = whole['area'].isin(['東京都', '大阪府']).astype(int)

        # Build the "<partner>_<is_capital>" key and translate it to a code.
        group_key = whole['partner'].astype(str) + '_' + is_capital.astype(str)
        codes = group_key.map({'0_0': 0, '0_1': 1, '1_0': 2, '1_1': 3}).values

        # Rows were concatenated train-first, so split positionally.
        self.train[name] = codes[:len_train]
        self.test[name] = codes[len_train:]


if __name__ == '__main__':
    # Parse the command-line options; only ``args.force`` is used below.
    args = get_arguments()

    # Load the train/test tables. These are intentionally module-level names:
    # partner_capital_group.create_features reads ``train`` and ``test``
    # as globals.
    train = pd.read_feather('../data/input/train_data.feather')
    test = pd.read_feather('../data/input/test_data.feather')

    # Number of train rows; the feature class uses it to split the
    # concatenated train+test frame back into its two parts.
    len_train = len(train)

    # Hand this module's namespace and the force flag to generate_features.
    # NOTE(review): presumably it looks up the Feature subclasses in the
    # namespace and runs them -- confirm against its definition.
    generate_features(globals(), args.force)
Example #2
0
                                                   if x == 'male' else 0)

    def add_meta(self):
        """Record this feature's metadata in ``self.meta_dict``.

        Sets the keys ``memo``, ``num_or_cat`` and ``date``; the date is
        ``self.now`` rendered as ``YYYY-MM-DD HH:MM:SS``.
        """
        meta = self.meta_dict
        meta['memo'] = 'sex'
        meta['num_or_cat'] = 'cat'
        # format(obj, spec) applies the same format spec as '{0:spec}'.format.
        meta['date'] = format(self.now, '%Y-%m-%d %H:%M:%S')


if __name__ == '__main__':
    # Logging setup: a logger plus two handlers, used below as the stream
    # handler (sh) and the file handler (fh).
    logger, sh, fh = preparation_logger()

    # Parse the arguments and load the data. ``train`` and ``test`` are
    # module-level names, so they are part of the globals() passed below.
    args = Util.get_arguments()
    train = Util.load_train_data()
    test = Util.load_test_data()

    # Debug ("test") mode?
    if args.debug:
        fh.setLevel(logging.ERROR)  # signals the intent not to write to the log file
        sh.setLevel(logging.DEBUG)  # switch the stream handler from INFO to DEBUG
        logger.info('*******************************')
        logger.info('********** test mode **********')
        logger.info('*******************************')

    logger.info('-------------------- start')
    # Preview of both tables; only emitted when DEBUG output is enabled.
    logger.debug(f'\n-train\n {train.head()}')
    logger.debug(f'\n-test\n {test.head()}')
    # Pass this module's namespace plus the force/debug flags.
    generate_features(globals(), args.force, args.debug)
    logger.info('-------------------- end')
        if len(col) != 0:
            return

        writer = csv.writer(f)
        writer.writerow([col_name, desc])


if __name__ == '__main__':

    # Write the CSV header row ("feature", "memo") to the feature memo
    create_memo('特徴量', 'メモ')

    if 'ipykernel' in sys.modules:
        # Running inside a Jupyter/IPython kernel, where no command-line
        # flags are parsed: edit this to True when the pkl files should be
        # overwritten
        overwrite_ok = True
    else:
        overwrite_ok = get_arguments().force

    train = pd.read_csv(RAW_DATA_DIR_NAME + 'train.csv')
    test = pd.read_csv(RAW_DATA_DIR_NAME + 'test.csv')

    # Pass train and test via the globals() dictionary
    generate_features(globals(), overwrite_ok)

    # Sort the feature memo by feature name and write it back in place
    feature_df = pd.read_csv(feature_memo_path)
    feature_df = feature_df.sort_values('特徴量')
    feature_df.to_csv(feature_memo_path, index=False)

# %%