def fit(self, Y_full, X_train):  # Y_full is a dictionary
    """
    Summary
    -------
    Fit a one-vs-rest logistic regression: one binary model per category
    of the response variable.

    Parameters
    ----------
    self : LogRegModel instance
    Y_full : dict
        Dictionary holding the response variable.
    X_train : numpy matrix
        Matrix of covariates.
    """
    self.coef = dict()
    Y_name = str(list(Y_full.keys())[0])
    SubDict = get_dummies(Y_full, Y_name)
    categories = list(set(Y_full[Y_name]))
    for cat in categories:
        Y_train = SubDict[cat]
        self.fit_binary(Y_train, X_train, cat)
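# Below is a minimal, self-contained sketch of the one-vs-rest expansion that
# fit() relies on. It assumes get_dummies(Y_full, Y_name) returns a dict
# mapping each category to a 0/1 target vector; make_binary_targets is a
# hypothetical stand-in, not the repo's implementation.
import numpy as np

def make_binary_targets(Y_full, Y_name):
    # One binary target per category: 1 where the response equals the
    # category, 0 elsewhere.
    y = np.asarray(Y_full[Y_name])
    return {cat: (y == cat).astype(int) for cat in set(y)}

# Each vector is what fit_binary would receive as Y_train for that category.
Y_full = {'species': ['a', 'b', 'a', 'c']}
print(make_binary_targets(Y_full, 'species')['a'])  # [1 0 1 0]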
def data_check(self, train=[], test=[], target='', encode='', exclude_category=False):
    '''
    Explain:
        Check the data for problems before training.
        If categorical columns are present, encode or drop them.
    Args:
    Return:
    '''
    if len(test):
        df = pd.concat([train, test], axis=0)
    else:
        df = train

    try:
        categorical_list = []
        for col in df.columns:
            if (df[col].dtype == 'object') and col not in self.ignore_list:
                categorical_list.append(col)
    except AttributeError:
        # With a duplicated column name, df[col] is a DataFrame and has no
        # .dtype attribute, which raises AttributeError.
        print(f"Duplicate Column: {col}")
        sys.exit()

    dt_list = [
        col for col in list(df.columns)
        if str(df[col].dtype).count('time') and col not in self.ignore_list
    ]

    self.logger.info(f'''
#==============================================================================
# DATA CHECK START
# CATEGORICAL FEATURE: {categorical_list}
# DATETIME FEATURE   : {dt_list}
# CAT ENCODE         : {encode}
# ignore_list        : {self.ignore_list}
#==============================================================================
''')

    if encode == 'label':
        df = factorize_categoricals(df, categorical_list)
    elif encode == 'dummie':
        df = get_dummies(df, categorical_list)
    elif encode == 'ordinal':
        df, decoder = ordinal_encode(df, categorical_list)
        self.decoder = decoder

    if len(test):
        train = df[~df[target].isnull()]
        test = df[df[target].isnull()]
    else:
        train = df

    ' Drop columns whose unique-value count in the test set is 1 or less '
    drop_list = dt_list
    if len(test):
        for col in test.columns:
            length = test[col].nunique()
            if length <= 1 and col not in self.ignore_list and col != target:
                self.logger.info(f'''
***********WARNING*************
LENGTH {length} COLUMN: {col}''')
                self.move_feature(feature_name=col)
                if col not in self.ignore_list:
                    drop_list.append(col)

    self.logger.info(f'''
#==============================================================================
# DATA CHECK END
# SHAPE: {df.shape}
#==============================================================================''')

    return train, test, drop_list
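# Why the AttributeError guard above matters: with a duplicated column name,
# df[col] returns a two-column DataFrame rather than a Series, and a
# DataFrame has no .dtype attribute. A minimal reproduction:
import pandas as pd

df_dup = pd.DataFrame([[1, 'x']], columns=['a', 'a'])
try:
    df_dup['a'].dtype  # df_dup['a'] is a DataFrame here, not a Series
except AttributeError:
    print("Duplicate Column: a")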
def data_check(logger, df, target, test=False, dummie=0, exclude_category=False, ignore_list=[]):
    '''
    Explain:
        Check the data for problems before training.
        If categorical columns are present, encode or drop them.
    Args:
    Return:
    '''
    logger.info(f'''
#==============================================================================
# DATA CHECK START
#==============================================================================''')

    categorical_list = get_categorical_features(df, ignore_list=ignore_list)
    dt_list = get_datetime_features(df, ignore_list=ignore_list)

    logger.info(f'''
#==============================================================================
# CATEGORICAL FEATURE: {categorical_list}
# LENGTH: {len(categorical_list)}
# DUMMIE: {dummie}
#==============================================================================
''')

    #========================================================================
    # Columns that should be treated as continuous sometimes arrive as object
    #========================================================================
    # for cat in categorical_list:
    #     try:
    #         df[cat] = df[cat].astype('int64')
    #         categorical_list.remove(cat)
    #     except ValueError:
    #         pass

    #========================================================================
    # Drop datetime columns
    #========================================================================
    for dt in dt_list:
        df.drop(dt, axis=1, inplace=True)

    ' If a column has more than 100 unique values, label-encode it instead '
    # Iterate over a copy: removing items while iterating the same list
    # silently skips elements.
    label_list = []
    for cat in categorical_list[:]:
        if len(df[cat].drop_duplicates()) > 100:
            label_list.append(cat)
            categorical_list.remove(cat)
    df = factorize_categoricals(df, label_list)

    if exclude_category:
        for cat in categorical_list:
            df.drop(cat, axis=1, inplace=True)
            move_feature(feature_name=cat)
        categorical_list = []
    elif dummie == 1:
        df = get_dummies(df, categorical_list)
        categorical_list = []
    elif dummie == 0:
        df = factorize_categoricals(df, categorical_list)
        categorical_list = []
    logger.info(f'df SHAPE: {df.shape}')

    ' Drop columns whose unique-value count in the test set is 1 or less '
    drop_list = []
    if test:
        for col in df.columns:
            length = df[col].nunique()
            if length <= 1 and col not in ignore_list:
                logger.info(f'''
***********WARNING*************
LENGTH {length} COLUMN: {col}''')
                move_feature(feature_name=col)
                if col != target:
                    drop_list.append(col)

    logger.info(f'''
#==============================================================================
# DATA CHECK END
#==============================================================================''')

    return df, drop_list
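# get_categorical_features / get_datetime_features are defined elsewhere in
# the repo; the sketches below are hypothetical stand-ins consistent with how
# data_check calls them (the real implementations may differ):
import pandas as pd

def get_categorical_features_sketch(df, ignore_list=[]):
    # object-dtype columns, minus anything explicitly ignored
    return [col for col in df.columns
            if df[col].dtype == 'object' and col not in ignore_list]

def get_datetime_features_sketch(df, ignore_list=[]):
    # dtype names like 'datetime64[ns]' contain the substring 'time'
    return [col for col in df.columns
            if 'time' in str(df[col].dtype) and col not in ignore_list]

df_demo = pd.DataFrame({'cat': ['a', 'b'],
                        'ts': pd.to_datetime(['2020-01-01', '2020-01-02']),
                        'num': [1, 2]})
print(get_categorical_features_sketch(df_demo))  # ['cat']
print(get_datetime_features_sketch(df_demo))     # ['ts']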
def main():
    '''
    BASE AGGREGATION
    Aggregate single columns at the granularity given by `level`.
    '''
    if agg_code == 'base':
        #=======================================================================
        # Build the list of columns to aggregate
        #=======================================================================
        num_list = get_numeric_features(df=df, ignore=ignore_list)

        #=======================================================================
        # Run the aggregation
        #=======================================================================
        arg_list = []
        for num in num_list:
            for method in method_list:
                # Each worker gets one argument tuple (append takes a single
                # object, so the arguments must be packed).
                arg_list.append((df, key, num, method, prefix, '', base))

        ' Encode per value of each categorical column in the dataset '
        call_list = pararell_process(pararell_wrapper(base_aggregation), arg_list)
        result = pd.concat(call_list, axis=1)
        for col in result.columns:
            if not col.count('@') or col in ignore_list:
                continue
            print(col)
            # utils.to_pickle(path=f"{dir}/{col}.fp", obj=result[col].values)
        sys.exit()

    elif agg_code == 'caliculate':
        '''
        CALCULATION
        Combine pairs of columns with arithmetic operations to create new features.
        '''
        f1_list = []
        f2_list = []
        used_list = []
        for f1 in f1_list:
            for f2 in f2_list:
                ' Do not compute the same combination of features twice '
                if f1 == f2:
                    continue
                if sorted([f1, f2]) in used_list:
                    continue
                used_list.append(sorted([f1, f2]))

                if diff:
                    df = diff_feature(df=df, first=f1, second=f2)
                elif div:
                    df = division_feature(df=df, first=f1, second=f2)
                elif pro:
                    df = product_feature(df=df, first=f1, second=f2)

        for col in df.columns:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'cnt':
        '''
        COUNT ENCODING
        Aggregate at `level` granularity and count cnt_val, duplicates included.
        '''
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        for category_col in cat_list:
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        cnt_cols = [col for col in df.columns if col.count('cntec')]
        for col in cnt_cols:
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'category':
        arg_list = []
        ' Categorical columns '
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)
        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    arg_list.append((base, df, key, cat, value, method, ignore_list, prefix))

        ' Encode per value of each categorical column in the dataset '
        pararell_process(pararell_wrapper(select_category_value_agg), arg_list)

    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))

    elif agg_code == 'dummie':
        ' One-hot encode the categorical columns in the dataset and take their means '
        cat_list = get_categorical_features(data, ignore_features)
        df = get_dummies(df=df, cat_list=cat_list)
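# cnt_encoding is not shown in this excerpt. A minimal count-encoding sketch
# consistent with the 'cnt' branch above (the 'cntec' marker matches the
# column filter); the repo's version may handle ignore_list and column naming
# differently:
import pandas as pd

def cnt_encoding_sketch(df, category_col):
    # Map each category value to its frequency in the column, duplicates kept.
    df[f'cntec_{category_col}'] = df[category_col].map(df[category_col].value_counts())
    return df

df_demo = pd.DataFrame({'city': ['tokyo', 'osaka', 'tokyo']})
print(cnt_encoding_sketch(df_demo, 'city'))
#     city  cntec_city
# 0  tokyo           2
# 1  osaka           1
# 2  tokyo           2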
def main():
    path = f'../input/{sys.argv[1]}*'
    df = utils.read_df_pickle(path=path)
    prefix = sys.argv[2]

    '''
    BASE AGGREGATION
    Aggregate single columns at the granularity given by `level`.
    '''
    if agg_code == 'base':
        one_base_agg(df=df, prefix=prefix)

    elif agg_code == 'caliculate':
        df = two_calicurate(df=df)
        if prefix != 'app_':
            one_base_agg(df=df, prefix=prefix)
        else:
            for col in df.columns:
                utils.to_pickle(path=f"{dir}/{prefix}{col}.fp", obj=df[col].values)

    elif agg_code == 'cnt':
        '''
        COUNT ENCODING
        Aggregate at `level` granularity and count cnt_val, duplicates included.
        '''
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        for category_col in cat_list:
            df = cnt_encoding(df, category_col, ignore_list)
        df = base.merge(df, on=key, how='inner')
        cnt_cols = [col for col in df.columns if col.count('cntec')]
        for col in cnt_cols:
            if exclude_feature(col, df[col].values):
                continue
            utils.to_pickle(path=f"{dir}/{col}.fp", obj=df[col].values)

    elif agg_code == 'category':
        ' Aggregate per value within each categorical column '
        cat_list = get_categorical_features(df=df, ignore=ignore_list)
        num_list = get_numeric_features(df=df, ignore=ignore_list)
        for cat in cat_list:
            for value in num_list:
                for method in method_list:
                    select_category_value_agg(
                        base, df=df, key=key, category_col=cat, value=value,
                        method=method, ignore_list=ignore_list, prefix=prefix)

    elif agg_code == 'combi':
        combi_num = [1, 2, 3][0]
        cat_combi = list(combinations(categorical, combi_num))

    elif agg_code == 'dummie':
        ' One-hot encode the categorical columns in the dataset and take their means '
        cat_list = get_categorical_features(data, ignore_features)
        df = get_dummies(df=df, cat_list=cat_list)
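# exclude_feature is called in the 'cnt' branch above but not defined in this
# excerpt. A hypothetical sketch of the guard: skip features that are
# effectively constant; the repo's real check may be stricter (e.g. null
# ratio or variance thresholds):
import numpy as np
import pandas as pd

def exclude_feature_sketch(col_name, values):
    # A single unique non-null value carries no information for a model.
    if pd.Series(values).nunique(dropna=True) <= 1:
        print(f'EXCLUDE FEATURE: {col_name}')
        return True
    return False

print(exclude_feature_sketch('const_col', np.ones(5)))  # True  -> skipped
print(exclude_feature_sketch('cnt_col', [1, 2, 3]))     # False -> pickled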