def transform(self, X):
    """Write one mapped feature column into ``X`` per configured entry.

    For each position ``i``, the first element of ``self.exec_cols[i]`` names
    the source column and ``self.new_cols[i]`` names the output column. The
    source values are mapped through ``self.mean_map_dict[<output column>]``
    and the mapped series is passed through the project helper ``downcast``
    (called with ``accuracy_loss=False``) before being assigned.

    ``X`` is modified in place and nothing is returned.
    """
    log(f"start TP transform")
    for position, source_cols in enumerate(self.exec_cols):
        source_name = source_cols[0]
        target_name = self.new_cols[position]
        mapped = X[source_name].map(self.mean_map_dict[target_name])
        X[target_name] = downcast(mapped, accuracy_loss=False)
def func(df):
    """Return the per-group mean of the second column, grouped by the first.

    Args:
        df: frame whose first column holds the grouping key and whose second
            column holds the values to average.

    Returns:
        A ``(column_names, means)`` pair: the frame's column labels as a
        tuple, and the group means after the project helper ``downcast``.

    Note:
        The value column of ``df`` is overwritten in place with its float32
        version before grouping.
    """
    key_name, value_name = df.columns[0], df.columns[1]
    df[value_name] = df[value_name].astype('float32')
    # sort=False keeps groups in order of first appearance.
    grouped = df.groupby(key_name, sort=False)
    return tuple(df.columns), downcast(grouped[value_name].mean())
def mean_label(ss: pd.Series, y):
    """Compute the positive-label rate for every distinct value of ``ss``.

    Args:
        ss: series of category values; its ``name`` is returned as the key.
        y: labels aligned with ``ss`` by index; rows where the label equals
            ``1`` are counted as positives.

    Returns:
        A ``(name, rate)`` pair where ``rate`` is a series indexed by the
        distinct values of ``ss`` holding ``positives / occurrences``, passed
        through the project helper ``downcast`` with ``accuracy_loss=False``.
        Values that never occur with a positive label have no entry in the
        positive counts and therefore get ``NaN``.
    """
    col = ss.name
    df = pd.concat([ss, y], axis=1)
    df.columns = [col, 'label']

    total_counts = df[col].value_counts()
    positive_counts = df.loc[df['label'] == 1, col].value_counts()

    # Outer-joined on the category value, so categories without positives
    # keep their row (with a missing positive count).
    df = pd.concat([total_counts, positive_counts], axis=1)
    df.columns = ['cnt', 'pos_cnt']

    # Rate of positives is positives over total occurrences, not the inverse.
    df['rate'] = df['pos_cnt'] / df['cnt']
    return col, downcast(df['rate'], accuracy_loss=False)
def transform(self, X: pd.DataFrame):
    """Write one count-mapped feature column into ``X`` per column pair.

    For every ``(col1, col2)`` in ``self.exec_cols`` the two columns are
    combined via ``gen_combine_cats`` (in parallel, using shared memory), the
    combined values are mapped through ``self.cnt_map_dict[<output column>]``
    and the result is stored under the matching name from ``self.new_cols``
    after being passed through ``downcast`` with ``accuracy_loss=False``.

    ``X`` is modified in place and nothing is returned.
    """
    log(f"start TP transform")

    def func(df):
        cats = gen_combine_cats(df, df.columns)
        return tuple(df.columns), cats

    res = Parallel(n_jobs=CONSTANT.JOBS, require='sharedmem')(
        delayed(func)(X[[col1, col2]]) for col1, col2 in self.exec_cols
    )

    # joblib returns results in the order the tasks were submitted, so result
    # i belongs to exec_cols[i] / new_cols[i]. Pairing by position avoids a
    # linear ``list.index`` scan per result and does not depend on the pairs
    # being unique or stored as tuples.
    for new_col, (_, cats) in zip(self.new_cols, res):
        mapped = cats.map(self.cnt_map_dict[new_col])
        X[new_col] = downcast(mapped, accuracy_loss=False)
def func(df):
    """Count how often each combined category of ``df``'s columns occurs.

    The columns of ``df`` are merged into a single series by the project
    helper ``gen_combine_cats``; the occurrences of each resulting value are
    then counted.

    Returns:
        A ``(column_names, counts)`` pair: the frame's column labels as a
        tuple, and the value counts after the project helper ``downcast``.
    """
    combined = gen_combine_cats(df, df.columns)
    occurrences = combined.value_counts()
    column_names = tuple(df.columns)
    return column_names, downcast(occurrences)
def values_cnt(ss: pd.Series):
    """Return ``ss``'s name paired with the counts of its distinct values.

    The counts come from ``Series.value_counts`` and are passed through the
    project helper ``downcast`` before being returned.
    """
    return ss.name, downcast(ss.value_counts())
def time_atr(ss: pd.Series, atr):
    """Extract a datetime attribute from ``ss`` via its ``.dt`` accessor.

    Args:
        ss: series that supports the ``.dt`` accessor.
        atr: name of the attribute to read from ``ss.dt``.

    Returns:
        A ``(atr, values)`` pair where ``values`` is the attribute passed
        through the project helper ``downcast`` with ``accuracy_loss=False``.
    """
    extracted = getattr(ss.dt, atr)
    reduced = downcast(extracted, accuracy_loss=False)
    return atr, reduced
def func(df):
    """Subtract the second column of ``df`` from the first.

    Returns:
        A ``(column_names, difference)`` pair: the frame's column labels as a
        tuple, and ``first - second`` passed through the project helper
        ``downcast`` with ``accuracy_loss=False``.
    """
    names = list(df.columns)
    minuend = df[names[0]]
    subtrahend = df[names[1]]
    return tuple(df.columns), downcast(minuend - subtrahend, accuracy_loss=False)
def values_cnt(ss: pd.Series):
    """Return ``ss``'s name paired with a running occurrence index per value.

    The series is grouped by its own values and each row receives its
    ``cumcount`` within that group, i.e. how many earlier rows carried the
    same value. The result is passed through the project helper ``downcast``.
    """
    same_value_groups = ss.groupby(ss)
    running_index = same_value_groups.cumcount()
    return ss.name, downcast(running_index)