def test_transform(self, X, fe=None):
    """Attach the stored per-key label history to ``X`` as lag feature columns.

    For every row, the value of ``self.key`` is looked up in
    ``self.cat2label``. A matching entry supplies up to ``self.max_win``
    values, written to the columns ``<key>_<primary_time>_<label>_<i>`` for
    ``i`` in ``1..max_win`` (each name passed through ``gen_feat_name``).
    Keys without an entry, and positions beyond the stored history, are NaN.
    Finally ``self.feat_expend`` is called on the same frame.

    Args:
        X: A ``pandas.DataFrame`` or an object exposing one as ``X.data``;
            the frame is modified in place.
        fe: Optional collection of feature names forwarded to
            ``feat_expend``. Defaults to an empty set.

    Raises:
        ValueError: Raised by pandas if a stored history is longer than
            ``self.max_win``, because the row no longer fits the columns.
    """
    if fe is None:
        fe = set()
    df = X if isinstance(X, pd.DataFrame) else X.data
    cats = df[self.key].values

    # Each row is padded to exactly max_win entries on a *copy* of the stored
    # history. This keeps self.cat2label untouched between calls and makes the
    # frame construction succeed even when no key in this batch is known.
    nan = float('nan')
    vals = []
    for cat in cats:
        history = self.cat2label[cat] if cat in self.cat2label else []
        vals.append(list(history) + [nan] * (self.max_win - len(history)))

    new_cols = [
        f'{self.key}_{self.primary_time}_{self.label}_{i}'
        for i in range(1, self.max_win + 1)
    ]
    res = pd.DataFrame(vals, columns=new_cols, index=cats, dtype='float32')
    for col in res.columns:
        new_col = gen_feat_name(self.__class__.__name__, col, 'num')
        # .values drops res's key-based index so assignment is positional.
        df[new_col] = downcast(res[col].values)
    self.feat_expend(df, fe)
def transform(self, ss):
    """Replace each configured categorical column of ``ss`` with numeric codes.

    Every column named in ``self.cats`` is encoded against the category list
    stored for it in ``self.cats2unique``. The integer codes are offset by
    ``CAT_SHIFT``, converted to float, passed through ``downcast`` and written
    back under the same column name.

    Args:
        ss: Column-indexable container (e.g. a DataFrame); modified in place.
    """
    for name in self.cats:
        encoded = pd.Categorical(ss[name], categories=self.cats2unique[name])
        # pandas assigns code -1 to values missing from ``categories``; after
        # the shift those end up as CAT_SHIFT - 1.
        shifted = (encoded.codes + CAT_SHIFT).astype('float')
        ss[name] = downcast(shifted, accuracy_loss=False)
def transform(self, X):
    """Expand the primary timestamp column into datetime-component columns.

    For each name in ``self.attrs`` the matching ``.dt`` accessor attribute of
    the timestamp column is stored in a new column called
    ``<primary_timestamp>_<attr>``. Afterwards the timestamp column itself is
    replaced by its ``int64`` representation floor-divided by ``10**9``.

    Args:
        X: A ``pandas.DataFrame`` or an object exposing one as ``X.data``;
            the frame is modified in place.
    """
    if isinstance(X, pd.DataFrame):
        frame = X
    else:
        frame = X.data

    ts_col = self.primary_timestamp
    for attr_name in self.attrs:
        target = ts_col + '_' + attr_name
        component = getattr(frame[ts_col].dt, attr_name)
        frame[target] = downcast(component, accuracy_loss=False)

    # Done last: the loop above needs the column to still support ``.dt``.
    # NOTE(review): presumably converts nanosecond epochs to seconds - confirm
    # the column is datetime64[ns].
    as_int = frame[ts_col].astype('int64')
    frame[ts_col] = as_int // 10**9
def func(shift):
    """Return the label column shifted by ``shift`` rows, blanked on key change.

    Positions where the key column differs from the key column shifted by the
    same amount are set to NaN, so a value is only kept when the row it came
    from carries the same key. The series is renamed to
    ``<key>_<primary_time>_<label>_<shift>`` and passed through ``downcast``.
    """
    keys = self.record[self.key]
    shifted = self.record[self.label].shift(shift)
    key_changed = keys != keys.shift(shift)
    shifted[key_changed] = np.nan
    shifted.name = f'{self.key}_{self.primary_time}_{self.label}_{shift}'
    return downcast(shifted)
def train_transform(self, X):
    """Fold the columns in ``self.cats`` into one combined-ID column.

    The first column is taken as float64; each following column ``i`` is
    merged as ``combined * self.cat_max[i] + column_i``. Entries equal to
    ``CAT_SHIFT - 1`` are replaced by NaN, and the result is downcast and
    stored under the generated feature name.

    Args:
        X: A ``pandas.DataFrame`` or an object exposing one as ``X.data``;
            the frame is modified in place.

    Returns:
        The name of the column that was added.
    """
    frame = X if isinstance(X, pd.DataFrame) else X.data

    combined = frame[self.cats[0]].astype('float64')
    for pos in range(1, len(self.cats)):
        combined = combined * self.cat_max[pos] + frame[self.cats[pos]]

    # NOTE(review): masking is applied once, after all columns are folded in;
    # confirm this matches the intended placement relative to the loop.
    sentinel = CAT_SHIFT - 1
    combined[combined == sentinel] = np.nan

    feat = gen_feat_name(
        self.__class__.__name__, '_'.join(self.cats) + '_combineID', 'cat')
    frame[feat] = downcast(combined, accuracy_loss=False)
    return feat
def feat_expend(self, df, drop_fe=None):
    """Derive extra features from the lag columns listed in ``self.new_cols``.

    Nothing is added unless ``self.feat_exp`` is truthy. Otherwise the
    following columns are written into ``df`` (each name goes through
    ``gen_feat_name``; any name found in ``drop_fe`` is skipped):

    * ``..._1_slope``: ``(c0 - c1) / (c1 + 1)`` for the first two lag columns.
    * ``..._{j+1}_div``: ``c[j] / (c[j+1] + 1)`` for up to three adjacent pairs.
    * ``..._1_iszero``: 1 where the first lag column differs from 0, else 0.
    * ``..._{a}_{b}``: row mean over consecutive groups of ``self.palm_size``
      lag columns, when ``self.palm_size > 0``.
    * ``..._{j}_mean|max|min``: row aggregate over the first ``j`` lag columns
      for each ``j`` in ``self.palt_list`` up to ``self.max_win``.

    Infinite ratios are replaced by NaN.

    Args:
        df: DataFrame that already holds the columns named in
            ``self.new_cols``; modified in place.
        drop_fe: Optional collection of generated feature names to skip.
    """
    if drop_fe is None:
        drop_fe = set()
    # NOTE(review): every derived feature is treated as gated by feat_exp;
    # confirm against the original nesting.
    if not self.feat_exp:
        return

    cls_name = self.__class__.__name__
    prefix = f'{self.key}_{self.primary_time}_{self.label}'
    lag_cols = self.new_cols

    def ratio(numerator, denominator):
        # +1 in the denominator as in the feature definition; any remaining
        # +/-inf results are treated as missing.
        out = numerator / (denominator + 1)
        out[(out == np.inf) | (out == -np.inf)] = np.nan
        return out.values

    # Only the first lag pair is used for the slope. It needs two lag columns,
    # so it is skipped (like the ratios below) when only one exists.
    if len(lag_cols) > 1:
        new_col = gen_feat_name(cls_name, f'{prefix}_1_slope', 'num')
        if new_col not in drop_fe:
            first, second = df[lag_cols[0]], df[lag_cols[1]]
            df[new_col] = ratio(first - second, second)

    for j in range(min(3, len(lag_cols) - 1)):
        new_col = gen_feat_name(cls_name, f'{prefix}_{j + 1}_div', 'num')
        if new_col not in drop_fe:
            df[new_col] = ratio(df[lag_cols[j]], df[lag_cols[j + 1]])

    new_col = gen_feat_name(cls_name, f'{prefix}_1_iszero', 'cat')
    if new_col not in drop_fe:
        # NaN != 0 evaluates to True, so missing values are flagged as 1.
        df[new_col] = downcast((df[lag_cols[0]] != 0).astype(int))

    if self.palm_size > 0:
        # Non-overlapping groups; a trailing partial group is ignored.
        for group in range(len(lag_cols) // self.palm_size):
            start = group * self.palm_size
            stop = start + self.palm_size
            new_col = gen_feat_name(
                cls_name, f'{prefix}_{start + 1}_{stop}', 'num')
            if new_col not in drop_fe:
                df[new_col] = df[lag_cols[start:stop]].mean(axis=1)

    for width in self.palt_list:
        # Stops at the first width above max_win (later entries are not
        # examined).
        if width > self.max_win:
            break
        window = lag_cols[:width]
        for agg in ('mean', 'max', 'min'):
            new_col = gen_feat_name(cls_name, f'{prefix}_{width}_{agg}', 'num')
            if new_col not in drop_fe:
                df[new_col] = getattr(df[window], agg)(axis=1)
def test_transform(self, X):
    """Add the combined-ID column to ``X``, treating flagged rows separately.

    Rows where ``self.judge`` is False are handed to ``train_transform``.
    Rows where it is True get codes from ``self.combine_cat`` encoded against
    ``self.unique``, offset by ``CAT_SHIFT + self.shift``; codes equal to
    ``CAT_SHIFT - 1`` become NaN. Both parts are concatenated, sorted by
    index, and the resulting values are written back positionally.

    Args:
        X: A ``pandas.DataFrame`` or an object exposing one as ``X.data``;
            the frame is modified in place.
    """
    frame = X if isinstance(X, pd.DataFrame) else X.data

    unflagged = frame.loc[~self.judge, :]
    self.train_transform(unflagged)
    flagged = frame[self.judge]

    feat = gen_feat_name(
        self.__class__.__name__, '_'.join(self.cats) + '_combineID', 'cat')

    if flagged.empty:
        frame[feat] = unflagged[feat].values
        return

    # NOTE(review): assumes self.combine_cat has one entry per flagged row.
    encoded = pd.Categorical(self.combine_cat, categories=self.unique).codes
    encoded = (encoded + CAT_SHIFT + self.shift).astype('float')
    encoded[encoded == (CAT_SHIFT - 1)] = np.nan
    flagged[feat] = downcast(encoded, accuracy_loss=False)

    # NOTE(review): sorting by index only restores the frame's row order if
    # its index is already sorted - confirm with callers.
    merged = pd.concat([unflagged, flagged], sort=False).sort_index()
    frame[feat] = merged[feat].values
def func(shift):
    """Return the downcast label column shifted by ``shift`` rows.

    The resulting series is named ``<primary_time>_<label>_<shift>``.
    """
    lagged = self.record[self.label].shift(shift)
    lagged = downcast(lagged)
    lagged.name = f'{self.primary_time}_{self.label}_{shift}'
    return lagged