def feat_expend(self, df, drop_fe=None):
    """Add aggregate features derived from the columns in ``self.new_cols``.

    Two families of columns are written into ``df`` in place:

    * the row-wise mean of each consecutive, non-overlapping group of
      ``self.palm_size`` columns (only when ``self.palm_size > 0``);
    * the row-wise mean / max / min of the first ``j`` columns, for each
      ``j`` in ``self.palt_list``, stopping at the first ``j`` that exceeds
      ``self.max_win``.

    Args:
        df: DataFrame that already holds the ``self.new_cols`` columns.
        drop_fe: Collection of generated feature names to skip; ``None``
            means nothing is skipped.
    """
    skipped = set() if drop_fe is None else drop_fe
    owner = self.__class__.__name__
    prefix = f'{self.key}_{self.primary_time}_{self.label}'

    if self.palm_size > 0:
        group_count = len(self.new_cols) // self.palm_size
        # Trailing columns that do not fill a whole group are ignored.
        for start in range(0, group_count * self.palm_size, self.palm_size):
            stop = start + self.palm_size
            feat = gen_feat_name(owner, f'{prefix}_{start}_{stop}', 'num')
            if feat in skipped:
                continue
            df[feat] = df[self.new_cols[start:stop]].mean(axis=1)

    for width in self.palt_list:
        # The loop stops at the first oversized window rather than skipping
        # it, so later entries of palt_list are never visited.
        if width > self.max_win:
            break
        leading_cols = self.new_cols[:width]
        for agg in ('mean', 'max', 'min'):
            feat = gen_feat_name(owner, f'{prefix}_{width}_{agg}', 'num')
            if feat in skipped:
                continue
            df[feat] = getattr(df[leading_cols], agg)(axis=1)
def feat_expend(self, df, drop_fe=None):
    """Add ratio, indicator and aggregate features built from ``self.new_cols``.

    Columns are written into ``df`` in place. When ``self.feat_exp`` is
    truthy the following are added:

    * ``..._1_slope``: ``(c0 - c1) / (c1 + 1)`` for the first two columns;
    * ``..._{k}_div``: ``c[k-1] / (c[k] + 1)`` for up to the first three
      adjacent pairs;
    * ``..._1_iszero``: integer flag, 1 where the first column is non-zero
      (the name says "iszero" but the value computed is ``!= 0``).

    Independently of ``self.feat_exp``:

    * the row-wise mean of each consecutive group of ``self.palm_size``
      columns (only when ``self.palm_size > 0``);
    * the row-wise mean / max / min of the first ``j`` columns for each
      ``j`` in ``self.palt_list``, stopping at the first ``j`` greater than
      ``self.max_win``.

    Features that need more columns than ``self.new_cols`` provides are
    skipped instead of raising ``IndexError``.

    Args:
        df: DataFrame that already holds the ``self.new_cols`` columns.
        drop_fe: Collection of generated feature names to skip; ``None``
            means nothing is skipped.
    """
    if drop_fe is None:
        drop_fe = set()

    cls_name = self.__class__.__name__
    prefix = f'{self.key}_{self.primary_time}_{self.label}'
    n_cols = len(self.new_cols)

    if self.feat_exp:
        # Only the first pair of columns is used for the slope.
        j = 0
        new_col = gen_feat_name(cls_name, f'{prefix}_{j + 1}_slope', 'num')
        # The slope needs two columns; mirror the length guard of the
        # "_div" loop below.
        if n_cols > 1 and new_col not in drop_fe:
            tmp = (df[self.new_cols[j]] - df[self.new_cols[j + 1]]) / (
                df[self.new_cols[j + 1]] + 1)
            # A denominator of exactly -1 + 1 == 0 yields +/-inf.
            tmp[(tmp == np.inf) | (tmp == -np.inf)] = np.nan
            df[new_col] = tmp.values

        for j in range(min(3, n_cols - 1)):
            new_col = gen_feat_name(cls_name, f'{prefix}_{j + 1}_div', 'num')
            if new_col not in drop_fe:
                tmp = df[self.new_cols[j]] / (df[self.new_cols[j + 1]] + 1)
                tmp[(tmp == np.inf) | (tmp == -np.inf)] = np.nan
                df[new_col] = tmp.values

        j = 0
        new_col = gen_feat_name(cls_name, f'{prefix}_{j + 1}_iszero', 'cat')
        if n_cols > 0 and new_col not in drop_fe:
            df[new_col] = downcast((df[self.new_cols[j]] != 0).astype(int))

    if self.palm_size > 0:
        feat_num = n_cols // self.palm_size
        for i in range(feat_num):
            j = i * self.palm_size
            col = self.new_cols[j:j + self.palm_size]
            # Names are 1-based and inclusive: "<first>_<last>".
            new_col = gen_feat_name(
                cls_name, f'{prefix}_{j + 1}_{j + self.palm_size}', 'num')
            if new_col not in drop_fe:
                df[new_col] = df[col].mean(axis=1)

    todo_func = ['mean', 'max', 'min']
    for j in self.palt_list:
        # Stops at the first window wider than max_win.
        if j > self.max_win:
            break
        col = self.new_cols[:j]
        for f in todo_func:
            new_col3 = gen_feat_name(cls_name, f'{prefix}_{j}_{f}', 'num')
            if new_col3 not in drop_fe:
                df[new_col3] = getattr(df[col], f)(axis=1)
def train_transform(self, X):
    """Attach lagged label columns (lags 1..``self.max_win``) to the data.

    Each lag is computed on ``self.record`` by shifting the label column and
    blanking rows whose key differs from the key ``lag`` rows earlier. The
    lagged values are then joined onto the input by
    ``(self.primary_time, self.key)`` and written into it in place.

    Args:
        X: A DataFrame, or an object exposing the DataFrame as ``X.data``.

    Returns:
        list: Generated feature names, in lag order.
    """
    frame = X if isinstance(X, pd.DataFrame) else X.data
    join_keys = [self.primary_time, self.key]

    def lagged_label(lag):
        """Return the label shifted by ``lag`` rows, NaN across key changes."""
        shifted = self.record[self.label].shift(lag)
        key_changed = self.record[self.key] != self.record[self.key].shift(lag)
        shifted[key_changed] = np.nan
        shifted.name = f'{self.key}_{self.primary_time}_{self.label}_{lag}'
        return downcast(shifted)

    lag_series = Parallel(n_jobs=JOBS, require='sharedmem')(
        delayed(lagged_label)(lag) for lag in range(1, self.max_win + 1))

    new_cols = []
    if lag_series:
        lag_table = pd.concat(lag_series, sort=False, axis=1)
        lag_table[self.primary_time] = self.record[self.primary_time]
        lag_table[self.key] = self.record[self.key]

        joined = frame[join_keys].merge(lag_table, how='left', on=join_keys)
        # The first two columns of the merge result are the join keys.
        # NOTE(review): the positional assignment below presumably relies on
        # (primary_time, key) being unique in self.record, otherwise the
        # left merge would add rows -- confirm.
        for raw_name in joined.columns[2:]:
            feat = gen_feat_name(self.__class__.__name__, raw_name, 'num')
            new_cols.append(feat)
            frame[feat] = joined[raw_name].values

    self.reduce_memory()
    return new_cols
def test_transform(self, X, fe=None):
    """Write lagged label features for test rows, then expand them.

    For each row the stored history ``self.cat2label[key]`` is laid out
    across ``self.max_win`` columns (position ``k`` of the history becomes
    lag ``k + 1``). Keys without history, and lags beyond the stored
    history, are filled with NaN. Histories longer than ``self.max_win``
    are truncated. ``self.cat2label`` is never modified.

    Args:
        X: A DataFrame, or an object exposing the DataFrame as ``X.data``.
        fe: Collection of generated feature names that ``feat_expend``
            should skip; ``None`` means nothing is skipped.
    """
    if fe is None:
        fe = set()
    df = X if isinstance(X, pd.DataFrame) else X.data
    cats = df[self.key].values

    width = self.max_win
    missing = float('nan')
    rows = []
    for cat in cats:
        history = self.cat2label[cat] if cat in self.cat2label else []
        # Copy before padding so the stored history is left untouched, and
        # give every row exactly `width` entries so the frame below always
        # matches the column list.
        row = list(history[:width])
        row.extend([missing] * (width - len(row)))
        rows.append(row)

    new_cols = [
        f'{self.key}_{self.primary_time}_{self.label}_{i}'
        for i in range(1, width + 1)
    ]
    res = pd.DataFrame(rows, columns=new_cols, index=cats, dtype='float32')
    for col in res.columns:
        new_col = gen_feat_name(self.__class__.__name__, col, 'num')
        df[new_col] = downcast(res[col].values)

    self.feat_expend(df, fe)
def train_transform(self, X):
    """Combine the columns in ``self.cats`` into one numeric ID column.

    The ID is accumulated left to right as
    ``code = code * self.cat_max[i] + df[self.cats[i]]``, starting from the
    first column cast to float64. Entries equal to ``CAT_SHIFT - 1`` are
    replaced by NaN before the result is downcast and stored in place.

    Args:
        X: A DataFrame, or an object exposing the DataFrame as ``X.data``.

    Returns:
        str: Name of the generated feature column.
    """
    frame = X if isinstance(X, pd.DataFrame) else X.data

    # astype() returns a new Series, so the in-place arithmetic below does
    # not touch the source column.
    combined = frame[self.cats[0]].astype('float64')
    for pos in range(1, len(self.cats)):
        combined *= self.cat_max[pos]
        combined += frame[self.cats[pos]]

    sentinel = CAT_SHIFT - 1
    combined[combined == sentinel] = np.nan

    feat = gen_feat_name(
        self.__class__.__name__, '_'.join(self.cats) + '_combineID', 'cat')
    frame[feat] = downcast(combined, accuracy_loss=False)
    return feat
def test_transform(self, X):
    """Write the combined-ID feature for test rows.

    Rows where ``self.judge`` is false are encoded with ``train_transform``.
    Rows where it is true receive codes derived from ``self.combine_cat``
    via ``pd.Categorical(..., categories=self.unique)``, offset by
    ``CAT_SHIFT + self.shift``. Both parts are concatenated, re-ordered by
    index and copied into the input column positionally.

    Args:
        X: A DataFrame, or an object exposing the DataFrame as ``X.data``.
    """
    frame = X if isinstance(X, pd.DataFrame) else X.data

    # NOTE(review): both subsets are selections of `frame`; assigning a
    # column to them may emit pandas' SettingWithCopyWarning -- confirm
    # whether explicit copies are wanted here.
    regular_rows = frame.loc[~self.judge, :]
    self.train_transform(regular_rows)
    flagged_rows = frame[self.judge]

    feat = gen_feat_name(
        self.__class__.__name__, '_'.join(self.cats) + '_combineID', 'cat')

    if not flagged_rows.empty:
        raw_codes = pd.Categorical(
            self.combine_cat, categories=self.unique).codes
        codes = (raw_codes + CAT_SHIFT + self.shift).astype('float')
        codes[codes == (CAT_SHIFT - 1)] = np.nan
        flagged_rows[feat] = downcast(codes, accuracy_loss=False)

    # NOTE(review): sorting by index only restores the original row order
    # if the index of `frame` is already sorted and unique -- confirm.
    merged = pd.concat([regular_rows, flagged_rows], sort=False).sort_index()
    frame[feat] = merged[feat].values
def get_feat_name(self):
    """Return the generated name of the combined-ID categorical feature.

    The raw name is the entries of ``self.cats`` joined with ``_`` plus the
    suffix ``_combineID``; it is passed through ``gen_feat_name`` together
    with this class's name and the type tag ``'cat'``.
    """
    raw_name = '_'.join(self.cats) + '_combineID'
    return gen_feat_name(self.__class__.__name__, raw_name, 'cat')
def test_transform(self, X):
    """Write one feature column per lag from ``self.label_list``.

    For each lag ``1..self.max_win`` the entry ``self.label_list[lag - 1]``
    is assigned as-is to the corresponding generated column of the input,
    in place.

    Args:
        X: A DataFrame, or an object exposing the DataFrame as ``X.data``.
    """
    frame = X if isinstance(X, pd.DataFrame) else X.data
    for lag in range(1, self.max_win + 1):
        raw_name = f'{self.primary_time}_{self.label}_{lag}'
        feat = gen_feat_name(self.__class__.__name__, raw_name, 'num')
        frame[feat] = self.label_list[lag - 1]