def c_cats_combine(df, col2max):
    """Fold several categorical code columns into one combined code series.

    Treats the columns as digits of a mixed-radix number: for each column
    after the first, the accumulator is scaled by that column's max code
    (``col2max[col]``) and the column's codes are added in.

    Parameters
    ----------
    df : DataFrame whose columns are integer-like category codes.
    col2max : mapping of column name -> maximum code value (the radix).

    Returns
    -------
    A single float64-based series of combined codes, downcast to the
    narrowest safe dtype.
    """
    columns = df.columns
    # float64 accumulator avoids integer overflow while multiplying radices.
    ss = df[columns[0]].astype('float64')
    for col in columns[1:]:
        mx = col2max[col]
        ss *= mx
        ss += df[col]
    # BUG FIX: the original called downcast() but discarded its return value;
    # every other call site in this file rebinds the result.
    ss = downcast(ss, accuracy_loss=False)
    return ss
def explore_params(self, X, y, categories):
    """Build features from X, hold out the last 50k rows, and tune the model.

    Fits the categorical and numerical feature transformers, concatenates
    their outputs, then runs column selection and alpha selection on a
    chronological (shuffle=False) 80/20 split of the tail window.

    Parameters
    ----------
    X : input DataFrame.
    y : target series aligned with X.
    categories : iterable of categorical column names in X.

    Returns
    -------
    The final RMSE reported by ``select_alpha``.

    Raises
    ------
    ValueError if neither transformer produces any features.
    """
    self.cat_cols = tuple(categories)
    self.num_cols = [col for col in X.columns if col not in categories]
    log(f'train num col: {self.num_cols}')
    log(f'train cat col:{self.cat_cols}')
    cat_feats = self.cat_fit_transform(X, mode='fit_trans')
    num_feats = self.num_fit_transform(X, mode='fit_trans')
    if len(cat_feats) > 0 and len(num_feats) > 0:
        feats = np.concatenate([cat_feats, num_feats], axis=1)
    elif len(cat_feats) > 0:
        feats = cat_feats
    elif len(num_feats) > 0:
        feats = num_feats
    else:
        # BUG FIX: previously `feats` was left unbound here, so the next
        # statement raised an opaque NameError. Fail with a clear message.
        raise ValueError('no categorical or numerical features were produced')
    log(f'before downcast {feats.dtype}')
    feats = downcast(feats)
    log(f'aft downcast {feats.dtype}')
    # Cap the tuning window at the most recent 50k rows for speed.
    feats = feats[-50000:]
    y = y.iloc[-50000:]
    log(f'train features shape : {feats.shape}')
    # shuffle=False keeps the split chronological (time-ordered data).
    X_train, X_eval, y_train, y_eval = train_test_split(
        feats, y, test_size=0.2, shuffle=False, random_state=0)
    self.select_cols(X_train, X_eval, y_train, y_eval)
    final_rmse = self.select_alpha(X_train, X_eval, y_train, y_eval)
    return final_rmse
def groupby_mean(df):
    """Target-style mean encoding within one frame.

    The first column of ``df`` is treated as a category key and the second
    as a numeric value; each row's key is replaced by the mean of the value
    column over all rows sharing that key.

    Returns the encoded series, downcast to the narrowest safe dtype.
    """
    key_col, val_col = df.columns[0], df.columns[1]
    per_key_mean = df.groupby(key_col, sort=False)[val_col].mean()
    encoded = df[key_col].map(per_key_mean)
    return downcast(encoded)
def n_diff(df, time_col, cat_col, num_col):
    """Compute lagged difference features of ``num_col`` in time order.

    Produces three series (previous-step delta, delta-of-delta, and
    previous-step relative delta), all aligned back to the caller's
    original row order via the saved index.

    NOTE(review): this sorts ``df`` in place and resets its index in place,
    so the caller's DataFrame is left sorted with a fresh RangeIndex —
    presumably intentional to avoid a copy on large frames; confirm callers
    tolerate this side effect.
    """
    # Remember the caller's index so the result can be realigned at the end.
    index = df.index
    df.reset_index(drop=True, inplace=True)
    # Sort so that consecutive rows are consecutive time steps (per category
    # when a category column is given).
    if cat_col is not None:
        df.sort_values([cat_col, time_col], inplace=True)
    else:
        df.sort_values([time_col], inplace=True)
    num_ss = df[num_col]
    delta = num_ss.diff()
    if cat_col is not None:
        # Non-zero diff of the category codes marks a boundary between
        # categories; diff(2) marks rows within 2 steps of a boundary.
        cat_ss = df[cat_col].diff()
        cat_ss2 = df[cat_col].diff(2)
    # Relative change vs the previous value; 1e-3 guards division by zero.
    # Computed BEFORE delta is shifted — order matters here.
    delta_ratio = delta / (num_ss.shift(1) + 1e-3)
    # Shift by one so each row only sees information from strictly earlier
    # rows (no leakage of the current value into its own feature).
    delta = delta.shift(1)
    delta_ratio = delta_ratio.shift(1)
    delta_delta = delta.diff()
    delta = downcast(delta)
    delta_delta = downcast(delta_delta)
    delta_ratio = downcast(delta_ratio)
    if cat_col is not None:
        # Invalidate values that were differenced across a category boundary.
        delta[cat_ss != 0] = np.nan
        delta_ratio[cat_ss != 0] = np.nan
        delta_delta[cat_ss2 != 0] = np.nan
    new_df = pd.concat([delta, delta_delta, delta_ratio], axis=1)
    # Undo the sort, then restore the caller's original index labels.
    new_df.sort_index(inplace=True)
    new_df.index = index
    return new_df
def transform(self, ss):
    """Encode ``ss`` against the fitted category list ``self.cats``.

    Known categories map to their positional code plus CONSTANT.CAT_SHIFT;
    values not seen at fit time become NaN. The result is downcast without
    accuracy loss.
    """
    shifted = pd.Categorical(ss, categories=self.cats).codes + CONSTANT.CAT_SHIFT
    shifted = shifted.astype('float')
    # pd.Categorical codes unseen values as -1, which after the shift
    # equals CAT_SHIFT - 1; turn those into NaN.
    shifted[shifted == (CONSTANT.CAT_SHIFT - 1)] = np.nan
    return downcast(shifted, accuracy_loss=False)
def fit(self, table):
    """Fit a mean-encoding of the label keyed by an hour×key cross feature.

    Builds a 'key_cross' column from the hard-coded hour feature and the
    table's key column, then stores {cross value -> mean label} dicts in
    ``self.res`` under generated feature names.

    NOTE(review): this writes the 'key_cross' column directly into
    ``table.train_X``, permanently mutating the shared frame — confirm
    downstream consumers expect that column.
    """
    #self.cols1 = table.cat_cols
    # Hard-coded to a single hour feature rather than all categorical cols.
    self.cols1 = ['c_TimeDate:A1:hour']
    key_col = table.key_col
    cols2 = [table.label]
    # NOTE(review): both lists are non-empty by construction above, so this
    # guard is currently dead code (kept from the generic version).
    if len(self.cols1) == 0 or len(cols2) == 0:
        return
    df = table.train_X
    # Cross feature: hour-of-day multiplied by the key column's codes.
    df['key_cross'] = df['c_TimeDate:A1:hour'] * df[key_col]
    self.cols1 = ['key_cross']
    for col1 in self.cols1:
        for col2 in cols2:
            obj = f'({col1})({col2})'
            param = None
            new_col = FeatNamer.gen_feat_name(self.__class__.__name__, obj, param, CONSTANT.NUMERICAL_TYPE)
            # Mean of the label per cross value, stored as a plain dict for
            # cheap lookup at transform time.
            mean_ss = df.groupby([col1], sort=False)[col2].mean()
            mean_ss = downcast(mean_ss)
            self.res[new_col] = mean_ss.to_dict()
def fit(self, X, y, categories):
    """Fit the Ridge model on engineered categorical + numerical features.

    Samples 80% of the training portion (first ``self.train_shape`` rows),
    keeps the test portion intact, builds features, and fits Ridge with
    fallback strategies (tail subsampling, then the 'lsqr' solver) when the
    full fit fails, e.g. on memory errors.

    Parameters
    ----------
    X : input DataFrame (train rows first, then test rows).
    y : target aligned positionally with X.
    categories : iterable of categorical column names.

    Returns
    -------
    self
    """
    if self.good_cols is not None:
        X = X[self.good_cols]
        self.cat_cols = tuple([col for col in categories if col in self.good_cols])
        self.num_cols = tuple([col for col in X.columns if col not in categories])
    else:
        self.cat_cols = tuple(categories)
        self.num_cols = tuple([col for col in X.columns if col not in categories])
    if self.train_shape is None:
        self.train_shape = X.shape[0]
    # 80% sample of the train part; the test part is kept whole.
    X_sample = X.iloc[:self.train_shape].sample(frac=0.8, random_state=2019)
    y_sample = y.loc[X_sample.index]
    X_test = X.iloc[self.train_shape:]
    # BUG FIX: was ``y.loc[self.train_shape:]`` — label-based slicing, which
    # selects wrong rows (or raises) whenever y's index is not the default
    # RangeIndex. Use positional slicing to mirror X_test above.
    y_test = y.iloc[self.train_shape:]
    X = pd.concat([X_sample, X_test], axis=0)
    y = pd.concat([y_sample, y_test], axis=0)
    del X_sample, y_sample, X_test, y_test
    gc.collect()
    cat_feats = self.cat_fit_transform(X, mode='fit_trans')
    num_feats = self.num_fit_transform(X, mode='fit_trans')
    if len(cat_feats) > 0 and len(num_feats) > 0:
        feats = np.concatenate([cat_feats, num_feats], axis=1)
    elif len(cat_feats) > 0:
        feats = cat_feats
    elif len(num_feats) > 0:
        feats = num_feats
    else:
        # BUG FIX: `feats` was previously unbound here -> NameError.
        raise ValueError('no categorical or numerical features were produced')
    feats = downcast(feats)
    self.model = Ridge(solver='svd', max_iter=300, alpha=self.best_alpha)
    try:
        if not self.do_sample:
            self.model.fit(feats, y)
        else:
            # Cap training at the most recent half of rows, max 500k.
            m = min(int(feats.shape[0] / 2), 500000)
            if self.do_lsqr:
                self.model = Ridge(solver='lsqr', max_iter=500, alpha=self.best_alpha)
            else:
                self.model = Ridge(solver='svd', max_iter=300, alpha=self.best_alpha)
            self.model.fit(feats[-m:], y.iloc[-m:])
    # BUG FIX: bare ``except:`` also swallowed KeyboardInterrupt/SystemExit;
    # catch Exception so deliberate interrupts still propagate.
    except Exception:
        try:
            # First fallback: retry on the most recent subsample.
            m = min(int(feats.shape[0] / 2), 500000)
            self.model.fit(feats[-m:], y.iloc[-m:])
            self.do_sample = True
        except Exception:
            # Second fallback: cheaper 'lsqr' solver on the subsample.
            m = min(int(feats.shape[0] / 2), 500000)
            self.model = Ridge(solver='lsqr', max_iter=500, alpha=self.best_alpha)
            self.model.fit(feats[-m:], y.iloc[-m:])
            self.do_sample = True
            self.do_lsqr = True
    return self
def transform(self, ss):
    """Return ``ss`` converted to a narrower dtype via ``downcast``."""
    converted = downcast(ss)
    return converted
def c_values_cnt(ss):
    """Frequency-encode ``ss``: each value becomes its occurrence count."""
    frequencies = ss.value_counts()
    encoded = ss.map(frequencies)
    return downcast(encoded)
def n_plus_n(ss_1, ss_2):
    """Element-wise sum of two numeric series, downcast afterwards."""
    total = ss_1 + ss_2
    return downcast(total)
def n_minus_n(ss_1, ss_2):
    """Element-wise difference of two numeric series, downcast afterwards."""
    difference = ss_1 - ss_2
    return downcast(difference)
def n_multiply_n(ss_1, ss_2):
    """Element-wise product of two numeric series, downcast afterwards."""
    product = ss_1 * ss_2
    return downcast(product)
def n_div_n(ss_1, ss_2):
    """Element-wise quotient of two numeric series, downcast afterwards."""
    quotient = ss_1 / ss_2
    return downcast(quotient)
def fit(self, X, y, categories):
    """Fit the Lasso model on engineered categorical + numerical features.

    Mirrors the Ridge variant: samples 80% of the training portion (first
    ``self.train_shape`` rows), keeps the test portion intact, builds
    features, and fits Lasso with tail-subsampling fallbacks when the full
    fit fails, e.g. on memory errors.

    Parameters
    ----------
    X : input DataFrame (train rows first, then test rows).
    y : target aligned positionally with X.
    categories : iterable of categorical column names.

    Returns
    -------
    self
    """
    log(f'debug{self.good_cols}')
    if self.good_cols is not None:
        X = X[self.good_cols]
        self.cat_cols = tuple([col for col in categories if col in self.good_cols])
        self.num_cols = tuple([col for col in X.columns if col not in categories])
    else:
        self.cat_cols = tuple(categories)
        self.num_cols = tuple([col for col in X.columns if col not in categories])
    if self.train_shape is None:
        self.train_shape = X.shape[0]
    # 80% sample of the train part; the test part is kept whole.
    X_sample = X.iloc[:self.train_shape].sample(frac=0.8, random_state=2020)
    y_sample = y.loc[X_sample.index]
    X_test = X.iloc[self.train_shape:]
    # BUG FIX: was ``y.loc[self.train_shape:]`` — label-based slicing, which
    # selects wrong rows (or raises) whenever y's index is not the default
    # RangeIndex. Use positional slicing to mirror X_test above.
    y_test = y.iloc[self.train_shape:]
    X = pd.concat([X_sample, X_test], axis=0)
    y = pd.concat([y_sample, y_test], axis=0)
    del X_sample, y_sample, X_test, y_test
    gc.collect()
    cat_feats = self.cat_fit_transform(X, mode='fit_trans')
    num_feats = self.num_fit_transform(X, mode='fit_trans')
    if len(cat_feats) > 0 and len(num_feats) > 0:
        feats = np.concatenate([cat_feats, num_feats], axis=1)
    elif len(cat_feats) > 0:
        feats = cat_feats
    elif len(num_feats) > 0:
        feats = num_feats
    else:
        # BUG FIX: `feats` was previously unbound here -> NameError.
        raise ValueError('no categorical or numerical features were produced')
    feats = downcast(feats)
    self.model = Lasso(alpha=self.best_alpha, max_iter=500)
    try:
        if not self.do_sample:
            self.model.fit(feats, y)
        else:
            # Reuse the subsample size chosen by a previous fallback.
            self.model.fit(feats[-self.size:], y.iloc[-self.size:])
    # BUG FIX: bare ``except:`` also swallowed KeyboardInterrupt/SystemExit;
    # catch Exception so deliberate interrupts still propagate.
    except Exception:
        try:
            # First fallback: retry on the most recent half of the rows.
            m = int(feats.shape[0] / 2)
            self.model.fit(feats[-m:], y.iloc[-m:])
            self.do_sample = True
            self.size = m
        except Exception:
            # Second fallback: an even smaller tail, capped at 500k rows.
            m = min(int(feats.shape[0] / 5), 500000)
            self.model.fit(feats[-m:], y.iloc[-m:])
            self.size = m
            self.do_sample = True
    return self
def time_atr(ss: pd.Series, atr):
    """Extract the datetime attribute ``atr`` (e.g. 'hour') from ``ss``.

    ``ss`` must be datetime-like (accessed via the ``.dt`` accessor); the
    extracted values are downcast without accuracy loss.
    """
    extracted = getattr(ss.dt, atr)
    return downcast(extracted, accuracy_loss=False)