def transform_data(data, transform='log'):
    """Apply a variance-stabilising transform to every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. The power-transform branches read ``data.columns`` and
        ``data.index``.
    transform : str
        ``'log'`` (natural log via ``np.log``), ``'box-cox'`` or
        ``'yeo-johnson'`` (via ``power_transform``). Any other value leaves
        the data untouched.

    Returns
    -------
    The transformed data. For an unrecognised ``transform`` the very same
    object that was passed in is returned.
    """
    if transform == 'log':
        return np.log(data)

    if transform in ('box-cox', 'yeo-johnson'):
        # power_transform returns a bare ndarray; restore both the column
        # labels and the row index so the result still aligns with `data`.
        return pd.DataFrame(
            power_transform(data, method=transform),
            index=data.index,
            columns=data.columns,
        )

    return data
def transform(self, X, **kwargs):
    """Return a copy of ``X`` with each column in ``self.cols`` power-transformed.

    Every listed column is passed to ``power_transform`` on its own (as a
    one-column frame) with that function's default arguments. ``X`` itself
    is not modified; ``kwargs`` is accepted but not used.
    """
    transformed = X.copy()
    for name in self.cols:
        single_column = transformed[[name]]
        transformed[name] = power_transform(single_column)
    return transformed
def log_power_transform(df: pd.DataFrame,
                        method: str = 'box-cox',
                        standardize: bool = False) -> Tuple[pd.DataFrame, str]:
    """
    Log- or power-transform a dataframe and build a matching title.

    Parameters
    ----------
    df : The pd.DataFrame to transform (one or many columns).
    method : 'log' applies ``np.log`` column-wise; any other value is
        forwarded to ``power_transform`` as its ``method`` argument
        (e.g. 'box-cox', 'yeo-johnson').
    standardize : Forwarded to ``power_transform``. It is not applied for
        ``method='log'``, although it still affects the title.

    Returns
    -------
    df_trans : The transformed dataframe, with the index and columns of ``df``.
    title : ``method.title()``, a single space, then 'Standardized' when
        ``standardize`` is true (otherwise nothing follows the space).
    """
    if method == 'log':
        frame = df.transform(np.log)
    else:
        values = power_transform(df, method=method, standardize=standardize)
        frame = pd.DataFrame(values, index=df.index, columns=df.columns)

    suffix = 'Standardized' if standardize else ''
    title = f"{method.title()} {suffix}"
    return frame, title
def num_yeo(X, cols, predix='yeo_', **params):
    """Append Yeo-Johnson transformed copies of ``cols`` to a copy of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Source frame; it is not modified.
    cols : list of str
        Names of the columns to transform.
    predix : str
        Prefix for the new column names. The misspelt name is kept so that
        existing keyword callers keep working; ``prefix=...`` is accepted as
        an alias and takes precedence when given.
    **params
        Extra keyword arguments forwarded to ``power_transform``.
        ``method`` is always forced to ``'yeo-johnson'``.

    Returns
    -------
    (frame, new_cols) : the copy of ``X`` with the added columns, and the
    list of added column names.
    """
    # Yeo-Johnson Transformer
    prefix = params.pop('prefix', predix)
    params['method'] = 'yeo-johnson'

    _X = X.copy()
    _cols = [prefix + col for col in cols]
    if not _cols:
        # Nothing to transform; power_transform needs at least one feature.
        return _X, _cols

    _x = power_transform(X[cols], **params)
    _X[_cols] = pd.DataFrame(_x, columns=_cols, index=_X.index)
    return _X, _cols
def num_boxcox(X, cols, predix='bc_', **params):
    """Append Box-Cox transformed copies of ``cols`` to a copy of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Source frame; it is not modified.
    cols : list of str
        Names of the columns to transform.
    predix : str
        Prefix for the new column names. The misspelt name is kept so that
        existing keyword callers keep working; ``prefix=...`` is accepted as
        an alias and takes precedence when given.
    **params
        Extra keyword arguments forwarded to ``power_transform``.
        ``method`` is always forced to ``'box-cox'``.

    Returns
    -------
    (frame, new_cols) : the copy of ``X`` with the added columns, and the
    list of added column names.
    """
    # Box-Cox Transformer
    prefix = params.pop('prefix', predix)
    params['method'] = 'box-cox'

    _X = X.copy()
    _cols = [prefix + col for col in cols]
    if not _cols:
        # Nothing to transform; power_transform needs at least one feature.
        return _X, _cols

    # Forward params: without it power_transform falls back to its own
    # default method and Box-Cox is never applied.
    _x = power_transform(X[cols], **params)
    _X[_cols] = pd.DataFrame(_x, columns=_cols, index=_X.index)
    return _X, _cols
def ts_scaler(self, object_id): test = self.ts_df.drop('mean_detected', axis=1).loc[object_id] # object_id_set = set(object_id_list) # scaled_df_v3 = pd.DataFrame() # for object_id in object_id_set: # test = scaler_df.loc[object_id] passband_set = set(test.index.get_level_values('passband')) scaled_df_v2 = pd.DataFrame() for passband in passband_set: tester = test.xs(passband, level='passband') scaled_df = pd.DataFrame() for i, column in enumerate(tester): # print(i, column) pd_series = tester[column] series_index = pd_series.index.tolist() if bool(re.search("range", column)): # maxabs_scaler = preprocessing.MaxAbsScaler() x = pd_series.values.reshape(-1, 1) # returns a numpy array x_scaled = preprocessing.maxabs_scale(x) x_scaled = pd.Series(x_scaled[:, 0], index=series_index) else: # power_scaler = preprocessing.PowerTransformer() x = pd_series.values.reshape(-1, 1) # returns a numpy array try: x_scaled = preprocessing.power_transform( x, method='yeo-johnson') x_scaled = pd.Series(x_scaled[:, 0], index=series_index) except RuntimeWarning: print(object_id, passband, column) scaled_df = pd.concat((scaled_df, x_scaled), axis=1) scaled_df.index.names = ['input'] scaled_df.columns = [x for x in tester.columns] scaled_df['passband'] = passband scaled_df.set_index('passband', append=True, inplace=True) scaled_df_v2 = pd.concat((scaled_df_v2, scaled_df), axis=0) scaled_df_v2 = scaled_df_v2.reorder_levels(['input', 'passband']) scaled_df_v2['object_id'] = object_id scaled_df_v2.set_index('object_id', append=True, inplace=True) scaled_df_v2 = scaled_df_v2.reorder_levels( ['object_id', 'input', 'passband']) # scaled_df_v3 = pd.concat((scaled_df_v3, scaled_df_v2), axis=0) final_scaled_df = pd.concat( (scaled_df_v2, self.ts_df[['mean_detected']]), axis=1, sort=False) return final_scaled_df
def train(self):
    """Fit an auto-ARIMA model on the pmdarima MSFT dataset and print test errors.

    'High' is the forecast target and 'Open', 'Low', 'Close' are the
    exogenous regressors. Both are Box-Cox transformed, split 80/20, and
    the forecasts over the test span are scored with SMAPE and MAE, which
    are printed. Nothing is returned.
    """
    # Microsoft stock data, minus the columns that are not modelled.
    stock = pm.datasets.load_msft().drop(columns=['Date', 'Volume', 'OpenInt'])
    print(stock.shape)

    # Series to forecast, as a single-column 2-D array.
    target = np.array(stock['High']).reshape(-1, 1)
    # Exogenous variables.
    regressors = np.array(stock[['Open', 'Low', 'Close']])

    # Box-Cox transform on both target and regressors.
    target = power_transform(target, method='box-cox')
    regressors = power_transform(regressors, method='box-cox')

    y_train, y_test = pm.model_selection.train_test_split(
        target, test_size=0.2)
    exog_train, exog_test = pm.model_selection.train_test_split(
        regressors, test_size=0.2)

    search_options = dict(
        start_p=1,
        d=None,
        start_q=1,
        information_criterion='aic',
        maxiter=100,
        method='lbfgs',
        test='kpss',
        stepwise=True,
    )
    model = pm.auto_arima(y_train, exog_train, **search_options)

    horizon = y_test.shape[0]
    forecasts = model.predict(horizon, exog_test)

    print("Symmetric Mean Absolute Percentage Error: ",
          smape(y_test, forecasts))
    print("Mean Absolute Error: ", mean_absolute_error(y_test, forecasts))
def transform(self, df):
    """Return a copy of ``df`` with ``self.to_transform`` columns transformed.

    ``self.how`` selects the transform:

    * ``'log'``      -- ``np.log(1 + column)``
    * ``'yj'``       -- Yeo-Johnson power transform
    * ``'boxcox'``   -- Box-Cox power transform
    * ``'boxcox1p'`` -- Box-Cox power transform of ``1 + column``

    The power transforms receive ``standardize=self.standardize``. Any
    other ``self.how`` leaves the columns unchanged.
    """
    # how -> (power_transform method, constant added before transforming)
    power_modes = {
        'yj': ('yeo-johnson', 0),
        'boxcox': ('box-cox', 0),
        'boxcox1p': ('box-cox', 1),
    }

    result = df.copy()
    for name in self.to_transform:
        mode = self.how
        if mode == 'log':
            result[name] = np.log(1 + result[name])
            continue
        if mode not in power_modes:
            continue

        method, shift = power_modes[mode]
        values = result[name].values.reshape(-1, 1)
        if shift:
            values = shift + values
        result[name] = skl_preproc.power_transform(
            values, method=method, standardize=self.standardize)
    return result
def newBoxCoxTranformation(df, target):
    """Log-transform the target and Box-Cox transform the remaining columns.

    The feature columns are first min-max scaled into [1, 2] (Box-Cox needs
    strictly positive input) and then power-transformed. Assumes every
    column is numeric.

    Returns ``(X, y)``: the transformed feature array and the natural-log
    of ``df[target]``.
    """
    print("Shape of the dataset before transformation : ", df.shape)

    log_target = df[target].apply(lambda value: math.log(value))
    features = df.drop(target, axis=1)

    positive_scaler = preprocessing.MinMaxScaler(feature_range=(1, 2))
    shifted = positive_scaler.fit_transform(features)
    transformed = preprocessing.power_transform(shifted, method='box-cox')

    print("Shape of the dataset after transformation : ",
          transformed.shape, log_target.shape)
    return transformed, log_target
def normalize(dataframe, type):
    """Normalise the numeric, NaN-free part of ``dataframe``.

    Non-numeric columns are dropped, then every row containing a NaN is
    dropped, and the remaining data is rescaled according to ``type``:

    * ``'zscore'``    -- column-wise ``zscore`` (keeps index and columns)
    * ``'minmax'``    -- ``minmax_scale``
    * ``'l1_norm'``   -- row-wise L1 normalisation
    * ``'l2_norm'``   -- row-wise L2 normalisation
    * ``'power_yeo'`` -- Yeo-Johnson ``power_transform``
    * ``'power_box'`` -- Box-Cox ``power_transform``
    * ``'quantile'``  -- ``quantile_transform``

    Every option except ``'zscore'`` returns a frame with default integer
    row and column labels.

    The result is also stored in the module-level ``normalized_data``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the options above.
    """
    global normalized_data

    numeric = dataframe.select_dtypes(['number']).dropna(how='any')

    if type == 'zscore':
        result = numeric.apply(zscore)
    elif type == 'minmax':
        result = pd.DataFrame(minmax_scale(numeric))
    elif type in ('l1_norm', 'l2_norm'):
        # This function shadows sklearn's `normalize`, so calling that name
        # here would recurse into ourselves; import it under an alias.
        from sklearn.preprocessing import normalize as _sk_normalize
        norm = 'l1' if type == 'l1_norm' else 'l2'
        result = pd.DataFrame(_sk_normalize(numeric, norm=norm))
    elif type == 'power_yeo':
        result = pd.DataFrame(power_transform(numeric, method='yeo-johnson'))
    elif type == 'power_box':
        result = pd.DataFrame(power_transform(numeric, method='box-cox'))
    elif type == 'quantile':
        result = pd.DataFrame(quantile_transform(numeric))
    else:
        # Fail loudly rather than handing back whatever a previous call
        # left in the global.
        raise ValueError('Unknown normalization type: %r' % (type,))

    normalized_data = result
    return normalized_data
def pow_transformer(df, column_names):
    '''Replace the given columns of a dataframe with power-transformed versions.

    Each column in ``column_names`` is transformed on its own with
    ``power_transform`` (default arguments), removed from the result and
    re-added at the end under the name ``<column>_0``. ``df`` itself is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
    column_names : iterable of str
        Continuous columns to transform. May be empty, in which case a
        plain copy of ``df`` is returned.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the transformed ones.
    '''
    copy_df = df.copy()
    dataframes = []
    for column in column_names:
        transformed = power_transform(np.array(df[column]).reshape(-1, 1))
        # power_transform returns an ndarray; wrap it so it can carry column
        # names and stays aligned with df's index in the concat below.
        new_df = pd.DataFrame(transformed, index=df.index)
        new_df.columns = [column + '_' + str(name) for name in new_df]
        dataframes.append(new_df)
        copy_df.drop(column, axis=1, inplace=True)

    if not dataframes:
        # pd.concat refuses an empty list.
        return copy_df

    new_df = pd.concat(dataframes, axis=1)
    return pd.concat([copy_df, new_df], axis=1)
def main():
    """Compute per-channel results for the 'Stair' EMG recordings and save them.

    Every ``*.json`` file in the data folder whose name contains 'Stair' is
    loaded (in sorted order) and its "EMG" array is appended along the
    sample axis. Each channel is z-scored, then ``parallel_proc`` is mapped
    over the channels in a process pool and the results are written to
    ``lambdas.csv`` in the same folder.
    """
    data_path = r'C:\Users\win10\Desktop\Projects\CYB\Experiment_Balint\CYB004\Data'
    n_channels = 8

    stair_files = sorted(
        name for name in os.listdir(data_path)
        if name.endswith('.json') and 'Stair' in name
    )

    # Start from an empty (channels x 0) array so the stack is well-defined
    # even when no file matches.
    recordings = [np.empty((n_channels, 0))]
    for name in stair_files:
        with open(data_path + '\\' + name) as handle:
            payload = json.load(handle)
        recordings.append(np.array(payload["EMG"]))
    X = np.hstack(recordings)

    # Standardise each channel (row) independently.
    channel_std = np.std(X, axis=1, keepdims=True)
    channel_mean = np.mean(X, axis=1, keepdims=True)
    X = (X - channel_mean) / channel_std

    # Result is unused; the call is kept because it still runs a Box-Cox fit
    # on the first channel and would raise here if that fit is not possible.
    first_channel = np.abs(X[0]).reshape(-1, 1)
    _ = power_transform(first_channel, method='box-cox')

    worker_count = multiprocessing.cpu_count()
    with multiprocessing.Pool(worker_count) as pool:
        lambdas = pool.map(parallel_proc, X)

    with open(data_path + r'\lambdas.csv', "w", newline='') as out_file:
        csv.writer(out_file).writerows(lambdas)
# Exploratory, notebook-style script: bare expressions such as `X.head()`
# only display output in an interactive session.
models = [clf_bag, clf_log]
get_split_loader = get_split_loader_func(3, X)
#evaluate(models, [tr], X, y, get_split_loader)
from sklearn.preprocessing import power_transform
nor_dis = TurnToNormDist(cols_with_nan)
new_X = nor_dis.transform(X)
# Sanity check: the transform should not change the shape.
new_X.shape==X.shape
power_transform(X[["sector"]])
X.head()
import seaborn as sns
sns.distplot( new_X["return_1w"])
def select_feat_by_corr(X, y, threshold=0.09):
    """List the features whose correlation with the target is weak.

    Returns ``(feat_to_drop, _corr)``: the labels whose correlation with
    "Target" lies strictly between ``-threshold`` and ``threshold``, and the
    one-column correlation frame sorted by that correlation.

    Side effect: renames the columns of ``y`` to ``["Target"]`` in place.
    """
    y.columns = ["Target"]
    data = pd.concat([X,y], axis=1)
    _corr = data.corr()[["Target"]].sort_values("Target")
    feat_to_drop=list(_corr[(_corr["Target"]< threshold)& (_corr["Target"]>-threshold)].index)
    #X.drop(feat_to_drop,axis=1,inplace=True)
    return feat_to_drop, _corr
corr_cols, a = select_feat_by_corr(X, y, 0.2)
a
pd.concat([X,y], axis=1)
def preprocess(features: np.ndarray,
               target: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Power-transform the features and binarize the target labels.

    Parameters
    ----------
    features : array passed to ``power_transform`` with ``standardize=True``
        (the transform method is left at that function's default).
    target : label array; its unique values (sorted by ``np.unique``) become
        the classes for ``label_binarize``.

    Returns
    -------
    (features, target) : the transformed features and the binarized labels.
    """
    features = power_transform(features, standardize=True)
    # `classes` is keyword-only in current scikit-learn releases; passing it
    # by name works on old and new versions alike.
    target = label_binarize(target, classes=np.unique(target))
    return features, target
# Project root: one level above the directory containing this file.
p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
# power_transform wants a 2-D array; flatten back to a Series afterwards.
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Load categorical features to drop')
# Use a context manager so the file handle is closed deterministically.
with open(file=p.joinpath('src', 'meta', 'NoVariance.json'),
          mode='r') as _no_var_file:
    noVarFeatures = json.load(_no_var_file)

LOGGER.info('Process categorical features')
# Columns whose name contains 'cat', minus the listed no-variance ones.
# Computed once and reused for both the pipeline input and the labels.
_cat_features = X.filter(like='cat').drop(labels=noVarFeatures, axis=1)
catf = pd.DataFrame(data=make_pipeline(
    e.CategoricalGrouper(),
    e.CategoricalEncoder()).fit_transform(_cat_features, y),
                    columns=_cat_features.columns,
                    index=X.index)
def fit(self, desc):
    """Scale ``desc`` and store it on the instance.

    ``desc`` is min-max scaled along axis 0; when ``self.bc`` is truthy the
    scaled values are additionally Yeo-Johnson power-transformed. The result
    is stored as ``self.desc``.

    Returns ``self`` so calls can be chained.
    """
    scaled = minmax_scale(desc, axis=0)
    if self.bc:
        scaled = power_transform(scaled, method='yeo-johnson')
    self.desc = scaled
    return self
def gp_fit_test(x_train: Tensor,
                y_train: Tensor,
                error_train: Tensor,
                x_test: Tensor,
                y_test: Tensor,
                error_test: Tensor,
                gp_obj_model: SingleTaskGP,
                gp_error_model: SingleTaskGP,
                tkwargs: Dict[str, Any],
                gp_test_folder: str,
                obj_out_wp: bool = False,
                err_out_wp: bool = False) -> None:
    """
    Evaluate the fit of the objective GP (and, if given, the reconstruction-error GP) and dump diagnostics.

    1) Computes the per-point squared error (divided by the number of points) between the posterior mean of
       `gp_obj_model` and the objective values, on the train and test sets.
    2) If `gp_error_model` is not None, does the same for the posterior mean of `gp_error_model` against the
       reconstruction errors.

    The error tensors, the inputs/targets and a series of histogram / scatter / error-bar plots are written to
    `gp_test_folder` (created if missing); the summed errors are printed.

    :param x_train: normalised points at which the gps were trained
    :param y_train: objective value function corresponding to x_train that were used as targets of `gp_obj_model`
    :param error_train: reconstruction error value at points x_train that were used as targets of `gp_error_model`
    :param x_test: normalised test points
    :param y_test: objective value function corresponding to x_test
    :param error_test: reconstruction error at test points
    :param gp_obj_model: the gp model trained to predict the black box objective function values
    :param gp_error_model: the gp model trained to predict reconstruction error; pass None to skip every
                           reconstruction-error diagnostic
    :param tkwargs: dict of type and device (its 'device' entry is used to move the models)
    :param gp_test_folder: folder to save test results
    :param obj_out_wp: if the `gp_obj_model` was trained with output warping then need to apply the same transform
    :param err_out_wp: if the `gp_error_model` was trained with output warping then need to apply the same transform
    :return: None -- results are only printed and written to `gp_test_folder`
    """
    do_robust = True if gp_error_model is not None else False
    if not os.path.exists(gp_test_folder):
        os.mkdir(gp_test_folder)

    gp_obj_model.eval()
    gp_obj_model.to(tkwargs['device'])
    y_train = y_train.view(-1)
    if do_robust:
        gp_error_model.eval()
        gp_error_model.to(tkwargs['device'])
        error_train = error_train.view(-1)

    with torch.no_grad():
        if obj_out_wp:
            # Output warping: Yeo-Johnson when there are non-positive values,
            # otherwise Box-Cox, falling back to Yeo-Johnson when the
            # Box-Cox result has a standard deviation below 0.5.
            # NOTE(review): y_train was flattened with view(-1) above, so a
            # 1-D array reaches power_transform here, whereas the error
            # branch below reshapes to (-1, 1) first -- confirm this branch
            # works with the sklearn version in use.
            Y_numpy = y_train.cpu().numpy()
            if Y_numpy.min() <= 0:
                y_train = torch.FloatTensor(
                    power_transform(Y_numpy / Y_numpy.std(),
                                    method='yeo-johnson'))
            else:
                y_train = torch.FloatTensor(
                    power_transform(Y_numpy / Y_numpy.std(), method='box-cox'))
                if y_train.std() < 0.5:
                    Y_numpy = y_train.numpy()
                    y_train = torch.FloatTensor(
                        power_transform(Y_numpy / Y_numpy.std(),
                                        method='yeo-johnson')).to(x_train)

            # Same warping, fitted independently on the test targets.
            Y_numpy = y_test.cpu().numpy()
            if Y_numpy.min() <= 0:
                y_test = torch.FloatTensor(
                    power_transform(Y_numpy / Y_numpy.std(),
                                    method='yeo-johnson'))
            else:
                y_test = torch.FloatTensor(
                    power_transform(Y_numpy / Y_numpy.std(), method='box-cox'))
                if y_test.std() < 0.5:
                    Y_numpy = y_test.numpy()
                    y_test = torch.FloatTensor(
                        power_transform(Y_numpy / Y_numpy.std(),
                                        method='yeo-johnson')).to(x_test)

        y_train = y_train.view(-1).to(**tkwargs)
        y_test = y_test.view(-1).to(**tkwargs)

        # Per-point squared errors already divided by n, so .sum() is the MSE.
        gp_obj_val_model_mse_train = (
            gp_obj_model.posterior(x_train).mean.view(-1) -
            y_train).pow(2).div(len(y_train))
        gp_obj_val_model_mse_test = (
            gp_obj_model.posterior(x_test).mean.view(-1) - y_test).pow(2).div(
                len(y_test))
        torch.save(
            gp_obj_val_model_mse_train,
            os.path.join(gp_test_folder, 'gp_obj_val_model_mse_train.npz'))
        torch.save(gp_obj_val_model_mse_test,
                   os.path.join(gp_test_folder, 'gp_obj_val_model_test.npz'))
        print(
            f'GP training fit on objective value: MSE={gp_obj_val_model_mse_train.sum().item():.5f}'
        )
        print(
            f'GP testing fit on objective value: MSE={gp_obj_val_model_mse_test.sum().item():.5f}'
        )

        if do_robust:
            if err_out_wp:
                # Same warping scheme as for the objective values, applied
                # to the reconstruction errors.
                error_train = error_train.view(-1, 1)
                R_numpy = error_train.cpu().numpy()
                if R_numpy.min() <= 0:
                    error_train = torch.FloatTensor(
                        power_transform(R_numpy / R_numpy.std(),
                                        method='yeo-johnson'))
                else:
                    error_train = torch.FloatTensor(
                        power_transform(R_numpy / R_numpy.std(),
                                        method='box-cox'))
                    if error_train.std() < 0.5:
                        R_numpy = error_train.numpy()
                        error_train = torch.FloatTensor(
                            power_transform(R_numpy / R_numpy.std(),
                                            method='yeo-johnson')).to(x_train)

                R_numpy = error_test.cpu().numpy()
                if R_numpy.min() <= 0:
                    error_test = torch.FloatTensor(
                        power_transform(R_numpy / R_numpy.std(),
                                        method='yeo-johnson'))
                else:
                    error_test = torch.FloatTensor(
                        power_transform(R_numpy / R_numpy.std(),
                                        method='box-cox'))
                    if error_test.std() < 0.5:
                        R_numpy = error_test.numpy()
                        error_test = torch.FloatTensor(
                            power_transform(R_numpy / R_numpy.std(),
                                            method='yeo-johnson')).to(x_test)

            error_train = error_train.view(-1).to(**tkwargs)
            error_test = error_test.view(-1).to(**tkwargs)

            pred_recon_train = gp_error_model.posterior(x_train).mean.view(-1)
            pred_recon_test = gp_error_model.posterior(x_test).mean.view(-1)

            gp_error_model_mse_train = (error_train - pred_recon_train).pow(2).div(
                len(error_train))
            gp_error_model_mse_test = (error_test - pred_recon_test).pow(2).div(
                len(error_test))
            torch.save(
                gp_error_model_mse_train,
                os.path.join(gp_test_folder, 'gp_error_model_mse_train.npz'))
            torch.save(
                gp_error_model_mse_test,
                os.path.join(gp_test_folder, 'gp_error_model_mse_test.npz'))
            print(
                f'GP training fit on reconstruction errors: MSE={gp_error_model_mse_train.sum().item():.5f}'
            )
            print(
                f'GP testing fit on reconstruction errors: MSE={gp_error_model_mse_test.sum().item():.5f}'
            )
            torch.save(error_test,
                       os.path.join(gp_test_folder, f"true_rec_err_z.pt"))
            torch.save(error_train,
                       os.path.join(gp_test_folder, f"error_train.pt"))

        # Raw inputs / targets. Note that x_test is written twice, as
        # test_x.pt and as X_test.pt.
        torch.save(x_train, os.path.join(gp_test_folder, f"train_x.pt"))
        torch.save(x_test, os.path.join(gp_test_folder, f"test_x.pt"))
        torch.save(y_train, os.path.join(gp_test_folder, f"y_train.pt"))
        torch.save(x_test, os.path.join(gp_test_folder, f"X_test.pt"))
        torch.save(y_test, os.path.join(gp_test_folder, f"y_test.pt"))

        # y plots
        plt.hist(y_train.cpu().numpy(),
                 bins=100,
                 label='y train',
                 alpha=0.5,
                 density=True)
        plt.hist(gp_obj_model.posterior(x_train).mean.view(
            -1).detach().cpu().numpy(),
                 bins=100,
                 label='y pred',
                 alpha=0.5,
                 density=True)
        plt.legend()
        plt.title('Training set')
        plt.savefig(os.path.join(gp_test_folder, 'gp_obj_train.pdf'))
        plt.close()

        plt.hist(gp_obj_val_model_mse_train.detach().cpu().numpy(),
                 bins=100,
                 alpha=0.5,
                 density=True)
        plt.title('MSE of gp_obj_val model on training set')
        plt.savefig(os.path.join(gp_test_folder, 'gp_obj_train_mse.pdf'))
        plt.close()

        plt.hist(y_test.cpu().numpy(),
                 bins=100,
                 label='y true',
                 alpha=0.5,
                 density=True)
        plt.hist(gp_obj_model.posterior(x_test).mean.detach().cpu().numpy(),
                 bins=100,
                 alpha=0.5,
                 label='y pred',
                 density=True)
        plt.legend()
        plt.title('Validation set')
        plt.savefig(os.path.join(gp_test_folder, 'gp_obj_test.pdf'))
        plt.close()

        plt.hist(gp_obj_val_model_mse_test.detach().cpu().numpy(),
                 bins=100,
                 alpha=0.5,
                 density=True)
        plt.title('MSE of gp_obj_val model on validation set')
        plt.savefig(os.path.join(gp_test_folder, 'gp_obj_test_mse.pdf'))
        plt.close()

        if do_robust:
            # error plots
            plt.hist(error_train.cpu().numpy(),
                     bins=100,
                     label='error train',
                     alpha=0.5,
                     density=True)
            plt.hist(
                gp_error_model.posterior(x_train).mean.detach().cpu().numpy(),
                bins=100,
                label='error pred',
                alpha=0.5,
                density=True)
            plt.legend()
            plt.title('Training set')
            plt.savefig(os.path.join(gp_test_folder, 'gp_error_train.pdf'))
            plt.close()

            plt.hist(gp_error_model_mse_train.detach().cpu().numpy(),
                     bins=100,
                     alpha=0.5,
                     density=True)
            plt.title('MSE of gp_error model on training set')
            plt.savefig(os.path.join(gp_test_folder, 'gp_error_train_mse.pdf'))
            plt.close()

            plt.hist(error_test.cpu().numpy(),
                     bins=100,
                     label='error true',
                     alpha=0.5,
                     density=True)
            plt.hist(
                gp_error_model.posterior(x_test).mean.detach().cpu().numpy(),
                bins=100,
                alpha=0.5,
                label='error pred',
                density=True)
            plt.legend()
            plt.title('Validation set')
            plt.savefig(os.path.join(gp_test_folder, 'gp_error_test.pdf'))
            plt.close()

            plt.hist(gp_error_model_mse_test.detach().cpu().numpy(),
                     bins=100,
                     alpha=0.5,
                     density=True)
            plt.title('MSE of gp_error model on validation set')
            plt.savefig(os.path.join(gp_test_folder, 'gp_error_test_mse.pdf'))
            plt.close()

            # y-error plots
            # True pairs are ordered by y; predicted pairs by predicted y.
            y_train_sorted, indices_train = torch.sort(y_train)
            error_train_sorted = error_train[indices_train]
            gp_y_train_pred_sorted, indices_train_pred = torch.sort(
                gp_obj_model.posterior(x_train).mean.view(-1))
            gp_r_train_pred_sorted = (gp_error_model.posterior(
                x_train).mean.view(-1))[indices_train_pred]
            plt.scatter(y_train_sorted.cpu().numpy(),
                        error_train_sorted.cpu().numpy(),
                        label='true',
                        marker='+')
            plt.scatter(gp_y_train_pred_sorted.detach().cpu().numpy(),
                        gp_r_train_pred_sorted.detach().cpu().numpy(),
                        label='pred',
                        marker='*')
            plt.xlabel('y train targets')
            plt.ylabel('recon. error train targets')
            plt.title('y_train vs. error_train')
            plt.legend()
            plt.savefig(
                os.path.join(gp_test_folder, 'scatter_obj_error_train.pdf'))
            plt.close()

            y_test_std_sorted, indices_test = torch.sort(y_test)
            error_test_sorted = error_test[indices_test]
            gp_y_test_pred_sorted, indices_test_pred = torch.sort(
                gp_obj_model.posterior(x_test).mean.view(-1))
            gp_r_test_pred_sorted = (gp_error_model.posterior(
                x_test).mean.view(-1))[indices_test_pred]
            plt.scatter(y_test_std_sorted.cpu().numpy(),
                        error_test_sorted.cpu().numpy(),
                        label='true',
                        marker='+')
            plt.scatter(gp_y_test_pred_sorted.detach().cpu().numpy(),
                        gp_r_test_pred_sorted.detach().cpu().numpy(),
                        label='pred',
                        marker='*')
            plt.xlabel('y test targets')
            plt.ylabel('recon. error test targets')
            plt.title('y_test vs. error_test')
            plt.legend()
            plt.savefig(
                os.path.join(gp_test_folder, 'scatter_obj_error_test.pdf'))
            plt.close()

            # error var plots
            # Points are ordered by true error; bars show +/- one posterior std.
            error_train_sorted, indices_train_pred = torch.sort(error_train)
            # error_train_sorted = error_train
            # indices_train_pred = np.arange(len(error_train))
            gp_r_train_pred_sorted = gp_error_model.posterior(
                x_train).mean[indices_train_pred].view(-1)
            gp_r_train_pred_std_sorted = gp_error_model.posterior(
                x_train).variance.view(-1).sqrt()[indices_train_pred]
            plt.scatter(np.arange(len(indices_train_pred)),
                        error_train_sorted.cpu().numpy(),
                        label='err true',
                        marker='+',
                        color='C1',
                        s=15)
            plt.errorbar(
                np.arange(len(indices_train_pred)),
                gp_r_train_pred_sorted.detach().cpu().numpy().flatten(),
                yerr=gp_r_train_pred_std_sorted.detach().cpu().numpy().flatten(
                ),
                fmt='*',
                alpha=0.05,
                label='err pred',
                color='C0',
                ecolor='C0')
            plt.scatter(np.arange(len(indices_train_pred)),
                        gp_r_train_pred_sorted.detach().cpu().numpy(),
                        marker='*',
                        alpha=0.2,
                        s=10,
                        color='C0')
            # plt.scatter(np.arange(len(indices_train_pred)),
            #             (gp_r_train_pred_sorted + gp_r_train_pred_std_sorted).detach().cpu().numpy(),
            #             label='err pred mean+std', marker='.')
            # plt.scatter(np.arange(len(indices_train_pred)),
            #             (gp_r_train_pred_sorted - gp_r_train_pred_std_sorted).detach().cpu().numpy(),
            #             label='err pred mean-std', marker='.')
            plt.legend()
            plt.title('error predictions and uncertainty on train set')
            plt.savefig(
                os.path.join(gp_test_folder, 'gp_error_train_uncertainty.pdf'))
            plt.close()

            error_test_sorted, indices_test_pred = torch.sort(error_test)
            # error_test_sorted = error_test
            # indices_test_pred = np.arange(len(error_test_sorted))
            gp_r_test_pred_sorted = gp_error_model.posterior(x_test).mean.view(
                -1)[indices_test_pred]
            gp_r_test_pred_std_sorted = gp_error_model.posterior(
                x_test).variance.view(-1).sqrt()[indices_test_pred]
            plt.scatter(np.arange(len(indices_test_pred)),
                        error_test_sorted.cpu().numpy(),
                        label='err true',
                        marker='+',
                        color='C1',
                        s=15)
            # This call passes marker='*' where the sibling errorbar calls
            # pass fmt='*'.
            plt.errorbar(
                np.arange(len(indices_test_pred)),
                gp_r_test_pred_sorted.detach().cpu().numpy().flatten(),
                yerr=gp_r_test_pred_std_sorted.detach().cpu().numpy().flatten(
                ),
                marker='*',
                alpha=0.05,
                label='err pred',
                color='C0',
                ecolor='C0')
            plt.scatter(np.arange(len(indices_test_pred)),
                        gp_r_test_pred_sorted.detach().cpu().numpy().flatten(),
                        marker='*',
                        color='C0',
                        alpha=0.2,
                        s=10)
            # plt.scatter(np.arange(len(indices_test_pred)),
            #             (gp_r_test_pred_sorted + gp_r_test_pred_std_sorted).detach().cpu().numpy(),
            #             label='err pred mean+std', marker='.')
            # plt.scatter(np.arange(len(indices_test_pred)),
            #             (gp_r_test_pred_sorted - gp_r_test_pred_std_sorted).detach().cpu().numpy(),
            #             label='err pred mean-std', marker='.')
            plt.legend()
            plt.title('error predictions and uncertainty on test set')
            plt.savefig(
                os.path.join(gp_test_folder, 'gp_error_test_uncertainty.pdf'))
            plt.close()

        # y var plots
        # Points are ordered by true y; bars show +/- one posterior std.
        y_train_std_sorted, indices_train = torch.sort(y_train)
        gp_y_train_pred_sorted = gp_obj_model.posterior(
            x_train).mean[indices_train].view(-1)
        gp_y_train_pred_std_sorted = gp_obj_model.posterior(
            x_train).variance.sqrt()[indices_train].view(-1)
        plt.scatter(np.arange(len(indices_train)),
                    y_train_std_sorted.cpu().numpy(),
                    label='y true',
                    marker='+',
                    color='C1',
                    s=15)
        plt.scatter(np.arange(len(indices_train)),
                    gp_y_train_pred_sorted.detach().cpu().numpy(),
                    marker='*',
                    alpha=0.2,
                    s=10,
                    color='C0')
        plt.errorbar(
            np.arange(len(indices_train)),
            gp_y_train_pred_sorted.detach().cpu().numpy().flatten(),
            yerr=gp_y_train_pred_std_sorted.detach().cpu().numpy().flatten(),
            fmt='*',
            alpha=0.05,
            label='y pred',
            color='C0',
            ecolor='C0')
        # plt.scatter(np.arange(len(indices_train_pred)),
        #             (gp_y_train_pred_sorted+gp_y_train_pred_std_sorted).detach().cpu().numpy(),
        #             label='y pred mean+std', marker='.')
        # plt.scatter(np.arange(len(indices_train_pred)),
        #             (gp_y_train_pred_sorted-gp_y_train_pred_std_sorted).detach().cpu().numpy(),
        #             label='y pred mean-std', marker='.')
        plt.legend()
        plt.title('y predictions and uncertainty on train set')
        plt.savefig(
            os.path.join(gp_test_folder, 'gp_obj_val_train_uncertainty.pdf'))
        plt.close()

        y_test_std_sorted, indices_test = torch.sort(y_test)
        gp_y_test_pred_sorted = gp_obj_model.posterior(x_test).mean.view(
            -1)[indices_test]
        gp_y_test_pred_std_sorted = gp_obj_model.posterior(
            x_test).variance.view(-1).sqrt()[indices_test]
        plt.scatter(np.arange(len(indices_test)),
                    y_test_std_sorted.cpu().numpy(),
                    label='y true',
                    marker='+',
                    color='C1',
                    s=15)
        plt.errorbar(
            np.arange(len(indices_test)),
            gp_y_test_pred_sorted.detach().cpu().numpy().flatten(),
            yerr=gp_y_test_pred_std_sorted.detach().cpu().numpy().flatten(),
            fmt='*',
            alpha=0.05,
            label='y pred',
            color='C0',
            ecolor='C0')
        plt.scatter(np.arange(len(indices_test)),
                    gp_y_test_pred_sorted.detach().cpu().numpy(),
                    marker='*',
                    alpha=0.2,
                    s=10,
                    color='C0')
        # plt.scatter(np.arange(len(indices_test_pred)),
        #             (gp_y_test_pred_sorted + gp_y_test_pred_std_sorted).detach().cpu().numpy(),
        #             label='y pred mean+std', marker='.')
        # plt.scatter(np.arange(len(indices_test_pred)),
        #             (gp_y_test_pred_sorted - gp_y_test_pred_std_sorted).detach().cpu().numpy(),
        #             label='y pred mean-std', marker='.')
        plt.legend()
        plt.title('y predictions and uncertainty on test set')
        plt.savefig(
            os.path.join(gp_test_folder, 'gp_obj_val_test_uncertainty.pdf'))
        plt.close()
def transform(self, X):
    """Return a copy of ``X`` with ``self.features`` Yeo-Johnson transformed.

    The listed columns are transformed together in one ``power_transform``
    call and written back in place of the originals; ``X`` itself is left
    untouched.
    """
    result = X.copy()
    warped = power_transform(result[self.features], method='yeo-johnson')
    result[self.features] = warped
    return result
print(ks_statistic, p_value) # Shapiro Wilk test # best test # If the P-Value of the Shapiro Wilk Test is larger than 0.05, we assume a normal distribution # If the P-Value of the Shapiro Wilk Test is smaller than 0.05, we do not assume a normal distribution from scipy import stats shapiro_test = stats.shapiro(data_MSTL) print(shapiro_test.statistic, shapiro_test.pvalue) # if the data is present in non-normal shape (which it is), it can be transformed into a normal distribution using the box cox # https://www.statisticshowto.com/box-cox-transformation/ # Normality is an important assumption for many statistical techniques; # if your data isn’t normal, applying a Box-Cox means that you are able to run a broader number of tests. from sklearn.preprocessing import power_transform xt, lmbda = stats.yeojohnson(data_MSTL) print(power_transform(data_MSTL["Temp"].values.reshape(-1, 1), method='yeo-johnson', standardize = False)) xts = power_transform(data_MSTL["Temp"].values.reshape(-1, 1), method='yeo-johnson') shapiro_test = stats.shapiro(xt) print(shapiro_test.statistic, shapiro_test.pvalue) comparison = pd.concat([data_MSTL, pd.DataFrame(xt, index = data_MSTL.index).rename(columns={0: "stats-non-standardised"}), pd.DataFrame(xts, index = data_MSTL.index).rename(columns={0: "standardised"})], axis = 1) fig = plt.figure() ax1 = fig.add_subplot(221) prob = stats.probplot(comparison["Temp"], dist=stats.norm, plot=ax1) ax1.set_xlabel('') ax1.set_title('Probplot against normal distribution') ax2 = fig.add_subplot(222) prob = stats.probplot(comparison["stats-non-standardised"], dist=stats.norm, plot=ax2) ax2.set_title('Probplot after Yeo-Johnson transformation')
# Shift the labels down by one.
Y = Y - 1

# Hold out the first tenth of the rows as the test split.
n_test = int(len(df) / 10)
Y_train = Y[n_test:]
Y_test = Y[:n_test]

# Candidate features taken from df.
X = df[[
    'LineFitGeoSplit1Params.n_hits',
    'SplineMPEDirectHitsICB.n_early_strings',
    'SplineMPEDirectHitsICB.n_late_doms',
    'SPEFitSingleTimeSplit1.azimuth',
    'ProjectedQ.max_grad_radius_circ_F',
    'ProjectedQ.ratio',
    'BestTrackCramerRaoParams.cramer_rao_theta',
    'BestTrackCramerRaoParams.variance_theta',
    'BestTrackCramerRaoParams.variance_x',
    'BestTrackCramerRaoParams.variance_y',
    'BestTrackCramerRaoParams.covariance_theta_y',
    'SplineMPETruncatedEnergy_SPICEMie_DOMS_Muon.energy',
    'SplineMPETruncatedEnergy_SPICEMie_BINS_Muon.energy',
    'SPEFit2TimeSplit1BayesianFitParams.nmini',
    'LineFitTimeSplit2Params.n_hits',
    'BestTrackDirectHitsICB.n_dir_pulses',
    'HitStatisticsValues.min_pulse_time',
    'SplineMPEDirectHitsICE.n_dir_doms',
    'SplineMPEDirectHitsICE.n_late_strings',
    'MPEFit_HVFitParams.nmini'
]]
#'SplineMPECharacteristicsIC.avg_dom_dist_q_tot_dom',
#'MPEFitHighNoiseFitParams.nmini']]

# The Yeo-Johnson transform is fitted on all rows before the split below.
X_box = power_transform(X, method='yeo-johnson')
X_btrain = X_box[n_test:]  #splitting the dataframe
X_btest = X_box[:n_test]

# Recursive feature elimination down to 5 features, one feature per step.
# The selector is fitted on the full X_box / Y, not on the train split.
estimator = LogisticAT()
selector = RFE(estimator, n_features_to_select=5, step=1)
selector.fit(X_box, Y)
print(selector.ranking_)
df.hist(bins=50, figsize=(20, 20))
plt.show()

# Transform the data towards a Gaussian shape and trim the tails so that
# every feature's histogram is easier to read.
dfT = pd.DataFrame()
for c in data.columns:
    #plt.figure(i)
    # Only the columns whose name starts with 'V' are transformed.
    if c[0] != 'V':
        continue
    x = data[c]
    x = x.sort_values()
    # Drop the 20000 smallest and 20000 largest values.
    x = x[20000:-20000]
    #x=sklearn.preprocessing.PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
    # Index the underlying ndarray: multi-dimensional indexing directly on a
    # pandas Series (x[:, np.newaxis]) is no longer supported by pandas.
    X = power_transform(x.values[:, np.newaxis],
                        method='yeo-johnson',
                        standardize=True,
                        copy=True)
    #print(X.shape)
    dfT[c] = X.squeeze()
dfT.hist(bins=50, figsize=(20, 20))
plt.show()

# Transform the data used for the actual work -- outliers are kept here.
# Yeo-Johnson is used because the data contains negative values.
dataT = pd.DataFrame()
for c in data.columns:
    #plt.figure(i)
    if c[0] != 'V':
        continue
def suggest(self, n_suggestions=1, fix_input = None):
    """Propose the next ``n_suggestions`` points to evaluate.

    While fewer than ``self.rand_sample`` observations are stored, the
    suggestions come straight from ``self.quasi_sample``. Afterwards a
    surrogate model is fitted to the observations, the MACE acquisition is
    optimised with ``EvolutionOpt`` and a random subset of the resulting
    candidates is returned.

    Parameters
    ----------
    n_suggestions : Number of points to return.
    fix_input : Forwarded unchanged to ``self.quasi_sample`` and to
        ``EvolutionOpt.optimize``.

    Returns
    -------
    The output of ``self.quasi_sample`` during the initial phase, otherwise
    a copy of the selected rows of the candidate DataFrame.
    """
    import pandas as pd  # local import: only needed for pd.concat below

    if self.X.shape[0] < self.rand_sample:
        return self.quasi_sample(n_suggestions, fix_input)

    X, Xe = self.space.transform(self.X)
    try:
        y_scaled = self.y / self.y.std()
        if self.y.min() <= 0:
            # Box-Cox needs strictly positive data.
            y = torch.FloatTensor(power_transform(y_scaled, method = 'yeo-johnson'))
        else:
            y = torch.FloatTensor(power_transform(y_scaled, method = 'box-cox'))
            if y.std() < 0.5:
                y = torch.FloatTensor(power_transform(y_scaled, method = 'yeo-johnson'))
        if y.std() < 0.5:
            raise RuntimeError('Power transformation failed')
        model = get_model(self.model_name, self.space.num_numeric, self.space.num_categorical, 1, **self.model_config)
        model.fit(X, Xe, y)
    except Exception:
        # Best-effort fallback: fit on the untransformed targets. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        y = torch.FloatTensor(self.y).clone()
        model = get_model(self.model_name, self.space.num_numeric, self.space.num_categorical, 1, **self.model_config)
        model.fit(X, Xe, y)

    # Model prediction at the best observation seen so far.
    best_id = np.argmin(self.y.squeeze())
    best_x = self.X.iloc[[best_id]]
    py_best, _ = model.predict(*self.space.transform(best_x))
    py_best = py_best.detach().numpy().squeeze()

    n_iter = max(1, self.X.shape[0] // n_suggestions)
    upsi = 0.5
    delta = 0.01
    # Expanded form of
    #   sqrt(upsi * 2 * log(n_iter ** (2 + d / 2) * 3 * pi**2 / (3 * delta)))
    # which never evaluates the (potentially huge) power itself.
    kappa = np.sqrt(upsi * 2 * ((2.0 + self.X.shape[1] / 2.0) * np.log(n_iter) + np.log(3 * np.pi**2 / (3 * delta))))

    acq = MACE(model, py_best, kappa = kappa)  # LCB < py_best
    mu = Mean(model)
    sig = Sigma(model, linear_a = -1.)
    opt = EvolutionOpt(self.space, acq, pop = 100, iters = 100, verbose = False)
    rec = opt.optimize(initial_suggest = best_x, fix_input = fix_input).drop_duplicates()
    rec = rec[self.check_unique(rec)]

    # Top up with unique quasi-random points if the optimiser found too few.
    cnt = 0
    while rec.shape[0] < n_suggestions:
        rand_rec = self.quasi_sample(n_suggestions - rec.shape[0], fix_input)
        rand_rec = rand_rec[self.check_unique(rand_rec)]
        rec = pd.concat([rec, rand_rec], ignore_index = True)
        cnt += 1
        if cnt > 3:
            # sometimes the design space is so small that duplicated sampling is unavoidable
            break
    if rec.shape[0] < n_suggestions:
        # Still short: accept duplicates.
        rand_rec = self.quasi_sample(n_suggestions - rec.shape[0], fix_input)
        rec = pd.concat([rec, rand_rec], ignore_index = True)

    select_id = np.random.choice(rec.shape[0], n_suggestions, replace = False).tolist()
    with torch.no_grad():
        py_all = mu(*self.space.transform(rec)).squeeze().numpy()
        ps_all = -1 * sig(*self.space.transform(rec)).squeeze().numpy()
        best_pred_id = np.argmin(py_all)
        best_unce_id = np.argmax(ps_all)
        # With more than two suggestions, make sure the most uncertain and the
        # best-predicted candidates are part of the selection.
        if best_unce_id not in select_id and n_suggestions > 2:
            select_id[0] = best_unce_id
        if best_pred_id not in select_id and n_suggestions > 2:
            select_id[1] = best_pred_id
        rec_selected = rec.iloc[select_id].copy()
    return rec_selected
# To load project modules
import sys

sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e
from src.ranker import Ranker

A4_DIMS = (11.7, 8.27)

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
# Power-transform the target (power_transform defaults) and keep the original index.
y = pd.Series(
    data=power_transform(y.values.reshape(-1, 1)).flatten(),
    name='loss',
    index=y.index,
)

LOGGER.info('Load categorical features to drop')
# Context manager so the file handle is closed deterministically.
with open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r') as _no_var_file:
    noVarFeatures = json.load(_no_var_file)

LOGGER.info('Process categorical features')
# Columns whose name contains 'cat', minus the ones listed in NoVariance.json;
# computed once and reused for both the pipeline input and the column labels.
_cat_raw = X.filter(like='cat').drop(labels=noVarFeatures, axis=1)
catf = pd.DataFrame(
    data=make_pipeline(
        e.CategoricalGrouper(),
        e.CategoricalEncoder()
    ).fit_transform(_cat_raw, y),
    columns=_cat_raw.columns,
    index=X.index
)

LOGGER.info('Process continuous features')
# ...      [ -2.,  1.,  3.],
# ...      [ 4.,  1., -2.]]
# >>> transformer = RobustScaler().fit(X)
# >>> transformer
# RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
#        with_scaling=True)
# >>> transformer.transform(X)
# array([[ 0. , -2. ,  0. ],
#        [-1. ,  0. ,  0.4],
#        [ 1. ,  0. , -1.6]])

# Power transform
# >>> import numpy as np
# >>> from sklearn.preprocessing import power_transform
# >>> data = [[1, 2], [3, 2], [4, 5]]
# >>> print(power_transform(data, method='box-cox'))
# [[-1.332... -0.707...]
#  [ 0.256... -0.707...]
#  [ 1.076...  1.414...]]

# Interaction features
from itertools import combinations


def interactions(data):
    """Add one column per pair of existing columns, holding their sum.

    For every unordered pair ``(a, b)`` of the columns present on entry, the
    pair is printed and a column named ``"<a>_<b>"`` containing
    ``data[a] + data[b]`` is added. ``data`` is modified in place.

    Parameters
    ----------
    data : DataFrame that receives the new columns.
    """
    # Snapshot the labels first so newly added columns are not paired again.
    columns = list(data.columns)
    for left, right in combinations(columns, 2):
        print(left, right)
        # Look columns up by their real labels; str() is only used to build the
        # new name, so non-string labels (e.g. integers) work as well.
        data[str(left) + '_' + str(right)] = data[left] + data[right]
# Residual plots. x/y are passed by keyword: recent seaborn releases no longer
# accept them positionally.
plt.subplots(figsize=(12, 8))
sns.residplot(x=train_final_1.KitchenQual,
              y=train_final_1.SalePrice).set_title('KITC W/out influential')

# Megaphone effect
plt.subplots(figsize=(12, 8))
sns.residplot(x=train_final_1.OverallQual,
              y=train_final_1.SalePrice).set_title('OverallQual W/out influential')

#------------------------------------------------------------------------------
### Transforming the data with Box-Cox
## Using power_transform with method='box-cox'.
# The optimal parameter for stabilizing variance and minimizing skewness is
# estimated through maximum likelihood.
# The transform is computed once and reused for both the printout and the
# DataFrame below.
train_final_1_boxcox = power_transform(train_final_1, method='box-cox')
print(train_final_1_boxcox)

## Convert the transformed NumPy array back into a DataFrame with the
## original index and column labels.
train_final_boxcox = pd.DataFrame(train_final_1_boxcox,
                                  index=train_final_1.index,
                                  columns=train_final_1.columns)

## Set up the final regression model on the transformed DataFrame.
X = train_final_boxcox[[
    "OverallQual", "TotalBsmtSF", "GrLivArea", "KitchenQual"
]]
y = train_final_boxcox["SalePrice"]
X = sm.add_constant(X)
ax = df_normalized.loc[str(anio), j].plot() ax.set_ylabel('Columnas'); ax.set_xlabel('Anios'); ''' # Probar transformaciones: # Usaremos 2: # BoxCox: # Logit: logit(p) = log(p/(1-p)) # Primera transformacion from sklearn.preprocessing import power_transform df_normalized = df_normalized.replace(0, 0.00001) transf_boxcox = power_transform(df_normalized, method='box-cox') df_boxcox = pd.DataFrame(data = transf_boxcox) df_boxcox.plot() df_normalized = df_normalized.replace(0.00001, 0) # Esta es la misma transformacion pero con otro metodo #transf_yeo = power_transform(df_indicators, method='yeo-johnson') #dfyeo = pd.DataFrame(data = transf_yeo) #dfyeo.plot() # Esta es la misma transformacion pero estandarizada, solo se mueven los ejes pero la curva es igual #from sklearn.preprocessing import PowerTransformer #power = PowerTransformer(method='yeo-johnson', standardize=True) #data_trans = power.fit_transform(dfyeo) #dfyeostandardized = pd.DataFrame(data = data_trans) #dfyeostandardized.plot()
import pandas as pd

Data = pd.read_csv('hack_final.csv')

from sklearn.preprocessing import power_transform

x = Data[['Click_count_y', 'Unique_products']]
# NOTE: the original script called power_transform(x, method='box-cox') here
# and discarded the result, so the model was always trained on the raw
# features. The dead call is removed; the saved model keeps expecting raw,
# untransformed inputs.
y = Data['y']

from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(x, y)

# sklearn.externals.joblib is gone from current scikit-learn releases; prefer
# the standalone joblib package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(regressor, 'model.pkl')
def _rebin(df, column, old_values, new_value):
    """Overwrite, in place, every entry of ``df[column]`` listed in *old_values*."""
    df.loc[df[column].isin(old_values), column] = new_value


def _fill_missing(df, column, fill_value):
    """Overwrite, in place, the missing entries of ``df[column]``."""
    df.loc[df[column].isna(), column] = fill_value


def _bucket_label(x, unit):
    """Return the 50-wide bucket label for *x* ('None' for 0 or missing)."""
    if pd.isna(x) or x == 0:
        return 'None'
    lower = x // 50 * 50
    upper = (x // 50 + 1) * 50 - 1
    return str(lower) + ' to ' + str(upper) + unit


def _remodel_era(x):
    """Map a remodel year to its era label."""
    if x == 1950:
        return 'No remodel'
    if 1950 < x < 2010:
        return str(int(x) // 10 * 10) + 's'
    # Everything else (including years before 1950) lands here.
    return '2010s'


def cleanData(df):
    """Clean and encode the housing data and split it back into train/test.

    Steps, in order: drop low-variance columns, derive area features, bin
    rare categories, bucket numeric areas into 50-wide ranges, impute
    missing values, Yeo-Johnson transform every numeric column except
    ``SalePrice``, one-hot encode, merge the paired Condition/Exterior
    dummies and log-transform ``SalePrice``.

    Parameters
    ----------
    df : Input DataFrame. Because ``Type_train`` / ``Type_test`` are read
        after one-hot encoding, it presumably carries a ``Type`` column with
        the values 'train' and 'test' - TODO confirm against the caller.

    Returns
    -------
    A two-element list ``[train, test]``: the rows where ``Type_train == 1``
    and the rows where ``Type_test == 1``.
    """
    # Drop variables with little variance
    df = df.drop([
        'Id', 'Alley', 'Street', 'LotShape', 'Utilities', 'LandSlope',
        'RoofMatl', 'Heating', 'Electrical', 'BsmtFinSF1', 'BsmtFinType2',
        'BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr',
        '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'MiscFeature',
        'MiscVal'
    ], axis=1)

    # TotalSF variable (a missing basement area counts as 0)
    df['TotalSF'] = df['TotalBsmtSF'].fillna(0) + df['1stFlrSF'] + df['2ndFlrSF']

    # Convert some integral classes to categorical
    for column in ('MSSubClass', 'MoSold', 'YrSold'):
        df[column] = df[column].astype('str')

    # Bin rare categories
    _rebin(df, 'MSSubClass', ['180', '75', '45', '40', '150'], 'Other')
    _rebin(df, 'MSZoning', ['RH', 'C (all)'], 'Other')
    _rebin(df, 'Neighborhood', ['Blueste', 'NPkVill'], 'Other')
    _rebin(df, 'Condition1', ['RRAe', 'RRAn', 'RRNe', 'RRNn'], 'Near railroad')
    _rebin(df, 'Condition1', ['PosA', 'PosN'], 'Near positive feature')
    _rebin(df, 'Condition2', ['RRAe', 'RRAn', 'RRNe', 'RRNn'], 'Near railroad')
    _rebin(df, 'Condition2', ['PosA', 'PosN'], 'Near positive feature')
    _rebin(df, 'HouseStyle', ['2.5Unf', '2.5Fin'], '2.5')

    df['LotFrontage'] = df['LotFrontage'].apply(_bucket_label, unit=' ft.')

    df['RemodelEra'] = df['YearRemodAdd'].apply(_remodel_era)
    df = df.drop('YearRemodAdd', axis=1)

    # Normalise Exterior2nd spellings first so the rare-category binning
    # below sees the same labels as Exterior1st.
    _rebin(df, 'Exterior2nd', ['Wd Shng'], 'WdShing')
    _rebin(df, 'Exterior2nd', ['CmentBd'], 'CemntBd')
    _rebin(df, 'Exterior2nd', ['Brk Cmn'], 'BrkComm')
    _rebin(df, 'RoofStyle', ['Flat', 'Gambrel', 'Mansard', 'Shed'], 'Other')
    rare_exteriors = ['AsphShn', 'ImStucc', 'CBlock', 'Stone', 'BrkComm']
    _rebin(df, 'Exterior1st', rare_exteriors, 'Other')
    _rebin(df, 'Exterior2nd', rare_exteriors, 'Other')

    # Area buckets. Unlike the original areaMap, a missing area is labelled
    # 'None' (consistent with LotFrontage and TotalSF) instead of producing a
    # 'nan to nan sq. ft.' category.
    df['VeneerArea'] = df['MasVnrArea'].apply(_bucket_label, unit=' sq. ft.')
    df = df.drop('MasVnrArea', axis=1)

    _rebin(df, 'ExterCond', ['Po', 'Fa'], 'Fa')
    _rebin(df, 'ExterCond', ['Gd', 'Ex'], 'Gd')
    _rebin(df, 'Foundation', ['Wood', 'Stone', 'Slab'], 'Other')
    _rebin(df, 'BsmtCond', ['Po', 'Fa'], 'Fa')

    df['BasementUnfinishedSF'] = df['BsmtUnfSF'].apply(_bucket_label, unit=' sq. ft.')
    df = df.drop('BsmtUnfSF', axis=1)
    df['TotalBasementSF'] = df['TotalBsmtSF'].apply(_bucket_label, unit=' sq. ft.')
    df = df.drop('TotalBsmtSF', axis=1)

    _rebin(df, 'HeatingQC', ['Po', 'Fa'], 'Fa')

    df['TotalIndoorSF'] = df['1stFlrSF'] + df['2ndFlrSF']
    df = df.drop(['1stFlrSF', '2ndFlrSF'], axis=1)

    df['TwoBasementFullBath'] = df['BsmtFullBath'].apply(
        lambda x: 'Yes' if x == 2 else 'No')
    df = df.drop('BsmtFullBath', axis=1)
    df['TwoHalfBath'] = df['HalfBath'].apply(lambda x: 'Yes' if x == 2 else 'No')
    df = df.drop('HalfBath', axis=1)

    _rebin(df, 'Functional', ['Maj1', 'Maj2', 'Sev'], 'Other')
    _rebin(df, 'Functional', ['Min1', 'Min2'], 'Minimial')
    _rebin(df, 'GarageType', ['CarPort', '2Types'], 'Other')
    df['GarageArea'] = df['GarageArea'].apply(_bucket_label, unit=' sq. ft.')
    _rebin(df, 'GarageQual', ['Ex', 'Gd'], 'Gd')
    _rebin(df, 'GarageQual', ['Po', 'Fa'], 'Fa')
    _rebin(df, 'GarageCond', ['Ex', 'Gd'], 'Gd')
    _rebin(df, 'GarageCond', ['Po', 'Fa'], 'Fa')
    for column in ('WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch'):
        df[column] = df[column].apply(_bucket_label, unit=' sq. ft.')
    _rebin(df, 'Fence', ['MnWw'], 'MnPrv')
    _rebin(df, 'SaleType',
           ['Con', 'Oth', 'CWD', 'ConLI', 'ConLw', 'ConLD'], 'Other')
    _rebin(df, 'SaleCondition', ['AdjLand', 'Alloca'], 'Other')

    # Impute missing values with a "None" feature or a computed feature
    for column in ('Fence', 'FireplaceQu', 'GarageCond', 'GarageYrBlt',
                   'GarageFinish', 'GarageQual', 'GarageType', 'BsmtCond',
                   'BsmtExposure', 'BsmtQual', 'BsmtFinType1'):
        _fill_missing(df, column, 'None')
    for column in ('MSZoning', 'Functional', 'SaleType'):
        _fill_missing(df, column, 'Other')
    # NOTE(review): index[1] of the descending counts is the SECOND most
    # frequent KitchenQual value, not the mode (index[0]) - confirm this is
    # intended. Behaviour is left unchanged.
    kitchen_fill = df.groupby('KitchenQual').KitchenQual.count().sort_values(
        ascending=False).index[1]
    _fill_missing(df, 'KitchenQual', kitchen_fill)
    _fill_missing(df, 'GarageCars', 0)
    _fill_missing(df, 'Exterior1st', 'Other')
    _fill_missing(df, 'Exterior2nd', 'Other')
    _fill_missing(df, 'MasVnrType', 'None')

    # Apply Yeo-Johnson transformation to all numeric variables (the target
    # is handled separately below). The dtype is checked explicitly so that
    # non-object string/categorical dtypes are never sent to the transformer.
    for column in df.columns:
        if column != 'SalePrice' and pd.api.types.is_numeric_dtype(df[column]):
            df[column] = power_transform(
                df[column].values.reshape(-1, 1), method='yeo-johnson').ravel()

    # One hot encode categoricals
    df = pd.get_dummies(df)

    table = {
        'Condition': ['Norm', 'Feedr', 'Near positive feature', 'Artery',
                      'Near railroad'],
        'Exterior': [
            'VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
            'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'Other'
        ]
    }
    paired_prefixes = {
        'Condition': ('Condition1', 'Condition2'),
        'Exterior': ('Exterior1st', 'Exterior2nd'),
    }

    # Combine Condition1/Condition2 and Exterior1st/Exterior2nd: a merged
    # dummy is set when either source dummy is set. Done column-wise instead
    # of with a row-by-row transform; the resulting values are the same.
    for merged in ('Condition', 'Exterior'):
        first, second = paired_prefixes[merged]
        for name in table[merged]:
            df[merged + '_' + name] = df[
                [first + '_' + name, second + '_' + name]].max(axis=1)
    for merged in ('Condition', 'Exterior'):
        first, second = paired_prefixes[merged]
        for name in table[merged]:
            df = df.drop([first + '_' + name, second + '_' + name], axis=1)

    df['SalePrice'] = np.log(df['SalePrice'])
    return [df[df.Type_train == 1], df[df.Type_test == 1]]
def suggest(self, n_suggestions=1):
    """Propose the next ``n_suggestions`` configurations to evaluate.

    While fewer than ``4 * n_suggestions`` observations are stored, the
    suggestions are quasi-random samples. Afterwards a surrogate model is
    fitted to the observations, the MACE acquisition is optimised with
    ``EvolutionOpt`` and a random subset of the candidates is returned.

    Parameters
    ----------
    n_suggestions : Number of configurations to return.

    Returns
    -------
    x_guess : List of dicts (parameter name -> value). Parameters whose
        ``self.api_config`` entry has type 'int' are cast to ``int``.
    """
    import pandas as pd  # local import: only needed for pd.concat below

    if self.X.shape[0] < 4 * n_suggestions:
        df_suggest = self.quasi_sample(n_suggestions)
        x_guess = [row.to_dict() for _, row in df_suggest.iterrows()]
    else:
        X, Xe = self.space.transform(self.X)
        try:
            y_scaled = self.y / self.y.std()
            if self.y.min() <= 0:
                # Box-Cox needs strictly positive data.
                y = torch.FloatTensor(
                    power_transform(y_scaled, method='yeo-johnson'))
            else:
                y = torch.FloatTensor(
                    power_transform(y_scaled, method='box-cox'))
                if y.std() < 0.5:
                    y = torch.FloatTensor(
                        power_transform(y_scaled, method='yeo-johnson'))
            if y.std() < 0.5:
                raise RuntimeError('Power transformation failed')
            model = get_model(self.model_name, self.space.num_numeric,
                              self.space.num_categorical, 1,
                              **self.model_config)
            model.fit(X, Xe, y)
        except Exception:
            # Best-effort fallback: fit on the filtered, untransformed targets.
            # Catching Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit through.
            print('Error fitting GP')
            y = torch.FloatTensor(self.y).clone()
            filt, q = self.filter(y)
            X = X[filt]
            Xe = Xe[filt]
            y = y[filt]
            # Reported after filtering so "kept" is the number of retained
            # observations (it used to be printed before and always showed N/N).
            print('Q = %g, kept = %d/%d' % (q, y.shape[0], self.y.shape[0]))
            model = get_model(self.model_name, self.space.num_numeric,
                              self.space.num_categorical, 1,
                              **self.model_config)
            model.fit(X, Xe, y)
        print('Noise level: %g' % model.noise, flush=True)

        # Model prediction at the best observation seen so far.
        best_id = np.argmin(self.y.squeeze())
        best_x = self.X.iloc[[best_id]]
        best_y = y.min()
        py_best, ps2_best = model.predict(*self.space.transform(best_x))
        py_best = py_best.detach().numpy().squeeze()
        ps_best = ps2_best.sqrt().detach().numpy().squeeze()

        # XXX: minimize (mu, -1 * sigma)
        #      s.t.    LCB < best_y
        n_iter = max(1, self.X.shape[0] // n_suggestions)
        upsi = 0.5
        delta = 0.01
        # Expanded form of
        #   sqrt(upsi * 2 * log(n_iter ** (2 + d / 2) * 3 * pi**2 / (3 * delta)))
        # The power itself is never evaluated, so it cannot overflow for
        # many observations or dimensions.
        kappa = np.sqrt(
            upsi * 2 * ((2.0 + self.X.shape[1] / 2.0) * np.log(n_iter)
                        + np.log(3 * np.pi**2 / (3 * delta))))

        acq = MACE(model, py_best, kappa=kappa)  # LCB < py_best
        mu = Mean(model)
        sig = Sigma(model, linear_a=-1.)
        opt = EvolutionOpt(self.space, acq, pop=100, iters=100, verbose=True)
        rec = opt.optimize(initial_suggest=best_x).drop_duplicates()
        rec = rec[self.check_unique(rec)]

        # Top up with unique quasi-random points if the optimiser found too few.
        cnt = 0
        while rec.shape[0] < n_suggestions:
            rand_rec = self.quasi_sample(n_suggestions - rec.shape[0])
            rand_rec = rand_rec[self.check_unique(rand_rec)]
            rec = pd.concat([rec, rand_rec], ignore_index=True)
            cnt += 1
            if cnt > 3:
                break
        if rec.shape[0] < n_suggestions:
            # Still short: accept duplicates.
            rand_rec = self.quasi_sample(n_suggestions - rec.shape[0])
            rec = pd.concat([rec, rand_rec], ignore_index=True)

        select_id = np.random.choice(rec.shape[0], n_suggestions,
                                     replace=False).tolist()
        x_guess = []
        with torch.no_grad():
            py_all = mu(*self.space.transform(rec)).squeeze().numpy()
            ps_all = -1 * sig(*self.space.transform(rec)).squeeze().numpy()
            best_pred_id = np.argmin(py_all)
            best_unce_id = np.argmax(ps_all)
            # With more than two suggestions, make sure the most uncertain and
            # the best-predicted candidates are part of the selection.
            if best_unce_id not in select_id and n_suggestions > 2:
                select_id[0] = best_unce_id
            if best_pred_id not in select_id and n_suggestions > 2:
                select_id[1] = best_pred_id
            rec_selected = rec.iloc[select_id].copy()
            py, ps2 = model.predict(*self.space.transform(rec_selected))
            rec_selected['py'] = py.squeeze().numpy()
            rec_selected['ps'] = ps2.sqrt().squeeze().numpy()
            print(rec_selected)
        print('Best y is %g %g %g %g' %
              (self.y.min(), best_y, py_best, ps_best), flush=True)
        for idx in select_id:
            x_guess.append(rec.iloc[idx].to_dict())

    # Integer-typed parameters must be returned as Python ints.
    for guess in x_guess:
        for name in guess:
            if self.api_config[name]['type'] == 'int':
                guess[name] = int(guess[name])
    return x_guess
# In[23]: iplot( dict(data=[ dict(type='violin', name=name, y=data, box=dict(visible=True)) for name, data in zip(standardized_feature_names, ( standardized_features[:, j] for j in count())) ], layout=dict(title="Standardized Population Distribution by Feature"))) # ### Power Transform Features # In[24]: power_transformed_features = power_transform(raw_features, standardize=True) power_transformed_feature_names = [ name.partition(' (cm)')[0] for name in feature_names ] # In[25]: iplot( dict( data=[ dict(type='violin', name=name, y=data, box=dict(visible=True)) for name, data in zip(power_transformed_feature_names, ( power_transformed_features[:, j] for j in count())) ], layout=dict( title=