def __init__(self, df, model=Models.PROPHET, upsample_freq=None,
             train_test_split_ratio=Constants.TRAIN_TEST_SPLIT_RATIO.value,
             epochs=Constants.EPOCHS.value,
             initial_epoch=Constants.INITIAL_EPOCH.value,
             batch_size=Constants.BATCH_SIZE.value,
             sliding_window_size_or_time_steps=Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value,
             do_shuffle=True):
    logging.info("resample: {}, future_prediction: {}, epochs: {}, batch_size: {},"
                 " window_size: {}, neurons: {}"
                 .format(Constants.RESAMPLING_FREQ.value,
                         Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value,
                         epochs,
                         batch_size,
                         sliding_window_size_or_time_steps,
                         Constants.NEURONS.value))
    if logging.getLogger().isEnabledFor(logging.INFO):
        explore_data(df)

    # first step is to create a timestamp column as index to turn this into a time series
    df.index = pd.to_datetime(df[ColumnNames.DATE.value] + df[ColumnNames.TIME.value],
                              format='%Y-%m-%d%H:%M:%S', errors='raise')
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

    # keep a copy of the original dataset for future comparison
    self.df_original = df.copy()

    # interpolate temperature using Prophet so it can be used in a multivariate forecast
    temperature = ColumnNames.TEMPERATURE.value
    interpolated_df = facebook_prophet_filter(df, temperature,
                                              Constants.FORECASTED_TEMPERATURE_FILE.value)
    interpolated_df.index = df.index
    df[[temperature]] = interpolated_df[[ColumnNames.FORECAST.value]]

    # also interpolate missing kWh using Prophet (alternatively those rows could simply be dropped),
    # then make the format compatible with Prophet
    power = ColumnNames.POWER.value
    interpolated_df = facebook_prophet_filter(df, power,
                                              Constants.FORECASTED_POWER_FILE.value)
    interpolated_df.index = df.index
    df[[power]] = interpolated_df[[ColumnNames.FORECAST.value]]
    df = df.rename(columns={power: ColumnNames.LABEL.value})

    df.drop(columns=[ColumnNames.DATE.value, ColumnNames.TIME.value,
                     ColumnNames.DAY_OF_WEEK.value, ColumnNames.MONTH.value],
            inplace=True)
    if upsample_freq is not None:
        df = df.resample(upsample_freq).mean()

    # for any regression or forecasting it is better to work with normalized data
    self.transformer = QuantileTransformer()  # handles outliers better than MinMaxScaler
    features = ColumnNames.FEATURES.value
    normalized = normalize(df, features, transformer=self.transformer)

    # the last part (after 12/1/2013) has no temperature, so it is kept for testing
    cutoff_date = Constants.CUTOFF_DATE.value
    self.df = normalized[normalized.index < cutoff_date]
    self.testing = normalized[normalized.index >= cutoff_date]
    self.df[ColumnNames.DATE_STAMP.value] = self.df.index
    self.df_blocked = None
    self.train_test_split_ratio = train_test_split_ratio
    self.model_type = model
    self.train_X, self.test_X, self.train_test_split_index = self.train_test_split(self.df[features])
    self.train_y, self.test_y, _ = self.train_test_split(self.df[ColumnNames.LABELS.value])
    self.model_fit = None
    self.epochs = epochs
    self.initial_epoch = initial_epoch
    self.batch_size = batch_size
    self.history = None
    # the following attributes are defined in sliding_window
    self.do_shuffle = do_shuffle
    self.val_idx = None
    self.shuffled_X = None
    self.shuffled_y = None
    self.train = None
    self.label = None
    self.train_size = None
    self.val_size = None
    if logging.getLogger().isEnabledFor(logging.INFO):
        explore_data(self.df)
# }
lgb_param = utility.json2param('magic')
lgb_clf0 = lgb.LGBMClassifier(**lgb_param)
lgb_clf1 = lgb.LGBMClassifier(**lgb_param)
lgb_clf2 = lgb.LGBMClassifier(**lgb_param)

# catboost
import catboost
cat_param = utility.json2param('catboost')
# cat_clf = catboost.CatBoostClassifier(**cat_param)

# create a naive Bayes classifier
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
# note: GaussianNB takes no n_classes argument; the number of classes is inferred from y
# NB_clf = make_pipeline(QuantileTransformer(output_distribution='normal'), GaussianNB(n_classes=2))
NB_clf = make_pipeline(QuantileTransformer(output_distribution='normal'), GaussianNB())

import Stacking

# build the stacked ensemble
# from sklearn.model_selection import StratifiedKFold
# kfold = StratifiedKFold(n_splits=2, random_state=999).split(X, y)  # kfold is a generator that is exhausted after one use
clf_list = [lgb_clf0, lgb_clf1, lgb_clf2, NB_clf]
layer0 = Stacking.layering(clf_list)
layer0_out = layer0.fit_blend(X, y, cv=10)

# # last layer (meta)
# from sklearn.linear_model import LogisticRegression
# meta_clf = LogisticRegression(n_jobs=4, random_state=123)
#
# meta_clf.fit(layer0_out, y.reshape(-1, 1))
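# A minimal hedged sketch of the meta (last) layer that the commented-out code above hints at.
# It assumes layer0_out holds the out-of-fold predictions produced by fit_blend, that y is the
# binary training target, and that Stacking.layering/fit_blend are project-specific helpers;
# only the scikit-learn part is shown here, it is not the author's confirmed pipeline.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

meta_clf = LogisticRegression(n_jobs=4, random_state=123)
meta_clf.fit(layer0_out, y)                              # train the meta learner on blended features
meta_train_pred = meta_clf.predict_proba(layer0_out)[:, 1]
print('meta-layer train AUC:', roc_auc_score(y, meta_train_pred))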
# In[91]:

classifier_pipeline = Pipeline(steps=[
    ('feature_processing', ColumnTransformer(transformers=[
        # binary
        ('binary', Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))]),
         binary_features),
        # numeric
        ('numeric', Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scale', RobustScaler()),
            ('transform', QuantileTransformer(output_distribution='normal')),
            ('engineer', PolynomialFeatures())]),
         numerical_features),
        # categorical
        ('categorical', Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-10000)),
            ('toint', FunctionTransformer(lambda x: x.astype('int64')))
            # ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ]),
         categorical_features),
    ])),
])
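# A hedged usage sketch: classifier_pipeline above only performs feature processing, so a
# typical next step is to append an estimator and fit on a training frame. The estimator
# choice (LogisticRegression) and the X_train/y_train/X_test/y_test names are assumptions
# for illustration, not part of the original notebook.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

full_pipeline = Pipeline(steps=[
    ('features', classifier_pipeline),          # reuse the processing pipeline defined above
    ('clf', LogisticRegression(max_iter=1000)),
])
full_pipeline.fit(X_train, y_train)
print('held-out accuracy:', full_pipeline.score(X_test, y_test))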
test = data[data.target.isnull()].copy()

target_col = "target"
drop_cols = ["spectrum_id", "spectrum_filename", "chip_id"]
X_train = train.drop(drop_cols + [target_col], axis=1)
y_train = train[target_col].values
X_test = test.drop(drop_cols + [target_col], axis=1)

# fill inf/nan
X_train.replace(np.inf, np.nan, inplace=True)
X_test.replace(np.inf, np.nan, inplace=True)
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_train.mean(), inplace=True)

# rank-gauss transform
# https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629
prep = QuantileTransformer(output_distribution="normal")
X_cont_train = prep.fit_transform(X_train)
X_cont_test = prep.transform(X_test)

# train and predict
timestamp = get_timestamp()
oof_preds, test_preds, cv_scores = run(
    X_seq_train,
    X_cont_train,
    y_train,
    X_seq_test,
    X_cont_test,
    timestamp,
    random_state=0,
)
data = np.random.power(5, (100, 2))
# data = np.column_stack((x, y))

# scale the data in several different ways
all_scalings = {
    'A_non_scaled': data,
    'B_min_max': MinMaxScaler().fit_transform(data),
    'C_standard': StandardScaler().fit_transform(data),
    'D_robust': RobustScaler().fit_transform(data),
    'E_quantile_uniform': QuantileTransformer(output_distribution='uniform').fit_transform(data),
    'F_quantile_normal': QuantileTransformer(output_distribution='normal').fit_transform(data)
}

# plot
i = 0
for scaling in sorted(all_scalings.keys()):
    i += 1
    plt.subplot(3, 2, i)
    plt.title(scaling)
    x = all_scalings[scaling][:, 0]
    y = all_scalings[scaling][:, 1]
    plt.scatter(x=x, y=y)

plt.tight_layout()
plt.show()
def fit(self): """ perform model fitting """ # initialize y_vals = np.zeros((self.train_df.shape[0], )) if self.task == "multiclass": n_class = len(np.unique(self.train_df[self.target].values)) oof_pred = np.zeros((self.train_df.shape[0], n_class)) y_pred = np.zeros((self.test_df.shape[0], n_class)) else: oof_pred = np.zeros((self.train_df.shape[0], )) y_pred = np.zeros((self.test_df.shape[0], )) # group does not kick in when group k fold is used if self.group is not None: if self.group in self.features: self.features.remove(self.group) if self.group in self.categoricals: self.categoricals.remove(self.group) fi = np.zeros((self.n_splits, len(self.features))) # target encoding numerical_features = [f for f in self.features if f not in self.categoricals] if self.target_encoding: # perform target encoding overall_mean = self.train_df[self.target].mean() for c in self.categoricals: data_tmp = pd.DataFrame({c: self.train_df[c].values, 'target': self.train_df[self.target].values}) tmp = np.nan * np.ones(self.train_df.shape[0]) cv = self.get_cv() for fold, (train_idx, val_idx) in enumerate(cv): target_mean = data_tmp.iloc[train_idx].groupby(c)['target'].mean() tmp[val_idx] = self.train_df[c].iloc[val_idx].map(target_mean).values self.train_df[c] = tmp # replace categorical variable in test target_mean = data_tmp.groupby(c)['target'].mean() self.test_df.loc[:, c] = self.test_df[c].map(target_mean).values # no categoricals any more numerical_features = self.features.copy() self.categoricals = [] # fill nan if self.model not in ['lgb', 'catb', 'xgb']: # fill NaN (numerical features -> median, categorical features -> mode) self.train_df[numerical_features] = self.train_df[numerical_features].replace([np.inf, -np.inf], np.nan) self.test_df[numerical_features] = self.test_df[numerical_features].replace([np.inf, -np.inf], np.nan) self.train_df[numerical_features] = self.train_df[numerical_features].fillna(self.train_df[numerical_features].median()) self.test_df[numerical_features] = self.test_df[numerical_features].fillna(self.test_df[numerical_features].median()) self.train_df[self.categoricals] = self.train_df[self.categoricals].fillna(self.train_df[self.categoricals].mode().iloc[0]) self.test_df[self.categoricals] = self.test_df[self.categoricals].fillna(self.test_df[self.categoricals].mode().iloc[0]) # scaling, if necessary if self.scaler is not None: # to normal pt = QuantileTransformer(n_quantiles=100, random_state=self.seed, output_distribution="normal") self.train_df[numerical_features] = pt.fit_transform(self.train_df[numerical_features]) self.test_df[numerical_features] = pt.transform(self.test_df[numerical_features]) # starndardize if self.scaler == "MinMax": scaler = MinMaxScaler() elif self.scaler == "Standard": scaler = StandardScaler() self.train_df[numerical_features] = scaler.fit_transform(self.train_df[numerical_features]) self.test_df[numerical_features] = scaler.transform(self.test_df[numerical_features]) x_test = self.test_df.copy() if self.model == "nn": x_test = [np.absolute(x_test[i]) for i in self.categoricals] + [x_test[numerical_features]] else: x_test = x_test[self.features] else: x_test = self.test_df[self.features] # fitting with out of fold cv = self.get_cv() for fold, (train_idx, val_idx) in enumerate(cv): # train test split x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx] y_train, y_val = self.train_df[self.target].iloc[train_idx], self.train_df[self.target].iloc[val_idx] if self.model == "nn": x_train = 
[np.absolute(x_train[i]) for i in self.categoricals] + [x_train[numerical_features]] x_val = [np.absolute(x_val[i]) for i in self.categoricals] + [x_val[numerical_features]] # model fitting train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val) model, importance = self.train_model(train_set, val_set) fi[fold, :] = importance y_vals[val_idx] = y_val # predictions and check cv score oofs, ypred = get_oof_ypred(model, x_val, x_test, self.model, self.task) y_pred += ypred.reshape(y_pred.shape) / self.n_splits if self.task == "multiclass": oof_pred[val_idx, :] = oofs.reshape(oof_pred[val_idx, :].shape) print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], np.argmax(oof_pred[val_idx, :], axis=1)))) else: oof_pred[val_idx] = oofs.reshape(oof_pred[val_idx].shape) print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], oof_pred[val_idx]))) # feature importance data frame fi_df = pd.DataFrame() for n in np.arange(self.n_splits): tmp = pd.DataFrame() tmp["features"] = self.features tmp["importance"] = fi[n, :] tmp["fold"] = n fi_df = pd.concat([fi_df, tmp], ignore_index=True) gfi = fi_df[["features", "importance"]].groupby(["features"]).mean().reset_index() fi_df = fi_df.merge(gfi, on="features", how="left", suffixes=('', '_mean')) # outputs if self.task == "multiclass": loss_score = self.calc_metric(y_vals, np.argmax(oof_pred, axis=1)) else: loss_score = self.calc_metric(y_vals, oof_pred) if self.verbose: print('Our oof loss score is: ', loss_score) return y_pred, loss_score, model, oof_pred, y_vals, fi_df
class Dataset: """ Class to generate the data matrices (train, validation and test) """ ## Train X matrix train_x = None ## Train y matrix train_y = None ## Validation X matrix val_x = None ## Validation y matrix val_y = None ## Test X matrix test_x = None ## Test y matrix test_y = None ## Path to the datafiles data_path = None ## Section 'data' of the configuration file config = None ## Mode of the dataset mode = None ## Functions to use for scaling the data scalers = { 'standard': StandardScaler(), 'minmax': MinMaxScaler(feature_range=(-1, 1)), 'tanh': tanh_normalization(), 'robustscaler': RobustScaler(), 'quantile': QuantileTransformer() } ## Strings corresponding to the different dataset configurations dataset_type = [ 'onesiteonevar', 'onesitemanyvar', 'manysiteonevar', 'manysitemanyvar', 'manysitemanyvarstack', 'manysitemanyvarstackneigh' ] generated = False raw_data = None scaler = None # Scaler object so data can be rescaled after training def __init__(self, config, data_path): """ Initializes the object with the data configuration section of the configuration file and the path where the actual data is :param config: :param data_path: """ self.config = config self.data_path = data_path def is_teacher_force(self): """ Returns if the data matrix is configured for teaching force :return: """ return self.config['dmatrix'] == 'teach_force' def is_dependent_auxiliary(self): """ Returns if the data matrix is cofigured to separate dependent and independent variables :return: """ return self.config['dmatrix'] == 'dep_aux' def _generate_dataset_one_var(self, data, datasize, testsize, lag=1, ahead=1, slice=1, mode=None): """ Generates dataset matrices for one variable according to the lag and ahead horizon. The ahead horizon can be sliced to a subset of the horizon The dimensions of the matrix are adapted accordingly to the input and output dimensions of the model Input: By default is a 3D matrix - examples x variables x lag 2D - examples x (variables * lag) Output: 3D - examples x horizon x 1 2D - examples x horizon 1D - examples x 1 x 1 0D - examples x 1 'scaling' is obtained from the data section of the configuration 'fraction' allows selecting only a part of the data, selects from the end :param data: :param datasize: :param testsize: :param lag: :param ahead: :param slice: :param mode: :return: :return: """ if 'scaler' in self.config and self.config['scaler'] in self.scalers: scaler = self.scalers[self.config['scaler']] tmpdata = scaler.fit_transform(data) self.scaler = scaler.fit(data[:, 0].reshape( -1, 1)) # saves the scaler for the first variable for descaling data = tmpdata # else: # scaler = StandardScaler() # data = scaler.fit_transform(data) mode_x, mode_y = mode if 'fraction' in self.config: isize = int((1 - self.config['fraction']) * datasize) wind_train = data[isize:datasize, :] else: wind_train = data[:datasize, :] train = lagged_vector(wind_train, lag=lag, ahead=ahead, mode=mode) train_x = train[:, :lag] ####################################### if mode_x == '2D': train_x = np.reshape(train_x, (train_x.shape[0], train_x.shape[1])) elif mode_x == '4D': raise NameError('4D is not possible when there is only a variable') # Default is '3D' if mode_y == '3D': train_y = train[:, -slice:, 0] train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1], 1)) elif mode_y == '2D': train_y = train[:, -slice:, 0] train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1])) elif mode_y == '1D': train_y = train[:, -1:, 0] elif mode_y == '0D': train_y = np.ravel(train[:, -1:, 0]) 
else: train_y = train[:, -1:, 0] wind_test = data[datasize:datasize + testsize, 0].reshape(-1, 1) test = lagged_vector(wind_test, lag=lag, ahead=ahead, mode=mode) half_test = int(test.shape[0] / 2) val_x = test[:half_test, :lag] test_x = test[half_test:, :lag] ####################################### if mode_x == '2D': val_x = np.reshape(val_x, (val_x.shape[0], val_x.shape[1])) test_x = np.reshape(test_x, (test_x.shape[0], test_x.shape[1])) elif mode_x == '4D': raise NameError('4D is not possible when there is only a variable') # Default is '3D' if mode_y == '3D': val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] val_y = np.reshape(val_y, (val_y.shape[0], val_y.shape[1], 1)) test_y = np.reshape(test_y, (test_y.shape[0], test_y.shape[1], 1)) elif mode_y == '2D': val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] val_y = np.reshape(val_y, (val_y.shape[0], val_y.shape[1])) test_y = np.reshape(test_y, (test_y.shape[0], test_y.shape[1])) elif mode_y == '1D': val_y = test[:half_test, -1:, 0] test_y = test[half_test:, -1:, 0] elif mode_y == '0D': val_y = np.ravel(test[:half_test, -1:, 0]) test_y = np.ravel(test[half_test:, -1:, 0]) else: # Default is '1D' val_y = test[:half_test, -1:, 0] test_y = test[half_test:, -1:, 0] return train_x, train_y, val_x, val_y, test_x, test_y def _generate_dataset_multiple_var(self, data, datasize, testsize, lag=1, ahead=1, slice=1, mode=None): """ Generates dataset matrices for one variable according to the lag and ahead horizon. The ahead horizon can be sliced to a subset of the horizon The dimensions of the matrix are adapted accordingly to the input and output dimensions of the model Input: By default is a 3D matrix - examples x lag x variables 2D - examples x (lag * variables) Output: 3D - examples x horizon x 1 2D - examples x horizon 1D - examples x 1 x 1 0D - examples x 1 'scaling' is obtained from the data section of the configuration 'fraction' allows selecting only a part of the data, selects from the end :return: """ if 'scaler' in self.config and self.config['scaler'] in self.scalers: scaler = self.scalers[self.config['scaler']] tmpdata = scaler.fit_transform(data) self.scaler = scaler.fit(data[:, 0].reshape( -1, 1)) # saves the scaler for the first variable for descaling data = tmpdata # else: # scaler = StandardScaler() # data = scaler.fit_transform(data) # print('DATA Dim =', data.shape) mode_x, mode_y = mode if 'fraction' in self.config: isize = int((1 - self.config['fraction']) * datasize) wind_train = data[isize:datasize, :] else: self.config['fraction'] = 1 wind_train = data[:datasize, :] # print('Train Dim =', wind_train.shape) # Train train = lagged_matrix(wind_train, lag=lag, ahead=ahead, mode=mode) train_x = train[:, :lag] if 'aggregate' in self.config and 'x' in self.config['aggregate']: step = self.config['aggregate']['x']['step'] if self.config['aggregate']['x']['method'] == 'average': train_x = aggregate_average_all(train_x, step) elif self.config['aggregate']['x']['method'] == 'max': train_x = aggregate_max_min_all(train_x, step, aggmax=True) elif self.config['aggregate']['x']['method'] == 'min': train_x = aggregate_max_min_all(train_x, step, aggmax=False) # Signal decomposition if 'decompose' in self.config and 'x' in self.config['decompose']: components = self.config['decompose']['x']['components'] if type(self.config['decompose']['x']['var']) == int: var = self.config['decompose']['x']['var'] train_x = apply_SSA_decomposition_one(var, components, train_x) else: train_x = 
apply_SSA_decomposition_all(components, train_x) ####################################### print('pollo', mode_y) if mode_x == '2D': # Interchange axes 1 and 2 so the variables values are contiguous in the 2D matrix train_x = np.swapaxes(train_x, 1, 2) train_x = np.reshape( train_x, (train_x.shape[0], train_x.shape[1] * train_x.shape[2])) elif mode_x == '4D': # Add an extra dimension to simulate that we have only one channel train_x = np.reshape( train_x, (train_x.shape[0], train_x.shape[1], train_x.shape[2], 1)) if mode_y == '3D': train_y = train[:, -slice:, 0] if 'aggregate' in self.config and 'y' in self.config['aggregate']: step = self.config['aggregate']['y']['step'] if self.config['aggregate']['y']['method'] == 'average': train_y = aggregate_average(train_y, step) elif self.config['aggregate']['y']['method'] == 'max': train_y = aggregate_max_min(train_y, step, aggmax=True) elif self.config['aggregate']['y']['method'] == 'min': train_y = aggregate_max_min(train_y, step, aggmax=False) # Decompose prediction and keep one of the components if 'decompose' in self.config and 'y' in self.config['decompose']: components = self.config['decompose']['y']['components'] dec_y = apply_SSA_decomposition_y(components, train_y) train_y = dec_y[:, :, self.config['decompose']['y']['var']] # We need an additional third dimension train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1], 1)) elif mode_y == '2D': train_y = train[:, -slice:, 0] if 'aggregate' in self.config and 'y' in self.config['aggregate']: print('hello pollastre', self.config['aggregate']['y']['method']) step = self.config['aggregate']['y']['step'] if self.config['aggregate']['y']['method'] == 'average': train_y = aggregate_average(train_y, step) elif self.config['aggregate']['y']['method'] == 'max': train_y = aggregate_max_min(train_y, step, aggmax=True) elif self.config['aggregate']['y']['method'] == 'min': train_y = aggregate_max_min(train_y, step, aggmax=False) # Decompose prediction and keep one of the components if 'decompose' in self.config and 'y' in self.config['decompose']: components = self.config['decompose']['y']['components'] dec_y = apply_SSA_decomposition_y(components, train_y) train_y = dec_y[:, :, self.config['decompose']['y']['var']] train_y = np.reshape(train_y, (train_y.shape[0], train_y.shape[1])) elif mode_y == '1D': train_y = train[:, -1:, 0] elif mode_y == '0D': train_y = np.ravel(train[:, -1:, 0]) else: train_y = train[:, -slice:, 0] # Test and Val wind_test = data[datasize:datasize + testsize, :] test = lagged_matrix(wind_test, lag=lag, ahead=ahead, mode=mode) half_test = int(test.shape[0] / 2) val_x = test[:half_test, :lag] test_x = test[half_test:, :lag] if 'aggregate' in self.config and 'x' in self.config['aggregate']: step = self.config['aggregate']['x']['step'] if self.config['aggregate']['x']['method'] == 'average': val_x = aggregate_average_all(val_x, step) test_x = aggregate_average_all(test_x, step) elif self.config['aggregate']['x']['method'] == 'max': val_x = aggregate_max_min_all(val_x, step, aggmax=True) test_x = aggregate_max_min_all(test_x, step, aggmax=True) elif self.config['aggregate']['x']['method'] == 'min': val_x = aggregate_max_min_all(val_x, step, aggmax=False) test_x = aggregate_max_min_all(test_x, step, aggmax=False) if 'decompose' in self.config and 'x' in self.config['decompose']: components = self.config['decompose']['x']['components'] if type(self.config['decompose']['x']['var']) == int: var = self.config['decompose']['x']['var'] val_x = apply_SSA_decomposition_one(var, 
components, val_x) test_x = apply_SSA_decomposition_one(var, components, test_x) else: val_x = apply_SSA_decomposition_all(components, val_x) test_x = apply_SSA_decomposition_all(components, test_x) ######################################################## if mode_x == '2D': val_x = np.swapaxes(val_x, 1, 2) val_x = np.reshape( val_x, (val_x.shape[0], val_x.shape[1] * val_x.shape[2])) test_x = np.swapaxes(test_x, 1, 2) test_x = np.reshape( test_x, (test_x.shape[0], test_x.shape[1] * test_x.shape[2])) elif mode_x == '4D': # Add an extra dimension to simulate that we have only one channel val_x = np.reshape( val_x, (val_x.shape[0], val_x.shape[1], val_x.shape[2], 1)) test_x = np.reshape( test_x, (test_x.shape[0], test_x.shape[1], test_x.shape[2], 1)) if mode_y == '3D': val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] if 'aggregate' in self.config and 'y' in self.config['aggregate']: step = self.config['aggregate']['step'] if self.config['aggregate']['method'] == 'average': val_y = aggregate_average(val_y, step) test_y = aggregate_average(test_y, step) elif self.config['aggregate']['method'] == 'max': val_y = aggregate_max_min(val_y, step, aggmax=True) test_y = aggregate_max_min(test_y, step, aggmax=True) elif self.config['aggregate']['method'] == 'min': val_y = aggregate_max_min(val_y, step, aggmax=False) test_y = aggregate_max_min(test_y, step, aggmax=False) # Decompose prediction and keep one of the components if 'decompose' in self.config and 'y' in self.config['decompose']: components = self.config['decompose']['y']['components'] dec_y = apply_SSA_decomposition_y(components, val_y) val_y = dec_y[:, :, self.config['decompose']['y']['var']] dec_y = apply_SSA_decomposition_y(components, test_y) test_y = dec_y[:, :, self.config['decompose']['y']['var']] val_y = np.reshape(val_y, (val_y.shape[0], val_y.shape[1], 1)) test_y = np.reshape(test_y, (test_y.shape[0], test_y.shape[1], 1)) elif mode_y == '2D': val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] if 'aggregate' in self.config and 'y' in self.config['aggregate']: step = self.config['aggregate']['y']['step'] if self.config['aggregate']['y']['method'] == 'average': val_y = aggregate_average(val_y, step) test_y = aggregate_average(test_y, step) elif self.config['aggregate']['y']['method'] == 'max': val_y = aggregate_max_min(val_y, step, aggmax=True) test_y = aggregate_max_min(test_y, step, aggmax=True) elif self.config['aggregate']['y']['method'] == 'min': val_y = aggregate_max_min(val_y, step, aggmax=False) test_y = aggregate_max_min(test_y, step, aggmax=False) if 'decompose' in self.config and 'y' in self.config['decompose']: # Decompose prediction and keep one of the components components = self.config['decompose']['y']['components'] dec_y = apply_SSA_decomposition_y(components, val_y) val_y = dec_y[:, :, self.config['decompose']['y']['var']] dec_y = apply_SSA_decomposition_y(components, test_y) test_y = dec_y[:, :, self.config['decompose']['y']['var']] val_y = np.reshape(val_y, (val_y.shape[0], val_y.shape[1])) test_y = np.reshape(test_y, (test_y.shape[0], test_y.shape[1])) elif mode_y == '1D': val_y = test[:half_test, -1:, 0] test_y = test[half_test:, -1:, 0] elif mode_y == '0D': val_y = np.ravel(test[:half_test, -1:, 0]) test_y = np.ravel(test[half_test:, -1:, 0]) else: val_y = test[:half_test, -slice:, 0] test_y = test[half_test:, -slice:, 0] return train_x, train_y, val_x, val_y, test_x, test_y def load_raw_data(self, remote=False): """ Loads the data so some computations can be 
performed :return: """ datanames = self.config['datanames'] d = datanames[0] # just the main dataset vars = self.config['vars'] if 'angle' in self.config: angle = self.config['angle'] else: angle = False if remote: srv = pysftp.Connection(host=remote_data[0], username=remote_data[1]) srv.get(remote_wind_data_path + f"/{d}.npy", self.data_path + f"/{d}.npy") srv.close() if angle: wind = np.load(self.data_path + '_angle' + f"/{d}.npy") else: wind = np.load(self.data_path + f"/{d}.npy") if remote: os.remove(self.data_path + f"/{d}.npy") # If there is a list in vars attribute it should be a list of integers if type(vars) == list: for v in vars: if type(v) != int or v > wind.shape[1]: raise NameError('Error in variable selection') wind = wind[:, vars] self.raw_data = wind def generate_dataset(self, ahead=1, mode=None, ensemble=False, ens_slice=None, remote=None): """ Generates the dataset for training, test and validation 0 = One site - wind 1 = One site - all variables 2 = All sites - wind 3 = All sites - all variables 4 = All sites - all variables stacked 5 = Uses neighbor sites around a radius :param ens_slice: (not yet used) :param remote: Use remote data :param ensemble: (not yet used) :param datanames: Name of the wind datafiles :param ahead: number of steps ahead for prediction :param mode: type of dataset (pair indicating the type of dimension for input and output) :return: """ self.generated = True self.mode = mode datanames = self.config['datanames'] datasize = self.config['datasize'] testsize = self.config['testsize'] lag = self.config['lag'] vars = self.config['vars'] wind = {} if 'angle' in self.config: angle = self.config['angle'] else: angle = False # ahead = self.config['ahead'] if (type(self.config['ahead']) == list) else [1, self.config['ahead']] if type(ahead) == list: dahead = ahead[1] slice = (ahead[1] - ahead[0]) + 1 else: dahead = ahead slice = ahead # Augment the dataset with the closest neighbors if self.config['dataset'] == 5 or self.config['dataset'] == 31: if 'radius' not in self.config: raise NameError( "Radius missing for neighbours augmented dataset") else: radius = self.config['radius'] if 'nneighbors' in self.config: datanames = get_closest_k_neighbors(datanames[0], radius, self.config['nneighbors']) else: print('before', datanames) datanames = get_all_neighbors(datanames[0], radius) print('after', datanames) # Reads numpy arrays for all sites and keeps only selected columns for d in datanames: if remote: srv = pysftp.Connection(host=remote_data[0], username=remote_data[1]) srv.get(remote_wind_data_path + f"/{d}.npy", self.data_path + f"/{d}.npy") srv.close() if angle: wind[d] = np.load(self.data_path + '_angle' + f"/{d}.npy") else: wind[d] = np.load(self.data_path + f"/{d}.npy") if remote: os.remove(self.data_path + f"/{d}.npy") # If there is a list in vars attribute it should be a list of integers if type(vars) == list: for v in vars: if type(v) != int or v > wind[d].shape[1]: raise NameError('Error in variable selection') wind[d] = wind[d][:, vars] if (self.config['dataset'] == 0) or (self.config['dataset'] == 'onesiteonevar'): if not ensemble: self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_one_var(wind[datanames[0]][:, 0].reshape(-1, 1), datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) else: self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_one_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], 0].reshape(-1, 1), datasize, 
testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) elif (self.config['dataset'] == 1) or (self.config['dataset'] == 'onesitemanyvar'): if not ensemble: self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_multiple_var(wind[datanames[0]], datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) else: self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_multiple_var(wind[datanames[0][ens_slice[0]::ens_slice[1], :]], datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) elif self.config['dataset'] == 2 or self.config[ 'dataset'] == 'manysiteonevar': stacked = np.vstack([wind[d][:, 0] for d in datanames]).T self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_multiple_var(stacked, datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) elif self.config['dataset'] == 3 or self.config[ 'dataset'] == 31 or self.config['dataset'] == 'manysitemanyvar': stacked = np.hstack([wind[d] for d in datanames]) self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \ self._generate_dataset_multiple_var(stacked, datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) elif self.config['dataset'] == 4 or self.config['dataset'] == 5 or \ self.config['dataset'] == 'manysitemanyvarstack': stacked = [ self._generate_dataset_multiple_var(wind[d], datasize, testsize, lag=lag, ahead=dahead, slice=slice, mode=mode) for d in datanames ] self.train_x = np.vstack([x[0] for x in stacked]) self.train_y = np.vstack([x[1] for x in stacked]) self.val_x = stacked[0][2] self.val_y = stacked[0][3] self.test_x = stacked[0][4] self.test_y = stacked[0][5] else: raise NameError('ERROR: No such dataset type') def get_data_matrices(self): """ Returns the data matrices for training, validation and test :return: """ if not 'dmatrix' in self.config or self.config['dmatrix'] == 'normal': return self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y elif self.config['dmatrix'] == 'teach_force': return self.teacher_forcing() elif self.config['dmatrix'] == 'dep_aux': return self.dependent_auxiliary() elif self.config['dmatrix'] == 'future': return self.auxiliary_future() else: raise NameError("DataSet: No such dmatrix type") def teacher_forcing(self): """ returns data matrices for teacher forcing/attention assuming that data is for RNN :return: """ # Use the last element of wind traininig data as the first of teacher forcing tmp = self.train_x[:, -1, 0] tmp = tmp.reshape(tmp.shape[0], 1, 1) train_y_tf = np.concatenate((tmp, self.train_y[:, :-1, :]), axis=1) tmp = self.test_x[:, -1, 0] tmp = tmp.reshape(tmp.shape[0], 1, 1) test_y_tf = np.concatenate((tmp, self.test_y[:, :-1, :]), axis=1) tmp = self.val_x[:, -1, 0] tmp = tmp.reshape(tmp.shape[0], 1, 1) val_y_tf = np.concatenate((tmp, self.val_y[:, :-1, :]), axis=1) return [self.train_x, train_y_tf], self.train_y, \ [self.val_x, val_y_tf], self.val_y, \ [self.test_x, test_y_tf], self.test_y def dependent_auxiliary(self): """ Return data matrices separating dependent variable from the rest This is for two headed architecture with dependent and auxiliary variables in separated branches :return: """ horizon = self.config['lag'] if self.mode[1] != '2D': return [self.train_x[:, :, 0].reshape(self.train_x.shape[0], self.train_x.shape[1], 1), self.train_x[:, :, 1:]], self.train_y, \ [self.val_x[:, :, 0].reshape(self.val_x.shape[0], self.val_x.shape[1], 1), 
self.val_x[:, :, 1:]], self.val_y, \ [self.test_x[:, :, 0].reshape(self.test_x.shape[0], self.test_x.shape[1], 1), self.test_x[:, :, 1:]], self.test_y else: return [self.train_x[:, :horizon].train_x[:, horizon:, ]], self.train_y, \ [self.val_x[:, :horizon], self.val_x[:, :horizon]], self.val_y, \ [self.test_x[:, :horizon], self.test_x[:, :horizon]], self.test_y def auxiliary_future(self): """ Returns data matrices adding a matrix for the future for a subset of the auxiliary matrices :return: """ # Future variable, just one for now datalag = self.config['lag'] future = self.config['varsf'][0] ahead = self.config['ahead'] if (type( self.config['ahead']) == list) else [1, self.config['ahead']] if type(ahead) == list: dahead = ahead[1] slice = (ahead[1] - ahead[0]) + 1 else: dahead = ahead slice = ahead if self.mode[1] != '2D': # The values of the future variable are dahead positions from the start train_x_future = self.train_x[dahead:, -slice:, future] val_x_future = self.val_x[dahead:, -slice:, future] test_x_future = self.test_x[dahead:, -slice:, future] else: nvars = len(self.config['vars']) train_x_future = self.train_x[datalag - 1:, (future * datalag) + ahead[0]:(future * datalag) + ahead[0] + slice] val_x_future = self.val_x[datalag - 1:, (future * datalag) + ahead[0]:(future * datalag) + ahead[0] + slice] test_x_future = self.test_x[datalag - 1:, (future * datalag) + ahead[0]:(future * datalag) + ahead[0] + slice] # We lose the last datalag-1 examples because we do not have their full future in the data matrix return [self.train_x[:-(datalag - 1)], train_x_future], self.train_y[:-(datalag - 1)], [ self.val_x[:-(datalag - 1)], val_x_future], self.val_y[:-(datalag - 1)], \ [self.test_x[:-(datalag - 1)], test_x_future], self.test_y[:-(datalag - 1)] def summary(self): """ Dataset Summary of its characteristics :return: """ if self.train_x is None: raise NameError('Data not loaded yet') else: print("--- Dataset Configuration-----------") print(f"Dataset name: {self.config['datanames']}") if 'fraction' in self.config: print(f"Data fraction: {self.config['fraction']}") else: print(f"Data fraction: 2") print(f"Training: X={self.train_x.shape} Y={self.train_y.shape}") print(f"Validation: X={self.val_x.shape} Y={self.val_y.shape}") print(f"Tests: X={self.test_x.shape} T={self.test_y.shape}") if type(self.config['dataset']) == int: print( f"Dataset type= {self.dataset_type[self.config['dataset']]}" ) else: print(f"Dataset type= {self.config['dataset']}") if 'scaler' in self.config: print(f"Scaler= {self.config['scaler']}") else: print(f"Scaler= standard") if 'dmatrix' in self.config: print(f"Data matrix configuration= {self.config['dmatrix']}") print(f"Vars= {self.config['vars']}") print(f"Lag= {self.config['lag']}") print(f"Ahead= {self.config['ahead']}") print("------------------------------------") def compute_measures(self, var, window=None): """ Computing some measures with the wind series Window is a dictionary with a keyword for the windoe size and a window length :return: """ if self.raw_data is None: raise NameError("Raw data is not loaded") if var > self.raw_data.shape[1]: raise NameError("Invalid variable number") dvals = {} dvals['SpecEnt'] = spectral_entropy(self.raw_data[:, var], sf=1) dvals['SampEnt'] = sample_entropy(self.raw_data[:, var], order=2) data = self.raw_data[:, var] for w in window: lw = window[w] length = int(data.shape[0] / lw) size = lw * length datac = data[:size] datac = datac.reshape(-1, lw) means = np.mean(datac, axis=1) vars = np.std(datac, axis=1) 
dvals[f'Stab{w}'] = np.std(means) dvals[f'Lump{w}'] = np.std(vars) return dvals
from sklearn.model_selection import train_test_split

df = pd.read_csv("baddata.txt", delimiter=r'\s+', header=None)
X = df.iloc[:, :].values

N_SAMPLES = 1000
FONT_SIZE = 6
BINS = 30

rng = np.random.RandomState(304)
bc = PowerTransformer(method='box-cox')
yj = PowerTransformer(method='yeo-johnson')
# n_quantiles is set to the training set size rather than the default value
# to avoid a warning being raised by this example
qt = QuantileTransformer(n_quantiles=500, output_distribution='normal',
                         random_state=rng)
size = (N_SAMPLES, 1)

# lognormal distribution
X_lognormal = rng.lognormal(size=size)

# chi-squared distribution
df = 3
X_chisq = rng.chisquare(df=df, size=size)

# weibull distribution
a = 50
X_weibull = rng.weibull(a=a, size=size)

# gaussian distribution
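# A hedged follow-up (not in the original): apply the three transformers defined above to
# the lognormal sample and compare how close each output is to a Gaussian. The normaltest
# check is an assumption about how one might verify the result.
from scipy.stats import normaltest

X_bc = bc.fit_transform(X_lognormal)   # box-cox (requires strictly positive input)
X_yj = yj.fit_transform(X_lognormal)   # yeo-johnson (works for any sign)
X_qt = qt.fit_transform(X_lognormal)   # rank-based mapping onto a normal distribution

for name, arr in [('box-cox', X_bc), ('yeo-johnson', X_yj), ('quantile', X_qt)]:
    print(name, 'normaltest p-value:', normaltest(arr.ravel()).pvalue)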
def _make_experiment(category, name, learning_algorithm, learning_params,
                     X, y, outer_folds=7, inner_folds=5, logger=None):
    if category not in ('classification', 'regression'):
        raise ValueError("'category' should be either equal to "
                         "'classification' or 'regression' "
                         f"(found {category})")
    if logger:
        logger.info(f'starting experiment: {name}')

    pipeline_desc = [('scaler', None),
                     ('learning_algorithm', learning_algorithm)]
    pipe = Pipeline(pipeline_desc)
    scalers = [
        StandardScaler(),
        RobustScaler(),
        MinMaxScaler(),
        QuantileTransformer(n_quantiles=50)
    ]
    params = {'scaler': scalers}
    for k in learning_params:
        params['learning_algorithm__' + k] = learning_params[k]

    fold_gen = StratifiedKFold if category == 'classification' else KFold
    outer_fold = fold_gen(n_splits=outer_folds)
    scores = []
    best_models = []
    best_params = []

    progress = tqdm(outer_fold.split(X, y),
                    total=outer_fold.get_n_splits(),
                    desc=name,
                    leave=False)
    for train_idx, test_idx in progress:
        if logger:
            logger.info(f'Outer fold {progress.n}')
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gs = RandomizedSearchCV(pipe, params, verbose=0, cv=inner_folds,
                                error_score=np.nan, n_jobs=-1, pre_dispatch=10)
        gs = gs.fit(X_train, y_train)
        predictions = gs.predict(X_test)
        perf = _f1_score if category == 'classification' else _rmse
        score = perf(y_test, predictions)
        scores.append(score)
        best_models.append(gs.best_estimator_)
        best_params.append(gs.best_params_)
        if logger:
            logger.info(f'score {score}')
            logger.info(f'best params {gs.best_params_}')

    if logger:
        logger.info('ended experiment.')
        logger.info(f'mean test error {np.mean(scores)}')
    progress.close()
    print(f'{name}: {np.mean(scores):.3f}')
    return scores, best_models, best_params
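# A hedged usage sketch for _make_experiment: the estimator, parameter grid and the
# make_classification toy data below are assumptions for illustration only; the helper
# supplies _f1_score/_rmse and the scaler candidates internally, as shown above.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=300, n_features=10, random_state=0)
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 3, 5]}
scores, best_models, best_params = _make_experiment(
    'classification', 'rf-demo', RandomForestClassifier(random_state=0),
    rf_params, X_demo, y_demo, outer_folds=3, inner_folds=3)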
print(len(tfv.vocabulary_))
# df = pd.DataFrame(data=X.toarray())
# print(df)
sys.exit()

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, PowerTransformer

cols = ["Age", "Fee", "PhotoAmt", "VideoAmt", "Quantity"]
norm = np.random.normal(0, 0.1, 1000)

from scipy.stats import skewtest, normaltest
print(normaltest(norm))

rng = np.random.RandomState(304)
qt = QuantileTransformer(output_distribution='normal', random_state=rng)
pt = PowerTransformer(method="yeo-johnson")

for c in cols:
    f, axes = plt.subplots(2, 2)
    axes[0, 0].hist(train[c], bins='auto')
    axes[0, 0].set_title(c + " notransform:" + str(normaltest(train[c])[1]))

    qt_t = qt.fit_transform(train[c].values.reshape(-1, 1))
    axes[0, 1].hist(qt_t, bins='auto', label=str(normaltest(qt_t)[1]))
    axes[0, 1].set_title("quantiletransform:" + str(normaltest(qt_t)[1]))

    pt_t = pt.fit_transform(train[c].values.reshape(-1, 1))
    axes[1, 0].hist(pt_t, bins='auto', label=str(normaltest(pt_t)[1]))
    axes[1, 0].set_title("powertransform:" + str(normaltest(pt_t)[1]))
def uniform_scaler(train, test, seed=123):
    scaler = QuantileTransformer(n_quantiles=100,
                                 output_distribution='uniform',
                                 random_state=seed,
                                 copy=True).fit(train)
    train_scaled = pd.DataFrame(scaler.transform(train),
                                columns=train.columns.values).set_index([train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test),
                               columns=test.columns.values).set_index([test.index.values])
    return scaler, train_scaled, test_scaled
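# A hedged usage sketch for uniform_scaler: the DataFrame and the split below are made-up
# illustration data. The function returns the fitted scaler (useful later for
# inverse_transform) plus scaled train/test frames with their original indices preserved.
import numpy as np
import pandas as pd

demo = pd.DataFrame({'a': np.random.lognormal(size=200),
                     'b': np.random.normal(size=200)})
train_part, test_part = demo.iloc[:150], demo.iloc[150:]
scaler, train_u, test_u = uniform_scaler(train_part, test_part, seed=123)
print(train_u.describe().loc[['min', 'max']])   # training values now lie in [0, 1]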
signal_desc_PowerTransformer = []
for i in range(8):
    signal_desc_PowerTransformer += [pd.Series(signal_PowerTransformer[i, :])]

corr_silver_PowerTransformer = pd.DataFrame(signal_PowerTransformer)
corr_silver_PowerTransformer = corr_silver_PowerTransformer.transpose()
desc_corr_silver_PowerTransformer = corr_silver_PowerTransformer.describe()
corr_mat_PowerTransformer = corr_silver_PowerTransformer.corr()
cov_mat_PowerTransformer = corr_silver_PowerTransformer.cov()

######################################################################################################################

signal_QuantileTransformerUniform = QuantileTransformer(
    output_distribution='uniform').fit_transform(signal)

signal_desc_QuantileTransformerUniform = []
for i in range(8):
    signal_desc_QuantileTransformerUniform += [
        pd.Series(signal_QuantileTransformerUniform[i, :])
    ]

corr_silver_QuantileTransformerUniform = pd.DataFrame(signal_QuantileTransformerUniform)
corr_silver_QuantileTransformerUniform = corr_silver_QuantileTransformerUniform.transpose()
desc_corr_silver_QuantileTransformerUniform = corr_silver_QuantileTransformerUniform.describe()
def input_data_clustering(device: str,
                          start_date: date,
                          end_date: Optional[date] = None,
                          n_clusters=5,
                          return_only_cluster=True,
                          return_pca=False) -> pd.DataFrame:

    def add_column_postfix(df: pd.DataFrame, postfix: str) -> pd.DataFrame:
        columns = df.columns
        mapping = {c: f"{c}_{postfix}" for c in columns}
        return df.rename(columns=mapping)

    # get normalized input data
    data = get_input_data(device, start_date, end_date=end_date, normalized=True)
    if data.empty:
        return data

    # compute statistics over a rolling window
    rolling = data.rolling('15Min', min_periods=1, win_type=None)
    data_rolling_ = list()
    data_rolling_.append(add_column_postfix(rolling.count(), "count"))
    data_rolling_.append(add_column_postfix(rolling.sum(), "sum"))
    data_rolling_.append(add_column_postfix(rolling.mean(), "mean"))
    data_rolling_.append(add_column_postfix(rolling.median(), "median"))
    data_rolling_.append(add_column_postfix(rolling.var(), "var"))
    data_rolling_.append(add_column_postfix(rolling.kurt(), "kurt"))
    data_rolling_.append(add_column_postfix(rolling.skew(), "skew"))
    data_rolling = pd.concat(data_rolling_, axis=1)
    data_rolling = data_rolling.loc[~data_rolling.index.duplicated(keep='first')]
    data_rolling = data_rolling.resample("1Min").nearest(limit=1).dropna(how='all')

    from analytics.instruction import get_power
    power_data = get_power(device, start_date)
    power_data_rolling = power_data.rolling('15Min', min_periods=1, win_type=None).mean()
    data_rolling = data_rolling.merge(power_data_rolling, how='left',
                                      left_index=True, right_index=True)
    data_rolling = data_rolling[data_rolling.power >= 0.95]
    data_rolling = data_rolling.drop(columns='power')

    # normalize rolling data
    st_rolling = QuantileTransformer(output_distribution="normal")
    st_rolling.fit(data_rolling)
    data_rolling_normalized = pd.DataFrame(st_rolling.transform(data_rolling),
                                           columns=data_rolling.columns,
                                           index=data_rolling.index).fillna(0)

    # we do not have enough data for a clustering
    if len(data_rolling_normalized) < n_clusters:
        return pd.DataFrame()

    # perform PCA
    pca = PCA(random_state=31415)
    pca.fit(data_rolling_normalized)
    variance = np.cumsum(pca.explained_variance_ratio_)
    # how many dimensions to keep for explained variance over 0.95
    n_dims = variance[variance <= 0.95].shape[0] + 1
    data_pca = pca.transform(data_rolling_normalized)[:, :n_dims]

    # cluster the data into n_clusters clusters
    k_means = KMeans(n_clusters=n_clusters, random_state=31415)
    clustering = k_means.fit_predict(data_pca)

    if return_pca:
        cluster_df = pd.DataFrame(clustering, columns=['cluster'])
        pca_df = pd.DataFrame(data_pca)
        pca_df.columns = [f"d_{c}" for c in pca_df.columns]
        return pd.concat([cluster_df, pca_df], axis=1)

    data_rolling.loc[:, 'cluster'] = clustering
    if return_only_cluster:
        return data_rolling[['cluster']]
    return data_rolling
def perform_uniform_scaler(train, test):
    u_scaler = QuantileTransformer(n_quantiles=100,
                                   output_distribution='uniform',
                                   random_state=123,
                                   copy=True).fit(train)
    u_train_scaled = pd.DataFrame(u_scaler.transform(train),
                                  columns=train.columns.values).set_index([train.index.values])
    u_test_scaled = pd.DataFrame(u_scaler.transform(test),
                                 columns=test.columns.values).set_index([test.index.values])
    return u_scaler, u_train_scaled, u_test_scaled
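# A hedged follow-up (not in the original): because the fitted transformer is returned,
# scaled frames or predictions can be mapped back to the original units later.
# u_scaler and u_train_scaled are assumed to come from a prior perform_uniform_scaler call.
restored = pd.DataFrame(u_scaler.inverse_transform(u_train_scaled),
                        columns=u_train_scaled.columns,
                        index=u_train_scaled.index)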
from app.ml.objects.normalization import Normalization
from sklearn.preprocessing import (MinMaxScaler, Normalizer, QuantileTransformer,
                                   RobustScaler, StandardScaler)

normalizer_factory_dict = {
    Normalization.MIN_MAX_SCALER: lambda: MinMaxScaler(),
    Normalization.NORMALIZER: lambda: Normalizer(),
    Normalization.QUANTILE_TRANSFORMER: lambda: QuantileTransformer(),
    Normalization.ROBUST_SCALER: lambda: RobustScaler(),
    Normalization.STANDARD_SCALER: lambda: StandardScaler()
}


def get_normalizer(normalization: Normalization):
    return normalizer_factory_dict[normalization]()
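# A hedged usage sketch: each factory entry returns a fresh, unfitted scikit-learn
# transformer, so repeated calls never share fitted state. The sample array is made up.
import numpy as np

normalizer = get_normalizer(Normalization.QUANTILE_TRANSFORMER)
sample = np.random.rand(50, 3)
scaled = normalizer.fit_transform(sample)
assert get_normalizer(Normalization.QUANTILE_TRANSFORMER) is not normalizer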
def train_model(params, seed, model_num): if model_num == 0: num_cores = 1 GPU = False CPU = True if GPU: num_GPU = 1 num_CPU = 1 if CPU: num_CPU = 1 num_GPU = 0 config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \ inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \ device_count={'CPU': num_CPU, 'GPU': num_GPU}) session = tf.Session(config=config) K.set_session(session) batchsize = 2000 #3000 epochs = 3 np.random.seed(seed) tf.set_random_seed(seed) model = keras_mercari_model(seed, params) train_idx, val_idx = cvlist[seed] X_tr = [x[train_idx] for x in X] X_val = [x[val_idx] for x in X] lr1, lr2, lr3 = params[-3:] lrs = [lr1, lr2, lr3] def schedule(epoch): return lrs[epoch] lr_schedule = LearningRateScheduler(schedule) # val_store = TestCallback(X_val, X_test) gc.collect() if valid: model.fit(X_tr, y[train_idx], batch_size=batchsize, epochs=epochs, verbose=0, validation_data=(X_val, y[val_idx]), shuffle=True, callbacks=[lr_schedule]) y_val = y[val_idx, 0] y_pred = model.predict(X_val)[:, 0] print(np.sqrt(metrics.mean_squared_error(y_val, y_pred))) else: model.fit(X, y, batch_size=batchsize, epochs=epochs, verbose=0, shuffle=True, callbacks=[lr_schedule]) y_test_pred = model.predict(X_test)[:, 0] K.clear_session() return y_test_pred if model_num == 1: num_cores = 1 GPU = False CPU = True if GPU: num_GPU = 1 num_CPU = 1 if CPU: num_CPU = 1 num_GPU = 0 config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \ inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \ device_count={'CPU': num_CPU, 'GPU': num_GPU}) session = tf.Session(config=config) K.set_session(session) batchsize = 2000 epochs = 3 np.random.seed(seed) tf.set_random_seed(seed) model = keras_mercari_model(seed, params) train_idx, val_idx = cvlist[seed] X_tr = [x[train_idx] for x in X] X_val = [x[val_idx] for x in X] lr1, lr2, lr3 = params[-3:] lrs = [lr1, lr2, lr3] def schedule(epoch): return lrs[epoch] lr_schedule = LearningRateScheduler(schedule) # val_store = TestCallback(X_val, X_test) gc.collect() if valid: model.fit(X_tr, ynorm[train_idx], batch_size=batchsize, epochs=epochs, verbose=0, validation_data=(X_val, ynorm[val_idx]), shuffle=True, callbacks=[lr_schedule]) y_val = y[val_idx, 0] y_pred = model.predict(X_val)[:, 0] * std + mean print(np.sqrt(metrics.mean_squared_error(y_val, y_pred))) else: model.fit(X, ynorm, batch_size=batchsize, epochs=epochs, verbose=1, shuffle=True, callbacks=[lr_schedule]) y_test_pred = model.predict(X_test)[:, 0] * std + mean K.clear_session() return y_test_pred if model_num == 2: normll = QuantileTransformer(output_distribution='normal') ynorm2 = normll.fit_transform(yrel) num_cores = 1 GPU = False CPU = True if GPU: num_GPU = 1 num_CPU = 1 if CPU: num_CPU = 1 num_GPU = 0 config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \ inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \ device_count={'CPU': num_CPU, 'GPU': num_GPU}) session = tf.Session(config=config) K.set_session(session) batchsize = 2000 epochs = 3 np.random.seed(seed) tf.set_random_seed(seed) model = keras_mercari_model(seed, params) train_idx, val_idx = cvlist[seed] X_tr = [x[train_idx] for x in X] X_val = [x[val_idx] for x in X] lr1, lr2, lr3 = params[-3:] lrs = [lr1, lr2, lr3] def schedule(epoch): return lrs[epoch] lr_schedule = LearningRateScheduler(schedule) # val_store = TestCallback(X_val, X_test) gc.collect() if valid: model.fit(X_tr, ynorm2[train_idx], batch_size=batchsize, epochs=epochs, verbose=0, validation_data=(X_val, ynorm2[val_idx]), 
shuffle=True, callbacks=[lr_schedule]) y_val = y[val_idx, 0] y_pred = (normll.inverse_transform(model.predict(X_val))[:, 0] + 1) * train_data['cat_price'].values[val_idx] print(np.sqrt(metrics.mean_squared_error(y_val, y_pred))) else: model.fit(X, ynorm2, batch_size=batchsize, epochs=epochs, verbose=0, shuffle=True, callbacks=[lr_schedule]) y_test_pred = ( normll.inverse_transform(model.predict(X_test))[:, 0] + 1) * test_data['cat_price'].values K.clear_session() return y_test_pred if model_num == 3: num_cores = 1 GPU = False CPU = True if GPU: num_GPU = 1 num_CPU = 1 if CPU: num_CPU = 1 num_GPU = 0 config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \ inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \ device_count={'CPU': num_CPU, 'GPU': num_GPU}) session = tf.Session(config=config) K.set_session(session) batchsize = 2000 epochs = 3 np.random.seed(seed) tf.set_random_seed(seed) model = keras_mercari_model(seed, params) train_idx, val_idx = cvlist[seed] X_tr = [x[train_idx] for x in X] X_val = [x[val_idx] for x in X] lr1, lr2, lr3 = params[-3:] lrs = [lr1, lr2, lr3] def schedule(epoch): return lrs[epoch] lr_schedule = LearningRateScheduler(schedule) # val_store = TestCallback(X_val, X_test) gc.collect() if valid: model.fit(X_tr, y[train_idx], batch_size=batchsize, epochs=epochs, verbose=0, validation_data=(X_val, y[val_idx]), shuffle=True, callbacks=[lr_schedule]) y_val = y[val_idx, 0] y_pred = model.predict(X_val)[:, 0] print(np.sqrt(metrics.mean_squared_error(y_val, y_pred))) else: model.fit(X, y, batch_size=batchsize, epochs=epochs, verbose=0, shuffle=True, callbacks=[lr_schedule]) y_test_pred = model.predict(X_test)[:, 0] K.clear_session() return y_test_pred
# first plot
ax0.scatter(y_test, y_pred)
ax0.set_xlabel('True Target')
ax0.set_ylabel('Target predicted')
ax0.plot([0, 10], [0, 10], '--k')
ax0.text(1, 9, r'$R^2$=%.2f, MAE=%.2f' % (r2_score(y_test, y_pred),
                                           median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 10])
ax0.set_ylim([0, 10])

# TransformedTargetRegressor transforms the target y before fitting the regression model,
# and maps the model's predictions back to the original space via the inverse transform.
# It takes two arguments: the regressor used for prediction and the transformer applied to the target.
regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        transformer=QuantileTransformer(
                                            n_quantiles=300,
                                            output_distribution='normal'))
regr_trans.fit(X_train, y_train)
y_pred = regr_trans.predict(X_test)

# second plot
ax1.scatter(y_test, y_pred)
ax1.plot([0, 10], [0, 10], '--k')
ax1.set_xlabel('True Target')
ax1.set_ylabel('Target predicted')
ax1.text(1, 9, r'$R^2$=%.2f, MAE=%.2f' % (r2_score(y_test, y_pred),
                                          median_absolute_error(y_test, y_pred)))
ax1.set_xlim([0, 10])
ax1.set_ylim([0, 10])
train_filename = 'NSLKDD/KDDTrain.csv'
test_filename = 'NSLKDD/KDDTest.csv'
model_dir = 'WideDeepModel/NSLKDD/'
train_path = model_dir + 'aug_train.csv'
test_path = model_dir + 'aug_test.csv'

fold = 5
num_epochs = 240
batch_size = 64
dropout = 0.2

label_mapping = {'normal': 0, 'probe': 1, 'dos': 2, 'u2r': 3, 'r2l': 4}
class_weights = {
    'normal': 0.15,
    'probe': 0.2,
    'dos': 0.15,
    'u2r': 0.3,
    'r2l': 0.2
}

transformer = QuantileTransformer()
transformer_fitted = False
scaler = MinMaxScaler()
scaler_fitted = False

columns = process_dataset(train_filename, train_path, split=True)
process_dataset(test_filename, test_path, split=False)
hist = train_and_eval(model_dir, columns, train_path, test_path)
plot_history(hist['train_loss'], hist['valid_loss'], hist['test_loss'], model_dir)

output = open(model_dir + 'Runs%d.pkl' % (num_epochs), 'wb')
pickle.dump(hist, output)
output.close()
def deal_with_scaling(data, model, y_name): """ Fits a scaler and transform data. :param data: pandas DataFrame :param model: regression model to be used :param scaler: scaler for numerical data to be used :param y_name: name of your target variable :return: transformed data. """ data = data.copy() if sum(data.isna().sum()) > 0: print('Unable to check best scaler for data. You have NaNs in there!') return None, None, None, None scalers = { 'row-wise': [ PowerTransformer(method='yeo-johnson'), PowerTransformer(method='box-cox'), StandardScaler(), MinMaxScaler(), RobustScaler(), FunctionTransformer(np.log1p, validate=True) ], 'col-wise': [QuantileTransformer(output_distribution='normal'), Normalizer()] } max_score = regression_benchmark(data, model, y_name) final_scaler = None final_model = model X = pd.get_dummies(data.drop(y_name, axis=1)) y = data[y_name] X_train_final, X_test_final, y_train_final, y_test_final = train_test_split( X, y, test_size=0.25, random_state=42) print('Testing different scalers. This might take a while.') for scaler in scalers['row-wise']: X = pd.get_dummies(data.drop(y_name, axis=1)) y = data[y_name] X_train, X_test, y_train_, y_test_ = train_test_split(X, y, test_size=0.25, random_state=42) try: scaler.fit(X_train) X_train_, X_test_ = (scaler.transform(X_train), scaler.transform(X_test)) model.fit(X_train_, y_train_) score = model.score(X_test_, y_test_) except: print(f'An error ocurred while scaling with {scaler}.') continue if score > max_score: max_score = score final_scaler = scaler final_model = model X_train_final, X_test_final, y_train_final, y_test_final = X_train_, X_test_, y_train_, y_test_ print('Almost there...') for scaler in scalers['col-wise']: X = data.drop(y_name, axis=1) y = data[y_name] try: X_num = scaler.fit_transform(X.select_dtypes(np.number)) X_cat = pd.get_dummies(X.select_dtypes(exclude=np.number)) X_ = np.concatenate((X_num, X_cat), axis=1) X_train_, X_test_, y_train_, y_test_ = train_test_split( X_, y, test_size=0.25, random_state=42) model.fit(X_train_, y_train_) score = model.score(X_test_, y_test_) except: print(f'An error ocurred while scaling with {scaler}.') continue if score > max_score: max_score = score final_scaler = scaler final_model = model X_train_final, X_test_final, y_train_final, y_test_final = X_train_, X_test_, y_train_, y_test_ with open('final_scaler.pkl', 'wb') as file: pickle.dump(scaler, file) #if max_score == regression_benchmark(data, model, y_name): # final_model = # max_score = print( f'The scaler chosen was {scaler}, with an r-squared of {max_score}.\nSaving scaler to "final_scaler.pkl".\n' ) return X_train_final, X_test_final, y_train_final, y_test_final, final_model, max_score, final_scaler
def _get_valid_samples_by_column(X, col):
    """Get non-NaN samples in column of X"""
    return X[:, [col]][~np.isnan(X[:, col])]


@pytest.mark.parametrize(
    "est, func, support_sparse, strictly_positive, omit_kwargs",
    [
        (MaxAbsScaler(), maxabs_scale, True, False, []),
        (MinMaxScaler(), minmax_scale, False, False, ["clip"]),
        (StandardScaler(), scale, False, False, []),
        (StandardScaler(with_mean=False), scale, True, False, []),
        (PowerTransformer("yeo-johnson"), power_transform, False, False, []),
        (PowerTransformer("box-cox"), power_transform, False, True, []),
        (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []),
        (RobustScaler(), robust_scale, False, False, []),
        (RobustScaler(with_centering=False), robust_scale, True, False, []),
    ],
)
def test_missing_value_handling(
    est, func, support_sparse, strictly_positive, omit_kwargs
):
    # check that the preprocessing method lets NaN pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[
        rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing)
    ] = np.nan
    if strictly_positive:
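# A hedged standalone illustration of the behaviour the test above checks: scikit-learn
# preprocessing transformers fit on the non-missing values of each column and propagate
# NaN entries through transform unchanged. The toy array below is an assumption.
import numpy as np
from sklearn.preprocessing import QuantileTransformer

X_demo = np.array([[1.0], [2.0], [np.nan], [4.0], [5.0]])
qt_demo = QuantileTransformer(n_quantiles=4, output_distribution='uniform')
X_out = qt_demo.fit_transform(X_demo)
print(X_out.ravel())            # the third entry stays NaN, the rest map into [0, 1]
assert np.isnan(X_out[2, 0])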
def get_scaler(n_quantiles):
    if n_quantiles > 0:
        return QuantileTransformer(n_quantiles=n_quantiles,
                                   output_distribution='normal',
                                   subsample=int(1e10))
    else:
        return StandardScaler()
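# A hedged usage sketch: a positive n_quantiles yields a rank-gauss style QuantileTransformer
# (the very large subsample value effectively disables subsampling), while 0 or a negative
# value falls back to plain standardisation.
scaler = get_scaler(1000)        # quantile-based, maps features toward a normal distribution
fallback = get_scaler(0)         # StandardScaler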
def get_new_base_enc():
    return QuantileTransformer()
"Parameter1", "Parameter2", "Accuracy", "Balanced Accuracy", "MSE", "r2", "spearmanr" ]) for split in np.arange(numsplits): print("Evaluating fold " + str(split)) train_index = kfolds["fold_" + str(split)]["train"] test_index = kfolds["fold_" + str(split)]["test"] X_train, X_test = features_nosurv.iloc[train_index], features_nosurv.iloc[ test_index] y_train, y_test = surv_days[train_index], surv_days[test_index] # scale target with a quantile transform qtfm = QuantileTransformer(output_distribution='uniform', n_quantiles=150, random_state=randomstate) y_train = np.squeeze(qtfm.fit_transform(y_train.values.reshape(-1, 1))) y_test = np.squeeze(qtfm.transform(y_test.values.reshape(-1, 1))) # y_train, y_test = surv_classes[train_index], surv_classes[test_index] # for every split, perform feature selection for sel_name, sel in zip(selectornames_short, selectors): print('#####') print(sel_name) print('#####') if sel_name is "CHSQ": # shift X values to be non-negative for chsq feature selection X_train_tmp = X_train + np.abs(X_train.min()) selscore = sel(X_train_tmp, y_train)
def prepare_data(control_fmri_data, control_phenotype_data, SCHZ_fmri_data, SCHZ_phenotype_data, \ ADHD_fmri_data, ADHD_phenotype_data, BIPL_fmri_data, BIPL_phenotype_data, train_num, factor=5, sampling='bootstrap'): CTRL_num = control_phenotype_data.shape[0] SCHZ_num = SCHZ_phenotype_data.shape[0] ADHD_num = ADHD_phenotype_data.shape[0] BIPL_num = BIPL_phenotype_data.shape[0] x_context = torch.zeros([train_num+15, factor, control_phenotype_data.shape[1]]) y_context = torch.zeros([train_num+15, factor, control_fmri_data.shape[1], control_fmri_data.shape[2], control_fmri_data.shape[3]]) x_all = torch.zeros([train_num+15, factor, control_phenotype_data.shape[1]]) y_all = torch.zeros([train_num+15, factor, control_fmri_data.shape[1], control_fmri_data.shape[2], control_fmri_data.shape[3]]) rand_idx = np.random.permutation(CTRL_num) train_idx_ctrl = rand_idx[0:train_num] test_idx_ctrl = np.setdiff1d(np.array(range(CTRL_num)),train_idx_ctrl) rand_idx = np.random.permutation(SCHZ_num) train_idx_SCHZ = rand_idx[0:5] test_idx_SCHZ = np.setdiff1d(np.array(range(SCHZ_num)),train_idx_SCHZ) rand_idx = np.random.permutation(ADHD_num) train_idx_ADHD = rand_idx[0:5] test_idx_ADHD = np.setdiff1d(np.array(range(ADHD_num)),train_idx_ADHD) rand_idx = np.random.permutation(BIPL_num) train_idx_BIPL = rand_idx[0:5] test_idx_BIPL = np.setdiff1d(np.array(range(BIPL_num)),train_idx_BIPL) x_context_train = torch.cat((control_phenotype_data[train_idx_ctrl,:], SCHZ_phenotype_data[train_idx_SCHZ,:], ADHD_phenotype_data[train_idx_ADHD,:], BIPL_phenotype_data[train_idx_BIPL,:])) means = x_context_train.mean(dim = 0, keepdim = True) stds = x_context_train.std(dim = 0, keepdim = True) x_context_train = (x_context_train - means) / stds x_context_train[x_context_train != x_context_train] = 0 x_context_train[x_context_train == float("-Inf")] = 0 x_context_train[x_context_train == float("Inf")] = 0 x_context_test = torch.cat((control_phenotype_data[test_idx_ctrl,:], SCHZ_phenotype_data[test_idx_SCHZ,:], ADHD_phenotype_data[test_idx_ADHD,:], BIPL_phenotype_data[test_idx_BIPL,:]),0) x_context_test = (x_context_test - means) / stds x_context_test[x_context_test != x_context_test] = 0 x_context_test[x_context_test == float("-Inf")] = 0 x_context_test[x_context_test == float("Inf")] = 0 x_test = x_context_test x_context_test = x_context_test.unsqueeze(1).expand(-1,factor,-1) y_context_train = torch.cat((control_fmri_data[train_idx_ctrl,:,:,:], SCHZ_fmri_data[train_idx_SCHZ,:,:,:], ADHD_fmri_data[train_idx_ADHD,:,:,:], BIPL_fmri_data[train_idx_BIPL,:,:,:]),0) y_test = torch.cat((control_fmri_data[test_idx_ctrl,:,:,:], SCHZ_fmri_data[test_idx_SCHZ,:,:,:], ADHD_fmri_data[test_idx_ADHD,:,:,:], BIPL_fmri_data[test_idx_BIPL,:,:,:]),0) y_context_test = torch.zeros([y_test.shape[0], factor, y_test.shape[1], y_test.shape[2], y_test.shape[3]]) scaler = QuantileTransformer() scaler.fit(ravel_2D(np.concatenate((control_fmri_data, SCHZ_fmri_data, ADHD_fmri_data, BIPL_fmri_data),0))) for i in range(factor): if sampling == 'noise': x_context[:,i,:] = x_context_train + torch.randn(x_context_train.shape) * 0.01 x_context_test[:,i,:] = x_context_test[:,i,:] + torch.randn([x_context_test.shape[0],x_context_test.shape[2]]) * 0.01 elif sampling == 'bootstrap': x_context[:,i,:] = x_context_train[:,:] idx = np.random.randint(0,x_context_train.shape[0], x_context_train.shape[0]) for j in range(y_context_train.shape[1]): for k in range(y_context_train.shape[2]): for l in range(y_context_train.shape[3]): reg = LinearRegression() if sampling == 'noise': 
reg.fit(x_context[:,i,:].numpy(),y_context_train[:,j,k,l].numpy()) elif sampling == 'bootstrap': reg.fit(x_context[idx,i,:].numpy(),y_context_train[idx,j,k,l].numpy()) y_context[:,i,j,k,l] = torch.tensor(reg.predict(x_context[:,i,:].numpy())) y_context_test[:,i,j,k,l] = torch.tensor(reg.predict(x_context_test[:,i,:].numpy())) y_context[:,i,:,:,:] = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context[:,i,:,:,:])),y_context[:,i,:,:,:].shape)) y_context_test[:,i,:,:,:] = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context_test[:,i,:,:,:])),y_context_test[:,i,:,:,:].shape)) print(i) x_all = x_context_train.unsqueeze(1).expand(-1,factor,-1) y_all = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_context_train)),y_context_train.shape),dtype=torch.float32).unsqueeze(1).expand(-1,factor,-1,-1,-1) y_test = torch.tensor(unravel_2D(scaler.transform(ravel_2D(y_test)),y_test.shape),dtype=torch.float32) y_test = y_test.view((y_test.shape[0],1,y_test.shape[1],y_test.shape[2],y_test.shape[3])) labels = np.zeros(y_test.shape[0]) labels[len(test_idx_ctrl):] = 1 diagnosis_labels = np.zeros(y_test.shape[0]) diagnosis_labels[len(test_idx_ctrl):len(test_idx_ctrl)+len(test_idx_SCHZ)] = 1 diagnosis_labels[len(test_idx_ctrl)+len(test_idx_SCHZ):len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD)] = 2 diagnosis_labels[len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD):len(test_idx_ctrl)+len(test_idx_SCHZ)+len(test_idx_ADHD)+len(test_idx_BIPL)] = 3 return x_context, y_context, x_all, y_all, x_context_test, y_context_test, x_test, y_test, labels, diagnosis_labels, scaler
def pandas_group_quantile_transform(x):
    """Used inside the transform function after a pandas groupby operation"""
    qt = QuantileTransformer()
    return qt.fit_transform(x.values.reshape(-1, 1)).reshape(-1)
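# Example usage (added for illustration): applying the per-group quantile transform with pandas'
# groupby().transform, which is the pattern the docstring above refers to. Data is synthetic.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'store': np.repeat(['a', 'b'], 1000),
    'sales': np.random.RandomState(0).exponential(size=2000),
})
# each store's sales distribution is quantile-transformed independently
df['sales_qt'] = df.groupby('store')['sales'].transform(pandas_group_quantile_transform)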
class PowerForecaster: """ Check out the class spec at https://docs.google.com/document/d/1-ceuHfJ2bNbgmKddLTUCS0HJ1juE5t0042Mts_yEUD8v sample data is in https://drive.google.com/uc?export=download&id=1z2MBYJ8k4M5J3udlFVc2d8opE_f-S4BK """ def __init__(self, df, model=Models.PROPHET, upsample_freq=None, train_test_split_ratio=Constants.TRAIN_TEST_SPLIT_RATIO.value, epochs=Constants.EPOCHS.value, initial_epoch=Constants.INITIAL_EPOCH.value, batch_size=Constants.BATCH_SIZE.value, sliding_window_size_or_time_steps=Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value, do_shuffle=True): logging.info("resample: {}. future_prediction: {}, epochs: {}, batch_size: {}," " window_size: {}, eurons: {}" .format(Constants.RESAMPLING_FREQ.value , Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value , epochs , batch_size , sliding_window_size_or_time_steps , Constants.NEURONS.value )) if logging.getLogger().isEnabledFor(logging.INFO): explore_data(df) # first step is to create a timestamp column as index to turn it to a TimeSeries data df.index = pd.to_datetime(df[ColumnNames.DATE.value] + df[ColumnNames.TIME.value], format='%Y-%m-%d%H:%M:%S', errors='raise') if 'Unnamed: 0' in df.columns: df.drop('Unnamed: 0', axis=1, inplace=True) # keep a copy of original dataset for future comparison self.df_original = df.copy() # we interpolate temperature using prophet to use it in a multivariate forecast temperature = ColumnNames.TEMPERATURE.value interpolated_df = facebook_prophet_filter(df, temperature, Constants.FORECASTED_TEMPERATURE_FILE.value) interpolated_df.index = df.index df[[temperature]] = interpolated_df[[ColumnNames.FORECAST.value]] # lets also interpolate missing kwh using facebook prophet (or we could simply drop them) # now turn to kwh and make the format compatible with prophet power = ColumnNames.POWER.value interpolated_df = facebook_prophet_filter(df, power, Constants.FORECASTED_POWER_FILE.value) interpolated_df.index = df.index df[[power]] = interpolated_df[[ColumnNames.FORECAST.value]] df = df.rename(columns={power: ColumnNames.LABEL.value}) df.drop(columns=[ColumnNames.DATE.value, ColumnNames.TIME.value, ColumnNames.DAY_OF_WEEK.value, ColumnNames.MONTH.value], inplace=True ) if upsample_freq is not None: df = df.resample(upsample_freq).mean() # for any regression or forecasting it is better to work with normalized data self.transformer = QuantileTransformer() # handle outliers better than MinMaxScalar features = ColumnNames.FEATURES.value normalized = normalize(df, features, transformer=self.transformer) # we use the last part (after 12/1/2013) that doesnt have temperature for testing cutoff_date = Constants.CUTOFF_DATE.value self.df = normalized[normalized.index < cutoff_date] self.testing = normalized[normalized.index >= cutoff_date] self.df[ColumnNames.DATE_STAMP.value] = self.df.index self.df_blocked = None self.train_test_split_ratio = train_test_split_ratio self.model_type = model self.train_X, self.test_X, self.train_test_split_index = self.train_test_split(self.df[features]) self.train_y, self.test_y, _ = self.train_test_split(self.df[ColumnNames.LABELS.value]) self.model_fit = None self.epochs = epochs self.initial_epoch = initial_epoch self.batch_size = batch_size self.history = None # following is defines in sliding_window self.do_shuffle = do_shuffle self.val_idx = None self.shuffled_X = None self.shuffled_y = None self.train = None self.label = None self.train_size = None self.val_size = None if logging.getLogger().isEnabledFor(logging.INFO): explore_data(self.df) def 
train_test_split(self, df): split_index = int(self.train_test_split_ratio * df.shape[0]) train = df.iloc[:split_index, :] test = df.iloc[split_index:, :] return train, test, split_index def stationary_test(self): dataset = self.test_y.dropna() seasonal_dataset = sm.tsa.seasonal_decompose(dataset, freq=365) fig = seasonal_dataset.plot() fig.set_figheight(8) fig.set_figwidth(15) fig.show() def p_value(dataset): # ADF-test(Original-time-series) dataset.dropna() p_value = sm.tsa.adfuller(dataset, regression='ct') logging.debug('p-value:{}'.format(p_value)) p_value = sm.tsa.adfuller(dataset, regression='c') logging.debug('p-value:{}'.format(p_value)) p_value(self.train_y) p_value(self.test_y) # Test works for only 12 variables, check the eigenvalues johnsen_test = coint_johansen(self.df[ColumnNames.FEATURES.value].dropna(), -1, 1).eig return johnsen_test def seasonal_prediction(self): from statsmodels.tsa.api import SimpleExpSmoothing y_hat_avg = self.test_y.copy() fit2 = SimpleExpSmoothing(np.asarray(self.train_y['Count'])).fit(smoothing_level=0.6, optimized=False) y_hat_avg['SES'] = fit2.forecast(len(self.test_y)) plt.figure(figsize=(16, 8)) plt.plot(self.train_y['Count'], label='Train') plt.plot(self.test_y['Count'], label='Test') plt.plot(y_hat_avg['SES'], label='SES') plt.legend(loc='best') plt.show() def fit(self): if self.model_type == Models.PROPHET: self.prophet_fit() elif self.model_type == Models.ARIMA: self.arima_fit() elif self.model_type == Models.VAR: self.var_fit() elif self.model_type == Models.LSTM: self.lstm_fit() else: raise ValueError("{} is not defined".format(self.model_type)) def evaluate(self): self.loss_metrics = self.model_type.value.evaluate( self.val_X, self.val_y, batch_size=self.batch_size, verbose=0 ) logging.info("Metric names:{}".format(self.model_type.value.metrics_names)) logging.info("Loss Metrics:{}".format(self.loss_metrics)) def resultToDataFrame(self, data, start_index, end_index, do_scale_back=False): label_column = ColumnNames.LABEL.value df = self.df.iloc[start_index:end_index] df[label_column] = data if do_scale_back: features = ColumnNames.FEATURES.value df[features] = self.transformer.inverse_transform(df[features]) return df[[label_column]] def block_after_date(self, start_block_date_st): index, _ = find_index(self.df, start_block_date_st) logging.debug("Index of block is {} with length of {}".format(index, len(self.df) - index)) self.df_blocked = self.df.iloc[index:] self.df_blocked.reindex() logging.info("Blocked from {} to {} fromo training and validation" .format(self.df_blocked.index[0], self.df_blocked.index[-1])) def adjust_index_and_training_shift(self, start_date_in_labeling_st , training_duration_in_frequency = None , start_date_training_st = None ): logging.debug("Original range data of data: [{}-{}]".format(self.df.index[0], self.df.index[-1])) index_start_labeling, _ = find_index(self.df, start_date_in_labeling_st) if start_date_training_st is not None: index_start_training, _ = find_index(self.df, start_date_training_st) if index_start_labeling < index_start_training: raise ValueError("Labeling should be after training") self.shift = index_start_labeling - index_start_training else: index_start_training = 0 self.shift = index_start_labeling if training_duration_in_frequency is None: logging.info("Shift is set to be {}".format(self.shift)) else: final_index = index_start_training + training_duration_in_frequency + self.shift logging.debug("start index: {}, final_index: {}".format(index_start_training, final_index)) self.df = 
self.df.iloc[index_start_training:index_start_training + training_duration_in_frequency + self.shift] logging.info("Shift is set to be {}, we picked the slice of [{} : {}] for trainig".format( self.shift, self.df.index[0] , self.df.index[-1] )) def lstm_predict(self, model , start_date_to_predict_st=None , duration_in_freq = None , do_scale_back = False ): X, true_y = self.get_whole() if start_date_to_predict_st is not None: y_index_i, _ = find_index(self.df, start_date_to_predict_st) x_index_i = 0 if y_index_i <= self.shift else y_index_i - self.shift x_index_f = x_index_i + duration_in_freq y_index_f = y_index_i + duration_in_freq logging.info("Predicting time slice [{} : {}] from [{} : {}]".format( self.df.index[y_index_i],self.df.index[y_index_f] , self.df.index[x_index_i], self.df.index[x_index_f] )) X = X[x_index_i:x_index_f] true_y = true_y[y_index_i:y_index_f] predicted = model.predict(X) logging.debug("Predicted Labels shape: {}".format(predicted.shape)) plt.plot(predicted, 'r') plt.plot(true_y, 'b') plt.show() df_predicted = self.resultToDataFrame(predicted, x_index_i + self.shift , x_index_f + self.shift, do_scale_back) return df_predicted def scale_back(self, df_predicted, start_index, end_index): label_column = ColumnNames.LABEL.value features = ColumnNames.FEATURES.value df = self.df[features].iloc[start_index:end_index] df[label_column] = df_predicted[label_column] scaled_predicted = self.transformer.inverse_transform(df[features]) df[features] = scaled_predicted return df def prophet_fit(self): past = self.train_y.copy() past[ColumnNames.DATE_STAMP.value] = self.train_y.index self.model_type.value.fit(past) def arima_fit(self): model = sm.tsa.statespace.SARIMAX(self.train_y, order=Constants.SARIMAX_ORDER.value, seasonal_order=Constants.SARIMAX_SEASONAL_ORDER.value) # ,enforce_stationarity=False, enforce_invertibility=False, freq='15T') logging.debug("SARIMAX fitting ....") self.model_fit = self.model_type.value.fit() self.model_fit.summary() logging.debug("SARIMAX forecast", self.model_fit.forecast()) def var_fit(self): logging.debug("making VAR model") model = VAR(endog=self.train_X[ColumnNames.FEATURES.value].dropna()) logging.debug("VAR fitting ....") self.model_fit = model.fit() print(self.model_fit.summary()) def lstm_fit(self): if logging.getLogger().isEnabledFor(logging.INFO): print(self.model_type.value.summary()) callbacks = Callbacks(Constants.MODEL_NAME.value, self.batch_size, self.epochs) X, y = self.get_shuff_train_label() self.history = self.model_type.value.fit( X, y, epochs=self.epochs, batch_size=self.batch_size, validation_split=0.35, verbose=0, callbacks=callbacks.getDefaultCallbacks(), initial_epoch=self.initial_epoch, ) logging.debug("history of performance:{}".format(self.history.history)) def predict(self, feature_set=None): future = feature_set if feature_set is not None \ else Constants.DEFAULT_FUTURE_PERIODS.value if self.model_type == Models.PROPHET: self.future = self.model_type.value.make_future_dataframe(periods=future, freq=Constants.DEFAULT_FUTURE_FREQ.value, include_history=False) if self.model_type == Models.PROPHET: predicted = self.model_type.value.predict(self.future) predicted[ColumnNames.LABEL.value] = predicted[ColumnNames.FORECAST.value] elif self.model_type == Models.ARIMA: predicted = self.arima_predict(future) elif self.model_type == Models.VAR: predicted = self.var_predict(future) elif self.model_type == Models.LSTM: return self.lstm_predict(self.model.value, start_date_to_predict_st="2013-6-01", duration_in_freq=3 * 30) else: 
raise ValueError("{} is not defined".format(self.model_type)) df_predicted = self.resultToDataFrame(predicted, self.train_test_split_index , self.train_test_split_index + len(predicted)) return df_predicted def arima_predict(self, future): end = str(self.train_y.index[-1]) start = str(self.train_y.index[-future]) print(start, end) predicted = self.model_fit.predict(start=start[:10], end=end[:10], dynamic=True) return predicted def var_predict(self, future): predicted_array = self.model_fit.forecast(self.model_fit.y, future) predicted = pd.DataFrame(predicted_array) predicted.columns = ColumnNames.FEATURES.value predicted.index = self.test_y.index[:len(predicted)] return predicted def sliding_window(self): # Generate the data matrix length0 = self.df.shape[0] window_size = Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value future_time_steps = Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value features_column = ColumnNames.FEATURES.value label_column = ColumnNames.LABEL.value sliding_window_feature = np.zeros((length0 - window_size - future_time_steps, window_size, len(features_column))) sliding_window_label = np.zeros((length0 - window_size - future_time_steps, 1)) for counter in range(length0 - window_size - future_time_steps): sliding_window_label[counter, :] = self.df[label_column][counter + window_size + future_time_steps] for counter in range(length0 - window_size - future_time_steps): sliding_window_feature[counter, :] = self.df[features_column][ counter: counter + window_size] if self.do_shuffle: logging.debug('Random shuffeling') length = sliding_window_feature.shape[0] if self.df_blocked is not None: length -= len(self.df_blocked) logging.info("length of data reduced by {} due to blocking. The last date is {}" .format(len(self.df_blocked), self.df.index[length])) logging.debug("sliding window length: {}".format(length)) split_ratio = Constants.TRAIN_TEST_SPLIT_RATIO.value idx = np.random.choice(length, length, replace=False) if self.do_shuffle else np.arange(length) self.val_idx = idx[int(split_ratio * length):] feature_window_shuffled = sliding_window_feature[idx, :] label_window_shuffled = sliding_window_label[idx, :] self.shuffled_X = feature_window_shuffled self.shuffled_y = label_window_shuffled self.train = sliding_window_feature self.label = sliding_window_label self.train_X = self.shuffled_X[:int(split_ratio * length), :] self.train_y = self.shuffled_y[:int(split_ratio * length), :] self.train_size = int(split_ratio * length) self.val_X = self.shuffled_X[int(split_ratio * length):, :] self.val_y = self.shuffled_y[int(split_ratio * length):, :] self.val_size = length - self.train_size def get_shuff_train_label(self): X = self.shuffled_X # np.expand_dims(self.shuffled_X, axis=-1) Y = self.shuffled_y return X, Y def evaluate_performance(self): # make a prediction X = self.test_X # np.expand_dims(self.test_X, axis=-1) yhat = self.model_type.value.predict(X) test_X = self.test_X.reshape((self.test_X.shape[0], self.test_X.shape[2])) # invert scaling for forecast inv_yhat = pd.concatenate((yhat, test_X[:, 1:]), axis=1) inv_yhat = self.transformer.inverse_transform(inv_yhat) inv_yhat = inv_yhat[:, 0] # invert scaling for actual test_y = self.test_y.reshape((len(self.test_y), 1)) inv_y = pd.concatenate((test_y, test_X[:, 1:]), axis=1) inv_y = self.transformer.inverse_transform(inv_y) inv_y = inv_y[:, 0] # calculate RMSE rmse = sqrt(mean_squared_error(inv_y, inv_yhat)) logging.debug('Test RMSE: %.3f' % rmse) def plot_future(self, predicted): self.model_type.value.plot(predicted, 
xlabel='Date', ylabel='KWH') self.model_type.value.plot_components(predicted) # by_dow.plot(xticks=ticks, style=style, title='Averaged on Days of the Week') # plt.show() def visual_inspection(self): style = [':', '--', '-'] pd.plotting.register_matplotlib_converters() df = self.df self.df_original[ColumnNames.ORIGINAL_FEATURES.value].plot(style=style, title='Original Data') plt.show() self.df[ColumnNames.FEATURES.value].plot(style=style, title='Normalized Data') plt.show() sampled = df.resample('M').sum()[ColumnNames.FEATURES.value] sampled.plot(style=style, title='Aggregated Monthly') plt.show() sampled = df.resample('W').sum()[ColumnNames.FEATURES.value] sampled.plot(style=style, title='Aggregated Weekly') plt.show() sampled = df.resample('D').sum()[ColumnNames.FEATURES.value] sampled.rolling(30, center=True).sum().plot(style=style, title='Aggregated Daily') plt.show() by_time = df.groupby(by=df.index.time).mean()[ColumnNames.FEATURES.value] ticks = 4 * 60 * 60 * np.arange(6) by_time.plot(xticks=ticks, style=style, title='Averaged Hourly') plt.show() days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] def tick(x): if x % 24 == 12: return days[int(x) // 24] else: return "" # ax.xaxis.set_major_formatter(NullFormatter()) # ax.xaxis.set_minor_formatter(FuncFormatter(tick)) # ax.tick_params(which="major", axis="x", length=10, width=1.5) #by_dow = df.groupby(by=df.dow).mean()[ColumnNames.FEATURES.value] #ticks = 4 * 60 * 60 * np.arange(6) def plot_prediction(self, start_index, end_index): style = [':', '--', '-'] pd.plotting.register_matplotlib_converters() label_column = ColumnNames.LABELS.value # import pdb; pdb.set_trace() t = self.train.index.iloc[start_index:end_index] X = self.train.iloc[start_index: end_index] true_y = self.label.iloc[start_index, end_index] y = self.model_type.value.predict(X) plt.plot(t, y, true_y, style=style) plt.show() def plot_history(self): plt.plot(np.arange(self.epochs - self.initial_epoch), self.history.history['loss'], label='train') plt.plot(np.arange(self.epochs - self.initial_epoch), self.history.history['val_loss'], label='validation') plt.legend() plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') plt.show() def get_next_train_batch(self): # getting the next train batch if self.pointer + self.batchsize >= self.train_size: end = self.train_size start = self.pointer self.pointer = 0 self.epoch += 1 else: end = self.pointer + self.batchsize start = self.pointer self.pointer += self.batchsize X = self.train_data[start:end, :] Y = self.train_label[start:end, :] return X, Y def get_val(self): X = np.expand_dims(self.val_data, axis=-1) return X, self.val_label[:] def get_whole(self): # get whole, for validation set X = self.train[:, :] # np.expand_dims(self.train[:, :], axis=-1) Y = self.label[:, :] return X, Y def reset(self): self.pointer = 0 self.epoch = 0
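# Hypothetical end-to-end sketch for PowerForecaster (added here; the file name and column layout
# are assumptions, not part of the original code). The CSV is expected to carry the Date/Time,
# temperature and kWh columns referenced by ColumnNames above.
df = pd.read_csv('household_power.csv')          # placeholder path
forecaster = PowerForecaster(df, model=Models.LSTM, upsample_freq='H')
forecaster.sliding_window()                      # build (window, features) training tensors
forecaster.fit()                                 # dispatches to lstm_fit() for Models.LSTM
forecaster.evaluate_performance()                # RMSE on the held-out windows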
class NNClassifier: ''' Usage: clf = NNClassifier(**params) history = clf.fit( X_train, y_train, X_valid, y_valid, early_stopping_rounds ) ''' def __init__(self, input_shape=1024, input_dropout=0.2, hidden_layers=1, hidden_units=64, hidden_dropout=0.2, batch_norm="none", learning_rate=0.05, batch_size=64, epochs=10000): self.input_shape = int(input_shape) # layer param self.input_dropout = input_dropout # layer param self.hidden_layers = int(hidden_layers) # layer param self.hidden_units = int(hidden_units) # layer param self.hidden_dropout = hidden_dropout # layer param self.batch_norm = batch_norm # layer param self.learning_rate = learning_rate # optimizer param self.batch_size = int(batch_size) # fit param self.epochs = int(epochs) # fit param def fit(self, X_train, y_train, X_valid, y_valid, early_stopping_rounds): # Data standardization self.transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal') X_train = self.transformer.fit_transform(X_train) X_valid = self.transformer.transform(X_valid) # layers self.model = Sequential() self.model.add(Dropout(self.input_dropout, input_shape=(self.input_shape,))) for i in range(self.hidden_layers): self.model.add(Dense(self.hidden_units)) if self.batch_norm == 'before_act': self. model.add(BatchNormalization()) self.model.add(ReLU()) self.model.add(Dropout(self.hidden_dropout)) self.model.add(Dense(1, activation='sigmoid')) # Optimazer optimizer = Adam(lr=self.learning_rate, beta_1=0.9, beta_2=0.999, decay=0.) # Compile self.model.compile( loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'] ) # train early_stopping = EarlyStopping(patience=early_stopping_rounds, restore_best_weights=True) self.history = self.model.fit( X_train, y_train, epochs=self.epochs, batch_size=self.batch_size, verbose=1, validation_data=(X_valid, y_valid), callbacks=[early_stopping] ) return self.history def predict(self, x): x = self.transformer.transform(x) y_pred = self.model.predict(x).astype("float64") y_pred = y_pred.flatten() return y_pred def get_model(self): return self.model
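# Minimal usage sketch for NNClassifier (added; synthetic data, hyperparameters are examples only).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=1024, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

clf = NNClassifier(input_shape=1024, hidden_layers=2, hidden_units=128,
                   learning_rate=0.001, batch_size=128, epochs=100)
history = clf.fit(X_tr, y_tr, X_va, y_va, early_stopping_rounds=10)
proba = clf.predict(X_va)   # flattened sigmoid outputs in [0, 1]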
class CutOff(TransformerMixin): def fit(self, X, y=None, **fit_params): return self def transform(self, X, y=None, **fit_params): X[X > 3] = 3 X[X < -3] = -3 return X # Preprocessing for numerical data num_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='constant')), ('scale', RobustScaler(quantile_range=[5, 95])), ('quantile', QuantileTransformer( n_quantiles=300, output_distribution='normal', random_state=0)), ('cutoff', CutOff()), # Cut off at 3 standard deviations ('norm', Normalizer(norm='l2')) ]) # Preprocessing for nominal categorical data cat_transformer_nominal = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='constant')), ('pca', PCA(whiten=True, random_state=0)), ('bins', KBinsDiscretizer(n_bins=100, encode='onehot', strategy='quantile')), ('norm', Normalizer(norm='l2')), ]) # Preprocessing for ordinal categorical data cat_transformer_ordinal = Pipeline(steps=[
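# Sketch (added): wiring the numerical and nominal pipelines above into a single ColumnTransformer.
# The ordinal pipeline is truncated in the source, so it is left out here; column names are
# placeholders for whatever the real dataset uses.
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, ['age', 'income']),            # placeholder numeric columns
    ('cat', cat_transformer_nominal, ['city', 'channel']),  # placeholder nominal columns
])
# X_processed = preprocessor.fit_transform(X)  # with X holding the columns listed above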
def rgb_burn_in(red, green, blue, burn_in_array, color=None, min_value=None, max_value=None, colormap='viridis', fade=1, uniform_distribution=False, no_data_value=-9999, valid_value=1, transp=0.0): """ Burn in a mask or a specific parameter into an RGB image for visualization purposes. The burn_in_array will be copied where values are different from no_data_value. :param uniform_distribution: convert the input values in a uniform histogram :param colormap: matplotlib colormap (string) to create the RGB ramp :param max_value: maximum value :param min_value: minimum value :param red: Original red band :param green: Original green band :param blue: Original blue band :param burn_in_array: Values to be burnt in :param no_data_value: Value to ne unconsidered :param color: Tuple of color (R, G, B) to be used in the burn in :param fade: Fade the RGB bands to emphasize the copied values :param transp: Transparency to use in the mask (0=opaque 1=completely transparent) :return: RGB image bands """ if color: new_red = np.where(burn_in_array == valid_value, color[0] * (1 - transp) + red * (transp), red * fade) new_green = np.where(burn_in_array == valid_value, color[1] * (1 - transp) + green * (transp), green * fade) new_blue = np.where(burn_in_array == valid_value, color[2] * (1 - transp) + blue * (transp), blue * fade) else: # the mask is where the value equals no_data_value mask = (burn_in_array == no_data_value) # the valid values are those outside the mask (~mask) burn_in_values = burn_in_array[~mask] # apply scalers to uniform the data if uniform_distribution: burn_in_values = QuantileTransformer().fit_transform(burn_in_values[:, np.newaxis])[:, 0] # burn_in_values = MinMaxScaler((0, 0.3)).fit_transform(burn_in_values) # rgb_burn_in_values = DWutils.gray2color_ramp(burn_in_values[:, 0], limits=(0, 0.3)) rgb_burn_in_values = DWutils.gray2color_ramp(burn_in_values, min_value=min_value, max_value=max_value, colormap=colormap, limits=(0, 0.25)) # return the scaled values to the burn_in_array # burn_in_array[~mask] = burn_in_values[:, 0] # calculate a color_ramp for these pixels # rgb_burn_in_values = DWutils.gray2color_ramp(burn_in_array, limits=(0, 0.3)) # new_red = np.where(burn_in_array == no_data_value, red, rgb_burn_in_values[:, 0]) # new_green = np.where(burn_in_array == no_data_value, green, rgb_burn_in_values[:, 1]) # new_blue = np.where(burn_in_array == no_data_value, blue, rgb_burn_in_values[:, 2]) # return the scaled values to the burn_in_array burn_in_array[~mask] = rgb_burn_in_values[:, 0] burn_in_red = np.copy(burn_in_array) burn_in_array[~mask] = rgb_burn_in_values[:, 1] burn_in_green = np.copy(burn_in_array) burn_in_array[~mask] = rgb_burn_in_values[:, 2] burn_in_blue = np.copy(burn_in_array) # burn in the values new_red = np.where(burn_in_array == no_data_value, red*fade, burn_in_red) new_green = np.where(burn_in_array == no_data_value, green*fade, burn_in_green) new_blue = np.where(burn_in_array == no_data_value, blue*fade, burn_in_blue) return new_red, new_green, new_blue
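# Illustrative call to rgb_burn_in (added example): burn a binary mask into a random RGB scene in
# solid blue while fading the original bands slightly. Array shapes and values are synthetic.
import numpy as np

rng = np.random.RandomState(0)
red, green, blue = (rng.rand(256, 256) for _ in range(3))
water_mask = (rng.rand(256, 256) > 0.9).astype(int)      # 1 where the mask should be painted

new_r, new_g, new_b = rgb_burn_in(red, green, blue, burn_in_array=water_mask,
                                  color=(0.1, 0.2, 0.9), fade=0.8, transp=0.2)
rgb = np.dstack([new_r, new_g, new_b])                    # ready for plt.imshow(rgb)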
from sklearn.preprocessing import PowerTransformer from sklearn.preprocessing import QuantileTransformer from sklearn.model_selection import train_test_split print(__doc__) N_SAMPLES = 1000 FONT_SIZE = 6 BINS = 30 rng = np.random.RandomState(304) bc = PowerTransformer(method='box-cox') yj = PowerTransformer(method='yeo-johnson') qt = QuantileTransformer(output_distribution='normal', random_state=rng) size = (N_SAMPLES, 1) # lognormal distribution X_lognormal = rng.lognormal(size=size) # chi-squared distribution df = 3 X_chisq = rng.chisquare(df=df, size=size) # weibull distribution a = 50 X_weibull = rng.weibull(a=a, size=size) # gaussian distribution
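# Short continuation sketch (added; the original example goes on to build a full comparison figure):
# apply the three transformers to one of the skewed samples and compare histograms of the results.
import matplotlib.pyplot as plt

X_train, X_test = train_test_split(X_lognormal, test_size=0.5, random_state=rng)
fig, axes = plt.subplots(1, 4, figsize=(12, 3))
axes[0].hist(X_test, bins=BINS)
axes[0].set_title('original (lognormal)')
for ax, (name, tfm) in zip(axes[1:], [('Box-Cox', bc), ('Yeo-Johnson', yj), ('Quantile', qt)]):
    ax.hist(tfm.fit(X_train).transform(X_test), bins=BINS)
    ax.set_title(name)
plt.tight_layout()
plt.show()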
# In[74]: train_features = pd.read_csv('../input/lish-moa/train_features.csv') train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv') train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv') test_features = pd.read_csv('../input/lish-moa/test_features.csv') submission = pd.read_csv('../input/lish-moa/sample_submission.csv') GENES = [col for col in train_features.columns if col.startswith('g-')] CELLS = [col for col in train_features.columns if col.startswith('c-')] for col in (GENES + CELLS): transformer = QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal") vec_len = len(train_features[col].values) vec_len_test = len(test_features[col].values) raw_vec = train_features[col].values.reshape(vec_len, 1) transformer.fit(raw_vec) train_features[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0] test_features[col] = transformer.transform(test_features[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0] def seed_everything(seed=42): random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.backends.cudnn.deterministic = True
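# Aside (added): the per-column loop above fits one QuantileTransformer per feature. Because the
# transformer already operates column-wise, an equivalent and faster variant fits a single
# transformer on all g-/c- columns at once, still using only the training rows for the fit.
qt_all = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal')
train_features[GENES + CELLS] = qt_all.fit_transform(train_features[GENES + CELLS])
test_features[GENES + CELLS] = qt_all.transform(test_features[GENES + CELLS])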