import datetime
from timeit import default_timer as timer

import numpy as np
from sklearn.preprocessing import RobustScaler

import utils


def flatten_and_standardize_dataset(model, dest):
    target_ds_preprocessed = utils.open_pickle(model.target_ds_preprocessed_path)
    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)

    # reshaping
    reshapestarttime = timer()
    print(f"{utils.time_now()} - Reshaping data now...")
    print(f"\n{utils.time_now()} - reshaping rhum dataarrays now, "
          f"total levels to loop: {model.rhum_pressure_levels}.")

    reshaped_unnorma_darrays = {'rhum': {}, 'uwnd': {}, 'vwnd': {}}

    for level in model.rhum_pressure_levels:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['rhum'][level] = np.reshape(
            target_ds_preprocessed.rhum.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))

    print(f"\n{utils.time_now()} - reshaping uwnd/vwnd dataarrays now, "
          f"total levels to loop: {model.uwnd_vwnd_pressure_lvls}.")

    for level in model.uwnd_vwnd_pressure_lvls:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['uwnd'][level] = np.reshape(
            target_ds_preprocessed.uwnd.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))
        reshaped_unnorma_darrays['vwnd'][level] = np.reshape(
            target_ds_preprocessed.vwnd.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))

    reshapetime = str(datetime.timedelta(seconds=timer() - reshapestarttime)).split(".")[0]
    print(f'Time taken: {reshapetime}.\n')

    # stacking unstandardized dataarrays
    stackingstarttime = timer()
    print("Stacking unstandardized dataarrays now...")
    stacked_unstandardized_ds = np.hstack(
        [reshaped_unnorma_darrays[var][lvl]
         for var in reshaped_unnorma_darrays
         for lvl in reshaped_unnorma_darrays[var]])
    stackingtime = str(datetime.timedelta(seconds=timer() - stackingstarttime)).split(".")[0]
    print(f'Time taken: {stackingtime}.\n')

    # standardizing the stacked dataarrays
    standardizestarttime = timer()
    print("Standardizing stacked dataarrays now...")
    print(f'"stacked_unstandardized_ds.shape" is {stacked_unstandardized_ds.shape}')
    transformer = RobustScaler(quantile_range=(25, 75))
    standardized_stacked_arr = transformer.fit_transform(stacked_unstandardized_ds)  # som & kmeans training
    transformer.get_params()  # return value unused
    standardizetime = str(datetime.timedelta(seconds=timer() - standardizestarttime)).split(".")[0]
    print(f'That took {standardizetime} to complete.\n')

    standardized_stacked_arr_path = utils.to_pickle(
        'standardized_stacked_arr', standardized_stacked_arr, dest)
    return standardized_stacked_arr_path
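# --- Added sketch (not from the original source) ---
# A minimal, self-contained illustration of the reshape -> hstack ->
# RobustScaler pattern used above, on synthetic arrays; the shapes and
# pressure-level values are illustrative assumptions.
import numpy as np
from sklearn.preprocessing import RobustScaler

n_datapoints, lat_size, lon_size = 100, 8, 12
levels = [700, 850]

# one (time, lat, lon) cube per level, flattened to (time, lat*lon)
reshaped = {
    lvl: np.random.rand(n_datapoints, lat_size, lon_size).reshape(
        n_datapoints, lat_size * lon_size)
    for lvl in levels
}

# stack levels side by side: (time, n_levels * lat * lon)
stacked = np.hstack([reshaped[lvl] for lvl in levels])

# subtract each column's median and divide by its IQR; robust to outliers
standardized = RobustScaler(quantile_range=(25, 75)).fit_transform(stacked)
print(standardized.shape)  # (100, 192)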
from sklearn.preprocessing import RobustScaler as RS

from niaaml.preprocessing.feature_transform.feature_transform_algorithm import FeatureTransformAlgorithm
from niaaml.utilities import ParameterDefinition


class RobustScaler(FeatureTransformAlgorithm):
    r"""Implementation of the robust scaler.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler

    See Also:
        * :class:`niaaml.preprocessing.feature_transform.FeatureTransformAlgorithm`
    """
    Name = 'Robust Scaler'

    def __init__(self, **kwargs):
        r"""Initialize RobustScaler."""
        self._params = dict(
            with_centering=ParameterDefinition([True, False]),
            with_scaling=ParameterDefinition([True, False])
        )
        self.__robust_scaler = RS()

    def fit(self, x, **kwargs):
        r"""Fit implemented transformation algorithm.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to fit transformation algorithm.
        """
        self.__robust_scaler.fit(x)

    def transform(self, x, **kwargs):
        r"""Transform the given x data.

        Arguments:
            x (pandas.core.frame.DataFrame): Data to transform.

        Returns:
            pandas.core.frame.DataFrame: Transformed data.
        """
        return self.__robust_scaler.transform(x)

    def to_string(self):
        r"""User-friendly representation of the object.

        Returns:
            str: User-friendly representation of the object.
        """
        return FeatureTransformAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__robust_scaler.get_params()))
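# --- Added usage sketch (assumption, not part of the NiaAML source) ---
# How the wrapper above would be used on a small DataFrame; the data are
# made up. Note that transform() actually returns a numpy array, since it
# delegates straight to sklearn's RobustScaler.
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 100.0], 'b': [3.0, 4.0, 5.0]})
scaler = RobustScaler()  # the NiaAML wrapper defined above
scaler.fit(df)
transformed = scaler.transform(df)
print(scaler.to_string())  # renders the configured sklearn parameters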
from sklearn.preprocessing import RobustScaler


def scale_data(trainX, testX):
    """
    Scale windowed data by flattening it to 2D first.

    :param trainX: (array) shape (samples, timesteps, features)
    :param testX: (array) shape (samples, timesteps, features)
    :return:
        trainX: (array)
        testX: (array)
    """
    # remove overlap between consecutive windows
    cut = int(trainX.shape[1] / 2)
    longX = trainX[:, -cut:, :]
    # flatten windows
    longX = longX.reshape((longX.shape[0] * longX.shape[1], longX.shape[2]))
    # flatten train and test
    flatTrainX = trainX.reshape(
        (trainX.shape[0] * trainX.shape[1], trainX.shape[2]))
    flatTestX = testX.reshape(
        (testX.shape[0] * testX.shape[1], testX.shape[2]))
    # standardize
    s = RobustScaler()
    # fit on training data only, so the test set never leaks into the scaler
    s.fit(longX)
    # Note: after fitting, RobustScaler exposes center_ and scale_;
    # it has no mean_ or var_ attributes (those belong to StandardScaler).
    print(s.get_params(True))
    # apply to training and test data
    longX = s.transform(longX)
    flatTrainX = s.transform(flatTrainX)
    flatTestX = s.transform(flatTestX)
    # reshape back to (samples, timesteps, features)
    flatTrainX = flatTrainX.reshape(trainX.shape)
    flatTestX = flatTestX.reshape(testX.shape)
    return flatTrainX, flatTestX
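# --- Added usage sketch (assumption, not from the original source) ---
# scale_data() expects 3D windowed arrays of shape
# (samples, timesteps, features); the data below are synthetic.
import numpy as np

trainX = np.random.rand(20, 10, 3)  # 20 windows, 10 steps, 3 channels
testX = np.random.rand(5, 10, 3)
trainX_scaled, testX_scaled = scale_data(trainX, testX)
print(trainX_scaled.shape, testX_scaled.shape)  # (20, 10, 3) (5, 10, 3)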
from sklearn import neural_network, svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import RobustScaler


def classify(datapath, v, normalize=True):
    """Run regression and classification benchmarks on the wine datasets.

    :param datapath: directory name of the datasets
    :param v: verbose, True or False
    :param normalize: if True, the training data is normalized
    """
    # Grab both wine datasets in one dataset
    concat_data = get_data(datapath)

    # Bag data to 5 scores
    recode = {3: 0, 4: 0, 5: 1, 6: 2, 7: 3, 8: 4, 9: 4}
    concat_data['quality_c'] = bag_data(recode, concat_data, 'quality')

    # Split up dataset 70/30 training/testing
    y_wine = concat_data['quality_c']
    X_wine = concat_data.drop(['quality_c', 'quality'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X_wine, y_wine, test_size=0.3, random_state=420)
    # save test and train X sets for classification
    X_train_c, X_test_c = X_train.copy(), X_test.copy()

    if normalize:
        # Normalize training examples by removing the median and scaling by the
        # interquartile range (more robust to outliers than unit-variance scaling)
        sclr = RobustScaler()
        X_train = sclr.fit_transform(X_train)
        # get_params() returns constructor hyperparameters, not the fitted
        # medians/IQRs, so this round-trip is a no-op; the test set is scaled
        # by the scaler already fitted on the training set
        scl_params = sclr.get_params()
        sclr = sclr.set_params(**scl_params)
        X_test = sclr.transform(X_test)

    # Set parameters by cross validation

    # ==========================================================================
    # REGRESSION PROBLEM
    # ==========================================================================
    # Multivariate Linear Regression
    # (note: the `normalize` argument was removed from LinearRegression in
    # scikit-learn 1.2; on recent versions, drop it and scale the inputs instead)
    clf = LinearRegression(fit_intercept=True, normalize=True, copy_X=True)
    clf.fit(X_train, y_train)
    # Make predictions for both sets
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    print('=' * 100 + "\nLinear Regression:\n")
    print_metrics(clf, X_train, y_train, X_test, y_test,
                  pred_train, pred_test, verbose=v, problem_type=1)

    # ==========================================================================
    # Support Vector Machine (kernel=rbf), Regression
    clf = svm.SVR(C=3, kernel='rbf')
    clf.fit(X_train, y_train)
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    print('=' * 100 + "\nSVR :\n")
    print_metrics(clf, X_train, y_train, X_test, y_test,
                  pred_train, pred_test, verbose=v, problem_type=1)

    # ==========================================================================
    # NN Regression, default params
    # Grid search
    h_max = 2  # maximum number of hidden layers
    hidden_layer_sizes = build_grid(h_max)
    tuned_param = {'hidden_layer_sizes': hidden_layer_sizes}
    clf = GridSearchCV(neural_network.MLPRegressor(), tuned_param, cv=3)
    clf.fit(X_train, y_train)
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    print('=' * 100 + "\nNNs :\n")
    print_metrics(clf, X_train, y_train, X_test, y_test,
                  pred_train, pred_test, verbose=v, problem_type=1)
    print("Best params:", clf.best_params_)

    # ==========================================================================
    # CLASSIFICATION PROBLEM
    # ==========================================================================
    # Restore normalized examples back to original
    X_train, X_test = X_train_c, X_test_c

    # Support Vector Machine (kernel=rbf), Classification
    clf = svm.SVC(C=3, kernel='rbf', random_state=0)
    clf.fit(X_train, y_train)
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    print('=' * 100 + "\nSVC :\n")
    print_metrics(clf, X_train, y_train, X_test, y_test,
                  pred_train, pred_test, verbose=v)

    # ==========================================================================
    # Support Vector Machine (kernel=rbf), One-vs-Rest Classification
    clf = OneVsRestClassifier(estimator=svm.SVC(C=3, kernel='rbf', random_state=1))
    clf.fit(X_train, y_train)
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    print('=' * 100 + "\nSVC(OneVsRest):\n")
    print_metrics(clf, X_train, y_train, X_test, y_test,
                  pred_train, pred_test, verbose=v)

    # ==========================================================================
    # NN Classification
    # Grid search
    h_max = 2  # maximum number of hidden layers
    hidden_layer_sizes = build_grid(h_max)
    tuned_param = {'hidden_layer_sizes': hidden_layer_sizes}
    clf = GridSearchCV(neural_network.MLPClassifier(), tuned_param, cv=3)
    clf.fit(X_train, y_train)
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    print('=' * 100 + "\nNNs :\n")
    print_metrics(clf, X_train, y_train, X_test, y_test,
                  pred_train, pred_test, verbose=v)
    print("Best params:", clf.best_params_)
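# --- Added sketch (assumption, not from the original source) ---
# Clarifies the get_params()/set_params() round-trip in classify():
# get_params() returns constructor hyperparameters only, never the fitted
# medians/IQRs, so copying them into a fresh scaler does not make it ready
# to transform. The fitted statistics live in center_ and scale_.
import numpy as np
from sklearn.preprocessing import RobustScaler

X_train = np.random.rand(50, 4)
fitted = RobustScaler().fit(X_train)

fresh = RobustScaler().set_params(**fitted.get_params())
# fresh.transform(X_train) would raise NotFittedError here
print(fitted.center_, fitted.scale_)           # the actual fitted statistics
X_test_scaled = fitted.transform(np.random.rand(10, 4))  # reuse the fitted scaler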
'''
# Can't use pipelines, unfortunately
pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('NN', baseline_model()),
])
'''
if not load_weights:
    epochs = 50  # 50 -> 200
    batch_size = 5120
    validation_split = 0.2  # defined but not passed to fit() below

    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    params = scaler.get_params()

    model = baseline_model(input_dim, out_dim)
    history = model.fit(
        X_train_scaled, y_train,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        class_weight=class_weight,
        sample_weight=df_train['weight'].values,
    )
    store_model(model, scaler, version=version)
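# --- Added sketch (assumption, not from the original source) ---
# One way a store_model()-style helper could persist the fitted scaler next
# to the network weights, so inference reuses the same medians/IQRs; the
# file names are illustrative. joblib ships with scikit-learn installs.
import joblib

def save_scaler(scaler, version):
    joblib.dump(scaler, f'scaler_v{version}.joblib')

def load_scaler(version):
    return joblib.load(f'scaler_v{version}.joblib')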
import pandas as pd
from sklearn.preprocessing import RobustScaler
from torch.utils.data import Dataset  # assumption: Dataset is PyTorch's

df_x.head()

# Load y data
if on_colab:
    data_dir = 'SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'
else:
    data_dir = r'F:\temp\thesisdata\SAATCHI_MICRO_DATASET_PRICE_VIEWSLIKES.tsv'

df_y = pd.read_csv(data_dir, sep='\t')
df_y.set_index('FILENAME', inplace=True)

# scale y data: one scaler per target column
scaler_price = RobustScaler().fit(df_y[['PRICE']].values)
scaler_rating = RobustScaler().fit(df_y[['LIKES_VIEWS_RATIO']].values)

scalar_params_price = scaler_price.get_params(deep=True)
scalar_params_rating = scaler_rating.get_params(deep=True)

scaled_price = scaler_price.transform(df_y[['PRICE']].values)
scaled_rating = scaler_rating.transform(df_y[['LIKES_VIEWS_RATIO']].values)

df_y['PRICE'] = scaled_price
df_y['LIKES_VIEWS_RATIO'] = scaled_rating
df_y.head()

# Join x and y into a single dataframe
df = df_y.join(df_x)
df.head()


class SaatchiDataset(Dataset):
    training_set = df[:13000]
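# --- Added sketch (assumption, not from the original notebook) ---
# Because PRICE and LIKES_VIEWS_RATIO were robust-scaled above, model
# outputs can be mapped back to their original units with the fitted
# scalers' inverse_transform; the prediction values below are made up.
import numpy as np

preds_scaled = np.array([[0.1], [0.5], [-0.3]])  # illustrative model outputs
preds_price = scaler_price.inverse_transform(preds_scaled)
preds_rating = scaler_rating.inverse_transform(preds_scaled)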
import numpy as np
import xarray as xr
from sklearn.preprocessing import RobustScaler

import utils


def prep_for_testing_random_dates(model):
    indp_vars_raw_data_paths = utils.find('*nc', model.test_indp_vars_raw_data_dir)
    RF_raw_data_paths = utils.find('*nc4', model.test_RF_raw_data_dir)

    CHOSEN_VARS_ds = [rf"{path}"
                      for var in model.CHOSEN_VARS
                      for path in indp_vars_raw_data_paths
                      if f"{var}" in path]

    ds_CHOSEN_VARS_renamed = xr.open_mfdataset(
        CHOSEN_VARS_ds, chunks={'time': 4}).rename({
            'latitude': 'lat',
            'longitude': 'lon',
            'r': 'rhum',
            'u': 'uwnd',
            'v': 'vwnd'
        })
    ds_CHOSEN_VARS_renamed = utils.remove_expver(ds_CHOSEN_VARS_renamed)

    ds_sliced = ds_CHOSEN_VARS_renamed.sel(
        level=slice(np.min(model.unique_pressure_lvls), np.max(model.unique_pressure_lvls)),
        lat=slice(model.LAT_N, model.LAT_S),
        lon=slice(model.LON_W, model.LON_E))

    ds_sliced_rhum = ds_sliced.rhum
    ds_sliced_rhum_no925 = ds_sliced_rhum.drop_sel({"level": 925})
    ds_sliced_uwnd_only = ds_sliced.uwnd
    ds_sliced_vwnd_only = ds_sliced.vwnd
    ds_combined_sliced = xr.merge(
        [ds_sliced_rhum_no925, ds_sliced_uwnd_only, ds_sliced_vwnd_only],
        compat='override')

    # note the reversed lat slice: the rainfall data has ascending latitudes
    ds_RAINFALL = xr.open_mfdataset(RF_raw_data_paths).sel(
        lat=slice(model.LAT_S, model.LAT_N),
        lon=slice(model.LON_W, model.LON_E))
    ds_RAINFALL['time'] = ds_RAINFALL.indexes['time'].to_datetimeindex()

    # keep only timestamps present in both datasets
    valid_datetimes = [i for i in ds_combined_sliced.time.data if i in ds_RAINFALL.time.data]
    target_ds_preprocessed = ds_combined_sliced.sel(time=valid_datetimes)

    desired_res = .75
    coarsen_magnitude = int(
        desired_res / np.ediff1d(target_ds_preprocessed.isel(lon=slice(0, 2)).lon.data)[0])
    print(f'Coarsen magnitude set at: {coarsen_magnitude} '
          f'toward desired spatial resolution of {desired_res}')
    target_ds_preprocessed = target_ds_preprocessed.coarsen(
        lat=coarsen_magnitude, lon=coarsen_magnitude, boundary='trim').mean()
    rf_ds_preprocessed = ds_RAINFALL.sel(time=valid_datetimes)

    utils.to_pickle('target_ds_preprocessed', target_ds_preprocessed, model.test_prepared_data_dir)
    utils.to_pickle('rf_ds_preprocessed', rf_ds_preprocessed, model.test_prepared_data_dir)

    reshaped_unnorma_darrays = {'rhum': {}, 'uwnd': {}, 'vwnd': {}}
    n_datapoints = target_ds_preprocessed.time.size
    lat_size = target_ds_preprocessed.lat.size
    lon_size = target_ds_preprocessed.lon.size

    for level in model.rhum_pressure_levels:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['rhum'][level] = np.reshape(
            target_ds_preprocessed.rhum.sel(level=level).values,
            (n_datapoints, lat_size * lon_size))

    print(f"\n{utils.time_now()} - reshaping uwnd/vwnd dataarrays now, "
          f"total levels to loop: {model.uwnd_vwnd_pressure_lvls}.")

    for level in model.uwnd_vwnd_pressure_lvls:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['uwnd'][level] = np.reshape(
            target_ds_preprocessed.uwnd.sel(level=level).values,
            (n_datapoints, lat_size * lon_size))
        reshaped_unnorma_darrays['vwnd'][level] = np.reshape(
            target_ds_preprocessed.vwnd.sel(level=level).values,
            (n_datapoints, lat_size * lon_size))

    # stacking unstandardized dataarrays
    print("Stacking unstandardized dataarrays now...")
    stacked_unstandardized_ds = np.hstack(
        [reshaped_unnorma_darrays[var][lvl]
         for var in reshaped_unnorma_darrays
         for lvl in reshaped_unnorma_darrays[var]])

    # standardizing the stacked dataarrays
    print("Standardizing stacked dataarrays now...")
    print(f'"stacked_unstandardized_ds.shape" is {stacked_unstandardized_ds.shape}')
    transformer = RobustScaler(quantile_range=(25, 75))
    standardized_stacked_arr = transformer.fit_transform(stacked_unstandardized_ds)  # som & kmeans training
    transformer.get_params()  # return value unused

    utils.to_pickle('standardized_stacked_arr', standardized_stacked_arr,
                    model.test_prepared_data_dir)
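# --- Added sanity check (assumption, not from the original source) ---
# After RobustScaler(quantile_range=(25, 75)), each column of the
# standardized array should have median ~0 and interquartile range ~1
# (except zero-IQR columns, which sklearn leaves unscaled). This could be
# dropped into the function above just before pickling.
import numpy as np

med = np.median(standardized_stacked_arr, axis=0)
iqr = (np.percentile(standardized_stacked_arr, 75, axis=0)
       - np.percentile(standardized_stacked_arr, 25, axis=0))
print(np.abs(med).max())      # expected: close to 0
print(np.abs(iqr - 1).max())  # expected: close to 0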