# NOTE(review): paste-mangled fragment -- an entire script section collapsed
# onto one physical line, and the final my_standardizer(...) call is cut off
# mid-argument, so it cannot be reformatted or repaired safely from this view.
# From the visible tokens it appears to: list npz files, skip tracks with
# fewer than 2000 distance samples, interpolate NaNs, split off 'SSHA_105' as
# the label, drop the SSHA columns from the features, and standardize both
# matrix and label -- TODO confirm against the original (un-mangled) script.
#filename = r'D:\vlachos\Documents\KV MSc thesis\Data\Satellite\Gulf Stream_1\npz_files_sral_slstr\S3A_2018-05-10 02_08_39__2018-05-10 02_05_24.npz'.replace('\\','\\') models_path = r"C:\Users\vlachos\Desktop\SSTlevel4".replace('\\', '\\') filespath = r'C:\Users\vlachos\Desktop\npz_files_sral_sstL4_1DCNN_real'.replace( '\\', '\\') npz_files = os.listdir(filespath) # load npz files. encoding argument is used only if npz files have been # saved using py2s.x and are loaded by py3.x # Calculate Maximum distance vector size d = [] fff = [] font = {'size': 18} plt.rc('font', **font) for filename in npz_files: plt.close('all') matrix, distance, _ = ml_utilities.feature_matrix_from_npz( os.path.join(filespath, filename)) if distance.size < 2000: continue # Imputate NaNs matrix, _ = ml_utilities.imputate_nans_feature_matrix(matrix, method='Interpolate', drop_nan=False) label = np.array(matrix['SSHA_105']) # label = ml_utilities.matrix_min_max_rescale(label, 1, -1, axis=0) matrix = matrix.drop(columns=['SSHA_35', 'SSHA_71', 'SSHA_105']) # matrix = ml_utilities.matrix_min_max_rescale(matrix, 0.5, -0.5, axis=0) matrix = np.array(matrix) matrix = ml_utilities.my_standardizer(matrix, matrix) # standardize label = ml_utilities.my_standardizer(np.expand_dims(label, axis=1), np.expand_dims(label,
# =============================================================================
# Predict along-track SSHA with a previously trained Random-Forest model and
# prepare a plot for the npz file(s) whose date stamp matches the model's.
# (Reconstructed from a paste-mangled one-line fragment.)
# =============================================================================
# Folder holding the paired SRAL/SLSTR npz feature files.
# Raw strings already keep the backslashes; the original
# .replace('\\', '\\') calls replaced a backslash with itself (a no-op)
# and have been dropped.
path_npzfiles = r'D:\vlachos\Documents\KV MSc thesis\Data\Satellite\Gulf Stream_1\npz_files_sral_slstr'
npz_files = [item for item in os.listdir(path_npzfiles) if 'npz' in item]

model_name = 'S3B_2019-03-28 14_55_41__2019-03-28 01_16_43_RF_slstr_model.sav'

# Load the trained model ONCE, outside the loop, and close the file handle.
# The original re-opened the pickle on every matching iteration and never
# closed the handle (resource leak).
# NOTE(review): pickle.load is unsafe on untrusted files; assumed safe here
# because the model file was produced by this project -- confirm.
with open(os.path.join(path_models, model_name), 'rb') as model_file:
    model = pickle.load(model_file)

for npz in npz_files:
    # Guard clause (replaces the original `if ...: pass / else: continue`):
    # only process npz files whose date stamp matches the model's.
    if npz[4:14] != model_name[4:14]:
        continue

    # Read npz file into a feature DataFrame plus the along-track distance.
    matrix, distance, _ = ml_utilities.feature_matrix_from_npz(
        os.path.join(path_npzfiles, npz))
    # Interpolate NaNs; drop rows that could not be filled.
    matrix, idx_nan = ml_utilities.imputate_nans_feature_matrix(
        matrix, method='Interpolate', drop_nan=True)

    # Target is the 35 Hz SSHA; features are everything except the label
    # and the SST columns listed below.
    label = np.array(matrix['SSHA_35'])
    matrix = matrix.drop(columns=['SSHA_35', 'SST_125km', 'SST_95km',
                                  'SST_75km', 'SST_32km', 'SST_16km',
                                  'SST_12.5km'])
    matrix_labels = list(matrix.columns)  # keep feature matrix names
    matrix = np.array(matrix)

    # Predict
    y_hat = model.predict(matrix)

    # PLOT
    font = {'size': 18}
    plt.rc('font', **font)
# NOTE(review): paste-mangled fragment -- the section below is collapsed onto
# one physical line and is truncated right after `for filename in npz_files:
# try:`, so the loop body is missing and the code cannot be reformatted or
# completed safely from this view.  The visible part: lists the npz files,
# derives the feature-column names from the first file (after dropping the
# SSHA/SST columns in var_to_drop), and initialises empty accumulators
# (bad, RMSE_test, RMSE_train, matrix, label) for a per-file loop -- TODO
# confirm the loop body against the original script.
# ============================================================================= # BEGIN # ============================================================================= filespath = r'H:\MSc_Thesis_05082019\Data\Satellite\Gulf Stream_1\npz_files_sral_slstr'.replace( '\\', '\\') npz_files = os.listdir(filespath) npz_files = [filename for filename in npz_files if '.npz' in filename] N_npz_files = len(npz_files) models_path = r'C:\Users\vlachos\Desktop\MLP'.replace('\\', '\\') # Derive names of variables var_to_drop = [ 'SSHA_35', 'SST_125km', 'SST_95km', 'SST_75km', 'SST_32km', 'SST_16km', 'SST_12.5km' ] matrix, _, _ = ml_utilities.feature_matrix_from_npz( os.path.join(filespath, npz_files[0])) matrix = matrix.drop(columns=var_to_drop) matrix_labels = list(matrix.columns) # keep feature matrix names del matrix i = 1 bad = [] RMSE_test = [] RMSE_train = [] matrix = pd.DataFrame(columns=matrix_labels, dtype=np.float32) label = pd.DataFrame(dtype=np.float32) for filename in npz_files: try: # Progress
# NOTE(review): paste-mangled loop INTERIOR -- this fragment references names
# bound outside the visible text (counter_2, N_temp_npz_files, file_name,
# filespath) and is cut off mid-way through the NaN-handling section, so it
# cannot be reformatted or repaired safely from this view.  Visible behavior:
# prints a progress percentage, loads one npz via
# ml_utilities.feature_matrix_from_npz (the commented-out np.load path is the
# older py2-pickle loading route it replaced), then Akima-interpolates NaNs
# column-wise with limit=150 in both directions -- TODO confirm the
# surrounding loop against the original script.
# Progress sys.stdout.write("\rProgress ... {0:.2f} %".format( (counter_2 / N_temp_npz_files) * 100)) sys.stdout.flush() # load npz files. encoding argument is used only if npz files have been # # saved using py2.x and are loaded by py3.x # dat = np.load(os.path.join(filespath, file_name), encoding='latin1', allow_pickle=True) # # # Retrieve dictionary # dat = dat['arr_0'].item() # # Keep distance in variable # # distance = dat['Distance'] # del dat['Metadata'], dat['Distance'] fullpath = os.path.join(filespath, file_name) data_temp, distance, _ = ml_utilities.feature_matrix_from_npz( fullpath) # raise Exception('STOPPED CODE') # ============================================================================= # MISSING VALUES HANDLING # ============================================================================= # Assign label and feature matrix to temporary variables # data_temp = pd.DataFrame.from_dict(dat, dtype=np.float32) if True: # 1) IMPUTATION OF NANs- INTERPOLATION AND DROP THE REST OF THE NANS # Interpolate the NAN values inside the dataset data_temp = data_temp.interpolate(method='akima', limit=150, limit_direction='both', axis=0) # Interpolate (actually extrapolate) the values at the edges
# NOTE(review): paste-mangled fragment -- collapsed onto one physical line
# and truncated inside the unfinished `matrix_labels_out = [...` list
# literal, so it cannot be reformatted or completed safely from this view.
# Visible behavior: builds a RandomForestRegressor from **params, then loops
# over npz files printing progress, imputing NaNs (Interpolate + drop), and
# taking 'SSHA_35' as the label; matrix_labels_out appears to list columns
# to exclude from the feature matrix (OLCI/KD490/CHL/ADG/TSM products) --
# TODO confirm the rest of the list and the loop body against the original.
model = skensemble.RandomForestRegressor(**params) i = 1 bad = [] RMSE_test = [] RMSE_train = [] importance_list = [] for filename in npz_files: try: plt.close('all') # Progress sys.stdout.write("\rFiles {0} out of {1}".format(i, N_npz_files)) sys.stdout.flush() fullpath = os.path.join(path, filename) matrix, distance, metadata = ml_utilities.feature_matrix_from_npz( fullpath) # ============================================================================= # MISSING VALUES IMPUTATION # ============================================================================= matrix, _ = ml_utilities.imputate_nans_feature_matrix( matrix, method='Interpolate', drop_nan=True) label = np.array(matrix['SSHA_35']) # matrix = matrix.drop(columns='SSHA_35') # matrix_labels_out = [item for item in matrix.columns.to_list() if ('OLCI' not in item) or ('CHL' not in item)] # matrix_labels_out = ['SSHA_35', 'SST_12.5km', 'SST_32km', 'SST_125km'] matrix_labels_out = [ 'SSHA_35', 'KD490_M07_OLCI_12.5km', 'CHL_OC4ME_OLCI_5km', 'KD490_M07_OLCI_5km', 'ADG443_NN_OLCI_5km', 'TSM_NN_OLCI_12.5km',
# Per-sensor product root folders on the local machine.
paths = {
    'SRAL': r'C:\Users\vlachos\Desktop\SRAL'.replace('\\', '\\'),
    'OLCI': r'C:\Users\vlachos\Desktop\OLCI'.replace('\\', '\\'),
    'SLSTR': r'C:\Users\vlachos\Desktop\SLSTR'.replace('\\', '\\')
}

# Read npz list
path = r'C:\Users\vlachos\Desktop\npz_files_sral_slstr_olci_RF'.replace(
    '\\', '\\')
npz_files = [name for name in os.listdir(path) if 'npz' in name]

# Correct for the missing OLCI name in the npz filenames: recover the OLCI
# timestamp from each file's metadata string and reformat it for use in a
# filename (colons are not allowed on Windows, hence %H_%M_%S).
olci_dates = []
for filename in npz_files:
    _, _, metadata = ml_utilities.feature_matrix_from_npz(
        os.path.join(path, filename))
    stamp = dt.datetime.strptime(metadata.split(' ')[3], '%Y%m%dT%H%M%S')
    olci_dates.append(dt.datetime.strftime(stamp, '%Y-%m-%d %H_%M_%S'))

# Append the recovered OLCI date to each filename (minus its '.npz' suffix).
npz_files = [
    name[:-4] + '__' + date + '.npz'
    for name, date in zip(npz_files, olci_dates)
]

# Folder names with the common dates
common_date = s3utilities.find_common_dates(paths)
common_date = ml_utilities.products_from_npz(common_date, npz_files)
# Remove duplicates