def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, us=[0],
                name='pendulum-gym-image-dataset.pkl', **kwargs):
    data = {}
    assert save_dir is not None
    # path = '{}/pendulum-small-angle-image-dataset.pkl'.format(save_dir)
    path = os.path.join(save_dir, name)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_frames_force = []
        trajs_force = []
        for u in us:
            trajs_frames, trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_frames_force.append(trajs_frames)
            trajs_force.append(trajs)

        # make a train/test split
        split_ix = int(samples * test_split)
        tmp = np.stack(trajs_frames_force, axis=0)  # (n_u, n_ts, n_trial, 50, 50)
        data['x'], data['test_x'] = tmp[:, :, :split_ix, :, :], tmp[:, :, split_ix:, :, :]
        tmp = np.stack(trajs_force, axis=0)  # (n_u, n_ts, n_trial, 3)
        data['obs'], data['test_obs'] = tmp[:, :, :split_ix, :], tmp[:, :, split_ix:, :]
        data['t'] = tspan
        data['us'] = us

        to_pickle(data, path)
    return data
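# A self-contained sketch of the stack-and-split pattern used in get_dataset above
# (shapes illustrative): per-u trajectory arrays are stacked on a new leading axis,
# then split into train/test along the trial axis.
import numpy as np

n_u, n_ts, n_trial = 2, 10, 6
trajs_per_u = [np.random.rand(n_ts, n_trial, 50, 50) for _ in range(n_u)]
tmp = np.stack(trajs_per_u, axis=0)                     # (n_u, n_ts, n_trial, 50, 50)
split_ix = int(n_trial * 0.5)
x, test_x = tmp[:, :, :split_ix], tmp[:, :, split_ix:]  # first half train, second half test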
def cut_dataset(model, alpha, dest, dataset_path, ds_name):
    dataset = utils.open_pickle(dataset_path)
    try:
        dataset = dataset.sel(
            level=slice(np.min(model.tl_model.unique_pressure_lvls), np.max(model.tl_model.unique_pressure_lvls)),
            lat=slice(model.tl_model.LAT_N, model.tl_model.LAT_S),
            lon=slice(model.tl_model.LON_W, model.tl_model.LON_E),
            time=slice('1999', '2019'))
    except ValueError:
        dataset = dataset.sel(
            lat=slice(model.tl_model.LAT_S, model.tl_model.LAT_N),
            lon=slice(model.tl_model.LON_W, model.tl_model.LON_E),
            time=slice('1999', '2019'))

    if model.tl_model.period == "NE_mon":
        dataset = dataset.sel(time=is_NE_mon(dataset['time.month']))
    elif model.tl_model.period == "SW_mon":
        dataset = dataset.sel(time=is_SW_mon(dataset['time.month']))
    elif model.tl_model.period == "inter_mon":
        dataset = dataset.sel(time=is_inter_mon(dataset['time.month']))

    if alpha != model.ALPHAs:
        gt_years = model.tl_model.years[(alpha - 1) * model.PSI : alpha * model.PSI]
        train_years = np.delete(model.tl_model.years,
                                np.arange((alpha - 1) * model.PSI, alpha * model.PSI))
    else:
        gt_years = model.tl_model.years[(alpha - 1) * model.PSI : alpha * model.PSI + model.runoff_years]
        train_years = np.delete(model.tl_model.years,
                                np.arange((alpha - 1) * model.PSI, alpha * model.PSI + model.runoff_years))
    test = utils.cut_year(dataset, np.min(gt_years), np.max(gt_years))
    train = utils.cut_year(dataset, np.min(train_years), np.max(train_years))

    time.sleep(1); gc.collect()
    utils.to_pickle(f'{ds_name}_test_alpha_{alpha}_preprocessed', test, dest)
    utils.to_pickle(f'{ds_name}_train_alpha_{alpha}_preprocessed', train, dest)
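# A small sketch of the alpha-fold year split performed in cut_dataset above
# (illustrative numbers: 10 years, PSI=2): fold `alpha` holds out a contiguous
# block of PSI years as ground truth and trains on the remaining years.
import numpy as np

years = np.arange(2000, 2010)
PSI, alpha = 2, 3
gt_years = years[(alpha - 1) * PSI : alpha * PSI]                          # [2004 2005]
train_years = np.delete(years, np.arange((alpha - 1) * PSI, alpha * PSI))  # the other 8 years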
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, us=[0],
                name='acrobot-gym-image-dataset-rgb-0.pkl', **kwargs):
    data = {}
    assert save_dir is not None
    path = save_dir + '/' + name
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_frames_force = []
        trajs_force = []
        for u in us:
            trajs_frames, trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_frames = np.moveaxis(trajs_frames, -1, -3) / 255.0  # HWC -> CHW, scale to [0, 1]
            trajs_frames_force.append(trajs_frames)
            trajs_force.append(trajs)

        # make a train/test split
        split_ix = int(samples * test_split)
        tmp = np.stack(trajs_frames_force, axis=0)  # (n_u, n_ts, n_trial, 3, 64, 64)
        data['x'], data['test_x'] = tmp[:, :, :split_ix], tmp[:, :, split_ix:]
        tmp = np.stack(trajs_force, axis=0)  # (n_u, n_ts, n_trial, 3)
        data['obs'], data['test_obs'] = tmp[:, :, :split_ix, :], tmp[:, :, split_ix:, :]
        data['t'] = tspan
        data['us'] = us

        to_pickle(data, path)
    return data
def save_preloaded_raw_rf_data(model):
    rf_fpaths = get_files(model.raw_rf_dir)
    ds_RAINFALL = xr.open_mfdataset(rf_fpaths)
    utils.to_pickle('ds_RAINFALL', ds_RAINFALL, model.raw_rf_dir)
    return ds_RAINFALL
def main():
    # Run the main application
    model_idx = train_ffnn_models()
    mse, mae, mape = run_ensemble(model_idx + 1)
    results = {"mse": mse, "mae": mae, "mape": mape}
    print(results)
    utils.to_pickle(Config.output_path + "results", results)
def main(input_fname, output_fname):
    guide_to_seqnames = from_pickle(input_fname)
    fingerprinted_guides = [(fingerprint(guide), guide)
                            for guide in guide_to_seqnames.keys()]
    fingerprinted_guides.sort(key=lambda a: a[0])
    sorted_guides = [guide for _, guide in fingerprinted_guides]
    to_pickle(output_fname, sorted_guides)
def train_SOM(self, alpha=None):
    d_hp_dir_path = str(utils.models_dir / self.dir_hp_str)
    self.d_hp_dir_path = d_hp_dir_path
    os.makedirs(d_hp_dir_path, exist_ok=True)
    if not utils.find(f'*extent_{self.dir_str}.png', self.d_hp_dir_path):
        visualization.get_domain_geometry(self, self.d_hp_dir_path)

    models_dir_path = str(utils.models_dir / self.dir_hp_str / self.period) + f'_{self.month_names_joined}'
    os.makedirs(models_dir_path, exist_ok=True)
    self.models_dir_path = models_dir_path
    # utils.update_cfgfile('Paths', 'models_dir_path', self.models_dir_path)

    if alpha:
        destination = self.alpha_model_dir
        arr_path = self.alpha_standardized_stacked_arr_path
        prefix = f'alpha_{alpha}_'
        prompt = f'< alpha-{alpha} >'
    else:
        destination = self.models_dir_path
        arr_path = self.standardized_stacked_arr_path
        prefix = ''
        prompt = ''
    print(f'Destination: "{destination}", arr_path: "{arr_path}", prefix: "{prefix}"')

    if utils.find(f'*{prefix}som_model.pkl', destination):
        print(f'{utils.time_now()} - SOM model trained before, skipping...')
        self.som_model_path = utils.find(f'*{prefix}som_model.pkl', destination)[0]
    else:
        print(f'{utils.time_now()} - {prompt} No SOM model trained for {self.domain}, {self.period}, '
              f'for {self.hyperparameters}, doing so now...')
        standardized_stacked_arr = utils.open_pickle(arr_path)

        sominitstarttime = timer(); print(f'{utils.time_now()} - Initializing MiniSom... ')
        som = MiniSom(self.gridsize, self.gridsize,  # square grid
                      standardized_stacked_arr.shape[1],
                      sigma=self.sigma, learning_rate=self.learning_rate,
                      neighborhood_function='gaussian', random_seed=self.random_seed)
        """
        Note: initializing the weights via PCA is faster (~1/2 hour), but for serialized
        arrays > 300 MB, chances are this will exhaust the RAM and halt the entire process.
        """
        ## try:
        ##     som.pca_weights_init(standardized_stacked_arr)
        ## except MemoryError as e:
        ##     print(f'Memory error has occurred: \n{e}')
        print(f"Initialization took {utils.time_since(sominitstarttime)}.\n")

        trainingstarttime = timer(); print(f"{utils.time_now()} - Beginning training.")
        getattr(som, self.training_mode)(standardized_stacked_arr, self.iterations, verbose=True)
        q_error = np.round(som.quantization_error(standardized_stacked_arr), 2)
        print(f"Training complete. Q error is {q_error}, time taken for training is "
              f"{utils.time_since(trainingstarttime)}s\n")

        self.som_model_path = utils.to_pickle(f'{self.RUN_datetime}_{prefix}som_model', som, destination)
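# A minimal, runnable MiniSom sketch mirroring the training call above (synthetic
# data; the grid size and hyperparameter values here are illustrative, not the
# model's own configuration):
import numpy as np
from minisom import MiniSom

data = np.random.default_rng(0).normal(size=(500, 20))        # (samples, features)
som = MiniSom(6, 6, data.shape[1], sigma=1.0, learning_rate=0.5,
              neighborhood_function='gaussian', random_seed=0)
som.train_random(data, 1000)                                   # 1000 training iterations
print(np.round(som.quantization_error(data), 2))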
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns a PDE dataset.'''
    path = '{}/{}-dataset.pkl'.format(save_dir, experiment_name)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = make_dataset(experiment_name, **kwargs)
        to_pickle(data, path)

        # plot each rebuilt trajectory alongside its energy and mass
        os.makedirs('{}/data/'.format(save_dir), exist_ok=True)
        import matplotlib as mpl
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        u_all = np.concatenate([data['u'], data['test_u']], axis=0)
        energy_all = np.concatenate([data['energy'], data['test_energy']], axis=0)
        mass_all = u_all.sum(-1).squeeze(-1)
        for idx in range(len(u_all)):
            u = u_all[idx]
            energy = energy_all[idx]
            mass = mass_all[idx]
            fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, figsize=(6., 6.), facecolor='white')
            t = data['t_eval']
            M = u.shape[-1]
            y = np.arange(M) / M
            T, Y = np.meshgrid(t, y)
            if experiment_name.startswith('ch'):
                ax1.pcolormesh(T, Y, u.squeeze(1).T, cmap='seismic', vmin=-1, vmax=1)
            else:
                ax1.pcolormesh(T, Y, u.squeeze(1).T, cmap='seismic')
            ax1.set_aspect('auto')
            ax1.set_yticks((0 - .5 / M, 1 - .5 / M))
            ax1.set_yticklabels((0, 1))
            ax2.plot(t, energy)
            ax3.plot(t, mass)
            ax3.set_xticks((t[0], t[-1]))
            ax3.set_xticklabels((t[0], t[-1]))
            fig.savefig('{}/data/data_{}_{:02d}.png'.format(save_dir, experiment_name, idx))
            plt.close()
    return data
def _preprocess_data():
    # Load store sales with state
    all_stores = pd.read_pickle(Config.store_data_path)

    # Add columns for day of week and day of month
    trn_sls_dte = pd.to_datetime(all_stores.trn_sls_dte)
    all_stores["dayofweek"] = trn_sls_dte.dt.dayofweek
    all_stores["dayofmonth"] = trn_sls_dte.dt.day

    # Load weather data
    weather = pd.read_csv(Config.weather_data_path)

    # Join store sales and weather data
    store_weather = pd.merge(all_stores, weather,
                             left_on=["store_id", "trn_sls_dte"],
                             right_on=["location", "date"])
    store_weather.drop(['date', 'location', 'max_temp', 'min_temp'], axis=1, inplace=True)

    # Sort the dataframe by store and date
    store_weather.sort_values(["store_id", "trn_sls_dte"], inplace=True)

    # Remove stores with fewer than 180 days of data
    store_ids = store_weather.store_id.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id == sid]
        if c_store.shape[0] < 180:
            store_weather = store_weather[store_weather.store_id != sid]

    # Shift the response variables by 1 day to get previous-day features
    store_weather["p_total_revenue"] = store_weather.total_revenue.shift(1)
    store_weather["p_total_volume"] = store_weather.total_volume.shift(1)

    # Drop the first day of each store (the global shift leaks the previous store's last value into it)
    store_size = store_weather.groupby(["store_id"]).size()
    row_drop_idx = np.cumsum(store_size) - store_size
    store_weather.drop(store_weather.index[row_drop_idx], axis=0, inplace=True)

    # Back up store_ids
    store_weather["store_id_bk"] = store_weather.store_id

    # Factorize categorical features
    cat_cols = ["dayofweek", "dayofmonth", "state", "isholiday", "store_id"]
    for col in cat_cols:
        store_weather[col] = pd.factorize(store_weather[col])[0]

    # Drop the time column
    store_weather.drop(["trn_sls_dte"], axis=1, inplace=True)

    # Save the preprocessed dataframe as a pickle
    utils.to_pickle(Config.save_dir + "store_weather.pkl", store_weather)
    return store_weather
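# A hedged alternative to the global shift(1) + drop-first-row-per-store steps above:
# groupby().shift() computes the lag within each store directly, leaving NaN on each
# store's first day, which can then be dropped. Column names mirror the ones above.
import pandas as pd

df = pd.DataFrame({'store_id': [1, 1, 2, 2], 'total_revenue': [10., 11., 20., 21.]})
df['p_total_revenue'] = df.groupby('store_id')['total_revenue'].shift(1)
df = df.dropna(subset=['p_total_revenue'])  # removes the first day of each store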
def save_preloaded_raw_input_data(model):
    input_fpaths = get_files(model.raw_input_dir)
    CHOSEN_VARS_ds = []
    for var in model.CHOSEN_VARS:
        CHOSEN_VARS_ds.append([rf"{_}" for _ in input_fpaths if f"{var}" in _])
    ds_CHOSEN_VARS_renamed = xr.open_mfdataset(CHOSEN_VARS_ds, chunks={'time': 4}).rename({
        'latitude': 'lat',
        'longitude': 'lon',
        'r': 'rhum',
        'u': 'uwnd',
        'v': 'vwnd'})
    utils.to_pickle('ds_CHOSEN_VARS_renamed', ds_CHOSEN_VARS_renamed, model.raw_input_dir)
    return ds_CHOSEN_VARS_renamed
def _train_test_split():
    # Build the store_weather dataframe
    store_weather_filename = Config.save_dir + "store_weather.pkl"
    if os.path.exists(store_weather_filename):
        store_weather = utils.from_pickle(store_weather_filename)
    else:
        store_weather = _preprocess_data()

    # Split train/test for each store
    train = pd.DataFrame({})
    test = pd.DataFrame({})
    store_ids = store_weather.store_id_bk.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id_bk == sid]
        s_train = c_store[:-Config.test_size]
        s_test = c_store[-Config.test_size:]
        train = train.append(s_train).reset_index().drop(["index"], axis=1)
        test = test.append(s_test).reset_index().drop(["index"], axis=1)

    # Scale numeric columns
    num_cols = ["p_total_revenue", "p_total_volume", "mean_temp",
                "total_precipitation", "total_snow"]
    scaler = MaxAbsScaler().fit(train.loc[:, num_cols])
    train.loc[:, num_cols] = scaler.transform(train.loc[:, num_cols])
    test.loc[:, num_cols] = scaler.transform(test.loc[:, num_cols])

    # Scale the 2 output columns
    revenue_scale = MaxAbsScaler().fit(train.loc[:, ["total_revenue"]])
    volume_scale = MaxAbsScaler().fit(train.loc[:, ["total_volume"]])
    train.loc[:, ["total_revenue"]] = revenue_scale.transform(train.loc[:, ["total_revenue"]])
    test.loc[:, ["total_revenue"]] = revenue_scale.transform(test.loc[:, ["total_revenue"]])
    train.loc[:, ["total_volume"]] = volume_scale.transform(train.loc[:, ["total_volume"]])
    test.loc[:, ["total_volume"]] = volume_scale.transform(test.loc[:, ["total_volume"]])

    # Save the train/test dataframes as pickle objects
    utils.to_pickle(Config.save_dir + "train_set.pkl", train)
    utils.to_pickle(Config.save_dir + "test_set.pkl", test)

    # Save the 2 scalers for later use
    utils.to_pickle(Config.save_dir + "revenue_scale", revenue_scale)
    utils.to_pickle(Config.save_dir + "volume_scale", volume_scale)

    # Save store_ids
    utils.to_pickle(Config.save_dir + "store_id.pkl", store_ids)
    return train, test
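# A minimal sketch of the fit-on-train / transform-both pattern used above, which
# keeps test-set statistics out of the scaler (synthetic values):
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

train_vals = np.array([[1.0], [2.0], [4.0]])
test_vals = np.array([[8.0]])
scaler = MaxAbsScaler().fit(train_vals)  # max-abs statistic comes from train only
print(scaler.transform(test_vals))       # [[2.]] -- unseen data may fall outside [-1, 1]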
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns an orbital dataset. Also constructs
    the dataset if no saved version is available.'''
    path = '{}/{}-orbits-dataset.pkl'.format(save_dir, experiment_name)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = make_orbits_dataset(**kwargs)
        to_pickle(data, path)
    return data
def train_kmeans(self, alpha=None):
    if alpha:
        optimal_k = self.tl_model.optimal_k
        print(f'>> self.alpha_model_dir: {self.alpha_model_dir}')
        print(f'>> optimal_k: {optimal_k}')
        found = [i for i in Path(self.alpha_model_dir).glob(f'k-{optimal_k}_*')]
        if found:
            self.alpha_cluster_dir = found[0]
        else:
            self.alpha_cluster_dir = str(
                Path(self.alpha_model_dir) /
                f"k-{optimal_k}_NOT-singled-out-as-potential-cluster-for-this-split")
            os.makedirs(self.alpha_cluster_dir, exist_ok=True)
        print(f'>> self.alpha_cluster_dir: {self.alpha_cluster_dir}')
        destination = self.alpha_cluster_dir
        prefix = f'alpha_{alpha}_'
    else:
        optimal_k = self.optimal_k
        destination = self.cluster_dir
        prefix = ''
    print(f'optimal_k: "{optimal_k}", destination: "{destination}", prefix: "{prefix}"')

    for phrase in ('kmeans_model', 'labels_ar', 'labels_to_coords', 'label_markers',
                   'target_ds_withClusterLabels', 'dates_to_ClusterLabels',
                   'RFprec_to_ClusterLabels_dataset'):
        if utils.find(f'*{phrase}*.pkl', destination):
            print(f'>>>>>>>>> "self.{phrase}_path" initialized.')
            exec(f'self.{phrase}_path = utils.find(f\'*{phrase}*.pkl\', r"{destination}")[0]')
        else:
            print(f'{utils.time_now()} - No KMeans model trained for {self.domain}, {self.period}, '
                  f'for {self.hyperparameters}, doing so now...')
            som_weights_to_nodes = utils.open_pickle(self.som_weights_to_nodes_path)
            samples, features = som_weights_to_nodes.shape
            km = KMeans(n_clusters=optimal_k).fit(som_weights_to_nodes)
            print(f"\n{utils.time_now()} - K-means estimator fitted, sample size is {samples} "
                  f"and number of features is {features}.")
            self.kmeans_model_path = utils.to_pickle(f'{self.RUN_datetime}_{prefix}kmeans_model', km, destination)
            self.serialize_kmeans_products(km, alpha)
            break
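# A minimal sketch of fitting k-means on SOM node weights, as train_kmeans does
# above (synthetic weights; gridsize, feature count, and k are illustrative; the
# node ordering does not affect the clustering itself):
import numpy as np
from sklearn.cluster import KMeans

gridsize, n_features = 4, 10
som_weights = np.random.default_rng(0).normal(size=(gridsize, gridsize, n_features))
som_weights_to_nodes = som_weights.reshape(gridsize * gridsize, n_features)  # one row per node
km = KMeans(n_clusters=3, n_init=10).fit(som_weights_to_nodes)
print(km.labels_)  # a cluster label for each of the 16 SOM nodes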
def preprocess_time_series(model, dest, nfold_ALPHA=None, desired_res=0.75):
    """
    Preparing datasets for use in the training algorithms:
    - dropping missing values
    - ensuring both target & input datasets cover the same dates
    - coarsening the spatial resolution of the rainfall (target) dataset to the desired resolution
    - pickling these "preprocessed" datasets
    """
    target_ds = utils.open_pickle(model.input_ds_serialized_path)
    rf_target_ds = utils.open_pickle(model.rf_ds_serialized_path)

    # removing NA rows and extraneous dates, & coarsening dates accordingly
    print(f'{utils.time_now()} - Preprocessing data now.')

    try:
        rf_target_ds['time'] = rf_target_ds.indexes['time'].to_datetimeindex()  # converting CFTimeIndex -> DatetimeIndex
    except AttributeError:
        print('AttributeError: \'DatetimeIndex\' object has no attribute \'to_datetimeindex\', continuing regardless...')

    earliest_rf_reading, latest_rf_reading = rf_target_ds.isel(time=0).time.values, rf_target_ds.isel(time=-1).time.values
    earliest_target_ds_reading, latest_target_ds_reading = target_ds.isel(time=0).time.values, target_ds.isel(time=-1).time.values
    earliest_date = earliest_target_ds_reading if earliest_target_ds_reading > earliest_rf_reading else earliest_rf_reading
    latest_date = latest_target_ds_reading if latest_target_ds_reading < latest_rf_reading else latest_rf_reading

    rf_ds_preprocessed = rf_target_ds.sel(time=slice(earliest_date, latest_date))
    target_ds = target_ds.sel(time=slice(earliest_date, latest_date))

    more_time_gaps = [i for i in target_ds.time.data if i not in rf_ds_preprocessed.time.data]
    more_time_gaps = more_time_gaps + [i for i in rf_ds_preprocessed.time.data if i not in target_ds.time.data]
    valid_dates = [date for date in target_ds.time.data if date not in more_time_gaps]
    target_ds = target_ds.sel(time=valid_dates)

    coarsen_magnitude = int(desired_res / np.ediff1d(target_ds.isel(lon=slice(0, 2)).lon.data)[0])
    print(f'Coarsen magnitude set at: {coarsen_magnitude} toward desired spatial resolution of {desired_res}')
    target_ds_preprocessed = target_ds.coarsen(lat=coarsen_magnitude, lon=coarsen_magnitude, boundary='trim').mean()

    target_ds_preprocessed_path = utils.to_pickle('target_ds_preprocessed', target_ds_preprocessed, dest)
    rf_ds_preprocessed_path = utils.to_pickle('rf_ds_preprocessed', rf_ds_preprocessed, dest)

    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)

    if nfold_ALPHA:
        for alpha in range(nfold_ALPHA):
            pass

    return target_ds_preprocessed_path, rf_ds_preprocessed_path
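# A self-contained sketch of the date-alignment step above: keep only timestamps
# present in BOTH datasets. np.intersect1d is an equivalent (and typically faster)
# alternative to the list-comprehension gap hunt in preprocess_time_series.
import numpy as np
import xarray as xr

times_a = np.array(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]')
times_b = np.array(['2000-01-02', '2000-01-03', '2000-01-04'], dtype='datetime64[ns]')
ds_a = xr.Dataset({'v': ('time', [1.0, 2.0, 3.0])}, coords={'time': times_a})
ds_b = xr.Dataset({'w': ('time', [4.0, 5.0, 6.0])}, coords={'time': times_b})

common = np.intersect1d(ds_a.time.values, ds_b.time.values)
ds_a, ds_b = ds_a.sel(time=common), ds_b.sel(time=common)  # both now cover 01-02 and 01-03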
def get_dataset(self, experiment_name, save_dir):
    '''Returns the trajectory dataset. Also constructs
    the dataset if no saved version is available.'''
    path = '{}/{}-orbits-dataset_{}_EnsemblesPerEnergy_{}_OrbitLen_{}_Resolution_{}_energyPoints{}.pkl'.format(
        save_dir, experiment_name, self.integrator, self.ensembles,
        self.tspan[1], self.time_points, self.energyPoints)
    # path = "../Henon-Heiles-orbits-dataset_RK45_EnsemblesPerEnergy_20_OrbitLen_5000_Resolution_50000_energyPoints20.pkl"
    # path = "../Henon-Heiles-orbits-dataset_RK45_EnsemblesPerEnergy_20_OrbitLen_1000_Resolution_10000_energyPoints20.pkl"
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = self.make_orbits_dataset()
        to_pickle(data, path)
    return data
def select_category_value_agg(base, df, key, category_col, value, method,
                              path='../features/1_first_valid', ignore_list=[], prefix='', null_val='XNA'):
    '''
    Explain:
        Likelihood-style encoding. Restrict the rows to each value of a categorical
        column, then aggregate at the `key` granularity to create features.
        Instead of aggregating while ignoring the categorical column, aggregate per
        category value, so that each key carries the statistics of every category
        in wide format.
    Args:
    Return:
    '''
    df = df[[key, category_col, value]]

    # if the categorical column contains nulls, aggregate them as XNA
    df[category_col].fillna(null_val, inplace=True)

    # loop over the distinct values of the categorical column
    for cat_val in df[category_col].drop_duplicates().values:
        # restrict to the rows of this category value
        df_cat = df.query(f"{category_col} == '{cat_val}'")
        if df_cat[value].dtype != 'object':
            logger.info(f"\ncat: {category_col}\ncat_val: {cat_val}\nval: {value}\nmethod: {method}")
            result = base_aggregation(df_cat, key, value, method)
            result = base.merge(result, on=key, how='left')
            feature_list = [col for col in result.columns if col.count('@')]
            for feature in feature_list:
                utils.to_pickle(path=f'{path}/{prefix}{feature}.fp', obj=result[feature].values)
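# A minimal sketch of the per-category wide aggregation idea above (hypothetical
# column names; pivot_table stands in for base_aggregation + merge): for each value
# of the categorical column, aggregate `value` at the `key` granularity and keep
# the per-category statistics as separate columns.
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2],
                   'cat': ['a', 'b', 'a', 'b'],
                   'value': [10., 20., 30., 40.]})
wide = df.pivot_table(index='key', columns='cat', values='value', aggfunc='mean')
wide.columns = [f'value_mean@cat={c}' for c in wide.columns]  # one feature per category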
def main(input_fnames, output_fname):
    fnames = []
    for input_sequence in input_fnames:
        fnames.extend(glob(input_sequence))

    name_to_seqs = defaultdict(list)
    for fname in fnames:
        for seq_record in SeqIO.parse(fname, "fasta"):
            sequence = str(seq_record.seq).replace("-", "")
            name_to_seqs[seq_record.name].append(sequence)

    print("Creating database")
    guide_to_seqname = defaultdict(set)
    for seqname, sequences in name_to_seqs.items():
        for sequence in sequences:
            for i in range(len(sequence) - 22):
                guide = sequence[i:i + 22]
                guide_to_seqname[guide].add(seqname)

    to_pickle(output_fname, guide_to_seqname)
def assign_test_clusters_to_datasets(self):
    target_ds_preprocessed = utils.open_pickle(utils.find('*target_ds_preprocessed.pkl', self.test_prepared_data_dir)[0])
    rf_ds_preprocessed = utils.open_pickle(utils.find('*rf_ds_preprocessed.pkl', self.test_prepared_data_dir)[0])
    standardized_stacked_arr = utils.open_pickle(utils.find('*standardized_stacked_arr.pkl', self.test_prepared_data_dir)[0])

    self.n_datapoints = target_ds_preprocessed.time.shape[0]  # length of xr_dataset
    self.lat_size = target_ds_preprocessed.lat.shape[0]
    self.lon_size = target_ds_preprocessed.lon.shape[0]
    self.months = np.unique(target_ds_preprocessed['time.month'].values)  # month numbers
    self.month_names = [calendar.month_name[m][:3] for m in np.unique(target_ds_preprocessed['time.month'])]
    self.month_names_joined = '_'.join(self.month_names).upper()  # to print months properly
    self.years = np.unique(target_ds_preprocessed['time.year'].values)  # unique years
    self.X, self.Y = target_ds_preprocessed.lon, target_ds_preprocessed.lat

    km = utils.open_pickle(self.kmeans_model_path)
    predicted_clusters = km.predict(standardized_stacked_arr.astype(float))
    target_ds_withClusterLabels = target_ds_preprocessed.assign_coords(cluster=("time", predicted_clusters))
    dates_to_ClusterLabels = target_ds_withClusterLabels.cluster.reset_coords()
    RFprec_to_ClusterLabels_dataset = xr.merge([rf_ds_preprocessed, dates_to_ClusterLabels])

    utils.to_pickle('target_ds_withClusterLabels', target_ds_withClusterLabels, self.test_prepared_data_dir)
    utils.to_pickle('RFprec_to_ClusterLabels_dataset', RFprec_to_ClusterLabels_dataset, self.test_prepared_data_dir)
def serialize_kmeans_products(self, km, alpha):
    if alpha:
        arr_path = self.alpha_standardized_stacked_arr_path
        uniq_markers = self.tl_model.uniq_markers
        destination = self.alpha_cluster_dir
    else:
        arr_path = self.standardized_stacked_arr_path
        uniq_markers = self.uniq_markers
        destination = self.cluster_dir
    print(f'arr_path: "{arr_path}", uniq_markers: "{uniq_markers}", destination: "{destination}"')

    standardized_stacked_arr = utils.open_pickle(arr_path)
    target_ds = utils.open_pickle(self.target_ds_preprocessed_path)
    rf_ds_preprocessed = utils.open_pickle(self.rf_ds_preprocessed_path)

    labels_ar = km.labels_
    labels_to_coords = np.zeros([len(labels_ar), 2])
    for i in range(len(labels_ar)):
        labels_to_coords[i] = i % self.gridsize, i // self.gridsize

    try:
        label_markers = np.array([uniq_markers[var] for var in labels_ar])
    except IndexError:  # more than 12 clusters
        label_markers = np.array([(uniq_markers * 3)[var] for var in labels_ar])

    target_ds_withClusterLabels = target_ds.assign_coords(
        cluster=("time", km.predict(standardized_stacked_arr.astype(float))))
    dates_to_ClusterLabels = target_ds_withClusterLabels.cluster.reset_coords()
    RFprec_to_ClusterLabels_dataset = xr.merge([rf_ds_preprocessed, dates_to_ClusterLabels])

    self.labels_ar_path = utils.to_pickle(f'{self.RUN_datetime}_labels_ar', labels_ar, destination)
    self.labels_to_coords_path = utils.to_pickle(f'{self.RUN_datetime}_labels_to_coords', labels_to_coords, destination)
    self.label_markers_path = utils.to_pickle(f'{self.RUN_datetime}_label_markers', label_markers, destination)
    self.target_ds_withClusterLabels_path = utils.to_pickle(f'{self.RUN_datetime}_target_ds_withClusterLabels', target_ds_withClusterLabels, destination)
    self.dates_to_ClusterLabels_path = utils.to_pickle(f'{self.RUN_datetime}_dates_to_ClusterLabels', dates_to_ClusterLabels, destination)
    self.RFprec_to_ClusterLabels_dataset_path = utils.to_pickle(f'{self.RUN_datetime}_RFprec_to_ClusterLabels_dataset', RFprec_to_ClusterLabels_dataset, destination)
def NFoldcrossvalidation_eval(alpha_level_model):
    """
    Sequence of the evaluation procedure:
    1. prepare_nfold_datasets(): split into years according to PSI & PSI_overlap
    2. check_evaluated(): skip if scores were already generated for that split/n-th fold
    3. else, for this alpha level:
       - detect_serialized_datasets(): is there such data
       - detect_prepared_datasets(): are there such cleaned datasets
       - train_SOM() & detect_som_products(): have the SOM model & products already been generated
       - generate_k(): use internal validation to create folders of clustering quality
       - train_kmeans(): use optimal_k to generate the k-means model
       - print_outputs(): print outputs to see differences across the ALPHAs
       - evaluation_procedure(): generate AUC, Brier, etc. scores for this alpha split
    4. compile_scores(): generate an overall score for this clustering attempt (domain, period, hpparam)
    """
    finished_evaluation = alpha_level_model.check_evaluated()
    if finished_evaluation:
        print(f'Evaluation has already been finished & you can find the evaluation results @:\n{finished_evaluation}')
        return

    print('Commencing evaluation!')
    alpha_level_model.prepare_nfold_datasets()
    for alpha in range(1, alpha_level_model.ALPHAs + 1):
        evaluated = alpha_level_model.check_evaluated(alpha)
        if evaluated:
            continue
        alpha_level_model.prepare_alphafold_dataset(alpha)
        alpha_level_model.train_SOM(alpha)
        alpha_level_model.detect_som_products(alpha)
        alpha_level_model.generate_k(alpha)
        alpha_level_model.train_kmeans(alpha)
        # alpha_level_model.print_outputs(alpha)
        alpha_level_model.evaluation_procedure(alpha)
        utils.to_pickle('alpha_level_model', alpha_level_model, alpha_level_model.alpha_general_dir)

    alpha_level_model.compile_scores()
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns a dataset built on top of OpenAI Gym observations.
    Also constructs the dataset if no saved version is available.'''
    if experiment_name == "pendulum":
        env_name = "Pendulum-v0"
    elif experiment_name == "acrobot":
        env_name = "Acrobot-v1"
    else:
        assert experiment_name in ['pendulum', 'acrobot']  # fail loudly on unsupported names

    path = '{}/{}-pixels-dataset.pkl'.format(save_dir, experiment_name)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = make_gym_dataset(**kwargs)
        to_pickle(data, path)
    return data
def detect_som_products(self, alpha=None):
    if alpha:
        destination = self.alpha_model_dir
        arr_path = self.alpha_standardized_stacked_arr_path
        prefix = f'alpha_{alpha}_'
        prompt = f'< alpha-{alpha} >'
    else:
        destination = self.models_dir_path
        arr_path = self.standardized_stacked_arr_path
        prefix = ''
        prompt = ''
    print(f'Destination: "{destination}", arr_path: "{arr_path}", prefix: "{prefix}", prompt:"{prompt}"')

    for phrase in ('winner_coordinates', 'dmap', 'ar', 'som_weights_to_nodes'):
        if utils.find(f'*{prefix}{phrase}.pkl', destination):
            p = utils.find(f'*{prefix}{phrase}.pkl', destination)
            print(f'{utils.time_now()} - {prefix}{phrase} is found @: \n{p[0]}')
            exec(f'self.{phrase}_path = {p}[0]')
        else:
            print(f'{utils.time_now()} - {prompt} Some SOM products found missing in {destination}, generating all products now...')
            som = utils.open_pickle(self.som_model_path)
            standardized_stacked_arr = utils.open_pickle(arr_path)
            winner_coordinates = np.array([som.winner(x) for x in standardized_stacked_arr])
            dmap = som.distance_map()
            ar = som.activation_response(standardized_stacked_arr)
            som_weights = som.get_weights()  # node weights, used for training via k-means
            som_weights_to_nodes = np.array(
                [som_weights[c, r] for r in range(self.gridsize) for c in range(self.gridsize)])  # for k-means clustering

            self.winner_coordinates_path = utils.to_pickle(f'{prefix}winner_coordinates', winner_coordinates, destination)
            self.dmap_path = utils.to_pickle(f'{prefix}dmap', dmap, destination)
            self.ar_path = utils.to_pickle(f'{prefix}ar', ar, destination)
            self.som_weights_to_nodes_path = utils.to_pickle(f'{prefix}som_weights_to_nodes', som_weights_to_nodes, destination)
            break

    print('SOM products serialized.')
def prepare_dataset(model, dest):
    """
    - xr.open_mfdataset() = loading
    - restricting to certain variables + "levels" of variables
    - combining the variables' xarrays into one
    - restricting to between 1999 and 2019 only
    - slicing domain dimensions up to the required specs (i.e. model.LON_S, model.LON_N, etc.)
    - slicing up to the chosen period only
    - pickling the datasets (both input & rainfall) & returning them
    """
    # searching for raw data pickles
    preloaded_input_pickles = utils.find('*.pkl', model.raw_input_dir)
    if preloaded_input_pickles:
        print('Preloaded raw INPUT data pickles found...')
        ds_CHOSEN_VARS_renamed = utils.open_pickle(utils.find('*.pkl', model.raw_input_dir)[0])
    else:
        print('Creating pickles of raw input data...')
        ds_CHOSEN_VARS_renamed = save_preloaded_raw_input_data(model)

    preloaded_input_pickles = utils.find('*.pkl', model.raw_rf_dir)
    if preloaded_input_pickles:
        print('Preloaded raw rainfall data pickles found...')
        ds_RAINFALL = utils.open_pickle(utils.find('*.pkl', model.raw_rf_dir)[0])
    else:
        print('Creating pickles of raw rainfall data...')
        ds_RAINFALL = save_preloaded_raw_rf_data(model)

    print("Proceeding to do preliminary data cleaning...")
    ds_sliced = ds_CHOSEN_VARS_renamed.sel(
        level=slice(np.min(model.unique_pressure_lvls), np.max(model.unique_pressure_lvls)),
        lat=slice(model.LAT_N, model.LAT_S),
        lon=slice(model.LON_W, model.LON_E),
        time=slice('1999', '2019'))
    ds_sliced_rhum = ds_sliced.rhum
    ds_sliced_rhum_no925 = ds_sliced_rhum.drop_sel({"level": 925})
    ds_sliced_uwnd_only = ds_sliced.uwnd
    ds_sliced_vwnd_only = ds_sliced.vwnd
    ds_combined_sliced = xr.merge([ds_sliced_rhum_no925, ds_sliced_uwnd_only, ds_sliced_vwnd_only],
                                  compat='override')
    rf_ds_sliced = ds_RAINFALL.sel(lat=slice(model.LAT_S, model.LAT_N), lon=slice(model.LON_W, model.LON_E))

    print('Pickling domain- & feature-constrained input & RF datasets...')
    # restrict both datasets to the chosen monsoon period, then serialize
    if model.period == "NE_mon":
        month_mask = is_NE_mon
    elif model.period == "SW_mon":
        month_mask = is_SW_mon
    elif model.period == "inter_mon":
        month_mask = is_inter_mon
    input_ds = ds_combined_sliced.sel(time=month_mask(ds_combined_sliced['time.month']))
    rf_ds = rf_ds_sliced.sel(time=month_mask(rf_ds_sliced['time.month']))
    input_ds_serialized_path = utils.to_pickle(f'raw_input_ds_{model.period}_serialized', input_ds, dest)
    rf_ds_serialized_path = utils.to_pickle(f'raw_rf_ds_{model.period}_serialized', rf_ds, dest)
    return input_ds_serialized_path, rf_ds_serialized_path
def get_dataset(experiment_name, save_dir, u, **kwargs):
    '''Returns a dataset built on top of OpenAI Gym observations.
    Also constructs the dataset if no saved version is available.'''
    if experiment_name == "pendulum":
        env_name = "Pendulum-v0"
    elif experiment_name == "acrobot":
        env_name = "Acrobot-v1"
    elif experiment_name == "cartpole":
        env_name = 'My_FA_CartPole-v0'
    else:
        assert experiment_name in ['pendulum', 'acrobot', 'cartpole']

    path = '{}/{}-pixels-dataset.pkl'.format(save_dir, experiment_name)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = {}
        for u_ in u:
            data_ = make_gym_dataset(u=u_, **kwargs)
            for k, v in data_.items():
                if k in ['meta']:
                    continue
                new = data_[k]
                old = data.get(k, np.array([]).reshape(0, new.shape[1]))
                data[k] = np.vstack((old, data_[k]))
        to_pickle(data, path)
    return data
def flatten_and_standardize_dataset(model, dest):
    target_ds_preprocessed = utils.open_pickle(model.target_ds_preprocessed_path)
    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)

    # reshaping
    reshapestarttime = timer(); print(f"{utils.time_now()} - Reshaping data now...")
    print(f"\n{utils.time_now()} - reshaping rhum dataarrays now, total levels to loop: {model.rhum_pressure_levels}.")

    reshaped_unnorma_darrays = {}
    reshaped_unnorma_darrays['rhum'], reshaped_unnorma_darrays['uwnd'], reshaped_unnorma_darrays['vwnd'] = {}, {}, {}

    for level in model.rhum_pressure_levels:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['rhum'][level] = np.reshape(
            target_ds_preprocessed.rhum.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))

    print(f"\n{utils.time_now()} - reshaping uwnd/vwnd dataarrays now, total levels to loop: {model.uwnd_vwnd_pressure_lvls}.")

    for level in model.uwnd_vwnd_pressure_lvls:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['uwnd'][level] = np.reshape(
            target_ds_preprocessed.uwnd.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))
        reshaped_unnorma_darrays['vwnd'][level] = np.reshape(
            target_ds_preprocessed.vwnd.sel(level=level).values,
            (model.n_datapoints, model.lat_size * model.lon_size))

    reshapetime = timer() - reshapestarttime
    reshapetime = str(datetime.timedelta(seconds=reshapetime)).split(".")[0]
    print(f'Time taken: {reshapetime}s.\n')

    # stacking unstandardized dataarrays
    stackingstarttime = timer(); print("Stacking unstandardized dataarrays now...")
    stacked_unstandardized_ds = np.hstack(
        [reshaped_unnorma_darrays[var][lvl]
         for var in reshaped_unnorma_darrays
         for lvl in reshaped_unnorma_darrays[var]])
    stackingtime = timer() - stackingstarttime
    stackingtime = str(datetime.timedelta(seconds=stackingtime)).split(".")[0]
    print(f'Time taken: {stackingtime}s.\n')

    # standardizing the stacked dataarrays
    standardizestarttime = timer(); print("Standardizing stacked dataarrays now...")
    print(f'"stacked_unstandardized_ds.shape" is {stacked_unstandardized_ds.shape}')
    transformer = RobustScaler(quantile_range=(25, 75))
    standardized_stacked_arr = transformer.fit_transform(stacked_unstandardized_ds)  # input for SOM & k-means training
    transformer.get_params()
    standardizetime = timer() - standardizestarttime
    standardizetime = str(datetime.timedelta(seconds=standardizetime)).split(".")[0]
    print(f'That took {standardizetime}s to complete.\n')

    standardized_stacked_arr_path = utils.to_pickle('standardized_stacked_arr', standardized_stacked_arr, dest)
    return standardized_stacked_arr_path
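# A self-contained sketch of the flatten-then-standardize pattern above: each
# (time, lat, lon) variable cube becomes a (time, lat*lon) block, blocks are
# hstacked into one design matrix, and RobustScaler standardizes it (shapes
# illustrative).
import numpy as np
from sklearn.preprocessing import RobustScaler

cube = np.random.default_rng(0).normal(size=(100, 8, 9))   # (time, lat, lon)
flat = cube.reshape(100, 8 * 9)                            # (time, lat*lon)
stacked = np.hstack([flat, flat])                          # stand-in for several vars/levels
standardized = RobustScaler(quantile_range=(25, 75)).fit_transform(stacked)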
def serialize(self, obj, **kw):
    """
    Function for serializing object => string. This can be overwritten for custom uses.
    The default is to do nothing ('serializer'=None).
    If the connection is initialized with 'serializer' set to 'json.gz', 'json',
    'gz', 'zip', or 'pickle', we'll do the corresponding transformation.
    """
    serializer = kw.get('serializer', self._serializer)
    if serializer == "json.gz":
        return utils.to_gz(utils.to_json(obj))
    elif serializer == "json":
        return utils.to_json(obj)
    elif serializer == "gz":
        assert isinstance(obj, str)
        return utils.to_gz(obj)
    elif serializer == "zip":
        assert isinstance(obj, str)
        return utils.to_zip(obj)
    elif serializer == "pickle":
        return utils.to_pickle(obj)
    elif serializer is not None:
        raise NotImplementedError(
            'Only json, gz, json.gz, zip, and pickle '
            'are supported as serializers.')
    return obj
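# A stand-alone sketch of what the 'json.gz' branch above amounts to, assuming
# utils.to_json / utils.to_gz are thin wrappers over json and gzip (hypothetical
# implementations shown just to make the round trip concrete):
import gzip
import json

def to_json(obj):
    return json.dumps(obj)

def to_gz(s):
    return gzip.compress(s.encode('utf-8'))

payload = to_gz(to_json({'a': 1}))
assert json.loads(gzip.decompress(payload).decode('utf-8')) == {'a': 1}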
    train_loss = torch.cat(train_loss, dim=1)
    train_loss_per_traj = torch.sum(train_loss, dim=(0, 2))
    test_loss = torch.cat(test_loss, dim=1)
    test_loss_per_traj = torch.sum(test_loss, dim=(0, 2))

    print('Final trajectory train loss {:.4e} +/- {:.4e}\nFinal trajectory test loss {:.4e} +/- {:.4e}'
          .format(train_loss_per_traj.mean().item(), train_loss_per_traj.std().item(),
                  test_loss_per_traj.mean().item(), test_loss_per_traj.std().item()))
    stats['traj_train_loss'] = train_loss_per_traj.detach().cpu().numpy()
    stats['traj_test_loss'] = test_loss_per_traj.detach().cpu().numpy()

    return model, stats

if __name__ == "__main__":
    args = get_args()
    model, stats = train(args)

    # save
    os.makedirs(args.save_dir, exist_ok=True)
    label = '-baseline_ode' if args.baseline else '-hnn_ode'
    struct = '-struct' if args.structure else ''
    rad = '-rad' if args.rad else ''
    path = '{}/{}{}{}-{}-p{}{}.tar'.format(args.save_dir, args.name, label, struct,
                                           args.solver, args.num_points, rad)
    torch.save(model.state_dict(), path)
    path = '{}/{}{}{}-{}-p{}-stats{}.pkl'.format(args.save_dir, args.name, label, struct,
                                                 args.solver, args.num_points, rad)
    to_pickle(stats, path)
        else:
            x_stack.append(x[:, i:])
    x_stack = np.stack(x_stack, axis=1)  # (n_u, n_p, ts+1-n_p, bs, 32, 32)
    x_stack = np.reshape(x_stack, (x.shape[0], num_points, -1, *x.shape[3:]))  # (n_u, n_p, (ts+1-n_p)*bs, 32, 32)
    t_eval = t[0:num_points]
    return x_stack, t_eval

if __name__ == "__main__":
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))

    # load data
    us = [[0.0, 0.0], [0.0, 1.0], [0.0, -1.0], [0.0, 2.0], [0.0, -2.0],
          [1.0, 0.0], [-1.0, 0.0], [2.0, 0.0], [-2.0, 0.0]]
    # us = [[0.0, 0.0]]
    ts = 20; ss = 512
    data = get_dataset(seed=0, timesteps=ts, save_dir=THIS_DIR, us=us, samples=ss,
                       test_split=0.50, name='cartpole-gym-image-dataset-rgb-u9.pkl')

    train_data = {}
    train_data['x'] = data['x']
    train_data['obs'] = data['obs']
    train_data['t'] = data['t']
    train_data['us'] = data['us']
    to_pickle(train_data, THIS_DIR + '/' + 'cartpole-gym-image-dataset-rgb-u9-train.pkl')

    test_data = {}
    test_data['x'] = data['test_x']
    test_data['obs'] = data['test_obs']
    test_data['t'] = data['t']
    test_data['us'] = data['us']
    to_pickle(test_data, THIS_DIR + '/' + 'cartpole-gym-image-dataset-rgb-u9-test.pkl')
            x_stack.append(x[:, i:])
    x_stack = np.stack(x_stack, axis=1)  # (n_u, n_p, ts+1-n_p, bs, 32, 32)
    x_stack = np.reshape(x_stack, (x.shape[0], num_points, -1, *x.shape[3:]))  # (n_u, n_p, (ts+1-n_p)*bs, 32, 32)
    t_eval = t[0:num_points]
    return x_stack, t_eval

if __name__ == "__main__":
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    np.random.seed(0)

    # load data
    us = [[0.0, 0.0], [0.0, 1.0], [0.0, -1.0], [0.0, 2.0], [0.0, -2.0],
          [1.0, 0.0], [-1.0, 0.0], [2.0, 0.0], [-2.0, 0.0]]
    # us = [[0.0, 0.0]]
    ts = 20; ss = 512
    data = get_dataset(seed=0, timesteps=ts, save_dir=THIS_DIR, us=us, samples=ss,
                       test_split=0.50, name='acrobot-gym-image-dataset-rgb-u9.pkl')

    train_data = {}
    train_data['x'] = data['x']
    train_data['obs'] = data['obs']
    train_data['t'] = data['t']
    train_data['us'] = data['us']
    to_pickle(train_data, THIS_DIR + '/' + 'acrobot-gym-image-dataset-rgb-u9-train.pkl')

    test_data = {}
    test_data['x'] = data['test_x']
    test_data['obs'] = data['test_obs']
    test_data['t'] = data['t']
    test_data['us'] = data['us']
    to_pickle(test_data, THIS_DIR + '/' + 'acrobot-gym-image-dataset-rgb-u9-test.pkl')
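# A self-contained sketch of the sliding-window stacking done by the fragments above
# (shapes reduced for clarity; the branch that builds the earlier windows is not shown
# in the original, so the window length L here is inferred): a trajectory of T frames
# yields num_points shifted windows, so each training sample is a short sub-trajectory.
import numpy as np

n_u, T, bs = 1, 7, 2                   # T plays the role of ts+1 frames
num_points = 3
x = np.arange(n_u * T * bs * 4 * 4, dtype=float).reshape(n_u, T, bs, 4, 4)

L = T - num_points + 1                 # window length; the last window is x[:, i:]
x_stack = np.stack([x[:, i:i + L] for i in range(num_points)], axis=1)   # (n_u, n_p, L, bs, 4, 4)
x_stack = np.reshape(x_stack, (n_u, num_points, -1, *x.shape[3:]))       # (n_u, n_p, L*bs, 4, 4)
t_eval = np.linspace(0.0, 1.0, T)[:num_points]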
def generate_k(self, alpha=None):
    """
    - detection of metrics used to infer "k", i.e. the optimal_k value
    - creation of metric pickles
    - creation of folders in models_dir to indicate potential k values/cluster combinations
    """
    metrics_dir = str(utils.metrics_dir / self.dir_hp_str / self.period) + f'_{self.month_names_joined}'
    os.makedirs(metrics_dir, exist_ok=True)
    self.metrics_dir_path = metrics_dir

    if alpha:
        self.alpha_metrics_dir_path = str(Path(self.tl_model.metrics_dir_path) / f'alpha_{alpha}')
        metric_destination = self.alpha_metrics_dir_path
        os.makedirs(metric_destination, exist_ok=True)
        model_destination = self.alpha_model_dir
        prefix = f'alpha_{alpha}_'
        prompt = f'< alpha-{alpha} >'
    else:
        metric_destination = self.metrics_dir_path
        model_destination = self.models_dir_path
        prefix = ''
        prompt = ''
    print(f'metric_destination: "{metric_destination}", model_destination: "{model_destination}", '
          f'prefix: "{prefix}", prompt:"{prompt}"')

    for phrase in ('sil_peaks', 'ch_max', 'dbi_min', 'reasonable_sil', 'ch_dbi_tally',
                   'n_expected_clusters', 'dbs_err_dict'):
        if utils.find(f'*{prefix}{phrase}*.pkl', metric_destination):
            pass
        else:
            # regenerate all metrics if even one is not found
            print(f'{utils.time_now()} - {prompt} Not all metrics have been found in {metric_destination}, generating them now...')
            som_weights_to_nodes = utils.open_pickle(self.som_weights_to_nodes_path)
            ch_scores, dbi_scores = validation.print_elbow_CH_DBI_plot(self, som_weights_to_nodes, metric_destination)
            yellowbrick_expected_k = validation.print_yellowbrickkelbow(self, som_weights_to_nodes, metric_destination)
            silhouette_avgs, reasonable_silhoutte_scores_mt50 = validation.print_silhoutte_plots(self, som_weights_to_nodes, metric_destination)
            dbstop10 = validation.print_dbs_plots(self, som_weights_to_nodes, metric_destination)

            eps_ls, dbs_k_ls, dbs_noisepts_ls, dbs_labels = [], [], [], []
            for i in dbstop10:
                eps_ls.append(i[0])
                dbs_k_ls.append(i[1])
                dbs_noisepts_ls.append(i[2])
                dbs_labels.append(i[3])

            sil_peaks, ch_max, dbi_min, reasonable_sil, ch_dbi_tally, n_expected_clusters, dbs_err_dict = \
                validation.get_cluster_determination_vars(
                    silhouette_avgs, ch_scores, dbi_scores, reasonable_silhoutte_scores_mt50,
                    dbs_k_ls, dbs_noisepts_ls, yellowbrick_expected_k)

            for cluster_num in n_expected_clusters:
                if alpha:
                    save_dir = fr"{self.alpha_model_dir}/k-{cluster_num}"
                else:
                    save_dir = fr"{self.models_dir_path}/k-{cluster_num}"
                if cluster_num == ch_max:
                    save_dir += '_CHhighestPeak'
                if cluster_num == dbi_min:
                    save_dir += '_lowestDBItrough'
                if cluster_num in sil_peaks:
                    save_dir += '_SilhouetteAVG-peak'
                if cluster_num == reasonable_sil:
                    save_dir += '_mostReasonable-basedon-Silhouetteplot'
                if cluster_num in ch_dbi_tally:
                    save_dir += '_CHpeak-and-DBItrough'
                if cluster_num == yellowbrick_expected_k:
                    save_dir += '_Yellowbrickexpected-K'
                if cluster_num in dbs_err_dict:
                    save_dir += f'_DBSCANclusterErrorValsExpected-{dbs_err_dict[cluster_num]}'
                os.makedirs(save_dir, exist_ok=True)
                print(f'save_dir: {save_dir}')

            self.ch_max_path = utils.to_pickle(f"{prefix}ch_max", ch_max, metric_destination)
            self.dbi_min_path = utils.to_pickle(f"{prefix}dbi_min", dbi_min, metric_destination)
            self.sil_peaks_path = utils.to_pickle(f"{prefix}sil_peaks", sil_peaks, metric_destination)
            self.reasonable_sil_path = utils.to_pickle(f"{prefix}reasonable_sil", reasonable_sil, metric_destination)
            self.ch_dbi_tally_path = utils.to_pickle(f"{prefix}ch_dbi_tally", ch_dbi_tally, metric_destination)
            self.yellowbrick_expected_k_path = utils.to_pickle(f"{prefix}yellowbrick_expected_k", yellowbrick_expected_k, metric_destination)
            self.dbs_err_dict_path = utils.to_pickle(f"{prefix}dbs_err_dict", dbs_err_dict, metric_destination)
            self.n_expected_clusters_path = utils.to_pickle(f"{prefix}n_expected_clusters", n_expected_clusters, metric_destination)
            break

    print(f'{utils.time_now()} - Internal validation of clusters has been run; please view the metrics folder @:\n'
          f'{metric_destination} to determine the optimal cluster number.\n'
          f'\nYou can view the separate folders constructed for each discovered cluster combination. See @:\n{model_destination}.')
        '/results') else None
    label = ''
    label = label + '-{}-{}'.format(args.model, args.solver)
    label = label + '-friction' if args.friction else label
    label = label + '-seed{}'.format(args.seed)
    name = args.name
    result_path = '{}/results/dg-{}{}'.format(args.save_dir, name, label)
    path_tar = '{}.tar'.format(result_path)
    path_pkl = '{}.pkl'.format(result_path)
    path_txt = '{}.txt'.format(result_path)

    if os.path.exists(path_txt):
        if args.noretry:
            exit()
        else:
            os.remove(path_txt)

    model, stats = train(args)
    torch.save(model.state_dict(), path_tar)
    to_pickle(stats, path_pkl)
    with open(path_txt, 'w') as of:
        print('#final_train_loss\tfinal_test_loss\tenergy_mse_mean\tstate_mse_mean', file=of)
        print(stats['final_train_loss'], stats['final_test_loss'],
              stats['energy_mse_mean'], stats['state_mse_mean'],
              sep='\t', file=of)