def experiment_step(
    params: Dict, smoke_test: bool = False, subsample: Optional[int] = None
) -> pd.DataFrame:
    """Run one similarity experiment on a spatio-temporal density cube.

    Loads the variable's datacube, subsets it spatially and temporally,
    builds density cubes, standardizes them, and scores the similarity of
    the first (reference) column against the remaining columns.

    Args:
        params: experiment configuration with keys "variable", "region",
            "period", and "dimensions" (an object exposing
            ``.spatial``/``.temporal``/``.dimensions``).
        smoke_test: forwarded to ``get_similarity_scores`` for quick runs.
        subsample: optional subsample size forwarded to
            ``get_similarity_scores``.

    Returns:
        A one-row DataFrame with the experiment metadata plus the
        similarity scores.
    """
    # ======================
    # experiment - Data
    # ======================
    # Get DataCube
    datacube = get_dataset(params["variable"])[params["variable"]]

    # subset datacube (spatially)
    datacube = select_region(xr_data=datacube, bbox=params["region"])[
        params["variable"]
    ]

    # subset datacube (temporally); .compute() materializes any lazy (dask) array
    datacube = select_period(xr_data=datacube, period=params["period"]).compute()

    # get density cubes
    density_cube_df = get_density_cubes(
        data=datacube,
        spatial=params["dimensions"].spatial,
        temporal=params["dimensions"].temporal,
    )

    # reference column as a 2D (n, 1) array vs the remaining comparison columns.
    # BUG FIX: `series[:, np.newaxis]` (multi-dimensional Series indexing) was
    # removed in pandas >= 1.0; go through `.values` instead.
    X = density_cube_df.iloc[:, 0].values[:, np.newaxis]
    Y = density_cube_df.iloc[:, 1:]

    # standardize data
    X, Y = standardizer_data(X=X, Y=Y)

    # ======================
    # experiment - Methods
    # ======================
    res = get_similarity_scores(
        X_ref=X, Y_compare=Y, smoke_test=smoke_test, subsample=subsample
    )

    # Save Results
    results_df = pd.DataFrame(
        {
            "region": params["region"].name,
            "period": params["period"].name,
            "variable": params["variable"],
            "spatial": params["dimensions"].spatial,
            "temporal": params["dimensions"].temporal,
            "n_dimensions": params["dimensions"].dimensions,
            **res,
        },
        index=[0],
    )
    return results_df
def experiment_step(params: Dict, smoke_test: bool = False) -> pd.DataFrame:
    """Run one similarity experiment against a reference cube.

    Loads the variable's datacube, subsets it spatially and temporally,
    builds a reference cube and density cubes, aligns them on common
    indices, standardizes, and scores their similarity.

    Args:
        params: experiment configuration with keys "variable", "region",
            "period", "spatial", and "temporal".
        smoke_test: forwarded to ``get_similarity_scores`` for quick runs.

    Returns:
        A one-row DataFrame with the experiment metadata plus the
        similarity scores.  (Annotation fixed: the original declared
        ``-> None`` while returning a DataFrame.)
    """
    # ======================
    # experiment - Data
    # ======================
    # Get DataCube
    datacube = get_dataset(params["variable"])

    # subset datacube (spatially)
    datacube = select_region(xr_data=datacube, bbox=params["region"])[
        params["variable"]
    ]

    # subset datacube (temporally)
    datacube = select_period(xr_data=datacube, period=params["period"])

    # get datacubes
    reference_cube_df = get_reference_cube(data=datacube)

    # get density cubes
    density_cube_df = get_density_cubes(
        data=datacube, spatial=params["spatial"], temporal=params["temporal"]
    )

    # align reference and density frames on their common indices
    X, Y = get_common_indices(
        reference_df=reference_cube_df, density_df=density_cube_df
    )

    # standardize data
    X, Y = standardizer_data(X=X, Y=Y)

    # ======================
    # experiment - Methods
    # ======================
    res = get_similarity_scores(X_ref=X, Y_compare=Y, smoke_test=smoke_test)

    # Save Results
    results_df = pd.DataFrame(
        {
            "region": params["region"].name,
            "period": params["period"].name,
            "variable": params["variable"],
            "spatial": params["spatial"],
            "temporal": params["temporal"],
            **res,
        },
        index=[0],
    )
    return results_df
def experiment_step(
    params: Dict, smoke_test: bool = False, subsample: Optional[int] = None
) -> pd.DataFrame:
    """Run one entropy experiment: load, subset, standardize, Gaussianize.

    Args:
        params: configuration with keys "variable", "region", "period", and
            "dimensions" (an object exposing
            ``.spatial``/``.temporal``/``.dimensions``).  ``params["region"]``
            may be the string sentinel ``"world"`` for no spatial subsetting.
        smoke_test: if True, truncate the density cube to 10k rows.
        subsample: optional subsample size forwarded to the RBIG entropy
            estimator.

    Returns:
        A one-row DataFrame with the experiment metadata, the RBIG entropy
        estimate, and the elapsed fit time.
    """
    # ======================
    # experiment - Data
    # ======================
    # Get DataCube (variable wrapped in a list -> a Dataset is returned)
    datacube = get_dataset([params["variable"]])

    # subset datacube (spatially); the "world" sentinel means no subsetting
    if params["region"] != "world":
        region_name = params["region"].name
        datacube = select_region(xr_data=datacube, bbox=params["region"])[
            params["variable"]
        ]
    else:
        region_name = "world"

    # remove climatology; helper returns (anomalies, climatology)
    datacube, _ = remove_climatology(datacube)

    # select the variable if we still hold a Dataset (guard restored from the
    # commented-out original and consistent with the sibling variants)
    if isinstance(datacube, xr.Dataset):
        datacube = datacube[params["variable"]]

    # subset datacube (temporally)
    datacube = select_period(xr_data=datacube, period=params["period"])

    # get density cubes
    density_cube_df = get_density_cubes(
        data=datacube,
        spatial=params["dimensions"].spatial,
        temporal=params["dimensions"].temporal,
    )

    if smoke_test:
        density_cube_df = density_cube_df.iloc[:10_000]
        logging.info(f"Total data (smoke-test): {density_cube_df.shape}")

    # standardize data
    x_transformer = StandardScaler().fit(density_cube_df.values)
    density_cube_df_norm = pd.DataFrame(
        data=x_transformer.transform(density_cube_df.values),
        columns=density_cube_df.columns.values,
        index=density_cube_df.index,
    )

    # =========================
    # Model - Gaussianization
    # =========================
    # Gaussianize the data and time the entropy estimate
    t0 = time.time()
    rbig_h = rbig_h_measures(
        density_cube_df_norm.values, subsample=subsample, random_state=123
    )
    t1 = time.time() - t0

    # Save Results
    results_df = pd.DataFrame(
        {
            "region": region_name,
            "period": params["period"].name,
            "variable": params["variable"],
            "spatial": params["dimensions"].spatial,
            "temporal": params["dimensions"].temporal,
            "n_dimensions": params["dimensions"].dimensions,
            "n_samples": density_cube_df_norm.shape[0],
            "entropy": rbig_h,
            "time": t1,
        },
        index=[0],
    )
    return results_df
def experiment_step(parameters: Dict, args: argparse.Namespace) -> pd.DataFrame:
    """Run one entropy experiment driven by CLI arguments.

    Args:
        parameters: configuration with keys "variable", "region", "period",
            and "dimensions" (an object exposing
            ``.spatial``/``.temporal``/``.dimensions``).  ``parameters["region"]``
            may be the string sentinel ``"world"`` for no spatial subsetting.
        args: parsed CLI namespace; fields used here are ``resample``,
            ``remove_climatology``, ``subsample``, and ``method``.

    Returns:
        A one-row DataFrame with the experiment metadata, the RBIG entropy
        estimate, and the elapsed fit time.
    """
    # ======================
    # experiment - Data
    # ======================
    # Get DataCube (variable wrapped in a list -> a Dataset is returned)
    datacube = get_dataset([parameters["variable"]])

    # ======================
    # RESAMPLE
    # ======================
    if args.resample:
        datacube = datacube.resample(time=args.resample).mean()

    # ======================
    # SPATIAL SUBSET ("world" sentinel = no subsetting)
    # ======================
    if parameters["region"] != "world":
        region_name = parameters["region"].name
        datacube = select_region(xr_data=datacube, bbox=parameters["region"])[
            parameters["variable"]
        ]
    else:
        region_name = "world"

    # ======================
    # CLIMATOLOGY (TEMPORAL)
    # ======================
    if args.remove_climatology:
        datacube, _ = remove_climatology(datacube)

    # ======================
    # TEMPORAL SUBSET
    # ======================
    datacube = select_period(xr_data=datacube, period=parameters["period"])

    # ======================
    # DENSITY CUBES
    # ======================
    if isinstance(datacube, xr.Dataset):
        datacube = datacube[parameters["variable"]]
    density_cube_df = get_density_cubes(
        data=datacube,
        spatial=parameters["dimensions"].spatial,
        temporal=parameters["dimensions"].temporal,
    )

    # ======================
    # STANDARDIZE DATA
    # ======================
    x_transformer = StandardScaler().fit(density_cube_df.values)
    density_cube_df_norm = pd.DataFrame(
        data=x_transformer.transform(density_cube_df.values),
        columns=density_cube_df.columns.values,
        index=density_cube_df.index,
    )

    # ======================
    # SUBSAMPLE DATA
    # ======================
    # Default to the full matrix; override only when subsampling was requested
    # AND the helper actually produced indices (collapses the original's three
    # duplicated `X = ...` branches).
    X = density_cube_df_norm.values
    if args.subsample is not None:
        idx = subset_indices(
            density_cube_df_norm.values, subsample=args.subsample, random_state=100
        )
        if idx is not None:
            X = density_cube_df_norm.iloc[idx, :].values

    # =========================
    # Model - Gaussianization
    # =========================
    # Gaussianize the data and time the entropy estimate
    t0 = time.time()
    rbig_h = rbig_h_measures(X, random_state=123, method=args.method)
    t1 = time.time() - t0

    # Save Results
    results_df = pd.DataFrame(
        {
            "region": region_name,
            "period": parameters["period"].name,
            "variable": parameters["variable"],
            "spatial": parameters["dimensions"].spatial,
            "temporal": parameters["dimensions"].temporal,
            "n_dimensions": parameters["dimensions"].dimensions,
            "n_samples": X.shape[0],
            "entropy": rbig_h,
            "time": t1,
        },
        index=[0],
    )
    return results_df
def experiment_step(args: argparse.Namespace) -> Any:
    """Fit an RBIG model on density cubes and return probability cubes.

    Args:
        args: parsed CLI namespace; fields used here are ``resample``,
            ``clima``, ``spatial``, ``temporal``, ``smoke_test``,
            ``subsample``, ``method``, ``add_noise``, and ``temporal_mean``.

    Returns:
        The information cubes produced by ``get_information_cubes`` for the
        per-sample probabilities, with the run parameters attached as
        ``.attrs``.  (Annotation fixed: the original declared
        ``Union[Any, Any]``, which collapses to ``Any`` anyway.)
    """
    logging.info("Extracting Parameters")
    parameters = get_parameters(args)

    # ======================
    # experiment - Data
    # ======================
    logging.info(f"Loading '{parameters['variable'][0]}' variable")
    datacube = get_dataset(parameters["variable"])

    # ======================
    # RESAMPLE
    # ======================
    if args.resample:
        logging.info("Resampling datacube...")
        datacube = datacube.resample(time="1MS").mean()

    # ======================
    # SPATIAL SUBSET
    # ======================
    # A failure here (e.g. a region with no usable bbox) falls back to the
    # full globe.  Narrowed from a bare `except:` so KeyboardInterrupt /
    # SystemExit are no longer swallowed.
    try:
        logging.info(f"Selecting region '{parameters['region'].name}'")
        datacube = select_region(xr_data=datacube, bbox=parameters["region"])[
            parameters["variable"]
        ]
    except Exception:
        logging.info("Selecting region 'world'")

    # ======================
    # CLIMATOLOGY (TEMPORAL)
    # ======================
    if args.clima:
        logging.info("Removing climatology...")
        datacube, _ = remove_climatology(datacube)

    # ======================
    # TEMPORAL SUBSET
    # ======================
    logging.info(f"Selecting temporal period: '{parameters['period'].name}'")
    datacube = select_period(xr_data=datacube, period=parameters["period"])

    # ======================
    # DENSITY CUBES
    # ======================
    logging.info(f"Getting density cubes: S: {args.spatial}, T: {args.temporal}")
    if isinstance(datacube, xr.Dataset):
        datacube = datacube[parameters["variable"][0]]
    density_cube_df = get_density_cubes(
        data=datacube, spatial=args.spatial, temporal=args.temporal,
    )
    logging.info(f"Total data: {density_cube_df.shape}")

    # ======================
    # STANDARDIZE DATA
    # ======================
    logging.info("Standardizing data...")
    x_transformer = StandardScaler().fit(density_cube_df.values)
    density_cube_df_norm = pd.DataFrame(
        data=x_transformer.transform(density_cube_df.values),
        columns=density_cube_df.columns.values,
        index=density_cube_df.index,
    )

    # ======================
    # SUBSAMPLE DATA
    # ======================
    if args.smoke_test:
        logging.info("Smoke Test...")
        logging.info("Subsampling datacube...")
        idx = subset_indices(
            density_cube_df_norm.values, subsample=1000, random_state=100
        )
        X = density_cube_df_norm.iloc[idx, :].values
        index = density_cube_df_norm.iloc[idx, :].index
    elif args.subsample is not None:
        logging.info("Subsampling datacube...")
        idx = subset_indices(
            density_cube_df_norm.values, subsample=args.subsample, random_state=100
        )
        X = density_cube_df_norm.iloc[idx, :].values
        # full index on purpose: the model is fit on the subsample but
        # probabilities are later predicted for the full normalized cube
        index = density_cube_df_norm.index
    else:
        X = density_cube_df_norm.values
        index = density_cube_df_norm.index

    logging.info(f"Input shape: {X.shape}")
    parameters["input_shape"] = X.shape

    # =========================
    # Model - Gaussianization
    # =========================
    logging.info("Gaussianizing data...")
    t0 = time.time()
    rbig_model = get_rbig_model(X=X, method=args.method)
    rbig_model.fit(X)
    t1 = time.time() - t0
    logging.info(f"Time Taken: {t1:.2f} secs")
    parameters["rbig_fit_time"] = t1

    # =========================
    # PROB ESTIMATES
    # =========================
    logging.info("Getting probability estimates...")
    t0 = time.time()

    # add noise (presumably to break ties/duplicates before density
    # estimation — TODO confirm)
    if args.add_noise:
        logging.info("Adding noise to values for probability...")
        # BUG FIX: the original `density_cube_df_norm.values += ...` raises
        # AttributeError because `DataFrame.values` is a read-only property;
        # add the noise via a plain DataFrame addition instead.
        density_cube_df_norm = density_cube_df_norm + 1e-1 * RNG.rand(
            *density_cube_df_norm.shape
        )

    logging.info("Parallel predictions...")
    if args.smoke_test:
        X_prob = parallel_predictions(
            X=X,
            func=rbig_model.predict_proba,
            batchsize=100,
            n_jobs=-1,
            verbose=1,
        )
    else:
        X_prob = parallel_predictions(
            X=density_cube_df_norm.values,
            func=rbig_model.predict_proba,
            batchsize=10_000,
            n_jobs=-1,
            verbose=1,
        )
    t1 = time.time() - t0
    logging.info(f"Time Taken: {t1:.2f} secs")
    parameters["prob_size"] = density_cube_df_norm.values.shape
    parameters["rbig_predict_time"] = t1

    X_prob = pd.DataFrame(data=X_prob, index=index, columns=["probability"])

    # returning density cubes
    logging.info("Getting information cubes.")
    X_prob = get_information_cubes(X_prob, time=args.temporal_mean)
    X_prob.attrs = parameters
    return X_prob
def experiment_step(
    params: Dict, smoke_test: bool = False, subsample: Optional[int] = None
) -> tuple:
    """Fit an RBIG model and return it with the transformer and probabilities.

    Args:
        params: configuration with keys "variable", "region", "period",
            "spatial", "temporal", and "subsample".
        smoke_test: if True, truncate the density cube to 1k rows.
        subsample: optional override for ``params["subsample"]`` when
            building the RBIG model.  (The original accepted this argument
            but silently ignored it.)

    Returns:
        ``(rbig_model, x_transformer, X_prob, density_cube_df)`` where
        ``X_prob`` holds the per-(lat, lon) mean probabilities.
        (Annotation fixed: ``Union[Any, Any, Any, Any]`` collapses to
        ``Any``; the function actually returns a 4-tuple.)
    """
    # ======================
    # experiment - Data
    # ======================
    logging.info(f"Loading '{params['variable']}' variable")
    datacube = get_dataset(params["variable"])

    # subset datacube (spatially); a failure falls back to the full globe.
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are no
    # longer swallowed.
    try:
        logging.info(f"Selecting region '{params['region'].name}'")
        datacube = select_region(xr_data=datacube, bbox=params["region"])[
            params["variable"]
        ]
    except Exception:
        logging.info(f"Selecting region 'world'")

    logging.info("Removing climatology...")
    datacube, _ = remove_climatology(datacube)

    # subset datacube (temporally)
    logging.info(f"Selecting temporal period: '{params['period'].name}'")
    datacube = select_period(xr_data=datacube, period=params["period"])

    # get density cubes
    logging.info(
        f"Getting density cubes: S: {params['spatial']}, T: {params['temporal']}"
    )
    if isinstance(datacube, xr.Dataset):
        datacube = datacube[params["variable"][0]]
    density_cube_df = get_density_cubes(
        data=datacube, spatial=params["spatial"], temporal=params["temporal"],
    )
    logging.info(f"Total data: {density_cube_df.shape}")

    if smoke_test:
        density_cube_df = density_cube_df.iloc[:1_000]
        logging.info(f"Total data (smoke-test): {density_cube_df.shape}")

    # standardize data
    logging.info("Standardizing data...")
    x_transformer = StandardScaler().fit(density_cube_df.values)
    density_cube_df_norm = pd.DataFrame(
        data=x_transformer.transform(density_cube_df.values),
        columns=density_cube_df.columns.values,
        index=density_cube_df.index,
    )

    # =========================
    # Model - Gaussianization
    # =========================
    # explicit `subsample` argument takes precedence over params["subsample"]
    n_subsample = subsample if subsample is not None else params["subsample"]
    logging.info("Gaussianizing data...")
    t0 = time.time()
    rbig_model = get_rbig_model(
        X=density_cube_df_norm.values, subsample=n_subsample
    )
    t1 = time.time() - t0
    logging.info(f"Time Taken: {t1:.2f} secs")

    # get the probability estimates
    logging.info("Getting probability estimates...")
    t0 = time.time()

    # add noise (presumably to break ties/duplicates before density
    # estimation — TODO confirm)
    prob_inputs = density_cube_df_norm.values + 1e-1 * RNG.rand(
        *density_cube_df_norm.values.shape
    )

    logging.info("Parallel predictions...")
    X_prob = parallel_predictions(
        X=prob_inputs,
        func=rbig_model.predict_proba,
        batchsize=10_000,
        n_jobs=-1,
        verbose=1,
    )
    t1 = time.time() - t0
    logging.info(f"Time Taken: {t1:.2f} secs")

    X_prob = pd.DataFrame(data=X_prob, index=density_cube_df_norm.index,)

    logging.info("Computing Mean...")
    X_prob = X_prob.groupby(level=["lat", "lon"]).mean()

    return rbig_model, x_transformer, X_prob, density_cube_df