Example no. 1
    def serialize_kmeans_products(self, km, alpha):
        if alpha:
            arr_path = self.alpha_standardized_stacked_arr_path
            uniq_markers = self.tl_model.uniq_markers
            destination = self.alpha_cluster_dir
        else:
            arr_path = self.standardized_stacked_arr_path
            uniq_markers = self.uniq_markers
            destination = self.cluster_dir

        print(f'arr_path: "{arr_path}", uniq_markers: "{uniq_markers}", destination: "{destination}"')

        standardized_stacked_arr = utils.open_pickle(arr_path)
        target_ds = utils.open_pickle(self.target_ds_preprocessed_path)
        rf_ds_preprocessed = utils.open_pickle(self.rf_ds_preprocessed_path)

        labels_ar = km.labels_
        labels_to_coords = np.zeros([len(labels_ar), 2])
        for i in range(len(labels_ar)):
            labels_to_coords[i] = i % self.gridsize, i // self.gridsize  # node index -> (col, row) on the SOM grid

        try:
            label_markers = np.array([uniq_markers[var] for var in labels_ar])
        except IndexError: # more than 12 clusters
            label_markers = np.array([(uniq_markers*3)[var] for var in labels_ar])
            
        target_ds_withClusterLabels = target_ds.assign_coords(cluster=("time", km.predict(standardized_stacked_arr.astype(float))))
        dates_to_ClusterLabels = target_ds_withClusterLabels.cluster.reset_coords()
        RFprec_to_ClusterLabels_dataset = xr.merge([rf_ds_preprocessed, dates_to_ClusterLabels])

        self.labels_ar_path = utils.to_pickle(f'{self.RUN_datetime}_labels_ar', labels_ar, destination)
        self.labels_to_coords_path = utils.to_pickle(f'{self.RUN_datetime}_labels_to_coords', labels_to_coords, destination)
        self.label_markers_path = utils.to_pickle(f'{self.RUN_datetime}_label_markers', label_markers, destination)
        self.target_ds_withClusterLabels_path = utils.to_pickle(f'{self.RUN_datetime}_target_ds_withClusterLabels', target_ds_withClusterLabels, destination)
        self.dates_to_ClusterLabels_path = utils.to_pickle(f'{self.RUN_datetime}_dates_to_ClusterLabels', dates_to_ClusterLabels, destination)
        self.RFprec_to_ClusterLabels_dataset_path = utils.to_pickle(f'{self.RUN_datetime}_RFprec_to_ClusterLabels_dataset', RFprec_to_ClusterLabels_dataset, destination)
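The cluster-to-dataset wiring above uses an xarray idiom: attach the per-timestep labels as a coordinate, lift that coordinate back out as a variable, and merge it into the rainfall dataset over the shared time axis. A minimal sketch on toy data (names, sizes and values are illustrative, not from the source repository):

import numpy as np
import xarray as xr

times = np.array(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]')
target_ds = xr.Dataset({'rhum': ('time', [70.0, 80.0, 90.0])}, coords={'time': times})
rf_ds = xr.Dataset({'precipitationCal': ('time', [0.0, 12.5, 3.2])}, coords={'time': times})

labels = np.array([0, 1, 0])  # stand-in for km.predict(standardized_stacked_arr)
target_with_labels = target_ds.assign_coords(cluster=('time', labels))
dates_to_labels = target_with_labels.cluster.reset_coords()  # coordinate -> plain "cluster" variable
merged = xr.merge([rf_ds, dates_to_labels])
print(merged.cluster.values)  # [0 1 0]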
Example no. 2
    def detect_prepared_datasets(self):
        """
        Pre-processing, including time-slicing, removal of NAs, and stacking & standardizing.
        Calls:
        1. prepare.preprocess_time_series
        2. prepare.flatten_and_standardize_dataset
        """
        if utils.find('*target_ds_preprocessed.pkl', self.prepared_data_dir) and \
            utils.find('*rf_ds_preprocessed.pkl', self.prepared_data_dir) and \
            utils.find('*standardized_stacked_arr.pkl', self.prepared_data_dir):
            print('Pickles (preprocessed) found.')
            for pkl in utils.find('*preprocessed.pkl', self.prepared_data_dir):
                if "target_ds" in pkl: self.target_ds_preprocessed_path = pkl
                elif "rf_ds" in pkl: self.rf_ds_preprocessed_path = pkl
            
            LocalModelParams(self, utils.open_pickle(self.target_ds_preprocessed_path))

            for pkl in utils.find('*standardized_stacked_arr.pkl', self.prepared_data_dir):
                self.standardized_stacked_arr_path = pkl
        else:
            print('Pickles of pre-processed data incomplete. Proceeding to load & process raw dataset pickles.')
            self.target_ds_preprocessed_path, self.rf_ds_preprocessed_path = prepare.preprocess_time_series(self, self.prepared_data_dir, self.ALPHAs)

            LocalModelParams(self, utils.open_pickle(self.target_ds_preprocessed_path)) # generate new local model params

            self.standardized_stacked_arr_path = prepare.flatten_and_standardize_dataset(self, self.prepared_data_dir)
        print(f'--> Months for this dataset are: {self.month_names}')
Example no. 3
def prepare_dataset(model, dest):
    """
    - xr.open_mfdataset() = loading
    - restricting to certain variables + "levels" of variables
    - combining the variables' xarrays into one
    - restricting to the period 1999 to 2019
    - slicing domain dimensions to the required specs (i.e. model.LON_S, model.LON_N, etc...)
    - slicing to the chosen period only
    - pickling the datasets (both input & rainfall) & returning them
    """
    # searching for raw data pickles
    preloaded_input_pickles = utils.find('*.pkl', model.raw_input_dir)
    if preloaded_input_pickles:
        print('Preloaded raw INPUT data pickles found...')
        ds_CHOSEN_VARS_renamed = utils.open_pickle(preloaded_input_pickles[0])
    else: 
        print('Creating pickles of raw input data...')
        ds_CHOSEN_VARS_renamed = save_preloaded_raw_input_data(model)
    
    preloaded_rf_pickles = utils.find('*.pkl', model.raw_rf_dir)
    if preloaded_rf_pickles:
        print('Preloaded raw rainfall data pickles found...')
        ds_RAINFALL = utils.open_pickle(preloaded_rf_pickles[0])
    else: 
        print('Creating pickles of raw rainfall data...')
        ds_RAINFALL = save_preloaded_raw_rf_data(model)

    print("Proceeding to do preliminary data cleaning...")
    ds_sliced = ds_CHOSEN_VARS_renamed.sel(
        level=slice(np.min(model.unique_pressure_lvls),np.max(model.unique_pressure_lvls)), 
        lat=slice(model.LAT_N,model.LAT_S), lon=slice(model.LON_W,model.LON_E),
        time=slice('1999', '2019'))
    ds_sliced_rhum = ds_sliced.rhum
    ds_sliced_rhum_no925 = ds_sliced_rhum.drop_sel({"level":925})
    ds_sliced_uwnd_only = ds_sliced.uwnd
    ds_sliced_vwnd_only = ds_sliced.vwnd
    ds_combined_sliced = xr.merge([ds_sliced_rhum_no925, ds_sliced_uwnd_only, ds_sliced_vwnd_only], compat='override')

    rf_ds_sliced = ds_RAINFALL.sel(lat=slice(model.LAT_S, model.LAT_N), lon=slice(model.LON_W,model.LON_E))
    print('Pickling domain- & feature-constrained input & RF datasets...')
    if model.period == "NE_mon":
        input_ds = ds_combined_sliced.sel(time=is_NE_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_NE_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_NE_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_NE_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
    elif model.period == "SW_mon":
        input_ds = ds_combined_sliced.sel(time=is_SW_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_SW_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_SW_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_SW_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
    elif model.period == "inter_mon":
        input_ds = ds_combined_sliced.sel(time=is_inter_mon(ds_combined_sliced['time.month']))
        rf_ds = rf_ds_sliced.sel(time=is_inter_mon(rf_ds_sliced['time.month']))
        input_ds_serialized_path = utils.to_pickle('raw_input_ds_inter_mon_serialized', input_ds, dest)
        rf_ds_serialized_path = utils.to_pickle('raw_rf_ds_inter_mon_serialized', rf_ds, dest)
        return input_ds_serialized_path, rf_ds_serialized_path
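The is_NE_mon/is_SW_mon/is_inter_mon helpers called above are defined elsewhere in the source module. A hypothetical sketch of the boolean month-mask pattern they follow; the month ranges below are assumptions for illustration only, not the repository's actual definitions:

def is_NE_mon(month):
    # assumed northeast-monsoon months (Dec-Mar); the real range lives in the source module
    return (month >= 12) | (month <= 3)

def is_SW_mon(month):
    # assumed southwest-monsoon months (Jun-Sep)
    return (month >= 6) & (month <= 9)

def is_inter_mon(month):
    # everything outside the two monsoon windows; "~" is elementwise on xarray month arrays
    return ~(is_NE_mon(month) | is_SW_mon(month))

# usage mirrors the calls above:
#   input_ds = ds_combined_sliced.sel(time=is_NE_mon(ds_combined_sliced['time.month']))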
Example no. 4
def cut_dataset(model, alpha, dest, dataset_path, ds_name):
    dataset = utils.open_pickle(dataset_path)
    try: 
        dataset = dataset.sel(
            level=slice(np.min(model.tl_model.unique_pressure_lvls),np.max(model.tl_model.unique_pressure_lvls)), 
            lat=slice(model.tl_model.LAT_N, model.tl_model.LAT_S), lon=slice(model.tl_model.LON_W, model.tl_model.LON_E),
            time=slice('1999', '2019'))
    except ValueError:
        dataset = dataset.sel(
            lat=slice(model.tl_model.LAT_S, model.tl_model.LAT_N), lon=slice(model.tl_model.LON_W, model.tl_model.LON_E),
            time=slice('1999', '2019'))
    if model.tl_model.period == "NE_mon":
        dataset = dataset.sel(time=is_NE_mon(dataset['time.month']))
    elif model.tl_model.period == "SW_mon":
        dataset = dataset.sel(time=is_SW_mon(dataset['time.month']))
    elif model.tl_model.period == "inter_mon":
        dataset = dataset.sel(time=is_inter_mon(dataset['time.month']))

    # hold out PSI years per fold; the final fold also absorbs the leftover runoff years
    if alpha != model.ALPHAs:
        end = alpha * model.PSI
    else:
        end = alpha * model.PSI + model.runoff_years
    gt_years = model.tl_model.years[(alpha-1)*model.PSI : end]
    train_years = np.delete(model.tl_model.years, np.arange((alpha-1)*model.PSI, end))
    test = utils.cut_year(dataset, np.min(gt_years), np.max(gt_years))
    train = utils.cut_year(dataset, np.min(train_years), np.max(train_years))
    time.sleep(1); gc.collect()

    utils.to_pickle(f'{ds_name}_test_alpha_{alpha}_preprocessed', test, dest)
    utils.to_pickle(f'{ds_name}_train_alpha_{alpha}_preprocessed', train, dest)
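The fold arithmetic above is easier to check with concrete numbers. A worked sketch assuming 21 years of data (1999-2019), ALPHAs = 5 folds, PSI = 4 years per fold, and runoff_years = 1 leftover year absorbed by the final fold; all values are illustrative:

import numpy as np

years = np.arange(1999, 2020)        # 21 years
ALPHAs, PSI, runoff_years = 5, 4, 1  # 5 folds x 4 years, plus 1 runoff year

for alpha in range(1, ALPHAs + 1):
    end = alpha * PSI + (runoff_years if alpha == ALPHAs else 0)
    gt_years = years[(alpha - 1) * PSI : end]                        # held-out "ground truth" years
    train_years = np.delete(years, np.arange((alpha - 1) * PSI, end))
    print(alpha, gt_years.min(), gt_years.max(), train_years.size)
# alpha=1 holds out 1999-2002 ... alpha=5 holds out 2015-2019 (PSI + runoff_years = 5 years)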
Example no. 5
    def train_kmeans(self, alpha=None):
        if alpha:
            optimal_k = self.tl_model.optimal_k
            print(f'>> self.alpha_model_dir: {self.alpha_model_dir}')
            print(f'>> optimal_k: {optimal_k}')
            found = list(Path(self.alpha_model_dir).glob(f'k-{optimal_k}_*'))
            if found:
                self.alpha_cluster_dir = found[0]
            else:
                self.alpha_cluster_dir = str(Path(self.alpha_model_dir) / f"k-{optimal_k}_NOT-singled-out-as-potential-cluster-for-this-split")
            os.makedirs(self.alpha_cluster_dir, exist_ok=True)
            print(f'>> self.alpha_cluster_dir: {self.alpha_cluster_dir}')
            destination = self.alpha_cluster_dir
            prefix = f'alpha_{alpha}_'
        else:
            optimal_k = self.optimal_k
            destination = self.cluster_dir
            prefix = ''

        print(f'optimal_k: "{optimal_k}", destination: "{destination}", prefix: "{prefix}"')
            
        for phrase in ('kmeans_model', 'labels_ar', 'labels_to_coords', 'label_markers', 'target_ds_withClusterLabels', 'dates_to_ClusterLabels', 'RFprec_to_ClusterLabels_dataset'):
            if utils.find(f'*{phrase}*.pkl', destination): 
                print(f'>>>>>>>>> "self.{phrase}_path" initialized.')
                setattr(self, f'{phrase}_path', utils.find(f'*{phrase}*.pkl', destination)[0])
            else:
                print(f'{utils.time_now()} - No KMeans model trained for {self.domain}, {self.period}, for {self.hyperparameters}, doing so now...')
                som_weights_to_nodes = utils.open_pickle(self.som_weights_to_nodes_path)
                samples, features = som_weights_to_nodes.shape
                km = KMeans(n_clusters=optimal_k).fit(som_weights_to_nodes)
                print(f"n{utils.time_now()} - K-means estimator fitted, sample size is {samples} and number of features is {features}.")

                self.kmeans_model_path = utils.to_pickle(f'{self.RUN_datetime}_{prefix}kmeans_model', km, destination)
                self.serialize_kmeans_products(km, alpha)
                break
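train_kmeans fits k-means on the flattened SOM node weights rather than on the raw samples. A toy-sized, self-contained sketch of that two-stage pipeline; shapes and hyperparameters are illustrative only:

import numpy as np
from minisom import MiniSom
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
standardized_stacked_arr = rng.normal(size=(200, 30))  # (time steps, stacked features)

gridsize = 4
som = MiniSom(gridsize, gridsize, standardized_stacked_arr.shape[1],
              sigma=1.0, learning_rate=0.5,
              neighborhood_function='gaussian', random_seed=1)
som.train_random(standardized_stacked_arr, 500)

# flatten the (gridsize, gridsize, features) weight cube into one row per node,
# in the same node ordering used when som_weights_to_nodes is serialized
som_weights = som.get_weights()
som_weights_to_nodes = np.array(
    [som_weights[c, r] for r in range(gridsize) for c in range(gridsize)])

km = KMeans(n_clusters=3).fit(som_weights_to_nodes)
print(km.labels_.shape)  # (16,): one cluster label per SOM node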
Example no. 6
    def prepare_alphafold_dataset(self, alpha):
        print(f'Preparing dataset for alpha-{alpha}')
        if alpha != self.ALPHAs:
            self.gt_years = np.array2string(self.tl_model.years[(alpha-1)*self.PSI : alpha*self.PSI], separator='-')
        else:
            self.gt_years = np.array2string(self.tl_model.years[(alpha-1)*self.PSI : alpha*self.PSI+self.runoff_years], separator='-')

        self.alpha_prepared_dir = str(Path(self.tl_model.prepared_data_dir) / f'alpha_{alpha}')
        self.alpha_model_dir = str(Path(self.tl_model.cluster_dir) / f'alpha_{alpha}_GT-{self.gt_years}')
        
        for pkl in utils.find(f'*alpha_{alpha}_preprocessed.pkl', self.alpha_prepared_dir):
            if "target_ds_train" in pkl: self.target_ds_preprocessed_path = pkl
            elif "rf_ds_train" in pkl: self.rf_ds_preprocessed_path = pkl
            elif "target_ds_test" in pkl: self.x_test_path = pkl
            elif "rf_ds_test" in pkl: self.y_test_path = pkl
        
        LocalModelParams(self, utils.open_pickle(self.target_ds_preprocessed_path))

        if utils.find('*standardized_stacked_arr.pkl', self.alpha_prepared_dir):
            self.alpha_standardized_stacked_arr_path = utils.find(f'*standardized_stacked_arr.pkl', self.alpha_prepared_dir)[0]
        else:
            self.alpha_standardized_stacked_arr_path = prepare.flatten_and_standardize_dataset(self, self.alpha_prepared_dir)
        print(f'--> Months for this dataset are: {self.month_names}')

        print(
            f'paths created @ prepare_alphafold_dataset():\nself.alpha_prepared_dir: "{self.alpha_prepared_dir}", \nself.alpha_model_dir: "{self.alpha_model_dir}"'
            f'\nself.target_ds_preprocessed_path: "{self.target_ds_preprocessed_path}", \nself.rf_ds_preprocessed_path: "{self.rf_ds_preprocessed_path}"'
            f'\nself.x_test_path: "{self.x_test_path}", \nself.y_test_path: "{self.y_test_path}"'
            f'\nself.alpha_standardized_stacked_arr_path: "{self.alpha_standardized_stacked_arr_path}", \nself.gt_years: {self.gt_years}'
            )
Example no. 7
def preprocess_time_series(model, dest, nfold_ALPHA=None, desired_res=0.75):   
    """
    Preparing datasets for use in training algorithms
    - dropping missing values
    - ensuring both target & input datasets have same dates 
    - coarsening spatial resolution of rainfall(target) dataset to desired resolution
    - pickling these "preprocessed" datasets
    """
    target_ds = utils.open_pickle(model.input_ds_serialized_path)
    rf_target_ds = utils.open_pickle(model.rf_ds_serialized_path)

    # removing NA rows, extraneous dates, & coarsening spatial resolution accordingly
    print(f'{utils.time_now()} - Preprocessing data now.')
    
    try:
        rf_target_ds['time'] =  rf_target_ds.indexes['time'].to_datetimeindex() #converting CFTimeIndex -> DateTime Index 
    except AttributeError:
        print('AttributeError: \'DatetimeIndex\' object has no attribute \'to_datetimeindex\', continuing regardless...')
        pass

    earliest_rf_reading, latest_rf_reading = rf_target_ds.isel(time=0).time.values, rf_target_ds.isel(time=-1).time.values
    earliest_target_ds_reading, latest_target_ds_reading = target_ds.isel(time=0).time.values, target_ds.isel(time=-1).time.values
    earliest_date = max(earliest_target_ds_reading, earliest_rf_reading)
    latest_date = min(latest_target_ds_reading, latest_rf_reading)

    rf_ds_preprocessed =  rf_target_ds.sel(time=slice(earliest_date, latest_date))
    target_ds = target_ds.sel(time=slice(earliest_date, latest_date))

    # keep only the dates present in both datasets
    valid_dates = np.intersect1d(target_ds.time.data, rf_ds_preprocessed.time.data)
    target_ds = target_ds.sel(time=valid_dates)
    coarsen_magnitude = int(desired_res/np.ediff1d(target_ds.isel(lon=slice(0,2)).lon.data)[0])
    print(f'Coarsen magnitude set at: {coarsen_magnitude} toward desired spatial resolution of {desired_res}')
    target_ds_preprocessed = target_ds.coarsen(lat=coarsen_magnitude, lon=coarsen_magnitude, boundary='trim').mean()
        
    target_ds_preprocessed_path = utils.to_pickle('target_ds_preprocessed', target_ds_preprocessed, dest)
    rf_ds_preprocessed_path = utils.to_pickle('rf_ds_preprocessed', rf_ds_preprocessed, dest)

    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)

    if nfold_ALPHA:
        for alpha in range(nfold_ALPHA):
            pass  # per-fold preprocessing stub, intentionally left unimplemented here

    return target_ds_preprocessed_path, rf_ds_preprocessed_path
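The coarsening step above averages the rainfall grid down to the desired resolution with xarray's coarsen. A minimal sketch on a synthetic 0.25-degree grid; the numbers are illustrative:

import numpy as np
import xarray as xr

lat = np.arange(0, 4, 0.25)    # synthetic 0.25-degree native grid
lon = np.arange(100, 104, 0.25)
rf = xr.DataArray(np.random.rand(lat.size, lon.size),
                  coords={'lat': lat, 'lon': lon}, dims=('lat', 'lon'))

desired_res = 0.75
coarsen_magnitude = int(desired_res / np.ediff1d(lon[:2])[0])  # 0.75 / 0.25 = 3
rf_coarse = rf.coarsen(lat=coarsen_magnitude, lon=coarsen_magnitude, boundary='trim').mean()
print(rf.shape, '->', rf_coarse.shape)  # (16, 16) -> (5, 5)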
Example no. 8
def TEST_all():
    dataset_in = open_pickle('../datasets/DSET_argentina.pkl')
    my_generator = get_enhansed_generator(segment_len=512,
                                          batch_size=20,
                                          dataset_in=dataset_in)
    batch = next(my_generator)
    print("shape of batch x= " + str(batch[0].shape))
    print("shape of batch y= " + str(batch[1].shape))
Example no. 9
def get_test_batch():
    from annotation.ann_generator import delete_baseline_wander, extract_first_leads, shrink_dataset
    dataset_in = open_pickle(dataset_path)
    _, test_dset = split_dict_annotations(dataset_in)
    dataset_only_one_channel = extract_first_leads(test_dset)
    delete_baseline_wander(dataset_only_one_channel['x'])
    dataset_shrinked = shrink_dataset(dataset_only_one_channel)
    return dataset_shrinked
Example no. 10
    def train_SOM(self, alpha=None):
        d_hp_dir_path = str(utils.models_dir / self.dir_hp_str)
        self.d_hp_dir_path = d_hp_dir_path
        os.makedirs(d_hp_dir_path, exist_ok=True)
        if not utils.find(f'*extent_{self.dir_str}.png', self.d_hp_dir_path):
            visualization.get_domain_geometry(self, self.d_hp_dir_path)
            
        models_dir_path = str(utils.models_dir / self.dir_hp_str / self.period) + f'_{self.month_names_joined}'
        os.makedirs(models_dir_path, exist_ok=True)
        self.models_dir_path = models_dir_path
        # utils.update_cfgfile('Paths', 'models_dir_path', self.models_dir_path)

        if alpha:
            destination = self.alpha_model_dir
            arr_path = self.alpha_standardized_stacked_arr_path
            prefix = f'alpha_{alpha}_'
            prompt = f'< alpha-{alpha} >'
        else:
            destination = self.models_dir_path
            arr_path = self.standardized_stacked_arr_path
            prefix = ''
            prompt = ''

        print(f'Destination: "{destination}", arr_path: "{arr_path}", prefix: "{prefix}"')

        if utils.find(f'*{prefix}som_model.pkl', destination):
            print(f'{utils.time_now()} - SOM model trained before, skipping...')
            self.som_model_path = utils.find(f'*{prefix}som_model.pkl', destination)[0]
        else:
            print(f'{utils.time_now()} - {prompt} No SOM model trained for {self.domain}, {self.period}, for {self.hyperparameters}, doing so now...')

            standardized_stacked_arr = utils.open_pickle(arr_path)

            sominitstarttime = timer(); print(f'{utils.time_now()} - Initializing MiniSom... ')
            som = MiniSom(self.gridsize, self.gridsize, # square
                        standardized_stacked_arr.shape[1],
                        sigma=self.sigma, learning_rate=self.learning_rate,
                        neighborhood_function='gaussian', random_seed=self.random_seed)
            """
            Note: initializing PCA for weights is faster (~1/2 hour), but for serialized arrays > 300mb, 
            chances are this will kill the RAM and halt the entire process. 
            """
            # try:
            #     som.pca_weights_init(standardized_stacked_arr)
            # except MemoryError as e:
            #     print(f'Memory error has occurred: \n{e}')
            print(f"Initialization took {utils.time_since(sominitstarttime)}.\n")

            trainingstarttime = timer(); print(f"{utils.time_now()} - Beginning training.")
            getattr(som, self.training_mode)(standardized_stacked_arr, self.iterations, verbose=True)
            q_error = np.round(som.quantization_error(standardized_stacked_arr), 2)
            print(f"Training complete. Q error is {q_error}, time taken for training is {utils.time_since(trainingstarttime)}s\n")

            self.som_model_path = utils.to_pickle(f'{self.RUN_datetime}_{prefix}som_model', som, destination)
Example no. 11
def normalize_data(X_train, X_test, folder_name):
    if os.path.isfile(folder_name + "mean.pkl"):
        mean = utils.open_pickle(folder_name + "mean.pkl")
        std = utils.open_pickle(folder_name + "std.pkl")
    else:
        train_matrix = []
        for x in X_train:
            train_matrix.extend(x)
        train_matrix = np.array(train_matrix)

        mean = train_matrix.mean(0)
        std = train_matrix.std(0)
        
        utils.save_pickle(folder_name + "mean.pkl", mean)
        utils.save_pickle(folder_name + "std.pkl", std)
    
    X_train = [(x-mean)/std for x in X_train]
    X_test = [(x-mean)/std for x in X_test]
    
    return np.array(X_train), np.array(X_test)
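A usage sketch for normalize_data with synthetic fixed-length recordings. The "cache/" folder name is a made-up example; because the function concatenates folder_name directly, it must end with a path separator, and (assuming utils.save_pickle writes to that path) the directory should already exist:

import numpy as np

X_train = [np.random.rand(50, 14) for _ in range(3)]  # (frames, channels) per recording
X_test = [np.random.rand(50, 14)]
X_train_n, X_test_n = normalize_data(X_train, X_test, "cache/")
# first call: per-channel mean/std are computed from the stacked training frames
# and pickled to cache/mean.pkl and cache/std.pkl; later calls reuse them
print(X_train_n.shape)  # (3, 50, 14)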
Example no. 12
    def assign_test_clusters_to_datasets(self):
        target_ds_preprocessed = utils.open_pickle(utils.find('*target_ds_preprocessed.pkl', self.test_prepared_data_dir)[0])
        rf_ds_preprocessed = utils.open_pickle(utils.find('*rf_ds_preprocessed.pkl', self.test_prepared_data_dir)[0])
        standardized_stacked_arr = utils.open_pickle(utils.find('*standardized_stacked_arr.pkl', self.test_prepared_data_dir)[0])
        
        self.n_datapoints = target_ds_preprocessed.time.shape[0] # length of xr_dataset
        self.lat_size = target_ds_preprocessed.lat.shape[0]
        self.lon_size = target_ds_preprocessed.lon.shape[0]
        self.months = np.unique(target_ds_preprocessed['time.month'].values) # month numbers
        self.month_names = [calendar.month_name[m][:3] for m in np.unique(target_ds_preprocessed['time.month'])]
        self.month_names_joined = '_'.join(self.month_names).upper() # to print months properly
        self.years = np.unique(target_ds_preprocessed['time.year'].values) # unique years
        self.X, self.Y = target_ds_preprocessed.lon, target_ds_preprocessed.lat

        km = utils.open_pickle(self.kmeans_model_path)
        predicted_clusters = km.predict(standardized_stacked_arr.astype(float))
        target_ds_withClusterLabels = target_ds_preprocessed.assign_coords(cluster=("time", predicted_clusters))
        dates_to_ClusterLabels = target_ds_withClusterLabels.cluster.reset_coords()
        RFprec_to_ClusterLabels_dataset = xr.merge([rf_ds_preprocessed, dates_to_ClusterLabels])
        utils.to_pickle('target_ds_withClusterLabels', target_ds_withClusterLabels, self.test_prepared_data_dir)
        utils.to_pickle('RFprec_to_ClusterLabels_dataset', RFprec_to_ClusterLabels_dataset, self.test_prepared_data_dir)
Example no. 13
def get_generators_permute(train_batch, test_batch):
    """чтобы возвращаелось none,4,512, а не none, 513, 3 как обычно
    """
    dataset_in = open_pickle(dataset_path)
    train_dset, test_dset = split_dict_annotations(dataset_in)
    my_generator_train = get_mulimask_generator_addon(segment_len,
                                                      batch_size=train_batch,
                                                      dataset_in=train_dset)
    my_generator_test = get_mulimask_generator_addon(segment_len,
                                                     batch_size=test_batch,
                                                     dataset_in=test_dset)
    return my_generator_train, my_generator_test
Example no. 14
def retrieve_and_insert_actual_RF_array(conn, all_test_prepared_data_dir,
                                        period, domain, test_date, sn, cluster,
                                        w_lim, e_lim, s_lim, n_lim):
    test_ds = utils.open_pickle(
        Path(all_test_prepared_data_dir) /
        f'{period}_mon_{domain}_prepared/RFprec_to_ClusterLabels_dataset.pkl')
    wholegrid_gt1mm_pred = (test_ds.sel(time=test_date).precipitationCal >
                            1).values
    SG_only_gt1mm_actual = (test_ds.precipitationCal.sel(
        lon=slice(w_lim, e_lim), lat=slice(s_lim, n_lim), time=test_date) >
                            1).values
    insert_actual_rf_array(conn, sn, period, domain, cluster, test_date,
                           SG_only_gt1mm_actual, wholegrid_gt1mm_pred)
Example no. 15
    def detect_som_products(self, alpha=None):
        if alpha:
            destination = self.alpha_model_dir
            arr_path = self.alpha_standardized_stacked_arr_path
            prefix = f'alpha_{alpha}_'
            prompt = f'< alpha-{alpha} >'
        else:
            destination = self.models_dir_path
            arr_path = self.standardized_stacked_arr_path
            prefix = ''
            prompt = ''

        print(f'Destination: "{destination}", arr_path: "{arr_path}", prefix: "{prefix}", prompt:"{prompt}"')

        for phrase in ('winner_coordinates', 'dmap', 'ar', 'som_weights_to_nodes'):
            if utils.find(f'*{prefix}{phrase}.pkl', destination): 
                p = utils.find(f'*{prefix}{phrase}.pkl', destination)
                print(f'{utils.time_now()} - {prefix}{phrase} is found @: \n{p[0]}')
                setattr(self, f'{phrase}_path', p[0])
            else:
                print(f'{utils.time_now()} - {prompt} Some SOM products are missing, generating all products now...')
                som = utils.open_pickle(self.som_model_path)
                standardized_stacked_arr = utils.open_pickle(arr_path)
                winner_coordinates = np.array([som.winner(x) for x in standardized_stacked_arr]) 
                dmap = som.distance_map()
                ar = som.activation_response(standardized_stacked_arr) 
                som_weights = som.get_weights() # weights for training via k-means
                som_weights_to_nodes = np.array(
                    [som_weights[c,r] for r in range(self.gridsize) for c in range(self.gridsize)]) #kmeans clustering
                
                self.winner_coordinates_path = utils.to_pickle(f'{prefix}winner_coordinates', winner_coordinates, destination)
                self.dmap_path = utils.to_pickle(f'{prefix}dmap', dmap, destination)
                self.ar_path = utils.to_pickle(f'{prefix}ar', ar, destination)
                self.som_weights_to_nodes_path = utils.to_pickle(f'{prefix}som_weights_to_nodes', som_weights_to_nodes, destination)

                break

        print('SOM products serialized.')
Example no. 16
def draw_shrinked():
    dataset_in = open_pickle('../datasets/DSET_argentina.pkl')
    dataset_only_one_channel = extract_first_lines(dataset_in)
    dset_shrinked = shrink_dataset(dataset_only_one_channel)
    before_x = dataset_only_one_channel['x'][0, 0, 0:30]
    after_x = dset_shrinked['x'][0, 0, 0:30]

    figname = "shrinked_ecg.png"
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=False)
    ax1.plot(before_x, 'k-', label="unshrunk")
    ax2.plot(after_x, 'm-', label="shrunk by " + str(SHRINK_FACTOR))

    plt.legend(loc=2)
    plt.savefig(figname)
Example no. 17
def get_test_ds_params(period, domain):
    path = Path(__file__).resolve().parents[1] / rf"data/external/casestudytesting_29_Jan/{period}_mon_{domain}_prepared"
    print(path / 'RFprec_to_ClusterLabels_dataset.pkl')
    test_ds = utils.open_pickle(str(path / 'RFprec_to_ClusterLabels_dataset.pkl'))
    w_lim = 103.5
    e_lim = 104.055
    s_lim = 1.1
    n_lim = 1.55
    sliced = test_ds.sel(lon=slice(w_lim, e_lim), lat=slice(s_lim, n_lim))
    rf_ds_lon = sliced.lon.values
    rf_ds_lat = sliced.lat.values
    return rf_ds_lon, rf_ds_lat
Example no. 18
def get_generators(train_batch, test_batch):
    """
    слепим два генератора- тестовый и трейновый
    :param train_batch: размер батча для трейнового генератора
    :param test_batch: размер батча для тестового генератора
    :return:
    """
    dataset_in = open_pickle('./DSET_argentina.pkl')
    train_dset, test_dset = split_dict_annotations(dataset_in)
    my_generator_train = get_enhansed_generator(segment_len,
                                                batch_size=train_batch,
                                                dataset_in=train_dset)
    my_generator_test = get_enhansed_generator(segment_len,
                                               batch_size=test_batch,
                                               dataset_in=test_dset)
    return my_generator_train, my_generator_test
Example no. 19
    def test_random_dates(self, dates_to_test, plots=5):
        test_dir = str(Path(__file__).resolve().parents[1] / 'test/2021_Jan_28_testing2020randomdates')
        self.test_RF_raw_data_dir = str(Path(__file__).resolve().parents[1] / "data/external/casestudytesting_29_Jan/raw/GPM_L3")
        self.test_indp_vars_raw_data_dir = str(Path(__file__).resolve().parents[1] / "data/external/casestudytesting_29_Jan/raw/downloadERA")
        self.test_prepared_data_dir = str(Path(__file__).resolve().parents[1] / f"data/external/casestudytesting_29_Jan/{self.period}_{self.dir_str}_prepared")
        os.makedirs(self.test_prepared_data_dir, exist_ok=True)

        number_of_test_plots_created = len(utils.find(f'*{self.period}_{self.dir_str}*_test_zscore_against_fullmodel*.png', test_dir))
        # number_of_test_plots_needed = date_to_test*plots
        if number_of_test_plots_created >= dates_to_test: 
            print(f"{number_of_test_plots_created} random dates have already been tested, please review at {test_dir}")
            return
        else:
            print(f"{number_of_test_plots_created} dates tested so far.")
            if not utils.find('*target_ds_preprocessed.pkl', self.test_prepared_data_dir) \
                or not utils.find('*rf_ds_preprocessed.pkl', self.test_prepared_data_dir) \
                    or not utils.find('*standardized_stacked_arr.pkl', self.test_prepared_data_dir):
                prepare.prep_for_testing_random_dates(self)

            if not utils.find('*target_ds_withClusterLabels.pkl', self.test_prepared_data_dir) \
                or not utils.find('*RFprec_to_ClusterLabels_dataset.pkl', self.test_prepared_data_dir): 
                self.assign_test_clusters_to_datasets()

            target_ds_withClusterLabels = utils.open_pickle(utils.find('*target_ds_withClusterLabels.pkl', self.test_prepared_data_dir)[0])
            if self.period == "NE_mon": target_ds_withClusterLabels = target_ds_withClusterLabels.sel(time=prepare.is_NE_mon(target_ds_withClusterLabels['time.month']))
            elif self.period == "SW_mon": target_ds_withClusterLabels = target_ds_withClusterLabels.sel(time=prepare.is_SW_mon(target_ds_withClusterLabels['time.month']))
            elif self.period == "inter_mon": target_ds_withClusterLabels = target_ds_withClusterLabels.sel(time=prepare.is_inter_mon(target_ds_withClusterLabels['time.month']))
            if target_ds_withClusterLabels.time.size == 0: 
                print(f'There are no dates available in your test dataset to use for this {self.period} monsoon period, please verify. Ending testing here.')
                return 

            random_sampled_dates = np.array(np.random.choice(target_ds_withClusterLabels.time.data, dates_to_test-number_of_test_plots_created, replace=False))
            random_sampled_dates.sort()
            print(random_sampled_dates)

            for i, sn in enumerate(range(number_of_test_plots_created+1, dates_to_test+1)):
            # for sn in range(number_of_test_plots_created+1, dates_to_test+1):
                print(f'Printing {sn} out of {dates_to_test} test plots now:')  
                # random_sampled_date = np.random.choice(target_ds_withClusterLabels.time.data, 1)
                random_sampled_date = [random_sampled_dates[i]]
                
                cluster = int(target_ds_withClusterLabels.sel(time=random_sampled_date).cluster.data)+1
                # run_test.print_test_date_abv_1mm_bool(self, test_dir, sn, random_sampled_date, cluster)
                run_test.print_test_date_abv_1mm_to500mm(self, test_dir, sn, random_sampled_date, cluster)   
                run_test.print_brier_gt1mm(self, test_dir, sn, random_sampled_date, cluster)   
                run_test.print_heavyrfforecastcomparison_gt50mm(self, test_dir, sn, random_sampled_date, cluster)   
                run_test.print_test_date_zscore_against_fullmodel(self, test_dir, sn, random_sampled_date, cluster)
Example no. 20
def flatten_and_standardize_dataset(model, dest):

    target_ds_preprocessed = utils.open_pickle(model.target_ds_preprocessed_path)
    target_ds_preprocessed = utils.remove_expver(target_ds_preprocessed)
    
    # reshaping
    reshapestarttime = timer(); print(f"{utils.time_now()} - Reshaping data now...")
    print(f"\n{utils.time_now()} - reshaping rhum dataarrays now, total levels to loop: {model.rhum_pressure_levels}.")

    reshaped_unnorma_darrays = {}
    reshaped_unnorma_darrays['rhum'], reshaped_unnorma_darrays['uwnd'], reshaped_unnorma_darrays['vwnd'] = {}, {}, {}

    for level in model.rhum_pressure_levels:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['rhum'][level] = np.reshape(
            target_ds_preprocessed.rhum.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))

    print(f"\n{utils.time_now()} - reshaping uwnd/vwnd dataarrays now, total levels to loop: {model.uwnd_vwnd_pressure_lvls}.")

    for level in model.uwnd_vwnd_pressure_lvls:
        print(f'@{level}... ')
        reshaped_unnorma_darrays['uwnd'][level] = np.reshape(
            target_ds_preprocessed.uwnd.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))
        reshaped_unnorma_darrays['vwnd'][level] = np.reshape(
            target_ds_preprocessed.vwnd.sel(level=level).values, (model.n_datapoints, model.lat_size*model.lon_size ))

    reshapetime = timer()-reshapestarttime; reshapetime = str(datetime.timedelta(seconds=reshapetime)).split(".")[0]; print(f'Time taken: {reshapetime}s.\n')

    # stacking unstandardized dataarrays
    stackingstarttime = timer(); print("Stacking unstandardized dataarrays now...")
    stacked_unstandardized_ds = np.hstack([reshaped_unnorma_darrays[var][lvl] for var in reshaped_unnorma_darrays for lvl in reshaped_unnorma_darrays[var]])

    stackingtime = timer()-stackingstarttime; stackingtime = str(datetime.timedelta(seconds=stackingtime)).split(".")[0]; print(f'Time taken: {stackingtime}s.\n')

    # standardizing the stacked dataarrays
    standardizestarttime = timer(); print("standardizing stacked dataarrays now...")
    print(f'"stacked_unstandardized_ds.shape" is {stacked_unstandardized_ds.shape}')
    transformer = RobustScaler(quantile_range=(25, 75))
    standardized_stacked_arr = transformer.fit_transform(stacked_unstandardized_ds) # som & kmeans training
    transformer.get_params()
    standardizetime = timer()-standardizestarttime; standardizetime = str(datetime.timedelta(seconds=standardizetime)).split(".")[0]; print(f'That took {standardizetime}s to complete.\n')

    standardized_stacked_arr_path = utils.to_pickle('standardized_stacked_arr', standardized_stacked_arr, dest)

    return standardized_stacked_arr_path
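The RobustScaler step above centres each stacked column on its median and scales by the interquartile range, so outliers in the meteorological fields do not dominate the standardization. A minimal demonstration on one synthetic column:

import numpy as np
from sklearn.preprocessing import RobustScaler

arr = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])  # one feature with an outlier
scaled = RobustScaler(quantile_range=(25, 75)).fit_transform(arr)
print(scaled.ravel())  # [-1.  -0.5  0.   0.5 48.5]: (x - median) / IQR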
Example no. 21
def retrieve_and_insert_predicted_RF_array(conn, period, domain, cluster,
                                           w_lim, e_lim, s_lim, n_lim):
    print(
        f'Inserting predicted_rf arr: {period}, {domain} - cluster: {cluster}')
    # pred_ds = utils.open_pickle(model.RFprec_to_ClusterLabels_dataset_path) ## NOT possible due to this only being for ONE domain-period
    pred_ds = utils.open_pickle([
        *utils.models_dir.glob(f'**/{domain}*/{period}*/k-*/*RFprec*.pkl')
    ][0])
    data_pred_wholegrid = (pred_ds.where(pred_ds.cluster == int(cluster) - 1,
                                         drop=True).precipitationCal > 1)
    data_pred_sgonly = (pred_ds.where(pred_ds.cluster == int(cluster) - 1,
                                      drop=True).precipitationCal > 1).sel(
                                          lon=slice(w_lim, e_lim),
                                          lat=slice(s_lim, n_lim))
    whole_grid_gt1mm_pred = np.mean(data_pred_wholegrid, axis=0).values
    SG_only_gt1mm_pred = np.mean(data_pred_sgonly, axis=0).values
    insert_predicted_rf_array(conn, period, domain, cluster,
                              SG_only_gt1mm_pred, whole_grid_gt1mm_pred)
Example no. 22
import os
import utils
import lightgbm as lgb
import numpy as np
import math
from google.cloud import storage

from flask import Flask, render_template, request
app = Flask(__name__)

CLOUD_STORAGE_BUCKET = os.environ['NYPAB_BUCKET']
gcs = storage.Client()
bucket = gcs.get_bucket(CLOUD_STORAGE_BUCKET)
# Prediction files loaded here
model = lgb.Booster(model_file='data/lgb_classifier.txt')
scalar = utils.open_pickle('data/scalar1.pkl')

ethnicity_mapping = {
    'Black or African American': 0,
    'Asian': 1,
    'Hispanic or Latinx': 2,
    'White': 3,
    'American Indian': 4,
    'Refused': 5,
    'Other Race': 6,
    'Unknown': 7
}

gender_mapping_complainant = {
    'Male': 0,
    'Female': 1,
Example no. 23
	# 15 - the number of keywords in the inspec label is between 5 and 10, and the model label is 5, 10 or 15; therefore, extract 15.
	# But this number will be adjusted (to match the number of keyphrases) when extracting the features
	# 50 - the number of keywords in the news label is 50
	if data == 'default':
		n_topics = 15
	elif data == 'news':
		n_topics = 50
		
	#store the keywords into a list
	all_topics = []
	for n_doc in corpus:
		all_topics.append(TopicRank(n_doc).get_top_n(n = n_topics))
	return all_topics

#Read pickle file from the model
txt_train_data = utils.open_pickle('./pickle/txt train data')
txt_test_data = utils.open_pickle('./pickle/txt test data')
xml_train_data = utils.open_pickle('./pickle/xml train data')
xml_test_data = utils.open_pickle('./pickle/xml test data')

#Read pickle file from the news dataset
news_train_data = utils.open_pickle('./pickle/500N-KPCrowd/train data')
news_test_data = utils.open_pickle('./pickle/500N-KPCrowd/test data')

#Read pickle file from the inspec dataset
inspec_train_data = utils.open_pickle('./pickle/inspec/train data')
inspec_test_data = utils.open_pickle('./pickle/inspec/test data')


#Processing on the model data
txt_train_topics = calculate_topic_rank(txt_train_data, data='default')
Example no. 24
    def generate_k(self, alpha=None):
        """
        - detection of metrics used to infer "k", i.e. the optimal_k value
        - creation of metrics pickles
        - creation of folders in models_dir to indicate potential k values/cluster combinations
        """
        metrics_dir = str(utils.metrics_dir / self.dir_hp_str / self.period) + f'_{self.month_names_joined}'
        os.makedirs(metrics_dir, exist_ok=True)
        self.metrics_dir_path = metrics_dir

        if alpha:
            self.alpha_metrics_dir_path = str(Path(self.tl_model.metrics_dir_path) / f'alpha_{alpha}')
            metric_destination = self.alpha_metrics_dir_path
            os.makedirs(metric_destination, exist_ok=True)
            model_destination = self.alpha_model_dir
            prefix = f'alpha_{alpha}_'
            prompt = f'< alpha-{alpha} >'
        else:
            metric_destination = self.metrics_dir_path
            model_destination = self.models_dir_path
            prefix = ''
            prompt = ''

        print(f'metric_destination: "{metric_destination}", model_destination: "{model_destination}", prefix: "{prefix}", prompt:"{prompt}"')
        
        for phrase in ('sil_peaks', 'ch_max', 'dbi_min', 'reasonable_sil', 'ch_dbi_tally', 'n_expected_clusters', 'dbs_err_dict'):
            if not utils.find(f'*{prefix}{phrase}*.pkl', metric_destination):
                print(f'{utils.time_now()} - {prompt} Not all metrics have been found in {metric_destination}, generating them now...')
                # generate all metric plots if even 1 is not found
                som_weights_to_nodes = utils.open_pickle(self.som_weights_to_nodes_path)

                ch_scores, dbi_scores = validation.print_elbow_CH_DBI_plot(self, som_weights_to_nodes, metric_destination)
                yellowbrick_expected_k = validation.print_yellowbrickkelbow(self, som_weights_to_nodes, metric_destination)
                silhouette_avgs, reasonable_silhoutte_scores_mt50 = validation.print_silhoutte_plots(self, som_weights_to_nodes, metric_destination)
                dbstop10 = validation.print_dbs_plots(self, som_weights_to_nodes, metric_destination)
                
                eps_ls, dbs_k_ls, dbs_noisepts_ls, dbs_labels = [], [], [], []
                for i in dbstop10:
                    eps_ls.append(i[0])
                    dbs_k_ls.append(i[1])
                    dbs_noisepts_ls.append(i[2])
                    dbs_labels.append(i[3])

                sil_peaks, ch_max, dbi_min, reasonable_sil, ch_dbi_tally, n_expected_clusters, dbs_err_dict = validation.get_cluster_determination_vars(
                    silhouette_avgs, ch_scores, dbi_scores, reasonable_silhoutte_scores_mt50, dbs_k_ls, dbs_noisepts_ls, yellowbrick_expected_k)

                for cluster_num in n_expected_clusters:
                    if alpha: save_dir = fr"{self.alpha_model_dir}/k-{cluster_num}"
                    else: save_dir = fr"{self.models_dir_path}/k-{cluster_num}"
                    
                    if cluster_num == ch_max: save_dir += '_CHhighestPeak'
                    if cluster_num == dbi_min: save_dir += '_lowestDBItrough'
                    if cluster_num in sil_peaks: save_dir += '_SilhouetteAVG-peak'
                    if cluster_num == reasonable_sil: save_dir += '_mostReasonable-basedon-Silhouetteplot'
                    if cluster_num in ch_dbi_tally: save_dir += '_CHpeak-and-DBItrough'
                    if cluster_num == yellowbrick_expected_k: save_dir += '_Yellowbrickexpected-K'
                    if cluster_num in dbs_err_dict: save_dir += f'_DBSCANclusterErrorValsExpected-{dbs_err_dict[cluster_num]}'

                    os.makedirs(save_dir, exist_ok=True)
                    print(f'save_dir: {save_dir}')

                self.ch_max_path = utils.to_pickle(f"{prefix}ch_max", ch_max, metric_destination)
                self.dbi_min_path = utils.to_pickle(f"{prefix}dbi_min", dbi_min, metric_destination)
                self.sil_peaks_path = utils.to_pickle(f"{prefix}sil_peaks", sil_peaks, metric_destination)
                self.reasonable_sil_path = utils.to_pickle(f"{prefix}reasonable_sil", reasonable_sil, metric_destination)
                self.ch_dbi_tally_path = utils.to_pickle(f"{prefix}ch_dbi_tally", ch_dbi_tally, metric_destination)
                self.yellowbrick_expected_k_path = utils.to_pickle(f"{prefix}yellowbrick_expected_k", yellowbrick_expected_k, metric_destination)
                self.dbs_err_dict_path = utils.to_pickle(f"{prefix}dbs_err_dict", dbs_err_dict, metric_destination)
                self.n_expected_clusters_path = utils.to_pickle(f"{prefix}n_expected_clusters", n_expected_clusters, metric_destination)

                break

        print(f'{utils.time_now()} - Internal validation of clusters has been run, please view the metrics folder @:\n{metric_destination} to determine the optimal cluster number.\n'
            f'\nYou can view the separate folders constructed for each discovered cluster combination. See @: \n{model_destination}.')
Example no. 25
def get_all_subject_data():

    overall_train_meanEEG = np.zeros((1, 14))
    overall_train_minEEG = np.zeros((1, 14))
    overall_train_maxEEG = np.zeros((1, 14))
    overall_train_stdEEG = np.zeros((1, 14))
    overall_train_meanPeaks = np.zeros((1, 14))
    overall_train_numPeaks = np.zeros((1, 14))
    overall_train_q25Peaks = np.zeros((1, 14))
    overall_train_q50Peaks = np.zeros((1, 14))
    overall_train_q75Peaks = np.zeros((1, 14))
    overall_train_yTruth = np.zeros(1)

    overall_test_meanEEG = np.zeros((1, 14))
    overall_test_minEEG = np.zeros((1, 14))
    overall_test_maxEEG = np.zeros((1, 14))
    overall_test_stdEEG = np.zeros((1, 14))
    overall_test_meanPeaks = np.zeros((1, 14))
    overall_test_numPeaks = np.zeros((1, 14))
    overall_test_q25Peaks = np.zeros((1, 14))
    overall_test_q50Peaks = np.zeros((1, 14))
    overall_test_q75Peaks = np.zeros((1, 14))
    overall_test_yTruth = np.zeros(1)

    overall_test_sub_meanEEG = np.zeros((1, 14))
    overall_test_sub_minEEG = np.zeros((1, 14))
    overall_test_sub_maxEEG = np.zeros((1, 14))
    overall_test_sub_stdEEG = np.zeros((1, 14))
    overall_test_sub_meanPeaks = np.zeros((1, 14))
    overall_test_sub_numPeaks = np.zeros((1, 14))
    overall_test_sub_q25Peaks = np.zeros((1, 14))
    overall_test_sub_q50Peaks = np.zeros((1, 14))
    overall_test_sub_q75Peaks = np.zeros((1, 14))
    overall_test_sub_yTruth = np.zeros(1)

    X_train, Y_train = utils.open_pickle('eng_train')
    X_val, Y_val = utils.open_pickle('eng_val')
    X_test, Y_test = utils.open_pickle('eng_test')
    X_sub_test, Y_sub_test = utils.open_pickle('sub_test')

    for i in range(0, len(X_train)):
        data_point = X_train[i]
        y_val = Y_train[i]

        meanEEG, minEEG, maxEEG, stdEEG, meanPeaks, numPeaks, q25Peaks, q50Peaks, q75Peaks, yTruth = get_EEG_features(
            data_point, y_val)
        # print(meanEEG.shape)
        # print(q50Peaks.shape)

        overall_train_meanEEG = np.vstack((overall_train_meanEEG, meanEEG))
        overall_train_minEEG = np.vstack((overall_train_minEEG, minEEG))
        overall_train_maxEEG = np.vstack((overall_train_maxEEG, maxEEG))
        overall_train_stdEEG = np.vstack((overall_train_stdEEG, stdEEG))
        overall_train_q25Peaks = np.vstack((overall_train_q25Peaks, q25Peaks))
        overall_train_q50Peaks = np.vstack((overall_train_q50Peaks, q50Peaks))
        overall_train_q75Peaks = np.vstack((overall_train_q75Peaks, q75Peaks))
        overall_train_meanPeaks = np.vstack(
            (overall_train_meanPeaks, meanPeaks))
        overall_train_numPeaks = np.vstack((overall_train_numPeaks, numPeaks))
        overall_train_yTruth = np.vstack((overall_train_yTruth, yTruth))

    for i in range(0, len(X_val)):
        data_point = X_val[i]
        y_val = Y_val[i]

        meanEEG, minEEG, maxEEG, stdEEG, meanPeaks, numPeaks, q25Peaks, q50Peaks, q75Peaks, yTruth = get_EEG_features(
            data_point, y_val)
        # print(meanEEG.shape)
        # print(q50Peaks.shape)

        overall_train_meanEEG = np.vstack((overall_train_meanEEG, meanEEG))
        overall_train_minEEG = np.vstack((overall_train_minEEG, minEEG))
        overall_train_maxEEG = np.vstack((overall_train_maxEEG, maxEEG))
        overall_train_stdEEG = np.vstack((overall_train_stdEEG, stdEEG))
        overall_train_q25Peaks = np.vstack((overall_train_q25Peaks, q25Peaks))
        overall_train_q50Peaks = np.vstack((overall_train_q50Peaks, q50Peaks))
        overall_train_q75Peaks = np.vstack((overall_train_q75Peaks, q75Peaks))
        overall_train_meanPeaks = np.vstack(
            (overall_train_meanPeaks, meanPeaks))
        overall_train_numPeaks = np.vstack((overall_train_numPeaks, numPeaks))
        overall_train_yTruth = np.vstack((overall_train_yTruth, yTruth))

    for i in range(0, len(X_test)):
        data_point = X_test[i]
        y_val = Y_test[i]

        meanEEG, minEEG, maxEEG, stdEEG, meanPeaks, numPeaks, q25Peaks, q50Peaks, q75Peaks, yTruth = get_EEG_features(
            data_point, y_val)
        # print(meanEEG.shape)
        # print(q50Peaks.shape)

        overall_test_meanEEG = np.vstack((overall_test_meanEEG, meanEEG))
        overall_test_minEEG = np.vstack((overall_test_minEEG, minEEG))
        overall_test_maxEEG = np.vstack((overall_test_maxEEG, maxEEG))
        overall_test_stdEEG = np.vstack((overall_test_stdEEG, stdEEG))
        overall_test_q25Peaks = np.vstack((overall_test_q25Peaks, q25Peaks))
        overall_test_q50Peaks = np.vstack((overall_test_q50Peaks, q50Peaks))
        overall_test_q75Peaks = np.vstack((overall_test_q75Peaks, q75Peaks))
        overall_test_meanPeaks = np.vstack((overall_test_meanPeaks, meanPeaks))
        overall_test_numPeaks = np.vstack((overall_test_numPeaks, numPeaks))
        overall_test_yTruth = np.vstack((overall_test_yTruth, yTruth))

    for i in range(0, len(X_sub_test)):
        data_point = X_sub_test[i]
        y_val = Y_sub_test[i]

        meanEEG, minEEG, maxEEG, stdEEG, meanPeaks, numPeaks, q25Peaks, q50Peaks, q75Peaks, yTruth = get_EEG_features(
            data_point, y_val)
        # print(meanEEG.shape)
        # print(q50Peaks.shape)

        overall_test_sub_meanEEG = np.vstack(
            (overall_test_sub_meanEEG, meanEEG))
        overall_test_sub_minEEG = np.vstack((overall_test_sub_minEEG, minEEG))
        overall_test_sub_maxEEG = np.vstack((overall_test_sub_maxEEG, maxEEG))
        overall_test_sub_stdEEG = np.vstack((overall_test_sub_stdEEG, stdEEG))
        overall_test_sub_q25Peaks = np.vstack(
            (overall_test_sub_q25Peaks, q25Peaks))
        overall_test_sub_q50Peaks = np.vstack(
            (overall_test_sub_q50Peaks, q50Peaks))
        overall_test_sub_q75Peaks = np.vstack(
            (overall_test_sub_q75Peaks, q75Peaks))
        overall_test_sub_meanPeaks = np.vstack(
            (overall_test_sub_meanPeaks, meanPeaks))
        overall_test_sub_numPeaks = np.vstack(
            (overall_test_sub_numPeaks, numPeaks))
        overall_test_sub_yTruth = np.vstack((overall_test_sub_yTruth, yTruth))

    # overall_train = np.dstack((overall_train_q75Peaks, overall_train_numPeaks, overall_train_meanPeaks, overall_train_stdEEG, overall_train_maxEEG, overall_train_minEEG))
    # overall_test = np.dstack((overall_test_q75Peaks, overall_test_numPeaks, overall_test_meanPeaks, overall_test_stdEEG, overall_test_maxEEG, overall_test_minEEG))

    return overall_train_maxEEG, overall_train_yTruth, overall_test_maxEEG, overall_test_yTruth, overall_test_sub_maxEEG, overall_test_sub_yTruth
Example no. 26
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import utils
from tensorflow.contrib.tensorboard.plugins import projector

PATH = os.getcwd()
LOG_DIR = PATH + '/project-tensorboard/log-1'
MODEL_DIR = os.path.join("F:/", "DL-code", "text2shape-data",
                         "nrrd_256_filter_div_32_solid")
# METADATA_DIR = os.path.join('project-tensorboard', 'log-1', 'metadata.tsv')
# SPRITE_DIR = os.path.join('project-tensorboard', 'log-1', 'sprite.jpg')

embeddings = utils.open_pickle(
    os.path.join("F:/", "DL-code", "text2shape-data", "shapenet",
                 "shapenet-embeddings", "text_embeddings_train.p"))
sample_tuple = embeddings['caption_embedding_tuples'][0]
embedding_shape = list(sample_tuple[3].shape)
assert len(embedding_shape) == 1

embedding_data = [item[3] for item in embeddings['caption_embedding_tuples']]
model_id = [item[2] for item in embeddings['caption_embedding_tuples']]
print("embedding shape: ", np.shape(embedding_data))
print("model_id shape: ", np.shape(model_id))

thumb_dir = os.path.join(MODEL_DIR, "resize_models")
image_data = utils.get_images(thumb_dir)

feat_cols = ['col' + str(i) for i in range(np.shape(embedding_data)[1])]
df = pd.DataFrame(embedding_data, columns=feat_cols)
Example no. 27
    def __init__(self, filename, uncertainty):
        self.uncertainty = uncertainty  # this is the central parameter of the classifier. 5 shouldn't be too aggressive
        self.filename = filename
        from utils import open_pickle
        self.dic, self.total_ngood, self.total_links = open_pickle(filename, ({}, 0, 0))
Example no. 28
import datamodel
import utils
import cPickle
from bayesian import Bayesian
old_days=30
novel_days=1
hist_bins=8
chronology=utils.open_pickle("novelty.pck",{})
filter=Bayesian('novelty_bayes.pck',5)

def predict(link):
    words=title_words(link.title)
    if not words:
        return 0.
    novelty=sum(1. for w in words if isnovel(w,link.date))/len(words)
    return filter.predict(["novelty_%d"%int(novelty*hist_bins)]) > 0.

def train(links):
    chronology={}
    for l in links:
        words=title_words(l.title)
        for w in words:
            if w in chronology:
                chronology[w].append(l.date)
            else:
                chronology[w]=[l.date]
    cPickle.dump(chronology, open("novelty.pck","wb"), -1)  # -1 = highest pickle protocol
    training_set=[]
    for l in links:
        words=title_words(l.title)
        novelty=sum(1. for w in words if isnovel(w,l.date))/len(words)
Example no. 29
def print_test_date_abv_1mm_to500mm(model, dest, sn, random_sampled_date,
                                    cluster):

    RFprec_to_ClusterLabels_dataset = utils.open_pickle(
        utils.find('*RFprec_to_ClusterLabels_dataset.pkl',
                   model.test_prepared_data_dir)[0])
    date_split = pd.DatetimeIndex(random_sampled_date).strftime(
        "%Y-%m-%d").values
    print(f'{utils.time_now()} - printing >1mm plot for {date_split}')

    rf_random_choice = RFprec_to_ClusterLabels_dataset.sel(
        time=random_sampled_date).precipitationCal[0]
    rf_random_choice_gt1mm = np.ma.masked_where(rf_random_choice <= 1,
                                                rf_random_choice)

    rf_ds_lon = RFprec_to_ClusterLabels_dataset.lon
    rf_ds_lat = RFprec_to_ClusterLabels_dataset.lat

    fig = plt.Figure(figsize=(12, 15))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    fig.suptitle(f"RF received {date_split[0]} over 1mm",
                 fontweight='bold',
                 fontsize=16,
                 y=.95)
    ax.set_title(f"Predicted cluster: {cluster}. \n"\
                f"Areas in grey denote 0.0-0.99mm RF, and considered as no rain occurred.",
                fontsize=14, y=1.04)

    ax.set_facecolor('silver')
    ax.set_extent(
        [model.LON_W - 1, model.LON_E + 1, model.LAT_S - 1, model.LAT_N + 1])
    ax.coastlines(
        "50m",
        linewidth=.8,
        color='k',
    )
    ax.add_feature(cf.BORDERS, linewidth=.5, color='k', linestyle='dashed')

    a = plt.cm.pink(np.linspace(.9, .2, 2))
    b = plt.cm.gnuplot2(np.linspace(0.4, .9, 6))
    all_colors = np.vstack((a, b))
    terrain_map = colors.LinearSegmentedColormap.from_list(
        'terrain_map', all_colors)

    RF = ax.contourf(rf_ds_lon,
                     rf_ds_lat,
                     rf_random_choice_gt1mm.T,
                     np.linspace(0, 500, 501),
                     cmap=terrain_map,
                     extend='max')

    cbar_rf = fig.colorbar(RF, label='RF (mm)', orientation='horizontal', \
                            pad=0.05, shrink=.8, ticks=np.arange(0,500,50))
    cbar_rf.ax.xaxis.set_ticks_position('top')
    cbar_rf.ax.xaxis.set_label_position('top')
    ax.set_xticks(np.round(np.linspace(model.LON_W, model.LON_E, 10)),
                  crs=ccrs.PlateCarree())
    ax.xaxis.tick_top()
    ax.set_xlabel('')

    ax.set_yticks(np.round(np.linspace(model.LAT_S, model.LAT_N, 10)),
                  crs=ccrs.PlateCarree())
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.set_ylabel('')

    fn = f'{dest}/{model.period}_{model.dir_str}_clus_{cluster}_test_abv1mm_to500_sn{sn}_{date_split}.png'
    fig.savefig(fn, bbox_inches='tight', pad_inches=1)
    print(f'Extent saved @:\n{fn}')
    plt.close('all')
Example no. 30
    def __repr__(self):
        return ",".join(self.words).encode('utf-8')

class PredicateClassifier:
    def __init__(self,predicate):
        self.predicate=predicate
    def train(self,titles,weights,evaluations):
        """titles is [[words]]"""
        total=sum(weights[n]*evaluations[n] for n,words in enumerate(titles) if self.predicate(words))
        self.wordgood=1. if total >= 0 else -1.
    def predict(self,title):
        if self.predicate(title):
            return self.wordgood
        else: return 0.

trained=open_pickle("adaboost.pck",[])

def predict(link):
    #words=tokenize(link.title)
    words=mash_post(link)
    if sum(alpha * c.predict(words) for c,alpha in trained) >= 0:
        return 1.
    else:
        return -1.

def train(links):
    from math import exp,fabs,log
    fwords=most_frequent_words()
    classifiers=[PredicateClassifier(HasWordsPredicate([w])) for w in fwords]
    #classifiers.extend(PredicateClassifier(HasWordsPredicate(duo)) for duo in most_frequent_duos(fwords))
    titles=[mash_post(l) for l in links]
Example no. 31
def print_heavyrfforecastcomparison_gt50mm(model, dest, sn,
                                           random_sampled_date, cluster):

    date_split = pd.DatetimeIndex(random_sampled_date).strftime(
        "%Y-%m-%d").values
    print(
        f'{utils.time_now()} - printing heavyrfforecastcomparison >50mm plot for {date_split}'
    )

    test_ds = utils.open_pickle(
        utils.find('*RFprec_to_ClusterLabels_dataset.pkl',
                   model.test_prepared_data_dir)[0])
    rf_ds_lon = test_ds.lon
    rf_ds_lat = test_ds.lat
    test_ds_random_date = test_ds.sel(time=random_sampled_date)

    training_rf_ds = utils.open_pickle(
        model.RFprec_to_ClusterLabels_dataset_path)
    clus_size = training_rf_ds.where(training_rf_ds.cluster == cluster,
                                     drop=True).time.size
    pred_gt50mm = np.mean(training_rf_ds.where(
        training_rf_ds.cluster == cluster - 1, drop=True).sel(
            lon=slice(model.LON_W, model.LON_E),
            lat=slice(model.LAT_S, model.LAT_N)).precipitationCal > 50,
                          axis=0).values * 100
    pred_gt50mm = np.ma.masked_where(pred_gt50mm == 0, pred_gt50mm)
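    # pred_gt50mm is the empirical per-cell chance (%) of >50mm among the
    # cluster's training days; exact zeros are masked so they render as white.
    # Note the `cluster - 1` selection here versus `cluster` in the clus_size
    # count above; presumably labels are displayed 1-indexed but stored 0-indexed.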

    gt_arr_gt50mm = (test_ds_random_date.precipitationCal > 50)[0].values
    gt_arr_gt150mm = (test_ds_random_date.precipitationCal > 150)[0].values
    gt_arr_gt250mm = (test_ds_random_date.precipitationCal > 250)[0].values
    gt_arr_gt500mm = (test_ds_random_date.precipitationCal > 500)[0].values

    fig = plt.Figure(figsize=(12, 15))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    fig.suptitle(
        f"Comparison between actual heavy RF chance occurrences on {date_split[0]} to \npredicted forecast of heavy RF in cluster {cluster}.",
        fontweight='bold',
        fontsize=16,
        y=1.02)

    ax.set_title(f'Number of training dates in cluster {cluster}: {clus_size}. \n'
                 'Areas in white indicate a 0.0% predicted chance of heavy RF. \n'
                 'Hatched patterns mark actual RF: blue for >50mm, lime-green for >150mm, \n'
                 'magenta for >250mm, and aqua for >500mm.', y=1.07)

    ax.set_facecolor('w')
    ax.set_extent(
        [model.LON_W - 1, model.LON_E + 1, model.LAT_S - 1, model.LAT_N + 1])
    ax.coastlines(
        "50m",
        linewidth=.8,
        color='k',
    )
    ax.add_feature(cf.BORDERS, linewidth=.5, color='k', linestyle='dashed')

    zero_to_ten = plt.cm.pink(np.linspace(1, .2, 3))
    eleven_to_25 = plt.cm.gist_earth(np.linspace(0.75, 0.2, 5))
    twnty5_to_40 = plt.cm.gist_stern(np.linspace(0.3, 0.1, 5))
    all_colors = np.vstack((zero_to_ten, eleven_to_25, twnty5_to_40))
    terrain_map = colors.LinearSegmentedColormap.from_list(
        'terrain_map', all_colors)

    ax.set_xticks(np.round(np.linspace(model.LON_W, model.LON_E, 10)),
                  crs=ccrs.PlateCarree())
    ax.xaxis.tick_top()
    ax.set_xlabel('')

    ax.set_yticks(np.round(np.linspace(model.LAT_S, model.LAT_N, 10)),
                  crs=ccrs.PlateCarree())
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.set_ylabel('')

    # predicted chance of heavy RF
    contf_predictions = ax.contourf(rf_ds_lon,
                                    rf_ds_lat,
                                    pred_gt50mm.T,
                                    np.arange(0, 50, 5),
                                    cmap=terrain_map,
                                    extend='max')

    cbar_rf = fig.colorbar(contf_predictions, label='Predicted chance of heavy RF (%)',
                           orientation='horizontal', pad=0.08, shrink=.8,
                           ticks=np.arange(0, 50, 5))
    cbar_rf.ax.xaxis.set_ticks_position('top')
    cbar_rf.ax.xaxis.set_label_position('top')

    # actual zones of heavy RF (>50mm)
    rf_gt50mm = ax.contourf(rf_ds_lon,
                            rf_ds_lat,
                            gt_arr_gt50mm.T,
                            levels=[-1, 0, 1],
                            colors='none',
                            hatches=[None, '///'])
    rf_gt50mm.collections[1].set_edgecolor('royalblue')
    rf_gt50mm.collections[1].set_linewidth(0.05)

    rf_gt150mm = ax.contourf(rf_ds_lon,
                             rf_ds_lat,
                             gt_arr_gt150mm.T,
                             levels=[-1, 0, 1],
                             colors='none',
                             hatches=[None, '\\\\\\'])
    rf_gt150mm.collections[1].set_edgecolor('lime')
    rf_gt150mm.collections[1].set_linewidth(0.05)

    rf_gt250mm = ax.contourf(rf_ds_lon,
                             rf_ds_lat,
                             gt_arr_gt250mm.T,
                             levels=[-1, 0, 1],
                             colors='none',
                             hatches=[None, '...XX'])
    rf_gt250mm.collections[1].set_edgecolor('magenta')
    rf_gt250mm.collections[1].set_linewidth(0.1)

    rf_gt500mm = ax.contourf(rf_ds_lon,
                             rf_ds_lat,
                             gt_arr_gt500mm.T,
                             levels=[-1, 0, 1],
                             colors='none',
                             hatches=[None, 'XX*'])
    rf_gt500mm.collections[1].set_edgecolor('aqua')
    rf_gt500mm.collections[1].set_linewidth(0.1)

    fn = f'{dest}/{model.period}_{model.dir_str}_clus_{cluster}_test_heavyrfforecastcomparison_gt50mm_v2_sn{sn}_{date_split}.png'
    fig.savefig(fn, bbox_inches='tight', pad_inches=1)
    print(f'Extent saved @:\n{fn}')
    plt.close('all')
Example no. 32
def print_brier_gt1mm(model, dest, sn, random_sampled_date, cluster):

    date_split = pd.DatetimeIndex(random_sampled_date).strftime(
        "%Y-%m-%d").values
    print(f'{utils.time_now()} - printing Brier >1mm plot for {date_split}')

    test_ds = utils.open_pickle(
        utils.find('*RFprec_to_ClusterLabels_dataset.pkl',
                   model.test_prepared_data_dir)[0])
    rf_ds_lon = test_ds.lon
    rf_ds_lat = test_ds.lat
    test_ds_random_date = test_ds.sel(time=random_sampled_date)
    test_ds_random_date_gt1mm = test_ds_random_date.precipitationCal > 1
    gt_arr = test_ds_random_date_gt1mm[0].values

    training_rf_ds = utils.open_pickle(
        model.RFprec_to_ClusterLabels_dataset_path)
    clus_size = training_rf_ds.where(training_rf_ds.cluster == cluster,
                                     drop=True).time.size
    pred = np.mean(training_rf_ds.where(
        training_rf_ds.cluster == cluster - 1, drop=True).sel(
            lon=slice(model.LON_W, model.LON_E),
            lat=slice(model.LAT_S, model.LAT_N)).precipitationCal > 1,
                   axis=0).values

    gt_arr_flat = gt_arr.reshape(-1, 1)
    pred_flat = pred.reshape(-1, 1)

    gridded_brier_flat = np.array([
        np.apply_along_axis(func1d=brier_score_loss, axis=0, arr=e, y_prob=f)
        for e, f in zip(gt_arr_flat, pred_flat)
    ])
    gridded_brier = gridded_brier_flat.reshape(gt_arr.shape)
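    # For a single binary outcome per grid cell, the Brier score reduces to the
    # squared error, so an equivalent vectorized form (an assumption, not in the
    # original) would be:
    #     gridded_brier = (pred - gt_arr.astype(float)) ** 2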

    fig = plt.Figure(figsize=(12, 15))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    fig.suptitle(
        f"Brier scores for {date_split[0]} compared to \npredicted forecast of rainday (>1mm) for cluster {cluster}.",
        fontweight='bold',
        fontsize=16,
        y=1)

    ax.set_title(f'Number of training dates in cluster {cluster}: {clus_size}. \n'
                 'Scores approaching 0 indicate a better-calibrated predictive model, '
                 'while 0.25 typically corresponds to a 50% forecast regardless of outcome. \n'
                 'Areas covered by the white hatching did NOT receive any RF (i.e. <1mm), '
                 'while unhatched areas received >1mm of RF.', y=1.06)

    ax.set_facecolor('w')
    ax.set_extent(
        [model.LON_W - 1, model.LON_E + 1, model.LAT_S - 1, model.LAT_N + 1])
    ax.coastlines(
        "50m",
        linewidth=.8,
        color='k',
    )
    ax.add_feature(cf.BORDERS, linewidth=.5, color='k', linestyle='dashed')

    a = plt.cm.summer(np.linspace(0, 1, 6))
    b = plt.cm.autumn(np.linspace(1, 0, 4))
    all_colors = np.vstack((a, b))
    terrain_map = colors.LinearSegmentedColormap.from_list(
        'terrain_map', all_colors)

    ax.set_xticks(np.round(np.linspace(model.LON_W, model.LON_E, 10)),
                  crs=ccrs.PlateCarree())
    ax.xaxis.tick_top()
    ax.set_xlabel('')

    ax.set_yticks(np.round(np.linspace(model.LAT_S, model.LAT_N, 10)),
                  crs=ccrs.PlateCarree())
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.set_ylabel('')

    # brier: comparing >1mm predictions to GT >1mm
    briers = ax.contourf(rf_ds_lon,
                         rf_ds_lat,
                         gridded_brier.T,
                         np.linspace(0, 1, 11),
                         cmap=terrain_map,
                         extend='neither')

    cbar_rf = fig.colorbar(briers, label='Brier score', orientation='horizontal',
                           pad=0.07, shrink=.8, ticks=np.arange(0, 1.1, .1))
    cbar_rf.ax.xaxis.set_ticks_position('top')
    cbar_rf.ax.xaxis.set_label_position('top')

    # actual no-rain (<=1mm) zones
    rf_dots = ax.contourf(rf_ds_lon,
                          rf_ds_lat,
                          gt_arr.T,
                          levels=[-1, 0, 1],
                          colors='none',
                          hatches=['/-', None])
    rf_dots.collections[0].set_edgecolor('white')
    # rf_dots.collections[0].set_linewidth(0.)

    fn = f'{dest}/{model.period}_{model.dir_str}_clus_{cluster}_test_brier_gt1mm_v2_sn{sn}_{date_split}.png'
    fig.savefig(fn, bbox_inches='tight', pad_inches=1)
    print(f'Extent saved @:\n{fn}')
    plt.close('all')
Example no. 33
def print_test_date_zscore_against_fullmodel(model, dest, sn,
                                             random_sampled_date, cluster):

    RFprec_to_ClusterLabels_dataset = utils.open_pickle(
        utils.find('*RFprec_to_ClusterLabels_dataset.pkl',
                   model.test_prepared_data_dir)[0])
    rf_ds_lon = RFprec_to_ClusterLabels_dataset.lon
    rf_ds_lat = RFprec_to_ClusterLabels_dataset.lat
    date_split = pd.DatetimeIndex(random_sampled_date).strftime(
        "%Y-%m-%d").values
    print(f'{utils.time_now()} - printing z-score plot for {date_split}')
    rf_random_choice = RFprec_to_ClusterLabels_dataset.sel(
        time=random_sampled_date).precipitationCal[0]

    training_rf_ds = utils.open_pickle(
        model.RFprec_to_ClusterLabels_dataset_path)
    rf_for_random_choice_cluster = training_rf_ds.precipitationCal.where(
        training_rf_ds.cluster == cluster - 1, drop=True)
    gridmean = np.mean(rf_for_random_choice_cluster, axis=0)
    gridstd = np.std(rf_for_random_choice_cluster, axis=0)
    standardized_rf_random_choice = ((rf_random_choice - gridmean) /
                                     gridstd).values
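    # Caveat: cells where the cluster never recorded rain have gridstd == 0, so
    # the division yields inf/NaN there; a guard such as
    #     gridstd = np.where(gridstd > 0, gridstd, np.nan)
    # (an assumption, not in the original) would mask those cells instead.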

    fig = plt.Figure(figsize=(12, 15))
    ax = fig.add_subplot(111, projection=ccrs.PlateCarree())

    fig.suptitle(f"Z-Score for {date_split[0]}. Predicted cluster: {cluster}",
                 fontweight='bold',
                 fontsize=18,
                 y=.99,
                 ha='center')
    ax.set_title(f"Total dates/datapoints per grid in cluster {cluster}: "\
                f"{training_rf_ds.where(training_rf_ds.cluster==cluster, drop=True).time.size}"\
                f"\n-1.65<=Z<=1.65 == 90%\n-1.96<=Z<=1.96 == 95%\n-2.58<=Z<=2.58 == 99%",
                fontsize=14, y=1.04)

    ax.set_facecolor('w')
    ax.set_extent(
        [model.LON_W - 1, model.LON_E + 1, model.LAT_S - 1, model.LAT_N + 1])
    ax.coastlines("50m", linewidth=.5, color='w', alpha=1)
    ax.add_feature(cf.BORDERS, linewidth=.5, color='k', linestyle='dashed')

    two58_to_196 = plt.cm.gist_ncar(np.linspace(.75, .8, 3))
    one96_to_0 = plt.cm.copper(np.linspace(1, 0, 4))
    zero_to_196 = plt.cm.twilight_shifted(np.linspace(0, .4, 4))
    one96_to_258 = plt.cm.gist_rainbow(np.linspace(.55, .3, 3))
    all_colors = np.vstack(
        (two58_to_196, one96_to_0, zero_to_196, one96_to_258))
    terrain_map = colors.LinearSegmentedColormap.from_list(
        'terrain_map', all_colors)

    RF = ax.contourf(rf_ds_lon,
                     rf_ds_lat,
                     standardized_rf_random_choice.T,
                     np.linspace(-3.3333, 3.3333, 21),
                     alpha=1,
                     cmap=terrain_map,
                     extend='both')

    cbar_rf = fig.colorbar(RF, label='Z-Score of grid computed against grid-mean & grid-SD of whole model.',
                           orientation='horizontal', pad=0.05, shrink=.9,
                           ticks=[-2.58, -1.96, -1.65, 0, 1.65, 1.96, 2.58])
    cbar_rf.ax.tick_params(size=5)
    cbar_rf.ax.xaxis.set_ticks_position('top')
    cbar_rf.ax.xaxis.set_label_position('top')
    ax.set_xticks(np.round(np.linspace(model.LON_W, model.LON_E, 10)),
                  crs=ccrs.PlateCarree())
    ax.xaxis.tick_top()
    ax.set_xlabel('')

    ax.set_yticks(np.round(np.linspace(model.LAT_S, model.LAT_N, 10)),
                  crs=ccrs.PlateCarree())
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.set_ylabel('')

    fn = f'{dest}/{model.period}_{model.dir_str}_clus_{cluster}_test_zscore_against_fullmodel_sn{sn}_{date_split}.png'
    fig.savefig(fn, bbox_inches='tight', pad_inches=1)
    print(f'Extent saved @:\n{fn}')
    plt.close('all')
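For completeness, a hedged usage sketch for these test-plot helpers. Everything below is an illustrative assumption: in the original pipeline, `model` is a trained clustering model exposing the attributes used above (LON_W/LON_E/LAT_S/LAT_N, period, dir_str, test_prepared_data_dir, RFprec_to_ClusterLabels_dataset_path), and the remaining arguments come from the surrounding evaluation loop.

# Illustrative only -- `model` and the values below are assumptions, not taken
# from the original snippets.
dest = 'outputs/test_plots'                  # hypothetical output directory
sn = 1                                       # serial number used in filenames
random_sampled_date = ['2019-12-25']         # one sampled test date, as a sequence
cluster = 3                                  # cluster predicted for that date

print_brier_gt1mm(model, dest, sn, random_sampled_date, cluster)
print_heavyrfforecastcomparison_gt50mm(model, dest, sn, random_sampled_date, cluster)
print_test_date_zscore_against_fullmodel(model, dest, sn, random_sampled_date, cluster)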