def select_block_size_rf(nruns, group_type, loc_dict, Cvar_dict, idw_example_grid, shapefile,
                         file_path_elev, idx_list, cluster_num1, cluster_num2, cluster_num3,
                         expand_area, boreal_shapefile):
    '''Evaluate the standard deviation of MAE values based on consecutive runs of the
    cross-validation, in order to select the block/cluster size

    Parameters
    ----------
         nruns : int
              number of repetitions, must be greater than 1 so a standard deviation exists
         group_type : string
              whether using 'clusters' or 'blocks'
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         idw_example_grid  : ndarray
              used for reference of study area grid size
         shapefile : string
              path to the study area shapefile
         file_path_elev : string
              path to the elevation lookup file
         idx_list : int
              position of the elevation column in the lookup file
         cluster_num1-3 : int
              three cluster numbers to test, for blocking this must be one of three: 25, 16, 9
              for blocking you can enter None and 25, 16, 9 are used (in that order)
         expand_area : bool
              expand area by 200km; when True and using clusters, only the stations
              returned by GD.is_station_in_boreal are kept before clustering
         boreal_shapefile : string
              path to shapefile with the boreal zone

    Returns
    ----------
         int
              - block/cluster number with lowest stdev
         float
              - average MAE of all the runs for that cluster/block number
         float
              - standard deviation of the MAE over the runs for that cluster/block number

    Notes
    ----------
         An invalid group_type or nruns <= 1 prints a message and calls sys.exit().
    '''

    # Check the cheap arguments first so a bad call stops before any grouping work
    if group_type not in ('blocks', 'clusters'):
        print('Thats not a valid group type')
        sys.exit()

    if nruns <= 1:
        print('That is not enough runs to calculate the standard deviation!')
        sys.exit()

    group_numbers = [cluster_num1, cluster_num2, cluster_num3]

    # Get group dictionaries
    if group_type == 'blocks':
        # None falls back to the documented defaults; the blocks are built from the
        # same numbers that are later used for the fold count and the returned label
        group_numbers = [
            default if number is None else number
            for number, default in zip(group_numbers, (25, 16, 9))
        ]
        group_dictionaries = []
        for number in group_numbers:
            folds = mbk.make_block(idw_example_grid, number)
            # NOTE(review): loc_dict is passed to match the four-argument
            # sorting_stations calls in the spatial_kfold_* functions of this file
            group_dictionaries.append(
                mbk.sorting_stations(folds, shapefile, loc_dict, Cvar_dict))
    else:
        if expand_area:
            inBoreal = GD.is_station_in_boreal(loc_dict, Cvar_dict,
                                               boreal_shapefile)
            # Overwrite Cvar_dict so only the stations flagged as boreal remain
            Cvar_dict = {k: v for k, v in Cvar_dict.items() if k in inBoreal}
        group_dictionaries = [
            c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile, number,
                                file_path_elev, idx_list, False, False, False)
            for number in group_numbers
        ]

    # We want the same number of stations selected for each cluster number:
    # 5 folds x 25 clusters = 125 stations; 8 folds x 16 clusters = 128 stations, etc.
    # The target is 30% of the stations; it does not change between runs.
    target_stations = len(Cvar_dict) * 0.3
    fold_numbers = [int(round(target_stations / number))
                    for number in group_numbers]

    errors_by_group = [[] for _ in group_numbers]
    for _ in range(nruns):
        for number, fold_number, groups, errors in zip(
                group_numbers, fold_numbers, group_dictionaries,
                errors_by_group):
            errors.append(
                spatial_groups_rf(idw_example_grid, loc_dict, Cvar_dict,
                                  shapefile, number, fold_number, True,
                                  groups, file_path_elev, idx_list,
                                  expand_area))

    list_stdev = [statistics.stdev(errors) for errors in errors_by_group]
    index_min = list_stdev.index(min(list_stdev))
    lowest_stdev = list_stdev[index_min]

    selected_errors = errors_by_group[index_min]
    ave_MAE = sum(selected_errors) / len(selected_errors)
    cluster_select = group_numbers[index_min]

    print(selected_errors)
    print(ave_MAE)
    print(lowest_stdev)
    print(cluster_select)
    return cluster_select, ave_MAE, lowest_stdev
Example #2
0
def spatial_kfold_idw(idw_example_grid, loc_dict, Cvar_dict, shapefile, d,
                      file_path_elev, idx_list, block_num, blocking_type, return_error):
    '''Spatially blocked k-fold cross-validation procedure for IDW

    Every distinct group (cluster or block) is held out once: the surface is
    interpolated from the remaining stations and the absolute error is taken
    at the grid cell of each held-out station.

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         d : int
              the weighting for IDW interpolation
         file_path_elev : string
              path to the elevation lookup file (only forwarded to the clustering)
         idx_list : int
              position of the elevation column in the lookup file (only forwarded
              to the clustering)
         block_num : int
              number of blocks/clusters
         blocking_type : string
              whether to use clusters ('cluster') or blocks ('block')
         return_error : bool
              whether or not to return the error dictionary

    Returns
    ----------
         int
              - Return the block number just so we can later write it into the file to keep track
         float
              - MAE estimate, averaged over all the withheld stations
         dictionary
              - if return_error = True, a dictionary of the absolute error at each station
                when its fold was left out

    Notes
    ----------
         An invalid blocking_type prints a message and calls sys.exit().
    '''

    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(
            loc_dict, Cvar_dict, shapefile, block_num, file_path_elev, idx_list, False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        cluster = mbk.sorting_stations(
            np_array_blocks, shapefile, loc_dict, Cvar_dict)  # Now get the dictionary
    else:
        print('That is not a valid blocking method')
        sys.exit()

    # Project every station that has both a value and a location, once.
    # We need a projected system before making the distance matrix.
    projector = pyproj.Proj('esri:102001')
    projected_xy = {}
    for station_name in Cvar_dict:
        if station_name in loc_dict:
            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            x_proj, y_proj = projector(float(longitude), float(latitude))
            projected_xy[station_name] = (float(x_proj), float(y_proj))
    station_order = sorted(projected_xy)

    # total_bounds gives plain scalars (minx, miny, maxx, maxy) for the whole layer
    minx, miny, maxx, maxy = gpd.read_file(shapefile).total_bounds
    pixel_size = 10000
    num_col = int((maxx - minx) / pixel_size)
    num_row = int((maxy - miny) / pixel_size)

    absolute_error_dictionary = {}
    groups_complete = []  # keep a record of the groups already held out

    for group in cluster.values():
        if group in groups_complete:
            continue
        groups_complete.append(group)

        # This is the step where we hold back the fold
        held_out = [k for k, v in cluster.items() if v == group]
        held_out_lookup = set(held_out)
        training = [name for name in station_order
                    if name not in held_out_lookup]

        x_train = np.array([projected_xy[name][0] for name in training])
        y_train = np.array([projected_xy[name][1] for name in training])
        z_train = np.array([Cvar_dict[name] for name in training])

        # The grid spans the study area bounds and any training station outside them
        x_extent = np.append(x_train, [maxx, minx])
        y_extent = np.append(y_train, [maxy, miny])

        Yi = np.linspace(np.min(y_extent), np.max(y_extent), num_row)
        Xi = np.linspace(np.min(x_extent), np.max(x_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        # Sides of the triangle from each station to each cell, then the hypotenuse
        dist_x = np.subtract.outer(x_train, Xi)
        dist_y = np.subtract.outer(y_train, Yi)
        distance_matrix = np.hypot(dist_x, dist_y)

        # A zero distance gives an infinite weight; that is expected and replaced
        # below so the station value is assigned to the pixel directly underneath
        with np.errstate(divide='ignore'):
            weights = 1 / (distance_matrix**d)
        weights[np.isinf(weights)] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        idw_grid = np.dot(weights.T, z_train).reshape(num_row, num_col)

        # Compare at the grid cell of each withheld station
        for statLoc in held_out:
            x_proj, y_proj = projected_xy[statLoc]
            x_orig = int((x_proj - minx) / pixel_size)  # column
            y_orig = int((y_proj - miny) / pixel_size)  # row

            interpolated_val = idw_grid[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / \
        len(absolute_error_dictionary.values())
    if return_error:
        return block_num, MAE, absolute_error_dictionary
    else:
        return block_num, MAE
def spatial_kfold_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile, file_path_elev, elev_array, idx_list,
                     block_num, blocking_type, return_error):
    '''Spatially blocked k-fold cross-validation procedure for RF

    Every distinct group (cluster or block) is held out once: a random forest is
    fitted on the remaining stations (projected x, projected y, elevation), used to
    predict the whole grid, and the absolute error is taken at the grid cell of
    each held-out station.

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         file_path_elev : string
              path to the elevation lookup file
         elev_array : ndarray
              array for elevation, create using IDEW interpolation
              NOTE(review): the value passed in is not used; the grid elevations are
              always rebuilt from the lookup file, as in the previous implementation
         idx_list : int
              position of the elevation column in the lookup file
         block_num : int
              number of blocks/clusters
         blocking_type : string
              whether to use clusters ('cluster') or blocks ('block')
         return_error : bool
              whether or not to return the error dictionary

    Returns
    ----------
         int
              - Return the block number just so we can later write it into the file to keep track
         float
              - MAE estimate, averaged over all the withheld stations
         dictionary
              - if return_error = True, a dictionary of the absolute error at each station
                when its fold was left out

    Notes
    ----------
         An invalid blocking_type prints a message and calls sys.exit().
    '''

    # Selecting blocknum
    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile,
                                      block_num, file_path_elev, idx_list,
                                      False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict,
                                       Cvar_dict)  # Now get the dictionary
    else:
        print('That is not a valid blocking method')
        sys.exit()

    # Project every station that has both a value and a location, once
    projector = pyproj.Proj('esri:102001')
    projected_xy = {}
    for station_name in Cvar_dict:
        if station_name in loc_dict:
            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            x_proj, y_proj = projector(float(longitude), float(latitude))
            projected_xy[station_name] = (float(x_proj), float(y_proj))
    station_order = sorted(projected_xy)

    # total_bounds gives plain scalars (minx, miny, maxx, maxy) for the whole layer
    minx, miny, maxx, maxy = gpd.read_file(shapefile).total_bounds
    pixel_size = 10000
    num_col = int((maxx - minx) / pixel_size)
    num_row = int((maxy - miny) / pixel_size)

    absolute_error_dictionary = {}
    groups_complete = []  # keep a record of the groups already held out

    # The prediction grid only changes when the extent changes, so the (slow)
    # elevation lookup for the grid is reused while the extent stays the same
    grid_extent = None
    X_test = None

    for group in cluster.values():
        if group in groups_complete:
            continue
        groups_complete.append(group)

        # This is the step where we hold back the fold
        held_out = [k for k, v in cluster.items() if v == group]
        held_out_lookup = set(held_out)
        training = [name for name in station_order
                    if name not in held_out_lookup]

        x_train = np.array([projected_xy[name][0] for name in training])
        y_train = np.array([projected_xy[name][1] for name in training])
        z_train = np.array([Cvar_dict[name] for name in training])

        # The grid spans the study area bounds and any training station outside them
        x_extent = np.append(x_train, [maxx, minx])
        y_extent = np.append(y_train, [maxy, miny])
        extent = (float(np.min(x_extent)), float(np.max(x_extent)),
                  float(np.min(y_extent)), float(np.max(y_extent)))

        if extent != grid_extent:
            Yi = np.linspace(extent[2], extent[3], num_row)
            Xi = np.linspace(extent[0], extent[1], num_col)
            Xi, Yi = np.meshgrid(Xi, Yi)

            # Preparing the coordinates to send to the function that will get the
            # elevation grid; the elevation function takes a list of tuples
            grid_points = [
                tuple(pair)
                for pair in np.array((Xi.flatten(), Yi.flatten())).T.tolist()
            ]
            elev_grd_dict = GD.finding_data_frm_lookup(
                grid_points, file_path_elev, idx_list)

            # The keys are each x/y pair; rows follow the order of the returned dictionary
            X_test = np.column_stack((
                [key[0] for key in elev_grd_dict],
                [key[1] for key in elev_grd_dict],
                list(elev_grd_dict.values()),
            ))
            grid_extent = extent

        # Repeat the elevation lookup for just the training stations
        station_points = list(zip(x_train, y_train))
        elev_dict = GD.finding_data_frm_lookup(station_points, file_path_elev,
                                               idx_list)
        source_elev = np.array([elev_dict[point] for point in station_points])

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)

        X_train = np.column_stack((x_train, y_train, source_elev))
        # The target is passed as a 1d array, which is the shape fit() expects
        reg.fit(X_train, z_train)

        rf_grid = reg.predict(X_test).reshape(num_row, num_col)

        # Calc the absolute error at the pixel loc of each withheld station
        for statLoc in held_out:
            x_proj, y_proj = projected_xy[statLoc]
            x_orig = int((x_proj - minx) / pixel_size)  # column
            y_orig = int((y_proj - miny) / pixel_size)  # row

            interpolated_val = rf_grid[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / \
        len(absolute_error_dictionary.values())
    if return_error:
        return block_num, MAE, absolute_error_dictionary
    else:
        return block_num, MAE
Example #4
0
def spatial_kfold_tps(idw_example_grid, loc_dict, Cvar_dict, shapefile, phi,
                      file_path_elev, idx_list, block_num, blocking_type,
                      return_error, calc_phi):
    '''Spatially blocked k-folds cross-validation procedure for thin plate splines

    Every distinct group (cluster or block) is held out once: the spline is fitted
    to the remaining stations (placed on the grid cell they fall in), evaluated on
    the whole grid, and the absolute error is taken at the grid cell of each
    held-out station.

    Parameters
    ----------
         idw_example_grid  : ndarray
              used for reference of study area grid size
         loc_dict : dictionary
              the latitude and longitudes of the daily/hourly stations
         Cvar_dict : dictionary
              dictionary of weather variable values for each station
         shapefile : string
              path to the study area shapefile
         phi : float
              smoothing parameter for the thin plate spline, if 0 no smoothing;
              must not be None
         file_path_elev : string
              path to the elevation lookup file (only forwarded to the clustering)
         idx_list : int
              position of the elevation column in the lookup file (only forwarded
              to the clustering)
         block_num : int
              number of blocks/clusters
         blocking_type : string
              whether to use clusters ('cluster') or blocks ('block')
         return_error : bool
              whether or not to return the error dictionary
         calc_phi : bool
             NOTE(review): accepted for interface compatibility but not used; phi is
             never calculated inside this function, so it always has to be supplied

    Returns
    ----------
         int
              - Return the block number just so we can later write it into the file to keep track
         float
              - MAE estimate, averaged over all the withheld stations
         dictionary
              - if return_error = True, a dictionary of the absolute error at each station
                when its fold was left out

    Raises
    ----------
         ValueError
              - if phi is None

    Notes
    ----------
         An invalid blocking_type prints a message and calls sys.exit().
    '''
    if blocking_type not in ('cluster', 'block'):
        print('That is not a valid blocking method')
        sys.exit()

    # Fail before the grouping work instead of inside the spline fit
    if phi is None:
        raise ValueError(
            'phi must be a number: calc_phi is not implemented in '
            'spatial_kfold_tps, so phi cannot be None')

    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile,
                                      block_num, file_path_elev, idx_list,
                                      False, False, False)
    else:
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict,
                                       Cvar_dict)  # Now get the dictionary

    # Project every station that has both a value and a location, once
    projector = pyproj.Proj('esri:102001')
    projected_xy = {}
    for station_name in Cvar_dict:
        if station_name in loc_dict:
            loc = loc_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            x_proj, y_proj = projector(float(longitude), float(latitude))
            projected_xy[station_name] = (float(x_proj), float(y_proj))
    station_order = sorted(projected_xy)

    # total_bounds gives plain scalars (minx, miny, maxx, maxy) for the whole layer
    minx, miny, maxx, maxy = gpd.read_file(shapefile).total_bounds
    pixel_size = 10000
    num_col = int((maxx - minx) / pixel_size)
    num_row = int((maxy - miny) / pixel_size)

    absolute_error_dictionary = {}
    groups_complete = []  # keep a record of the groups already held out

    for group in cluster.values():
        if group in groups_complete:
            continue
        groups_complete.append(group)

        # This is the step where we hold back the fold
        held_out = [k for k, v in cluster.items() if v == group]
        held_out_lookup = set(held_out)
        training = [name for name in station_order
                    if name not in held_out_lookup]

        x_train = np.array([projected_xy[name][0] for name in training])
        y_train = np.array([projected_xy[name][1] for name in training])

        # The grid spans the study area bounds and any training station outside them
        x_extent = np.append(x_train, [maxx, minx])
        y_extent = np.append(y_train, [maxy, miny])

        Yi = np.linspace(np.min(y_extent), np.max(y_extent), num_row)
        Xi = np.linspace(np.min(x_extent), np.max(x_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)

        # Empty grid with the training values inserted at the cell of each station;
        # when two stations share a cell the later one (sorted order) is kept
        station_grid = np.full((num_row, num_col), np.nan)
        for name in training:
            x_proj, y_proj = projected_xy[name]
            col = int((x_proj - minx) / pixel_size)
            row = int((y_proj - miny) / pixel_size)
            station_grid[row][col] = Cvar_dict[name]

        has_value = ~np.isnan(station_grid)

        func = interpolate.Rbf(Xi[has_value],
                               Yi[has_value],
                               station_grid[has_value],
                               function='thin_plate',
                               smooth=phi)
        spline = func(Xi, Yi).reshape(num_row, num_col)

        # Calc the absolute error at the pixel loc of each withheld station
        for statLoc in held_out:
            x_proj, y_proj = projected_xy[statLoc]
            x_orig = int((x_proj - minx) / pixel_size)  # column
            y_orig = int((y_proj - miny) / pixel_size)  # row

            interpolated_val = spline[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / \
        len(absolute_error_dictionary.values())
    if return_error:
        return block_num, MAE, absolute_error_dictionary
    else:
        return block_num, MAE