Example #1
def _calculate_interpolated_polygon_population_from_correspondence_table(
    polygon, raster, corresp_table, force_crs_match=True, na_value=255
):

    """Function that returns the interpolated population of a given polygon
    according to a correspondence table previous built.

    Parameters
    ----------

    polygon         : polygon for the profile (it can be a one-row GeoDataFrame)

    raster          : the associated raster (from rasterio.open)

    corresp_table   : correspondence table that has the interpolated population for each pixel. This object is created with the function 'create_non_zero_population_by_pixels_locations'.

    force_crs_match : bool. Default is True.
                      Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                      It is recommended to leave this argument as True.

    na_value        : int. Default is 255.
                      The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.

    Notes
    -----
    When you clip a specific polygon, there are pixels that lie beyond the polygon extent, because the clipping is rectangular.
    Therefore, population from another spatial unit could wrongly be included in the sum.
    The solution is to build a pandas DataFrame and filter out the pixels equal to na_value (255 by default). This is done during the construction of the polygon summary for the resulting population of this function.
    """

    _check_presence_of_crs(polygon)

    if force_crs_match:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            polygon_projected = polygon.to_crs(crs=raster.crs.data)
    else:
        warnings.warn(
            "The polygon is not being reprojected. The clipping might be performed on a polygon whose CRS does not match the raster's."
        )
        polygon_projected = polygon

    coords = getFeatures(polygon_projected)
    out_img, out_transform = mask(dataset=raster, shapes=coords, crop=True)
    lons, lats = create_lon_lat(out_img, out_transform)
    data = {
        "lons": np.ndarray.flatten(lons).round().astype(int).tolist(),
        "lats": np.ndarray.flatten(lats).round().astype(int).tolist(),
        "pixel_value": np.ndarray.flatten(out_img),
    }
    polygon_summary_full = pd.DataFrame.from_dict(data)

    # Remove pixels of the polygon that do not belong to the spatial unit, but might be from another one
    polygon_summary = polygon_summary_full[polygon_summary_full.pixel_value != na_value]

    merged_polygon = corresp_table.merge(polygon_summary, on=["lons", "lats"])

    pop = merged_polygon["pop_value"].sum()

    return pop
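
A minimal usage sketch of the helper above (hedged: every name here is illustrative; tracts is assumed to be a GeoDataFrame with its CRS set, 'nlcd.tif' a local raster path, and corresp_table the output of the correspondence-table builder shown in Example #9):

import rasterio

with rasterio.open("nlcd.tif") as raster:  # assumed raster path
    polygon = tracts.iloc[[0]]  # double brackets keep a one-row GeoDataFrame
    pop = _calculate_interpolated_polygon_population_from_correspondence_table(
        polygon, raster, corresp_table
    )
    print("Interpolated population:", round(pop, 1))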
def _calculate_interpolated_population_from_correspondence_table(
        geodataframe,
        raster,
        corresp_table,
        variable_name=None,
        force_crs_match=True):
    """Function that returns the interpolated population of an entire geopandas
    according to a correspondence table previous built.

    Parameters
    ----------

    geodataframe    : a GeoDataFrame from geopandas

    raster          : the path to the associated raster (opened internally with rasterio.open)

    corresp_table   : correspondence table that has the interpolated population for each pixel. This object is created with the function 'create_non_zero_population_by_pixels_locations'.

    force_crs_match : bool. Default is True.
                      Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                      It is recommended to leave this argument as True.

    Notes
    -----
    This function returns the geometry of the input GeoDataFrame with the addition of a new column, named by 'variable_name', holding the resulting population.
    """

    _check_presence_of_crs(geodataframe)

    final_geodataframe = geodataframe.copy()[[geodataframe.geometry.name]]
    pop_final = np.empty(len(geodataframe))
    with rasterio.open(raster) as raster:

        pbar = tqdm(total=len(geodataframe),
                    desc="Estimating target polygon values")

        for line_index in range(len(geodataframe)):
            polygon = geodataframe.iloc[[line_index]]
            pop_aux = _calculate_interpolated_polygon_population_from_correspondence_table(
                polygon, raster, corresp_table, force_crs_match)
            pop_final[line_index] = pop_aux

            pbar.update(1)

        pbar.close()
        final_geodataframe[variable_name] = pop_final

    return final_geodataframe
def subset_gdf_polygons_from_raster(geodataframe,
                                    raster,
                                    force_crs_match=True):
    """Function that returns only the polygons that actually have some
    intersection with a given raster.

    Parameters
    ----------

    geodataframe    : a GeoDataFrame from geopandas

    raster          : the associated raster (from rasterio.open)

    force_crs_match : bool. Default is True.
                      Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                      It is recommended to leave this argument as True.
    """

    _check_presence_of_crs(geodataframe)
    if force_crs_match:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            reprojected_gdf = geodataframe.to_crs(crs=raster.crs.data)
    else:
        warnings.warn(
            "The geodataframe is not being reprojected. The clipping might be performed on polygons whose CRS does not match the raster's."
        )
        reprojected_gdf = geodataframe

    # has_intersection is a boolean vector: True if the polygon has some overlay with raster, False otherwise
    has_intersection = []

    pbar = tqdm(total=len(reprojected_gdf), desc="Subsetting polygons")
    for i in range(len(reprojected_gdf)):
        pbar.update(1)
        coords = getFeatures(reprojected_gdf.iloc[[i]])
        try:
            mask(dataset=raster, shapes=coords, crop=True)
            has_intersection.append(True)
        except ValueError:
            # rasterio's mask raises ValueError when shapes do not overlap the raster
            has_intersection.append(False)
    pbar.close()

    overlayed_subset_gdf = reprojected_gdf.iloc[has_intersection]
    overlayed_subset_gdf = overlayed_subset_gdf.set_geometry(
        overlayed_subset_gdf.geometry.name)

    return overlayed_subset_gdf
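
A rough sketch of where this subsetting step fits, before any interpolation (illustrative names again; tracts is a GeoDataFrame with a CRS and 'nlcd.tif' an assumed local raster):

import rasterio

with rasterio.open("nlcd.tif") as raster:
    # Keep only the polygons that intersect the raster footprint, so that
    # later mask() calls do not fail on shapes falling outside of it.
    tracts_on_raster = subset_gdf_polygons_from_raster(tracts, raster)
print(len(tracts_on_raster), "of", len(tracts), "polygons overlap the raster")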
def return_area_profile(polygon, raster, force_crs_match=True):
    """DEPRECATED
    
    Function that counts the number of pixels of each type inside a polygon for a given raster
    
    Parameters
    ----------
    
    polygon         : polygon for the profile (it can be a one-row GeoDataFrame)
    
    raster          : the associated raster (from rasterio.open)
    
    force_crs_match : bool. Default is True.
                      Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                      It is recommended to leave this argument as True.

    """

    _check_presence_of_crs(polygon)

    if force_crs_match:
        polygon_projected = polygon.to_crs(crs=raster.crs.data)
    else:
        warnings.warn(
            "The polygon is not being reprojected. The clipping might be performed on a polygon whose CRS does not match the raster's."
        )
        polygon_projected = polygon

    coords = getFeatures(polygon_projected)
    out_img = mask(dataset=raster, shapes=coords, crop=True)[0]

    x = np.ndarray.flatten(out_img)
    y = np.bincount(x)
    ii = np.nonzero(y)[0]

    profile = pd.DataFrame.from_dict(
        dict(
            zip(
                np.core.defchararray.add("Type_", ii.astype(str)),
                y[ii].reshape(len(y[ii]), 1),
            )))  # pandas like

    polygon_with_profile = pd.concat([polygon.reset_index(drop=True), profile],
                                     axis=1)  # Appends in the end

    return polygon_with_profile
def append_profile_in_gdf(geodataframe, raster, force_crs_match=True):
    """DEPRECATED
    
    Function that appends the profile columns to a GeoDataFrame according to a given raster
    
    geodataframe    : a GeoDataFrame from geopandas that overlaps the raster. The variables of the profile will be appended to this data.
                      If some polygons do not overlap the raster, consider a preprocessing step using the function subset_gdf_polygons_from_raster.
    
    raster          : the associated NLCD raster (from rasterio.open).
    
    force_crs_match : bool. Default is True.
                      Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                      It is recommended to leave this argument as True.
    
    Notes
    -----
    The generated geodataframe will fill in the value 0 for each Type that is not present in the raster for each polygon.
    
    """

    _check_presence_of_crs(geodataframe)

    final_geodata = gpd.GeoDataFrame()

    for i in range(len(geodataframe)):

        aux = return_area_profile(geodataframe.iloc[[i]],
                                  raster=raster,
                                  force_crs_match=force_crs_match)
        final_geodata = pd.concat(
            [final_geodata.reset_index(drop=True), aux], axis=0, sort=False
        )  # sort = False means that the profile will be appended in the end of the result
        final_geodata = final_geodata.reset_index(drop=True)
        print(
            "Polygon profile {} appended out of {}".format(
                i + 1, len(geodataframe)),
            end="\r",
        )

    # Input 0 in Types which are not present in the raster for the polygons
    filter_col = [col for col in final_geodata if col.startswith("Type_")]
    final_geodata[filter_col] = final_geodata[filter_col].fillna(value=0)

    return final_geodata
def subset_gdf_polygons_from_raster(geodataframe,
                                    raster,
                                    force_crs_match=True):
    """Function that returns only the polygons that actually have some intersection with a given raster
    
    Parameters
    ----------
    
    geodataframe    : a GeoDataFrame from geopandas
    
    raster          : the associated raster (from rasterio.open)
    
    force_crs_match : bool. Default is True.
                      Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                      It is recommended to leave this argument as True.

    """

    _check_presence_of_crs(geodataframe)

    if force_crs_match:
        reprojected_gdf = geodataframe.to_crs(crs=raster.crs.data)
    else:
        warnings.warn(
            "The geodataframe is not being reprojected. The clipping might be performed on polygons whose CRS does not match the raster's."
        )
        reprojected_gdf = geodataframe

    # has_intersection is a boolean vector: True if the polygon has some overlay with raster, False otherwise
    has_intersection = []
    for i in range(len(reprojected_gdf)):
        print('Polygon {} checked out of {}'.format(i + 1, len(reprojected_gdf)),
              end="\r")
        coords = getFeatures(reprojected_gdf.iloc[[i]])
        try:
            mask(dataset=raster, shapes=coords, crop=True)
            has_intersection.append(True)
        except ValueError:
            # rasterio's mask raises ValueError when shapes do not overlap the raster
            has_intersection.append(False)

    overlayed_subset_gdf = reprojected_gdf.iloc[has_intersection]
    overlayed_subset_gdf = overlayed_subset_gdf.set_geometry('geometry')

    return overlayed_subset_gdf
def calculate_interpolated_population_from_correspondence_table(
        geodataframe, raster, corresp_table, force_crs_match=True):
    """Function that returns the interpolated population of an entire geopandas according to a correspondence table previous built
    
    Parameters
    ----------
    
    geodataframe    : a GeoDataFrame from geopandas
    
    raster          : the associated raster (from rasterio.open)
    
    corresp_table   : correspondence table that has the interpolated population for each pixel. This object is created with the function 'create_non_zero_population_by_pixels_locations'.
    
    force_crs_match : bool. Default is True.
                      Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file. 
                      It is recommended to leave this argument as True.
    
    Notes
    -----
    This function returns the same GeoDataFrame used as input with the addition of a new variable called 'interpolated_population', which is the resulting population.
    
    """

    _check_presence_of_crs(geodataframe)

    final_geodataframe = geodataframe.copy()
    pop_final = np.empty(len(geodataframe))

    for line_index in range(len(geodataframe)):
        polygon = geodataframe.iloc[[line_index]]
        pop_aux = calculate_interpolated_polygon_population_from_correspondence_table(
            polygon, raster, corresp_table, force_crs_match)
        pop_final[line_index] = pop_aux

        print(
            "Polygon {} processed out of {}".format(line_index + 1,
                                                    len(geodataframe)),
            end="\r",
        )

    final_geodataframe["interpolated_population"] = pop_final

    return final_geodataframe
def _fast_append_profile_in_gdf(geodataframe,
                                raster_path,
                                force_crs_match=True):
    """Function that appends the columns of the profile in a geopandas
    according to a given raster taking advantage of rasterstats.

    geodataframe    : geopandas.GeoDataFrame
        geodataframe that overlaps the raster. The variables of the profile will be appended to this data.
        If some polygons do not overlap the raster, consider a preprocessing step using the function subset_gdf_polygons_from_raster.
    raster_path     : str
        the path to the associated raster image.
    force_crs_match : bool, Default is True.
        Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
        It is recommended to leave this argument as True.

    Notes
    -----
    The generated geodataframe will fill in the value 0 for each Type that is not present in the raster for each polygon.
    """

    _check_presence_of_crs(geodataframe)
    if force_crs_match:
        with rasterio.open(raster_path) as raster:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                geodataframe = geodataframe.to_crs(crs=raster.crs.data)
    else:
        warnings.warn(
            "The GeoDataFrame is not being reprojected. The clipping might be performed on polygons whose CRS does not match the raster's."
        )

    zonal_gjson = rs.zonal_stats(geodataframe,
                                 raster_path,
                                 prefix="Type_",
                                 geojson_out=True,
                                 categorical=True)

    zonal_ppt_gdf = GeoDataFrame.from_features(zonal_gjson)

    return zonal_ppt_gdf
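
A short usage sketch (illustrative names; tracts must carry a CRS and a 'population' column, and 'nlcd.tif' is an assumed local raster path):

profiled = _fast_append_profile_in_gdf(tracts[["geometry", "population"]], "nlcd.tif")
# One new column per land-cover class present in the raster, e.g. Type_21, Type_22, ...
print([c for c in profiled.columns if c.startswith("Type_")])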
Example #9
def _create_non_zero_population_by_pixels_locations(
    geodataframe, raster, pop_string, weights=None, force_crs_match=True
):

    """Function that returns the actual population of each pixel from a given
    geodataframe and variable.

    geodataframe       : a GeoDataFrame whose geometries are contained in the raster

    raster             : the path to the raster (opened internally with rasterio.open)

    pop_string         : the name of the column of the geodataframe on which the estimation will be made

    weights            : vector of weights for each pixel value, according to the 'return_weights_from_regression' function. This must be provided by the user.

    force_crs_match    : bool. Default is True.
                         Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                         It is recommended to leave this argument as True.
    """

    _check_presence_of_crs(geodataframe)

    with rasterio.open(raster) as raster:
        if force_crs_match:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                geodataframe_projected = geodataframe.to_crs(crs=raster.crs.data)
        else:
            warnings.warn(
                "The geodataframe is not being reprojected. The clipping might be performed on polygons whose CRS does not match the raster's."
            )
            geodataframe_projected = geodataframe

        result_pops_array = np.array([])
        result_lons_array = np.array([])
        result_lats_array = np.array([])

        pbar = tqdm(
            total=len(geodataframe_projected),
            desc="Estimating population per pixel",
        )

        for line_index in range(len(geodataframe_projected)):
            polygon_projected = geodataframe_projected.iloc[[line_index]]

            coords = getFeatures(polygon_projected)

            out_img, out_transform = mask(dataset=raster, shapes=coords, crop=True)

            # Calculate the population for each pixel
            trans_numpy = weights[out_img]  # Pixel population from regression
            orig_estimate = polygon_projected[
                pop_string
            ]  # Original population value of the polygon
            correction_term = orig_estimate / trans_numpy.sum()
            final_pop_numpy_pre = trans_numpy * np.array(correction_term)

            flatten_final_pop_numpy_pre = np.ndarray.flatten(final_pop_numpy_pre)

            non_zero_pop_index = np.where(flatten_final_pop_numpy_pre != 0)

            final_pop_numpy = flatten_final_pop_numpy_pre[non_zero_pop_index]

            # Retrieve the location of each pixel
            lons, lats = create_lon_lat(out_img, out_transform)

            final_lons = np.ndarray.flatten(lons)[non_zero_pop_index]
            final_lats = np.ndarray.flatten(lats)[non_zero_pop_index]

            # Append all flattened numpy arrays
            result_pops_array = np.append(result_pops_array, final_pop_numpy)
            result_lons_array = np.append(result_lons_array, final_lons)
            result_lats_array = np.append(result_lats_array, final_lats)

            pbar.update(1)

        pbar.close()

        data = {
            "pop_value": result_pops_array,
            "lons": result_lons_array.round().astype(int).tolist(),
            "lats": result_lats_array.round().astype(int).tolist(),
        }

        corresp = pd.DataFrame.from_dict(data)

    return corresp
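
The correction term above simply rescales the regression-based pixel estimates so that they sum back to the polygon's known total. A tiny numeric sketch of that step, with made-up numbers and no raster involved:

import numpy as np

pixel_weights = np.array([0.5, 1.0, 2.5])  # plays the role of weights[out_img] for three pixels
known_total = 120.0                        # the polygon's census population
corrected = pixel_weights * (known_total / pixel_weights.sum())
assert np.isclose(corrected.sum(), known_total)  # the polygon total is preserved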
Example #10
def _return_weights_from_xgboost(
    geodataframe,
    raster_path,
    pop_string,
    codes=[21, 22, 23, 24],
    n_pixels_option_values=256,
    tuned_xgb=False,
    gbm_hyperparam_grid={
        "learning_rate": [0.001, 0.01, 0.1],
        "n_estimators": [200],
        "subsample": [0.3, 0.5],
        "max_depth": [4, 5, 6],
        "num_boosting_rounds": [10, 20],
    },
    force_crs_match=True,
    na_value=255,
    ReLU=True,
):

    """Function that returns the weights of each land type according to NLCD
    types/codes given by Extreme Gradient Boost model (XGBoost)

    Parameters
    ----------

    geodataframe           : a geopandas geoDataFrame used to build regression

    raster_path            : the path to the associated raster image.

    pop_string             : the name of the variable in geodataframe on which the regression shall be conducted

    codes                  : an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
                             The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
                             The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).

    n_pixels_option_values : number of possible pixel values in the raster. Default is 256.

    tuned_xgb              : bool. Default is False.
                             If True, the XGBoost model will be tuned by a grid search over the gbm_hyperparam_grid dictionary, picking the best model in terms of mean squared error with a pre-defined number of cross-validation folds.
                             Otherwise, the XGBoost model is fitted with the default values of the xgboost.train function from the xgboost Python library.

    gbm_hyperparam_grid    : a dictionary that represents the grid for the grid search of XGBoost.

    force_crs_match        : bool. Default is True.
                             Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                             It is recommended to leave this argument as True.

    na_value               : int. Default is 255.
                             The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.

    ReLU                   : bool. Default is True.
                             Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.

    Notes
    -----
    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
    3) The returned weights represent the average of the Shapley values of each feature.
    """
    try:
        import xgboost as xgb
        import shap
    except ImportError as e:
        raise ImportError("xgboost and shap are required to perform this.") from e

    _check_presence_of_crs(geodataframe)

    if na_value in codes:
        raise ValueError("codes should not assume the na_value value.")

    profiled_df = _fast_append_profile_in_gdf(
        geodataframe[["geometry", pop_string]], raster_path, force_crs_match
    )  # Use only two columns to build the weights (this avoids errors if the original dataset already has types appended to it)

    # If the list is unsorted, the codes will be sorted to guarantee that the position of the weights will match
    codes.sort()

    str_codes = [str(i) for i in codes]
    feature_names = ["Type_" + s for s in str_codes]

    y = profiled_df[pop_string]
    X = profiled_df[feature_names]

    if not tuned_xgb:

        # Create the DMatrix
        xgb_dmatrix = xgb.DMatrix(X, y)

        # Create the parameter dictionary ("reg:linear" was renamed to
        # "reg:squarederror" in later xgboost releases)
        params = {"objective": "reg:linear"}

        # Train the model
        xg_reg = xgb.train(params=params, dtrain=xgb_dmatrix)

    else:

        try:
            from sklearn.model_selection import GridSearchCV
        except ImportError as e:
            raise ImportError("sklearn is required to perform this.") from e

        gbm = xgb.XGBRegressor()
        grid_mse = GridSearchCV(
            estimator=gbm,
            param_grid=gbm_hyperparam_grid,
            scoring="neg_mean_squared_error",
            cv=4,  # 4-fold crossvalidation
            verbose=3,  # Prints the grid search profile
            n_jobs=-1,
        )  # Run the grid search in parallel on all available cores

        # Fit the grid to the data
        grid_mse.fit(X, y)

        best_params = grid_mse.best_params_
        best_params["objective"] = "reg:linear"

        # Create the DMatrix
        xgb_dmatrix = xgb.DMatrix(X, y)

        # Train the model from the best parameters of the grid search
        xg_reg = xgb.train(params=best_params, dtrain=xgb_dmatrix)

    # Build explainer and fit Shapley's values (https://github.com/slundberg/shap)
    explainer = shap.TreeExplainer(xg_reg, feature_dependence="independent")
    shap_values = explainer.shap_values(X)
    weights_from_xgb = shap_values.mean(axis=0)  # This is already sorted by pixel Type

    weights = np.zeros(n_pixels_option_values)
    weights[codes] = weights_from_xgb  # positions follow the sorted codes

    if ReLU:
        weights = np.where(weights < 0, 0, weights)

    return weights
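
A usage sketch, assuming xgboost and shap are installed and reusing the illustrative tracts GeoDataFrame with a 'population' column:

weights = _return_weights_from_xgboost(tracts, "nlcd.tif", "population")
# weights is a length-256 vector; only the NLCD 'developed' classes are non-zero
print({code: weights[code] for code in [21, 22, 23, 24]})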
Example #11
def harmonize(
    raw_community,
    target_year=None,
    weights_method="area",
    extensive_variables=None,
    intensive_variables=None,
    allocate_total=True,
    raster="nlcd_2011",
    codes=[21, 22, 23, 24],
    force_crs_match=True,
    index="geoid",
    time_col="year",
):
    r"""
    Use spatial interpolation to standardize neighborhood boundaries over time.

    Parameters
    ----------
    raw_community : geopandas.GeoDataFrame
        A single long-form GeoDataFrame containing all time periods, distinguished by the time_col column (see (1) in Notes).

    target_year : string
        The target year that represents the boundaries of all datasets generated
        in the harmonization. Could be, for example, '2010'.

    weights_method : string
        The method by which the harmonization will be conducted. This can be set to:
            "area"                          : harmonization according to area weights.
            "land_type_area"                : harmonization according to the Land Types considered 'populated' areas.
            "land_type_Poisson_regression"  : NOT YET INTRODUCED.
            "land_type_Gaussian_regression" : NOT YET INTRODUCED.

    extensive_variables : list
        The names of variables in each dataset of raw_community that contains
        extensive variables to be harmonized (see (2) in Notes).

    intensive_variables : list
        The names of variables in each dataset of raw_community that contains
        intensive variables to be harmonized (see (2) in Notes).

    allocate_total : boolean
        True if total value of source area should be allocated.
        False if denominator is area of i. Note that the two cases
        would be identical when the area of the source polygon is
        exhausted by intersections. See (3) in Notes for more details.

    raster : str
        the path to the associated raster image that has the types of
        each pixel in the spatial context.
        Only taken into consideration for raster-based harmonization.

    codes : an integer list of code values that should be considered as
        'populated'. Since this draws inspiration from the National Land Cover
        Database (NLCD), the default is 21 (Developed, Open Space),
        22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and
        24 (Developed, High Intensity). The description of each code can be
        found here:
        https://www.mrlc.gov/sites/default/files/metadata/landcover.html
        Only taken into consideration for raster-based harmonization.

    force_crs_match : bool. Default is True.
        Whether the Coordinate Reference System (CRS) of the polygon will be
        reprojected to the CRS of the raster file. It is recommended to
        leave this argument True.
        Only taken into consideration for raster-based harmonization.


    Notes
    -----
    1) raw_community is assumed to have a 'year' column, and all of its
       geometries must share the same Coordinate Reference System (CRS).

    2) A quick explanation of extensive and intensive variables can be found
    here: http://ibis.geog.ubc.ca/courses/geob370/notes/intensive_extensive.htm

    3) For an extensive variable, the estimate at target polygon j (default case) is:

        v_j = \sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \sum_k a_{i,k}

        If the area of the source polygon is not exhausted by intersections with
        target polygons and there is reason to not allocate the complete value of
        an extensive attribute, then setting allocate_total=False will use the
        following weights:

        v_j = \sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

        where a_i is the total area of source polygon i.

        For an intensive variable, the estimate at target polygon j is:

        v_j = \sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \sum_k a_{k,j}

    """
    if extensive_variables is None and intensive_variables is None:
        raise ValueError(
            "You must pass a set of extensive and/or intensive variables to interpolate"
        )

    _check_presence_of_crs(raw_community)
    dfs = raw_community.copy()
    times = dfs[time_col].unique()

    target_df = dfs[dfs[time_col] == target_year].reset_index()

    interpolated_dfs = {}
    interpolated_dfs[target_year] = target_df.copy()

    for i in times:
        source_df = dfs[dfs[time_col] == i]

        if weights_method == "area":

            # In area_interpolate, the resulting variable has the same length as target_df
            interpolation = area_interpolate_binning(
                source_df,
                target_df.copy(),
                extensive_variables=extensive_variables,
                intensive_variables=intensive_variables,
                allocate_total=allocate_total,
            )

        elif weights_method == "land_type_area":
            try:

                area_tables_raster_fitted = area_tables_raster(
                    source_df,
                    target_df.copy(),
                    raster_path=raster,
                    codes=codes,
                    force_crs_match=force_crs_match,
                )

                # In area_interpolate, the resulting variable has the same length as target_df
                interpolation = area_interpolate(
                    source_df,
                    target_df.copy(),
                    extensive_variables=extensive_variables,
                    intensive_variables=intensive_variables,
                    allocate_total=allocate_total,
                    tables=area_tables_raster_fitted,
                )
            except IOError:
                raise IOError(
                    "You must have NLCD raster data installed locally to use the "
                    "`land_type_area` method. You can install it using the "
                    "`tobler.data.store_rasters()` function from the `tobler` package."
                )
        else:
            raise ValueError('weights_method must be one of ["area", "land_type_area"]')

        profiles = []
        if extensive_variables:
            profile = pd.DataFrame(interpolation[0], columns=extensive_variables)
            profiles.append(profile)

        if intensive_variables:
            profile = pd.DataFrame(interpolation[1], columns=intensive_variables)
            profiles.append(profile)

        profile = pd.concat(profiles, axis=1)  # column-wise: one row per target polygon
        profile["geometry"] = target_df["geometry"]
        profile[index] = target_df[index]
        profile[time_col] = i

        interpolated_dfs[i] = profile

    harmonized_df = gpd.GeoDataFrame(
        pd.concat(list(interpolated_dfs.values()), sort=True)
    )

    return harmonized_df
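
A minimal call sketch (hedged: community_gdf is an assumed long-form GeoDataFrame with 'year' and 'geoid' columns, and the target year must match the dtype of the 'year' column):

harmonized = harmonize(
    community_gdf,
    target_year=2010,
    weights_method="area",
    extensive_variables=["population"],
)
# every time period is now reported on the 2010 boundaries
print(harmonized.groupby("year").size())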
Example #12
def harmonize(raw_community,
              target_year_of_reference,
              weights_method='area',
              extensive_variables=[],
              intensive_variables=[],
              allocate_total=True,
              raster_path=None,
              codes=[21, 22, 23, 24],
              force_crs_match=True):
    """
    Harmonize multiple GeoData sources with different approaches

    Parameters
    ----------

    raw_community : list
        Multiple GeoDataFrames given by a list (see (1) in Notes).
    
    target_year_of_reference : string
        The target year that represents the boundaries of all datasets generated in the harmonization. Could be, for example, '2010'.
        
    weights_method : string
        The method by which the harmonization will be conducted. This can be set to:
            "area"                          : harmonization according to area weights.
            "land_type_area"                : harmonization according to the Land Types considered 'populated' areas.
            "land_type_Poisson_regression"  : NOT YET INTRODUCED.
            "land_type_Gaussian_regression" : NOT YET INTRODUCED.

    extensive_variables : list
        The names of variables in each dataset of raw_community that contains extensive variables to be harmonized (see (2) in Notes).
        
    intensive_variables : list
        The names of variables in each dataset of raw_community that contains intensive variables to be harmonized (see (2) in Notes).
    
    allocate_total : boolean
        True if total value of source area should be allocated.
        False if denominator is area of i. Note that the two cases
        would be identical when the area of the source polygon is
        exhausted by intersections. See (3) in Notes for more details.
        
    raster_path : the path to the associated raster image that has the types of each pixel in the spatial context.
        Only taken into consideration for raster-based harmonization.
        
    codes : an integer list of code values that should be considered as 'populated'.
        Since this draws inspiration from the National Land Cover Database (NLCD), the default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
        The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
        Only taken into consideration for raster-based harmonization.
        
    force_crs_match : bool. Default is True.
        Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
        It is recommended to leave this argument as True.
        Only taken into consideration for raster-based harmonization.

    
    Notes
    -----
    
    1) Each GeoDataFrame of raw_community is assumed to have a 'year' column. Also, all GeoDataFrames must have the same Coordinate Reference System (CRS).
    
    2) A quick explanation of extensive and intensive variables can be found here: http://ibis.geog.ubc.ca/courses/geob370/notes/intensive_extensive.htm.
    
    3) For an extensive variable, the estimate at target polygon j (default case) is:

        v_j = \sum_i v_i w_{i,j}
    
        w_{i,j} = a_{i,j} / \sum_k a_{i,k}
    
        If the area of the source polygon is not exhausted by intersections with
        target polygons and there is reason to not allocate the complete value of
        an extensive attribute, then setting allocate_total=False will use the
        following weights:
    
        v_j = \sum_i v_i w_{i,j}
    
        w_{i,j} = a_{i,j} / a_i
    
        where a_i is the total area of source polygon i.
    
        For an intensive variable, the estimate at target polygon j is:
    
        v_j = \sum_i v_i w_{i,j}
    
        w_{i,j} = a_{i,j} / \sum_k a_{k,j}
    
    """

    for i in raw_community:
        _check_presence_of_crs(i)

    if not all(i.crs == raw_community[0].crs for i in raw_community):
        raise ValueError(
            'There is, at least, one pairwise difference in the Coordinate Reference System (CRS) of the GeoDataFrames of raw_community. All of them must be the same.'
        )

    years_set = [i['year'].unique()[0] for i in raw_community]
    reference_idx_year = years_set.index(target_year_of_reference)

    source_years = years_set.copy()
    del source_years[reference_idx_year]

    source_idx_year = list(
        np.where(np.isin(years_set, source_years))[0])

    reference_df = raw_community[reference_idx_year]

    interpolated_dfs = {}

    for i in source_idx_year:
        print('Starting to Harmonize the year of {}...'.format(years_set[i]))
        source_df = raw_community[i]

        if (weights_method == 'area'):

            # In area_interpolate, the resulting variable has the same length as target_df
            interpolation = area_interpolate_binning(
                source_df,
                reference_df,
                extensive_variables=extensive_variables,
                intensive_variables=intensive_variables,
                allocate_total=allocate_total)

        if (weights_method == 'land_type_area'):

            area_tables_raster_fitted = area_tables_raster(
                source_df,
                reference_df,
                raster_path,
                codes=codes,
                force_crs_match=force_crs_match)

            # In area_interpolate, the resulting variable has the same length as target_df
            interpolation = area_interpolate(
                source_df,
                reference_df,
                extensive_variables=extensive_variables,
                intensive_variables=intensive_variables,
                allocate_total=allocate_total,
                tables=area_tables_raster_fitted)

        for j in list(range(interpolation[0].shape[1])):
            print('Harmonizing extensive variable {} of the year {}.'.format(
                extensive_variables[j], years_set[i]))
            profile = pd.DataFrame.from_dict({
                'interpolated_' + extensive_variables[j]:
                interpolation[0][:, j]
            })
            reference_df = pd.concat(
                [reference_df.reset_index(drop=True), profile], axis=1)

        for k in list(range(interpolation[1].shape[1])):
            print('Harmonizing intensive variable {} of the year {}.'.format(
                intensive_variables[k], years_set[i]))
            profile = pd.DataFrame.from_dict({
                'interpolated_' + intensive_variables[k]:
                interpolation[1][:, k]
            })
            reference_df = pd.concat(
                [reference_df.reset_index(drop=True), profile], axis=1)

        # Reset the year column to the year being harmonized
        reference_df['year'] = years_set[i]

        interpolated_dfs.update({years_set[i]: reference_df})

        # Resets the reference_df to refresh the loop (this has to be present)
        del reference_df
        reference_df = raw_community[reference_idx_year]

    harmonized_df = gpd.GeoDataFrame()
    for value in interpolated_dfs.values():
        harmonized_df = pd.concat(
            [harmonized_df.reset_index(drop=True), value], axis=0)

    return harmonized_df
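
A call sketch for this list-based variant (illustrative names; each element of the list is assumed to be a GeoDataFrame for a single year, all sharing one CRS and each carrying a constant 'year' column):

harmonized = harmonize(
    [gdf_2000, gdf_2010],
    target_year_of_reference='2010',
    weights_method='area',
    extensive_variables=['population'],
)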
Example #13
def scanlines_count_pixels(source_gdf, raster_path, verbose=True):
    """Function that generates the count of all pixel types in a raster of a given set of polygons using scanlines
    
    Parameters
    ----------
    
    source_gdf      : geopandas GeoDataFrame with geometry column of polygon type for the source set of polygons desired.
    
    raster_path     : the path to the associated raster image.
    
    verbose         : bool. Default is True.
                      Whether the function will print progress steps.

    """

    t0_aux = time.time()

    _check_presence_of_crs(source_gdf)

    if verbose: print('Opening raster metadata...')
    raster = rasterio.open(raster_path)

    if verbose:
        print('Matching both crs\'s (reprojecting source_gdf to raster)...')
    source_gdf = source_gdf.to_crs(crs=raster.crs.data)

    # Check if the operating system is Windows
    sep_cmd = ";" if os.name == 'nt' else ":"

    if ('geometry' not in source_gdf.columns):
        source_gdf['geometry'] = source_gdf[source_gdf._geometry_column_name]
        source_gdf = source_gdf.drop([source_gdf._geometry_column_name],
                                     axis=1)
        source_gdf = source_gdf.set_geometry('geometry')

    # Create Temporary Directory
    with tempfile.TemporaryDirectory() as source_gdf_temp_dir:

        # parquet like internal file
        if verbose:
            print('Starting to create well-known text (wkt) of geometries...')
        source_gdf['geometry_wkt'] = source_gdf['geometry'].apply(
            lambda x: x.wkt
        )  # Create well-known text (raw text) for the geometry column
        source_gdf = source_gdf.drop(['geometry'], axis=1)
        source_gdf_temp_file_name = os.path.join(source_gdf_temp_dir,
                                                 'source_gdf_temp.parquet')

        # Just extract the useful column for optimization
        source_gdf = source_gdf[['geometry_wkt']]

        if verbose:
            print(
                'Starting to convert the GeoDataFrame to a temporary file...')
        source_gdf.to_parquet(source_gdf_temp_file_name)

        cmd_pre = os.path.join('java -client -cp dependency', '*')

        cmd = cmd_pre + "{}ucrspatial-6.0-SNAPSHOT.jar histogram {} {}".format(
            sep_cmd, raster_path, source_gdf_temp_file_name)

        t1_aux = time.time()

        if verbose:
            print(
                'Time of preparation before scanline (in seconds): {}'.format(
                    t1_aux - t0_aux))

        if verbose: print('Starting to perform the scanline...')
        t0_aux = time.time()
        run(cmd, shell=True, check=True
            )  # Will generate a parquet file for output: histogram.parquet
        t1_aux = time.time()
        if verbose: print('Scanline: Done.')
        if verbose:
            print('Time of scanline itself (in seconds): {}'.format(t1_aux -
                                                                    t0_aux))

    profile_df = pd.read_parquet("histogram.parquet")

    os.remove("histogram.parquet")

    return profile_df
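
A call sketch (hedged: the Java scanline dependency and the jar location are assumptions baked into the command built above, and tracts / 'nlcd.tif' are illustrative):

counts = scanlines_count_pixels(tracts, "nlcd.tif")
print(counts.head())  # one row of pixel-type counts per source polygon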
Example #14
def scanlines_interpolate(target_gdf,
                          source_CTs,
                          weights_long,
                          raster_path,
                          verbose=True):
    """Function that generates the interpolated values using scanlines with a given set of weights and Correction Terms using scanlines
    
    Parameters
    ----------
    
    target_gdf      : geopandas GeoDataFrame with geometry column of polygon type for the target set of polygons desired.
    
    source_CTs      : geopandas GeoDataFrame with the Correction Terms for source polygons.
    
    weights_long    : a numpy array with the weights for all land types in the raster.
    
    raster_path     : the path to the associated raster image.
    
    verbose         : bool. Default is True.
                      Whether the function will print progress steps.

    """

    t0_aux = time.time()

    _check_presence_of_crs(target_gdf)
    _check_presence_of_crs(source_CTs)

    if not _check_crs(source_CTs, target_gdf):
        return None

    if verbose: print('Opening raster metadata...')
    raster = rasterio.open(raster_path)

    if verbose:
        print('Matching both crs\'s (reprojecting source_CTs to raster)...')
    source_CTs = source_CTs.to_crs(crs=raster.crs.data)
    if verbose: print('...reprojecting target_gdf to raster)...')
    target_gdf = target_gdf.to_crs(crs=raster.crs.data)

    # Check if the operating system is Windows
    sep_cmd = ";" if os.name == 'nt' else ":"

    if ('geometry' not in target_gdf.columns):
        target_gdf['geometry'] = target_gdf[target_gdf._geometry_column_name]
        target_gdf = target_gdf.drop([target_gdf._geometry_column_name],
                                     axis=1)
        target_gdf = target_gdf.set_geometry('geometry')

    if ('geometry' not in source_CTs.columns):
        source_CTs['geometry'] = source_CTs[source_CTs._geometry_column_name]
        source_CTs = source_CTs.drop([source_CTs._geometry_column_name],
                                     axis=1)
        source_CTs = source_CTs.set_geometry('geometry')

    # Create a temporary directory for ALL input files
    with tempfile.TemporaryDirectory() as temp_dir:

        # parquet like internal file
        if verbose:
            print('Starting to create well-known text (wkt) of geometries...')
        target_gdf['geometry_wkt'] = target_gdf['geometry'].apply(
            lambda x: x.wkt
        )  # Create well-known text (raw text) for the geometry column
        target_gdf = target_gdf.drop(['geometry'], axis=1)
        target_gdf_temp_file_name = os.path.join(temp_dir,
                                                 'target_gdf_temp.parquet')

        # Just extract the useful column for optimization
        target_gdf = target_gdf[['geometry_wkt']]

        if verbose:
            print(
                'Starting to convert the GeoDataFrame to a temporary file...')
        target_gdf.to_parquet(target_gdf_temp_file_name)

        # parquet like internal file
        if verbose:
            print(
                'Source CT: Starting to create well-known text (wkt) of geometries...'
            )
        source_CTs['geometry_wkt'] = source_CTs['geometry'].apply(
            lambda x: x.wkt
        )  # Create well-known text (raw text) for the geometry column
        source_CTs = source_CTs.drop(['geometry'], axis=1)
        source_CTs_temp_file_name = os.path.join(temp_dir,
                                                 'source_CTs_temp.parquet')

        # Just extract the useful column for optimization
        # For source we need also the Correction Terms!
        source_CTs = source_CTs[['geometry_wkt', 'CT']]

        if verbose:
            print(
                'Starting to convert the GeoDataFrame to a temporary file...')
        source_CTs.to_parquet(source_CTs_temp_file_name)

        weights_temp_file_name = os.path.join(temp_dir, 'input_weights.csv')

        np.savetxt(weights_temp_file_name,
                   weights_long,
                   delimiter=",",
                   header='weights',
                   comments='')

        cmd_pre = os.path.join('java -client -cp dependency', '*')

        cmd = cmd_pre + "{}ucrspatial-6.0-SNAPSHOT.jar interpolate {} {} {} {}".format(
            sep_cmd, raster_path, source_CTs_temp_file_name,
            target_gdf_temp_file_name, weights_temp_file_name)

        t1_aux = time.time()

        if verbose:
            print(
                'Time of preparation before scanline (in seconds): {}'.format(
                    t1_aux - t0_aux))

        if verbose: print('Starting to perform the scanline...')
        t0_aux = time.time()
        run(cmd, shell=True, check=True
            )  # Will generate a parquet file for output: interpolate.parquet
        t1_aux = time.time()
        if verbose: print('Scanline: Done.')
        if verbose:
            print('Time of scanline itself (in seconds): {}'.format(t1_aux -
                                                                    t0_aux))

    interpolated_df = pd.read_parquet("interpolate.parquet")

    os.remove("interpolate.parquet")

    return interpolated_df
Example #15
def _return_weights_from_regression(
    geodataframe,
    raster_path,
    pop_string,
    codes=[21, 22, 23, 24],
    likelihood="poisson",
    formula_string=None,
    n_pixels_option_values=256,
    force_crs_match=True,
    na_value=255,
    ReLU=True,
):

    """Function that returns the weights of each land type according to NLCD
    types/codes.

    Parameters
    ----------
    geodataframe :  geopandas.GeoDataFrame 
        used to build regression
    raster_path : str
        the path to the associated raster image.
    formula_string : str
        patsy-style model formula
    pop_string : str
        the name of the variable in geodataframe on which the regression shall be conducted
    codes : list
        an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
        The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
        The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
    likelihood : str, {'poisson', 'gaussian'}
        the likelihood assumed for the dependent variable (population). It can be 'poisson' or 'gaussian'.
        With 'poisson', a Generalized Linear Model with log link function will be fitted; with 'gaussian', an ordinary least squares model will be fitted.
    n_pixels_option_values : int
        number of possible pixel values in the raster. Default is 256.
    force_crs_match   : bool. Default is True.
        Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
        It is recommended to leave this argument as True.
    na_value : int. Default is 255.
        The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.
    ReLU : bool. Default is True.
         Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.

    Notes
    -----
    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
    """

    _check_presence_of_crs(geodataframe)

    if na_value in codes:
        raise ValueError("codes should not assume the na_value value.")

    if likelihood not in ["poisson", "gaussian"]:
        raise ValueError("likelihood must be one of 'poisson', 'gaussian'")

    profiled_df = _fast_append_profile_in_gdf(
        geodataframe[["geometry", pop_string]], raster_path, force_crs_match
    )  # Use only two columns to build the weights (this avoids errors if the original dataset already has types appended to it)

    # If the list is unsorted, the codes will be sorted to guarantee that the position of the weights will match
    codes.sort()

    if not formula_string:
        # Formula WITHOUT intercept
        str_codes = [str(i) for i in codes]
        formula_string = (
            pop_string + " ~ -1 + " + " + ".join(["Type_" + s for s in str_codes])
        )

    if likelihood == "poisson":
        results = smf.glm(formula_string, data=profiled_df, family=Poisson()).fit()

    if likelihood == "gaussian":
        results = smf.ols(formula_string, data=profiled_df).fit()

    weights = np.zeros(n_pixels_option_values)
    weights[codes] = results.params

    if ReLU:
        weights = np.where(weights < 0, 0, weights)

    return weights
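
The no-intercept formula assembled above looks like, for example, 'population ~ -1 + Type_21 + Type_22 + Type_23 + Type_24'. A usage sketch with the illustrative names used earlier:

weights = _return_weights_from_regression(tracts, "nlcd.tif", "population", likelihood="poisson")
# after the ReLU step, each populated NLCD class carries a non-negative per-pixel rate
print(weights[[21, 22, 23, 24]])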
Example #16
def harmonize(
    raw_community,
    target_year=None,
    weights_method="area",
    extensive_variables=None,
    intensive_variables=None,
    allocate_total=True,
    raster=None,
    codes=[21, 22, 23, 24],
    force_crs_match=True,
    index="geoid",
    time_col="year",
):
    r"""
    Use spatial interpolation to standardize neighborhood boundaries over time.

    Parameters
    ----------
    raw_community : geopandas.GeoDataFrame
        A single long-form GeoDataFrame containing all time periods, distinguished by the time_col column (see (1) in Notes).

    target_year : string
        The target year that represents the boundaries of all datasets generated
        in the harmonization. Could be, for example, '2010'.

    weights_method : string
        The method by which the harmonization will be conducted. This can be set to:
            * "area"                      : harmonization using simple area-weighted interpolation.
            * "dasymetric"                : harmonization using area-weighted interpolation with raster-based
                                            ancillary data to mask out uninhabited land.

    extensive_variables : list
        The names of variables in each dataset of raw_community that contains
        extensive variables to be harmonized (see (2) in Notes).

    intensive_variables : list
        The names of variables in each dataset of raw_community that contains
        intensive variables to be harmonized (see (2) in Notes).

    allocate_total : boolean
        True if total value of source area should be allocated.
        False if denominator is area of i. Note that the two cases
        would be identical when the area of the source polygon is
        exhausted by intersections. See (3) in Notes for more details.

    raster : str
        the path to a local raster image to be used as a dasymetric mask. If using
        "dasymetric" this is a required argument.

    codes : list of ints
        list of raster pixel values that should be considered as
        'populated'. Since this draws inspiration from the National Land Cover
        Database (NLCD), the default is 21 (Developed, Open Space),
        22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and
        24 (Developed, High Intensity). The description of each code can be
        found here:
        https://www.mrlc.gov/sites/default/files/metadata/landcover.html
        Ignored if not using dasymetric harmonization.

    force_crs_match : bool. Default is True.
        Whether the Coordinate Reference System (CRS) of the polygon will be
        reprojected to the CRS of the raster file. It is recommended to
        leave this argument True.
        Only taken into consideration for raster-based harmonization.


    Notes
    -----
    1) raw_community is assumed to have a 'year' column, and all of its
       geometries must share the same Coordinate Reference System (CRS).

    2) A quick explanation of extensive and intensive variables can be found
    here: http://ibis.geog.ubc.ca/courses/geob370/notes/intensive_extensive.htm

    3) For an extensive variable, the estimate at target polygon j (default case) is:

        v_j = \sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \sum_k a_{i,k}

        If the area of the source polygon is not exhausted by intersections with
        target polygons and there is reason to not allocate the complete value of
        an extensive attribute, then setting allocate_total=False will use the
        following weights:

        v_j = \sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / a_i

        where a_i is the total area of source polygon i.

        For an intensive variable, the estimate at target polygon j is:

        v_j = \sum_i v_i w_{i,j}

        w_{i,j} = a_{i,j} / \sum_k a_{k,j}

    """
    if target_year is None:
        raise ValueError("target_year is a required parameter")
    if extensive_variables is None and intensive_variables is None:
        raise ValueError(
            "You must pass a set of extensive and/or intensive variables to interpolate"
        )
    if not extensive_variables:
        extensive_variables = []
    if not intensive_variables:
        intensive_variables = []
    all_vars = extensive_variables + intensive_variables

    _check_presence_of_crs(raw_community)
    dfs = raw_community.copy()
    times = dfs[time_col].unique().tolist()
    times.remove(target_year)

    target_df = dfs[dfs[time_col] == target_year].reset_index()

    interpolated_dfs = {}
    interpolated_dfs[target_year] = target_df.copy()

    with tqdm(total=len(times), desc=f'Converting {len(times)} time periods') as pbar:
        for i in times:
            pbar.write(f"Harmonizing {i}")
            source_df = dfs[dfs[time_col] == i]

            if weights_method == "area":

                # In area_interpolate, the resulting variable has the same length as target_df
                interpolation = area_interpolate(
                    source_df,
                    target_df.copy(),
                    extensive_variables=extensive_variables,
                    intensive_variables=intensive_variables,
                    allocate_total=allocate_total,
                )

            elif weights_method == "dasymetric":
                try:
                    # In area_interpolate, the resulting variable has the same length as target_df
                    interpolation = masked_area_interpolate(
                        source_df,
                        target_df.copy(),
                        extensive_variables=extensive_variables,
                        intensive_variables=intensive_variables,
                        allocate_total=allocate_total,
                        codes=codes,
                        raster=raster,
                    )
                except IOError:
                    raise IOError(
                        "Unable to locate raster. If using the `dasymetric` or model-based methods, you "
                        "must provide a raster file and indicate which pixel values contain developed land."
                    )
            else:
                raise ValueError('weights_method must be one of ["area", "dasymetric"]')

            profile = interpolation[all_vars].copy()  # copy to avoid SettingWithCopyWarning

            profile["geometry"] = target_df["geometry"]
            profile[index] = target_df[index]
            profile[time_col] = i

            interpolated_dfs[i] = profile
            pbar.update(1)
        pbar.set_description("Complete")
        pbar.close()


    harmonized_df = gpd.GeoDataFrame(
        pd.concat(list(interpolated_dfs.values()), sort=True)
    )

    return harmonized_df
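
A dasymetric variant of the same call (sketch; community_gdf and the raster path are assumptions):

harmonized = harmonize(
    community_gdf,
    target_year=2010,
    weights_method="dasymetric",
    extensive_variables=["population"],
    raster="nlcd_2011.tif",  # a raster path is required for the dasymetric method
)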
def create_non_zero_population_by_pixels_locations(geodataframe,
                                                   raster,
                                                   pop_string,
                                                   weights=None,
                                                   force_crs_match=True):
    '''Function that returns the actual population of each pixel from a given geodataframe and variable.

    geodataframe       : a GeoDataFrame whose geometries are contained in the raster

    raster             : the associated raster (from rasterio.open)

    pop_string         : the name of the column of the geodataframe on which the estimation will be made

    weights            : vector of weights for each pixel value, according to the 'return_weights_from_regression' function. This must be provided by the user.

    force_crs_match    : bool. Default is True.
                         Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                         It is recommended to leave this argument as True.
    
    '''

    _check_presence_of_crs(geodataframe)

    if force_crs_match:
        geodataframe_projected = geodataframe.to_crs(crs=raster.crs.data)
    else:
        warnings.warn(
            "The geodataframe is not being reprojected. The clipping might be performed on polygons whose CRS does not match the raster's."
        )
        geodataframe_projected = geodataframe

    result_pops_array = np.array([])
    result_lons_array = np.array([])
    result_lats_array = np.array([])

    for line_index in range(len(geodataframe_projected)):
        polygon_projected = geodataframe_projected.iloc[[line_index]]

        coords = getFeatures(polygon_projected)

        out_img, out_transform = mask(dataset=raster, shapes=coords, crop=True)
        # Calculate the population for each pixel
        trans_numpy = weights[out_img]  # Pixel population from regression
        orig_estimate = polygon_projected[
            pop_string]  # Original Population Value of The polygon
        correction_term = orig_estimate / trans_numpy.sum()
        final_pop_numpy_pre = trans_numpy * np.array(correction_term)

        flatten_final_pop_numpy_pre = np.ndarray.flatten(final_pop_numpy_pre)

        non_zero_pop_index = np.where(flatten_final_pop_numpy_pre != 0)

        final_pop_numpy = flatten_final_pop_numpy_pre[non_zero_pop_index]
        # Retrieve the location of each pixel
        lons, lats = create_lon_lat(out_img, out_transform)

        final_lons = np.ndarray.flatten(lons)[non_zero_pop_index]
        final_lats = np.ndarray.flatten(lats)[non_zero_pop_index]
        # Append all flattened numpy arrays
        result_pops_array = np.append(result_pops_array, final_pop_numpy)
        result_lons_array = np.append(result_lons_array, final_lons)
        result_lats_array = np.append(result_lats_array, final_lats)

        print('Polygon {} processed out of {}'.format(line_index + 1,
                                                      len(geodataframe)),
              end="\r")

    data = {
        'pop_value': result_pops_array,
        'lons': result_lons_array.round().astype(int).tolist(),
        'lats': result_lats_array.round().astype(int).tolist()
    }

    corresp = pd.DataFrame.from_dict(data)

    return corresp
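
Putting the pieces together, a rough end-to-end pipeline might look like the sketch below (every name is illustrative, the raster path is an assumption, and the weights function is the regression-based one from Example #15):

import rasterio

raster_path = "nlcd.tif"
weights = _return_weights_from_regression(source_tracts, raster_path, "population")

with rasterio.open(raster_path) as raster:
    corresp = create_non_zero_population_by_pixels_locations(
        source_tracts, raster, "population", weights=weights
    )
    estimates = calculate_interpolated_population_from_correspondence_table(
        target_tracts, raster, corresp
    )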