Example 1
def filter_records(data: pd.DataFrame, field: str, value_list: set, exclude_matching: bool = False, exclusion_analysis: bool = False) -> pd.DataFrame:
    """Filters the Census records by any field in ONS PD.

    Args:
        data: The Census data to filter
        field: The field on which to filter
        value_list: The values on which to filter
        exclude_matching: If True, exclude the values that match the filter. If False, keep the values that match the filter.
        exclusion_analysis: If True, analyse and log details of the excluded records

    Returns:
        Filtered data

    """
    # Count number of rows
    original_records = data.index.size
    matching_records = data[field].isin(value_list)
    if exclude_matching:
        # Excluding records that match the filter criteria
        filter_mask = ~matching_records
        logger.info(f"Selecting records that satisfy {field} not in {value_list} from {original_records} records.")
    else:
        # Including records that match the filter criteria
        filter_mask = matching_records
        logger.info(f"Selecting records that satisfy {field} in {value_list} from {original_records} records.")

    filtered = data.loc[filter_mask]
    logger.debug(f"Resulting in {filtered.index.size} records remaining.")

    if exclusion_analysis:
        excluded = data.loc[~filter_mask]
        _exclusion_analysis(data, filtered, excluded)

    return filtered
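A minimal, self-contained sketch of the isin-mask pattern that filter_records relies on, using a toy DataFrame (the column name and values are illustrative, not taken from the census schema):

import pandas as pd

df = pd.DataFrame({"ctry": ["E92000001", "W92000004", "S92000003"], "members": [10, 5, 8]})
value_list = {"E92000001", "W92000004"}

matching = df["ctry"].isin(value_list)
kept = df.loc[matching]      # what exclude_matching=False keeps
dropped = df.loc[~matching]  # what exclude_matching=True keeps
print(kept, dropped, sep="\n")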
Example 2
def _clean_and_verify_postcode(census_data: pd.DataFrame) -> None:
    """Cleans postcode data and inserts clean postcodes and validity check

    Cleans postcode data from the passed table.
    Gets the index of the postcode column, and inserts the new columns directly after it.

    Args:
        census_data: table of data with a postcode column

    """
    # Gets the index of the postcode column, and increments as insertion is from the left.
    # Columns must be inserted in number order otherwise it won't make sense
    postcode_column = scout_census.column_labels.POSTCODE  # heading of the postcode column in the table
    postcode_column_index = census_data.columns.get_loc(postcode_column)
    cleaned_postcode_index = postcode_column_index + 1
    valid_postcode_index = postcode_column_index + 2

    # Sets the labels for the columns to be inserted
    valid_postcode_label = scout_census.column_labels.VALID_POSTCODE

    logger.info("Cleaning postcodes")
    cleaned_postcode_column = _postcode_cleaner(census_data[postcode_column])

    logger.info("Inserting columns")
    census_data.insert(cleaned_postcode_index, CLEAN_POSTCODE_LABEL,
                       cleaned_postcode_column)
    census_data.insert(valid_postcode_index, valid_postcode_label,
                       float("NaN"))
Example 3
    def section_history_summary(self,
                                years: list,
                                report_name: str = None) -> pd.DataFrame:
        # Works effectively for years after 2017
        logger.info("Beginning section_history_summary")
        report = self._history_summary(years, "compass ID", "compass")
        if report_name:
            report_io.save_report(report, report_name)
        return report
Example 4
def load_postcode_directory(ons_pd: ONSPostcodeDirectory) -> pd.DataFrame:
    logger.info(f"Loading ONS postcode data.")
    return pd.read_csv(
        config.SETTINGS.ons_pd.full,
        index_col=ons_pd.index_column,
        dtype=ons_pd.data_types,
        usecols=[f for f in ons_pd.fields if f != "imd_decile"
                 ],  # imd_decile isn't defined in the raw file
        encoding="utf-8",
    )
Example 5
def add_shapefile_data(census_data: pd.DataFrame,
                       metadata: Boundary) -> pd.DataFrame:
    logger.info("Adding shapefile data")
    # self.census_data = self.census_data.copy()

    shapefile_key = metadata.shapefile.key
    new_data, points_data = add_shape_data(census_data,
                                           shapefile_key,
                                           path=metadata.shapefile.path)
    return new_data.rename(columns={shapefile_key: metadata.key})
Example 6
    def __init__(self, map_name: str, map_title: str):
        """Initialise Map class.

        Args:
            map_name: Filename for the saved map
            map_title: Title to display on the map

        """
        logger.info("Initialising leaflet map")
        self.map: dict[str, Any] = {"map_title": map_title}
        self.out_file = config.SETTINGS.folders.output / f"{map_name}.html"
Example 7
    def group_history_summary(self,
                              years: list,
                              report_name: str = None) -> pd.DataFrame:
        logger.info("Beginning group_history_summary")
        report = self._history_summary(years,
                                       "Group ID",
                                       scout_census.column_labels.id.GROUP,
                                       unit_type="Group")
        if report_name:
            report_io.save_report(report, report_name)
        return report
Example 8
    def wrapper(self, *args, **kwargs):
        # record a start time for the function
        start_time = time.time()
        logger.info(f"Calling function {method.__name__}")

        # call the original method with the passed arguments and keyword arguments, and store the result
        output = method(self, *args, **kwargs)
        logger.info(
            f"{method.__name__} took {time.time() - start_time:.2f} seconds")

        # return the output of the original function
        return output
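The wrapper above is the inner function of a timing decorator; a complete, self-contained sketch of how such a decorator is typically assembled (the decorator name and the functools.wraps call are assumptions, not taken from the source):

import functools
import logging
import time

logger = logging.getLogger(__name__)


def time_function(method):
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        # record a start time, call the wrapped method, then log the elapsed time
        start_time = time.time()
        logger.info(f"Calling function {method.__name__}")
        output = method(self, *args, **kwargs)
        logger.info(f"{method.__name__} took {time.time() - start_time:.2f} seconds")
        return output

    return wrapper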
Example 9
def _exclusion_analysis(original: pd.DataFrame, filtered: pd.DataFrame, excluded: pd.DataFrame):
    cols = {scout_census.column_labels.UNIT_TYPE, *(section_model.total for section, section_model in sections_model)}
    if not set(original.columns) >= cols:
        o_cols = original.columns.to_list()
        raise ValueError("Required columns are not in dataset!\n" f"Required columns are: {cols}.\n" f"Your columns are: {o_cols}")

    # Calculate the number of records that have been filtered out
    original_records = original.index.size
    excluded_records = original_records - filtered.index.size
    logger.info(f"{excluded_records} records were removed ({excluded_records / original_records * 100}% of total)")

    # Prints number of members and % of members filtered out for each section
    for section_name, section_model in sections_model:
        logger.debug(f"Analysis of {section_name} member exclusions")
        section_type = section_model.type
        members_col = section_model.total

        excluded_sections = excluded.loc[excluded[scout_census.column_labels.UNIT_TYPE] == section_type]
        excluded_members = 0
        if not excluded_sections.empty:
            logger.debug(f"Excluded sections\n{excluded_sections}")
            logger.debug(f"Finding number of excluded {section_name} by summing {members_col}")
            excluded_members = excluded_sections[members_col].sum()
            logger.debug(f"{excluded_members} {section_name} excluded")

        original_members = original.loc[original[scout_census.column_labels.UNIT_TYPE] == section_type, members_col].sum()
        if original_members > 0:
            logger.info(f"{excluded_members} {section_name} members were removed ({excluded_members / original_members * 100}%) of total")
        else:
            logger.info(f"There are no {section_name} members present in data")
Example 10
def save_merged_data(data: pd.DataFrame, ons_pd_publication_date: str) -> None:
    """Save passed dataframe to csv file.

    Also outputs a list of errors in the merge process to a csv file

    Args:
        data: Census data
        ons_pd_publication_date: Refers to the ONS Postcode Directory's publication date

    """
    raw_extract_path = config.SETTINGS.census_extract.original
    output_path = raw_extract_path.parent / f"{raw_extract_path.stem} with {ons_pd_publication_date} fields"
    error_output_path = config.SETTINGS.folders.output / "error_file.csv"

    valid_postcode_label = scout_census.column_labels.VALID_POSTCODE
    postcode_merge_column = "clean_postcode"
    original_postcode_label = scout_census.column_labels.POSTCODE
    compass_id_label = scout_census.column_labels.id.COMPASS

    # The errors file contains all the postcodes that failed to be looked up in the ONS Postcode Directory
    error_output_fields = [
        postcode_merge_column, original_postcode_label, compass_id_label,
        "type", "name", "G_name", "D_name", "C_name", "R_name", "X_name",
        "Census Date"
    ]
    data.loc[~data[valid_postcode_label],
             error_output_fields].to_csv(error_output_path,
                                         index=False,
                                         encoding="utf-8-sig")

    # Write the new data to a csv file (utf-8-sig only to force excel to use UTF-8)
    logger.info("Writing merged data")
    data.to_csv(output_path.with_suffix(".csv"),
                index=False,
                encoding="utf-8-sig")
    data.to_feather(output_path.with_suffix(".feather"))
Example 11
def _load_boundary(boundary_report: pd.DataFrame, boundary_metadata: config.Boundary) -> gpd.GeoDataFrame:
    """Loads a given boundary from a boundary report and metadata.

    Loads shapefile from path into GeoPandas dataframe
    Filters out unneeded shapes within all shapes loaded
    Converts from British National Grid to WGS84, as Leaflet doesn't understand BNG

    Args:
        boundary_report: A DataFrame object with boundary report data
        boundary_metadata: This contains shapefile paths, and labels for region codes and names

    Returns:
        GeoDataFrame with filtered and CRS transformed shapes

    """
    metadata = boundary_metadata
    data = boundary_report

    # Read the shapefile. metadata.shapefile.path is the path to the ESRI shapefile with region information
    logger.info("Loading Shapefile data")
    logger.debug(f"Shapefile path: {metadata.shapefile.path}")
    start_time = time.time()
    all_shapes = gpd.read_file(metadata.shapefile.path)
    logger.info(f"Loading Shapefile data finished, {time.time() - start_time:.2f} seconds elapsed")
    if metadata.shapefile.key not in all_shapes.columns:
        raise KeyError(f"{metadata.shapefile.key} not present in shapefile. Valid columns are: {all_shapes.columns}")

    # Rename columns
    shapes_col_map = {metadata.shapefile.key: "shape_codes", metadata.shapefile.name: "shape_names"}
    all_shapes.columns = [shapes_col_map.get(col, col) for col in all_shapes.columns]

    # Filter and convert GeoDataFrame to world co-ordinates
    logger.info(f"Filtering {len(all_shapes.index)} shapes by shape_codes being in the codes column of the map_data")
    all_codes = set(data["codes"])
    logger.debug(f"All codes list: {all_codes}")
    geo_data = all_shapes.loc[all_shapes["shape_codes"].isin(all_codes), ["geometry", "shape_codes", "shape_names"]].to_crs(epsg=constants.WGS_84)
    logger.info(f"Loaded {len(geo_data.index):,} boundary shapes. Columns now in data: {[*data.columns]}.")
    return geo_data
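A compact sketch of the rename-and-reproject step performed by _load_boundary, on a toy GeoDataFrame in British National Grid (EPSG:27700); the shapefile column names here are placeholders:

import geopandas as gpd

shapes = gpd.GeoDataFrame(
    {"LAD21CD": ["E07000001"], "LAD21NM": ["Example District"]},
    geometry=gpd.points_from_xy([530000], [180000]),
    crs="EPSG:27700",  # British National Grid
)
col_map = {"LAD21CD": "shape_codes", "LAD21NM": "shape_names"}
shapes.columns = [col_map.get(col, col) for col in shapes.columns]
shapes = shapes.to_crs(epsg=4326)  # WGS84, as Leaflet expects
print(shapes[["shape_codes", "shape_names"]])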
Example 12
def merge_with_postcode_directory(
        census_data: pd.DataFrame, ons_pd_data: pd.DataFrame,
        ons_fields_data_types: dict[str, list[str]]) -> pd.DataFrame:
    logger.info("Cleaning the postcodes")
    _clean_and_verify_postcode(census_data)

    # attempt to fix invalid postcodes
    logger.info("Adding ONS postcode directory data to Census and outputting")
    data = _try_fix_invalid_postcodes(census_data, ons_pd_data.index)

    # fully merge the data
    logger.info("Merging data")
    data = pd.merge(data,
                    ons_pd_data,
                    how="left",
                    left_on="clean_postcode",
                    right_index=True,
                    sort=False)

    # fill unmerged rows with default values
    logger.info("filling unmerged rows")
    data = _fill_unmerged_rows(data, ons_fields_data_types)

    return data
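The central join above matches each cleaned census postcode against the index of the ONS Postcode Directory; a minimal sketch of that left merge with toy frames (values are illustrative):

import pandas as pd

census = pd.DataFrame({"clean_postcode": ["SW1A1AA", "ZZ99ZZZ"], "members": [12, 7]})
ons_pd = pd.DataFrame({"ctry": ["E92000001"]}, index=pd.Index(["SW1A1AA"], name="pcd"))

merged = pd.merge(census, ons_pd, how="left", left_on="clean_postcode", right_index=True, sort=False)
print(merged)  # the unmatched postcode gets NaN in the ONS columns, to be filled later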
Example 13
def _try_fix_invalid_postcodes(census_data: pd.DataFrame,
                               all_valid_postcodes: pd.Index) -> pd.DataFrame:
    """Uses various methods attempting to provide every record with a valid postcode

    Currently only implemented for sections with youth membership.
    TODO: implement for all entity types

    Methodology:
    - If section has an invalid postcode in 2017 or 2018, use 2019's if valid (all are valid or missing in 2019)
    - If section has no valid postcodes, use most common (mode) postcode from sections in group in that year, then try successive years
    - If group or district has no valid postcode in 2010-2016, use following years (e.g. if 2010 not valid, try 2011, 12, 13 etc.)

    Args:
        census_data: Dataframe of census data including invalid postcodes
        all_valid_postcodes: All valid postcodes from the ONS Postcode Directory

    Returns:
        modified data table with more correct postcodes

    """

    logger.info("filling postcodes in sections with invalid postcodes")

    # Helper variables to store field headings for often used fields
    section_id_label = scout_census.column_labels.id.COMPASS
    group_id_label = scout_census.column_labels.id.GROUP
    district_id_label = scout_census.column_labels.id.DISTRICT

    # Lists of entity types to match against in constructing section records tables
    group_section_types = scout_census.TYPES_GROUP
    district_section_types = scout_census.TYPES_DISTRICT
    section_types = group_section_types | district_section_types
    pre_2017_types = {"Group", "District"}

    # Columns to use in constructing the MultiIndex. Larger groups go first towards smaller
    index_cols = [
        district_id_label, group_id_label, section_id_label,
        scout_census.column_labels.CENSUS_ID
    ]

    # Find which postcodes are valid
    census_data[scout_census.column_labels.VALID_POSTCODE] = census_data[
        CLEAN_POSTCODE_LABEL].isin(all_valid_postcodes)

    # Sets a MultiIndex on the data table to enable fast searching and querying for data
    census_data = census_data.set_index(index_cols, drop=False)

    census_data = _run_postcode_fix_step(census_data, all_valid_postcodes,
                                         "section", "latest Census",
                                         section_types, section_id_label, 2)
    census_data = _run_postcode_fix_step(census_data, all_valid_postcodes,
                                         "group-section", "same group",
                                         group_section_types, group_id_label,
                                         1)
    census_data = _run_postcode_fix_step(census_data, all_valid_postcodes,
                                         "district-section", "same district",
                                         district_section_types,
                                         district_id_label, 0)
    census_data = _run_postcode_fix_step(census_data, all_valid_postcodes,
                                         "pre 2017", "same entity",
                                         pre_2017_types, section_id_label, 2)

    # Undo the changes made in this method by removing the MultiIndex
    census_data = census_data.reset_index(drop=True)
    return census_data
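A minimal sketch of the two set-up steps above: flagging valid postcodes with isin against the ONS index, and building the MultiIndex used by the fix steps (toy data; the column names are illustrative):

import pandas as pd

census = pd.DataFrame({
    "D_ID": [1, 1], "G_ID": [10, 10], "compass": [100, 101],
    "Census_ID": [19, 20], "clean_postcode": ["SW1A1AA", "BAD"],
})
all_valid_postcodes = pd.Index(["SW1A1AA"])

census["postcode_is_valid"] = census["clean_postcode"].isin(all_valid_postcodes)
census = census.set_index(["D_ID", "G_ID", "compass", "Census_ID"], drop=False)
print(census["postcode_is_valid"].to_list())  # [True, False]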
Example 14
    def _history_summary(self,
                         years: list,
                         id_name: str,
                         census_col: str,
                         unit_type: str = None) -> pd.DataFrame:
        sections_model = scout_census.column_labels.sections

        # Must have imd scores and deciles already in census_postcode_data.
        logger.info(f"Grouping data by {census_col}")
        data = self.census_data
        grouped_data = data.groupby([census_col], sort=False)

        # create dataframe of all constant values, which happen to all be scout org hierarchy related
        logger.info(f"Creating table of Scout organisational data")
        scout_org_cols = [
            census_col,
            scout_census.column_labels.UNIT_TYPE,
            scout_census.column_labels.name.GROUP,
            scout_census.column_labels.name.DISTRICT,
            scout_census.column_labels.name.COUNTY,
            scout_census.column_labels.name.REGION,
            scout_census.column_labels.name.COUNTRY,
        ]
        scout_org_data = grouped_data[scout_org_cols].first()

        # if unit_type is set the series should be overwritten with that value
        # this is for manually overwriting the unit type
        if unit_type:
            scout_org_data[scout_census.column_labels.UNIT_TYPE] = unit_type

        logger.info(f"Finding opening and closing years")
        # Takes the year column from the grouped_data object resulting in a SeriesGroupBy
        # Aggregates with a lambda to get the min and max years for each group
        # Applies to a Series, unpacking the returned tuples to individual series
        # Casts to an object dtype as later we introduce text.
        years_return = grouped_data["Year"].agg(
            lambda series: (series.min(), series.max())).apply(
                pd.Series).astype(object)

        # If open in the first year of data or last year of data, add an explanatory note that the limits are not certain
        years_return[0] = years_return[0].mask(years_return[0] == years[0],
                                               f"{years[0]} or before")
        years_return[1] = years_return[1].mask(years_return[1] == years[-1],
                                               f"Open in {years[-1]}")
        years_return.columns = ["min_year", "max_year"]

        # for each dataframe in the groupby object, find the lowest IMD rank from the latest year, and return
        # values associated with that rank. This requires IMD decile to have been added beforehand
        imd_cols = ["clean_postcode", "ctry", "imd", "imd_decile"]

        def _imd_groupby(df: pd.DataFrame):
            # To find the postcode and IMD for a range of Sections and Group records across several years.
            # Find the most recent year, and then choose the Postcode with the lowest IMD Rank.
            most_recent_year = df["Year"].max()
            df["imd"] = df["imd"].where(
                df["imd"] >
                0)  # Only keep values where IMD rank is greater than 0
            most_recent_records = df[
                df["Year"] == most_recent_year]  # The latest year of records
            min_imd_records = most_recent_records.nsmallest(
                1, "imd")  # get smallest imd rank
            return min_imd_records[imd_cols]

        # per year open, record the total young people per section and the number of adults
        # uses a nested groupby for efficiency
        sections_list = [
            section_name for section_name, section_model in sections_model
            if section_name not in {"Explorers", "Network"}
        ]
        adult_cols = ["Leaders", "SectAssistants",
                      "OtherAdults"]  # TODO re add

        def _year_groupby(df):
            dicts: pd.Series = df.groupby(
                ["Year"], sort=True).apply(_section_groupby).to_list()
            output = {}
            for row in dicts:
                output |= row
            return output

        def _section_groupby(df):
            census_year = df.name
            output = {}
            for section in sections_list:
                output[f"{section}-{census_year}"] = df[getattr(
                    scout_census.column_labels.sections, section).total].sum()
            output[f"Adults-{census_year}"] = df[adult_cols].to_numpy().sum()
            return output

        # For each year, calculate and add number of beavers, cubs, scouts.  Explorers, Network deliberately omitted.
        # Expand series of dictionaries to a dataframe with the same index
        logger.info(f"Creating table of members by section and adults by year")
        member_numbers_table = grouped_data.apply(_year_groupby)
        member_numbers_table = pd.DataFrame(member_numbers_table.to_list(),
                                            index=member_numbers_table.index)

        # apply the imd function and map country codes to country names
        logger.info(f"Creating table of IMD data and postcodes")
        imd_table = grouped_data.apply(_imd_groupby).droplevel(1)
        imd_table["IMD Country"] = imd_table["ctry"].map(ONS_PD.COUNTRY_CODES)

        # fmt: off
        column_renaming = {
            census_col: id_name,
            "type": "Type",
            "G_name": "Group",
            "D_name": "District",
            "C_name": "County",
            "R_name": "Region",
            "X_name": "Scout Country",
            "clean_postcode": "Postcode",
            "imd": "IMD Rank",
            "imd_decile": "IMD Decile",
            "min_year": "First Year",
            "max_year": "Last Year",
        }
        # fmt: on

        logger.info(f"Merging tables and conforming columns")
        history_summary_data = scout_org_data.join([
            imd_table, years_return, member_numbers_table
        ]).rename(columns=column_renaming).reset_index(drop=True)

        # create output columns list and add generated section names
        output_columns = [
            id_name, "Type", "Group", "District", "County", "Region",
            "Scout Country", "Postcode", "IMD Country", "IMD Rank",
            "IMD Decile", "First Year", "Last Year"
        ]
        for year in years:
            output_columns.extend([
                f"{section_name}-{year}"
                for section_name, section_model in sections_model
                if section_name != "Explorers"
            ])
            output_columns.append(f"Adults-{year}")

        return pd.DataFrame(history_summary_data, columns=output_columns)
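A small sketch of the opening/closing-year step in _history_summary, using agg(["min", "max"]) as a simpler but equivalent route to the lambda-plus-apply used above (toy data):

import pandas as pd

years = [2017, 2018, 2019]
data = pd.DataFrame({"compass": [1, 1, 2], "Year": [2017, 2019, 2018]})

# min/max census year per unit, cast to object so the text notes below can replace the integers
years_return = data.groupby("compass", sort=False)["Year"].agg(["min", "max"]).astype(object)
years_return["min"] = years_return["min"].mask(years_return["min"] == years[0], f"{years[0]} or before")
years_return["max"] = years_return["max"].mask(years_return["max"] == years[-1], f"Open in {years[-1]}")
years_return.columns = ["min_year", "max_year"]
print(years_return)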
Example 15
    def add_meeting_places_to_map(
        self,
        sections: pd.DataFrame,
        colour_key: str,
        marker_data: set[str],
        layer_name: str = "Sections",
        cluster_markers: bool = False,
        show_layer: bool = True,
        coloured_region: set[str] = None,
        coloured_region_key: str = "",
    ) -> None:
        """Adds the sections provided as markers to map with the colour, and data
        indicated by marker_data.

        Args:
            sections: Census records relating to Sections with lat and long Columns
            colour_key: Determines marker colour. If a column in `sections`, categorical colours. Otherwise, must be a CSS colour name.
            marker_data: Set of strings which determines content for popup, including:
                - youth membership
                - awards
            layer_name: Name of map layer for meeting places. Default = "Sections"
            cluster_markers: Whether to cluster markers on the map
            show_layer: Whether to show the layer by default
            coloured_region: If specified, markers on the map but not within coloured_region are grey
            coloured_region_key: Column for coloured_region boundary codes

        """
        logger.info("Adding section markers to map")

        # check that sections dataframe has data, and that there are any sections
        if sections.empty or (sections[scout_census.column_labels.id.DISTRICT].dropna().empty and sections[scout_census.column_labels.id.GROUP].dropna().empty):
            return

        # Sort sections dataframe
        sections = sections.sort_values(scout_census.column_labels.id.OBJECT).reset_index(drop=True)

        if layer_name in self.map:
            raise ValueError("Layer already used!")

        # Sets the map so that it opens in the right area
        valid_points = sections.loc[sections[scout_census.column_labels.VALID_POSTCODE], ["lat", "long"]]
        self.map["bounds"] = _output_fit_bounds(((valid_points.lat.min(), valid_points.long.min()), (valid_points.lat.max(), valid_points.long.max())))

        section_names = sections["name"].astype(str)

        if "youth membership" in marker_data:
            section_type = sections[scout_census.column_labels.UNIT_TYPE].map(constants.section_types)
            yp_total_cols = [section_model.total for section_name, section_model in scout_census.column_labels.sections]
            yp_totals = sections[yp_total_cols].sum(axis=1).astype(int).astype(str)  # Each row only has values for one section type
            sections["sect_overview"] = section_names + " : " + yp_totals + " " + section_type
        else:
            sections["sect_overview"] = section_names

        if "awards" in marker_data:
            # This uses just the first top award - so only Diamond/QSA for Explorers/Network
            top_award_cols = [section_model.top_award[0] for section_name, section_model in scout_census.column_labels.sections]
            awards = sections[top_award_cols].sum(axis=1).astype(int).astype(str)
            award_eligible_cols = [section_model.top_award_eligible[0] for section_name, section_model in scout_census.column_labels.sections]
            eligible = sections[award_eligible_cols].sum(axis=1).astype(int).astype(str)
            sections["awards_info"] = section_names + " : " + awards + " Top Awards out of " + eligible + " eligible"

        if colour_key in sections.columns:
            sections["marker_colour"] = sections[colour_key].map(_colour_mapping(sections[colour_key]))
        else:
            sections["marker_colour"] = colour_key

        if coloured_region_key and coloured_region is not None:
            # Areas outside the region_of_colour have markers coloured grey
            sections.loc[~sections[coloured_region_key].isin(coloured_region), "marker_colour"] = "gray"

        sections["postcode"] = sections[scout_census.column_labels.POSTCODE]
        sections["c_name"] = sections[scout_census.column_labels.name.COUNTY]
        sections["d_name"] = sections[scout_census.column_labels.name.DISTRICT]
        sections["g_name"] = sections[scout_census.column_labels.name.GROUP].astype(str).fillna("District")

        sections_info_cols = ["postcode", "lat", "long", "marker_colour", "c_name", "d_name", "g_name", "sect_overview"]
        if "awards" in marker_data:
            sections_info_cols += ["awards_info"]
        sections_info_table = sections[sections_info_cols].dropna(subset=["d_name"]).dropna(subset=["postcode"])

        # append a dummy final row, otherwise the last real marker would never be added in the loop below
        last_row = pd.Series(sections_info_table.iloc[0].to_dict() | {"postcode": "~ FINAL"}, name=0)
        sections_info_table = sections_info_table.append(last_row)

        # set and sort index
        sections_info_table = sections_info_table.set_index(["postcode", "d_name", "g_name"], drop=True).sort_index(level=[0, 1, 2])

        # pre-calculate inner loop vars
        include_awards_data = "awards" in marker_data

        # initialise change-detector variables
        old_postcode = sections_info_table.index[0][0]
        old_district_name = ""
        reset_district = False

        # initialise first marker variables
        html = ""
        lat = round(sections_info_table["lat"].array[0], 4)
        long = round(sections_info_table["long"].array[0], 4)
        marker_colour = sections_info_table["marker_colour"].array[0]

        # Find all the sections with the same postcode:
        out = []
        for (postcode, district_name, group_name), sub_table in sections_info_table.groupby(level=[0, 1, 2]):
            if old_postcode != postcode:
                # Add a marker each time the postcode changes.
                out.append({"lat": lat, "lon": long, "col": marker_colour, "html": html})

                old_postcode = postcode  # update the old postcode
                lat = round(sub_table["lat"].array[0], 4)
                long = round(sub_table["long"].array[0], 4)
                marker_colour = sub_table["marker_colour"].array[0]

                # reset HTML string and mark district name for re-adding
                html = ""
                reset_district = True

            if old_district_name != district_name or reset_district:
                old_district_name = district_name
                reset_district = False

                county_name = sub_table["c_name"].array[0]

                # District sections first followed by Group sections
                html += f"<h3>{district_name} ({county_name})</h3>"

            html += f"<h4>{group_name}</h4><p align='center'>"
            html += "<br>".join(sub_table["sect_overview"])
            if include_awards_data and group_name != "District":
                awards_info = "<br>".join(sub_table["awards_info"])
                html += "<br>" + awards_info
            html += "</p>"
        # TODO marker cluster/feature group
        self.map[layer_name] = _output_marker_layer(layer_name, out)
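A minimal sketch of the level-based groupby that drives the marker loop above, with a toy three-level (postcode, district, group) index; the names and overview strings are placeholders:

import pandas as pd

rows = [
    ("SW1A 1AA", "District A", "1st Example", "Beavers : 10 Colony"),
    ("SW1A 1AA", "District A", "1st Example", "Cubs : 14 Pack"),
    ("EH1 1AA", "District B", "District", "Explorers : 8 Unit"),
]
table = pd.DataFrame(rows, columns=["postcode", "d_name", "g_name", "sect_overview"])
table = table.set_index(["postcode", "d_name", "g_name"]).sort_index(level=[0, 1, 2])

# one iteration per (postcode, district, group) combination, mirroring the marker-building loop
for (postcode, district_name, group_name), sub_table in table.groupby(level=[0, 1, 2]):
    print(postcode, district_name, group_name, "|", "; ".join(sub_table["sect_overview"]))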
Example 16
    def new_section_history_summary(self,
                                    years: list,
                                    report_name: str = None) -> pd.DataFrame:
        sections_model = scout_census.column_labels.sections

        # Given data on all sections, provides summary of all new sections, and
        # copes with the pre-2017 section reporting structure
        logger.info(f"Beginning new_section_history_summary for {years}")
        new_section_ids: list[dict] = []

        logger.info(
            f"Getting group ID list in column {scout_census.column_labels.id.GROUP}"
        )
        # Iterate through Groups looking for new Sections
        group_ids = self.census_data[
            scout_census.column_labels.id.GROUP].dropna().drop_duplicates(
            ).to_list()

        logger.info(f"Found {len(group_ids)} Groups")

        # For each section in each group in the census:
        #   construct a dict of {year: number of sections of that type open in that year},
        #   then the list of year-on-year changes in the number of sections.
        # If there is any increase, walk through the years from the second year onwards and,
        #   depending on the change, open new section records or extend/close existing ones.
        # The same process is then repeated for district sections (Explorers).

        census_data = self.census_data.fillna({
            scout_census.column_labels.id.GROUP:
            0,
            scout_census.column_labels.id.DISTRICT:
            0
        })

        for group_id in group_ids:
            logger.info(f"Investigating {group_id}")
            group_records = census_data.loc[census_data[
                scout_census.column_labels.id.GROUP] == group_id]

            for section in scout_census.SECTIONS_GROUP:
                logger.info(f"Finding {section} sections")
                units_by_year = {}
                for year in years:
                    section_numbers_year = group_records.loc[
                        group_records["Year"] == year,
                        getattr(scout_census.column_labels.sections, section
                                ).unit_label].sum()
                    units_by_year[year] = section_numbers_year

                increments = [
                    units_by_year[year + 1] - units_by_year[year]
                    for year in units_by_year.keys()
                    if (year + 1) in units_by_year
                ]
                if max(increments) > 0:
                    logger.debug(
                        f"Identified year profile of sections: {units_by_year}"
                    )
                    opened_sections = []
                    closed_sections = []
                    for year in years[1:]:
                        change = units_by_year[year] - units_by_year[year - 1]
                        if change > 0:
                            # Extend the life of current sections
                            for open_sections in opened_sections:
                                open_sections["years"].append(year)
                            # Create new section record
                            for ii in range(change):
                                logger.debug(
                                    f"New {section} section found for {group_id} in {year}"
                                )
                                opened_sections.append({
                                    "id":
                                    group_id,
                                    "section":
                                    section,
                                    "years": [year],
                                    "nu_sections":
                                    units_by_year
                                })
                        elif change == 0:
                            # Lengthens all sections by a year
                            for open_sections in opened_sections:
                                open_sections["years"].append(year)
                        elif change < 0:
                            for ii in range(-change):
                                # Close the newest sections first
                                if len(opened_sections) > 0:
                                    logger.debug(
                                        f"{section} closed for {group_id} in {year}"
                                    )
                                    closed_sections.append(
                                        opened_sections.pop(-1))
                            # Lengthens remaining open sections by a year
                            for open_sections in opened_sections:
                                open_sections["years"].append(year)

                    logger.debug(
                        f"For {group_id} adding\n{opened_sections + closed_sections}"
                    )
                    new_section_ids += opened_sections
                    new_section_ids += closed_sections
                else:
                    logger.info(f"No new {section} sections in {group_id}")

        logger.info("Finding new Explorer Sections")
        # Iterate through District looking for new Sections

        district_ids = self.census_data[
            scout_census.column_labels.id.DISTRICT].drop_duplicates().dropna(
            ).to_list()

        for district_id in district_ids:
            logger.info(f"Investigating {district_id}")
            district_records = census_data.loc[census_data[
                scout_census.column_labels.id.DISTRICT] == district_id]
            units_by_year = {}
            for year in years:
                district_records_year = district_records.loc[
                    district_records["Year"] == year]
                units_by_year[year] = district_records_year[
                    sections_model.Explorers.unit_label].sum()

            increments = [
                units_by_year[year + 1] - units_by_year[year]
                for year in units_by_year.keys() if (year + 1) in units_by_year
            ]
            if max(increments) > 0:
                opened_sections = []
                closed_sections = []
                for year in years[1:]:
                    change = units_by_year[year] - units_by_year[year - 1]
                    if change > 0:
                        # Extend the life of current sections
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)
                        # Create new section record
                        for ii in range(change):
                            opened_sections.append({
                                "id": district_id,
                                "section": "Explorers",
                                "years": [year],
                                "nu_sections": units_by_year
                            })
                    elif change == 0:
                        # Lengthens all sections by a year
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)
                    elif change < 0:
                        for ii in range(-change):
                            # Close the newest sections first
                            if len(opened_sections) > 0:
                                closed_sections.append(opened_sections.pop(-1))
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)

                logger.debug(
                    f"For {district_id} adding\n{opened_sections + closed_sections}"
                )
                new_section_ids += opened_sections
                new_section_ids += closed_sections

        section_details = []
        for year in years:
            if year < 2017:
                section_details.append(f"{year}_Est_Members")
            else:
                section_details.append(f"{year}_Members")

        # fmt: off
        output_columns = [
            "Object_ID", "Section Name", "Section", "Group_ID", "Group",
            "District_ID", "District", "County", "Region", "Scout Country",
            "Postcode", "IMD Country", "IMD Rank", "IMD Decile", "First Year",
            "Last Year", f"{years[0]}_sections", *section_details
        ]
        # fmt: on
        output_data = pd.DataFrame(columns=output_columns)

        logger.info(
            f"Start iteration through {len(new_section_ids)} new Sections")
        used_compass_ids = set()
        count = 0
        total = len(new_section_ids)
        new_sections_id: dict
        for new_sections_id in new_section_ids:
            section_data = {}
            logger.debug(f"Recording {new_sections_id}")
            count += 1
            logger.info(f"{count} of {total}")
            section_id = new_sections_id["id"]
            open_years = new_sections_id["years"]
            section = new_sections_id["section"]
            section_type = getattr(scout_census.column_labels.sections,
                                   section).type

            if section in scout_census.SECTIONS_GROUP:
                records = census_data.loc[census_data[
                    scout_census.column_labels.id.GROUP] == section_id]
                section_data["Group_ID"] = records[
                    scout_census.column_labels.id.GROUP].unique()[0]
                section_data["Group"] = records[
                    scout_census.column_labels.name.GROUP].unique()[0]
            elif section in scout_census.SECTIONS_DISTRICT:
                records = census_data.loc[census_data[
                    scout_census.column_labels.id.DISTRICT] == section_id]
                section_data["Group_ID"] = ""
                section_data["Group"] = ""
            else:
                raise Exception(
                    f"{section} belongs to neither a Group nor a District. id = {new_sections_id}"
                )

            for year in open_years:
                members_cols = getattr(scout_census.column_labels.sections,
                                       section).total
                year_records = records.loc[records["Year"] == year]
                if year >= 2017:
                    compass_id = section_data.get("Object_ID")
                    section_year_records = year_records.loc[records[
                        scout_census.column_labels.UNIT_TYPE] == section_type]

                    if compass_id:
                        section_record = section_year_records.loc[
                            section_year_records["Object_ID"] == compass_id]
                        section_data[f"{year}_Members"] = section_record[
                            members_cols].sum()
                    else:
                        section_year_ids: pd.Series = section_year_records[
                            "Object_ID"].drop_duplicates()
                        if open_years[0] >= 2017:
                            # If the section opened after 31st January 2017 then it can be identified by Object_ID
                            last_year_records = records.loc[records["Year"] ==
                                                            (year - 1)]
                            old_section_ids = last_year_records[
                                "Object_ID"].unique()
                            opened_section_ids = section_year_ids[
                                ~section_year_ids.isin(old_section_ids)]
                            if len(opened_section_ids) > 1:
                                logger.info(
                                    f"{len(opened_section_ids)} sections opened"
                                )
                                unused_ids = opened_section_ids[
                                    ~opened_section_ids.isin(used_compass_ids)]
                                compass_id = unused_ids.iloc[
                                    0] if not unused_ids.empty else opened_section_ids.iloc[
                                        -1]
                            elif len(opened_section_ids) == 0:
                                logger.error(
                                    f"No sections opened\n{year}: {section_year_ids}\n{year-1}: {old_section_ids}"
                                )
                            elif len(opened_section_ids) == 1:
                                compass_id = opened_section_ids.iloc[0]
                                logger.debug(f"Assigned id: {compass_id}")

                            section_data["Object_ID"] = compass_id
                            used_compass_ids.add(compass_id)
                            section_data[
                                f"{year}_Members"] = section_year_records.loc[
                                    section_year_records["Object_ID"] ==
                                    compass_id, members_cols].sum()
                        else:
                            compass_id = section_year_ids.max()

                            if compass_id in used_compass_ids:
                                section_year_ids.sort_values(ascending=False)
                                unused_ids = section_year_ids[
                                    ~section_year_ids.isin(used_compass_ids)]
                                if not unused_ids.empty:
                                    compass_id = unused_ids.iloc[0]
                                else:
                                    compass_id = section_year_ids.iloc[0]

                            section_data["Object_ID"] = compass_id
                            used_compass_ids.add(compass_id)
                            total_members = section_year_records.loc[
                                section_year_records["Object_ID"] ==
                                compass_id, members_cols].sum()

                            logger.debug(
                                f"{section} in {section_id} in {year} found {total_members} members"
                            )
                            section_data[f"{year}_Members"] = total_members
                else:
                    year_before_section_opened = open_years[0] - 1
                    year_before_records = records.loc[
                        records["Year"] == year_before_section_opened]

                    number_of_new_sections = new_sections_id["nu_sections"][
                        open_years[0]] - new_sections_id["nu_sections"][
                            year_before_section_opened]

                    new_members = year_records[members_cols].sum()
                    old_members = year_before_records[members_cols].sum()

                    additional_members = (new_members -
                                          old_members) / number_of_new_sections
                    if additional_members < 0:
                        logger.warning(
                            f"{section_id} increased number of {section} sections but membership decreased by {additional_members}"
                        )

                    logger.debug(
                        f"{section} in {section_id} in {year} found {additional_members} members"
                    )
                    section_data[f"{year}_Est_Members"] = additional_members

            closed_years = [year for year in years if year not in open_years]
            for year in closed_years:
                if year >= 2017:
                    section_data[f"{year}_Members"] = 0
                else:
                    section_data[f"{year}_Est_Members"] = 0

            section_data[f"{years[0]}_sections"] = new_sections_id[
                "nu_sections"][years[0]]

            section_records = None

            if section_data.get("Object_ID"):
                section_records = records.loc[records["Object_ID"] ==
                                              section_data.get("Object_ID")]
                section_data["Section Name"] = section_records["name"].unique(
                )[0]
            else:
                if open_years[-1] < 2017:
                    if section in scout_census.SECTIONS_GROUP:
                        section_records = records.loc[
                            records[scout_census.column_labels.UNIT_TYPE] ==
                            scout_census.UNIT_LEVEL_GROUP]
                    elif section in scout_census.SECTIONS_DISTRICT:
                        section_records = records.loc[
                            records[scout_census.column_labels.UNIT_TYPE] ==
                            scout_census.UNIT_LEVEL_DISTRICT]
                elif open_years[-1] == 2017:
                    section_records = records.loc[records[
                        scout_census.column_labels.UNIT_TYPE] == section_type]
                else:
                    raise Exception(
                        f"Unable to find section records for {new_section_ids}"
                    )

            section_data["Section"] = section
            section_data["District_ID"] = section_records[
                scout_census.column_labels.id.DISTRICT].unique()[0]
            section_data["District"] = section_records[
                scout_census.column_labels.name.DISTRICT].unique()[0]
            section_data["County"] = section_records["C_name"].unique()[0]
            section_data["Region"] = section_records["R_name"].unique()[0]
            section_data["Scout Country"] = section_records["X_name"].unique(
            )[0]

            if open_years[0] == years[0]:
                section_data["First Year"] = f"{years[0]} or before"
            else:
                section_data["First Year"] = open_years[0]
            if open_years[-1] == years[-1]:
                section_data["Last Year"] = f"Open in {years[-1]}"
            else:
                section_data["Last Year"] = open_years[-1]

            # To find the postcode and IMD for a range of Sections and Group
            # records across several years. Find the most recent year, and then
            # choose the Postcode, where the IMD Rank is the lowest.
            most_recent_year = open_years[-1]
            logger.debug(f"Checking {most_recent_year}")
            most_recent = section_records.loc[section_records["Year"] ==
                                              most_recent_year]
            if most_recent.shape[0] == 1:
                most_recent = most_recent.iloc[0]
            elif most_recent.shape[0] == 0:
                logger.warning("Inconsistent ids")
                if section in scout_census.SECTIONS_GROUP:
                    # If the Object_IDs aren't consistent, pick the most recent section in the group.
                    # This is only applicable after 2017, so matching sections are assumed to exist.
                    most_recent = records.loc[
                        (records[scout_census.column_labels.id.GROUP] ==
                         section_data["Group_ID"])
                        & (records[scout_census.column_labels.UNIT_TYPE] ==
                           section_type)
                        & (records["Year"] == most_recent_year)].iloc[0]
                elif section in scout_census.SECTIONS_DISTRICT:
                    most_recent_record = records.loc[
                        (records[scout_census.column_labels.id.DISTRICT] ==
                         section_data["District_ID"])
                        & (records[scout_census.column_labels.UNIT_TYPE] ==
                           section_type)
                        & (records["Year"] == most_recent_year)]

                    if most_recent_record.empty:
                        logger.error(
                            f"No records found with D_ID = {section_data['District_ID']} in {most_recent_year} that are {section}"
                        )

                    most_recent = most_recent_record.iloc[0]
            else:
                logger.warning("Multiple sections found, assigning a section")
                most_recent = most_recent.iloc[0]

            postcode_valid = most_recent.at["postcode_is_valid"]
            # logger.debug(f"Identified:\n{most_recent} determined postcode valid:\n{postcode_valid}\n{postcode_valid == 1}\n{postcode_valid == 1}")
            # add postcode
            if postcode_valid:
                logger.debug(
                    f"Adding postcode {most_recent.at[scout_census.column_labels.POSTCODE]}"
                )
                section_data["Postcode"] = most_recent.at[
                    scout_census.column_labels.POSTCODE]
                country = ONS_PD.COUNTRY_CODES.get(most_recent.at["ctry"])
                section_data[
                    "IMD Country"] = country if country else scout_census.DEFAULT_VALUE
                section_data["IMD Decile"] = most_recent.at["imd_decile"]
                section_data["IMD Rank"] = most_recent.at["imd"]
            else:
                section_data["Postcode"] = scout_census.DEFAULT_VALUE
                section_data["IMD Country"] = scout_census.DEFAULT_VALUE
                section_data["IMD Decile"] = scout_census.DEFAULT_VALUE
                section_data["IMD Rank"] = scout_census.DEFAULT_VALUE

            section_data_df = pd.DataFrame([section_data],
                                           columns=output_columns)
            output_data = pd.concat([output_data, section_data_df], axis=0)

        output_data.reset_index(drop=True, inplace=True)
        if report_name:
            report_io.save_report(output_data, report_name)
        return output_data
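The opened/closed bookkeeping in the loops above reduces to a single pass over year-on-year changes; a self-contained sketch with a made-up units_by_year profile:

units_by_year = {2017: 1, 2018: 2, 2019: 2, 2020: 1}  # hypothetical counts of one section type
years = sorted(units_by_year)

opened_sections, closed_sections = [], []
for year in years[1:]:
    change = units_by_year[year] - units_by_year[year - 1]
    if change > 0:
        for record in opened_sections:  # extend the life of existing sections
            record["years"].append(year)
        for _ in range(change):  # create new section records
            opened_sections.append({"years": [year], "nu_sections": units_by_year})
    elif change == 0:
        for record in opened_sections:
            record["years"].append(year)
    else:
        for _ in range(-change):  # close the newest sections first
            if opened_sections:
                closed_sections.append(opened_sections.pop(-1))
        for record in opened_sections:
            record["years"].append(year)

print(opened_sections + closed_sections)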
Example 17
def close(start_time: float) -> None:
    """Outputs the duration of the programme"""
    logger.info(
        f"Script finished, {time.time() - start_time:.2f} seconds elapsed.")
Example 18
def save_report(report: pd.DataFrame, report_name: str) -> None:
    logger.info(f"Writing to {report_name}")
    report.to_csv(config.SETTINGS.folders.output / f"{report_name}.csv",
                  index=False,
                  encoding="utf-8-sig")
Example 19
import geopandas as gpd
import pandas as pd

from incognita.data.ons_pd import ONS_POSTCODE_DIRECTORY_MAY_20 as ONS_PD
from incognita.logger import logger
from incognita.logger import set_up_logger
from incognita.utility import config
from incognita.utility import constants
from incognita.utility import deciles

if __name__ == "__main__":
    set_up_logger()

    logger.info("Starting")
    to_keep = ("oscty", "oslaua", "osward", "ctry", "rgn", "pcon", "lsoa11",
               "msoa11", "imd", "imd_decile"
               )  # 'lat', 'long', 'nys_districts', 'pcd'
    fields = [f for f in to_keep if f in ONS_PD.fields]

    # Load Full ONS Postcode Directory
    data = pd.read_csv(config.SETTINGS.ons_pd.full,
                       dtype=ONS_PD.data_types,
                       encoding="utf-8")
    logger.info("Loaded data")

    orig = data.copy()
    logger.info("DEBUG - copied original data")

    # Add IMD Decile
    data["imd_decile"] = deciles.calc_imd_decile(data["imd"], data["ctry"],
                                                 ONS_PD).astype("UInt8")
Example 20
    def create_boundary_report(self,
                               options: set[str] = None,
                               historical: bool = False,
                               report_name: str = None) -> pd.DataFrame:
        """Produces .csv file summarising by boundary provided.

        Args:
            options: Set of data to be included in the report
            historical: Check to ensure that multiple years of data are intentional
            report_name: Name to save the report as, if given

        """

        # Set default option set for `options`
        if options is None:
            options = {
                "Number of Sections", "Groups", "Section numbers",
                "6 to 17 numbers", "awards", "waiting list total"
            }

        opt_groups = "Groups" in options
        opt_section_numbers = "Section numbers" in options
        opt_number_of_sections = "Number of Sections" in options
        opt_6_to_17_numbers = "6 to 17 numbers" in options
        opt_waiting_list_totals = "waiting list total" in options
        opt_adult_numbers = "Adult numbers" in options
        opt_awards = "awards" in options

        census_data = self.census_data
        boundary_codes = self.geography.boundary_codes
        geog_name = self.geography.metadata.key  # e.g oslaua osward pcon lsoa11
        logger.info(
            f"Creating report by {geog_name} with {', '.join(options)} from {len(census_data.index)} records"
        )

        census_dates = sorted(set(census_data["Census Date"].dropna()))
        if len(census_dates) > 1:
            if not historical:
                raise ValueError(
                    f"Historical option not selected, but multiple censuses selected ({census_dates[0]} - {census_dates[-1]})"
                )
            logger.info(
                f"Historical analysis from {census_dates[0]} to {census_dates[-1]}"
            )

        sections_model = column_labels.sections

        dataframes = []

        if opt_groups:
            # Used to list the groups that operate within the boundary.
            # Gets all groups in the census_data dataframe and calculates the
            # number of groups.
            logger.debug(f"Adding group data")
            groups = census_data[[geog_name, column_labels.name.GROUP]].copy()
            groups[column_labels.name.GROUP] = groups[
                column_labels.name.GROUP].str.strip()
            grouped_rgn = groups.drop_duplicates().dropna().groupby(
                [geog_name], dropna=False)[column_labels.name.GROUP]
            dataframes.append(
                pd.DataFrame({
                    "Groups":
                    grouped_rgn.unique().apply("\n".join),
                    "Number of Groups":
                    grouped_rgn.nunique(dropna=True)
                }))

        if opt_section_numbers or opt_number_of_sections or opt_6_to_17_numbers or opt_waiting_list_totals or opt_adult_numbers:
            total_cols = [
                section_model.total
                for section_name, section_model in sections_model
                if section_name != "Network"
            ]
            waiting_cols = [
                section_model.waiting_list
                for section_name, section_model in sections_model
                if section_name != "Network"
            ]
            census_data["All"] = census_data[total_cols].sum(
                axis=1).astype("Int32")
            census_data["Waiting List"] = census_data[waiting_cols].sum(
                axis=1).astype("Int32")
            census_data["Adults"] = census_data[[
                "Leaders", "AssistantLeaders", "SectAssistants", "OtherAdults"
            ]].sum(axis=1).astype("Int32")

            logger.debug(f"Adding young people numbers")
            metric_cols = []
            rename = {}
            if opt_section_numbers:
                metric_cols += [
                    section_model.total
                    for section_name, section_model in sections_model
                    if section_name != "Network"
                ]
            if opt_number_of_sections:
                # TODO correct for pluralisation (e.g. Colony -> Colonys not Colonies)
                metric_cols += [
                    section_model.unit_label
                    for section_name, section_model in sections_model
                    if section_name != "Network"
                ]
                rename |= {
                    section_model.unit_label: f"{section_model.type}s"
                    for section_name, section_model in sections_model
                    if section_name != "Network"
                }
            if opt_6_to_17_numbers:
                metric_cols += ["All"]
            if opt_waiting_list_totals:
                metric_cols += ["Waiting List"]
            if opt_adult_numbers:
                metric_cols += ["Adults"]
            agg = census_data.groupby(
                [geog_name, "Census_ID"],
                dropna=False)[metric_cols].sum().unstack().sort_index()
            agg.columns = [
                f"{rename.get(key, key)}-{census_year}".replace("_total", "")
                for key, census_year in agg.columns
            ]
            dataframes.append(agg)

        if opt_awards:
            if geog_name not in ONS_GEOG_NAMES:
                raise ValueError(
                    f"{geog_name} is not a valid geography name. Valid values are {ONS_GEOG_NAMES}"
                )

            district_id_column = column_labels.id.DISTRICT
            award_name = sections_model.Beavers.top_award[0]
            award_eligible = sections_model.Beavers.top_award_eligible[0]

            logger.debug(f"Creating awards mapping")
            awards_mapping = _ons_to_district_mapping(census_data,
                                                      boundary_codes,
                                                      geog_name)
            district_numbers = {
                district_id: num
                for district_dict in awards_mapping.values()
                for district_id, num in district_dict.items()
            }
            grouped_dist = census_data[[
                "Queens_Scout_Awards", "Eligible4QSA", district_id_column
            ]].groupby(district_id_column, dropna=False)
            ons_regions_in_district = grouped_dist[district_id_column].first(
            ).map(district_numbers)
            awards_per_district_per_regions = pd.DataFrame({
                # QSAs achieved in district, divided by the number of regions the district is in
                "QSA":
                grouped_dist["Queens_Scout_Awards"].sum() /
                ons_regions_in_district,
                # number of young people eligible to achieve the QSA in district, divided by the number of regions the district is in
                "qsa_eligible":
                grouped_dist["Eligible4QSA"].sum() / ons_regions_in_district,
            })

            # Check that our pivot keeps the total membership constant
            yp_cols = [
                "Beavers_total", "Cubs_total", "Scouts_total",
                "Explorers_total"
            ]
            grouped_rgn = census_data.groupby([geog_name], dropna=False)
            assert int(census_data[yp_cols].sum().sum()) == int(
                grouped_rgn[yp_cols].sum().sum().sum())

            logger.debug(f"Adding awards data")
            award_total = grouped_rgn[award_name].sum()
            eligible_total = grouped_rgn[award_eligible].sum()
            award_prop = 100 * award_total / eligible_total
            award_prop[eligible_total == 0] = pd.NA

            max_value = award_prop.quantile(0.95)
            award_prop = award_prop.clip(upper=max_value)

            # calculates the nominal QSAs per ONS region specified.
            # Divides total # of awards by the number of Scout Districts that the ONS Region is in
            region_ids = grouped_rgn.name.first().index.to_series()
            if geog_name == "D_ID":
                district_ids = region_ids
            else:
                region_district_map = {
                    rgn_id: list(district_dict)
                    for rgn_id, district_dict in awards_mapping.items()
                }
                district_ids = region_ids.map(region_district_map)
            awards_regions_data = pd.DataFrame.from_dict(
                {
                    idx: awards_per_district_per_regions.loc[ids].sum()
                    for idx, ids in district_ids.items()
                },
                orient="index")
            qsa_prop = 100 * awards_regions_data["QSA"] / awards_regions_data[
                "qsa_eligible"]
            qsa_prop[awards_regions_data["qsa_eligible"] == 0] = pd.NA

            award_data = {
                award_name: award_total,
                award_eligible: eligible_total,
                f"%-{award_name}": award_prop,
                "QSA": awards_regions_data["QSA"],
                "%-QSA": qsa_prop,
            }
            dataframes.append(pd.DataFrame(award_data))

        # TODO find a way to keep DUMMY geography coding
        output_data = boundary_codes.reset_index(drop=True).copy()
        output_data = output_data.merge(pd.concat(dataframes, axis=1),
                                        how="left",
                                        left_on="codes",
                                        right_index=True,
                                        sort=False)

        if geog_name == "lsoa11":
            logger.debug(f"Loading ONS postcode data & Adding IMD deciles.")
            ons_pd_data = pd.read_feather(config.SETTINGS.ons_pd.reduced,
                                          columns=["lsoa11", "imd_decile"
                                                   ]).drop_duplicates()
            output_data = output_data.merge(
                ons_pd_data, how="left", left_on="codes",
                right_on="lsoa11").drop(columns="lsoa11")

        if report_name:
            report_io.save_report(output_data, report_name)

        return output_data
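The per-year metric columns above come from a groupby over geography and census ID followed by an unstack; a minimal self-contained sketch of that pattern (toy data, not from the source):

import pandas as pd

# Toy census records (values are illustrative only)
toy = pd.DataFrame({
    "oslaua": ["E07000001", "E07000001", "E07000002", "E07000002"],
    "Census_ID": [19, 20, 19, 20],
    "Beavers_total": [10, 12, 7, 9],
})

agg = toy.groupby(["oslaua", "Census_ID"], dropna=False)[["Beavers_total"]].sum().unstack().sort_index()
# Flatten the (metric, census ID) column MultiIndex into e.g. "Beavers-19"
agg.columns = [f"{key}-{census_id}".replace("_total", "") for key, census_id in agg.columns]
print(agg)  # columns: Beavers-19, Beavers-20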
Esempio n. 21
0
    def add(self, number1: Real, number2: Real) -> Real:
        # "Real" is numbers.Real; "logger" is incognita.logger.logger
        logger.info("Example Function")
        return number1 + number2
Esempio n. 22
0
import time

import geopandas as gpd
import pandas as pd

from incognita.data.scout_census import load_census_data
from incognita.geographies import district_boundaries
from incognita.logger import logger
from incognita.utility import config
from incognita.utility import filter
from incognita.utility import timing

if __name__ == "__main__":
    start_time = time.time()
    logger.info(
        f"Starting at {time.strftime('%H:%M:%S', time.localtime(start_time))}")

    census_data = load_census_data()
    census_data = filter.filter_records(census_data, "Census_ID", {20})
    # Remove Jersey, Guernsey, and Isle of Man as they have invalid lat/long coordinates for their postcodes
    census_data = filter.filter_records(
        census_data,
        "C_name", {"Bailiwick of Guernsey", "Isle of Man", "Jersey"},
        exclude_matching=True)

    # low resolution shape data
    world_low_res = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
    uk_shape = world_low_res.loc[world_low_res.name == "United Kingdom",
                                 "geometry"].array.data[0]
    # # high resolution shape data
    # uk_shape = gpd.read_file(r"S:\Development\incognita\data\UK Shape\GBR_adm0.shp")["geometry"].array.data[0]
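A hedged continuation sketch (not from the source) showing one way the extracted uk_shape polygon could be used, here to filter toy point locations with geopandas:

import geopandas as gpd
from shapely.geometry import Point

# Toy locations (made-up coordinates): Birmingham and a point in the Atlantic
points = gpd.GeoSeries([Point(-1.8904, 52.4862), Point(-30.0, 45.0)], crs="EPSG:4326")

# Keep only the points that fall inside the low-resolution UK outline extracted above
inside_uk = points[points.within(uk_shape)]
print(len(inside_uk))  # expected: 1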
Esempio n. 23
0
    def add_areas(
        self,
        var_col: str,
        tooltip: str,
        layer_name: str,
        boundary_report: pd.DataFrame,
        boundary_metadata: config.Boundary,
        show: bool = False,
        colour_bounds: list[int] = None,
        significance_threshold: float = 2.5,
        categorical: bool = False,
    ) -> None:
        """Creates a 2D colouring with geometry specified by the boundary

        Args:
            var_col: Data column to use for choropleth colour values
            tooltip: Mouseover tooltip for each boundary (e.g. "% Change 6-18")
            layer_name: Legend key for the layer (e.g. "% Change 6-18 (Counties)")
            boundary_report: Boundary report data, one row per area, with a "codes" column and the value columns
            boundary_metadata: Metadata for the boundary (API URL, query params, code and name columns)
            show: If True, show the layer by default
            colour_bounds: Colour breaks to create a fixed legend
            significance_threshold: Threshold below which an area's value is not displayed
            categorical: If the data are categorical

        """
        data = boundary_report
        if var_col not in data.columns:
            logger.error(f"{var_col} is not a valid column in the data. \n" f"Valid columns include {data.columns}")
            raise KeyError(f"{var_col} is not a valid column in the data.")

        colours = list(reversed(("#4dac26", "#b8e186", "#f1b6da", "#d01c8b")))
        choropleth_data = data[["codes", var_col]].set_index("codes")[var_col]  # contains shapefile paths, and labels for region codes and names

        # Set value col properties to use for a particular boundary
        logger.info(f"Setting choropleth column to {var_col} (displayed: {tooltip})")

        non_zero_choropleth_data = choropleth_data[choropleth_data != 0].dropna().sort_values()
        colour_map_id = "0"
        if categorical:
            categories = [*non_zero_choropleth_data.drop_duplicates()]
            self.map["colour_map"] = _output_colour_scale_categorical(
                colour_map_id,
                layer_name,
                colours,
                classes=categories,
                legend_categories=categories,
            )
        else:
            if colour_bounds is None:
                quantiles = (20, 40, 60, 80, 100)
                colour_bounds = np.unique(np.percentile(non_zero_choropleth_data, quantiles, interpolation="nearest")).tolist()

            num_ranges = len(colour_bounds) - 1
            self.map["colour_map"] = _output_colour_scale_ranges(
                colour_map_id,
                layer_name,
                colours,
                classes=colour_bounds,
                legend_ranges=[(colour_bounds[i], colour_bounds[i + 1]) for i in range(num_ranges)],
            )

            logger.info(f"Colour scale boundary values {colour_bounds}")

        logger.info(f"Merging geo_json on shape_codes from shapefile with codes from boundary report")

        metadata = boundary_metadata
        self.map[f"layer_{layer_name}"] = _output_shape_layer(
            legend_key=layer_name,  # the name of the Layer, as it will appear in the layer controls
            colour_data=choropleth_data.to_dict(),
            api_base=metadata.api.url,
            query_params=metadata.api.query_params,
            colour_scale_id=colour_map_id,
            threshold=significance_threshold,
            code_col=metadata.api.codes_col,
            name_col=metadata.api.names_col,
            measure_name=tooltip,
            show=show,
        )
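When no fixed colour_bounds are supplied, the method derives them from percentiles of the non-zero values; a self-contained sketch of that calculation (toy values, not from the source):

import numpy as np
import pandas as pd

values = pd.Series([0, 3, 5, 8, 8, 12, 20, 0, 7])  # toy choropleth values
non_zero = values[values != 0].dropna().sort_values()

quantiles = (20, 40, 60, 80, 100)
# "interpolation=" is renamed "method=" in newer NumPy releases
colour_bounds = np.unique(np.percentile(non_zero, quantiles, interpolation="nearest")).tolist()
legend_ranges = [(colour_bounds[i], colour_bounds[i + 1]) for i in range(len(colour_bounds) - 1)]
print(colour_bounds, legend_ranges)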
Esempio n. 24
0
def _run_postcode_fix_step(data: pd.DataFrame, all_valid_postcodes: pd.Index,
                           invalid_type: str, fill_from: str,
                           entity_types: set[str], column_label: str,
                           index_level: int) -> pd.DataFrame:
    """Runs postcode fixer for given data and parameters.

    Method:
    Gets all valid records for the given entity types, sorts them so census IDs run newest-first,
    then groups by the given index level and takes each group's first clean postcode.
    As the index is sorted with census IDs descending, this returns the most recent valid postcode.
    TODO change to use modal result instead of first (If section has no valid postcodes, use most common
        (modal) postcode from sections in group in that year, then try successive years)

    Args:
        data: Census data
        all_valid_postcodes: Index of all valid postcodes in the ONS postcode directory
        invalid_type: Which type of issue are we fixing (for log message)
        fill_from: Where are we pulling valid postcodes from (for log message)
        entity_types: Entity types to filter the fixing on (e.g. Colony, Group, Network, District)
        column_label: Name of the index level being used
        index_level: Level of the MultiIndex to filter on

    Returns:
        Updated census data

    """
    # Index level: 0=District; 1=Group; 2=Section; 3=Census_ID

    logger.info(
        f"Fill invalid {invalid_type} postcodes with valid section postcodes from {fill_from}"
    )

    entity_type_label = scout_census.column_labels.UNIT_TYPE
    valid_postcode_label = scout_census.column_labels.VALID_POSTCODE

    # Gets all entity records matching the given criteria, and returns a
    # minimal set of fields for memory optimisation
    records = data.loc[
        data[entity_type_label].isin(entity_types),
        [valid_postcode_label, column_label, CLEAN_POSTCODE_LABEL]]

    valid_postcodes_start = data[valid_postcode_label].to_numpy().sum()

    # Get all valid clean postcodes from the filtered records. Then sort the
    # index with census IDs high -> low. Then group the data by the passed
    # index level. As the census IDs are sorted descending, the first item will
    # be the newest possible clean postcode, indexed by the passed level.
    firsts = records.loc[records[valid_postcode_label],
                         CLEAN_POSTCODE_LABEL].sort_index(
                             ascending=(True, True, True, False)).groupby(
                                 level=index_level).first()

    # Map invalid postcodes to valid postcodes by the given ID type/field
    clean_postcodes = records.loc[~records[valid_postcode_label],
                                  column_label].map(firsts)

    # Merge in the changed postcodes and overwrite pre-existing postcodes in the Clean Postcode column
    clean_postcodes_not_na = clean_postcodes.loc[
        clean_postcodes.notna()]  # .update(*) uses not_na filter
    data.loc[clean_postcodes_not_na.index,
             CLEAN_POSTCODE_LABEL] = clean_postcodes_not_na

    # Update valid postcode status
    data[valid_postcode_label] = data[CLEAN_POSTCODE_LABEL].isin(
        all_valid_postcodes)

    logger.info(
        f"change in valid postcodes is: {data[valid_postcode_label].to_numpy().sum() - valid_postcodes_start}"
    )

    return data
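The core of the fix is a sort-then-first groupby followed by a map; a minimal self-contained sketch of that pattern on toy records (column names are illustrative, not the census labels):

import pandas as pd

# Toy records indexed by (Group ID, Census ID); True means the postcode is already valid
idx = pd.MultiIndex.from_tuples([(101, 19), (101, 20), (102, 19), (102, 20)], names=["G_ID", "Census_ID"])
records = pd.DataFrame({
    "postcode_is_valid": [True, True, False, True],
    "clean_postcode": ["AB1 2CD", "AB1 3EF", None, "ZZ9 9ZZ"],
    "G_ID": [101, 101, 102, 102],
}, index=idx)

# Newest valid postcode per group: sort census IDs descending, then take the first per group
firsts = (records.loc[records["postcode_is_valid"], "clean_postcode"]
          .sort_index(ascending=(True, False))
          .groupby(level=0)
          .first())

# Map invalid rows onto the newest valid postcode for their group
fixes = records.loc[~records["postcode_is_valid"], "G_ID"].map(firsts)
records.loc[fixes.index, "clean_postcode"] = fixes
print(records)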
Esempio n. 25
0
import time

from incognita.data.scout_census import load_census_data
from incognita.logger import logger
from incognita.reports.history_summary import HistorySummary
from incognita.utility import filter
from incognita.utility import timing

if __name__ == "__main__":
    start_time = time.time()
    logger.info(
        f"Starting at {time.strftime('%H:%M:%S', time.localtime(start_time))}")

    census_ids = {15, 16, 17, 18, 19, 20}

    census_data = load_census_data()
    census_data = filter.filter_records(census_data, "Census_ID", census_ids)
    census_data = filter.filter_records(
        census_data, "X_name",
        {"England", "Scotland", "Wales", "Northern Ireland"})

    # If filtering on IMD, remove NA values
    # census_data = filter.filter_records(census_data, "imd_decile", ["nan"], exclude_matching=True)
    # census_data = filter.filter_records(census_data, "imd_decile", [1, 2, 3])

    section_history = HistorySummary(census_data)
    section_history.new_section_history_summary(
        sorted(census_ids), report_name="opened_section_data")
    timing.close(start_time)
Esempio n. 26
0
    # The errors file contains all the postcodes that failed to be looked up in the ONS Postcode Directory
    error_output_fields = [
        postcode_merge_column, original_postcode_label, compass_id_label,
        "type", "name", "G_name", "D_name", "C_name", "R_name", "X_name",
        "Census Date"
    ]
    data.loc[~data[valid_postcode_label],
             error_output_fields].to_csv(error_output_path,
                                         index=False,
                                         encoding="utf-8-sig")

    # Write the new data to a csv file (utf-8-sig only to force excel to use UTF-8)
    logger.info("Writing merged data")
    data.to_csv(output_path.with_suffix(".csv"),
                index=False,
                encoding="utf-8-sig")
    data.to_feather(output_path.with_suffix(".feather"))


if __name__ == "__main__":
    # Turn on logging
    set_up_logger()

    logger.info(f"Starting at {time.strftime('%H:%M:%S', time.localtime())}")
    start_time = time.time()

    process_census_extract()

    logger.info(
        f"Script finished, {time.time() - start_time:.2f} seconds elapsed.")