Example #1
def filter_records(data: pd.DataFrame, field: str, value_list: set, exclude_matching: bool = False, exclusion_analysis: bool = False) -> pd.DataFrame:
    """Filters the Census records by any field in ONS PD.

    Args:
        data: The Census records to filter
        field: The field on which to filter
        value_list: The values on which to filter
        exclude_matching: If True, exclude the values that match the filter. If False, keep the values that match the filter.
        exclusion_analysis: If True, log an analysis of the records excluded by the filter

    Returns:
        Filtered data

    """
    # Count number of rows
    original_records = data.index.size
    matching_records = data[field].isin(value_list)
    if exclude_matching:
        # Excluding records that match the filter criteria
        filter_mask = ~matching_records
        logger.info(f"Selecting records that satisfy {field} not in {value_list} from {original_records} records.")
    else:
        # Including records that match the filter criteria
        filter_mask = matching_records
        logger.info(f"Selecting records that satisfy {field} in {value_list} from {original_records} records.")

    filtered = data.loc[filter_mask]
    logger.debug(f"Resulting in {filtered.index.size} records remaining.")

    if exclusion_analysis:
        excluded = data.loc[~filter_mask]
        _exclusion_analysis(data, filtered, excluded)

    return filtered
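
A minimal usage sketch for filter_records; the frame, column name and values below are illustrative assumptions, not from the original data:

import pandas as pd

# Hypothetical census extract with a country code column
data = pd.DataFrame({"ctry": ["E92000001", "W92000004", "S92000003"], "Beavers_total": [10, 5, 7]})

# Keep only the English records
england_only = filter_records(data, "ctry", {"E92000001"})

# Drop the Scottish records instead of keeping them
no_scotland = filter_records(data, "ctry", {"S92000003"}, exclude_matching=True)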
Example #2
def _exclusion_analysis(original: pd.DataFrame, filtered: pd.DataFrame, excluded: pd.DataFrame):
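    """Logs a summary of the records and section members removed by a filter.

    Args:
        original: The unfiltered census data
        filtered: Records that passed the filter
        excluded: Records removed by the filter

    """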
    cols = {scout_census.column_labels.UNIT_TYPE, *(section_model.total for section, section_model in sections_model)}
    if not set(original.columns) >= cols:
        o_cols = original.columns.to_list()
        raise ValueError("Required columns are not in dataset!\n" f"Required columns are: {cols}.\n" f"Your columns are: {o_cols}")

    # Calculate the number of records that have been filtered out
    original_records = original.index.size
    excluded_records = original_records - filtered.index.size
    logger.info(f"{excluded_records} records were removed ({excluded_records / original_records * 100}% of total)")

    # Prints number of members and % of members filtered out for each section
    for section_name, section_model in sections_model:
        logger.debug(f"Analysis of {section_name} member exclusions")
        section_type = section_model.type
        members_col = section_model.total

        excluded_sections = excluded.loc[excluded[scout_census.column_labels.UNIT_TYPE] == section_type]
        excluded_members = 0
        if not excluded_sections.empty:
            logger.debug(f"Excluded sections\n{excluded_sections}")
            logger.debug(f"Finding number of excluded {section_name} by summing {members_col}")
            excluded_members = excluded_sections[members_col].sum()
            logger.debug(f"{excluded_members} {section_name} excluded")

        original_members = original.loc[original[scout_census.column_labels.UNIT_TYPE] == section_type, members_col].sum()
        if original_members > 0:
            logger.info(f"{excluded_members} {section_name} members were removed ({excluded_members / original_members * 100}%) of total")
        else:
            logger.info(f"There are no {section_name} members present in data")
Example #3
    def add_custom_data(self, csv_file_path: Path, layer_name: str, location_cols: Union[Literal["Postcodes"], dict], marker_data: list = None) -> None:
        """Function to add custom data as markers on map

        Args:
            csv_file_path: file path to open csv file
            layer_name: Name of layer that the markers will be added to
            location_cols: Indicates whether adding data with postcodes or co-ordinates
                - if postcodes, str "Postcodes"
                - if co-ordinates, dict of co-ordinate data with keys ["crs", "x", "y"]
            marker_data: list of strings for values in data that should be in popup

        """

        custom_data = pd.read_csv(csv_file_path)

        if location_cols == "Postcodes":
            # Merge with ONS Postcode Directory to obtain dataframe with lat/long
            logger.debug(f"Loading ONS postcode data.")
            ons_pd_data = pd.read_feather(config.SETTINGS.ons_pd.reduced)
            custom_data = pd.merge(custom_data, ons_pd_data, how="left", left_on=location_cols, right_index=True, sort=False)
            location_cols = {"crs": constants.WGS_84, "x": "long", "y": "lat"}

        # Create geo data frame with points generated from lat/long or OS
        custom_data = gpd.GeoDataFrame(custom_data, geometry=gpd.points_from_xy(x=custom_data[location_cols["x"]], y=custom_data[location_cols["y"]]), crs=location_cols["crs"])

        # Convert the 'Co-ordinate reference system' (crs) to WGS_84 (i.e. lat/long) if not already
        if location_cols["crs"] != constants.WGS_84:
            custom_data = custom_data.to_crs(epsg=constants.WGS_84)

        if layer_name in self.map._children:  # NoQA
            raise ValueError("Layer already used!")

        # Plot marker and include marker_data in the popup for every item in custom_data
        out = []
        if marker_data:
            x_not_nan = custom_data.geometry.x.notna()
            for row in custom_data[x_not_nan].itertuples():
                out.append(
                    {
                        "lat": round(row.geometry.y, 4),
                        "lon": round(row.geometry.x, 4),
                        "col": "green",
                        "html": "".join(f'<p align="center">{row[marker_col]}</p>' for marker_col in marker_data),
                    }
                )
        else:
            for points in custom_data.geometry[custom_data.geometry.x.notna()].to_list():
                out.append({"lat": round(points.y, 4), "lon": round(points.x, 4), "col": "green", "html": ""})
        self.map[layer_name] = _output_marker_layer(layer_name, out)
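
A usage sketch; census_map stands in for an instance of the map class this method belongs to, and the CSV paths and column names are illustrative assumptions:

from pathlib import Path

# Postcode-based data: the CSV is assumed to contain a "Postcodes" column
census_map.add_custom_data(
    Path("output/group_locations.csv"),
    layer_name="Group locations",
    location_cols="Postcodes",
    marker_data=["Group_name", "Beavers_total"],
)

# Coordinate-based data: x/y columns given in British National Grid (EPSG 27700)
census_map.add_custom_data(
    Path("output/sites_bng.csv"),
    layer_name="Site markers",
    location_cols={"crs": 27700, "x": "easting", "y": "northing"},
)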
Example #4
def _ons_to_district_mapping(census_data: pd.DataFrame,
                             boundary_codes: pd.DataFrame,
                             region_type: str) -> dict:
    """Create json file, containing which scout districts are within an
    each ONS area, and how many ONS areas those districts are in.

    Args:
        region_type:
            A field in the modified census report corresponding to an
            administrative region (lsoa11, msoa11, oslaua, osward, pcon,
            oscty, ctry, rgn). region_type is also a census column heading
            for the region geography type

    """

    logger.debug("Creating mapping from ons boundary to scout district")

    district_id_column = column_labels.id.DISTRICT

    region_ids = set(boundary_codes["codes"].dropna())

    district_ids_by_region = census_data.loc[
        census_data[region_type].isin(region_ids),
        [region_type, district_id_column]].dropna().drop_duplicates()
    district_ids = set(district_ids_by_region[district_id_column].dropna())

    # count of how many regions the district occupies:
    count_regions_in_district = (
        census_data.loc[(census_data[district_id_column].isin(district_ids) &
                         (census_data[region_type] != DEFAULT_VALUE)),
                        [district_id_column, region_type]]
        .dropna()
        .drop_duplicates()
        .groupby(district_id_column)
        .count()
        .rename(columns={region_type: "count"}))
    count_by_district_by_region = pd.merge(
        left=district_ids_by_region,
        right=count_regions_in_district,
        on=district_id_column).set_index([region_type, district_id_column])

    nested_dict = {}
    for (region_id,
         district_id), value in count_by_district_by_region["count"].items():
        nested_dict.setdefault(region_id, {})[district_id] = value

    logger.debug("Finished mapping from ons boundary to district")
    return dict(nested_dict)  # Return the mapping
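
For illustration, the returned mapping has the following shape (the ONS codes and district IDs below are made up):

# {ONS area code: {district ID: number of ONS areas that district spans}}
{
    "E07000044": {10001234: 1, 10005678: 2},
    "E07000045": {10005678: 2},
}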
Example #5
def _load_boundary(boundary_report: pd.DataFrame, boundary_metadata: config.Boundary) -> gpd.GeoDataFrame:
    """Loads a given boundary from a boundary report and metadata.

    Loads shapefile from path into GeoPandas dataframe
    Filters out unneeded shapes within all shapes loaded
    Converts from British National Grid to WGS84, as Leaflet doesn't understand BNG

    Args:
        boundary_report: A DataFrame object with boundary report data
        boundary_metadata: This contains shapefile paths, and labels for region codes and names

    Returns:
        GeoDataFrame with filtered and CRS transformed shapes

    """
    metadata = boundary_metadata
    data = boundary_report

    # Read a shape file. shapefile_path is the path to ESRI shapefile with region information
    logger.info("Loading Shapefile data")
    logger.debug(f"Shapefile path: {metadata.shapefile.path}")
    start_time = time.time()
    all_shapes = gpd.read_file(metadata.shapefile.path)
    logger.info(f"Loading Shapefile data finished, {time.time() - start_time:.2f} seconds elapsed")
    if metadata.shapefile.key not in all_shapes.columns:
        raise KeyError(f"{metadata.shapefile.key} not present in shapefile. Valid columns are: {all_shapes.columns}")

    # Rename columns
    shapes_col_map = {metadata.shapefile.key: "shape_codes", metadata.shapefile.name: "shape_names"}
    all_shapes.columns = [shapes_col_map.get(col, col) for col in all_shapes.columns]

    # Filter and convert GeoDataFrame to world co-ordinates
    logger.info(f"Filtering {len(all_shapes.index)} shapes by shape_codes being in the codes column of the map_data")
    all_codes = set(data["codes"])
    logger.debug(f"All codes list: {all_codes}")
    geo_data = all_shapes.loc[all_shapes["shape_codes"].isin(all_codes), ["geometry", "shape_codes", "shape_names"]].to_crs(epsg=constants.WGS_84)
    logger.info(f"Loaded {len(geo_data.index):,} boundary shapes. Columns now in data: {[*data.columns]}.")
    return geo_data
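
A minimal sketch of calling the loader; the report frame is illustrative, boundary_meta is assumed to be a config.Boundary instance obtained elsewhere, and constants.WGS_84 is assumed to be EPSG 4326:

import pandas as pd

# Hypothetical boundary report with ONS codes in a "codes" column
report = pd.DataFrame({"codes": ["E07000044", "E07000045"], "All-2021": [1200, 950]})

geo_data = _load_boundary(report, boundary_meta)
print(geo_data.crs)      # WGS84 (EPSG 4326) after the to_crs conversion
print(geo_data.columns)  # geometry, shape_codes, shape_names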
Example #6
    def new_section_history_summary(self,
                                    years: list,
                                    report_name: str = None) -> pd.DataFrame:
        """Given data on all sections, provides a summary of all new sections,
        and copes with the pre-2017 section reporting structure.

        Args:
            years: Census years to analyse
            report_name: Name to save the report as

        """
        sections_model = scout_census.column_labels.sections

        logger.info(f"Beginning new_section_history_summary for {years}")
        new_section_ids: list[dict] = []

        logger.info(
            f"Getting group ID list in column {scout_census.column_labels.id.GROUP}"
        )
        # Iterate through Groups looking for new Sections
        group_ids = self.census_data[
            scout_census.column_labels.id.GROUP].dropna().drop_duplicates(
            ).to_list()

        logger.info(f"Found {len(group_ids)} Groups")

        # For each section type in each Group in the census:
        #   - build a dict of {year: number of sections of that type open in that year}
        #   - build a list of the year-on-year changes in that count
        #   - if the count ever increases, step through the years from the second
        #     year onwards and record sections opening, staying open, or closing
        #     according to the sign of each change
        # The same process is then repeated for District sections (Explorers).
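        # Worked illustration with made-up numbers: if a Group's Scout sections
        # give units_by_year = {2015: 1, 2016: 1, 2017: 2, 2018: 1}, then
        # increments = [0, 1, -1]; max(increments) > 0, so one new section is
        # recorded as opening in 2017 and that same record is closed in 2018.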

        census_data = self.census_data.fillna({
            scout_census.column_labels.id.GROUP: 0,
            scout_census.column_labels.id.DISTRICT: 0,
        })

        for group_id in group_ids:
            logger.info(f"Investigating {group_id}")
            group_records = census_data.loc[census_data[
                scout_census.column_labels.id.GROUP] == group_id]

            for section in scout_census.SECTIONS_GROUP:
                logger.info(f"Finding {section} sections")
                units_by_year = {}
                for year in years:
                    section_numbers_year = group_records.loc[
                        group_records["Year"] == year,
                        getattr(scout_census.column_labels.sections, section
                                ).unit_label].sum()
                    units_by_year[year] = section_numbers_year

                increments = [
                    units_by_year[year + 1] - units_by_year[year]
                    for year in units_by_year.keys()
                    if (year + 1) in units_by_year
                ]
                if max(increments) > 0:
                    logger.debug(
                        f"Identified year profile of sections: {units_by_year}"
                    )
                    opened_sections = []
                    closed_sections = []
                    for year in years[1:]:
                        change = units_by_year[year] - units_by_year[year - 1]
                        if change > 0:
                            # Extend the life of current sections
                            for open_sections in opened_sections:
                                open_sections["years"].append(year)
                            # Create new section record
                            for ii in range(change):
                                logger.debug(
                                    f"New {section} section found for {group_id} in {year}"
                                )
                                opened_sections.append({
                                    "id": group_id,
                                    "section": section,
                                    "years": [year],
                                    "nu_sections": units_by_year,
                                })
                        elif change == 0:
                            # Lengthens all sections by a year
                            for open_sections in opened_sections:
                                open_sections["years"].append(year)
                        elif change < 0:
                            for ii in range(-change):
                                # Close the newest sections first
                                if len(opened_sections) > 0:
                                    logger.debug(
                                        f"{section} closed for {group_id} in {year}"
                                    )
                                    closed_sections.append(
                                        opened_sections.pop(-1))
                            # Lengthens remaining open sections by a year
                            for open_sections in opened_sections:
                                open_sections["years"].append(year)

                    logger.debug(
                        f"For {group_id} adding\n{opened_sections + closed_sections}"
                    )
                    new_section_ids += opened_sections
                    new_section_ids += closed_sections
                else:
                    logger.info(f"No new {section} sections in {group_id}")

        logger.info("Finding new Explorer Sections")
        # Iterate through District looking for new Sections

        district_ids = self.census_data[
            scout_census.column_labels.id.DISTRICT].drop_duplicates().dropna(
            ).to_list()

        for district_id in district_ids:
            logger.info(f"Investigating {district_id}")
            district_records = census_data.loc[census_data[
                scout_census.column_labels.id.DISTRICT] == district_id]
            units_by_year = {}
            for year in years:
                district_records_year = district_records.loc[
                    district_records["Year"] == year]
                units_by_year[year] = district_records_year[
                    sections_model.Explorers.unit_label].sum()

            increments = [
                units_by_year[year + 1] - units_by_year[year]
                for year in units_by_year.keys() if (year + 1) in units_by_year
            ]
            if max(increments) > 0:
                opened_sections = []
                closed_sections = []
                for year in years[1:]:
                    change = units_by_year[year] - units_by_year[year - 1]
                    if change > 0:
                        # Extend the life of current sections
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)
                        # Create new section record
                        for ii in range(change):
                            opened_sections.append({
                                "id": district_id,
                                "section": "Explorers",
                                "years": [year],
                                "nu_sections": units_by_year
                            })
                    elif change == 0:
                        # Lengthens all sections by a year
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)
                    elif change < 0:
                        for ii in range(-change):
                            # Close the newest sections first
                            if len(opened_sections) > 0:
                                closed_sections.append(opened_sections.pop(-1))
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)

                logger.debug(
                    f"For {district_id} adding\n{opened_sections + closed_sections}"
                )
                new_section_ids += opened_sections
                new_section_ids += closed_sections

        section_details = []
        for year in years:
            if year < 2017:
                section_details.append(f"{year}_Est_Members")
            else:
                section_details.append(f"{year}_Members")

        # fmt: off
        output_columns = [
            "Object_ID", "Section Name", "Section", "Group_ID", "Group",
            "District_ID", "District", "County", "Region", "Scout Country",
            "Postcode", "IMD Country", "IMD Rank", "IMD Decile", "First Year",
            "Last Year", f"{years[0]}_sections", *section_details
        ]
        # fmt: on
        output_data = pd.DataFrame(columns=output_columns)

        logger.info(
            f"Start iteration through {len(new_section_ids)} new Sections")
        used_compass_ids = set()
        count = 0
        total = len(new_section_ids)
        new_sections_id: dict
        for new_sections_id in new_section_ids:
            section_data = {}
            logger.debug(f"Recording {new_sections_id}")
            count += 1
            logger.info(f"{count} of {total}")
            section_id = new_sections_id["id"]
            open_years = new_sections_id["years"]
            section = new_sections_id["section"]
            section_type = getattr(scout_census.column_labels.sections,
                                   section).type

            if section in scout_census.SECTIONS_GROUP:
                records = census_data.loc[census_data[
                    scout_census.column_labels.id.GROUP] == section_id]
                section_data["Group_ID"] = records[
                    scout_census.column_labels.id.GROUP].unique()[0]
                section_data["Group"] = records[
                    scout_census.column_labels.name.GROUP].unique()[0]
            elif section in scout_census.SECTIONS_DISTRICT:
                records = census_data.loc[census_data[
                    scout_census.column_labels.id.DISTRICT] == section_id]
                section_data["Group_ID"] = ""
                section_data["Group"] = ""
            else:
                raise Exception(
                    f"{section} belongs to neither a Group nor a District. id = {new_sections_id}"
                )

            for year in open_years:
                members_cols = getattr(scout_census.column_labels.sections,
                                       section).total
                year_records = records.loc[records["Year"] == year]
                if year >= 2017:
                    compass_id = section_data.get("Object_ID")
                    section_year_records = year_records.loc[records[
                        scout_census.column_labels.UNIT_TYPE] == section_type]

                    if compass_id:
                        section_record = section_year_records.loc[
                            section_year_records["Object_ID"] == compass_id]
                        section_data[f"{year}_Members"] = section_record[
                            members_cols].sum()
                    else:
                        section_year_ids: pd.Series = section_year_records[
                            "Object_ID"].drop_duplicates()
                        if open_years[0] >= 2017:
                            # If the section opened after 31st January 2017 it can be identified by its Object_ID
                            last_year_records = records.loc[records["Year"] ==
                                                            (year - 1)]
                            old_section_ids = last_year_records[
                                "Object_ID"].unique()
                            opened_section_ids = section_year_ids[
                                ~section_year_ids.isin(old_section_ids)]
                            if len(opened_section_ids) > 1:
                                logger.info(
                                    f"{len(opened_section_ids)} sections opened"
                                )
                                unused_ids = opened_section_ids[
                                    ~opened_section_ids.isin(used_compass_ids)]
                                compass_id = unused_ids.iloc[
                                    0] if not unused_ids.empty else opened_section_ids.iloc[
                                        -1]
                            elif len(opened_section_ids) == 0:
                                logger.error(
                                    f"No sections opened\n{year}: {section_year_ids}\n{year-1}: {old_section_ids}"
                                )
                            elif len(opened_section_ids) == 1:
                                compass_id = opened_section_ids.iloc[0]
                                logger.debug(f"Assigned id: {compass_id}")

                            section_data["Object_ID"] = compass_id
                            used_compass_ids.add(compass_id)
                            section_data[
                                f"{year}_Members"] = section_year_records.loc[
                                    section_year_records["Object_ID"] ==
                                    compass_id, members_cols].sum()
                        else:
                            compass_id = section_year_ids.max()

                            if compass_id in used_compass_ids:
                                # sort_values returns a new Series, so assign the sorted result back
                                section_year_ids = section_year_ids.sort_values(ascending=False)
                                unused_ids = section_year_ids[
                                    ~section_year_ids.isin(used_compass_ids)]
                                if not unused_ids.empty:
                                    compass_id = unused_ids.iloc[0]
                                else:
                                    compass_id = section_year_ids.iloc[0]

                            section_data["Object_ID"] = compass_id
                            used_compass_ids.add(compass_id)
                            total_members = section_year_records.loc[
                                section_year_records["Object_ID"] ==
                                compass_id, members_cols].sum()

                            logger.debug(
                                f"{section} in {section_id} in {year} found {total_members} members"
                            )
                            section_data[f"{year}_Members"] = total_members
                else:
                    year_before_section_opened = open_years[0] - 1
                    year_before_records = records.loc[
                        records["Year"] == year_before_section_opened]

                    number_of_new_sections = new_sections_id["nu_sections"][
                        open_years[0]] - new_sections_id["nu_sections"][
                            year_before_section_opened]

                    new_members = year_records[members_cols].sum()
                    old_members = year_before_records[members_cols].sum()

                    additional_members = (new_members -
                                          old_members) / number_of_new_sections
                    if additional_members < 0:
                        logger.warning(
                            f"{section_id} increased number of {section} sections but membership decreased by {additional_members}"
                        )

                    logger.debug(
                        f"{section} in {section_id} in {year} found {additional_members} members"
                    )
                    section_data[f"{year}_Est_Members"] = additional_members

            closed_years = [year for year in years if year not in open_years]
            for year in closed_years:
                if year >= 2017:
                    section_data[f"{year}_Members"] = 0
                else:
                    section_data[f"{year}_Est_Members"] = 0

            section_data[f"{years[0]}_sections"] = new_sections_id[
                "nu_sections"][years[0]]

            section_records = None

            if section_data.get("Object_ID"):
                section_records = records.loc[records["Object_ID"] ==
                                              section_data.get("Object_ID")]
                section_data["Section Name"] = section_records["name"].unique(
                )[0]
            else:
                if open_years[-1] < 2017:
                    if section in scout_census.SECTIONS_GROUP:
                        section_records = records.loc[
                            records[scout_census.column_labels.UNIT_TYPE] ==
                            scout_census.UNIT_LEVEL_GROUP]
                    elif section in scout_census.SECTIONS_DISTRICT:
                        section_records = records.loc[
                            records[scout_census.column_labels.UNIT_TYPE] ==
                            scout_census.UNIT_LEVEL_DISTRICT]
                elif open_years[-1] == 2017:
                    section_records = records.loc[records[
                        scout_census.column_labels.UNIT_TYPE] == section_type]
                else:
                    raise Exception(
                        f"Unable to find section records for {new_sections_id}"
                    )

            section_data["Section"] = section
            section_data["District_ID"] = section_records[
                scout_census.column_labels.id.DISTRICT].unique()[0]
            section_data["District"] = section_records[
                scout_census.column_labels.name.DISTRICT].unique()[0]
            section_data["County"] = section_records["C_name"].unique()[0]
            section_data["Region"] = section_records["R_name"].unique()[0]
            section_data["Scout Country"] = section_records["X_name"].unique(
            )[0]

            if open_years[0] == years[0]:
                section_data["First Year"] = f"{years[0]} or before"
            else:
                section_data["First Year"] = open_years[0]
            if open_years[-1] == years[-1]:
                section_data["Last Year"] = f"Open in {years[-1]}"
            else:
                section_data["Last Year"] = open_years[-1]

            # Find the postcode and IMD data for a set of Section and Group
            # records spanning several years: use the most recent year, then
            # choose the postcode where the IMD Rank is lowest.
            most_recent_year = open_years[-1]
            logger.debug(f"Checking {most_recent_year}")
            most_recent = section_records.loc[section_records["Year"] ==
                                              most_recent_year]
            if most_recent.shape[0] == 1:
                most_recent = most_recent.iloc[0]
            elif most_recent.shape[0] == 0:
                logger.warning("Inconsistent ids")
                if section in scout_census.SECTIONS_GROUP:
                    # If the Object_IDs aren't consistent, pick the most recent
                    # section in the Group. This is only applicable after 2017,
                    # so sections are assumed to exist.
                    most_recent = records.loc[
                        (records[scout_census.column_labels.id.GROUP] ==
                         section_data["Group_ID"])
                        & (records[scout_census.column_labels.UNIT_TYPE] ==
                           section_type)
                        & (records["Year"] == most_recent_year)].iloc[0]
                elif section in scout_census.SECTIONS_DISTRICT:
                    most_recent_record = records.loc[
                        (records[scout_census.column_labels.id.DISTRICT] ==
                         section_data["District_ID"])
                        & (records[scout_census.column_labels.UNIT_TYPE] ==
                           section_type)
                        & (records["Year"] == most_recent_year)]

                    if most_recent_record.empty:
                        logger.error(
                            f"No records found with D_ID = {section_data['District_ID']} in {most_recent_year} that are {section}"
                        )

                    most_recent = most_recent_record.iloc[0]
            else:
                logger.warning("Multiple sections found, assigning a section")
                most_recent = most_recent.iloc[0]

            postcode_valid = most_recent.at["postcode_is_valid"]
            # logger.debug(f"Identified:\n{most_recent} determined postcode valid:\n{postcode_valid}\n{postcode_valid == 1}\n{postcode_valid == 1}")
            # add postcode
            if postcode_valid:
                logger.debug(
                    f"Adding postcode {most_recent.at[scout_census.column_labels.POSTCODE]}"
                )
                section_data["Postcode"] = most_recent.at[
                    scout_census.column_labels.POSTCODE]
                country = ONS_PD.COUNTRY_CODES.get(most_recent.at["ctry"])
                section_data[
                    "IMD Country"] = country if country else scout_census.DEFAULT_VALUE
                section_data["IMD Decile"] = most_recent.at["imd_decile"]
                section_data["IMD Rank"] = most_recent.at["imd"]
            else:
                section_data["Postcode"] = scout_census.DEFAULT_VALUE
                section_data["IMD Country"] = scout_census.DEFAULT_VALUE
                section_data["IMD Decile"] = scout_census.DEFAULT_VALUE
                section_data["IMD Rank"] = scout_census.DEFAULT_VALUE

            section_data_df = pd.DataFrame([section_data],
                                           columns=output_columns)
            output_data = pd.concat([output_data, section_data_df], axis=0)

        output_data.reset_index(drop=True, inplace=True)
        if report_name:
            report_io.save_report(output_data, report_name)
        return output_data
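
A usage sketch, assuming reports is an instance of the containing class with census data already loaded; the years and report name are illustrative:

summary = reports.new_section_history_summary(
    years=[2015, 2016, 2017, 2018, 2019, 2020, 2021],
    report_name="new_section_history",
)
print(summary[["Section", "First Year", "Last Year"]].head())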
Example #7
    def create_boundary_report(self,
                               options: set[str] = None,
                               historical: bool = False,
                               report_name: str = None) -> pd.DataFrame:
        """Produces .csv file summarising by boundary provided.

        Args:
            options: List of data to be included in report
            historical: Check to ensure that multiple years of data are intentional
            report_name:

        """

        # Set default option set for `options`
        if options is None:
            options = {
                "Number of Sections", "Groups", "Section numbers",
                "6 to 17 numbers", "awards", "waiting list total"
            }

        opt_groups = "Groups" in options
        opt_section_numbers = "Section numbers" in options
        opt_number_of_sections = "Number of Sections" in options
        opt_6_to_17_numbers = "6 to 17 numbers" in options
        opt_waiting_list_totals = "waiting list total" in options
        opt_adult_numbers = "Adult numbers" in options
        opt_awards = "awards" in options

        census_data = self.census_data
        boundary_codes = self.geography.boundary_codes
        geog_name = self.geography.metadata.key  # e.g oslaua osward pcon lsoa11
        logger.info(
            f"Creating report by {geog_name} with {', '.join(options)} from {len(census_data.index)} records"
        )

        census_dates = sorted(set(census_data["Census Date"].dropna()))
        if len(census_dates) > 1:
            if not historical:
                raise ValueError(
                    f"Historical option not selected, but multiple censuses selected ({census_dates[0]} - {census_dates[-1]})"
                )
            logger.info(
                f"Historical analysis from {census_dates[0]} to {census_dates[-1]}"
            )

        sections_model = column_labels.sections

        dataframes = []

        if opt_groups:
            # Used to list the groups that operate within the boundary.
            # Gets all groups in the census_data dataframe and calculates the
            # number of groups.
            logger.debug(f"Adding group data")
            groups = census_data[[geog_name, column_labels.name.GROUP]].copy()
            groups[column_labels.name.GROUP] = groups[
                column_labels.name.GROUP].str.strip()
            grouped_rgn = groups.drop_duplicates().dropna().groupby(
                [geog_name], dropna=False)[column_labels.name.GROUP]
            dataframes.append(
                pd.DataFrame({
                    "Groups": grouped_rgn.unique().apply("\n".join),
                    "Number of Groups": grouped_rgn.nunique(dropna=True),
                }))

        if opt_section_numbers or opt_number_of_sections or opt_6_to_17_numbers or opt_waiting_list_totals or opt_adult_numbers:
            total_cols = [
                section_model.total
                for section_name, section_model in sections_model
                if section_name != "Network"
            ]
            waiting_cols = [
                section_model.waiting_list
                for section_name, section_model in sections_model
                if section_name != "Network"
            ]
            census_data["All"] = census_data[total_cols].sum(
                axis=1).astype("Int32")
            census_data["Waiting List"] = census_data[waiting_cols].sum(
                axis=1).astype("Int32")
            census_data["Adults"] = census_data[[
                "Leaders", "AssistantLeaders", "SectAssistants", "OtherAdults"
            ]].sum(axis=1).astype("Int32")

            logger.debug(f"Adding young people numbers")
            metric_cols = []
            rename = {}
            if opt_section_numbers:
                metric_cols += [
                    section_model.total
                    for section_name, section_model in sections_model
                    if section_name != "Network"
                ]
            if opt_number_of_sections:
                # TODO correct pluralisation (the naive f"{type}s" gives e.g. "Colonys" rather than "Colonies")
                metric_cols += [
                    section_model.unit_label
                    for section_name, section_model in sections_model
                    if section_name != "Network"
                ]
                rename |= {
                    section_model.unit_label: f"{section_model.type}s"
                    for section_name, section_model in sections_model
                    if section_name != "Network"
                }
            if opt_6_to_17_numbers:
                metric_cols += ["All"]
            if opt_waiting_list_totals:
                metric_cols += ["Waiting List"]
            if opt_adult_numbers:
                metric_cols += ["Adults"]
            agg = census_data.groupby(
                [geog_name, "Census_ID"],
                dropna=False)[metric_cols].sum().unstack().sort_index()
            agg.columns = [
                f"{rename.get(key, key)}-{census_year}".replace("_total", "")
                for key, census_year in agg.columns
            ]
            dataframes.append(agg)

        if opt_awards:
            if geog_name not in ONS_GEOG_NAMES:
                raise ValueError(
                    f"{geog_name} is not a valid geography name. Valid values are {ONS_GEOG_NAMES}"
                )

            district_id_column = column_labels.id.DISTRICT
            award_name = sections_model.Beavers.top_award[0]
            award_eligible = sections_model.Beavers.top_award_eligible[0]

            logger.debug(f"Creating awards mapping")
            awards_mapping = _ons_to_district_mapping(census_data,
                                                      boundary_codes,
                                                      geog_name)
            district_numbers = {
                district_id: num
                for district_dict in awards_mapping.values()
                for district_id, num in district_dict.items()
            }
            grouped_dist = census_data[[
                "Queens_Scout_Awards", "Eligible4QSA", district_id_column
            ]].groupby(district_id_column, dropna=False)
            ons_regions_in_district = grouped_dist[district_id_column].first(
            ).map(district_numbers)
            awards_per_district_per_regions = pd.DataFrame({
                # QSAs achieved in district, divided by the number of regions the district is in
                "QSA": grouped_dist["Queens_Scout_Awards"].sum() / ons_regions_in_district,
                # number of young people eligible to achieve the QSA in district, divided by the number of regions the district is in
                "qsa_eligible": grouped_dist["Eligible4QSA"].sum() / ons_regions_in_district,
            })

            # Check that our pivot keeps the total membership constant
            yp_cols = [
                "Beavers_total", "Cubs_total", "Scouts_total",
                "Explorers_total"
            ]
            grouped_rgn = census_data.groupby([geog_name], dropna=False)
            assert int(census_data[yp_cols].sum().sum()) == int(
                grouped_rgn[yp_cols].sum().sum().sum())

            logger.debug(f"Adding awards data")
            award_total = grouped_rgn[award_name].sum()
            eligible_total = grouped_rgn[award_eligible].sum()
            award_prop = 100 * award_total / eligible_total
            award_prop[eligible_total == 0] = pd.NA

            max_value = award_prop.quantile(0.95)
            award_prop = award_prop.clip(upper=max_value)

            # calculates the nominal QSAs per ONS region specified.
            # Divides total # of awards by the number of Scout Districts that the ONS Region is in
            region_ids = grouped_rgn.name.first().index.to_series()
            if geog_name == "D_ID":
                district_ids = region_ids
            else:
                region_district_map = {
                    rgn_id: list(district_dict)
                    for rgn_id, district_dict in awards_mapping.items()
                }
                district_ids = region_ids.map(region_district_map)
            awards_regions_data = pd.DataFrame.from_dict(
                {
                    idx: awards_per_district_per_regions.loc[ids].sum()
                    for idx, ids in district_ids.items()
                },
                orient="index")
            qsa_prop = 100 * awards_regions_data["QSA"] / awards_regions_data[
                "qsa_eligible"]
            qsa_prop[awards_regions_data["qsa_eligible"] == 0] = pd.NA

            award_data = {
                award_name: award_total,
                award_eligible: eligible_total,
                f"%-{award_name}": award_prop,
                "QSA": awards_regions_data["QSA"],
                "%-QSA": qsa_prop,
            }
            dataframes.append(pd.DataFrame(award_data))

        # TODO find a way to keep DUMMY geography coding
        output_data = boundary_codes.reset_index(drop=True).copy()
        output_data = output_data.merge(pd.concat(dataframes, axis=1),
                                        how="left",
                                        left_on="codes",
                                        right_index=True,
                                        sort=False)

        if geog_name == "lsoa11":
            logger.debug(f"Loading ONS postcode data & Adding IMD deciles.")
            ons_pd_data = pd.read_feather(config.SETTINGS.ons_pd.reduced,
                                          columns=["lsoa11", "imd_decile"
                                                   ]).drop_duplicates()
            output_data = output_data.merge(
                ons_pd_data, how="left", left_on="codes",
                right_on="lsoa11").drop(columns="lsoa11")

        if report_name:
            report_io.save_report(output_data, report_name)

        return output_data
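
An illustrative call, assuming reports is an instance of the containing class with its geography and census data already set up; the report name is an assumption:

boundary_report = reports.create_boundary_report(
    options={"Section numbers", "6 to 17 numbers", "awards"},
    historical=False,
    report_name="laua_boundary_report",
)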
Example #8
    def create_uptake_report(self,
                             boundary_report: pd.DataFrame,
                             report_name: str = None) -> pd.DataFrame:
        """Creates a report of scouting uptake in geographic areas

        Creates a report by the boundary that has been set; requires a boundary report to already have been run.
        Requires population data by age for the specified boundary.

        Args:
            boundary_report: Boundary report from `Reports.create_boundary_report`
            report_name: Name to save the report as

        Returns:
            Uptake data of Scouts in the boundary

        """
        metadata = self.geography.metadata
        census_data = self.census_data
        geog_key = metadata.key
        try:
            age_profile_path = config.SETTINGS.folders.national_statistical / metadata.age_profile.path
            age_profile_key = metadata.age_profile.key
        except KeyError:
            raise AttributeError(
                f"Population by age data not present for this {geog_key}")

        data_types = {str(key): "Int16" for key in range(5, 26)}
        try:
            age_profile_pd = pd.read_csv(age_profile_path, dtype=data_types)
        except TypeError:
            logger.error("Age profiles must be integers in each age category")
            raise

        # population data
        for section, ages in SECTION_AGES.items():
            section_population = age_profile_pd[ages["ages"]].sum(axis=1)
            section_population += age_profile_pd[ages["halves"]].sum(
                axis=1) // 2 if ages.get("halves") else 0
            age_profile_pd[f"Pop_{section}"] = section_population.astype(
                "UInt32")
        age_profile_pd["Pop_All"] = age_profile_pd[[
            f"{age}" for age in range(6, 17 + 1)
        ]].sum(axis=1).astype("UInt32")

        # merge population data
        cols = [age_profile_key
                ] + [f"Pop_{section}"
                     for section in SECTION_AGES.keys()] + ["Pop_All"]
        reduced_age_profile_pd = age_profile_pd[cols]

        # Pivot age profile to current geography type if needed
        pivot_key = metadata.age_profile.pivot_key
        if pivot_key and pivot_key != geog_key:
            logger.debug(f"Loading ONS postcode data.")
            ons_pd_data = pd.read_feather(config.SETTINGS.ons_pd.reduced,
                                          columns=[geog_key, pivot_key])
            merged_age_profile = reduced_age_profile_pd.merge(
                ons_pd_data,
                how="left",
                left_on=age_profile_key,
                right_on=pivot_key).drop(pivot_key, axis=1)
            merged_age_profile_no_na = merged_age_profile.dropna(
                subset=[geog_key])
            pivoted_age_profile = merged_age_profile_no_na.groupby(
                geog_key).sum().astype("UInt32")

            # Check we did not accidentally expand the population!
            # assert merged_age_profile["Pop_All"].sum() == reduced_age_profile_pd["Pop_All"].sum()  # this will fail
            assert pivoted_age_profile["Pop_All"].sum(
            ) == merged_age_profile_no_na["Pop_All"].sum()
            uptake_report = boundary_report.merge(pivoted_age_profile,
                                                  how="left",
                                                  left_on="codes",
                                                  right_index=True,
                                                  sort=False)
        else:
            uptake_report = boundary_report.merge(reduced_age_profile_pd,
                                                  how="left",
                                                  left_on="codes",
                                                  right_on=age_profile_key,
                                                  sort=False)
            del uptake_report[age_profile_key]

        census_ids = census_data["Census_ID"].drop_duplicates().dropna(
        ).sort_values()

        # add uptake data
        for census_id in census_ids:
            # clip here as unexpectedly large values throw off the scale bars.
            # TODO normalise unexpectedly large values so that we don't need to clip
            for section in SECTION_AGES.keys():
                uptake_section = 100 * uptake_report[
                    f"{section}-{census_id}"] / uptake_report[f"Pop_{section}"]
                max_value = uptake_section.quantile(0.975)
                uptake_report[
                    f"%-{section}-{census_id}"] = uptake_section.clip(
                        upper=max_value)
            uptake_all = 100 * uptake_report[
                f"All-{census_id}"] / uptake_report[f"Pop_All"]
            max_value = uptake_all.quantile(0.975)
            uptake_report[f"%-All-{census_id}"] = uptake_all.clip(
                upper=max_value)
            # TODO explain 97.5th percentile clip
        # TODO check edge cases - 0 population and 0 or more scouts

        if report_name:
            report_io.save_report(uptake_report, report_name)

        return uptake_report
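
An illustrative sequence, mirroring the docstring's requirement that a boundary report is produced first; the option set and report name are assumptions:

boundary_report = reports.create_boundary_report(
    options={"Section numbers", "6 to 17 numbers"},
)
uptake_report = reports.create_uptake_report(boundary_report, report_name="laua_uptake_report")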