def filter_records(data: pd.DataFrame, field: str, value_list: set, exclude_matching: bool = False, exclusion_analysis: bool = False) -> pd.DataFrame:
    """Filters the Census records by any field in ONS PD.

    Args:
        data: Census data to filter
        field: The field on which to filter
        value_list: The values on which to filter
        exclude_matching: If True, exclude the values that match the filter. If False, keep the values that match the filter.
        exclusion_analysis: If True, log an analysis of the records and members excluded by the filter

    Returns:
        Filtered data

    """
    # Count number of rows
    original_records = data.index.size
    matching_records = data[field].isin(value_list)
    if exclude_matching:
        # Excluding records that match the filter criteria
        filter_mask = ~matching_records
        logger.info(f"Selecting records that satisfy {field} not in {value_list} from {original_records} records.")
    else:
        # Including records that match the filter criteria
        filter_mask = matching_records
        logger.info(f"Selecting records that satisfy {field} in {value_list} from {original_records} records.")

    filtered = data.loc[filter_mask]
    logger.debug(f"Resulting in {filtered.index.size} records remaining.")

    if exclusion_analysis:
        excluded = data.loc[~filter_mask]
        _exclusion_analysis(data, filtered, excluded)

    return filtered
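# Hedged usage sketch of filter_records. The column name "X_name" and the toy
# frame below are illustrative assumptions, not the real census schema.
import pandas as pd

census = pd.DataFrame({"X_name": ["England", "Scotland", "Wales"], "Beavers_total": [10, 8, 6]})
england_only = filter_records(census, "X_name", {"England"})  # keep matching records
no_scotland = filter_records(census, "X_name", {"Scotland"}, exclude_matching=True)  # drop matching records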
def _exclusion_analysis(original: pd.DataFrame, filtered: pd.DataFrame, excluded: pd.DataFrame):
    """Logs how many records, and how many members per section, were removed by the filter."""
    cols = {scout_census.column_labels.UNIT_TYPE, *(section_model.total for section_name, section_model in sections_model)}
    if not set(original.columns) >= cols:
        o_cols = original.columns.to_list()
        raise ValueError(
            "Required columns are not in dataset!\n"
            f"Required columns are: {cols}.\n"
            f"Your columns are: {o_cols}"
        )

    # Calculate the number of records that have been filtered out
    original_records = original.index.size
    excluded_records = original_records - filtered.index.size
    logger.info(f"{excluded_records} records were removed ({excluded_records / original_records * 100}% of total)")

    # Log the number of members, and % of members, filtered out for each section
    for section_name, section_model in sections_model:
        logger.debug(f"Analysis of {section_name} member exclusions")
        section_type = section_model.type
        members_col = section_model.total

        excluded_sections = excluded.loc[excluded[scout_census.column_labels.UNIT_TYPE] == section_type]
        excluded_members = 0
        if not excluded_sections.empty:
            logger.debug(f"Excluded sections\n{excluded_sections}")
            logger.debug(f"Finding number of excluded {section_name} by summing {members_col}")
            excluded_members = excluded_sections[members_col].sum()
            logger.debug(f"{excluded_members} {section_name} excluded")

        original_members = original.loc[original[scout_census.column_labels.UNIT_TYPE] == section_type, members_col].sum()

        if original_members > 0:
            logger.info(f"{excluded_members} {section_name} members were removed ({excluded_members / original_members * 100}% of total)")
        else:
            logger.info(f"There are no {section_name} members present in data")
def add_custom_data(self, csv_file_path: Path, layer_name: str, location_cols: Union[Literal["Postcodes"], dict], marker_data: list = None) -> None:
    """Adds custom data as markers on the map.

    Args:
        csv_file_path: File path of the csv file to open
        layer_name: Name of the layer that the markers will be added to
        location_cols: Indicates whether the data is located by postcodes or co-ordinates
            - if postcodes, the str "Postcodes"
            - if co-ordinates, a dict of co-ordinate data with keys ["crs", "x", "y"]
        marker_data: List of column names whose values should be shown in the popup

    """
    custom_data = pd.read_csv(csv_file_path)

    if location_cols == "Postcodes":
        # Merge with ONS Postcode Directory to obtain dataframe with lat/long
        logger.debug("Loading ONS postcode data.")
        ons_pd_data = pd.read_feather(config.SETTINGS.ons_pd.reduced)
        custom_data = pd.merge(custom_data, ons_pd_data, how="left", left_on=location_cols, right_index=True, sort=False)
        location_cols = {"crs": constants.WGS_84, "x": "long", "y": "lat"}

    # Create geo data frame with points generated from lat/long or OS
    custom_data = gpd.GeoDataFrame(
        custom_data,
        geometry=gpd.points_from_xy(x=custom_data[location_cols["x"]], y=custom_data[location_cols["y"]]),
        crs=location_cols["crs"],
    )

    # Convert the 'Co-ordinate reference system' (crs) to WGS_84 (i.e. lat/long) if not already
    if location_cols["crs"] != constants.WGS_84:
        custom_data = custom_data.to_crs(epsg=constants.WGS_84)

    if layer_name in self.map._children:  # NoQA
        raise ValueError("Layer already used!")

    # Plot marker and include marker_data in the popup for every item in custom_data
    out = []
    if marker_data:
        x_not_nan = custom_data.geometry.x.notna()
        for row in custom_data[x_not_nan].itertuples():
            out.append(
                {
                    "lat": round(row.geometry.y, 4),
                    "lon": round(row.geometry.x, 4),
                    "col": "green",
                    # itertuples yields namedtuples, so columns are accessed by attribute rather than subscript
                    "html": "".join(f'<p align="center">{getattr(row, marker_col)}</p>' for marker_col in marker_data),
                }
            )
    else:
        for points in custom_data.geometry[custom_data.geometry.x.notna()].to_list():
            out.append({"lat": round(points.y, 4), "lon": round(points.x, 4), "col": "green", "html": ""})
    self.map[layer_name] = _output_marker_layer(layer_name, out)
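# Hedged usage sketch: "map_obj" stands for whichever map object exposes
# add_custom_data; the CSV paths, layer names and column names are invented.
# map_obj.add_custom_data(Path("sites_by_postcode.csv"), "Custom sites", location_cols="Postcodes", marker_data=["Name"])
# map_obj.add_custom_data(Path("sites_by_xy.csv"), "Custom sites (XY)", location_cols={"crs": 27700, "x": "easting", "y": "northing"})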
def _ons_to_district_mapping(census_data: pd.DataFrame, boundary_codes: pd.DataFrame, region_type: str) -> dict:
    """Creates a mapping of which Scout districts lie within each ONS area, and how many ONS areas each of those districts spans.

    Args:
        census_data: Modified census report data
        boundary_codes: Codes for the ONS boundary in use (must contain a "codes" column)
        region_type: A field in the modified census report corresponding to an administrative region
            (lsoa11, msoa11, oslaua, osward, pcon, oscty, ctry, rgn).
            region_type is also a census column heading for the region geography type

    Returns:
        Nested dictionary of {region id: {district id: number of regions the district spans}}

    """
    logger.debug("Creating mapping from ons boundary to scout district")

    district_id_column = column_labels.id.DISTRICT

    region_ids = set(boundary_codes["codes"].dropna())

    district_ids_by_region = census_data.loc[census_data[region_type].isin(region_ids), [region_type, district_id_column]].dropna().drop_duplicates()
    district_ids = set(district_ids_by_region[district_id_column].dropna())

    # count of how many regions the district occupies:
    count_regions_in_district = (
        census_data.loc[(census_data[district_id_column].isin(district_ids) & (census_data[region_type] != DEFAULT_VALUE)), [district_id_column, region_type]]
        .dropna()
        .drop_duplicates()
        .groupby(district_id_column)
        .count()
        .rename(columns={region_type: "count"})
    )

    count_by_district_by_region = pd.merge(left=district_ids_by_region, right=count_regions_in_district, on=district_id_column).set_index([region_type, district_id_column])

    nested_dict = {}
    for (region_id, district_id), value in count_by_district_by_region["count"].items():
        nested_dict.setdefault(region_id, {})[district_id] = value

    logger.debug("Finished mapping from ons boundary to district")
    return nested_dict
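# Illustration of the returned structure (the ward codes and district IDs are invented):
# {
#     "E05000001": {10001234: 2, 10005678: 1},  # two districts overlap this ward; the first spans 2 wards
#     "E05000002": {10001234: 2},
# }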
def _load_boundary(boundary_report: pd.DataFrame, boundary_metadata: config.Boundary) -> gpd.GeoDataFrame:
    """Loads a given boundary from a boundary report and metadata.

    Loads shapefile from path into GeoPandas dataframe
    Filters out unneeded shapes within all shapes loaded
    Converts from British National Grid to WGS84, as Leaflet doesn't understand BNG

    Args:
        boundary_report: A DataFrame object with boundary report data
        boundary_metadata: This contains shapefile paths, and labels for region codes and names

    Returns:
        GeoDataFrame with filtered and CRS transformed shapes

    """
    metadata = boundary_metadata
    data = boundary_report

    # Read a shape file. shapefile_path is the path to ESRI shapefile with region information
    logger.info("Loading Shapefile data")
    logger.debug(f"Shapefile path: {metadata.shapefile.path}")
    start_time = time.time()
    all_shapes = gpd.read_file(metadata.shapefile.path)
    logger.info(f"Loading Shapefile data finished, {time.time() - start_time:.2f} seconds elapsed")
    if metadata.shapefile.key not in all_shapes.columns:
        raise KeyError(f"{metadata.shapefile.key} not present in shapefile. Valid columns are: {all_shapes.columns}")

    # Rename columns
    shapes_col_map = {metadata.shapefile.key: "shape_codes", metadata.shapefile.name: "shape_names"}
    all_shapes.columns = [shapes_col_map.get(col, col) for col in all_shapes.columns]

    # Filter and convert GeoDataFrame to world co-ordinates
    logger.info(f"Filtering {len(all_shapes.index)} shapes by shape_codes being in the codes column of the map_data")
    all_codes = set(data["codes"])
    logger.debug(f"All codes list: {all_codes}")
    geo_data = all_shapes.loc[all_shapes["shape_codes"].isin(all_codes), ["geometry", "shape_codes", "shape_names"]].to_crs(epsg=constants.WGS_84)
    logger.info(f"Loaded {len(geo_data.index):,} boundary shapes. Columns now in data: {[*data.columns]}.")
    return geo_data
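# Hedged sketch of the metadata this function reads (attribute names inferred
# from the usage above; the real config.Boundary model may differ):
# boundary_metadata.shapefile.path  -> path to the ESRI shapefile
# boundary_metadata.shapefile.key   -> column of area codes, renamed to "shape_codes"
# boundary_metadata.shapefile.name  -> column of area names, renamed to "shape_names"
# geo_data = _load_boundary(boundary_report, boundary_metadata)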
def new_section_history_summary(self, years: list, report_name: str = None) -> pd.DataFrame:
    """Given data on all sections, provides a summary of all new sections, coping with the pre-2017 section reporting structure."""
    sections_model = scout_census.column_labels.sections

    logger.info(f"Beginning new_section_history_summary for {years}")

    new_section_ids: list[dict] = []

    logger.info(f"Getting group ID list in column {scout_census.column_labels.id.GROUP}")
    # Iterate through Groups looking for new Sections
    group_ids = self.census_data[scout_census.column_labels.id.GROUP].dropna().drop_duplicates().to_list()

    logger.info(f"Found {len(group_ids)} Groups")

    # For each section type in each group in the census:
    #   construct a dict of {year: number of sections of that type open in that year}
    #   construct the list of year-on-year changes in the number of sections
    #   if there is any year-on-year increase, walk the years from the second year onwards,
    #   opening and closing section records according to each year's change
    # Then do the same for district sections (Explorers)
    census_data = self.census_data.fillna({scout_census.column_labels.id.GROUP: 0, scout_census.column_labels.id.DISTRICT: 0})

    for group_id in group_ids:
        logger.info(f"Investigating {group_id}")
        group_records = census_data.loc[census_data[scout_census.column_labels.id.GROUP] == group_id]

        for section in scout_census.SECTIONS_GROUP:
            logger.info(f"Finding {section} sections")
            units_by_year = {}
            for year in years:
                section_numbers_year = group_records.loc[group_records["Year"] == year, getattr(scout_census.column_labels.sections, section).unit_label].sum()
                units_by_year[year] = section_numbers_year

            increments = [units_by_year[year + 1] - units_by_year[year] for year in units_by_year.keys() if (year + 1) in units_by_year]
            if max(increments) > 0:
                logger.debug(f"Identified year profile of sections: {units_by_year}")
                opened_sections = []
                closed_sections = []
                for year in years[1:]:
                    change = units_by_year[year] - units_by_year[year - 1]
                    if change > 0:
                        # Extend the life of the current sections
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)
                        # Create a new section record for each additional unit
                        for ii in range(change):
                            logger.debug(f"New {section} section found for {group_id} in {year}")
                            opened_sections.append({"id": group_id, "section": section, "years": [year], "nu_sections": units_by_year})
                    elif change == 0:
                        # Lengthen all sections by a year
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)
                    elif change < 0:
                        # Close sections, newest first
                        for ii in range(-change):
                            if len(opened_sections) > 0:
                                logger.debug(f"{section} closed for {group_id} in {year}")
                                closed_sections.append(opened_sections.pop(-1))
                        # Lengthen remaining open sections by a year
                        for open_sections in opened_sections:
                            open_sections["years"].append(year)

                logger.debug(f"For {group_id} adding\n{opened_sections + closed_sections}")
                new_section_ids += opened_sections
                new_section_ids += closed_sections
            else:
                logger.info(f"No new {section} sections in {group_id}")

    logger.info("Finding new Explorer Sections")
    # Iterate through Districts looking for new Sections
    district_ids = self.census_data[scout_census.column_labels.id.DISTRICT].drop_duplicates().dropna().to_list()

    for district_id in district_ids:
        logger.info(f"Investigating {district_id}")
        district_records = census_data.loc[census_data[scout_census.column_labels.id.DISTRICT] == district_id]
        units_by_year = {}
        for year in years:
            district_records_year = district_records.loc[district_records["Year"] == year]
            units_by_year[year] = district_records_year[sections_model.Explorers.unit_label].sum()

        increments = [units_by_year[year + 1] - units_by_year[year] for year in units_by_year.keys() if (year + 1) in units_by_year]
        if max(increments) > 0:
            opened_sections = []
            closed_sections = []
            for year in years[1:]:
                change = units_by_year[year] - units_by_year[year - 1]
                if change > 0:
                    # Extend the life of the current sections
                    for open_sections in opened_sections:
                        open_sections["years"].append(year)
                    # Create a new section record for each additional unit
                    for ii in range(change):
                        opened_sections.append({"id": district_id, "section": "Explorers", "years": [year], "nu_sections": units_by_year})
                elif change == 0:
                    # Lengthen all sections by a year
                    for open_sections in opened_sections:
                        open_sections["years"].append(year)
                elif change < 0:
                    # Close sections, newest first
                    for ii in range(-change):
                        if len(opened_sections) > 0:
                            closed_sections.append(opened_sections.pop(-1))
                    # Lengthen remaining open sections by a year
                    for open_sections in opened_sections:
                        open_sections["years"].append(year)

            logger.debug(f"For {district_id} adding\n{opened_sections + closed_sections}")
            new_section_ids += opened_sections
            new_section_ids += closed_sections

    section_details = []
    for year in years:
        if year < 2017:
            section_details.append(f"{year}_Est_Members")
        else:
            section_details.append(f"{year}_Members")

    # fmt: off
    output_columns = [
        "Object_ID", "Section Name", "Section", "Group_ID", "Group", "District_ID", "District", "County", "Region", "Scout Country", "Postcode",
        "IMD Country", "IMD Rank", "IMD Decile", "First Year", "Last Year",
        f"{years[0]}_sections",
        *section_details,
    ]
    # fmt: on

    output_data = pd.DataFrame(columns=output_columns)

    logger.info(f"Start iteration through {len(new_section_ids)} new Sections")
    used_compass_ids = set()
    count = 0
    total = len(new_section_ids)
    new_sections_id: dict
    for new_sections_id in new_section_ids:
        section_data = {}
        logger.debug(f"Recording {new_sections_id}")
        count += 1
        logger.info(f"{count} of {total}")
        section_id = new_sections_id["id"]
        open_years = new_sections_id["years"]
        section = new_sections_id["section"]
        section_type = getattr(scout_census.column_labels.sections, section).type

        if section in scout_census.SECTIONS_GROUP:
            records = census_data.loc[census_data[scout_census.column_labels.id.GROUP] == section_id]
            section_data["Group_ID"] = records[scout_census.column_labels.id.GROUP].unique()[0]
            section_data["Group"] = records[scout_census.column_labels.name.GROUP].unique()[0]
        elif section in scout_census.SECTIONS_DISTRICT:
            records = census_data.loc[census_data[scout_census.column_labels.id.DISTRICT] == section_id]
            section_data["Group_ID"] = ""
            section_data["Group"] = ""
        else:
            raise Exception(f"{section} belongs to neither a Group nor a District. id = {new_sections_id}")

        for year in open_years:
            members_cols = getattr(scout_census.column_labels.sections, section).total
            year_records = records.loc[records["Year"] == year]
            if year >= 2017:
                compass_id = section_data.get("Object_ID")
                section_year_records = year_records.loc[records[scout_census.column_labels.UNIT_TYPE] == section_type]

                if compass_id:
                    section_record = section_year_records.loc[section_year_records["Object_ID"] == compass_id]
                    section_data[f"{year}_Members"] = section_record[members_cols].sum()
                else:
                    section_year_ids: pd.Series = section_year_records["Object_ID"].drop_duplicates()
                    if open_years[0] >= 2017:
                        # If the section opened after 31st January 2017 then it can be identified by Object_ID
                        last_year_records = records.loc[records["Year"] == (year - 1)]
                        old_section_ids = last_year_records["Object_ID"].unique()
                        opened_section_ids = section_year_ids[~section_year_ids.isin(old_section_ids)]
                        if len(opened_section_ids) > 1:
                            logger.info(f"{len(opened_section_ids)} sections opened")
                            unused_ids = opened_section_ids[~opened_section_ids.isin(used_compass_ids)]
                            compass_id = unused_ids.iloc[0] if not unused_ids.empty else opened_section_ids.iloc[-1]
                        elif len(opened_section_ids) == 0:
                            logger.error(f"No sections opened\n{year}: {section_year_ids}\n{year - 1}: {old_section_ids}")
                        elif len(opened_section_ids) == 1:
                            compass_id = opened_section_ids.iloc[0]
                            logger.debug(f"Assigned id: {compass_id}")

                        section_data["Object_ID"] = compass_id
                        used_compass_ids.add(compass_id)
                        section_data[f"{year}_Members"] = section_year_records.loc[section_year_records["Object_ID"] == compass_id, members_cols].sum()
                    else:
                        compass_id = section_year_ids.max()

                        if compass_id in used_compass_ids:
                            # Prefer an unused ID; fall back to the largest if all have been used
                            section_year_ids = section_year_ids.sort_values(ascending=False)
                            unused_ids = section_year_ids[~section_year_ids.isin(used_compass_ids)]
                            if not unused_ids.empty:
                                compass_id = unused_ids.iloc[0]
                            else:
                                compass_id = section_year_ids.iloc[0]

                        section_data["Object_ID"] = compass_id
                        used_compass_ids.add(compass_id)
                        total_members = section_year_records.loc[section_year_records["Object_ID"] == compass_id, members_cols].sum()

                        logger.debug(f"{section} in {section_id} in {year} found {total_members} members")
                        section_data[f"{year}_Members"] = total_members
            else:
                year_before_section_opened = open_years[0] - 1
                year_before_records = records.loc[records["Year"] == year_before_section_opened]

                number_of_new_sections = new_sections_id["nu_sections"][open_years[0]] - new_sections_id["nu_sections"][year_before_section_opened]

                new_members = year_records[members_cols].sum()
                old_members = year_before_records[members_cols].sum()

                additional_members = (new_members - old_members) / number_of_new_sections
                if additional_members < 0:
                    logger.warning(f"{section_id} increased number of {section} sections but membership decreased by {additional_members}")

                logger.debug(f"{section} in {section_id} in {year} found {additional_members} members")
                section_data[f"{year}_Est_Members"] = additional_members

        closed_years = [year for year in years if year not in open_years]
        for year in closed_years:
            if year >= 2017:
                section_data[f"{year}_Members"] = 0
            else:
                section_data[f"{year}_Est_Members"] = 0

        section_data[f"{years[0]}_sections"] = new_sections_id["nu_sections"][years[0]]

        section_records = None

        if section_data.get("Object_ID"):
            section_records = records.loc[records["Object_ID"] == section_data.get("Object_ID")]
            section_data["Section Name"] = section_records["name"].unique()[0]
        else:
            if open_years[-1] < 2017:
                if section in scout_census.SECTIONS_GROUP:
                    section_records = records.loc[records[scout_census.column_labels.UNIT_TYPE] == scout_census.UNIT_LEVEL_GROUP]
                elif section in scout_census.SECTIONS_DISTRICT:
                    section_records = records.loc[records[scout_census.column_labels.UNIT_TYPE] == scout_census.UNIT_LEVEL_DISTRICT]
            elif open_years[-1] == 2017:
                section_records = records.loc[records[scout_census.column_labels.UNIT_TYPE] == section_type]
            else:
                raise Exception(f"Unable to find section records for {new_sections_id}")

        section_data["Section"] = section
        section_data["District_ID"] = section_records[scout_census.column_labels.id.DISTRICT].unique()[0]
        section_data["District"] = section_records[scout_census.column_labels.name.DISTRICT].unique()[0]
        section_data["County"] = section_records["C_name"].unique()[0]
        section_data["Region"] = section_records["R_name"].unique()[0]
        section_data["Scout Country"] = section_records["X_name"].unique()[0]

        if open_years[0] == years[0]:
            section_data["First Year"] = f"{years[0]} or before"
        else:
            section_data["First Year"] = open_years[0]
        if open_years[-1] == years[-1]:
            section_data["Last Year"] = f"Open in {years[-1]}"
        else:
            section_data["Last Year"] = open_years[-1]

        # To find the postcode and IMD for a range of Section and Group records
        # across several years: find the most recent year, and then choose the
        # postcode where the IMD Rank is the lowest.
        most_recent_year = open_years[-1]
        logger.debug(f"Checking {most_recent_year}")
        most_recent = section_records.loc[section_records["Year"] == most_recent_year]
        if most_recent.shape[0] == 1:
            most_recent = most_recent.iloc[0]
        elif most_recent.shape[0] == 0:
            logger.warning("Inconsistent ids")
            if section in scout_census.SECTIONS_GROUP:
                # In the event that the Object_IDs aren't consistent, pick the most recent section in the group.
                # This is only applicable after 2017, so sections are assumed to exist.
                most_recent = records.loc[
                    (records[scout_census.column_labels.id.GROUP] == section_data["Group_ID"])
                    & (records[scout_census.column_labels.UNIT_TYPE] == section_type)
                    & (records["Year"] == most_recent_year)
                ].iloc[0]
            elif section in scout_census.SECTIONS_DISTRICT:
                most_recent_record = records.loc[
                    (records[scout_census.column_labels.id.DISTRICT] == section_data["District_ID"])
                    & (records[scout_census.column_labels.UNIT_TYPE] == section_type)
                    & (records["Year"] == most_recent_year)
                ]
                if most_recent_record.empty:
                    logger.error(f"No records found with D_ID = {section_data['District_ID']} in {most_recent_year} that are {section}")
                most_recent = most_recent_record.iloc[0]
        else:
            logger.warning("Multiple sections found, assigning a section")
            most_recent = most_recent.iloc[0]

        postcode_valid = most_recent.at["postcode_is_valid"]
        # add postcode
        if postcode_valid:
            logger.debug(f"Adding postcode {most_recent.at[scout_census.column_labels.POSTCODE]}")
            section_data["Postcode"] = most_recent.at[scout_census.column_labels.POSTCODE]
            country = ONS_PD.COUNTRY_CODES.get(most_recent.at["ctry"])
            section_data["IMD Country"] = country if country else scout_census.DEFAULT_VALUE
            section_data["IMD Decile"] = most_recent.at["imd_decile"]
            section_data["IMD Rank"] = most_recent.at["imd"]
        else:
            section_data["Postcode"] = scout_census.DEFAULT_VALUE
            section_data["IMD Country"] = scout_census.DEFAULT_VALUE
            section_data["IMD Decile"] = scout_census.DEFAULT_VALUE
            section_data["IMD Rank"] = scout_census.DEFAULT_VALUE

        section_data_df = pd.DataFrame([section_data], columns=output_columns)
        output_data = pd.concat([output_data, section_data_df], axis=0)

    output_data.reset_index(drop=True, inplace=True)
    if report_name:
        report_io.save_report(output_data, report_name)
    return output_data
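# Worked illustration of the open/close bookkeeping above, with invented counts:
# units_by_year = {2015: 2, 2016: 3, 2017: 3, 2018: 2}
# increments    = [+1, 0, -1]  -> max > 0, so this section type is analysed
# 2016: change +1 -> one new record {"years": [2016], ...} is opened
# 2017: change  0 -> the open record's "years" grows to [2016, 2017]
# 2018: change -1 -> the newest open record is popped into closed_sections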
def create_boundary_report(self, options: set[str] = None, historical: bool = False, report_name: str = None) -> pd.DataFrame:
    """Produces a report summarising the Census data by the boundary provided, optionally saving it as a .csv file.

    Args:
        options: Set of data categories to be included in the report
        historical: Set to True to confirm that processing multiple years of data is intentional
        report_name: Name to save the report as (if given)

    Returns:
        Summary data by boundary code

    """
    # Set default option set for `options`
    if options is None:
        options = {"Number of Sections", "Groups", "Section numbers", "6 to 17 numbers", "awards", "waiting list total"}

    opt_groups = "Groups" in options
    opt_section_numbers = "Section numbers" in options
    opt_number_of_sections = "Number of Sections" in options
    opt_6_to_17_numbers = "6 to 17 numbers" in options
    opt_waiting_list_totals = "waiting list total" in options
    opt_adult_numbers = "Adult numbers" in options
    opt_awards = "awards" in options

    census_data = self.census_data
    boundary_codes = self.geography.boundary_codes
    geog_name = self.geography.metadata.key  # e.g. oslaua osward pcon lsoa11

    logger.info(f"Creating report by {geog_name} with {', '.join(options)} from {len(census_data.index)} records")

    census_dates = sorted(set(census_data["Census Date"].dropna()))
    if len(census_dates) > 1:
        if not historical:
            raise ValueError(f"Historical option not selected, but multiple censuses selected ({census_dates[0]} - {census_dates[-1]})")
        logger.info(f"Historical analysis from {census_dates[0]} to {census_dates[-1]}")

    sections_model = column_labels.sections

    dataframes = []

    if opt_groups:
        # Used to list the groups that operate within the boundary.
        # Gets all groups in the census_data dataframe and calculates the number of groups.
        logger.debug("Adding group data")
        groups = census_data[[geog_name, column_labels.name.GROUP]].copy()
        groups[column_labels.name.GROUP] = groups[column_labels.name.GROUP].str.strip()
        grouped_rgn = groups.drop_duplicates().dropna().groupby([geog_name], dropna=False)[column_labels.name.GROUP]
        dataframes.append(
            pd.DataFrame({
                "Groups": grouped_rgn.unique().apply("\n".join),
                "Number of Groups": grouped_rgn.nunique(dropna=True),
            })
        )

    if opt_section_numbers or opt_number_of_sections or opt_6_to_17_numbers or opt_waiting_list_totals or opt_adult_numbers:
        total_cols = [section_model.total for section_name, section_model in sections_model if section_name != "Network"]
        waiting_cols = [section_model.waiting_list for section_name, section_model in sections_model if section_name != "Network"]
        census_data["All"] = census_data[total_cols].sum(axis=1).astype("Int32")
        census_data["Waiting List"] = census_data[waiting_cols].sum(axis=1).astype("Int32")
        census_data["Adults"] = census_data[["Leaders", "AssistantLeaders", "SectAssistants", "OtherAdults"]].sum(axis=1).astype("Int32")

        logger.debug("Adding young people numbers")
        metric_cols = []
        rename = {}
        if opt_section_numbers:
            metric_cols += [section_model.total for section_name, section_model in sections_model if section_name != "Network"]
        if opt_number_of_sections:
            # TODO correct for pluralisation (e.g. Colony -> Colonys not Colonies)
            metric_cols += [section_model.unit_label for section_name, section_model in sections_model if section_name != "Network"]
            rename |= {section_model.unit_label: f"{section_model.type}s" for section_name, section_model in sections_model if section_name != "Network"}
        if opt_6_to_17_numbers:
            metric_cols += ["All"]
        if opt_waiting_list_totals:
            metric_cols += ["Waiting List"]
        if opt_adult_numbers:
            metric_cols += ["Adults"]

        agg = census_data.groupby([geog_name, "Census_ID"], dropna=False)[metric_cols].sum().unstack().sort_index()
        agg.columns = [f"{rename.get(key, key)}-{census_year}".replace("_total", "") for key, census_year in agg.columns]
        dataframes.append(agg)

    if opt_awards:
        if geog_name not in ONS_GEOG_NAMES:
            raise ValueError(f"{geog_name} is not a valid geography name. Valid values are {ONS_GEOG_NAMES}")

        district_id_column = column_labels.id.DISTRICT
        award_name = sections_model.Beavers.top_award[0]
        award_eligible = sections_model.Beavers.top_award_eligible[0]

        logger.debug("Creating awards mapping")
        awards_mapping = _ons_to_district_mapping(census_data, boundary_codes, geog_name)
        district_numbers = {district_id: num for district_dict in awards_mapping.values() for district_id, num in district_dict.items()}
        grouped_dist = census_data[["Queens_Scout_Awards", "Eligible4QSA", district_id_column]].groupby(district_id_column, dropna=False)
        ons_regions_in_district = grouped_dist[district_id_column].first().map(district_numbers)
        awards_per_district_per_regions = pd.DataFrame({
            # QSAs achieved in district, divided by the number of regions the district is in
            "QSA": grouped_dist["Queens_Scout_Awards"].sum() / ons_regions_in_district,
            # number of young people eligible to achieve the QSA in district, divided by the number of regions the district is in
            "qsa_eligible": grouped_dist["Eligible4QSA"].sum() / ons_regions_in_district,
        })

        # Check that our pivot keeps the total membership constant
        yp_cols = ["Beavers_total", "Cubs_total", "Scouts_total", "Explorers_total"]
        grouped_rgn = census_data.groupby([geog_name], dropna=False)
        assert int(census_data[yp_cols].sum().sum()) == int(grouped_rgn[yp_cols].sum().sum().sum())

        logger.debug("Adding awards data")
        award_total = grouped_rgn[award_name].sum()
        eligible_total = grouped_rgn[award_eligible].sum()
        award_prop = 100 * award_total / eligible_total
        award_prop[eligible_total == 0] = pd.NA

        max_value = award_prop.quantile(0.95)
        award_prop = award_prop.clip(upper=max_value)

        # Calculate the nominal QSAs per ONS region specified.
        # Divides the total number of awards by the number of Scout Districts that the ONS region is in.
        region_ids = grouped_rgn.name.first().index.to_series()
        if geog_name == "D_ID":
            district_ids = region_ids
        else:
            region_district_map = {rgn_id: list(district_dict) for rgn_id, district_dict in awards_mapping.items()}
            district_ids = region_ids.map(region_district_map)
        awards_regions_data = pd.DataFrame.from_dict(
            {idx: awards_per_district_per_regions.loc[ids].sum() for idx, ids in district_ids.items()},
            orient="index",
        )
        qsa_prop = 100 * awards_regions_data["QSA"] / awards_regions_data["qsa_eligible"]
        qsa_prop[awards_regions_data["qsa_eligible"] == 0] = pd.NA

        award_data = {
            award_name: award_total,
            award_eligible: eligible_total,
            f"%-{award_name}": award_prop,
            "QSA": awards_regions_data["QSA"],
            "%-QSA": qsa_prop,
        }
        dataframes.append(pd.DataFrame(award_data))

    # TODO find a way to keep DUMMY geography coding
    output_data = boundary_codes.reset_index(drop=True).copy()
    output_data = output_data.merge(pd.concat(dataframes, axis=1), how="left", left_on="codes", right_index=True, sort=False)

    if geog_name == "lsoa11":
        logger.debug("Loading ONS postcode data & adding IMD deciles.")
        ons_pd_data = pd.read_feather(config.SETTINGS.ons_pd.reduced, columns=["lsoa11", "imd_decile"]).drop_duplicates()
        output_data = output_data.merge(ons_pd_data, how="left", left_on="codes", right_on="lsoa11").drop(columns="lsoa11")

    if report_name:
        report_io.save_report(output_data, report_name)

    return output_data
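# Hedged usage sketch: "reports" stands for the object exposing this method;
# the option strings come from the defaults above, the report name is invented.
# boundary_report = reports.create_boundary_report(
#     options={"Groups", "Section numbers", "awards"},
#     historical=True,
#     report_name="laua_boundary_report",
# )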
def create_uptake_report(self, boundary_report: pd.DataFrame, report_name: str = None) -> pd.DataFrame:
    """Creates a report of Scouting uptake in geographic areas.

    Creates a report by the boundary that has been set; requires a boundary report to already have been run.
    Requires population data by age for the specified boundary.

    Args:
        boundary_report: Boundary report from `Reports.create_boundary_report`
        report_name: Name to save the report as

    Returns:
        Uptake data of Scouts in the boundary

    """
    metadata = self.geography.metadata
    census_data = self.census_data
    geog_key = metadata.key
    try:
        age_profile_path = config.SETTINGS.folders.national_statistical / metadata.age_profile.path
        age_profile_key = metadata.age_profile.key
    except KeyError:
        raise AttributeError(f"Population by age data not present for this {geog_key}")

    data_types = {str(key): "Int16" for key in range(5, 26)}
    try:
        age_profile_pd = pd.read_csv(age_profile_path, dtype=data_types)
    except TypeError:
        logger.error("Age profiles must be integers in each age category")
        raise

    # Calculate population per section age range
    for section, ages in SECTION_AGES.items():
        section_population = age_profile_pd[ages["ages"]].sum(axis=1)
        if ages.get("halves"):
            # Half of the members of the boundary ages count towards this section
            section_population += age_profile_pd[ages["halves"]].sum(axis=1) // 2
        age_profile_pd[f"Pop_{section}"] = section_population.astype("UInt32")
    age_profile_pd["Pop_All"] = age_profile_pd[[f"{age}" for age in range(6, 17 + 1)]].sum(axis=1).astype("UInt32")

    # merge population data
    cols = [age_profile_key] + [f"Pop_{section}" for section in SECTION_AGES.keys()] + ["Pop_All"]
    reduced_age_profile_pd = age_profile_pd[cols]

    # Pivot age profile to current geography type if needed
    pivot_key = metadata.age_profile.pivot_key
    if pivot_key and pivot_key != geog_key:
        logger.debug("Loading ONS postcode data.")
        ons_pd_data = pd.read_feather(config.SETTINGS.ons_pd.reduced, columns=[geog_key, pivot_key])
        merged_age_profile = reduced_age_profile_pd.merge(ons_pd_data, how="left", left_on=age_profile_key, right_on=pivot_key).drop(pivot_key, axis=1)
        merged_age_profile_no_na = merged_age_profile.dropna(subset=[geog_key])
        pivoted_age_profile = merged_age_profile_no_na.groupby(geog_key).sum().astype("UInt32")

        # Check we did not accidentally expand the population!
        # assert merged_age_profile["Pop_All"].sum() == reduced_age_profile_pd["Pop_All"].sum()  # this will fail
        assert pivoted_age_profile["Pop_All"].sum() == merged_age_profile_no_na["Pop_All"].sum()
        uptake_report = boundary_report.merge(pivoted_age_profile, how="left", left_on="codes", right_index=True, sort=False)
    else:
        uptake_report = boundary_report.merge(reduced_age_profile_pd, how="left", left_on="codes", right_on=age_profile_key, sort=False)
        del uptake_report[age_profile_key]

    census_ids = census_data["Census_ID"].drop_duplicates().dropna().sort_values()

    # add uptake data
    for census_id in census_ids:
        # clip here as unexpectedly large values throw off the scale bars.
        # TODO normalise unexpectedly large values so that we don't need to clip
        for section in SECTION_AGES.keys():
            uptake_section = 100 * uptake_report[f"{section}-{census_id}"] / uptake_report[f"Pop_{section}"]
            max_value = uptake_section.quantile(0.975)
            uptake_report[f"%-{section}-{census_id}"] = uptake_section.clip(upper=max_value)
        uptake_all = 100 * uptake_report[f"All-{census_id}"] / uptake_report["Pop_All"]
        max_value = uptake_all.quantile(0.975)
        uptake_report[f"%-All-{census_id}"] = uptake_all.clip(upper=max_value)
        # TODO explain 97.5th percentile clip
        # TODO check edge cases - 0 population and 0 or more scouts

    if report_name:
        report_io.save_report(uptake_report, report_name)

    return uptake_report
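# Hedged usage sketch: uptake requires the boundary report produced first; the
# object and report names here are illustrative only.
# boundary_report = reports.create_boundary_report(report_name="osward_report")
# uptake_report = reports.create_uptake_report(boundary_report, report_name="osward_uptake")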