def filter_records(data: pd.DataFrame, field: str, value_list: set, exclude_matching: bool = False, exclusion_analysis: bool = False) -> pd.DataFrame:
    """Filter the Census records on the values of a single column.

    Args:
        data: Records to filter.
        field: Name of the column whose values are tested.
        value_list: Values that `field` is tested against.
        exclude_matching: If True, drop the rows whose `field` value is in
            `value_list`. If False, keep only those rows.
        exclusion_analysis: If True, pass the original, kept and dropped
            rows to `_exclusion_analysis` before returning.

    Returns:
        The rows of `data` that pass the filter.

    """
    total_rows = data.index.size
    is_match = data[field].isin(value_list)

    if exclude_matching:
        # Keep everything that does NOT match the filter criteria
        keep_mask = ~is_match
        message = f"Selecting records that satisfy {field} not in {value_list} from {total_rows} records."
    else:
        # Keep only what matches the filter criteria
        keep_mask = is_match
        message = f"Selecting records that satisfy {field} in {value_list} from {total_rows} records."
    logger.info(message)

    kept = data.loc[keep_mask]
    logger.debug(f"Resulting in {kept.index.size} records remaining.")

    if exclusion_analysis:
        _exclusion_analysis(data, kept, data.loc[~keep_mask])

    return kept
def _clean_and_verify_postcode(census_data: pd.DataFrame) -> None:
    """Insert cleaned-postcode and postcode-validity columns, in place.

    The two new columns are placed immediately to the right of the raw
    postcode column: first the cleaned postcodes, then a validity column
    initialised to NaN.

    Args:
        census_data: table of data with a postcode column; modified in place

    """
    labels = scout_census.column_labels
    raw_label = labels.POSTCODE  # heading of the postcode column in the table
    valid_label = labels.VALID_POSTCODE

    # DataFrame.insert places the new column *at* the given position, so the
    # columns must go in ascending position order (+1, then +2) to end up
    # directly after the raw postcode column.
    raw_position = census_data.columns.get_loc(raw_label)

    logger.info("Cleaning postcodes")
    cleaned = _postcode_cleaner(census_data[raw_label])

    logger.info("Inserting columns")
    census_data.insert(raw_position + 1, CLEAN_POSTCODE_LABEL, cleaned)
    census_data.insert(raw_position + 2, valid_label, float("NaN"))
def section_history_summary(self, years: list, report_name: str = None) -> pd.DataFrame:
    """Build the history summary keyed on the "compass" column.

    Args:
        years: Census years, passed through to `_history_summary`.
        report_name: If truthy, the summary is also saved under this name
            via `report_io.save_report`.

    Returns:
        The summary table produced by `_history_summary`.

    """
    # Works effectively for years after 2017
    logger.info("Beginning section_history_summary")
    summary = self._history_summary(years, "compass ID", "compass")
    if not report_name:
        return summary
    report_io.save_report(summary, report_name)
    return summary
def load_postcode_directory(ons_pd: ONSPostcodeDirectory) -> pd.DataFrame:
    """Read the full ONS Postcode Directory CSV into a DataFrame.

    Args:
        ons_pd: Supplies the index column, the column dtypes and the list
            of fields to read.

    Returns:
        The postcode directory, indexed by `ons_pd.index_column`.

    """
    logger.info("Loading ONS postcode data.")

    # imd_decile isn't defined in the raw file
    raw_fields = [field for field in ons_pd.fields if field != "imd_decile"]

    read_options = {
        "index_col": ons_pd.index_column,
        "dtype": ons_pd.data_types,
        "usecols": raw_fields,
        "encoding": "utf-8",
    }
    return pd.read_csv(config.SETTINGS.ons_pd.full, **read_options)
def add_shapefile_data(census_data: pd.DataFrame, metadata: Boundary) -> pd.DataFrame:
    """Add shapefile data to the census records via `add_shape_data`.

    Args:
        census_data: Census records to add the shapefile data to.
        metadata: Boundary whose shapefile key/path are used, and whose
            `key` becomes the name of the resulting column.

    Returns:
        The data returned by `add_shape_data`, with the shapefile key column
        renamed to `metadata.key`.

    """
    logger.info("Adding shapefile data")
    shapefile = metadata.shapefile
    source_key = shapefile.key

    # add_shape_data returns a pair; only the first element is needed here
    merged, _points = add_shape_data(census_data, source_key, path=shapefile.path)
    return merged.rename(columns={source_key: metadata.key})
def __init__(self, map_name: str, map_title: str):
    """Initialise Map class.

    Args:
        map_name: Filename (without extension) for the saved map; the file
            is placed in the configured output folder with a .html suffix
        map_title: Title stored under the "map_title" key of the map data

    """
    logger.info("Initialising leaflet map")
    self.map: dict[str, Any] = dict(map_title=map_title)
    output_folder = config.SETTINGS.folders.output
    self.out_file = output_folder / f"{map_name}.html"
def group_history_summary(self, years: list, report_name: str = None) -> pd.DataFrame:
    """Build the history summary keyed on the Group ID column.

    The unit type of every row is overridden with "Group".

    Args:
        years: Census years, passed through to `_history_summary`.
        report_name: If truthy, the summary is also saved under this name
            via `report_io.save_report`.

    Returns:
        The summary table produced by `_history_summary`.

    """
    logger.info("Beginning group_history_summary")
    group_id_col = scout_census.column_labels.id.GROUP
    summary = self._history_summary(years, "Group ID", group_id_col, unit_type="Group")
    if not report_name:
        return summary
    report_io.save_report(summary, report_name)
    return summary
def wrapper(self, *args, **kwargs):
    # Calls `method` (a free variable supplied by the enclosing scope, which
    # is not visible from this block) and logs its name and duration.
    #
    # perf_counter() is monotonic and high resolution, so the measured
    # duration cannot be skewed (or go negative) if the system wall clock is
    # adjusted while the method runs, which time.time() does not guarantee.
    start_time = time.perf_counter()
    logger.info(f"Calling function {method.__name__}")

    # call the original method with the passed arguments and keyword arguments, and store the result
    output = method(self, *args, **kwargs)

    logger.info(f"{method.__name__} took {time.perf_counter() - start_time:.2f} seconds")

    # return the output of the original function
    return output
def _exclusion_analysis(original: pd.DataFrame, filtered: pd.DataFrame, excluded: pd.DataFrame):
    """Log how many records and members a filter removed, per section type.

    Args:
        original: The data before filtering.
        filtered: The rows that were kept.
        excluded: The rows that were removed.

    Raises:
        ValueError: If `original` lacks the unit-type column or any of the
            per-section member-total columns.

    """
    unit_type_col = scout_census.column_labels.UNIT_TYPE
    cols = {unit_type_col, *(section_model.total for section, section_model in sections_model)}
    if not set(original.columns) >= cols:
        o_cols = original.columns.to_list()
        raise ValueError(
            "Required columns are not in dataset!\n"
            f"Required columns are: {cols}.\n"
            f"Your columns are: {o_cols}"
        )

    # Calculate the number of records that have been filtered out
    original_records = original.index.size
    if original_records == 0:
        # Nothing to analyse; the percentage below would divide by zero
        logger.info("No records in the original data; skipping exclusion analysis")
        return

    excluded_records = original_records - filtered.index.size
    logger.info(f"{excluded_records} records were removed ({excluded_records / original_records * 100}% of total)")

    # Logs the number of members and % of members filtered out for each section
    for section_name, section_model in sections_model:
        logger.debug(f"Analysis of {section_name} member exclusions")
        section_type = section_model.type
        members_col = section_model.total

        excluded_sections = excluded.loc[excluded[unit_type_col] == section_type]
        excluded_members = 0
        if not excluded_sections.empty:
            logger.debug(f"Excluded sections\n{excluded_sections}")
            logger.debug(f"Finding number of excluded {section_name} by summing {members_col}")
            excluded_members = excluded_sections[members_col].sum()
            logger.debug(f"{excluded_members} {section_name} excluded")

        original_members = original.loc[original[unit_type_col] == section_type, members_col].sum()
        if original_members > 0:
            logger.info(f"{excluded_members} {section_name} members were removed ({excluded_members / original_members * 100}%) of total")
        else:
            logger.info(f"There are no {section_name} members present in data")
def save_merged_data(data: pd.DataFrame, ons_pd_publication_date: str) -> None:
    """Write the merged data to CSV and Feather, plus a CSV of failed lookups.

    The merged files are written next to the original census extract, named
    "<extract stem> with <publication date> fields". The error file goes to
    the configured output folder as "error_file.csv".

    Args:
        data: Census data
        ons_pd_publication_date: Refers to the ONS Postcode Directory's publication date

    """
    labels = scout_census.column_labels
    extract_path = config.SETTINGS.census_extract.original
    merged_stem = extract_path.parent / f"{extract_path.stem} with {ons_pd_publication_date} fields"
    errors_path = config.SETTINGS.folders.output / "error_file.csv"

    # The errors file contains all the postcodes that failed to be looked up in the ONS Postcode Directory
    error_columns = [
        "clean_postcode",
        labels.POSTCODE,
        labels.id.COMPASS,
        "type",
        "name",
        "G_name",
        "D_name",
        "C_name",
        "R_name",
        "X_name",
        "Census Date",
    ]
    has_invalid_postcode = ~data[labels.VALID_POSTCODE]
    failed_lookups = data.loc[has_invalid_postcode, error_columns]
    failed_lookups.to_csv(errors_path, index=False, encoding="utf-8-sig")

    # Write the new data to a csv file (utf-8-sig only to force excel to use UTF-8)
    logger.info("Writing merged data")
    data.to_csv(merged_stem.with_suffix(".csv"), index=False, encoding="utf-8-sig")
    data.to_feather(merged_stem.with_suffix(".feather"))
def _load_boundary(boundary_report: pd.DataFrame, boundary_metadata: config.Boundary) -> gpd.GeoDataFrame:
    """Loads a given boundary from a boundary report and metadata.

    Loads shapefile from path into GeoPandas dataframe
    Filters out unneeded shapes within all shapes loaded
    Converts from British National Grid to WGS84, as Leaflet doesn't understand BNG

    Args:
        boundary_report: A DataFrame object with boundary report data; its
            "codes" column selects which shapes are kept
        boundary_metadata: This contains shapefile paths, and labels for region codes and names

    Returns:
        GeoDataFrame with filtered and CRS transformed shapes, holding the
        columns "geometry", "shape_codes" and "shape_names"

    Raises:
        KeyError: If the shapefile key column is not present in the shapefile

    """
    shapefile = boundary_metadata.shapefile

    # Read a shape file. shapefile.path is the path to ESRI shapefile with region information
    logger.info("Loading Shapefile data")
    logger.debug(f"Shapefile path: {shapefile.path}")
    # perf_counter() is monotonic, so the elapsed time is not affected by
    # wall-clock adjustments made while the (slow) read is in progress
    start_time = time.perf_counter()
    all_shapes = gpd.read_file(shapefile.path)
    logger.info(f"Loading Shapefile data finished, {time.perf_counter() - start_time:.2f} seconds elapsed")

    if shapefile.key not in all_shapes.columns:
        raise KeyError(f"{shapefile.key} not present in shapefile. Valid columns are: {all_shapes.columns}")

    # Rename the code/name columns to fixed labels; other columns are untouched
    shapes_col_map = {shapefile.key: "shape_codes", shapefile.name: "shape_names"}
    all_shapes.columns = [shapes_col_map.get(col, col) for col in all_shapes.columns]

    # Filter and convert GeoDataFrame to world co-ordinates
    logger.info(f"Filtering {len(all_shapes.index)} shapes by shape_codes being in the codes column of the map_data")
    all_codes = set(boundary_report["codes"])
    logger.debug(f"All codes list: {all_codes}")
    in_report = all_shapes["shape_codes"].isin(all_codes)
    geo_data = all_shapes.loc[in_report, ["geometry", "shape_codes", "shape_names"]].to_crs(epsg=constants.WGS_84)
    logger.info(f"Loaded {len(geo_data.index):,} boundary shapes. Columns now in data: {[*boundary_report.columns]}.")
    return geo_data
def merge_with_postcode_directory(
    census_data: pd.DataFrame,
    ons_pd_data: pd.DataFrame,
    ons_fields_data_types: dict[str, list[str]],
) -> pd.DataFrame:
    """Clean the census postcodes and left-join the ONS Postcode Directory.

    Args:
        census_data: Census records; the postcode cleaning step adds columns
            to this frame in place.
        ons_pd_data: ONS Postcode Directory data, indexed by postcode.
        ons_fields_data_types: Passed to `_fill_unmerged_rows` to fill rows
            that found no match in the directory.

    Returns:
        Census data with the postcode directory fields merged on.

    """
    logger.info("Cleaning the postcodes")
    _clean_and_verify_postcode(census_data)

    # attempt to fix invalid postcodes
    logger.info("Adding ONS postcode directory data to Census and outputting")
    fixed = _try_fix_invalid_postcodes(census_data, ons_pd_data.index)

    # fully merge the data
    logger.info("Merging data")
    merged = fixed.merge(ons_pd_data, how="left", left_on="clean_postcode", right_index=True, sort=False)

    # fill unmerged rows with default values
    logger.info("filling unmerged rows")
    return _fill_unmerged_rows(merged, ons_fields_data_types)
def _try_fix_invalid_postcodes(census_data: pd.DataFrame, all_valid_postcodes: pd.Index) -> pd.DataFrame:
    """Try, in several passes, to give every record a valid postcode.

    Currently only implemented for sections with youth membership.
    TODO: implement for all entity types

    Methodology:
    - If section has an invalid postcode in 2017 or 2018, use 2019's if valid
      (all are valid or missing in 2019)
    - If section has no valid postcodes, use most common (mode) postcode from
      sections in group in that year, then try successive years
    - If group or district has no valid postcode in 2010-2016, use following
      years (e.g. if 2010 not valid, try 2011, 12, 13 etc.)

    Args:
        census_data: Dataframe of census data including invalid postcodes.
            The validity column is (re)written on this frame in place.
        all_valid_postcodes: All valid postcodes from the ONS Postcode Directory

    Returns:
        modified data table with more correct postcodes, on a fresh default index

    """
    logger.info("filling postcodes in sections with invalid postcodes")

    labels = scout_census.column_labels
    section_id_label = labels.id.COMPASS
    group_id_label = labels.id.GROUP
    district_id_label = labels.id.DISTRICT

    # Entity types to match against when constructing section records tables
    group_section_types = scout_census.TYPES_GROUP
    district_section_types = scout_census.TYPES_DISTRICT

    # Find which postcodes are valid
    census_data[labels.VALID_POSTCODE] = census_data[CLEAN_POSTCODE_LABEL].isin(all_valid_postcodes)

    # Sets a MultiIndex on the data table to enable fast searching and querying.
    # Larger groupings go first, towards smaller; the columns are also kept.
    hierarchy = [district_id_label, group_id_label, section_id_label, labels.CENSUS_ID]
    census_data = census_data.set_index(hierarchy, drop=False)

    # Each step is (name, merge description, entity types, id column, number).
    # The trailing number is passed straight to _run_postcode_fix_step; it
    # presumably selects the matching MultiIndex level - TODO confirm.
    fix_steps = (
        ("section", "latest Census", group_section_types | district_section_types, section_id_label, 2),
        ("group-section", "same group", group_section_types, group_id_label, 1),
        ("district-section", "same district", district_section_types, district_id_label, 0),
        ("pre 2017", "same entity", {"Group", "District"}, section_id_label, 2),
    )
    for step_arguments in fix_steps:
        census_data = _run_postcode_fix_step(census_data, all_valid_postcodes, *step_arguments)

    # Undo the MultiIndex set in this method
    return census_data.reset_index(drop=True)
def _history_summary(self, years: list, id_name: str, census_col: str, unit_type: str = None) -> pd.DataFrame:
    """Summarise, per value of `census_col`, organisation, IMD and membership history.

    Groups `self.census_data` by `census_col` and builds one output row per
    group from four tables that are joined on the group key:

    - organisational columns (first value seen in each group),
    - first/last year of return,
    - postcode and IMD values from the latest year's record with the lowest
      positive IMD rank,
    - per-year member totals per section, plus per-year adult totals.

    Args:
        years: Census years. `years[0]` and `years[-1]` are used as the
            limits of the data when labelling first/last year, and every
            year contributes a set of output columns.
        id_name: Output column name that `census_col` is renamed to.
        census_col: Column of `self.census_data` to group by.
        unit_type: If truthy, overwrites the unit type column for every row.

    Returns:
        DataFrame with the fixed descriptive columns followed by the
        per-year section and adult columns, on a default integer index.

    """
    sections_model = scout_census.column_labels.sections

    # Must have imd scores and deciles already in census_postcode_data.
    logger.info(f"Grouping data by {census_col}")
    data = self.census_data
    grouped_data = data.groupby([census_col], sort=False)

    # create dataframe of all constant values, which happen to all be scout org hierarchy related
    logger.info(f"Creating table of Scout organisational data")
    scout_org_cols = [
        census_col,
        scout_census.column_labels.UNIT_TYPE,
        scout_census.column_labels.name.GROUP,
        scout_census.column_labels.name.DISTRICT,
        scout_census.column_labels.name.COUNTY,
        scout_census.column_labels.name.REGION,
        scout_census.column_labels.name.COUNTRY,
    ]
    scout_org_data = grouped_data[scout_org_cols].first()
    # if unit_type is set the series should be overwritten with that value
    # this is for manually overwriting the unit type
    if unit_type:
        scout_org_data[scout_census.column_labels.UNIT_TYPE] = unit_type

    logger.info(f"Finding opening and closing years")
    # Takes the year column from the grouped_data object resulting in a SeriesGroupBy
    # Aggregates each group to a (min year, max year) tuple
    # Applies pd.Series to unpack the returned tuples into individual columns (0 and 1)
    # Casts to an object dtype as later we introduce text.
    years_return = grouped_data["Year"].agg(
        lambda series: (series.min(), series.max())).apply(
            pd.Series).astype(object)

    # If open in the first year of data or last year of data, add an explanatory note that the limits are not certain
    years_return[0] = years_return[0].mask(years_return[0] == years[0], f"{years[0]} or before")
    years_return[1] = years_return[1].mask(years_return[1] == years[-1], f"Open in {years[-1]}")
    years_return.columns = ["min_year", "max_year"]

    # for each dataframe in the groupby object, find the lowest IMD rank from the latest year, and return
    # values associated with that rank. This requires IMD decile to have been added beforehand
    imd_cols = ["clean_postcode", "ctry", "imd", "imd_decile"]

    def _imd_groupby(df: pd.DataFrame):
        # To find the postcode and IMD for a range of Sections and Group records across several years.
        # Find the most recent year, and then choose the Postcode with the lowest IMD Rank.
        most_recent_year = df["Year"].max()
        # NOTE(review): this assigns to the group frame handed over by
        # groupby.apply; whether that writes through to self.census_data
        # depends on the pandas version - confirm this is intended.
        df["imd"] = df["imd"].where(
            df["imd"] > 0)  # Only keep values where IMD rank is greater than 0
        most_recent_records = df[
            df["Year"] == most_recent_year]  # The latest year of records
        min_imd_records = most_recent_records.nsmallest(
            1, "imd")  # get smallest imd rank
        return min_imd_records[imd_cols]

    # per year open, record the total young people per section and the number of adults
    # uses a nested groupby for efficiency
    sections_list = [
        section_name for section_name, section_model in sections_model
        if section_name not in {"Explorers", "Network"}
    ]
    adult_cols = ["Leaders", "SectAssistants", "OtherAdults"]  # TODO re add

    def _year_groupby(df):
        # Merge the per-year dictionaries of one group into a single flat dictionary
        dicts: pd.Series = df.groupby(
            ["Year"], sort=True).apply(_section_groupby).to_list()
        output = {}
        for row in dicts:
            output |= row
        return output

    def _section_groupby(df):
        # df.name is the key of the current "Year" group
        census_year = df.name
        output = {}
        for section in sections_list:
            output[f"{section}-{census_year}"] = df[getattr(
                scout_census.column_labels.sections, section).total].sum()
        output[f"Adults-{census_year}"] = df[adult_cols].to_numpy().sum()
        return output

    # For each year, calculate and add number of beavers, cubs, scouts. Explorers, Network deliberately omitted.
    # Expand series of dictionaries to a dataframe with the same index
    logger.info(f"Creating table of members by section and adults by year")
    member_numbers_table = grouped_data.apply(_year_groupby)
    member_numbers_table = pd.DataFrame(member_numbers_table.to_list(), index=member_numbers_table.index)

    # apply the imd function and map country codes to country names
    logger.info(f"Creating table of IMD data and postcodes")
    imd_table = grouped_data.apply(_imd_groupby).droplevel(1)
    imd_table["IMD Country"] = imd_table["ctry"].map(ONS_PD.COUNTRY_CODES)

    # fmt: off
    column_renaming = {
        census_col: id_name,
        "type": "Type",
        "G_name": "Group",
        "D_name": "District",
        "C_name": "County",
        "R_name": "Region",
        "X_name": "Scout Country",
        "clean_postcode": "Postcode",
        "imd": "IMD Rank",
        "imd_decile": "IMD Decile",
        "min_year": "First Year",
        "max_year": "Last Year",
    }
    # fmt: on

    logger.info(f"Merging tables and conforming columns")
    # The group-key index is dropped after the join; the id survives as the
    # census_col column of scout_org_data, renamed to id_name
    history_summary_data = scout_org_data.join([
        imd_table, years_return, member_numbers_table
    ]).rename(columns=column_renaming).reset_index(drop=True)

    # create output columns list and add generated section names
    output_columns = [
        id_name, "Type", "Group", "District", "County", "Region",
        "Scout Country", "Postcode", "IMD Country", "IMD Rank", "IMD Decile",
        "First Year", "Last Year"
    ]
    # NOTE(review): only "Explorers" is excluded here, whereas sections_list
    # above excludes both "Explorers" and "Network". Any "Network-<year>"
    # column requested here is never computed, so it would come out empty -
    # confirm whether Network should be excluded here as well.
    for year in years:
        output_columns.extend([
            f"{section_name}-{year}"
            for section_name, section_model in sections_model
            if section_name != "Explorers"
        ])
        output_columns.append(f"Adults-{year}")

    # Selecting by output_columns fixes the column order of the result
    return pd.DataFrame(history_summary_data, columns=output_columns)
def add_meeting_places_to_map(
    self,
    sections: pd.DataFrame,
    colour_key: str,
    marker_data: set[str],
    layer_name: str = "Sections",
    cluster_markers: bool = False,
    show_layer: bool = True,
    coloured_region: set[str] = None,
    coloured_region_key: str = "",
) -> None:
    """Adds the sections provided as markers to map with the colour, and data indicated by marker_data.

    One marker is produced per postcode. Its popup lists every section at
    that postcode, grouped under district and group headings. Nothing is
    added if `sections` is empty, has no district or group IDs at all, or
    has no rows with both a district name and a postcode.

    Args:
        sections: Census records relating to Sections with lat and long Columns
        colour_key: Determines marker colour. If a column in `sections`, categorical colours. Otherwise, must be a CSS colour name.
        marker_data: Set of strings which determines content for popup, including:
            - youth membership
            - awards
        layer_name: Name of map layer for meeting places. Default = "Sections"
        cluster_markers: Whether to cluster markers on the map (currently unused by this method)
        show_layer: Whether to show the layer by default (currently unused by this method)
        coloured_region: If specified, markers on the map but not within coloured_region are grey
        coloured_region_key: Column for coloured_region boundary codes

    Raises:
        ValueError: If `layer_name` is already a key of the map data

    """
    logger.info("Adding section markers to map")
    labels = scout_census.column_labels

    # check that sections dataframe has data, and that there are any sections
    if sections.empty or (sections[labels.id.DISTRICT].dropna().empty and sections[labels.id.GROUP].dropna().empty):
        return

    # Sort sections dataframe (this also yields a new frame, so the columns
    # added below do not touch the caller's data)
    sections = sections.sort_values(labels.id.OBJECT).reset_index(drop=True)

    if layer_name in self.map:
        raise ValueError("Layer already used!")

    # Sets the map so that it opens in the right area
    valid_points = sections.loc[sections[labels.VALID_POSTCODE], ["lat", "long"]]
    self.map["bounds"] = _output_fit_bounds(
        ((valid_points.lat.min(), valid_points.long.min()), (valid_points.lat.max(), valid_points.long.max()))
    )

    include_awards_data = "awards" in marker_data
    section_names = sections["name"].astype(str)

    if "youth membership" in marker_data:
        section_type = sections[labels.UNIT_TYPE].map(constants.section_types)
        yp_total_cols = [section_model.total for section_name, section_model in labels.sections]
        # Each row only has values for one section type
        yp_totals = sections[yp_total_cols].sum(axis=1).astype(int).astype(str)
        sections["sect_overview"] = section_names + " : " + yp_totals + " " + section_type
    else:
        sections["sect_overview"] = section_names

    if include_awards_data:
        # This uses just the first top award - so only Diamond/QSA for Explorers/Network
        top_award_cols = [section_model.top_award[0] for section_name, section_model in labels.sections]
        awards = sections[top_award_cols].sum(axis=1).astype(int).astype(str)
        award_eligible_cols = [section_model.top_award_eligible[0] for section_name, section_model in labels.sections]
        eligible = sections[award_eligible_cols].sum(axis=1).astype(int).astype(str)
        sections["awards_info"] = section_names + " : " + awards + " Top Awards out of " + eligible + " eligible"

    if colour_key in sections.columns:
        sections["marker_colour"] = sections[colour_key].map(_colour_mapping(sections[colour_key]))
    else:
        sections["marker_colour"] = colour_key

    if coloured_region_key and coloured_region is not None:
        # Areas outside the region_of_colour have markers coloured grey
        sections.loc[~sections[coloured_region_key].isin(coloured_region), "marker_colour"] = "gray"

    sections["postcode"] = sections[labels.POSTCODE]
    sections["c_name"] = sections[labels.name.COUNTY]
    sections["d_name"] = sections[labels.name.DISTRICT]
    # Fill missing group names BEFORE casting to str: astype(str) turns NaN
    # into the literal string "nan", after which fillna has nothing to fill
    sections["g_name"] = sections[labels.name.GROUP].fillna("District").astype(str)

    sections_info_cols = ["postcode", "lat", "long", "marker_colour", "c_name", "d_name", "g_name", "sect_overview"]
    if include_awards_data:
        sections_info_cols.append("awards_info")
    sections_info_table = sections[sections_info_cols].dropna(subset=["d_name", "postcode"])

    if sections_info_table.empty:
        logger.warning("No sections with both a district name and a postcode; no markers added")
        return

    # set and sort index
    sections_info_table = sections_info_table.set_index(["postcode", "d_name", "g_name"], drop=True).sort_index(level=[0, 1, 2])

    # Walk the (postcode, district, group) groups in sorted order, starting a
    # new marker each time the postcode changes
    out = []
    marker = None  # marker currently being built
    html_parts: list[str] = []
    current_postcode = None
    current_district = None
    for (postcode, district_name, group_name), sub_table in sections_info_table.groupby(level=[0, 1, 2]):
        if postcode != current_postcode:
            if marker is not None:
                marker["html"] = "".join(html_parts)
                out.append(marker)
            marker = {
                "lat": round(sub_table["lat"].array[0], 4),
                "lon": round(sub_table["long"].array[0], 4),
                "col": sub_table["marker_colour"].array[0],
            }
            html_parts = []
            current_postcode = postcode
            current_district = None  # forces the district heading to be re-added

        if district_name != current_district:
            current_district = district_name
            county_name = sub_table["c_name"].array[0]
            html_parts.append(f"<h3>{district_name} ({county_name})</h3>")

        html_parts.append(f"<h4>{group_name}</h4><p align='center'>")
        html_parts.append("<br>".join(sub_table["sect_overview"]))
        if include_awards_data and group_name != "District":
            html_parts.append("<br>" + "<br>".join(sub_table["awards_info"]))
        html_parts.append("</p>")

    # The loop only emits a marker when the postcode changes, so the final
    # marker must be added explicitly
    if marker is not None:
        marker["html"] = "".join(html_parts)
        out.append(marker)

    # TODO marker cluster/feature group
    self.map[layer_name] = _output_marker_layer(layer_name, out)
def new_section_history_summary(self, years: list, report_name: str = None) -> pd.DataFrame: sections_model = scout_census.column_labels.sections # Given data on all sections, provides summary of all new sections, and # copes with the pre-2017 section reporting structure logger.info(f"Beginning new_section_history_summary for {years}") new_section_ids: list[dict] = [] logger.info( f"Getting group ID list in column {scout_census.column_labels.id.GROUP}" ) # Iterate through Groups looking for new Sections group_ids = self.census_data[ scout_census.column_labels.id.GROUP].dropna().drop_duplicates( ).to_list() logger.info(f"Found {len(group_ids)} Groups") # for each section in each group in the census # construct dict of {year: number of sections of that type open in that year} # construct list of number of sections of that type open in that year # construct list of changes in number of sections per year # if there is an increase year on year # for each year from the second year calculate the change in the number of sections # whatever happens with change # do the same for district sections (explorers) # # # . 
census_data = self.census_data.fillna({ scout_census.column_labels.id.GROUP: 0, scout_census.column_labels.id.DISTRICT: 0 }) for group_id in group_ids: logger.info(f"Investigating {group_id}") group_records = census_data.loc[census_data[ scout_census.column_labels.id.GROUP] == group_id] for section in scout_census.SECTIONS_GROUP: logger.info(f"Finding {section} sections") units_by_year = {} for year in years: section_numbers_year = group_records.loc[ group_records["Year"] == year, getattr(scout_census.column_labels.sections, section ).unit_label].sum() units_by_year[year] = section_numbers_year increments = [ units_by_year[year + 1] - units_by_year[year] for year in units_by_year.keys() if (year + 1) in units_by_year ] if max(increments) > 0: logger.debug( f"Identified year profile of sections: {units_by_year}" ) opened_sections = [] closed_sections = [] for year in years[1:]: change = units_by_year[year] - units_by_year[year - 1] if change > 0: # Extent life of current sections for open_sections in opened_sections: open_sections["years"].append(year) # Create new section record for ii in range(change): logger.debug( f"New {section} section found for {group_id} in {year}" ) opened_sections.append({ "id": group_id, "section": section, "years": [year], "nu_sections": units_by_year }) elif change == 0: # Lengthens all sections by a year for open_sections in opened_sections: open_sections["years"].append(year) elif change < 0: for ii in range(-change): # Close sections in newest first if len(opened_sections) > 0: logger.debug( f"{section} closed for {group_id} in {year}" ) closed_sections.append( opened_sections.pop(-1)) # Lengthens remaining open sections by a year for open_sections in opened_sections: open_sections["years"].append(year) logger.debug( f"For {group_id} adding\n{opened_sections + closed_sections}" ) new_section_ids += opened_sections new_section_ids += closed_sections else: logger.info(f"No new {section} sections in {group_id}") logger.info("Finding new 
Explorer Sections") # Iterate through District looking for new Sections district_ids = self.census_data[ scout_census.column_labels.id.DISTRICT].drop_duplicates().dropna( ).to_list() for district_id in district_ids: logger.info(f"Investigating {district_id}") district_records = census_data.loc[census_data[ scout_census.column_labels.id.DISTRICT] == district_id] units_by_year = {} for year in years: district_records_year = district_records.loc[ district_records["Year"] == year] units_by_year[year] = district_records_year[ sections_model.Explorers.unit_label].sum() increments = [ units_by_year[year + 1] - units_by_year[year] for year in units_by_year.keys() if (year + 1) in units_by_year ] if max(increments) > 0: opened_sections = [] closed_sections = [] for year in years[1:]: change = units_by_year[year] - units_by_year[year - 1] if change > 0: # Extent life of current sections for open_sections in opened_sections: open_sections["years"].append(year) # Create new section record for ii in range(change): opened_sections.append({ "id": district_id, "section": "Explorers", "years": [year], "nu_sections": units_by_year }) elif change == 0: # Lengthens all sections by a year for open_sections in opened_sections: open_sections["years"].append(year) elif change < 0: for ii in range(-change): # Close sections in oldest order if len(opened_sections) > 0: closed_sections.append(opened_sections.pop(-1)) for open_sections in opened_sections: open_sections["years"].append(year) logger.debug( f"For {district_id} adding\n{opened_sections + closed_sections}" ) new_section_ids += opened_sections new_section_ids += closed_sections section_details = [] for year in years: if year < 2017: section_details.append(f"{year}_Est_Members") else: section_details.append(f"{year}_Members") # fmt: off output_columns = [ "Object_ID", "Section Name", "Section", "Group_ID", "Group", "District_ID", "District", "County", "Region", "Scout Country", "Postcode", "IMD Country", "IMD Rank", "IMD Decile", 
"First Year", "Last Year", f"{years[0]}_sections", *section_details ] # fmt: on output_data = pd.DataFrame(columns=output_columns) logger.info( f"Start iteration through {len(new_section_ids)} new Sections") used_compass_ids = set() count = 0 total = len(new_section_ids) new_sections_id: dict for new_sections_id in new_section_ids: section_data = {} logger.debug(f"Recording {new_sections_id}") count += 1 logger.info(f"{count} of {total}") section_id = new_sections_id["id"] open_years = new_sections_id["years"] section = new_sections_id["section"] section_type = getattr(scout_census.column_labels.sections, section).type if section in scout_census.SECTIONS_GROUP: records = census_data.loc[census_data[ scout_census.column_labels.id.GROUP] == section_id] section_data["Group_ID"] = records[ scout_census.column_labels.id.GROUP].unique()[0] section_data["Group"] = records[ scout_census.column_labels.name.GROUP].unique()[0] elif section in scout_census.SECTIONS_DISTRICT: records = census_data.loc[census_data[ scout_census.column_labels.id.DISTRICT] == section_id] section_data["Group_ID"] = "" section_data["Group"] = "" else: raise Exception( f"{section} neither belongs to a Group or District. 
id = {new_sections_id}" ) for year in open_years: members_cols = getattr(scout_census.column_labels.sections, section).total year_records = records.loc[records["Year"] == year] if year >= 2017: compass_id = section_data.get("Object_ID") section_year_records = year_records.loc[records[ scout_census.column_labels.UNIT_TYPE] == section_type] if compass_id: section_record = section_year_records.loc[ section_year_records["Object_ID"] == compass_id] section_data[f"{year}_Members"] = section_record[ members_cols].sum() else: section_year_ids: pd.Series = section_year_records[ "Object_ID"].drop_duplicates() if open_years[0] >= 2017: # If section became open after 31st January 2017 then can identify by Object_ID id last_year_records = records.loc[records["Year"] == (year - 1)] old_section_ids = last_year_records[ "Object_ID"].unique() opened_section_ids = section_year_ids[ ~section_year_ids.isin(old_section_ids)] if len(opened_section_ids) > 1: logger.info( f"{len(opened_section_ids)} sections opened" ) unused_ids = opened_section_ids[ ~opened_section_ids.isin(used_compass_ids)] compass_id = unused_ids.iloc[ 0] if not unused_ids.empty else opened_section_ids.iloc[ -1] elif len(opened_section_ids) == 0: logger.error( f"No sections opened\n{year}: {section_year_ids}\n{year-1}: {old_section_ids}" ) elif len(opened_section_ids) == 1: compass_id = opened_section_ids.iloc[0] logger.debug(f"Assigned id: {compass_id}") section_data["Object_ID"] = compass_id used_compass_ids.add(compass_id) section_data[ f"{year}_Members"] = section_year_records.loc[ section_year_records["Object_ID"] == compass_id, members_cols].sum() else: compass_id = section_year_ids.max() if compass_id in used_compass_ids: section_year_ids.sort_values(ascending=False) unused_ids = section_year_ids[ ~section_year_ids.isin(used_compass_ids)] if not unused_ids.empty: compass_id = unused_ids.iloc[0] else: compass_id = section_year_ids.iloc[0] section_data["Object_ID"] = compass_id used_compass_ids.add(compass_id) 
total_members = section_year_records.loc[ section_year_records["Object_ID"] == compass_id, members_cols].sum() logger.debug( f"{section} in {section_id} in {year} found {total_members} members" ) section_data[f"{year}_Members"] = total_members else: year_before_section_opened = open_years[0] - 1 year_before_records = records.loc[ records["Year"] == year_before_section_opened] number_of_new_sections = new_sections_id["nu_sections"][ open_years[0]] - new_sections_id["nu_sections"][ year_before_section_opened] new_members = year_records[members_cols].sum() old_members = year_before_records[members_cols].sum() additional_members = (new_members - old_members) / number_of_new_sections if additional_members < 0: logger.warning( f"{section_id} increased number of {section} sections but membership decreased by {additional_members}" ) logger.debug( f"{section} in {section_id} in {year} found {additional_members} members" ) section_data[f"{year}_Est_Members"] = additional_members closed_years = [year for year in years if year not in open_years] for year in closed_years: if year >= 2017: section_data[f"{year}_Members"] = 0 else: section_data[f"{year}_Est_Members"] = 0 section_data[f"{years[0]}_sections"] = new_sections_id[ "nu_sections"][years[0]] section_records = None if section_data.get("Object_ID"): section_records = records.loc[records["Object_ID"] == section_data.get("Object_ID")] section_data["Section Name"] = section_records["name"].unique( )[0] else: if open_years[-1] < 2017: if section in scout_census.SECTIONS_GROUP: section_records = records.loc[ records[scout_census.column_labels.UNIT_TYPE] == scout_census.UNIT_LEVEL_GROUP] elif section in scout_census.SECTIONS_DISTRICT: section_records = records.loc[ records[scout_census.column_labels.UNIT_TYPE] == scout_census.UNIT_LEVEL_DISTRICT] elif open_years[-1] == 2017: section_records = records.loc[records[ scout_census.column_labels.UNIT_TYPE] == section_type] else: raise Exception( f"Unable to find section records for 
{new_section_ids}" ) section_data["Section"] = section section_data["District_ID"] = section_records[ scout_census.column_labels.id.DISTRICT].unique()[0] section_data["District"] = section_records[ scout_census.column_labels.name.DISTRICT].unique()[0] section_data["County"] = section_records["C_name"].unique()[0] section_data["Region"] = section_records["R_name"].unique()[0] section_data["Scout Country"] = section_records["X_name"].unique( )[0] if open_years[0] == years[0]: section_data["First Year"] = f"{years[0]} or before" else: section_data["First Year"] = open_years[0] if open_years[-1] == years[-1]: section_data["Last Year"] = f"Open in {years[-1]}" else: section_data["Last Year"] = open_years[-1] # To find the postcode and IMD for a range of Sections and Group # records across several years. Find the most recent year, and then # choose the Postcode, where the IMD Rank is the lowest. most_recent_year = open_years[-1] logger.debug(f"Checking {most_recent_year}") most_recent = section_records.loc[section_records["Year"] == most_recent_year] if most_recent.shape[0] == 1: most_recent = most_recent.iloc[0] elif most_recent.shape[0] == 0: logger.warning("Inconsistent ids") if section in scout_census.SECTIONS_GROUP: # In the event that the Object_IDs aren't consistent, pick a section in the group that's most recent # is only applicable after 2017, so sections are assumed to exist. 
most_recent = records.loc[ (records[scout_census.column_labels.id.GROUP] == section_data["Group_ID"]) & (records[scout_census.column_labels.UNIT_TYPE] == section_type) & (records["Year"] == most_recent_year)].iloc[0] elif section in scout_census.SECTIONS_DISTRICT: most_recent_record = records.loc[ (records[scout_census.column_labels.id.DISTRICT] == section_data["District_ID"]) & (records[scout_census.column_labels.UNIT_TYPE] == section_type) & (records["Year"] == most_recent_year)] if most_recent_record.empty: logger.error( f"No records found with D_ID = {section_data['District_ID']} in {most_recent_year} that are {section}" ) most_recent = most_recent_record.iloc[0] else: logger.warning("Multiple sections found, assigning a section") most_recent = most_recent.iloc[0] postcode_valid = most_recent.at["postcode_is_valid"] # logger.debug(f"Identified:\n{most_recent} determined postcode valid:\n{postcode_valid}\n{postcode_valid == 1}\n{postcode_valid == 1}") # add postcode if postcode_valid: logger.debug( f"Adding postcode {most_recent.at[scout_census.column_labels.POSTCODE]}" ) section_data["Postcode"] = most_recent.at[ scout_census.column_labels.POSTCODE] country = ONS_PD.COUNTRY_CODES.get(most_recent.at["ctry"]) section_data[ "IMD Country"] = country if country else scout_census.DEFAULT_VALUE section_data["IMD Decile"] = most_recent.at["imd_decile"] section_data["IMD Rank"] = most_recent.at["imd"] else: section_data["Postcode"] = scout_census.DEFAULT_VALUE section_data["IMD Country"] = scout_census.DEFAULT_VALUE section_data["IMD Decile"] = scout_census.DEFAULT_VALUE section_data["IMD Rank"] = scout_census.DEFAULT_VALUE section_data_df = pd.DataFrame([section_data], columns=output_columns) output_data = pd.concat([output_data, section_data_df], axis=0) output_data.reset_index(drop=True, inplace=True) if report_name: report_io.save_report(output_data, report_name) return output_data
def close(start_time: float) -> None:
    """Log how many seconds have elapsed since ``start_time``.

    Args:
        start_time: Reference timestamp on the same clock as ``time.time()``,
            normally captured when the script started.

    """
    elapsed_seconds = time.time() - start_time
    logger.info(f"Script finished, {elapsed_seconds:.2f} seconds elapsed.")
def save_report(report: pd.DataFrame, report_name: str) -> None:
    """Write ``report`` as ``<report_name>.csv`` in the configured output folder.

    Args:
        report: Table to write; the index is not written.
        report_name: File name without the ``.csv`` extension.

    """
    logger.info(f"Writing to {report_name}")
    destination = config.SETTINGS.folders.output / f"{report_name}.csv"
    # utf-8-sig prefixes the file with a byte-order mark
    report.to_csv(destination, index=False, encoding="utf-8-sig")
import geopandas as gpd
import pandas as pd

from incognita.data.ons_pd import ONS_POSTCODE_DIRECTORY_MAY_20 as ONS_PD
from incognita.logger import logger
from incognita.logger import set_up_logger
from incognita.utility import config
from incognita.utility import constants
from incognita.utility import deciles

# Script entry point: loads the full ONS Postcode Directory CSV and adds an
# IMD decile column to it.
if __name__ == "__main__":
    set_up_logger()

    logger.info("Starting")

    # Candidate columns, narrowed below to those listed in ONS_PD.fields
    to_keep = ("oscty", "oslaua", "osward", "ctry", "rgn", "pcon", "lsoa11", "msoa11", "imd", "imd_decile")  # 'lat', 'long', 'nys_districts', 'pcd'
    fields = [f for f in to_keep if f in ONS_PD.fields]

    # Load Full ONS Postcode Directory
    data = pd.read_csv(config.SETTINGS.ons_pd.full, dtype=ONS_PD.data_types, encoding="utf-8")
    logger.info("Loaded data")

    # Untouched copy of the loaded table.
    # NOTE(review): neither `orig` nor `fields` is used in the visible part of this script - confirm they are needed
    orig = data.copy()
    logger.info("DEBUG - copied original data")

    # Add IMD Decile
    # Computed from the IMD value and country code columns, stored as nullable unsigned 8-bit integers
    data["imd_decile"] = deciles.calc_imd_decile(data["imd"], data["ctry"], ONS_PD).astype("UInt8")
def create_boundary_report(self, options: set[str] = None, historical: bool = False, report_name: str = None) -> pd.DataFrame:
    """Summarise the census data by the boundary set on ``self.geography``.

    One row is produced per boundary code in ``self.geography.boundary_codes``;
    the selected metrics are left-joined onto those codes, so boundaries with
    no census records keep missing values.

    Side effect: when any membership metric is requested, the columns "All",
    "Waiting List" and "Adults" are assigned onto the object obtained from
    ``self.census_data``.

    Args:
        options: Names of the data to include in the report. Recognised values
            are "Groups", "Section numbers", "Number of Sections",
            "6 to 17 numbers", "waiting list total", "Adult numbers" and
            "awards". If None, all of these except "Adult numbers" are used.
            If no recognised value is present, a warning is logged and the
            report contains the boundary codes only.
        historical: Must be True when the census data covers more than one
            census date, to confirm that a multi-year report is intended.
        report_name: If truthy, the report is also saved under this name via
            ``report_io.save_report``.

    Returns:
        The report, one row per boundary code.

    Raises:
        ValueError: If several census dates are present and ``historical`` is
            False, or if awards are requested for a geography that is not in
            ``ONS_GEOG_NAMES``.
        AssertionError: If grouping by the geography changes the total
            young-people membership (internal consistency check).

    """
    # Set default option set for `options`
    if options is None:
        options = {"Number of Sections", "Groups", "Section numbers", "6 to 17 numbers", "awards", "waiting list total"}

    opt_groups = "Groups" in options
    opt_section_numbers = "Section numbers" in options
    opt_number_of_sections = "Number of Sections" in options
    opt_6_to_17_numbers = "6 to 17 numbers" in options
    opt_waiting_list_totals = "waiting list total" in options
    opt_adult_numbers = "Adult numbers" in options
    opt_awards = "awards" in options

    census_data = self.census_data
    boundary_codes = self.geography.boundary_codes
    geog_name = self.geography.metadata.key  # e.g oslaua osward pcon lsoa11
    logger.info(f"Creating report by {geog_name} with {', '.join(options)} from {len(census_data.index)} records")

    census_dates = sorted(set(census_data["Census Date"].dropna()))
    if len(census_dates) > 1:
        if not historical:
            raise ValueError(f"Historical option not selected, but multiple censuses selected ({census_dates[0]} - {census_dates[-1]})")
        logger.info(f"Historical analysis from {census_dates[0]} to {census_dates[-1]}")

    sections_model = column_labels.sections

    # Each entry is indexed by boundary code; they are joined column-wise at the end
    dataframes = []

    if opt_groups:
        # Used to list the groups that operate within the boundary.
        # Gets all groups in the census_data dataframe and calculates the
        # number of groups.
        logger.debug("Adding group data")
        groups = census_data[[geog_name, column_labels.name.GROUP]].copy()
        groups[column_labels.name.GROUP] = groups[column_labels.name.GROUP].str.strip()
        grouped_rgn = groups.drop_duplicates().dropna().groupby([geog_name], dropna=False)[column_labels.name.GROUP]
        dataframes.append(
            pd.DataFrame(
                {
                    "Groups": grouped_rgn.unique().apply("\n".join),
                    "Number of Groups": grouped_rgn.nunique(dropna=True),
                }
            )
        )

    if opt_section_numbers or opt_number_of_sections or opt_6_to_17_numbers or opt_waiting_list_totals or opt_adult_numbers:
        # Every section model except Network; built once and reused for each metric below
        non_network_sections = [section_model for section_name, section_model in sections_model if section_name != "Network"]
        total_cols = [section_model.total for section_model in non_network_sections]
        waiting_cols = [section_model.waiting_list for section_model in non_network_sections]

        # Row-wise totals, assigned onto the census frame itself
        census_data["All"] = census_data[total_cols].sum(axis=1).astype("Int32")
        census_data["Waiting List"] = census_data[waiting_cols].sum(axis=1).astype("Int32")
        census_data["Adults"] = census_data[["Leaders", "AssistantLeaders", "SectAssistants", "OtherAdults"]].sum(axis=1).astype("Int32")

        logger.debug("Adding young people numbers")
        metric_cols = []
        rename = {}
        if opt_section_numbers:
            metric_cols += total_cols
        if opt_number_of_sections:
            # TODO correct for pluralisation (e.g. Colony -> Colonys not Colonies)
            metric_cols += [section_model.unit_label for section_model in non_network_sections]
            rename |= {section_model.unit_label: f"{section_model.type}s" for section_model in non_network_sections}
        if opt_6_to_17_numbers:
            metric_cols += ["All"]
        if opt_waiting_list_totals:
            metric_cols += ["Waiting List"]
        if opt_adult_numbers:
            metric_cols += ["Adults"]

        # Sum per (boundary, census), then pivot the census level into the columns
        agg = census_data.groupby([geog_name, "Census_ID"], dropna=False)[metric_cols].sum().unstack().sort_index()
        # Flatten the (metric, census) column pairs to "<metric>-<census>" labels
        agg.columns = [f"{rename.get(key, key)}-{census_year}".replace("_total", "") for key, census_year in agg.columns]
        dataframes.append(agg)

    if opt_awards:
        if geog_name not in ONS_GEOG_NAMES:
            raise ValueError(f"{geog_name} is not a valid geography name. Valid values are {ONS_GEOG_NAMES}")

        district_id_column = column_labels.id.DISTRICT
        award_name = sections_model.Beavers.top_award[0]
        award_eligible = sections_model.Beavers.top_award_eligible[0]

        logger.debug("Creating awards mapping")
        awards_mapping = _ons_to_district_mapping(census_data, boundary_codes, geog_name)
        # Flatten the per-region dictionaries into one district id -> number lookup
        district_numbers = {district_id: num for district_dict in awards_mapping.values() for district_id, num in district_dict.items()}
        grouped_dist = census_data[["Queens_Scout_Awards", "Eligible4QSA", district_id_column]].groupby(district_id_column, dropna=False)
        ons_regions_in_district = grouped_dist[district_id_column].first().map(district_numbers)
        awards_per_district_per_regions = pd.DataFrame(
            {
                # QSAs achieved in district, divided by the number of regions the district is in
                "QSA": grouped_dist["Queens_Scout_Awards"].sum() / ons_regions_in_district,
                # number of young people eligible to achieve the QSA in district, divided by the number of regions the district is in
                "qsa_eligible": grouped_dist["Eligible4QSA"].sum() / ons_regions_in_district,
            }
        )

        # Check that our pivot keeps the total membership constant
        yp_cols = ["Beavers_total", "Cubs_total", "Scouts_total", "Explorers_total"]
        grouped_rgn = census_data.groupby([geog_name], dropna=False)
        assert int(census_data[yp_cols].sum().sum()) == int(grouped_rgn[yp_cols].sum().sum().sum())

        logger.debug("Adding awards data")
        award_total = grouped_rgn[award_name].sum()
        eligible_total = grouped_rgn[award_eligible].sum()
        award_prop = 100 * award_total / eligible_total
        award_prop[eligible_total == 0] = pd.NA  # no eligible members: percentage is undefined
        # Cap the percentage at its 95th percentile
        max_value = award_prop.quantile(0.95)
        award_prop = award_prop.clip(upper=max_value)

        # calculates the nominal QSAs per ONS region specified.
        # Divides total # of awards by the number of Scout Districts that the ONS Region is in
        region_ids = grouped_rgn.name.first().index.to_series()
        if geog_name == "D_ID":
            district_ids = region_ids
        else:
            region_district_map = {rgn_id: list(district_dict) for rgn_id, district_dict in awards_mapping.items()}
            district_ids = region_ids.map(region_district_map)
        awards_regions_data = pd.DataFrame.from_dict(
            {idx: awards_per_district_per_regions.loc[ids].sum() for idx, ids in district_ids.items()},
            orient="index",
        )
        qsa_prop = 100 * awards_regions_data["QSA"] / awards_regions_data["qsa_eligible"]
        qsa_prop[awards_regions_data["qsa_eligible"] == 0] = pd.NA

        award_data = {
            award_name: award_total,
            award_eligible: eligible_total,
            f"%-{award_name}": award_prop,
            "QSA": awards_regions_data["QSA"],
            "%-QSA": qsa_prop,
        }
        dataframes.append(pd.DataFrame(award_data))

    # TODO find a way to keep DUMMY geography coding
    output_data = boundary_codes.reset_index(drop=True).copy()
    if dataframes:
        output_data = output_data.merge(pd.concat(dataframes, axis=1), how="left", left_on="codes", right_index=True, sort=False)
    else:
        # pd.concat raises "No objects to concatenate" on an empty list, so skip the merge
        logger.warning(f"None of the requested options ({', '.join(options)}) are recognised; the report contains boundary codes only")

    if geog_name == "lsoa11":
        logger.debug("Loading ONS postcode data & Adding IMD deciles.")
        ons_pd_data = pd.read_feather(config.SETTINGS.ons_pd.reduced, columns=["lsoa11", "imd_decile"]).drop_duplicates()
        output_data = output_data.merge(ons_pd_data, how="left", left_on="codes", right_on="lsoa11").drop(columns="lsoa11")

    if report_name:
        report_io.save_report(output_data, report_name)

    return output_data
def add(self, number1: Real, number2: Real) -> Real:
    """Return the sum of ``number1`` and ``number2``, logging an info message first."""
    logger.info("Example Function")
    total = number1 + number2
    return total
import time

import geopandas as gpd
import pandas as pd

from incognita.data.scout_census import load_census_data
from incognita.geographies import district_boundaries
from incognita.logger import logger
from incognita.utility import config
from incognita.utility import filter
from incognita.utility import timing

# Script entry point: loads and filters the census data, then loads an
# outline of the United Kingdom.
if __name__ == "__main__":
    start_time = time.time()
    logger.info(f"Starting at {time.strftime('%H:%M:%S', time.localtime(start_time))}")

    census_data = load_census_data()
    # Keep only the records whose Census_ID is 20
    census_data = filter.filter_records(census_data, "Census_ID", {20})
    # Remove Jersey, Guernsey, and Isle of Man as they have invalid lat/long coordinates for their postcodes
    census_data = filter.filter_records(census_data, "C_name", {"Bailiwick of Guernsey", "Isle of Man", "Jersey"}, exclude_matching=True)

    # low resolution shape data
    world_low_res = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
    # Geometry of the row named "United Kingdom"
    uk_shape = world_low_res.loc[world_low_res.name == "United Kingdom", "geometry"].array.data[0]
    # # high resolution shape data
    # uk_shape = gpd.read_file(r"S:\Development\incognita\data\UK Shape\GBR_adm0.shp")["geometry"].array.data[0]
def add_areas(
    self,
    var_col: str,
    tooltip: str,
    layer_name: str,
    boundary_report: pd.DataFrame,
    boundary_metadata: config.Boundary,
    show: bool = False,
    colour_bounds: list[int] = None,
    significance_threshold: float = 2.5,
    categorical: bool = False,
) -> None:
    """Creates a 2D colouring with geometry specified by the boundary

    Stores a colour scale under ``self.map["colour_map"]`` and the shape layer
    under ``self.map[f"layer_{layer_name}"]``.

    Args:
        var_col: Data column to use for choropleth colour values
        tooltip: Mouseover tooltip for each boundary (e.g. "% Change 6-18")
        layer_name: Legend key for the layer (e.g. "% Change 6-18 (Counties)")
        boundary_report: Table with a "codes" column and the ``var_col`` column
        boundary_metadata: Boundary configuration; its ``api`` settings (url,
            query parameters, code and name columns) are passed to the layer
        show: If True, show the layer by default
        colour_bounds: Colour breaks to create a fixed legend. If None (and the
            data are not categorical), the breaks are the distinct 20th, 40th,
            60th, 80th and 100th percentiles of the non-zero, non-missing values
        significance_threshold: If an area's value is significant enough to be displayed
        categorical: If the data are categorical

    Raises:
        KeyError: If ``var_col`` is not a column of ``boundary_report``.

    """
    data = boundary_report
    if var_col not in data.columns:
        logger.error(f"{var_col} is not a valid column in the data. \n" f"Valid columns include {data.columns}")
        raise KeyError(f"{var_col} is not a valid column in the data.")

    colours = list(reversed(("#4dac26", "#b8e186", "#f1b6da", "#d01c8b")))
    # Series of the chosen values, indexed by boundary code
    choropleth_data = data[["codes", var_col]].set_index("codes")[var_col]

    # Set value col properties to use for a particular boundary
    logger.info(f"Setting choropleth column to {var_col} (displayed: {tooltip})")

    # Zeros and missing values are left out when deriving categories / colour breaks
    non_zero_choropleth_data = choropleth_data[choropleth_data != 0].dropna().sort_values()
    colour_map_id = "0"
    if categorical:
        categories = [*non_zero_choropleth_data.drop_duplicates()]
        self.map["colour_map"] = _output_colour_scale_categorical(
            colour_map_id,
            layer_name,
            colours,
            classes=categories,
            legend_categories=categories,
        )
    else:
        if colour_bounds is None:
            quantiles = (20, 40, 60, 80, 100)
            try:
                # `interpolation=` is the deprecated name (since NumPy 1.22) of `method=`
                percentile_values = np.percentile(non_zero_choropleth_data, quantiles, method="nearest")
            except TypeError:
                # Older NumPy only accepts the keyword under its previous name
                percentile_values = np.percentile(non_zero_choropleth_data, quantiles, interpolation="nearest")
            # np.unique sorts and removes repeated break values
            colour_bounds = np.unique(percentile_values).tolist()
        self.map["colour_map"] = _output_colour_scale_ranges(
            colour_map_id,
            layer_name,
            colours,
            classes=colour_bounds,
            # Consecutive (lower, upper) pairs of break values
            legend_ranges=list(zip(colour_bounds, colour_bounds[1:])),
        )
        logger.info(f"Colour scale boundary values {colour_bounds}")

    logger.info("Merging geo_json on shape_codes from shapefile with codes from boundary report")
    metadata = boundary_metadata
    self.map[f"layer_{layer_name}"] = _output_shape_layer(
        legend_key=layer_name,  # the name of the Layer, as it will appear in the layer controls
        colour_data=choropleth_data.to_dict(),
        api_base=metadata.api.url,
        query_params=metadata.api.query_params,
        colour_scale_id=colour_map_id,
        threshold=significance_threshold,
        code_col=metadata.api.codes_col,
        name_col=metadata.api.names_col,
        measure_name=tooltip,
        show=show,
    )
def _run_postcode_fix_step(data: pd.DataFrame, all_valid_postcodes: pd.Index, invalid_type: str, fill_from: str, entity_types: set[str], column_label: str, index_level: int) -> pd.DataFrame:
    """Replace invalid clean postcodes with a valid one sharing the same ID.

    Method:
        Restrict to records of the given entity types. Among those with a
        valid postcode, sort the index so that its fourth level is descending
        and take the first clean postcode per value of ``index_level``. With
        census IDs in that fourth level this is the newest valid postcode.
        Records with an invalid postcode then look up a replacement through
        their ``column_label`` value; records with no replacement are left
        unchanged. Finally the validity flag is recomputed for every row.

    TODO change to use modal result instead of first (If section has no valid
     postcodes, use most common (modal) postcode from sections in group in that
     year, then try successive years)

    Args:
        data: Census data; modified in place and returned
        all_valid_postcodes: Index of all valid postcodes in the ONS postcode directory
        invalid_type: Which type of issue are we fixing (for log message)
        fill_from: Where are we pulling valid postcodes from (for log message)
        entity_types: Entity types to filter the fixing on (e.g. Colony, Group, Network, District)
        column_label: Column holding the ID used to look up a replacement postcode
        index_level: Level of the MultiIndex to group valid postcodes by

    Returns:
        Updated census data

    """
    # Index level: 0=District; 1=Group; 2=Section; 3=Census_ID
    logger.info(f"Fill invalid {invalid_type} postcodes with valid section postcodes from {fill_from}")

    unit_type_col = scout_census.column_labels.UNIT_TYPE
    validity_col = scout_census.column_labels.VALID_POSTCODE

    # Only the three columns needed below are kept, to limit memory use
    wanted_columns = [validity_col, column_label, CLEAN_POSTCODE_LABEL]
    in_scope = data[unit_type_col].isin(entity_types)
    subset = data.loc[in_scope, wanted_columns]

    valid_count_before = data[validity_col].to_numpy().sum()

    # Newest valid clean postcode for each value of the chosen index level
    has_valid_postcode = subset[validity_col]
    valid_clean_postcodes = subset.loc[has_valid_postcode, CLEAN_POSTCODE_LABEL]
    newest_first = valid_clean_postcodes.sort_index(ascending=(True, True, True, False))
    replacement_lookup = newest_first.groupby(level=index_level).first()

    # Look up a replacement for every record whose postcode is invalid,
    # dropping the records for which none was found
    ids_to_fix = subset.loc[~has_valid_postcode, column_label]
    replacements = ids_to_fix.map(replacement_lookup).dropna()

    # Overwrite the clean postcode of the records that got a replacement
    data.loc[replacements.index, CLEAN_POSTCODE_LABEL] = replacements

    # Update valid postcode status
    data[validity_col] = data[CLEAN_POSTCODE_LABEL].isin(all_valid_postcodes)

    valid_count_after = data[validity_col].to_numpy().sum()
    logger.info(f"change in valid postcodes is: {valid_count_after - valid_count_before}")

    return data
import time

from incognita.data.scout_census import load_census_data
from incognita.logger import logger
from incognita.reports.history_summary import HistorySummary
from incognita.utility import filter
from incognita.utility import timing

# Script entry point: builds the history summary of newly opened sections for
# the selected censuses.
if __name__ == "__main__":
    start_time = time.time()
    logger.info(f"Starting at {time.strftime('%H:%M:%S', time.localtime(start_time))}")

    census_ids = {15, 16, 17, 18, 19, 20}

    census_data = load_census_data()
    # Keep only the selected censuses
    census_data = filter.filter_records(census_data, "Census_ID", census_ids)
    # Keep only records whose X_name is one of the four listed countries
    census_data = filter.filter_records(census_data, "X_name", {"England", "Scotland", "Wales", "Northern Ireland"})

    # If filtering on IMD, remove NA values
    # census_data = filter.filter_records(census_data, "imd_decile", ["nan"], exclude_matching=True)
    # census_data = filter.filter_records(census_data, "imd_decile", [1, 2, 3])

    section_history = HistorySummary(census_data)
    # Census IDs are passed in ascending order.
    # NOTE(review): presumably report_name makes the summary save a report named "opened_section_data" - confirm
    section_history.new_section_history_summary(sorted(census_ids), report_name="opened_section_data")
    # Logs the elapsed run time
    timing.close(start_time)
# The errors file contains all the postcodes that failed to be looked up in the ONS Postcode Directory error_output_fields = [ postcode_merge_column, original_postcode_label, compass_id_label, "type", "name", "G_name", "D_name", "C_name", "R_name", "X_name", "Census Date" ] data.loc[~data[valid_postcode_label], error_output_fields].to_csv(error_output_path, index=False, encoding="utf-8-sig") # Write the new data to a csv file (utf-8-sig only to force excel to use UTF-8) logger.info("Writing merged data") data.to_csv(output_path.with_suffix(".csv"), index=False, encoding="utf-8-sig") data.to_feather(output_path.with_suffix(".feather")) if __name__ == "__main__": # Turn on logging set_up_logger() logger.info(f"Starting at {time.strftime('%H:%M:%S', time.localtime())}") start_time = time.time() process_census_extract() logger.info( f"Script finished, {time.time() - start_time:.2f} seconds elapsed.")