def __init__(self, region, resolution, cache_dir, upstream_dir, input_dir, output_dir):
    self.region = region
    self.resolution = resolution
    self.upstream_dir = upstream_dir
    self.input_dir = input_dir
    self.output_dir = output_dir
    self.scotland = self.region[0] == "S"
    # load the subnational household projections
    self.snhpdata = SNHPData.SNHPData(cache_dir)
    # old way (needed for pre-2014/2016-based data for Wales/Scotland/NI)
    if not self.scotland:
        self.snhp_fallback = pd.read_csv(self.input_dir + "/snhp2014.csv", index_col="AreaCode")
    else:
        self.snhp_fallback = pd.read_csv(self.input_dir + "/snhp2016_sc.csv", index_col="GEOGRAPHY_CODE")
    # load the output from the microsynthesis (census 2011 based)
    self.base_population = self.__get_base_populationdata()
def fetch_dummy_data_into_raw():
    if os.environ["NOMIS_API_KEY"] != "DUMMY":
        print("This function requires NOMIS_API_KEY == 'DUMMY' in the environment.\n"
              "Currently set to {}".format(os.environ["NOMIS_API_KEY"]))
        sys.exit()
    NPPData.NPPData(test_data_dir)
    MYEData.MYEData(test_data_dir)
    SNPPData.SNPPData(test_data_dir)
    SNHPData.SNHPData(test_data_dir)
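# Usage sketch (an illustration, not part of the original source): the guard
# above only passes when the dummy key is set before any of the wrapped data
# classes touch the NOMIS API, e.g.
#
#   import os
#   os.environ["NOMIS_API_KEY"] = "DUMMY"
#   fetch_dummy_data_into_raw()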
def setUp(self):
    """
    Check the env is set up correctly for the tests (it's too late to
    override the env in this function, unfortunately).
    """
    self.mye = MYEData.MYEData(TEST_DATA_DIR)
    self.npp = NPPData.NPPData(TEST_DATA_DIR)
    self.snpp = SNPPData.SNPPData(TEST_DATA_DIR)
    self.snhp = SNHPData.SNHPData(TEST_DATA_DIR)
    # fix issue with test dataset
    self.snpp.data[utils.EN].PROJECTED_YEAR_NAME = self.snpp.data[utils.EN].PROJECTED_YEAR_NAME.astype(int)
def __init__(self, input_files, ht_trans, cache_dir):
    self.cache_dir = cache_dir
    # guard for no input data (if more MPI processes than input files)
    if not len(input_files):
        raise ValueError("proc {}/{}: no input data".format(no.mpi.rank(), no.mpi.size()))
    self.lads = [file.split("_")[1] for file in input_files]
    # assumes all files are in the same dir
    self.data_dir = os.path.dirname(input_files[0])
    # store as a single DataFrame with a LAD column
    self.pop = pd.DataFrame()
    for file in input_files:
        no.log("reading initial population: %s" % file)
        data = pd.read_csv(file)
        data["LAD"] = file.split("_")[1]
        self.pop = pd.concat([self.pop, data], ignore_index=True)
    # no.log(self.pop.LC4408_C_AHTHUK11.unique())
    # self.cat = self.pop.LC4408_C_AHTHUK11.unique()
    # "C_AHTHUK11": {
    #   "0": "All categories: Household type",
    #   "1": "One person household",
    #   "2": "Married or same-sex civil partnership couple household",
    #   "3": "Cohabiting couple household",
    #   "4": "Lone parent household",
    #   "5": "Multi-person household"
    # }
    self.cat = {"LC4408_C_AHTHUK11": np.array([1, 2, 3, 4, 5])}
    # NOTE: pandas stores data column-major but the numpy view is row-major, so the
    # matrix looks right but is actually transposed. (No amount of transposing changes
    # the memory layout; it only changes the view.) The C++ code assumes the transition
    # matrix is column-major, i.e. columns sum to unity, not rows.
    self.t = pd.read_csv(ht_trans).set_index("initial state").values / 100.0
    # check rows sum to unity
    assert np.allclose(np.sum(self.t, 1), np.ones(len(self.t)))
    # TODO get snhp
    self.snhp = SNHPData.SNHPData(self.cache_dir)
    self.projection = self.snhp.aggregate(self.lads)
def __init__(self, cache_dir, file_pattern, areas):
    self.areas = areas
    self.file_pattern = file_pattern
    self.cache_dir = cache_dir
    year = no.timeline[0]
    # store as dict of DFs keyed by area
    self.pop = {}
    for area in areas:
        file = os.path.join(self.cache_dir, self.file_pattern % (area, year))
        no.log("reading initial population: %s" % file)
        self.pop[area] = pd.read_csv(file)
        self.pop[area]["LC4408_C_AHTHUK11_orig"] = self.pop[area].LC4408_C_AHTHUK11
    # no.log(self.pop.LC4408_C_AHTHUK11.unique())
    # self.cat = self.pop.LC4408_C_AHTHUK11.unique()
    # "C_AHTHUK11": {
    #   "0": "All categories: Household type",
    #   "1": "One person household",
    #   "2": "Married or same-sex civil partnership couple household",
    #   "3": "Cohabiting couple household",
    #   "4": "Lone parent household",
    #   "5": "Multi-person household"
    # }
    self.cat = {"LC4408_C_AHTHUK11": np.array([1, 2, 3, 4, 5])}
    # NOTE: pandas stores data column-major but the numpy view is row-major, so the
    # matrix looks right but is actually transposed. (No amount of transposing changes
    # the memory layout; it only changes the view.) The C++ code assumes the transition
    # matrix is column-major, i.e. columns sum to unity, not rows.
    self.t = pd.read_csv(os.path.join(self.cache_dir, "w_hhtype_dv-tpm.csv")).set_index("initial state").values / 100.0
    # check rows sum to unity
    assert np.allclose(np.sum(self.t, 1), np.ones(len(self.t)))
    # TODO get snhp
    self.snhp = SNHPData.SNHPData(self.cache_dir)
    self.projection = self.snhp.aggregate(self.areas)
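# Illustration only (not from the original source): one way the row-stochastic
# matrix self.t loaded in the two constructors above can drive household-type
# transitions. The helper name and rng argument are hypothetical.
import numpy as np

def sample_transitions(t, current, rng=None):
    """Sample the next household-type code for each household.

    t[i, j] = P(next state j | current state i); rows sum to unity.
    current is an array of category codes in 1..len(t).
    """
    rng = rng or np.random.default_rng()
    states = np.arange(1, len(t) + 1)  # category codes 1..5
    return np.array([rng.choice(states, p=t[c - 1]) for c in current])

# e.g. next_types = sample_transitions(self.t, pop.LC4408_C_AHTHUK11.values)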
def setUp(self):
    """
    Check the env is set up correctly for the tests. (It's too late to
    override the env in this function, unfortunately.)
    """
    print("Warning: some SNPP tests are temporarily disabled during development of "
          "the new dynamic microsimulation, but the code works")
    # Build the test data objects from the raw_data directory.
    self.mye = MYEData.MYEData(TEST_DATA_DIR)  # needs complete data for tests when upgrading to new estimates
    self.npp = NPPData.NPPData(TEST_DATA_DIR)  # the test version needs rebuilding on every migration
    self.snpp = SNPPData.SNPPData(TEST_DATA_DIR)  # the test version needs rebuilding on every migration
    self.snhp = SNHPData.SNHPData(TEST_DATA_DIR)
    # fix issue with test dataset
    self.snpp.data[utils.EN].PROJECTED_YEAR_NAME = self.snpp.data[utils.EN].PROJECTED_YEAR_NAME.astype(int)
    if self.npp.data_api.key != "DUMMY" or self.snpp.data_api.key != "DUMMY":
        print("Test requires NOMIS_API_KEY=DUMMY in env")
        sys.exit()
def __init__(self, params):
    self.coverage = {
        "EW": ukpoputils.EW,
        "GB": ukpoputils.GB,
        "UK": ukpoputils.UK
    }.get(params["coverage"])
    if not self.coverage:
        raise RuntimeError("invalid coverage: %s" % params["coverage"])
    self.cache_dir = params["cache_dir"]
    # initialise data sources
    self.census_ew = Nomisweb.Nomisweb(self.cache_dir)
    self.census_sc = NRScotland.NRScotland(self.cache_dir)
    self.census_ni = NISRA.NISRA(self.cache_dir)
    # population projections
    self.mye = MYEData.MYEData(self.cache_dir)
    self.snpp = SNPPData.SNPPData(self.cache_dir)
    self.npp = NPPData.NPPData(self.cache_dir)
    # households
    self.baseline = params["base_projection"]
    if not os.path.isdir(params["output_dir"]):
        raise ValueError("Output directory %s not found" % params["output_dir"])
    self.output_file = os.path.join(
        params["output_dir"],
        "simim_%s_%s_%s" % (params["model_type"], params["base_projection"],
                            os.path.basename(params["scenario"])))
    self.custom_snpp_variant = pd.DataFrame()
    self.snhp = SNHPData.SNHPData(self.cache_dir)
    # holder for shapefile when requested
    self.shapefile = None
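# A plausible params dict for the constructor above -- the keys are taken from
# the code, the values are illustrative assumptions only:
params = {
    "coverage": "GB",                          # one of "EW", "GB", "UK"
    "cache_dir": "./cache",
    "base_projection": "ppp",                  # hypothetical baseline name
    "model_type": "gravity",                   # hypothetical model type
    "scenario": "./scenarios/scenario1.csv",   # hypothetical scenario file
    "output_dir": "./output",                  # must already exist
}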
import matplotlib.pyplot as plt
import ukpopulation.snppdata as SNPPData
import ukpopulation.snhpdata as SNHPData
#import ukpopulation.utils as utils

# initialise the population modules
snhp = SNHPData.SNHPData()
snpp = SNPPData.SNPPData()

lad = "E08000021"  # Newcastle

start_year = 2016
end_year = snhp.max_year(lad)

# get the totals
hh = snhp.aggregate(lad, range(start_year, end_year + 1))
p = snpp.aggregate(["C_AGE", "GENDER"], lad, range(start_year, end_year + 1))

# plot the data
fig, ax1 = plt.subplots()
ax1.plot(hh.PROJECTED_YEAR_NAME, hh.OBS_VALUE, "bo", label="households")
ax1.set_xlabel("Year")
ax1.set_ylabel("Households")
ax1.legend()
ax2 = ax1.twinx()
ax2.plot(p.PROJECTED_YEAR_NAME, p.OBS_VALUE, "ro", label="people")
ax2.set_xlabel("Year")
ax2.set_ylabel("People")
ax2.legend(loc=4)
plt.title(lad + " Households/People Projections")
plt.show()
def fetch_full_data_into_cache():
    npp = NPPData.NPPData()
    npp.force_load_variants(["hhh", "ppp", "lll"])
    MYEData.MYEData()
    SNPPData.SNPPData()
    SNHPData.SNHPData()
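# Usage sketch (an assumption, not in the original source): unlike the dummy
# fetch above, this needs a real NOMIS API key in the environment before the
# data classes are constructed, e.g.
#
#   import os
#   os.environ["NOMIS_API_KEY"] = "..."  # a real key, not "DUMMY"
#   fetch_full_data_into_cache()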
import pandas as pd
import ukpopulation.snhpdata as SNHPData
import ukpopulation.utils  #as utils

snhp_e = pd.read_csv("data/ons_hh_e_2016-2041.csv").drop(
    [str(y) for y in range(2001, 2014)], axis=1)
snhp_w = pd.read_csv("data/hh_w_2014-2039.csv").drop(
    ["Unnamed: 0", "Unnamed: 1"], axis=1)
snhp_s = SNHPData.SNHPData("../microsimulation/cache").data[ukpopulation.utils.SC]

snhp_e = snhp_e.groupby("CODE").sum().reset_index().rename(
    {"CODE": "GEOGRAPHY_CODE"}, axis=1)
snhp_e = snhp_e[snhp_e.GEOGRAPHY_CODE.str.startswith("E0")]
#print(snhp_e)
snhp_w = snhp_w.groupby("GEOGRAPHY_CODE").sum().reset_index()
#print(snhp_w)
#print(snhp_s)

snhp = pd.concat([snhp_e, snhp_w, snhp_s], ignore_index=True, sort=False)
snhp.to_csv("./snhp.csv")
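# Quick sanity check on the combined output -- a sketch only, assuming the
# layout written above (a GEOGRAPHY_CODE column identifying each area):
check = pd.read_csv("./snhp.csv", index_col=0)
# every area code should be English, Welsh or Scottish
assert check.GEOGRAPHY_CODE.str[0].isin(["E", "W", "S"]).all()
print(check.GEOGRAPHY_CODE.str[0].value_counts())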