コード例 #1
0
    def __do_nireland(self):
        """Return the Northern Ireland SNPP data as a long-format DataFrame.

        If ``snpp_ni.csv`` already exists in ``self.cache_dir`` it is loaded
        and returned as-is. Otherwise the NISRA workbook is downloaded to
        ``ni_raw.xlsx`` in the cache directory, one worksheet per district is
        read, and the combined result is written to ``snpp_ni.csv`` before
        being returned.

        The returned frame has the columns ``C_AGE``, ``PROJECTED_YEAR_NAME``,
        ``OBS_VALUE``, ``GENDER`` (1 for the first cell block, 2 for the
        second) and ``GEOGRAPHY_CODE`` (taken from cell A3 of each sheet).

        Raises:
            requests.HTTPError: if the download returns an error status, so
                that an error page is never cached as the workbook.
        """
        # 1 worksheet per LAD equivalent
        print("Collating SNPP data for Northern Ireland...")
        ni_src = "https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/SNPP16_LGD14_SYA_1641.xlsx"
        ni_raw = os.path.join(self.cache_dir, "snpp_ni.csv")
        if os.path.isfile(ni_raw):
            return pd.read_csv(ni_raw)

        ni_xlsx = os.path.join(self.cache_dir, "ni_raw.xlsx")
        response = requests.get(ni_src)
        response.raise_for_status()
        with open(ni_xlsx, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=1024):
                fd.write(chunk)

        # easier to hard-code the worksheet names we need (since unlikely to change frequently)
        districts = [
            "Antrim & Newtownabbey", "Ards & North Down",
            "Armagh Banbridge & Craigavon", "Belfast",
            "Causeway Coast & Glens", "Derry & Strabane",
            "Fermanagh & Omagh", "Lisburn & Castlereagh",
            "Mid & East Antrim", "Mid Ulster", "Newry Mourne & Down",
        ]

        # (GENDER code, top-left cell, bottom-right cell) of each table on a sheet.
        # 2 extra rows compared to 2014 data (males block was A3:A95).
        gender_blocks = (
            (1, "A5", "AA97"),      # males
            (2, "A100", "AA192"),   # females
        )

        frames = []
        xls_ni = load_workbook(ni_xlsx, read_only=True)
        try:
            for d in districts:
                sheet = xls_ni[d]
                # 1 extra row compared to 2014 data (below was A2)
                area_code = sheet["A3"].value

                for gender, top_left, bottom_right in gender_blocks:
                    cells = utils.read_cell_range(sheet, top_left, bottom_right)
                    # First row holds the column labels, first column the row
                    # labels; the row labelled "Age" is dropped before the
                    # table is pivoted to one row per (age, year).
                    df = pd.DataFrame(data=cells[1:, 1:],
                                      index=cells[1:, 0],
                                      columns=cells[0, 1:]).drop(
                                          ["Age"]).stack().reset_index()
                    df.columns = ["C_AGE", "PROJECTED_YEAR_NAME", "OBS_VALUE"]
                    df["GENDER"] = gender
                    df["GEOGRAPHY_CODE"] = area_code
                    # Use the same replacement for both genders (the female
                    # branch previously wrote an int where males got a str).
                    df.loc[df.C_AGE == "90+", "C_AGE"] = "90"
                    frames.append(df)
        finally:
            # read-only workbooks keep the file handle open until closed
            xls_ni.close()

        # DataFrame.append no longer exists in pandas >= 2.0; concat keeps the
        # per-frame indices exactly as repeated appends did.
        snpp_ni = pd.concat(frames)

        # Original author's note (not enforced): expected length is
        # 26*2*91*11, i.e. 11 districts x 91 ages x 2 genders x 26 years.
        snpp_ni.to_csv(ni_raw, index=False)

        return snpp_ni
コード例 #2
0
    def __do_england(self):
        """Return the England SNHP data as a long-format DataFrame.

        If ``snhp_e.csv`` already exists in ``self.cache_dir`` it is loaded
        and returned as-is. Otherwise the ONS zip archive is downloaded into
        the cache directory, the "Households" worksheet of
        ``s2 Households.xlsx`` is read, the year columns 2001-2010 are
        dropped, the remaining year columns are melted into rows, and counts
        are summed over age group. The result is written to ``snhp_e.csv``
        and returned with the columns ``GEOGRAPHY_CODE``, ``HOUSEHOLD_TYPE``,
        ``PROJECTED_YEAR_NAME`` and ``OBS_VALUE``.

        Raises:
            requests.HTTPError: if the download returns an error status, so
                that an error page is never saved as the archive.
        """
        print("Collating SNHP data for England...")
        england_src = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationprojections/datasets/householdprojectionsforenglanddetaileddataformodellingandanalysis/2016based/detailedtablesstage1and2.zip"
        england_raw = os.path.join(self.cache_dir,
                                   os.path.basename(england_src))
        england_processed = os.path.join(self.cache_dir, "snhp_e.csv")

        if os.path.isfile(england_processed):
            return pd.read_csv(england_processed)

        response = requests.get(england_src)
        response.raise_for_status()
        with open(england_raw, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=1024):
                fd.write(chunk)
        print("Downloaded", england_raw)

        # this doesnt work if you directly supply the file in the zip to load_workbook
        # workaround is to extract the file to a tmp dir and load from there.
        # The directory is held open by the context manager for as long as the
        # workbook is being read, then removed together with the extracted file.
        member = "detailedtablesstage1and2/s2 Households.xlsx"
        with tempfile.TemporaryDirectory() as tmpdir:
            with zipfile.ZipFile(england_raw) as z:
                extracted = z.extract(member, tmpdir)

            workbook = load_workbook(extracted, read_only=True)
            try:
                raw = utils.read_cell_range(workbook["Households"], "A7", "AS32263")
                # first row of the range holds the column names
                snhp_e = pd.DataFrame(raw[1:, :], columns=raw[0, :])
            finally:
                # read-only workbooks keep the file handle open until closed
                workbook.close()

        # remove years before 2011 census and switch years from columns to rows
        pre_census_years = [str(y) for y in range(2001, 2011)]
        snhp_e = snhp_e.drop(pre_census_years, axis=1) \
            .melt(id_vars=["CODE", "AREA", "AGE GROUP", "HOUSEHOLD TYPE"]) \
            .drop("AREA", axis=1)
        # ensure count is numeric
        snhp_e["value"] = snhp_e["value"].astype(float)
        # remove age categories and standardise column names; only "value" is
        # summed so the textual "AGE GROUP" column can never be aggregated
        # (pandas >= 2.0 would otherwise concatenate its strings).
        snhp_e = snhp_e.groupby(["CODE", "HOUSEHOLD TYPE", "variable"])["value"] \
            .sum().reset_index() \
            .rename({"CODE": "GEOGRAPHY_CODE",
                     "HOUSEHOLD TYPE": "HOUSEHOLD_TYPE",
                     "variable": "PROJECTED_YEAR_NAME",
                     "value": "OBS_VALUE"}, axis=1)

        snhp_e.to_csv(england_processed, index=False)

        return snhp_e
コード例 #3
0
  def __do_nireland(self):
    """Return the Northern Ireland SNHP data as a long-format DataFrame.

    If ``snhp_ni.csv`` already exists in ``self.cache_dir`` it is loaded and
    returned as-is. Otherwise the NISRA ``.xls`` workbook is downloaded into
    the cache directory, converted to a temporary ``.xlsx`` file, and cells
    A10:AA15 of each district worksheet are melted into one row per
    (household type, year). The combined result is written to
    ``snhp_ni.csv`` and returned with the columns ``GEOGRAPHY_CODE``,
    ``HOUSEHOLD_TYPE``, ``PROJECTED_YEAR_NAME`` and ``OBS_VALUE``.

    Raises:
      requests.HTTPError: if the download returns an error status, so that
        an error page is never saved as the workbook.
    """
    # 1 worksheet per LAD equivalent
    print("Collating SNHP data for Northern Ireland...")
    ni_src = "https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/HHP16_LGD2014.xls"
    ni_processed = os.path.join(self.cache_dir, "snhp_ni.csv")
    if os.path.isfile(ni_processed):
      return pd.read_csv(ni_processed)

    ni_raw = os.path.join(self.cache_dir, os.path.basename(ni_src))
    response = requests.get(ni_src)
    response.raise_for_status()
    with open(ni_raw, 'wb') as fd:
      for chunk in response.iter_content(chunk_size=1024):
        fd.write(chunk)

    # worksheets are named by district code, N09000001 .. N09000011
    districts = ["N090000{:02d}".format(i) for i in range(1, 12)]

    frames = []
    # convert to temp xlsx; the file lives in a directory we own so its name
    # cannot be claimed by anyone else, and it is removed once it has been read
    with tempfile.TemporaryDirectory() as tmpdir:
      tmp_xlsx_file = os.path.join(tmpdir, "snhp_ni.xlsx")
      pyexcel.save_book_as(file_name=ni_raw, dest_file_name=tmp_xlsx_file)
      xlsx_ni = load_workbook(tmp_xlsx_file, read_only=True)
      try:
        for d in districts:
          raw = utils.read_cell_range(xlsx_ni[d], "A10", "AA15") # omitting Total row
          # first row of the range holds the column names
          data = pd.DataFrame(raw[1:, :], columns=raw[0, :]) \
            .melt(id_vars="Household Type*") \
            .rename({"Household Type*": "HOUSEHOLD_TYPE",
                     "variable": "PROJECTED_YEAR_NAME",
                     "value": "OBS_VALUE"}, axis=1)
          data.insert(0, "GEOGRAPHY_CODE", d)
          frames.append(data)
      finally:
        # read-only workbooks keep the file handle open until closed
        xlsx_ni.close()

    # DataFrame.append no longer exists in pandas >= 2.0
    snhp_ni = pd.concat(frames, ignore_index=True)
    snhp_ni.to_csv(ni_processed, index=False)

    return snhp_ni