Code example #1
File: base.py Project: john-sandall/maven
    def process(self):
        """Process results data for a UK General Election."""
        filename = self.sources[0][1]
        processed_results_location = self.directory / "processed" / self.target[0]
        os.makedirs(
            self.directory / "processed", exist_ok=True
        )  # create directory if it doesn't exist

        def process_and_export():
            # Either caching disabled or file not yet processed; process regardless.
            results = self.process_hoc_sheet(
                input_file=filename, data_dir=self.directory, sheet_name=str(self.year)
            )
            # Export
            print(f"Exporting dataset to {processed_results_location.resolve()}")
            results.to_csv(processed_results_location, index=False)

        utils.retrieve_from_cache_if_exists(
            filename=self.target[0],
            target_dir=(self.directory / "processed"),
            processing_fn=process_and_export,
            md5_checksum=self.target[1],
            caching_enabled=self.cache,
            verbose=self.verbose,
        )
Code example #2
File: base.py Project: john-sandall/maven
    def retrieve(self):
        """Retrieve data from self.sources into self.directory / 'raw' and validate against checksum."""
        target_dir = self.directory / "raw"
        os.makedirs(target_dir, exist_ok=True)  # create directory if it doesn't exist
        for url, filename, md5_checksum in self.sources:
            if utils.is_url(url):
                processing_fn = partial(
                    utils.fetch_url, url=url, filename=filename, target_dir=target_dir
                )
            else:
                processing_fn = partial(
                    utils.get_and_copy, identifier=url, filename=filename, target_dir=target_dir
                )
            utils.retrieve_from_cache_if_exists(
                filename=filename,
                target_dir=target_dir,
                processing_fn=processing_fn,
                md5_checksum=md5_checksum,
                caching_enabled=self.cache,
                verbose=self.verbose,
            )
            if not self.retrieve_all:  # retrieve just the first dataset
                return
        if self.retrieve_all:  # all datasets retrieved
            return
        else:  # retrieving first dataset only but all fallbacks failed
            raise RuntimeError(f"Unable to download {self.verbose_name} data.")
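
Note how both branches build a zero-argument callable: functools.partial binds the URL (or local identifier), filename, and destination up front so that retrieve_from_cache_if_exists can run the download lazily, and only on a cache miss. As a rough sketch of what utils.fetch_url plausibly does (the use of requests and everything beyond the keyword arguments shown above are assumptions, not maven's confirmed implementation):

import shutil
from pathlib import Path

import requests  # assumed dependency; the real helper may use something else


def fetch_url(url, filename, target_dir):
    """Stream `url` into `target_dir / filename` (illustrative sketch only)."""
    response = requests.get(url, stream=True)
    response.raise_for_status()  # assumption: download failures surface as exceptions
    with open(Path(target_dir) / filename, "wb") as f:
        shutil.copyfileobj(response.raw, f)
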
Code example #3
File: test_utils.py Project: john-sandall/maven
def test_retrieve_from_cache_if_exists(tmpdir):
    def _create_file(target_dir):
        """Puts file.txt in the target_dir"""
        with open(target_dir / "file.txt", "w") as f:
            f.write("some content")

    # Put it there for now.
    _create_file(target_dir=tmpdir)

    # Test basic usage
    utils.retrieve_from_cache_if_exists(
        filename="file.txt",
        target_dir=Path(tmpdir),
        processing_fn=None,
        md5_checksum=None,
        caching_enabled=True,
        verbose=False,
    )
    # Test incorrect MD5
    with pytest.warns(UserWarning):
        utils.retrieve_from_cache_if_exists(
            filename="file.txt",
            target_dir=Path(tmpdir),
            processing_fn=None,
            md5_checksum="badchecksum",
            caching_enabled=True,
            verbose=True,
        )
    # Remove file & put it there via processing_fn
    os.remove(tmpdir / "file.txt")
    utils.retrieve_from_cache_if_exists(
        filename="file.txt",
        target_dir=Path(tmpdir),
        processing_fn=partial(_create_file, target_dir=tmpdir),
        md5_checksum=None,
        caching_enabled=True,
        verbose=True,
    )
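
This test pins down the contract: a cached file with a matching (or absent) checksum is reused without calling processing_fn, a checksum mismatch raises a UserWarning, and a missing file is rebuilt by calling processing_fn. A minimal reconstruction consistent with that behaviour might look like the following (inferred from the call sites and test above, not copied from maven's source):

import hashlib
import warnings
from pathlib import Path


def retrieve_from_cache_if_exists(
    filename, target_dir, processing_fn, md5_checksum=None, caching_enabled=True, verbose=False
):
    """Reuse `target_dir / filename` if cached, else build it via `processing_fn` (sketch)."""
    filepath = Path(target_dir) / filename
    if caching_enabled and filepath.exists():
        if md5_checksum and hashlib.md5(filepath.read_bytes()).hexdigest() != md5_checksum:
            warnings.warn(f"MD5 checksum mismatch for {filename}.")  # surfaces as UserWarning
        elif verbose:
            print(f"Using cached copy of {filename}.")
        return
    # Cache miss (or caching disabled): processing_fn is expected to create the file.
    processing_fn()
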
Code example #4
File: csse.py Project: john-sandall/maven
    def process(self):
        """Process CSSE data."""
        target_dir = self.directory / "processed"
        os.makedirs(target_dir, exist_ok=True)  # create directory if it doesn't exist

        def process_and_export():
            """Either caching disabled or file not yet processed; process regardless."""
            data = {}
            for metric in ["Confirmed", "Deaths", "Recovered"]:
                df = pd.read_csv(self.directory / "raw" / f"time_series_19-covid-{metric}.csv")
                # Pivot all to long
                id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
                value_vars = list(set(df.columns) - set(id_vars))
                df = df.melt(
                    id_vars=id_vars, value_vars=value_vars, var_name="date", value_name=metric
                )
                df["date"] = pd.to_datetime(df.date, format="%m/%d/%y")
                data[metric] = df.copy()

            # Merge together
            df_country_province = pd.merge(
                data["Confirmed"],
                data["Deaths"],
                how="outer",
                on=["Province/State", "Country/Region", "Lat", "Long", "date"],
            ).merge(
                data["Recovered"],
                how="outer",
                on=["Province/State", "Country/Region", "Lat", "Long", "date"],
            )

            # Clean
            df_country_province.columns = utils.sanitise(
                df_country_province.columns, replace={"long": "lon"}
            )
            df_country_province = df_country_province[
                [
                    "date",
                    "country_region",
                    "province_state",
                    "lat",
                    "lon",
                    "confirmed",
                    "deaths",
                    "recovered",
                ]
            ].sort_values(["date", "country_region", "province_state"])

            # Country-level data
            df_country = (
                df_country_province.groupby(["date", "country_region"])[
                    ["confirmed", "deaths", "recovered"]
                ]
                .sum()
                .reset_index()
            )

            # Export
            print(f"Exporting dataset to {target_dir.resolve()}")
            df_country_province.to_csv(target_dir / "CSSE_country_province.csv", index=False)
            df_country.to_csv(target_dir / "CSSE_country.csv", index=False)

        for filename, checksum in self.targets:
            utils.retrieve_from_cache_if_exists(
                filename=filename,
                target_dir=target_dir,
                processing_fn=process_and_export,
                md5_checksum=checksum,
                caching_enabled=self.cache,
                verbose=self.verbose,
            )
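
utils.sanitise evidently snake_cases the column names ("Province/State" becomes province_state) and then applies the replace mapping ("long" becomes lon, and in example #5 "unnamed:_24" becomes method). A plausible stand-in, inferred purely from these call sites rather than from maven's source:

def sanitise(columns, replace=None):
    """Lowercase column names, normalise separators, then apply `replace` (sketch only)."""
    replace = replace or {}
    cleaned = []
    for col in columns:
        name = str(col).strip().lower().replace(" ", "_").replace("/", "_")
        for old, new in replace.items():
            name = name.replace(old, new)  # assumption: substring replacement
        cleaned.append(name)
    return cleaned
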
Code example #5
File: uk_polls.py Project: john-sandall/maven
    def process(self):
        """Process UK polling data."""
        filename = self.sources[0][1]
        processed_results_location = self.directory / "processed" / self.target[0]
        os.makedirs(
            self.directory / "processed", exist_ok=True
        )  # create directory if it doesn't exist

        def process_and_export():
            # Read in PollBase
            df = pd.read_excel(
                self.directory / "raw" / filename,
                sheet_name="17-19",
                usecols="A:C,G:H,I,K,M,O,Q,S,U,Y",
            )

            # Clean it up
            df.columns = utils.sanitise(
                df.columns,
                replace={
                    "polling": "company",
                    "publisher": "client",
                    "unnamed:_24": "method",
                    "green": "grn",
                    "tig_cuk": "chuk",
                },
            )
            df["year"] = df.year.replace({"?": 2019}).ffill().astype(int)
            df["month"] = df.month.ffill()
            df = df[df["fieldwork"].notnull()].copy()
            df["day_from"] = df.fieldwork.apply(
                lambda x: str(x).split("-")[0].replace("?", "")
                if "-" in str(x) else str(x).replace("?", ""))
            df["day_to"] = df.fieldwork.apply(
                lambda x: str(x).split("-")[1].replace("?", "")
                if "-" in str(x) else str(x).replace("?", ""))
            df["from"] = pd.to_datetime(
                df.apply(lambda row: f"{row.year}-{row.month}-{row.day_from}",
                         axis=1))
            df["to"] = pd.to_datetime(
                df.apply(lambda row: f"{row.year}-{row.month}-{row.day_to}",
                         axis=1))

            # Fix month & year in df['to'] where e.g. fieldwork is "30-3 Jan"
            month_shifted = (df.year.astype(str) + "-" + (
                (df.to.dt.month + 1) % 12).astype(str).replace("0", "12") +
                             "-" + df.day_to.astype(str))
            year_needs_shifting = month_shifted.apply(
                lambda x: str(x).split("-")[1]) == "1"
            month_shifted.loc[year_needs_shifting] = (
                ((df.loc[year_needs_shifting, "year"]).astype(int) +
                 1).astype(str).replace("0", "12") + "-" +
                ((df.to.dt.month + 1) % 12).astype(str) + "-" +
                df.day_to.astype(str))
            df.loc[df["from"] > df["to"],
                   "to"] = month_shifted.loc[df["from"] > df["to"]]
            df["to"] = pd.to_datetime(df.to)

            # Divide numbers by 100
            for party in ["con", "lab", "ld", "ukip", "grn", "chuk", "bxp"]:
                df[party] = df[party].replace(" ", np.nan).astype(float) / 100

            # Prepare for merge with SixFifty data
            df["sample_size"] = np.nan
            df["snp"] = np.nan
            df["pdf"] = np.nan
            columns = [
                "company",
                "client",
                "method",
                "from",
                "to",
                "sample_size",
                "con",
                "lab",
                "ld",
                "ukip",
                "grn",
                "chuk",
                "bxp",
                "snp",
                "pdf",
            ]
            df = df[columns].copy().sort_values("to")

            # Read in SixFifty polling data (2005 -> June 2017)
            df_sixfifty = pd.read_csv(
                self.directory / "raw" / "polls.csv", parse_dates=["from", "to"]
            )
            df_sixfifty["chuk"] = np.nan
            df_sixfifty["bxp"] = np.nan
            df_sixfifty = df_sixfifty[columns].copy().sort_values("to")

            # Merge
            df_sixfifty = df_sixfifty[df_sixfifty.to < df.to.min()].copy()
            assert df_sixfifty.to.max() < df.to.min()
            df_polls = pd.concat([df_sixfifty, df], axis=0)

            # Export
            print(f"Exporting dataset to {processed_results_location.resolve()}")
            df_polls.to_csv(processed_results_location, index=False)

        utils.retrieve_from_cache_if_exists(
            filename=self.target[0],
            target_dir=(self.directory / "processed"),
            processing_fn=process_and_export,
            md5_checksum=self.target[1],
            caching_enabled=self.cache,
            verbose=self.verbose,
        )
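
All five examples share the same shape: wrap the expensive retrieval or processing step in a zero-argument processing_fn and let retrieve_from_cache_if_exists decide whether to run it. A minimal self-contained usage along those lines (the import path, file name, and contents are illustrative assumptions):

from functools import partial
from pathlib import Path

from maven import utils  # assumption: the helpers shown above live in maven.utils


def build_report(target_dir):
    """Stand-in for an expensive download/processing step."""
    (Path(target_dir) / "report.csv").write_text("date,value\n2020-01-01,1\n")


target_dir = Path("./data/processed")
target_dir.mkdir(parents=True, exist_ok=True)
utils.retrieve_from_cache_if_exists(
    filename="report.csv",  # illustrative file name
    target_dir=target_dir,
    processing_fn=partial(build_report, target_dir=target_dir),
    md5_checksum=None,  # skip validation for this illustrative run
    caching_enabled=True,  # a second call reuses the cached file and is a no-op
    verbose=True,
)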