def process(self): """Process results data for a UK General Election.""" filename = self.sources[0][1] processed_results_location = self.directory / "processed" / self.target[0] os.makedirs( self.directory / "processed", exist_ok=True ) # create directory if it doesn't exist def process_and_export(): # Either caching disabled or file not yet processed; process regardless. results = self.process_hoc_sheet( input_file=filename, data_dir=self.directory, sheet_name=str(self.year) ) # Export print(f"Exporting dataset to {processed_results_location.resolve()}") results.to_csv(processed_results_location, index=False) utils.retrieve_from_cache_if_exists( filename=self.target[0], target_dir=(self.directory / "processed"), processing_fn=process_and_export, md5_checksum=self.target[1], caching_enabled=self.cache, verbose=self.verbose, )
def retrieve(self): """Retrieve data from self.sources into self.directory / 'raw' and validate against checksum.""" target_dir = self.directory / "raw" os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist for url, filename, md5_checksum in self.sources: if utils.is_url(url): processing_fn = partial( utils.fetch_url, url=url, filename=filename, target_dir=target_dir ) else: processing_fn = partial( utils.get_and_copy, identifier=url, filename=filename, target_dir=target_dir ) utils.retrieve_from_cache_if_exists( filename=filename, target_dir=target_dir, processing_fn=processing_fn, md5_checksum=md5_checksum, caching_enabled=self.cache, verbose=self.verbose, ) if not self.retrieve_all: # retrieve just the first dataset return if self.retrieve_all: # all datasets retrieved return else: # retrieving first dataset only but all fallbacks failed raise RuntimeError(f"Unable to download {self.verbose_name} data.")
def test_retrieve_from_cache_if_exists(tmpdir):
    def _create_file(target_dir):
        """Puts file.txt in the target_dir"""
        with open(target_dir / "file.txt", "w") as f:
            f.write("some content")

    # Put it there for now.
    _create_file(target_dir=tmpdir)

    # Test basic usage
    utils.retrieve_from_cache_if_exists(
        filename="file.txt",
        target_dir=Path(tmpdir),
        processing_fn=None,
        md5_checksum=None,
        caching_enabled=True,
        verbose=False,
    )

    # Test incorrect MD5
    with pytest.warns(UserWarning):
        utils.retrieve_from_cache_if_exists(
            filename="file.txt",
            target_dir=Path(tmpdir),
            processing_fn=None,
            md5_checksum="badchecksum",
            caching_enabled=True,
            verbose=True,
        )

    # Remove file & put it there via processing_fn
    os.remove(tmpdir / "file.txt")
    utils.retrieve_from_cache_if_exists(
        filename="file.txt",
        target_dir=Path(tmpdir),
        processing_fn=partial(_create_file, target_dir=tmpdir),
        md5_checksum=None,
        caching_enabled=True,
        verbose=True,
    )

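# The test above pins down the behaviour expected of utils.retrieve_from_cache_if_exists:
# reuse an existing cached file, warn (UserWarning) on an MD5 mismatch, and fall back to
# processing_fn when the file is missing or caching is disabled. The helper below is a
# minimal sketch satisfying those expectations, reconstructed from the call sites and the
# test; it is an assumption, not the project's actual implementation.
import hashlib
import warnings
from pathlib import Path


def retrieve_from_cache_if_exists_sketch(
    filename, target_dir, processing_fn, md5_checksum=None, caching_enabled=True, verbose=False
):
    filepath = Path(target_dir) / filename
    if caching_enabled and filepath.exists():
        # Cached copy exists: validate it against the expected checksum, if one was given.
        if md5_checksum is not None:
            actual = hashlib.md5(filepath.read_bytes()).hexdigest()
            if actual != md5_checksum:
                warnings.warn(f"MD5 mismatch for {filepath}: expected {md5_checksum}, got {actual}")
        if verbose:
            print(f"Using cached copy of {filepath}")
        return
    # Either caching is disabled or the file is not yet present; (re)create it.
    if processing_fn is not None:
        processing_fn()
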
def process(self): """Process CSSE data.""" target_dir = self.directory / "processed" os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist def process_and_export(): """Either caching disabled or file not yet processed; process regardless.""" data = {} for metric in ["Confirmed", "Deaths", "Recovered"]: df = pd.read_csv(self.directory / "raw" / f"time_series_19-covid-{metric}.csv") # Pivot all to long id_vars = ["Province/State", "Country/Region", "Lat", "Long"] value_vars = list(set(df.columns) - set(id_vars)) df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name="date", value_name=metric) df["date"] = pd.to_datetime(df.date, format="%m/%d/%y") data[metric] = df.copy() # Merge together df_country_province = pd.merge( data["Confirmed"], data["Deaths"], how="outer", on=["Province/State", "Country/Region", "Lat", "Long", "date"], ).merge( data["Recovered"], how="outer", on=["Province/State", "Country/Region", "Lat", "Long", "date"], ) # Clean df_country_province.columns = utils.sanitise( df_country_province.columns, replace={"long": "lon"}) df_country_province = df_country_province[[ "date", "country_region", "province_state", "lat", "lon", "confirmed", "deaths", "recovered", ]].sort_values(["date", "country_region", "province_state"]) # Country-level data df_country = (df_country_province.groupby([ "date", "country_region" ])[["confirmed", "deaths", "recovered"]].sum().reset_index()) # Export print(f"Exporting dataset to {target_dir.resolve()}") df_country_province.to_csv(target_dir / "CSSE_country_province.csv", index=False) df_country.to_csv(target_dir / "CSSE_country.csv", index=False) for filename, checksum in self.targets: utils.retrieve_from_cache_if_exists( filename=filename, target_dir=target_dir, processing_fn=process_and_export, md5_checksum=checksum, caching_enabled=self.cache, verbose=self.verbose, )
def process(self): """Process UK polling data.""" filename = self.sources[0][1] processed_results_location = self.directory / "processed" / self.target[ 0] os.makedirs(self.directory / "processed", exist_ok=True) # create directory if it doesn't exist def process_and_export(): # Read in PollBase df = pd.read_excel( self.directory / "raw" / filename, sheet_name="17-19", usecols="A:C,G:H,I,K,M,O,Q,S,U,Y", ) # Clean it up df.columns = utils.sanitise( df.columns, replace={ "polling": "company", "publisher": "client", "unnamed:_24": "method", "green": "grn", "tig_cuk": "chuk", }, ) df["year"] = df.year.replace({"?": 2019}).ffill().astype(int) df["month"] = df.month.ffill() df = df[df["fieldwork"].notnull()].copy() df["day_from"] = df.fieldwork.apply( lambda x: str(x).split("-")[0].replace("?", "") if "-" in str(x) else str(x).replace("?", "")) df["day_to"] = df.fieldwork.apply( lambda x: str(x).split("-")[1].replace("?", "") if "-" in str(x) else str(x).replace("?", "")) df["from"] = pd.to_datetime( df.apply(lambda row: f"{row.year}-{row.month}-{row.day_from}", axis=1)) df["to"] = pd.to_datetime( df.apply(lambda row: f"{row.year}-{row.month}-{row.day_to}", axis=1)) # Fix month & year in df['to'] where e.g. fieldwork is "30-3 Jan" month_shifted = (df.year.astype(str) + "-" + ( (df.to.dt.month + 1) % 12).astype(str).replace("0", "12") + "-" + df.day_to.astype(str)) year_needs_shifting = month_shifted.apply( lambda x: str(x).split("-")[1]) == "1" month_shifted.loc[year_needs_shifting] = ( ((df.loc[year_needs_shifting, "year"]).astype(int) + 1).astype(str).replace("0", "12") + "-" + ((df.to.dt.month + 1) % 12).astype(str) + "-" + df.day_to.astype(str)) df.loc[df["from"] > df["to"], "to"] = month_shifted.loc[df["from"] > df["to"]] df["to"] = pd.to_datetime(df.to) # Divide numbers by 100 for party in ["con", "lab", "ld", "ukip", "grn", "chuk", "bxp"]: df[party] = df[party].replace(" ", np.nan).astype(float) / 100 # Prepare for merge with SixFifty data df["sample_size"] = np.nan df["snp"] = np.nan df["pdf"] = np.nan columns = [ "company", "client", "method", "from", "to", "sample_size", "con", "lab", "ld", "ukip", "grn", "chuk", "bxp", "snp", "pdf", ] df = df[columns].copy().sort_values("to") # Read in SixFifty polling data (2005 -> June 2017) df_sixfifty = pd.read_csv(self.directory / "raw" / "polls.csv", parse_dates=["from", "to"]) df_sixfifty["chuk"] = np.nan df_sixfifty["bxp"] = np.nan df_sixfifty = df_sixfifty[columns].copy().sort_values("to") # Merge df_sixfifty = df_sixfifty[df_sixfifty.to < df.to.min()].copy() assert df_sixfifty.to.max() < df.to.min() df_polls = pd.concat([df_sixfifty, df], axis=0) # Export print( f"Exporting dataset to {processed_results_location.resolve()}") df_polls.to_csv(processed_results_location, index=False) utils.retrieve_from_cache_if_exists( filename=self.target[0], target_dir=(self.directory / "processed"), processing_fn=process_and_export, md5_checksum=self.target[1], caching_enabled=self.cache, verbose=self.verbose, )