def retrieve(self):
    """Copy the required general-election CSVs into ``self.directory / 'raw'``.

    Each file is looked up under the data directory at
    ``<identifier>/processed/<filename>``. If a file is missing there,
    ``maven.get`` is called for its identifier before the copy is attempted.
    """
    raw_dir = self.directory / 'raw'
    os.makedirs(raw_dir, exist_ok=True)  # no-op when the directory is already there

    # Sensible guess: the data directory sits four levels above self.directory.
    data_directory = self.directory.joinpath('..', '..', '..', '..').resolve()

    # Every file below lives in the 'processed' sub-folder of its dataset.
    data_type = 'processed'
    datasets = (
        # (identifier, filenames)
        (
            'general-election/UK/2010/results',
            (
                'general_election-uk-2010-results.csv',
                'general_election-uk-2010-results-full.csv',
            ),
        ),
        (
            'general-election/UK/2015/results',
            (
                'general_election-uk-2015-results.csv',
                'general_election-uk-2015-results-full.csv',
            ),
        ),
        (
            'general-election/UK/polls',
            (
                'general_election-uk-polls.csv',
                'general_election-london-polls.csv',
                'general_election-scotland-polls.csv',
                'general_election-wales-polls.csv',
                'general_election-ni-polls.csv',
            ),
        ),
    )

    for identifier, filenames in datasets:
        for filename in filenames:
            source_path = data_directory / identifier / data_type / filename
            if not source_path.is_file():
                print(f'Dataset {identifier} not found - retrieving now')
                maven.get(identifier, data_directory=data_directory)
            shutil.copyfile(src=source_path, dst=raw_dir / filename)
def check_uk_hoc_results_data(identifier, processed_filename):
    """Fetch a dataset with ``maven.get`` and validate its processed CSV.

    The dataset is retrieved into ``./data/``; the file
    ``./data/<identifier>/processed/<processed_filename>`` is then loaded and
    must have exactly 8450 rows and the 11 expected columns, in order.
    """
    expected_columns = [
        "ons_id",
        "constituency",
        "county",
        "region",
        "country",
        "electorate",
        "total_votes",
        "turnout",
        "party",
        "votes",
        "voteshare",
    ]

    maven.get(identifier, data_directory="./data/")
    processed_path = Path("./data") / identifier / "processed" / processed_filename
    df = pd.read_csv(processed_path)

    assert df.shape == (8450, 11)
    assert df.columns.tolist() == expected_columns
def check_uk_model_output(identifier, output_file):
    """Fetch a model dataset with ``maven.get`` and validate its output CSV.

    The file ``./data/<identifier>/processed/<output_file>`` must have 7800
    rows and exactly the expected columns, in order. Two column groups are
    only expected when their first column is present in the file: the
    ``geo_*`` group and the ``*_now`` target group.
    """
    maven.get(identifier, data_directory="./data/")
    output_path = Path("./data") / identifier / "processed" / output_file
    df = pd.read_csv(output_path)

    # Columns that are always required.
    expected_columns = [
        "ons_id",
        "constituency",
        "county",
        "region",
        "geo",
        "country",
        "electorate",
        "total_votes_last",
        "turnout_last",
        "party",
        "votes_last",
        "voteshare_last",
        "winner_last",
        "won_here_last",
        "national_voteshare_last",
        "national_polls_now",
        "national_swing",
        "national_swing_forecast",
        "national_swing_winner",
    ]

    # (marker column, group) pairs; a group is appended only if its marker
    # column exists in the file. Order matters: geo first, then targets.
    optional_groups = (
        (
            "geo_polls_now",
            [
                "geo_polls_now",
                "geo_voteshare_last",
                "geo_swing",
                "geo_swing_forecast",
                "geo_swing_winner",
            ],
        ),
        (
            "total_votes_now",
            [
                "total_votes_now",
                "turnout_now",
                "votes_now",
                "voteshare_now",
                "winner_now",
            ],
        ),
    )
    for marker, group in optional_groups:
        if marker in df.columns:
            expected_columns.extend(group)

    assert df.shape == (7800, len(expected_columns))
    assert df.columns.tolist() == expected_columns
def test_csse():
    """Check the column layout of both processed CSSE coronavirus CSVs."""
    identifier = "coronavirus/CSSE"
    maven.get(identifier, data_directory="./data/")
    processed_dir = Path("./data") / identifier / "processed"

    # (processed filename, expected columns), checked in this order.
    expectations = (
        (
            "CSSE_country.csv",
            ["date", "country_region", "confirmed", "deaths", "recovered"],
        ),
        (
            "CSSE_country_province.csv",
            [
                "date",
                "country_region",
                "province_state",
                "lat",
                "lon",
                "confirmed",
                "deaths",
                "recovered",
            ],
        ),
    )
    for processed_filename, expected_columns in expectations:
        df = pd.read_csv(processed_dir / processed_filename)
        assert df.columns.tolist() == expected_columns
def test_nothing_happens():
    """Call ``maven.get`` with retrieval and processing both switched off.

    Nothing is asserted explicitly; the test passes as long as the call
    returns without raising.
    """
    identifier = "general-election/UK/2010/results"
    maven.get(identifier, retrieve=False, process=False)
def test_nonexisting_identifier():
    """An unknown identifier must make ``maven.get`` raise ``KeyError``.

    This variant passes an explicit ``data_directory``.
    """
    # NOTE(review): another function with this exact name (without the
    # data_directory argument) also appears in this source. If both live in
    # the same module, the later definition replaces this one and this test
    # is never collected - confirm and rename one of them if so.
    unknown_identifier = "this-identifier-will-never-exist"
    with pytest.raises(KeyError):
        maven.get(unknown_identifier, data_directory="./data/")
def test_process_with_retrieve():
    """Processing with retrieval disabled is expected to raise ``FileNotFoundError``."""
    # TODO: This is a useful test for now but we should actually handle this
    #  explicitly with a better error message.
    identifier = 'general-election/UK/2015/results'
    with pytest.raises(FileNotFoundError):
        maven.get(identifier, retrieve=False, process=True)
def test_nonexisting_identifier():
    """An unknown identifier must make ``maven.get`` raise ``KeyError``.

    This variant relies on the default data directory of ``maven.get``.
    """
    # NOTE(review): another function with this exact name (passing
    # data_directory="./data/") also appears earlier in this source. If both
    # live in the same module, this definition replaces the earlier one -
    # confirm and rename one of them if so.
    unknown_identifier = 'this-identifier-will-never-exist'
    with pytest.raises(KeyError):
        maven.get(unknown_identifier)