Esempio n. 1
0
 def retrieve(self):
     """Copy the processed election and poll CSVs into ``<self.directory>/raw``.

     Each wanted file is looked up below a data directory four levels above
     ``self.directory``. When a file is not present there, ``maven.get`` is
     called for the dataset it belongs to before the file is copied across.
     """
     raw_dir = self.directory / 'raw'
     os.makedirs(raw_dir, exist_ok=True)  # harmless if the folder already exists

     # Sensible guess at where the shared datasets live relative to this one.
     data_directory = self.directory.joinpath('..', '..', '..', '..').resolve()

     data_type = 'processed'
     # (dataset identifier, files wanted from it) - listed in copy order.
     wanted = (
         (
             'general-election/UK/2010/results',
             (
                 'general_election-uk-2010-results.csv',
                 'general_election-uk-2010-results-full.csv',
             ),
         ),
         (
             'general-election/UK/2015/results',
             (
                 'general_election-uk-2015-results.csv',
                 'general_election-uk-2015-results-full.csv',
             ),
         ),
         (
             'general-election/UK/polls',
             (
                 'general_election-uk-polls.csv',
                 'general_election-london-polls.csv',
                 'general_election-scotland-polls.csv',
                 'general_election-wales-polls.csv',
                 'general_election-ni-polls.csv',
             ),
         ),
     )
     for identifier, filenames in wanted:
         for filename in filenames:
             source_path = data_directory / identifier / data_type / filename
             if not source_path.is_file():
                 print(f'Dataset {identifier} not found - retrieving now')
                 maven.get(identifier, data_directory=data_directory)
             shutil.copyfile(src=source_path, dst=raw_dir / filename)
Esempio n. 2
0
def check_uk_hoc_results_data(identifier, processed_filename):
    """Check the shape and column names of a processed results CSV.

    Calls ``maven.get`` for ``identifier`` with ``./data/`` as the data
    directory, reads ``./data/<identifier>/processed/<processed_filename>``
    and asserts that it has 8450 rows and exactly the eleven expected
    columns, in order.
    """
    expected_columns = [
        "ons_id", "constituency", "county", "region", "country",
        "electorate", "total_votes", "turnout",
        "party", "votes", "voteshare",
    ]

    maven.get(identifier, data_directory="./data/")
    csv_path = Path("./data").joinpath(identifier, "processed", processed_filename)
    frame = pd.read_csv(csv_path)

    assert frame.shape == (8450, 11)
    assert frame.columns.tolist() == expected_columns
Esempio n. 3
0
def check_uk_model_output(identifier, output_file):
    """Check the shape and column names of a processed model-output CSV.

    Calls ``maven.get`` for ``identifier`` with ``./data/`` as the data
    directory and reads ``./data/<identifier>/processed/<output_file>``.
    The expected columns are a fixed base list, followed by the ``geo_*``
    group when ``geo_polls_now`` is present and by the ``*_now`` group when
    ``total_votes_now`` is present. The frame must have 7800 rows and exactly
    those columns, in order.
    """
    maven.get(identifier, data_directory="./data/")
    frame = pd.read_csv(Path("./data").joinpath(identifier, "processed", output_file))

    # Columns every output file is expected to start with.
    expected = [
        "ons_id",
        "constituency",
        "county",
        "region",
        "geo",
        "country",
        "electorate",
        "total_votes_last",
        "turnout_last",
        "party",
        "votes_last",
        "voteshare_last",
        "winner_last",
        "won_here_last",
        "national_voteshare_last",
        "national_polls_now",
        "national_swing",
        "national_swing_forecast",
        "national_swing_winner",
    ]
    # Optional groups are appended in this order: geo first, then targets.
    if "geo_polls_now" in frame.columns:
        expected.extend([
            "geo_polls_now",
            "geo_voteshare_last",
            "geo_swing",
            "geo_swing_forecast",
            "geo_swing_winner",
        ])
    if "total_votes_now" in frame.columns:
        expected.extend([
            "total_votes_now",
            "turnout_now",
            "votes_now",
            "voteshare_now",
            "winner_now",
        ])

    assert frame.shape == (7800, len(expected))
    assert frame.columns.tolist() == expected
Esempio n. 4
0
def test_csse():
    """Check the column layout of both processed CSSE CSV files."""
    identifier = "coronavirus/CSSE"
    maven.get(identifier, data_directory="./data/")

    processed_dir = Path("./data") / identifier / "processed"
    # (processed file, columns it must contain, in order)
    expectations = (
        (
            "CSSE_country.csv",
            ["date", "country_region", "confirmed", "deaths", "recovered"],
        ),
        (
            "CSSE_country_province.csv",
            [
                "date",
                "country_region",
                "province_state",
                "lat",
                "lon",
                "confirmed",
                "deaths",
                "recovered",
            ],
        ),
    )
    for processed_filename, columns in expectations:
        df = pd.read_csv(processed_dir / processed_filename)
        assert df.columns.tolist() == columns
Esempio n. 5
0
def test_nothing_happens():
    """With retrieve=False and process=False, ``maven.get`` should do nothing."""
    switches = {"retrieve": False, "process": False}
    maven.get("general-election/UK/2010/results", **switches)
Esempio n. 6
0
def test_nonexisting_identifier():
    """An unknown identifier is expected to make ``maven.get`` raise KeyError."""
    bogus_identifier = "this-identifier-will-never-exist"
    with pytest.raises(KeyError):
        maven.get(bogus_identifier, data_directory="./data/")
Esempio n. 7
0
def test_process_with_retrieve():
    """Processing with retrieval switched off is expected to raise FileNotFoundError."""
    # TODO: This is a useful test for now but we should actually handle this
    # explicitly with a better error message.
    switches = dict(retrieve=False, process=True)
    with pytest.raises(FileNotFoundError):
        maven.get('general-election/UK/2015/results', **switches)
Esempio n. 8
0
def test_nonexisting_identifier():
    """An unknown identifier is expected to make ``maven.get`` raise KeyError."""
    bogus_identifier = 'this-identifier-will-never-exist'
    with pytest.raises(KeyError):
        maven.get(bogus_identifier)