def test_request_raises_bad_country(self):
    """`request` raises ValueError for the country code "lol123".

    Checked both as a single ``country=`` keyword and as part of a list
    passed positionally.
    """
    client = airbase.AirbaseClient()
    bad_calls = (
        lambda: client.request(country="lol123"),
        lambda: client.request(["NL", "lol123"]),
    )
    for make_request in bad_calls:
        with pytest.raises(ValueError):
            make_request()
def test_download_metadata(self, tmpdir, metadata_response, capsys):
    """`download_metadata` creates the file and its text equals
    ``metadata_response.body``."""
    target = str(tmpdir / "meta.csv")
    airbase.AirbaseClient().download_metadata(target)

    assert os.path.exists(target)
    with open(target) as handle:
        written = handle.read()
    assert written == metadata_response.body
def test_request_raises_bad_year(self):
    """`request` raises ValueError for the years "1234" and "9999".

    Both `year_from` and `year_to` are checked. Each call gets its own
    `pytest.raises` block: as soon as one statement inside such a block
    raises, the remaining statements are skipped, so sharing a block would
    leave the `year_to` calls unexecuted.
    """
    client = airbase.AirbaseClient()

    for bad_year in ("1234", "9999"):
        with pytest.raises(ValueError):
            client.request(year_from=bad_year)
        with pytest.raises(ValueError):
            client.request(year_to=bad_year)
def test_request_pl(self):
    """`request(pl=...)` accepts one name or a list of names.

    The resulting ``shortpl`` has one entry per requested pollutant, and a
    list containing "Not a pl" raises ValueError.
    """
    client = airbase.AirbaseClient()

    single = client.request(pl="NO")
    assert len(single.shortpl) == 1

    pair = client.request(pl=["NO", "NO3"])
    assert len(pair.shortpl) == 2

    with pytest.raises(ValueError):
        client.request(pl=["NO", "NO3", "Not a pl"])
def test_init_connect_false(self, summary_response):
    """With ``connect=False`` the summary attributes and `request` raise
    AttributeError until `connect()` is called; afterwards they are usable."""
    client = airbase.AirbaseClient(connect=False)
    summary_attrs = (
        "all_countries",
        "all_pollutants",
        "pollutants_per_country",
    )

    # Before connecting: every attribute access and request() must fail.
    for attr_name in summary_attrs:
        with pytest.raises(AttributeError):
            getattr(client, attr_name)
    with pytest.raises(AttributeError):
        client.request()

    client.connect()

    # After connecting: everything is populated.
    for attr_name in summary_attrs:
        assert getattr(client, attr_name) is not None
    assert client.request() is not None
def main(output_file, retries, ignore_errors=False):
    """Check the entire AirBase database for broken links.

    Every CSV link is probed with an HTTP HEAD request (1 second timeout);
    links answering 404 are appended to ``output_file``, one per line.

    :param output_file: path of the file receiving the bad links. An
        existing file is removed first; the file is only created again once
        a bad link is found.
    :param retries: number of additional attempts after a failed HEAD
        request (so at most ``retries + 1`` requests per link).
    :param ignore_errors: if true, a link whose request still fails after
        all retries is skipped; otherwise the last error is re-raised.
    """
    print("Will output bad links to {}".format(output_file))

    client = ab.AirbaseClient()
    req = client.request(preload_csv_links=True)  # get links to all files
    links = req._csv_links
    session = requests.Session()  # reuse HTTP connections

    # Define inside main to re-use Session
    def is_404(url, r=retries):
        """Return whether ``url`` answers 404, or None if errors are ignored."""
        remaining = r
        while True:
            try:
                response = session.head(url, timeout=1)
            except Exception:
                # `Exception`, not a bare `except`, so KeyboardInterrupt and
                # SystemExit are never swallowed or retried. `<= 0` also
                # terminates when a non-positive retry count is passed.
                if remaining <= 0:
                    if ignore_errors:
                        return None
                    raise
                remaining -= 1
            else:
                return response.status_code == 404

    # clear output file
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass

    total_bad = 0
    # Exit order is right-to-left: the executor drains its workers before
    # the session they share is closed.
    with session, ThreadPoolExecutor(
        REQUESTS_SESSION_CONNECTION_POOL_SIZE
    ) as executor:
        promises = executor.map(is_404, tqdm(links, desc="Creating queue"))

        # executor.map yields results in input order, so zip pairs each
        # result with the link it belongs to.
        with tqdm(total=len(links), desc="Checking links") as pbar:
            for link, not_found in zip(links, promises):
                pbar.update()
                if not_found:
                    total_bad += 1
                    # Append right away so findings survive an aborted run.
                    with open(output_file, "a") as h:
                        h.write(link + "\n")
                    pbar.set_description(f"{total_bad:,} bad links")
import glob
import os

import airbase
import numpy as np
import pandas as pd
import requests
from nuts_finder import NutsFinder

# Module-level handles shared by the helpers below.
client = airbase.AirbaseClient()
nf = NutsFinder(year=2016)


def get_NutsCode(row: pd.Series) -> list:
    """Return ``[NUTS_ID, NUTS_NAME]`` of the level-2 region for a station row.

    The row's ``station_latitude_deg`` / ``station_longitude_deg`` values are
    passed to ``nf.find``; from its results the entry whose ``LEVL_CODE`` is 2
    is selected. If anything in that lookup fails (including no level-2
    entry), ``[nan, nan]`` is returned instead.
    """
    try:
        result = nf.find(
            lat=row["station_latitude_deg"],
            lon=row["station_longitude_deg"],
        )
        levels = [r['LEVL_CODE'] for r in result]
        result = result[levels.index(2)]
        return [result['NUTS_ID'], result['NUTS_NAME']]
    except Exception:
        # Best-effort lookup: `Exception` rather than a bare `except`, so
        # KeyboardInterrupt/SystemExit still propagate.
        return [np.nan, np.nan]


def translate_stationCode(row: pd.Series) -> pd.Series:
    """Add ``NUTS_ID`` and ``NUTS_NAME`` to a row from its station code.

    The row's ``station_european_code`` is looked up in
    ``airstations_nuts2_dict`` (not defined in this section; it must exist at
    module level before this is called). Rows whose lookup fails get NaN in
    both columns. The row is modified in place and returned.
    """
    try:
        temp = airstations_nuts2_dict[row['station_european_code']]
        row['NUTS_ID'] = temp[0]
        row['NUTS_NAME'] = temp[1]
    except Exception:
        # Best-effort: unknown or malformed entries become NaN, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        row['NUTS_ID'] = np.nan
        row['NUTS_NAME'] = np.nan
    return row
def download_raw_data():
    """Download AirBase CSV files for every country into ./data/airbase_data.

    For each country in ``client.all_countries`` the CSV links for NO2, O3,
    PM10 and SO2 from 2015 onwards are requested. Every link is fetched in
    parallel through dask and re-written tab-separated as
    ``./data/airbase_data/<country>/<filename>``. Files that already exist
    are skipped, so the function can be re-run to resume a download.
    Failures are printed and skipped; they do not abort the run.
    """
    print('Downloading the raw data.')
    os.makedirs('./data/airbase_data', exist_ok=True)

    def download_csv_link(url, country):
        """Fetch one CSV link and store it tab-separated for ``country``."""
        filename = url[url.rfind('/') + 1:]
        fullpath = './data/airbase_data/' + country + '/' + filename
        if os.path.exists(fullpath):
            return

        with requests.Session() as s:
            # Up to six attempts, pausing one second after each failure.
            for _ in range(6):
                try:
                    download = s.get(url)
                    break
                except Exception:
                    time.sleep(1)
            else:
                print('Failed to download', url)
                return

        # An HTTP error page must not be stored as data: the file would be
        # treated as already downloaded on every later run.
        if not download.ok:
            print('Failed to download', url)
            return

        for encoding in ('utf-8', 'utf-16'):
            try:
                decoded_content = download.content.decode(encoding)
                break
            except UnicodeDecodeError:
                continue
        else:
            print('Failed to decode.', url)
            return

        # Parse fully before opening the target so a parse error cannot
        # leave a truncated file behind.
        my_list = list(csv.reader(decoded_content.splitlines(), delimiter=','))
        # NOTE(review): the output uses the platform default encoding, as
        # before — confirm what the downstream reader expects.
        with open(fullpath, "w", newline="") as f:
            csv.writer(f, delimiter='\t').writerows(my_list)

    client = airbase.AirbaseClient()
    for curr_country in client.all_countries:
        tt = time.time()
        os.makedirs('./data/airbase_data/' + curr_country, exist_ok=True)

        r = client.request(
            country=curr_country,
            pl=['NO2', 'O3', 'PM10', 'SO2'],
            year_from=2015,
            preload_csv_links=True,
            verbose=False,
        )
        all_csv_links = r._csv_links
        print(f'{curr_country} | {len(all_csv_links):5d} csv files')

        parallel_output = [
            dask.delayed(download_csv_link)(url, curr_country)
            for url in all_csv_links
        ]
        n_workers = 8  # Set this to the number of cpus you have.
        dask.compute(*parallel_output, scheduler='processes', num_workers=n_workers)
        print(f'{curr_country} | {int(time.time() - tt):5d} sec')
def test_search_pl_limit(self):
    """``limit=1`` makes `search_pollutant("N")` return exactly one entry."""
    matches = airbase.AirbaseClient().search_pollutant("N", limit=1)
    assert len(matches) == 1
def test_search_pl_no_result(self):
    """A query that matches nothing yields an empty list."""
    query = "Definitely not a pollutant"
    hits = airbase.AirbaseClient().search_pollutant(query)
    assert hits == []
def test_search_pl_exact(self):
    """Searching for "NO3" returns the entry named "NO3" first."""
    top_hit = airbase.AirbaseClient().search_pollutant("NO3")[0]
    assert top_hit["pl"] == "NO3"
def test_search_pl_shortest_first(self):
    """The first hit for "N" has a name no longer than the second or the
    last hit."""
    found = airbase.AirbaseClient().search_pollutant("N")
    pl_names = [entry["pl"] for entry in found]

    first_len = len(pl_names[0])
    assert first_len <= len(pl_names[1])
    assert first_len <= len(pl_names[-1])
def test_request_not_pl_and_shortpl(self):
    """Passing both ``pl`` and ``shortpl`` to `request` raises ValueError."""
    client = airbase.AirbaseClient()
    conflicting = {"pl": "O3", "shortpl": "123"}
    with pytest.raises(ValueError):
        client.request(**conflicting)
def test_request_response_generated(self):
    """A bare `request()` call returns an `AirbaseRequest` instance."""
    response = airbase.AirbaseClient().request()
    assert isinstance(response, airbase.AirbaseRequest)
def client(withoutresponses):
    """Build and return an AirbaseClient created with ``connect=True``."""
    connected = airbase.AirbaseClient(connect=True)
    return connected
def test_saerch_pl_case_insensitive(self):
    """A lower-case query ("no3") still returns "NO3" as the first hit."""
    top_hit = airbase.AirbaseClient().search_pollutant("no3")[0]
    assert top_hit["pl"] == "NO3"
def test_init_connect(self, summary_response):
    """With ``connect=True`` the summary attributes are populated right
    after construction."""
    client = airbase.AirbaseClient(connect=True)
    for attr_name in (
        "all_countries",
        "all_pollutants",
        "pollutants_per_country",
    ):
        assert getattr(client, attr_name) is not None