def scrape_bolivia():
    cwd = getcwd()
    bolivia_dir = path.join(cwd, 'data', 'bolivia')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(bolivia_dir, tmp_dir)

    data = requests.get(URL).json()

    for key, iso in REGION_ISO.items():
        region_data = defaultdict(dict)
        for entry in data['confirmados']:
            region_data[entry['fecha']]['cases'] = entry['dep'][key]
        for entry in data['decesos']:
            region_data[entry['fecha']]['deaths'] = entry['dep'][key]
        for entry in data['recuperados']:
            region_data[entry['fecha']]['recovered'] = entry['dep'][key]

        for date in region_data.keys():
            region_data[date]['date'] = date
            region_data[date]['region_iso'] = iso
            region_data[date]['region'] = ISO_REGION[iso]
            region_data[date]['province'] = ''
            region_data[date]['city'] = ''
            region_data[date]['place_type'] = 'departamento'

        df = pd.DataFrame(region_data.values(), columns=[
            'date', 'region_iso', 'region', 'province', 'city', 'place_type',
            'cases', 'deaths', 'recovered'
        ])
        region_file = path.join(bolivia_dir, f'{iso.lower()}.csv')
        df.to_csv(region_file, index=False, float_format='%.f')

    with open(path.join(bolivia_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
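# `ensure_dirs` is shared by these scrapers but not defined in this file; a
# minimal sketch, assuming it only needs to create any missing directories:
from os import makedirs

def ensure_dirs(*dirs):
    for d in dirs:
        makedirs(d, exist_ok=True)  # no-op when the directory already exists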
def scrape_countries():
    cwd = getcwd()
    countries_dir = path.join(cwd, 'data', 'countries')
    ensure_dirs(countries_dir)

    countries = {}
    df = pd.read_csv(COUNTRIES_DATA, parse_dates=[0], dayfirst=True)
    for country in df['location'].unique():
        is_country = df['location'] == country
        country_filename = country.lower().replace(' ', '_') + '.csv'
        country_file = path.join(countries_dir, country_filename)
        countries[country] = country_filename
        # Rename on a fresh copy rather than in place on a slice of `df`,
        # which would trigger pandas' SettingWithCopyWarning.
        country_df = df[is_country].rename(columns={
            'date': 'dateRep',
            'new_cases': 'cases',
            'new_deaths': 'deaths',
            'location': 'countriesAndTerritories',
            'iso_code': 'countryterritoryCode',
            'population': 'popData2019',
            'continent': 'continentExp',
        })
        country_df.to_csv(country_file, index=False, float_format='%.f')

    with open(path.join(countries_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents(countries))
def scrape_uruguay():
    cwd = getcwd()
    uruguay_dir = path.join(cwd, 'data', 'uruguay')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(uruguay_dir, tmp_dir)

    not_number_regexp = re.compile(r'\D')
    today = str(datetime.date.today())

    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', {'class': 'wikitable'})
    per_departament_table = None
    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and headers[0] == 'Departamento':
            per_departament_table = table

    updated_files = []
    header = 'date,iso,region,city,place_type,cases,deaths,recovered\n'
    for tr in per_departament_table.tbody.find_all('tr'):
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 5:
            continue
        departament = cols[0]
        iso = DEPARTAMENT_ISO[departament]
        # Source table order: cases (col 1), recovered (col 2), deaths (col 3).
        line = ','.join([
            today,
            iso,
            departament,
            '',
            'departamento',
            not_number_regexp.sub('', cols[1]),
            not_number_regexp.sub('', cols[3]),
            not_number_regexp.sub('', cols[2]),
        ])
        departament_file = path.join(uruguay_dir, f'{iso.lower()}.csv')
        is_new = not path.exists(departament_file)
        with open(departament_file, 'a+') as f:
            if is_new:
                f.write(header)
            f.write(f'{line}\n')
        if not is_new:
            updated_files.append(departament_file)

    ensure_consistency(updated_files, lambda row: row[:4])

    with open(path.join(uruguay_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
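# `ensure_consistency` is also defined elsewhere. Each call site passes the
# files that just received a new row plus a key function (here the first
# four CSV fields), which suggests it de-duplicates rows from repeated
# scrapes. A sketch under that assumption, keeping the last row written for
# each key:
import csv

def ensure_consistency(updated_files, key_fn):
    for file_path in updated_files:
        with open(file_path, newline='') as f:
            rows = list(csv.reader(f))
        header, body = rows[0], rows[1:]
        deduped = {tuple(key_fn(row)): row for row in body}  # last row wins
        with open(file_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(deduped.values())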
def scrape_ecuador():
    cwd = getcwd()
    ecuador_dir = path.join(cwd, 'data', 'ecuador')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(ecuador_dir, tmp_dir)

    not_number_regexp = re.compile(r'\D')
    today = str(datetime.date.today())

    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', {'class': 'sortable'})
    per_province_table = None
    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and headers[0] == 'Provincias':
            per_province_table = table

    updated_files = []
    header = 'date,iso,province,city,place_type,cases,deaths\n'
    for tr in per_province_table.tbody.find_all('tr'):
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 3:
            continue
        province = cols[0]
        iso = PROVINCE_ISO[province]
        line = ','.join([
            today,
            iso,
            province,
            '',
            'unknown' if iso == 'UNK' else 'province',
            not_number_regexp.sub('', cols[1]),
            not_number_regexp.sub('', cols[2]),
        ])
        province_file = path.join(ecuador_dir, f'{iso.lower()}.csv')
        is_new = not path.exists(province_file)
        with open(province_file, 'a+') as f:
            if is_new:
                f.write(header)
            f.write(f'{line}\n')
        if not is_new:
            updated_files.append(province_file)

    ensure_consistency(updated_files, lambda row: row[:4])

    with open(path.join(ecuador_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
def scrape_canada():
    cwd = getcwd()
    canada_dir = path.join(cwd, 'data', 'canada')
    ensure_dirs(canada_dir)

    df_cases = pd.read_csv(CASES_URL)
    df_deaths = pd.read_csv(DEATHS_URL)
    df_recovered = pd.read_csv(RECOVERED_URL)

    data = defaultdict(lambda: defaultdict(dict))
    for _, row in df_cases.iterrows():
        date = row['date_report']
        code = row['province']
        data[code][date]['cases'] = row['cumulative_cases']
    for _, row in df_deaths.iterrows():
        date = row['date_death_report']
        code = row['province']
        data[code][date]['deaths'] = row['cumulative_deaths']
    for _, row in df_recovered.iterrows():
        date = row['date_recovered']
        code = row['province']
        data[code][date]['recovered'] = row['cumulative_recovered']

    for code, region_data in data.items():
        if code not in CODE_REGION:
            continue
        region = CODE_REGION[code]
        iso = REGION_ISO[region]
        place_type = 'province'
        if iso in TERRITORIES:
            place_type = 'territory'
        for date in region_data.keys():
            region_data[date]['date'] = date
            region_data[date]['iso'] = iso
            region_data[date]['province'] = region
            region_data[date]['city'] = ''
            region_data[date]['place_type'] = place_type
        df = pd.DataFrame(region_data.values(), columns=[
            'date', 'iso', 'province', 'city', 'place_type',
            'cases', 'deaths', 'recovered'])
        df['date'] = pd.to_datetime(df['date'], dayfirst=True)
        region_file = path.join(canada_dir, f'{iso.lower()}.csv')
        df.to_csv(region_file, index=False, float_format='%.f')

    with open(path.join(canada_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
def scrape_by_counties():
    cwd = getcwd()
    sweden_dir = path.join(cwd, 'data', 'sweden')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(sweden_dir, tmp_dir)

    today = str(datetime.date.today())
    r = requests.get(DATA_PER_COUNTY)
    data = r.json()

    updated_county_files = []
    header = ('date,county,county_iso,city,place_type,cases,deaths,'
              'estimated_population_2019,area_km2,'
              'confirmed_per_100k_inhabitants,critical\n')
    for feat in data['features']:
        attributes = feat['attributes']
        county = attributes['Region']
        iso = COUNTY_ISO_MAPPED[county].lower()
        confirmed = attributes['Totalt_antal_fall']
        deaths = attributes['Totalt_antal_avlidna']
        confirmed_per_100k = attributes['Fall_per_100000_inv']
        critical = attributes['Totalt_antal_intensivvårdade']
        line = ','.join([
            today,
            county,
            iso.upper(),
            '',
            'county',
            str(confirmed),
            str(deaths),
            str(COUNTY_POPULATION_MAPPED[county]),
            str(COUNTY_AREA_MAPPED[county]),
            str(confirmed_per_100k),
            str(critical) if critical is not None else '',
        ])
        county_file = path.join(sweden_dir, f'{iso}.csv')
        is_new = not path.exists(county_file)
        with open(county_file, 'a+') as f:
            if is_new:
                f.write(header)
            f.write(f'{line}\n')
        if not is_new:
            updated_county_files.append(county_file)

    ensure_consistency(updated_county_files, lambda a: a[:5])
def scrape_united_states_of_america():
    cwd = getcwd()
    us_dir = path.join(cwd, 'data', 'united_states_of_america')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(us_dir, tmp_dir)

    headers = [
        'date', 'state', 'county', 'place_type', 'fips', 'cases', 'deaths'
    ]

    counties_df = pd.read_csv(COUNTIES_DATASET)
    counties_df = counties_df.sort_values(by=['state', 'county', 'date'],
                                          ascending=[True, True, False])
    counties_df['place_type'] = 'county'
    counties_df = counties_df[headers]

    states_df = pd.read_csv(STATES_DATASET)
    states_df = states_df.sort_values(by=['state', 'date'],
                                      ascending=[True, False])
    states_df['county'] = ''
    states_df['place_type'] = 'state'
    states_df = states_df[headers]

    states_fips = {}
    fipses = states_df['fips'].unique()
    for fips in fipses:
        is_current_fips = states_df['fips'] == fips
        fips_file = path.join(us_dir, f'{fips:02d}.csv')
        current_df = states_df[is_current_fips]
        current_df.to_csv(fips_file, index=False, float_format='%.f')
        state = current_df['state'].iloc[0]
        is_same_fips = counties_df['state'] == state
        current_counties_df = counties_df[is_same_fips]
        current_counties_df.to_csv(fips_file, index=False, header=False,
                                   mode='a', float_format='%.f')
        states_fips[f'{fips:02d}'] = state

    with open(path.join(us_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents(states_fips))
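# A note on the f'{fips:02d}' padding above: state FIPS codes are two
# digits, so 6 (California) must serialize as '06'. The format spec assumes
# an integer; if pandas parses the fips column as float (which happens when
# the column contains NaNs), ':02d' raises ValueError, so a defensive cast
# is a cheap guard. `format_fips` is an illustrative helper, not part of
# the scraper:
def format_fips(fips):
    return f'{int(fips):02d}'  # format_fips(6.0) == '06'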
def scrape_argentina():
    cwd = getcwd()
    argentina_dir = path.join(cwd, 'data', 'argentina')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(argentina_dir, tmp_dir)

    page = requests.get(URL).json()

    updated_files = []
    header = 'date,region_iso,region,province,city,place_type,cases,deaths,recovered\n'
    for dep in page:
        if dep['provincia-key'] == 'totales':
            continue
        region = CODE_REGION[dep['provincia-key']]
        day = datetime.datetime.strptime(dep['ultima-actualizacion'],
                                         '%d/%m/%Y').strftime('%Y-%m-%d')
        iso = REGION_ISO[region]
        confirmed = get(dep, 'Afectados', '0')
        deaths = get(dep, 'Muertos', '0')
        recovered = get(dep, 'Recuperados', '0')
        line = ','.join([
            day,
            iso,
            region,
            '',
            '',
            'unknown' if iso == 'UNK' else 'provincia',
            str(confirmed),
            str(deaths),
            str(recovered)
        ])
        region_file = path.join(argentina_dir, f'{iso.lower()}.csv')
        is_new = not path.exists(region_file)
        with open(region_file, 'a+') as f:
            if is_new:
                f.write(header)
            f.write(f'{line}\n')
        if not is_new:
            updated_files.append(region_file)

    ensure_consistency(updated_files, lambda row: row[:5])

    with open(path.join(argentina_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
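# `get` above is not the built-in dict.get: the call sites pass string
# defaults for numeric fields, which hints that it also guards against
# empty or null values in the feed. A hedged sketch of that reading:
def get(d, key, default):
    value = d.get(key, default)
    return default if value in (None, '') else value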
def scrape_spain():
    cwd = getcwd()
    spain_dir = path.join(cwd, 'data', 'spain')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(spain_dir, tmp_dir)

    headers = ['date', 'region', 'city', 'place_type', 'iso',
               'cases', 'deaths', 'hospitalized', 'critical']

    # error_bad_lines=False skips malformed rows (renamed to
    # on_bad_lines='skip' in pandas >= 1.3).
    df = pd.read_csv(COUNTIES_DATASET,
                     parse_dates=[1],
                     dayfirst=True,
                     encoding='iso-8859-1',
                     error_bad_lines=False)
    df = df.rename(columns={
        'CCAA': 'iso',
        'FECHA': 'date',
        'CASOS': 'cases',
        'Hospitalizados': 'hospitalized',
        'UCI': 'critical',
        'Fallecidos': 'deaths',
    })
    # Keep only rows whose CCAA value is a two-letter code, dropping any
    # aggregate or note rows with longer labels.
    df = df[df['iso'].str.len() == 2]

    def fill_cases(row):
        # When the aggregate case count is missing, fall back to the sum of
        # PCR-confirmed and antibody-confirmed cases.
        cases = row['cases']
        if np.isnan(cases):
            return row['PCR+'] + row['TestAc+']
        return cases

    df['cases'] = df.apply(fill_cases, axis=1)
    df = df.sort_values(by=['iso', 'date'], ascending=[True, False])
    df['region'] = df.apply(lambda r: CCAA_ISO[r['iso']], axis=1)
    df['city'] = ''
    df['place_type'] = 'autonomous_community'
    df = df[headers]

    for iso in df['iso'].unique():
        is_current_iso = df['iso'] == iso
        region_file = path.join(spain_dir, f'es-{iso.lower()}.csv')
        current_df = df[is_current_iso]
        current_df.to_csv(region_file, index=False, float_format='%.f')

    with open(path.join(spain_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
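# The row-wise apply above works, but the same backfill can be written as
# one vectorized assignment; a behavior-equivalent sketch, assuming 'PCR+'
# and 'TestAc+' are numeric columns (`fill_cases_vectorized` is
# illustrative, not part of the scraper):
def fill_cases_vectorized(df):
    # NaN entries in 'cases' take the PCR + antibody sum from the same row.
    return df['cases'].fillna(df['PCR+'] + df['TestAc+'])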
def scrape_countries():
    cwd = getcwd()
    countries_dir = path.join(cwd, 'data', 'countries')
    ensure_dirs(countries_dir)

    countries = {}
    df = pd.read_csv(COUNTRIES_DATA, parse_dates=[0], dayfirst=True)
    for country in df['countriesAndTerritories'].unique():
        is_country = df['countriesAndTerritories'] == country
        country_filename = country.lower().replace(' ', '_') + '.csv'
        country_file = path.join(countries_dir, country_filename)
        countries[country] = country_filename
        country_df = df[is_country]
        country_df.to_csv(country_file, index=False, float_format='%.f')

    with open(path.join(countries_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents(countries))
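# `get_readme_contents` comes from each scraper's own readme module. The
# countries scrapers pass a name-to-filename mapping, so presumably it
# renders an index of the generated CSVs; a purely illustrative sketch
# (the real template lives elsewhere in the repository):
def get_readme_contents(countries):
    lines = ['# Countries', '', '| Country | File |', '| --- | --- |']
    for country, filename in sorted(countries.items()):
        lines.append(f'| {country} | [{filename}]({filename}) |')
    return '\n'.join(lines) + '\n'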
def scrape_brazil():
    cwd = getcwd()
    brazil_dir = path.join(cwd, 'data', 'brazil')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(brazil_dir, tmp_dir)

    gz_filename = path.join(tmp_dir, 'brazil.csv.gz')
    with open(gz_filename, 'wb') as gz_file:
        r = requests.get(BRAZIL_DATA, allow_redirects=True)
        gz_file.write(r.content)

    states = {}
    prev_state = ''
    header = ''
    curr_lines = []

    def write_file():
        with open(path.join(brazil_dir, f'{prev_state}.csv'), 'w') as state_file:
            state_file.writelines([header] + curr_lines)

    # Stream the decompressed CSV and cut a new per-state file whenever the
    # state column changes; this assumes the source rows are grouped by state.
    with gzip.open(gz_filename, 'rt') as f:
        for line in f:
            if header == '':
                header = line
                continue
            state = line.split(',')[1].lower()
            if len(prev_state) > 0 and state != prev_state:
                write_file()
                curr_lines = []
            curr_lines.append(line)
            prev_state = state
            states[state] = True
    write_file()

    with open(path.join(brazil_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
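# The single-pass split in scrape_brazil is memory-cheap but depends on the
# rows being grouped by state; if a state reappeared later in the file, its
# earlier CSV would be silently overwritten. An order-independent sketch
# that buffers lines per state before writing, at the cost of holding the
# whole file in memory (`split_by_state` is illustrative, not part of the
# scraper):
from collections import defaultdict

def split_by_state(gz_filename, out_dir):
    buffers = defaultdict(list)
    with gzip.open(gz_filename, 'rt') as f:
        header = next(f)
        for line in f:
            buffers[line.split(',')[1].lower()].append(line)
    for state, lines in buffers.items():
        with open(path.join(out_dir, f'{state}.csv'), 'w') as state_file:
            state_file.writelines([header] + lines)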
def scrape_australia():
    cwd = getcwd()
    australia_dir = path.join(cwd, 'data', 'australia')
    ensure_dirs(australia_dir)

    df_cases = pd.read_csv(CASES_URL)
    df_deaths = pd.read_csv(DEATHS_URL)
    df_recovered = pd.read_csv(RECOVERED_URL)

    data = defaultdict(lambda: defaultdict(dict))
    for _, row in df_cases.iterrows():
        date = row['Date']
        for iso in ISO_REGION.keys():
            data[iso][date]['cases'] = row[iso]
    for _, row in df_deaths.iterrows():
        date = row['Date']
        for iso in ISO_REGION.keys():
            data[iso][date]['deaths'] = row[iso]
    for _, row in df_recovered.iterrows():
        date = row['Date']
        for iso in ISO_REGION.keys():
            data[iso][date]['recovered'] = row[iso]

    for iso, region_data in data.items():
        place_type = 'state'
        if iso in ['ACT', 'NT']:
            place_type = 'territory'
        for date in region_data.keys():
            region_data[date]['date'] = date
            region_data[date]['iso'] = iso
            region_data[date]['state'] = ISO_REGION[iso]
            region_data[date]['city'] = ''
            region_data[date]['place_type'] = place_type
        df = pd.DataFrame(region_data.values(),
                          columns=['date', 'iso', 'state', 'city',
                                   'place_type', 'cases', 'deaths',
                                   'recovered'])
        region_file = path.join(australia_dir, f'{iso.lower()}.csv')
        df.to_csv(region_file, index=False, float_format='%.f')

    with open(path.join(australia_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
def scrape_peru():
    cwd = getcwd()
    peru_dir = path.join(cwd, 'data', 'peru')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(peru_dir, tmp_dir)

    not_number_regexp = re.compile(r'\D')
    today = str(datetime.date.today())

    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', {'class': 'sortable'})
    per_departament_table = None
    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and 'Departamento' in headers[0]:
            per_departament_table = table

    updated_files = []
    header = 'date,iso,region,city,place_type,cases,deaths\n'
    mapped = {}
    for tr in per_departament_table.tbody.find_all('tr'):
        headers = [th.get_text().strip() for th in tr.find_all('th')]
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 10:
            continue
        departament = headers[0]
        cases = int(not_number_regexp.sub('', cols[1]))
        deaths = int(not_number_regexp.sub('', cols[3]))
        if 'Lima' in departament:
            # The source lists two Lima rows (e.g. Lima and Lima
            # Metropolitana). Buffer the first and merge its counts into the
            # second, so a single combined 'Lima' row is written.
            departament = 'Lima'
            if 'Lima' in mapped:
                _cases, _deaths = mapped['Lima']
                cases += _cases
                deaths += _deaths
            else:
                mapped['Lima'] = (cases, deaths)
                continue
        iso = DEPARTAMENT_ISO[departament]
        line = ','.join([
            today,
            iso,
            departament,
            '',
            'departamento',
            str(cases),
            str(deaths),
        ])
        departament_file = path.join(peru_dir, f'{iso.lower()}.csv')
        is_new = not path.exists(departament_file)
        with open(departament_file, 'a+') as f:
            if is_new:
                f.write(header)
            f.write(f'{line}\n')
        if not is_new:
            updated_files.append(departament_file)

    ensure_consistency(updated_files, lambda row: row[:4])

    with open(path.join(peru_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
def scrape_united_kingdom():
    cwd = getcwd()
    tmp_dir = path.join(cwd, 'tmp', 'united_kingdom')
    uk_dir = path.join(cwd, 'data', 'united_kingdom')
    ensure_dirs(uk_dir, tmp_dir)

    england_cases_by_area_url = requests.get(ENGLAND_CASES_BY_AREA).url
    deaths_by_area_csv = requests.get(DEATHS_BY_AREA, verify=False).text

    headers = [
        'date', 'country', 'region', 'place_type', 'geo_code', 'cases',
        'deaths'
    ]

    deaths_df = pd.read_csv(io.StringIO(deaths_by_area_csv), parse_dates=[3])
    deaths_df = deaths_df.set_index(['areaName', 'areaType', 'date'])

    df = pd.read_csv(UK_CASES_BY_AREA, parse_dates=[0], dayfirst=True)
    df = df.rename(columns={
        'GSS_CD': 'geo_code',
        'type': 'place_type',
        'confirm': 'cases',
        'area': 'region',
    })
    df = df.fillna(value={'place_type': 'unknown'})
    df['place_type'] = df.apply(lambda r: get_place_type(r['place_type']),
                                axis=1)
    df = df.sort_values(by=['country', 'region', 'date'],
                        ascending=[True, True, False])

    def fill_deaths(row):
        # The deaths dataset labels countries as 'nation', so translate the
        # place type before looking up the (area, type, date) index.
        area_type = row['place_type']
        if area_type == 'country':
            area_type = 'nation'
        key = (row['region'], area_type, row['date'])
        if key not in deaths_df.index:
            return np.NaN
        return deaths_df.loc[key]['cumDeaths28DaysByDeathDate']

    df['deaths'] = df.apply(fill_deaths, axis=1)
    df = df[headers]

    countries = {}
    for country in df['country'].unique():
        is_country_data = df['region'] == country
        is_not_country_data = df['region'] != country
        is_current_country = df['country'] == country
        country_filename = country.lower().replace(' ', '_') + '.csv'
        country_file = path.join(uk_dir, country_filename)
        countries[country] = country_filename
        # Write the country-level rows first, then append the regional
        # breakdown to the same file.
        country_df = df[is_country_data]
        country_df.to_csv(country_file, index=False, float_format='%.f')
        regions_df = df[is_current_country & is_not_country_data]
        regions_df.to_csv(country_file, index=False, header=False,
                          float_format='%.f', mode='a')

    england_df = pd.read_csv(england_cases_by_area_url, parse_dates=[3])
    england_df = england_df.rename(columns={
        'Area name': 'region',
        'Area code': 'geo_code',
        'Cumulative lab-confirmed cases': 'cases',
        'Area type': 'place_type',
        'Specimen date': 'date'
    })
    england_df = england_df.fillna(value={'place_type': 'unknown'})
    england_df['country'] = 'England'
    england_df['place_type'] = england_df.apply(
        lambda r: get_place_type(r['place_type']), axis=1)
    england_df['deaths'] = england_df.apply(fill_deaths, axis=1)
    england_df = england_df.sort_values(by=['country', 'region', 'date'],
                                        ascending=[True, True, False])
    england_df = england_df[headers]

    england_filename = 'england.csv'
    england_file = path.join(uk_dir, england_filename)
    countries['England'] = england_filename
    england_df[england_df['place_type'] == 'country'].to_csv(
        england_file, index=False, float_format='%.f')
    england_df[england_df['place_type'] != 'country'].to_csv(
        england_file, index=False, float_format='%.f', header=False,
        mode='a')

    with open(path.join(uk_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents(countries))
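# `get_place_type` normalizes the sources' area labels into this dataset's
# vocabulary (fill_deaths above implies it maps 'nation' to 'country'). It
# is defined elsewhere, so the mapping below is an illustrative guess at
# its shape rather than the actual table:
def get_place_type(raw_type):
    mapping = {
        'nation': 'country',
        'region': 'region',
        'utla': 'utla',  # upper-tier local authority
        'ltla': 'ltla',  # lower-tier local authority
    }
    return mapping.get(raw_type.lower(), raw_type)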
def scrape_chile():
    cwd = getcwd()
    chile_dir = path.join(cwd, 'data', 'chile')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(chile_dir, tmp_dir)

    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    not_number_regexp = re.compile(r'\D')

    per_region_table = None
    tables = soup.find_all('table')
    for table in tables:
        headers = table.find_all('th')
        if len(headers) > 0 and 'Regiones' in headers[0].get_text():
            per_region_table = table
            break

    updated_files = []
    header = 'date,region,region_iso,province,city,place_type,cases,deaths\n'
    # Slice off the leading header rows and the trailing totals row.
    for tr in per_region_table.find_all('tr')[2:-1]:
        cols = [td.get_text() for td in tr.find_all('td')]
        if len(cols) != 6:
            continue
        iso = None
        for region in REGION_ISO:
            if region in cols[0]:
                iso = REGION_ISO[region]
                break
        if iso is None:
            continue
        region = ISO_REGION[iso]
        line = ','.join([
            today,
            region,
            iso,
            '',
            '',
            'region',
            not_number_regexp.sub('', cols[2]),
            not_number_regexp.sub('', cols[4]),
        ])
        region_file = path.join(chile_dir, f'{iso.lower()}.csv')
        is_new = not path.exists(region_file)
        with open(region_file, 'a+') as f:
            if is_new:
                f.write(header)
            f.write(f'{line}\n')
        if not is_new:
            updated_files.append(region_file)

    ensure_consistency(updated_files, lambda row: row[:5])

    with open(path.join(chile_dir, 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
def scrape_additional():
    cwd = getcwd()
    ensure_dirs(path.join(cwd, 'data', 'sweden', 'additional'))
    scrape_cases_by_age()
    scrape_deaths_by_age()