# Shared imports for the scrapers below. Each scraper also relies on
# module-level constants (URL, DATA_PER_COUNTY, the *_ISO / *_MAPPED tables)
# and helpers (ensure_dirs, ensure_consistency, get, get_readme_contents)
# defined elsewhere in the repo.
import datetime
import re
from os import getcwd, path

import requests
from bs4 import BeautifulSoup


def scrape_by_age(url, filename):
    today = str(datetime.date.today())
    r = requests.get(url)
    data = r.json()
    header = 'date,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-90,90+,unknown'
    data_dict = {'date': today}
    for feat in data['features']:
        group = feat['attributes']['Åldersgrupp2']
        value = feat['attributes']['value']
        if group == 'Uppgift saknas':  # Swedish for 'information missing'
            data_dict['unknown'] = value
            continue
        group = group.replace('år', ' ').strip()  # drop the Swedish word for 'years'
        data_dict[group] = value
    # Emit one value per header column, blank where the feed had no entry.
    today_line = ','.join([
        str(data_dict[k]) if k in data_dict else ''
        for k in header.split(',')
    ])
    cases_by_age_file = path.join(getcwd(), 'data', 'sweden', 'additional', filename)
    # is_empty means the file does not exist yet, i.e. we are creating it now.
    is_empty = not path.exists(cases_by_age_file)
    with open(cases_by_age_file, 'a+') as f:
        if is_empty:
            f.write(header + '\n')
        f.write(today_line + '\n')
    if not is_empty:
        ensure_consistency([cases_by_age_file], lambda a: a[0])
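
# `ensure_consistency` is used throughout this module but defined elsewhere in
# the repo. A minimal sketch of one plausible implementation, assuming its
# contract is: for each CSV, drop duplicate data rows sharing the same key (as
# computed by `key_fn` from the split row), keeping the last occurrence, so
# re-running a scraper on the same day does not leave duplicate lines. This is
# an illustrative assumption, not the repo's actual implementation.
def _ensure_consistency_sketch(csv_files, key_fn):
    for csv_file in csv_files:
        with open(csv_file) as f:
            lines = [line.rstrip('\n') for line in f if line.strip()]
        header, rows = lines[0], lines[1:]
        deduped = {}
        for row in rows:
            key = key_fn(row.split(','))
            # key_fn may return a string (lambda a: a[0]) or a list (lambda row: row[:4]).
            key = tuple(key) if isinstance(key, list) else key
            deduped[key] = row  # later rows win: a re-run replaces today's earlier row
        with open(csv_file, 'w') as f:
            f.write(header + '\n')
            for row in deduped.values():
                f.write(row + '\n')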

def scrape_uruguay():
    cwd = getcwd()
    uruguay_dir = path.join(cwd, 'data', 'uruguay')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(uruguay_dir, tmp_dir)
    not_number_regexp = re.compile(r'\D')
    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Locate the per-department table by its first header cell.
    tables = soup.find_all('table', {'class': 'wikitable'})
    per_departament_table = None
    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and 'Departamento' == headers[0]:
            per_departament_table = table
    updated_files = []
    header = 'date,iso,region,city,place_type,cases,deaths,recovered\n'
    for tr in per_departament_table.tbody.find_all('tr'):
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 5:  # skip header/summary rows
            continue
        departament = cols[0]
        iso = DEPARTAMENT_ISO[departament]
        line = ','.join([
            today,
            iso,
            departament,
            '',
            'departamento',
            not_number_regexp.sub('', cols[1]),  # cases
            not_number_regexp.sub('', cols[3]),  # deaths
            not_number_regexp.sub('', cols[2]),  # recovered
        ])
        departament_file = path.join(uruguay_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(departament_file)
        with open(departament_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')
        if not is_empty:
            updated_files.append(departament_file)
    ensure_consistency(updated_files, lambda row: row[:4])
    with open(path.join(getcwd(), 'data', 'uruguay', 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
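
# `ensure_dirs` is likewise defined elsewhere in the repo. Assuming its job is
# simply to create each directory if it does not already exist, a sketch could
# look like this (the name and behavior are assumptions for illustration):
from os import makedirs

def _ensure_dirs_sketch(*dirs):
    for d in dirs:
        makedirs(d, exist_ok=True)  # no-op when the directory already exists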

def scrape_ecuador():
    cwd = getcwd()
    ecuador_dir = path.join(cwd, 'data', 'ecuador')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(ecuador_dir, tmp_dir)
    not_number_regexp = re.compile(r'\D')
    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Locate the per-province table by its first header cell.
    tables = soup.find_all('table', {'class': 'sortable'})
    per_province_table = None
    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and 'Provincias' == headers[0]:
            per_province_table = table
    updated_files = []
    header = 'date,iso,province,city,place_type,cases,deaths\n'
    for tr in per_province_table.tbody.find_all('tr'):
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 3:  # skip header/summary rows
            continue
        province = cols[0]
        iso = PROVINCE_ISO[province]
        line = ','.join([
            today,
            iso,
            province,
            '',
            'unknown' if iso == 'UNK' else 'province',
            not_number_regexp.sub('', cols[1]),  # cases
            not_number_regexp.sub('', cols[2]),  # deaths
        ])
        province_file = path.join(ecuador_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(province_file)
        with open(province_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')
        if not is_empty:
            updated_files.append(province_file)
    ensure_consistency(updated_files, lambda row: row[:4])
    with open(path.join(getcwd(), 'data', 'ecuador', 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())

def scrape_by_counties():
    cwd = getcwd()
    sweden_dir = path.join(cwd, 'data', 'sweden')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(sweden_dir, tmp_dir)
    today = str(datetime.date.today())
    r = requests.get(DATA_PER_COUNTY)
    data = r.json()
    updated_county_files = []
    header = 'date,county,county_iso,city,place_type,cases,deaths,estimated_population_2019,area_km2,confirmed_per_100k_inhabitants,critical\n'
    for feat in data['features']:
        attributes = feat['attributes']
        county = attributes['Region']
        iso = COUNTY_ISO_MAPPED[county].lower()
        confirmed = attributes['Totalt_antal_fall']             # total cases
        deaths = attributes['Totalt_antal_avlidna']             # total deaths
        confirmed_per_100k = attributes['Fall_per_100000_inv']  # cases per 100k inhabitants
        critical = attributes['Totalt_antal_intensivvårdade']   # total ICU admissions
        line = ','.join([
            today,
            county,
            iso.upper(),
            '',
            'county',
            str(confirmed),
            str(deaths),
            str(COUNTY_POPULATION_MAPPED[county]),
            str(COUNTY_AREA_MAPPED[county]),
            str(confirmed_per_100k),
            str(critical) if critical is not None else '',
        ])
        county_file = path.join(sweden_dir, f'{iso}.csv')
        is_empty = not path.exists(county_file)
        with open(county_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')
        if not is_empty:
            updated_county_files.append(county_file)
    ensure_consistency(updated_county_files, lambda a: a[:5])

def scrape_argentina():
    cwd = getcwd()
    argentina_dir = path.join(cwd, 'data', 'argentina')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(argentina_dir, tmp_dir)
    page = requests.get(URL).json()
    updated_files = []
    header = 'date,region_iso,region,province,city,place_type,cases,deaths,recovered\n'
    for dep in page:
        if dep['provincia-key'] == 'totales':  # skip the country-wide totals entry
            continue
        region = CODE_REGION[dep['provincia-key']]
        # 'ultima-actualizacion' is DD/MM/YYYY; keep only the ISO date part.
        day = str(datetime.datetime.strptime(dep['ultima-actualizacion'], '%d/%m/%Y'))[:10]
        iso = REGION_ISO[region]
        confirmed = get(dep, 'Afectados', '0')
        deaths = get(dep, 'Muertos', '0')
        recovered = get(dep, 'Recuperados', '0')
        line = ','.join([
            day,
            iso,
            region,
            '',
            '',
            'unknown' if iso == 'UNK' else 'provincia',
            str(confirmed),
            str(deaths),
            str(recovered),
        ])
        region_file = path.join(argentina_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(region_file)
        with open(region_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')
        if not is_empty:
            updated_files.append(region_file)
    ensure_consistency(updated_files, lambda row: row[:5])
    with open(path.join(getcwd(), 'data', 'argentina', 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
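
# The calls `get(dep, 'Afectados', '0')` above suggest `get` is a tolerant
# lookup helper that falls back to a default when the key is missing or holds
# an empty value. That is an assumption; the real helper lives elsewhere in
# the repo. A sketch under that assumption:
def _get_sketch(record, key, default):
    value = record.get(key)
    return value if value not in (None, '') else default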

def scrape_peru():
    cwd = getcwd()
    peru_dir = path.join(cwd, 'data', 'peru')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(peru_dir, tmp_dir)
    not_number_regexp = re.compile(r'\D')
    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Locate the per-department table by its first header cell.
    tables = soup.find_all('table', {'class': 'sortable'})
    per_departament_table = None
    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and 'Departamento' in headers[0]:
            per_departament_table = table
    updated_files = []
    header = 'date,iso,region,city,place_type,cases,deaths\n'
    mapped = {}
    for tr in per_departament_table.tbody.find_all('tr'):
        headers = [th.get_text().strip() for th in tr.find_all('th')]
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 10:  # skip header/summary rows
            continue
        departament = headers[0]
        cases = int(not_number_regexp.sub('', cols[1]))
        deaths = int(not_number_regexp.sub('', cols[3]))
        if 'Lima' in departament:
            # The table splits Lima across two rows; buffer the first and emit
            # the combined totals once the second row is reached.
            departament = 'Lima'
            if 'Lima' in mapped:
                _cases, _deaths = mapped['Lima']
                cases += _cases
                deaths += _deaths
            else:
                mapped['Lima'] = (cases, deaths)
                continue
        iso = DEPARTAMENT_ISO[departament]
        line = ','.join([
            today,
            iso,
            departament,
            '',
            'departamento',
            str(cases),
            str(deaths),
        ])
        departament_file = path.join(peru_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(departament_file)
        with open(departament_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')
        if not is_empty:
            updated_files.append(departament_file)
    ensure_consistency(updated_files, lambda row: row[:4])
    with open(path.join(getcwd(), 'data', 'peru', 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())

def scrape_chile():
    cwd = getcwd()
    chile_dir = path.join(cwd, 'data', 'chile')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(chile_dir, tmp_dir)
    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    not_number_regexp = re.compile(r'\D')
    # Locate the per-region table by its first header cell.
    per_region_table = None
    tables = soup.find_all('table')
    for table in tables:
        headers = table.find_all('th')
        if len(headers) > 0 and 'Regiones' in headers[0].get_text():
            per_region_table = table
            break
    updated_files = []
    header = 'date,region,region_iso,province,city,place_type,cases,deaths\n'
    # Skip the two header rows and the trailing totals row.
    for tr in per_region_table.find_all('tr')[2:-1]:
        cols = [td.get_text() for td in tr.find_all('td')]
        if len(cols) != 6:
            continue
        iso = None
        for region in REGION_ISO:
            if region in cols[0]:
                iso = REGION_ISO[region]
                break
        if iso is None:
            continue
        region = ISO_REGION[iso]
        line = ','.join([
            today,
            region,
            iso,
            '',
            '',
            'region',
            not_number_regexp.sub('', cols[2]),  # cases
            not_number_regexp.sub('', cols[4]),  # deaths
        ])
        region_file = path.join(chile_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(region_file)
        with open(region_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')
        if not is_empty:
            updated_files.append(region_file)
    ensure_consistency(updated_files, lambda row: row[:5])
    with open(path.join(getcwd(), 'data', 'chile', 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
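
# Every scraper above repeats the same append-or-create pattern: write the
# header on first touch, append today's row, and mark pre-existing files for
# the consistency pass (newly created files cannot contain rows from earlier
# runs). A purely illustrative refactoring sketch, not part of the repo, that
# the per-country loops could call instead:
def _append_row_sketch(csv_file, header, line, updated_files):
    file_is_new = not path.exists(csv_file)
    with open(csv_file, 'a+') as f:
        if file_is_new:
            f.write(header)  # header strings above already end with '\n'
        f.write(f'{line}\n')
    if not file_is_new:
        updated_files.append(csv_file)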