Code example #1
def scrape_by_age(url, filename):
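    # Build one CSV row for today: each feature in the JSON response maps an
    # age group ('Åldersgrupp2') to a value, 'Uppgift saknas' ("no data")
    # goes into the 'unknown' column, and the header is written on first run.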
    today = str(datetime.date.today())
    r = requests.get(url)
    data = r.json()

    header = 'date,0-9,10-19,20-29,30-39,40-49,50-59,60-69,70-79,80-90,90+,unknown'

    data_dict = {'date': today}
    for feat in data['features']:
        group = feat['attributes']['Åldersgrupp2']
        value = feat['attributes']['value']

        if group == 'Uppgift saknas':
            data_dict['unknown'] = value
            continue

        group = group.replace('år', ' ').strip()
        data_dict[group] = value

    today_line = ','.join([
        str(data_dict[k]) if k in data_dict else '' for k in header.split(',')
    ])

    cases_by_age_file = path.join(getcwd(), 'data', 'sweden', 'additional',
                                  filename)
    is_empty = not path.exists(cases_by_age_file)

    with open(cases_by_age_file, 'a+') as f:
        if is_empty:
            f.write(header + '\n')
        f.write(today_line + '\n')

    if not is_empty:
        ensure_consistency([cases_by_age_file], lambda a: a[0])
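All of the snippets on this page come from the same scraper project and rely on module-level imports and shared helpers that the excerpts do not show. A minimal sketch of that preamble, with the helper signatures inferred from their call sites (ensure_dirs, ensure_consistency, get and get_readme_contents are project-specific and only described here, not implemented):

import datetime
import re
from os import getcwd, path

import requests
from bs4 import BeautifulSoup

# Project helpers, described from their call sites (assumed, not shown here):
#   ensure_dirs(*dirs)                 - create each directory if it is missing
#   ensure_consistency(files, key_fn)  - normalise/de-duplicate CSV rows keyed by key_fn
#   get(d, key, default)               - dict lookup with a fallback value
#   get_readme_contents()              - render the per-country README text
# URL / DATA_PER_COUNTY and the *_ISO / *_MAPPED tables are per-module constants.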
Code example #2
def scrape_uruguay():
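    # Find the 'wikitable' whose first header is 'Departamento' and append one
    # row per department (cases, deaths, recovered) to data/uruguay/<iso>.csv,
    # then rewrite the per-country README.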
    cwd = getcwd()
    uruguay_dir = path.join(cwd, 'data', 'uruguay')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(uruguay_dir, tmp_dir)

    not_number_regexp = re.compile(r'\D')

    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    tables = soup.find_all('table', {'class': 'wikitable'})
    per_departament_table = None

    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and 'Departamento' == headers[0]:
            per_departament_table = table

    updated_files = []
    header = 'date,iso,region,city,place_type,cases,deaths,recovered\n'

    for tr in per_departament_table.tbody.find_all('tr'):
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 5:
            continue

        departament = cols[0]
        iso = DEPARTAMENT_ISO[departament]

        line = ','.join([
            today,
            iso,
            departament,
            '',
            'departamento',
            not_number_regexp.sub('', cols[1]),
            not_number_regexp.sub('', cols[3]),
            not_number_regexp.sub('', cols[2]),
        ])

        departament_file = path.join(uruguay_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(departament_file)

        with open(departament_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')

        if not is_empty:
            updated_files.append(departament_file)

    ensure_consistency(updated_files, lambda row: row[:4])

    with open(path.join(getcwd(), 'data', 'uruguay', 'README.md'),
              'w') as readme_f:
        readme_f.write(get_readme_contents())
Code example #3
File: ecuador.py  Project: johnfelipe/data
def scrape_ecuador():
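    # Same pattern as the other country scrapers: find the sortable table
    # whose first header is 'Provincias' and append one row per province to
    # data/ecuador/<iso>.csv; rows mapped to 'UNK' get place_type 'unknown'.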
    cwd = getcwd()
    ecuador_dir = path.join(cwd, 'data', 'ecuador')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(ecuador_dir, tmp_dir)

    not_number_regexp = re.compile(r'\D')

    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    tables = soup.find_all('table', {'class': 'sortable'})
    per_province_table = None

    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and 'Provincias' == headers[0]:
            per_province_table = table

    updated_files = []
    header = 'date,iso,province,city,place_type,cases,deaths\n'

    for tr in per_province_table.tbody.find_all('tr'):
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 3:
            continue

        province = cols[0]
        iso = PROVINCE_ISO[province]

        line = ','.join([
            today,
            iso,
            province,
            '',
            'unknown' if iso == 'UNK' else 'province',
            not_number_regexp.sub('', cols[1]),
            not_number_regexp.sub('', cols[2]),
        ])

        province_file = path.join(ecuador_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(province_file)

        with open(province_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')

        if not is_empty:
            updated_files.append(province_file)

    ensure_consistency(updated_files, lambda row: row[:4])

    with open(path.join(getcwd(), 'data', 'ecuador', 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
Code example #4
def scrape_by_counties():
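    # Fetch the per-county JSON feed (DATA_PER_COUNTY) and append one row per
    # Swedish county, joining in the static population and area figures kept
    # in the COUNTY_POPULATION_MAPPED / COUNTY_AREA_MAPPED tables.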
    cwd = getcwd()
    sweden_dir = path.join(cwd, 'data', 'sweden')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(sweden_dir, tmp_dir)

    today = str(datetime.date.today())
    r = requests.get(DATA_PER_COUNTY)
    data = r.json()

    updated_county_files = []
    header = 'date,county,county_iso,city,place_type,cases,deaths,estimated_population_2019,area_km2,confirmed_per_100k_inhabitants,critical\n'

    for feat in data['features']:
        attributes = feat['attributes']

        county = attributes['Region']
        iso = COUNTY_ISO_MAPPED[county].lower()
        confirmed = attributes['Totalt_antal_fall']
        deaths = attributes['Totalt_antal_avlidna']
        confirmed_per_100k = attributes['Fall_per_100000_inv']
        critical = attributes['Totalt_antal_intensivvårdade']

        line = ','.join([
            today,
            county,
            iso.upper(),
            '',
            'county',
            str(confirmed),
            str(deaths),
            str(COUNTY_POPULATION_MAPPED[county]),
            str(COUNTY_AREA_MAPPED[county]),
            str(confirmed_per_100k),
            str(critical) if critical is not None else '',
        ])

        county_file = path.join(sweden_dir, f'{iso}.csv')
        is_empty = not path.exists(county_file)

        with open(county_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')

        if not is_empty:
            updated_county_files.append(county_file)

    ensure_consistency(updated_county_files, lambda a: a[:5])
Code example #5
File: argentina.py  Project: johnfelipe/data
def scrape_argentina():
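    # The source is already JSON: one entry per province keyed by
    # 'provincia-key', each carrying its own 'ultima-actualizacion' date, so
    # rows use the province's reported date rather than today's date.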
    cwd = getcwd()
    argentina_dir = path.join(cwd, 'data', 'argentina')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(argentina_dir, tmp_dir)

    page = requests.get(URL).json()

    updated_files = []
    header = 'date,region_iso,region,province,city,place_type,cases,deaths,recovered\n'
    for dep in page:
        if dep['provincia-key'] == 'totales':
            continue
        region = CODE_REGION[dep['provincia-key']]
        day = str(
            datetime.datetime.strptime(dep['ultima-actualizacion'],
                                       '%d/%m/%Y'))[:10]
        iso = REGION_ISO[region]
        confirmed = get(dep, 'Afectados', '0')
        deaths = get(dep, 'Muertos', '0')
        recovered = get(dep, 'Recuperados', '0')
        line = ','.join([
            day, iso, region, '', '',
            'unknown' if iso == 'UNK' else 'provincia',
            str(confirmed),
            str(deaths),
            str(recovered)
        ])

        region_file = path.join(argentina_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(region_file)

        with open(region_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')

        if not is_empty:
            updated_files.append(region_file)

    ensure_consistency(updated_files, lambda row: row[:5])

    with open(path.join(getcwd(), 'data', 'argentina', 'README.md'),
              'w') as readme_f:
        readme_f.write(get_readme_contents())
Code example #6
File: peru.py  Project: johnfelipe/data
def scrape_peru():
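    # In this table the department names sit in <th> cells and the figures in
    # <td> cells; rows whose name contains 'Lima' are merged into a single
    # 'Lima' row (see the comment below).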
    cwd = getcwd()
    peru_dir = path.join(cwd, 'data', 'peru')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(peru_dir, tmp_dir)

    not_number_regexp = re.compile(r'\D')

    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    tables = soup.find_all('table', {'class': 'sortable'})
    per_departament_table = None

    for table in tables:
        headers = [th.get_text().strip() for th in table.find_all('th')]
        if len(headers) > 0 and 'Departamento' in headers[0]:
            per_departament_table = table

    updated_files = []
    header = 'date,iso,region,city,place_type,cases,deaths\n'

    mapped = {}

    for tr in per_departament_table.tbody.find_all('tr'):
        headers = [th.get_text().strip() for th in tr.find_all('th')]
        cols = [td.get_text().strip() for td in tr.find_all('td')]
        if len(cols) != 10:
            continue

        departament = headers[0]

        cases = int(not_number_regexp.sub('', cols[1]))
        deaths = int(not_number_regexp.sub('', cols[3]))

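        # Rows whose department contains 'Lima' are merged: the first one is
        # buffered in `mapped` and its counts are added to the next, so only
        # a single combined 'Lima' row gets written.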
        if 'Lima' in departament:
            departament = 'Lima'
            if 'Lima' in mapped:
                _cases, _deaths = mapped['Lima']
                cases += _cases
                deaths += _deaths
            else:
                mapped['Lima'] = (cases, deaths)
                continue

        iso = DEPARTAMENT_ISO[departament]

        line = ','.join([
            today,
            iso,
            departament,
            '',
            'departamento',
            str(cases),
            str(deaths),
        ])

        departament_file = path.join(peru_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(departament_file)

        with open(departament_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')

        if not is_empty:
            updated_files.append(departament_file)

    ensure_consistency(updated_files, lambda row: row[:4])

    with open(path.join(getcwd(), 'data', 'peru', 'README.md'), 'w') as readme_f:
        readme_f.write(get_readme_contents())
Code example #7
def scrape_chile():
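    # Find the table whose first header mentions 'Regiones'; rows are taken
    # with [2:-1] to skip the leading header rows and the trailing row
    # (presumably a totals line), and matched against the REGION_ISO lookup.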
    cwd = getcwd()
    chile_dir = path.join(cwd, 'data', 'chile')
    tmp_dir = path.join(cwd, 'tmp')
    ensure_dirs(chile_dir, tmp_dir)

    today = str(datetime.date.today())
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    not_number_regexp = re.compile(r'\D')

    per_region_table = None
    tables = soup.find_all('table')

    for table in tables:
        headers = table.find_all('th')
        if len(headers) > 0 and 'Regiones' in headers[0].get_text():
            per_region_table = table
            break

    updated_files = []
    header = 'date,region,region_iso,province,city,place_type,cases,deaths\n'
    for tr in per_region_table.find_all('tr')[2:-1]:
        cols = [td.get_text() for td in tr.find_all('td')]
        if len(cols) != 6:
            continue

        iso = None
        for region in REGION_ISO:
            if region in cols[0]:
                iso = REGION_ISO[region]
                break

        if iso is None:
            continue

        region = ISO_REGION[iso]

        line = ','.join([
            today,
            region,
            iso,
            '',
            '',
            'region',
            not_number_regexp.sub('', cols[2]),
            not_number_regexp.sub('', cols[4]),
        ])

        region_file = path.join(chile_dir, f'{iso.lower()}.csv')
        is_empty = not path.exists(region_file)

        with open(region_file, 'a+') as f:
            if is_empty:
                f.write(header)
            f.write(f'{line}\n')

        if not is_empty:
            updated_files.append(region_file)

    ensure_consistency(updated_files, lambda row: row[:5])

    with open(path.join(getcwd(), 'data', 'chile', 'README.md'),
              'w') as readme_f:
        readme_f.write(get_readme_contents())
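Every scraper above repeats the same append-or-create step: write the CSV header when the file is new, append today's line, and only hand the file to ensure_consistency when it already existed. A hedged sketch of how that step could be factored out (append_row is a hypothetical helper, not part of the project):

from os import path

def append_row(csv_path, header, line):
    # Append one CSV row, writing the header first when the file is new.
    # Returns True when the file already existed, i.e. when the caller should
    # pass it on to ensure_consistency afterwards.
    existed = path.exists(csv_path)
    with open(csv_path, 'a+') as f:
        if not existed:
            f.write(header)
        f.write(f'{line}\n')
    return existed

With that helper, the loop body of each scraper reduces to building line and calling:

if append_row(region_file, header, line):
    updated_files.append(region_file)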