def __get_labels_to_admin_1(self):
    """
    Build lookup tables mapping region labels to their admin-1 region child
    codes, from the geodata JSON files in world_geodata/output.

    Returns a 3-tuple:
      * english: {(Schemas member, region_parent or None, lowercased English
        label): region_child}
      * non_english: {(Schemas member, region_parent or None, non-English
        label): region_child}
      * all_possible: set of (Schemas member, lowercased region_parent,
        lowercased region_child) for every region seen
    """
    english = {}
    non_english = {}
    all_possible = set()
    dir_ = get_package_dir() / 'world_geodata' / 'output'
    for fnam in listdir(dir_):
        with open(dir_ / fnam, 'r', encoding='utf-8') as f:
            data = json.loads(f.read())
        for region_schema, schema_dict in data.items():
            # These schemas are deliberately excluded from label matching
            if region_schema.upper() in ('FR_DEPARTMENT', 'LK_DISTRICT',
                                         'ES_MADRID_MUNICIPALITY'):
                continue  # HACK!
            region_schema = Schemas(region_schema)
            for region_parent, region_parent_dict in schema_dict.items():
                for region_child, region_child_item in region_parent_dict.items():
                    # `region_parent or None` collapses '' to None so blank
                    # parents key consistently
                    if region_child_item['label']['en'] is not None:
                        english[region_schema,
                                region_parent or None,
                                region_child_item['label']['en'].strip().lower()] = region_child
                    for label in self.__iter_non_english_labels(region_child_item['label']):
                        non_english[region_schema,
                                    region_parent or None,
                                    label] = region_child
                    # NOTE: unlike above, the raw (possibly empty) parent
                    # string is used here, only lowercased
                    all_possible.add((region_schema,
                                      region_parent.lower(),
                                      region_child.lower()))
    return english, non_english, all_possible
def get_schema_types(self):
    """
    Read schema_types.json and augment it with run-specific data:
    underlays, boundaries, listings, per-datatype update dates and the
    revision output path. ISO 3166 codes are normalised to lowercase.
    """
    json_path = get_package_dir() / 'covid_db' / 'datatypes' / 'schema_types.json'
    with open(json_path, 'r', encoding='utf-8') as f:
        types_dict = json.loads(f.read())

    types_dict['underlays'] = self.__get_underlay_key()
    types_dict['boundaries'] = self.__get_boundaries()
    types_dict['listings'] = self.__get_listings()
    types_dict['updated_dates_by_datatype'] = self.updated_dates_by_datatype
    types_dict['output_path'] = f'{self.time_format}-{self.revision_id}'

    # Normalise ISO 3166 codes to lowercase in-place
    for schema_dict in types_dict['schemas'].values():
        if schema_dict['iso_3166']:
            schema_dict['iso_3166'] = schema_dict['iso_3166'].lower()

    return types_dict
def create_indexes(self):
    """
    Create the database indexes from covid_db/indexes.sql.

    Indexes are created after inserting to improve bulk-insert performance.
    """
    # FIX: use a context manager so the SQL file handle is closed
    # (it was previously opened and never closed)
    with open(get_package_dir() / 'covid_db' / 'indexes.sql',
              'r', encoding='utf-8') as f:
        sql = f.read()
    with self.conn:
        self.conn.executescript(sql)
    self.conn.commit()
def __create_tables(self):
    """
    Create the datapoint tables in the SQLite DB at ``self.path``
    from covid_db/datapoints.sql.
    """
    # FIX: use a context manager so the SQL file handle is closed
    # (it was previously opened and never closed)
    with open(get_package_dir() / 'covid_db' / 'datapoints.sql',
              'r', encoding='utf-8') as f:
        sql = f.read()
    conn = sqlite3.connect(self.path)
    try:
        # `with conn` commits on success / rolls back on error
        with conn:
            conn.executescript(sql)
    finally:
        # FIX: ensure the connection is closed even if executescript raises
        conn.close()
def get_population_map():
    """
    Read world_geodata/geojson_pop.tsv and return a dict of
    (region_schema, region_parent, region_child) -> 2020 population (int).
    """
    tsv_path = get_package_dir() / 'world_geodata' / 'geojson_pop.tsv'
    with open(tsv_path, 'r', encoding='utf-8') as f:
        return {
            (row['region_schema'], row['region_parent'], row['region_child']):
                int(row['pop_2020'])
            for row in csv.DictReader(f, delimiter='\t')
        }
def _get_mappings_to_iso_3166():
    """
    Read covid_db/datatypes/schema_mappings.csv (tab-separated) and return a
    dict mapping (original Schemas member, original parent, original child)
    to the corresponding ISO 3166-normalised (Schemas member, parent, child).
    """
    out = {}
    csv_path = get_package_dir() / 'covid_db' / 'datatypes' / 'schema_mappings.csv'
    with open(csv_path, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f, delimiter='\t'):
            source_key = (Schemas(row['original_schema'].strip()),
                          row['original_parent'].strip(),
                          row['original_child'].strip())
            target = (Schemas(row['schema'].strip()),
                      row['parent'].strip(),
                      row['child'].strip())
            out[source_key] = target
    return out
def get_worksheet():
    """
    Authorize against the Google Sheets API using the case_locations
    service-account credentials and return the first worksheet of the
    shared case-locations spreadsheet.
    """
    creds_path = str(get_package_dir() / 'case_locations' / 'credentials.json')
    client = pygsheets.authorize(service_file=creds_path)
    spreadsheet = client.open_by_url(
        'https://docs.google.com/spreadsheets/d/'
        '1ddw3GqI4RsrphdjJcYX-llnQ-JTV_q2atOr1UToLbw0/edit#gid=0')
    return spreadsheet[0]
def get_tokyo_cities_to_en_map():
    """
    Read jp_tokyo_data/tokyo_cities.csv (tab-separated) and return a dict
    mapping Japanese city names to English names. For each name, a variant
    with the trailing administrative suffix (区/町/村/市/島) removed is
    also added, so lookups work with or without the suffix.
    """
    r = {}
    csv_path = (get_package_dir() / 'covid_crawlers' / 'se_asia' /
                'jp_tokyo_data' / 'tokyo_cities.csv')
    with open(csv_path, 'r', encoding='utf-8') as f:
        for item in csv.DictReader(f, delimiter='\t'):
            ja = item['ja'].strip()
            en = item['en'].strip()
            r[ja] = en
            for suffix in '区町村市島':
                # FIX: strip exactly one trailing suffix character.
                # str.rstrip(c) would remove a whole run of repeated
                # trailing chars, and also caused redundant re-assignments
                # when the name didn't end with the suffix at all.
                if ja.endswith(suffix):
                    r[ja[:-1]] = en
    return r
def get_population_dict(year):
    """
    Return country populations for a given year from World Bank data.

    FIX: the previous docstring documented non-existent parameters
    (``country``, ``from_year``, ``to_year``) and a tuple return value.

    :param year: the calendar year (int) whose population column to read
    :return: dict of lowercase ISO 3166-1 alpha-2 code -> population (int)
    """
    r = {}
    csv_path = (get_package_dir() /
                'other_data/world_bank_data/world_bank_popdata.csv')
    # utf-8-sig: the World Bank CSV starts with a BOM
    with open(csv_path, 'r', encoding='utf-8-sig') as f:
        for item in csv.DictReader(f):
            for k, v in item.items():
                # Only numeric column headers are year columns
                try:
                    int(k)
                except ValueError:
                    continue
                if int(k) != year:
                    continue

                try:
                    data_item = get_data_item_by_code(item['Country Code'])
                except KeyError:
                    # Unknown code (e.g. aggregate rows) - report and skip
                    print("KeyError:", item['Country Code'])
                    continue

                if v:
                    r[data_item.iso3166.a2.lower()] = int(v)
                elif data_item.iso3166.a2 == 'ER':
                    # HACK! Eritrea has no World Bank value; hardcoded
                    r[data_item.iso3166.a2.lower()] = 3213972
                else:
                    print("No data:", data_item.iso3166.a2)
    return r
def get_districts_map():
    """
    Read th_data/th_districts.csv (tab-separated) and return a dict mapping
    Thai district names to English names. Variants without the เขต (urban
    district) / อำเภอ (district) prefixes are also added so lookups work
    either way. Only rows with Status 'District' or 'City District' are used.
    """
    r = {}
    csv_path = (get_package_dir() / 'covid_crawlers' / 'se_asia' /
                'th_data' / 'th_districts.csv')
    with open(csv_path, 'r', encoding='utf-8') as f:
        for item in csv.DictReader(f, delimiter='\t'):
            if item['Status'].strip() not in ('District', 'City District'):
                continue
            native = item['Native'].strip()
            name = item['Name'].strip()
            # FIX: the duplicate check previously used the *unstripped*
            # native name while keys are stored stripped, so it could
            # miss real conflicts; the assert message also indexed
            # r[item['Native']] directly, which could itself raise
            # KeyError instead of showing the conflict.
            assert r.get(native, name) == name, (item, r.get(native))
            r[native] = name
            r[native.replace('เขต', '')] = name
            r[native.replace('อำเภอ', '')] = name
    return r
def _get_data_items():
    """
    Return a list of DataItem subdivision records: everything from the
    IP2LOCATION ISO 3166-2 CSV, plus any pycountry subdivisions not
    already present in that file.
    """
    r = []
    added = set()
    csv_path = (get_package_dir() / 'misc_data_scripts' / 'other_data' /
                'iso_3166_2' / 'IP2LOCATION-ISO3166-2.CSV')
    # FIX: use a context manager so the CSV file handle is closed
    # (it was previously opened and never closed)
    with open(csv_path, 'r', encoding='utf-8') as f:
        for item in csv.DictReader(f):
            r.append(DataItem(**item))
            added.add((r[-1].code, r[-1].subdivision_name))

    # Fill in any subdivisions pycountry knows about that the CSV lacks
    for subdivision in pycountry.subdivisions:
        if (subdivision.code, subdivision.name) in added:
            continue
        r.append(DataItem(country_code=subdivision.country.alpha_2,
                          subdivision_name=subdivision.name,
                          code=subdivision.code))
    return r
def __get_boundaries(self):
    """
    Build a dict of lowercased region child -> boundary, computed from the
    geodata in the admin_0/admin_1 JSON files in world_geodata/output.
    """
    out = {}
    geo_dir = get_package_dir() / 'world_geodata' / 'output'
    for fnam in listdir(geo_dir):
        # Only country (admin_0) and first-level subdivision (admin_1) files
        if fnam != 'admin_0.json' and not fnam.startswith('admin_1'):
            continue
        with open(geo_dir / fnam, 'r', encoding='utf-8') as f:
            data = json.loads(f.read())
        for schema_dict in data.values():
            for parent_dict in schema_dict.values():
                for child, child_item in parent_dict.items():
                    #print(child_item)
                    geometries = [pair[1] for pair in child_item['geodata']]
                    out[child.lower()] = self.__get_boundary(geometries)
    return out
import csv import datetime import numpy as np from os import listdir import matplotlib.pyplot as plt import matplotlib.dates as mdates from matplotlib.ticker import ScalarFormatter from matplotlib.font_manager import FontProperties from _utility.get_package_dir import get_package_dir OUTPUT_CSV_DIR = (get_package_dir() / 'covid_crawlers' / 'oceania' / 'au_data' / 'output') GRAPH_OUTPUT_DIR = (get_package_dir() / 'output_graphs' / 'output') def read_csv(region_schema, datatype, agerange_filter=None, region_filter=None, value_filter=None, state_filter=None): if state_filter and not isinstance(state_filter, (list, tuple)): state_filter = (state_filter, ) r = {} # Get the newest, based on binary sort order # (year->month->day->revision id) fnam = list( sorted([i for i in listdir(OUTPUT_CSV_DIR) if i.endswith('.tsv')],
import csv

from _utility.get_package_dir import get_package_dir

PATH = (get_package_dir() / 'covid_crawlers' / 'world' /
        'world_jhu_data' / 'state_and_county_fips.csv')


def get_county_to_code_map():
    """
    Read the JHU state/county FIPS CSV and return a dict mapping
    (state, lowercase county name) -> FIPS code. For each county, extra
    keys with common suffixes (' county', ' borough', ' parish', ...)
    removed are also added so fuzzy lookups work.
    """
    r = {}
    with open(PATH, 'r', encoding='utf-8') as f:
        for d in csv.DictReader(f):
            # fips,name,state
            name = d['name'].lower()
            # FIX: the previous check `assert not d['name'] in r` was
            # vacuous - keys in r are (state, name) tuples, so a bare
            # string could never be found; compare against the real key.
            assert (d['state'], name) not in r, d['name']
            r[d['state'], name] = d['fips']
            # NOTE: str.replace removes the substring anywhere in the
            # name, not just as a suffix - preserved from the original.
            for suffix in (' county', ' borough', ' census area',
                           ' municipality', ' parish', ' city',
                           ' city and borough'):
                r[d['state'], name.replace(suffix, '')] = d['fips']
    return r


if __name__ == '__main__':
    get_county_to_code_map()
from pyquery import PyQuery as pq from re import compile, IGNORECASE from covid_crawlers.oceania.au_data.StateNewsBase import StateNewsBase, singledaystat, bothlistingandstat from covid_db.datatypes.enums import Schemas, DataTypes from covid_db.datatypes.DataPoint import DataPoint from _utility.word_to_number import word_to_number from _utility.get_package_dir import get_package_dir TAS_BY_LGA = get_package_dir( ) / 'covid_crawlers' / 'oceania' / 'au_data' / 'tas' / 'tas_by_lga.json' TAS_BY_THS = get_package_dir( ) / 'covid_crawlers' / 'oceania' / 'au_data' / 'tas' / 'tas_by_ths.tsv' class TasNews(StateNewsBase): STATE_NAME = 'tas' SOURCE_ID = 'au_tas_press_releases' SOURCE_URL = 'https://coronavirus.tas.gov.au' SOURCE_DESCRIPTION = '' LISTING_URL = ( 'https://www.dhhs.tas.gov.au/news/2020', 'https://www.coronavirus.tas.gov.au/', 'https://coronavirus.tas.gov.au/media-releases', 'https://coronavirus.tas.gov.au/media-releases?result_85500_result_page=2', 'https://coronavirus.tas.gov.au/media-releases?result_85500_result_page=3', 'https://coronavirus.tas.gov.au/media-releases?result_85500_result_page=4', ) LISTING_HREF_SELECTOR = 'table.dhhs a, ' \
def _get_data_items():
    """
    Parse the Correlates-of-War style country reference CSV
    (misc_data_scripts/other_data/iso_3166_1/cow.csv, ';'-separated,
    '#'-prefixed comment lines) and return a list of COWItem records
    combining ISO 3166, UN, UNGEGN, BGN, PCGN, FAO and EKI naming data.
    """
    r = []
    csv_path = (get_package_dir() / 'misc_data_scripts' / 'other_data' /
                'iso_3166_1' / 'cow.csv')
    # FIX: use a context manager so the CSV file handle is closed
    # (it was previously opened and never closed)
    with open(csv_path, 'r', encoding='utf-8') as f:
        # Skip '#'-prefixed comment lines before handing rows to DictReader
        for item in csv.DictReader(filter(lambda row: row[0] != '#', f),
                                   delimiter=';'):
            # Normalise all values up-front
            for k in item:
                item[k] = item[k].strip()

            r.append(
                COWItem(**dict(
                    iso3166=ISO3166(a2=item['ISO3166A2'],
                                    a3=item['ISO3166A3'],
                                    n3=item['ISO3166N3']),
                    fips104=item['FIPS104'],
                    has_capital=item['HasCapital'],
                    continent=item['continent'],
                    subcontinent=item['subcontinent'],
                    language=item['language'],
                    population=int(item['population']),
                    year=item['year'],
                    conventional_abbreviation=item['conabbr'],
                    international_dialing_code=item['ITU'],
                    international_vehicle_code=item['IVC'],
                    # NOTE(review): key 'lang_total' looks like a typo for
                    # 'land_total', but is kept as-is since downstream
                    # consumers may rely on the existing key name.
                    area=dict(land=item['land'],
                              water=item['water'],
                              lang_total=item['land_total']),
                    coords=Coord(float(item['latitude']),
                                 float(item['longitude'])),
                    max_coords=Coord(float(item['maxlatitude']),
                                     float(item['maxlongitude'])),
                    min_coords=Coord(float(item['minlatitude']),
                                     float(item['minlongitude'])),
                    iso=dict(name=dict(en=item['ISOen_name'],
                                       en_romanized=item['ISOen_ro_name'],
                                       fr=item['ISOfr_name'],
                                       es=item['ISOes_name']),
                             proper=dict(en=item['ISOen_proper'],
                                         en_romanized=item['ISOen_ro_proper'],
                                         fr=item['ISOfr_proper']),
                             region=dict(region=item['ISOregion'],
                                         subregion=item['ISOsubregion'])),
                    # UN capital coords may be blank for some countries
                    un=dict(en=item['UNen_capital'],
                            fr=item['UNfr_capital'],
                            es=item['UNes_capital'],
                            ru=item['UNru_capital'],
                            capital_coords=Coord(float(item['UNc_latitude']),
                                                 float(item['UNc_longitude']))
                                           if item['UNc_latitude'].strip()
                                           else None),
                    ungegn=dict(
                        name=dict(en=item['UNGEGNen_name'],
                                  fr=item['UNGEGNfr_name'],
                                  es=item['UNGEGNes_name'],
                                  ru=item['UNGEGNru_name'],
                                  native_romanized=item['UNGEGNlc_ro_name']),
                        longname=dict(
                            en=item['UNGEGNen_longname'],
                            fr=item['UNGEGNfr_longname'],
                            es=item['UNGEGNes_longname'],
                            ru=item['UNGEGNru_longname'],
                            native_romanized=item['UNGEGNlc_ro_longname']),
                        capital_romanized=item['UNGEGNlc_capital'],
                    ),
                    bgn=dict(name=dict(en=item['BGN_name'],
                                       native_romanized=item['BGNlc_name']),
                             proper=dict(en=item['BGN_proper']),
                             longname=dict(en=item['BGN_longname'],
                                           en_romanized=item['BGNlc_longname']),
                             capital=item['BGN_capital'],
                             capital_coords=Coord(float(item['BGNc_latitude']),
                                                  float(item['BGNc_longitude'])),
                             demonym=item['BGN_demonym'],
                             demonym_adjective=item['BGN_demomyn_adj']),
                    pcgn=dict(name=item['PCGN_name'],
                              proper=item['PCGN_proper'],
                              longname=item['PCGN_longname']),
                    fao=dict(name=item['FAOit_name'],
                             proper=item['FAOit_proper'],
                             longname=item['FAOit_longname']),
                    eki=dict(name=item['EKI_name'],
                             longname=item['EKI_longname'],
                             capital=item['EKI_capital']),
                    url=dict(url_gov=item['url_gov'],
                             url_stats=item['url_stats'],
                             url_gis=item['url_gis'],
                             url_post=item['url_post']))))
    return r
except KeyError: print("COUNTRY-LEVEL DATA NOT FOUND:", item) try: out[item.region_schema, item.region_parent, item.region_child] += get_pop(item.geometry) ok_indicator.setdefault( (item.region_schema, item.region_parent, item.region_child), True) except ValueError: import traceback traceback.print_exc() ok_indicator[item.region_schema, item.region_parent, item.region_child] = False print() with open(get_package_dir() / 'world_geodata' / 'geojson_pop.tsv', 'w', encoding='utf-8') as f: f.write('region_schema\tregion_parent\tregion_child\tpop_2020\tno_exc\n') for (region_schema, region_parent, region_child), pop in sorted(out.items()): f.write(f'{region_schema}\t' f'{region_parent}\t' f'{region_child}\t' f'{round(pop)}\t' f'{ok_indicator[region_schema, region_parent, region_child]}' f'\n')
import csv from _utility.get_package_dir import get_package_dir HR_MAP_PATH = get_package_dir( ) / 'covid_crawlers' / 'americas' / 'ca_data' / 'hr_map.csv' _province_map = { 'Alberta': 'CA-AB', 'BC': 'CA-BC', 'Manitoba': 'CA-MB', 'New Brunswick': 'CA-NB', 'NL': 'CA-NL', 'Nova Scotia': 'CA-NS', 'Nunavut': 'CA-NU', 'NWT': 'CA-NT', 'Ontario': 'CA-ON', 'PEI': 'CA-PE', 'Quebec': 'CA-QC', 'Saskatchewan': 'CA-SK', 'Yukon': 'CA-YT', 'Repatriated': 'other', } def _get_hr_map(): uid_to_hr = {} hr_to_uid = {} with open(HR_MAP_PATH, 'r', encoding='utf-8') as f: for item in csv.DictReader(f): province = _province_map[item['Province']]
import json

from _utility.get_package_dir import get_package_dir

# Module-level load: parse schema_types.json once at import time so other
# modules can simply do `from ... import schema_types`.
with open(get_package_dir() / 'covid_db' / 'datatypes' / 'schema_types.json',
          'r', encoding='utf-8') as f:
    schema_types = json.loads(f.read())
TimeSeriesKey, DataTypes.PERCENT, DataTypes.INTEGER, DataTypes.FLOATING_POINT from misc_data_scripts.other_data import DataPoint from misc_data_scripts.other_data import DateType from covid_db.datatypes.enums import Schemas, DataTypes from misc_data_scripts.other_data.abs_data.lga_to_state_and_name import \ get_lga_to_state_and_name_dict from _utility.normalize_locality_name import \ normalize_locality_name from _utility.get_package_dir import get_package_dir lga_dict = get_lga_to_state_and_name_dict() BASE_PATH = get_package_dir() / 'misc_data_scripts' / 'other_data' / 'abs_data' / 'stats' BASE_EXCEL_PATH = get_package_dir() / 'misc_data_scripts' / 'other_data' / 'abs_data' / 'excel_stats' class ExcelABSStats(UnderlayDataBase): IGNORE_KEYS = { 'Age of Persons Born Overseas', 'Business Entries', 'Building Approvals', 'Selected Government Pensions and Allowances', 'Children Attending a Preschool Program', 'Children Attending a Preschool Program (4 & 5 year olds)', 'Children Enrolled in a Preschool Program (4 & 5 year olds)', 'Age of Persons Born Overseas', 'Religious Affiliation', 'Religious Affiliation Persons Born Overseas',
import json from os import listdir from _utility.get_package_dir import get_global_subnational_covid_data_dir, get_package_dir from world_geodata.get_population_map import get_population_map GEOJSON_DIR = get_package_dir() / 'world_geodata' / 'output' def output_geojson(): for k, (poly_geojson, point_geojson) in _OutputGeoJSON().get_geojson_data().items(): path_poly = get_global_subnational_covid_data_dir( ) / 'geojson' / 'poly' / f'{k}.json' path_poly.parent.mkdir(parents=True, exist_ok=True) with open(path_poly, 'w', encoding='utf-8') as f: f.write(json.dumps(poly_geojson, indent=2, ensure_ascii=False)) path_point = get_global_subnational_covid_data_dir( ) / 'geojson' / 'point' / f'{k}.json' path_point.parent.mkdir(parents=True, exist_ok=True) with open(path_point, 'w', encoding='utf-8') as f: f.write(json.dumps(point_geojson, indent=2, ensure_ascii=False)) class _OutputGeoJSON: def __init__(self): self._population_map = get_population_map() def get_geojson_data(self): r = {}
from re import compile from pyquery import PyQuery as pq from covid_crawlers.oceania.au_data.StateNewsBase import StateNewsBase, bothlistingandstat, singledaystat from covid_db.datatypes.enums import Schemas, DataTypes from covid_db.datatypes.DataPoint import DataPoint from _utility.word_to_number import word_to_number from _utility.get_package_dir import get_package_dir OUTPUT_DIR = get_package_dir( ) / 'covid_crawlers' / 'oceania' / 'au_data' / 'sa' / 'output' class SANews(StateNewsBase): STATE_NAME = 'sa' SOURCE_ID = 'au_sa_press_releases' SOURCE_URL = 'https://www.covid-19.sa.gov.au' SOURCE_DESCRIPTION = '' LISTING_URL = 'https://www.sahealth.sa.gov.au/wps/wcm/connect/Public+Content/SA+Health+Internet/About+us/News+and+media/all+media+releases/?mr-sort=date-desc&mr-pg=1' LISTING_HREF_SELECTOR = '.news a, .article-list-item a.arrow-link' STATS_BY_REGION_URL = 'https://www.sahealth.sa.gov.au/wps/wcm/connect/public+content/sa+health+internet/conditions/infectious+diseases/covid+2019/latest+updates/covid-19+cases+in+south+australia' def _get_date(self, href, html): #print("HREF:", href) try: # New format of updated SA website as of 23/4/2020 date = pq(html)('.main-content p')[0] if '2020' in pq(date).text() or '2021' in pq(date).text(): return self._extract_date_using_format(
from data_export.output_csv_data import get_csv_data_for_source_id # MONKEY PATCH: Reduce cherrpy json file output _json._encode = json.JSONEncoder(separators=(',', ':')).iterencode env = Environment(loader=FileSystemLoader('./templates')) from covid_db.datatypes.enums import Schemas, DataTypes from covid_db.SQLiteDataRevision import SQLiteDataRevision from covid_db.SQLiteDataRevisions import SQLiteDataRevisions from _utility.get_package_dir import get_package_dir from covid_db.datatypes import date_fns from covid_db.output_compressor.output_revision_datapoints_to_zip import output_revision_datapoints_to_zip from _utility.normalize_locality_name import normalize_locality_name OUTPUT_DIR = get_package_dir( ) / 'covid_crawlers' / 'oceania' / 'au_data' / 'output' OUTPUT_GRAPHS_DIR = get_package_dir( ) / 'world_subnational_covid_crawler' / 'output_graphs' / 'output' UPDATE_SCRIPT_PATH = get_package_dir() / 'output_data.py' UPDATE_CASE_LOCS_PATH = get_package_dir( ) / 'case_locations' / 'update_spreadsheet.py' mimetypes.types_map['.tsv'] = 'text/tab-separated-values' class App(object): def __init__(self): self.revisions = SQLiteDataRevisions() _thread.start_new_thread(self.loop, ()) def loop(self): powerbi_run_1st = False
import json from polylabel import polylabel from os.path import basename from abc import ABC, abstractmethod from covid_db.datatypes.schema_types import schema_types from _utility.get_package_dir import get_package_dir OUTPUT_DIR = get_package_dir() / 'world_geodata' / 'output' DATA_DIR = get_package_dir() / 'world_geodata' / 'data' class ProcessGeoJSONBase(ABC): def __init__(self, schema_name): self.schema_name = schema_name def output_json(self, in_paths, out_dir, pretty_print=False): r = {} for in_path in in_paths: fnam = basename(in_path) with open(in_path, 'r', encoding='utf-8') as f: geojson = json.loads( f.read(), # Limit the float precision to 3 digits # after the decimal place to save space parse_float=lambda x: round(float(x), 3)) # Convert MultiPolygon's to single Polygon's features = [] for feature in geojson['features']: if feature['geometry'] is None:
import csv
from glob import glob

from _utility.get_package_dir import get_package_dir

# Directory containing the ABS LGA (Local Government Area) CSV files
BASE_PATH = (get_package_dir() / 'misc_data_scripts' / 'other_data' /
             'abs_data' / 'lga')


def get_lga_to_state_and_name_dict():
    """
    Read every ABS LGA CSV under BASE_PATH and return a dict mapping the
    2016 LGA code (int) to a (state ISO 3166-2 code or None, LGA name)
    tuple. Any bracketed qualifier in the LGA name is stripped.
    """
    r = {}
    state_dict = {
        'New South Wales': 'AU-NSW',
        # NOTE(review): ACT mapped to 'AU-NSW' - this looks like a
        # copy-paste slip ('AU-ACT' would be expected); confirm whether
        # ACT is deliberately folded into NSW before changing.
        'Australian Capital Territory': 'AU-NSW',
        'Northern Territory': 'AU-NT',
        'Other Territories': None,
        'Queensland': 'AU-QLD',
        'South Australia': 'AU-SA',
        'Tasmania': 'AU-TAS',
        'Victoria': 'AU-VIC',
        'Western Australia': 'AU-WA'
    }
    for path in glob(str(BASE_PATH / '*.csv')):
        with open(path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Drop any bracketed qualifier, e.g. "Name (Qld)" -> "Name"
                r[int(row['LGA_CODE_2016'])] = (
                    state_dict[row['STATE_NAME_2016']],
                    row['LGA_NAME_2016'].split('(')[0].strip())
    return r