def __init__(self, region_mappings=None, mode=MODE_DEV):
    """
    Build a factory that produces "strict datapoint" collections.

    The collections behave like lists, but guarantee that there is at
    most one datapoint per region per day.

    * In "dev" mode (``MODE_DEV``) the factory also records guessed
      mappings from the original source labels (e.g. "Victoria") to the
      ISO 3166-2 or other unique code (e.g. "au-vic").
    * In "strict" mode a mapping must already exist; an exception is
      raised when one is missing.

    :param region_mappings: optional initial mapping dict; a defensive
        copy is stored so the caller's dict is never mutated
    :param mode: ``MODE_DEV`` or the strict-mode constant
    """
    self.__ltrc = LabelsToRegionChild()
    self.__mode = mode
    # Copy so later mapping registrations don't mutate the caller's dict.
    self.__region_mappings = dict(region_mappings or {})
def __init__(self):
    # Only raw_data4.json is still being updated upstream, so the
    # other files aren't re-downloaded every day.
    # TODO: also support the Tokyo data!
    csv_url = URL(
        'https://dl.dropboxusercontent.com/s/6mztoeb6xf78g5w/COVID-19.csv',
        static_file=False
    )
    URLBase.__init__(
        self,
        output_dir=get_overseas_dir() / 'jp_city_data' / 'data',
        urls_dict={'jg-jpn.csv': csv_url}
    )
    self.update()

    # HACK (??): this ward is mislabelled under Tokyo (jp-13) in the
    # source; remap it to its Niigata (jp-15) city code.
    konan_ku_fix = {
        (Schemas.JP_CITY, 'jp-13', 'niigata-shi konan-ku'):
            (Schemas.JP_CITY, 'jp-15', '15104'),
    }
    self.sdpf = StrictDataPointsFactory(region_mappings=konan_ku_fix,
                                        mode=MODE_DEV)
    self._labels_to_region_child = LabelsToRegionChild()
def __init__(self):
    # Only raw_data4.json is still being updated upstream, so the
    # other files aren't re-downloaded every day.
    # TODO: also support the Tokyo data!
    # NOTE(review): urls_dict is empty here, so update() presumably
    # only refreshes previously downloaded data — confirm upstream.
    out_dir = get_overseas_dir() / 'jp_city_data' / 'data'
    URLBase.__init__(self, output_dir=out_dir, urls_dict={})
    self.update()
    self._labels_to_region_child = LabelsToRegionChild()
def __init__(self):
    """Set up the covid19cubadata GitHub source with strict datapoints."""
    GithubRepo.__init__(
        self,
        output_dir=get_overseas_dir() / 'cu' / 'covid19cubadata.github.io',
        github_url='https://github.com/covid19cubadata/'
                   'covid19cubadata.github.io/tree/master/data'
    )
    # "trinidad" appears both as an admin-1 label and as a municipality;
    # merge both into their cu-07 entries.
    merge_mappings = {
        ('admin_1', 'cu', 'trinidad'):
            ('MERGE', 'admin_1', 'cu', 'cu-07'),
        ('cu_municipality', 'cu', 'trinidad'):
            ('MERGE', 'cu_municipality', 'cu-07', 'trinidad'),
    }
    self.sdpf = StrictDataPointsFactory(region_mappings=merge_mappings,
                                        mode=MODE_STRICT)
    self.ltrc = LabelsToRegionChild()
    self.update()
class StrictDataPointsFactory:
    """
    Produces "strict datapoint" collections: list-like containers that
    make sure there aren't duplicate datapoints for a given region on a
    given day.

    Behaviour depends on ``mode``:

    * ``MODE_DEV`` — guessed mappings from original source labels
      (e.g. "Victoria") to ISO 3166-2 or other unique codes
      (e.g. "au-vic") are also registered.
    * strict mode — a mapping must already exist; an exception is
      raised when one is missing.
    """

    def __init__(self, region_mappings=None, mode=MODE_DEV):
        """
        :param region_mappings: optional initial mapping dict (copied
            defensively, so the caller's dict is never mutated)
        :param mode: ``MODE_DEV`` or the strict-mode constant
        """
        self.__ltrc = LabelsToRegionChild()
        self.__mode = mode
        # register_mapping() mutates this dict, hence the copy.
        self.__region_mappings = dict(region_mappings or {})

    def __call__(self, *args, **kwargs):
        """Create a new strict datapoints collection.

        NOTE: any extra arguments are accepted but ignored, matching
        the original call signature.
        """
        return _StrictDataPoints(self, self.__mode,
                                 self.__region_mappings, self.__ltrc)

    def register_mapping(self, from_schema, from_parent, from_child,
                         to_schema, to_parent, to_child):
        """
        Record a mapping from a source label to a unique region code.

        When the target region can't be found in the GeoJSON data, the
        mapping is stored as ``None`` so the failed lookup is still
        remembered.
        """
        source = (from_schema, from_parent, from_child)
        target = (to_schema, to_parent, to_child)
        in_geojson = self.__ltrc.region_child_in_geojson(
            to_schema, to_parent, to_child)
        self.__region_mappings[source] = target if in_geojson else None

    def get_mappings(self):
        """Return a shallow copy of the registered mappings."""
        return dict(self.__region_mappings)

    def print_mappings(self):
        """Pretty-print the registered mappings (debugging aid)."""
        pprint(self.__region_mappings, width=160)
# https://github.com/ishaberry/Covid19Canada import csv from covid_db.datatypes.enums import Schemas, DataTypes from covid_db.datatypes.StrictDataPointsFactory import StrictDataPointsFactory, MODE_STRICT from covid_crawlers._base_classes.GithubRepo import GithubRepo from _utility.get_package_dir import get_overseas_dir from covid_crawlers.americas.ca_data.hr_convert import health_region_to_uid, province_to_iso_3166_2 from world_geodata.LabelsToRegionChild import LabelsToRegionChild _ltrc = LabelsToRegionChild() class CACovid19Canada(GithubRepo): SOURCE_URL = 'https://github.com/ishaberry/Covid19Canada' SOURCE_DESCRIPTION = '' SOURCE_ID = 'ca_covid_19_canada' def __init__(self): GithubRepo.__init__( self, output_dir=get_overseas_dir() / 'ca' / 'Covid19Canada', github_url='https://github.com/ishaberry/Covid19Canada') self.sdpf = StrictDataPointsFactory(mode=MODE_STRICT) self.update() def get_datapoints(self): r = [] r.extend(self._get_cases_by_health_region()) r.extend(self._get_mortality_by_health_region())
def _get_mappings_to_iso_3166():
    """
    Load schema_mappings.csv (tab-separated) into a dict keyed by
    (original schema, original parent, original child) tuples, mapping
    each to the corresponding (schema, parent, child) target tuple.
    """
    r = {}
    with open(get_package_dir() / 'covid_db' / 'datatypes' / 'schema_mappings.csv',
              'r', encoding='utf-8') as f:
        # NOTE: despite the .csv extension, the file is tab-delimited.
        for item in csv.DictReader(f, delimiter='\t'):
            r[Schemas(item['original_schema'].strip()),
              item['original_parent'].strip(),
              item['original_child'].strip()] = (
                Schemas(item['schema'].strip()),
                item['parent'].strip(),
                item['child'].strip()
            )
    return r


# Loaded once at import time and shared by DataPoint() below.
_mappings_to_iso_3166 = _get_mappings_to_iso_3166()
_labels_to_region_child = LabelsToRegionChild()


def DataPoint(region_schema=Schemas.ADMIN_1,
              region_parent=None,
              region_child=None,
              date_updated=None,
              datatype=None,
              agerange=None,
              value=None,
              source_url=None,
              text_match=None,
              source_id=None):
    # NOTE(review): this definition is cut off in this view — only the
    # opening of its docstring is visible below.
    """
def __init__(self, base_path, source_url):
    """
    Wire up a PowerBI data reader.

    :param base_path: directory holding the downloaded PowerBI dumps
    :param source_url: source URL kept for later reference
    """
    self.source_url = source_url
    self.base_path = base_path
    # Initialise the base reader, then create the label resolver.
    PowerBIDataReader.__init__(self, base_path, get_globals())
    self.ltrc = LabelsToRegionChild()
class _WestAfricaPowerBI(PowerBIDataReader):
    """Reader for the West Africa PowerBI dashboard dumps."""

    def __init__(self, base_path, source_url):
        # :param base_path: directory holding the downloaded PowerBI dumps
        # :param source_url: source URL kept for later reference
        self.base_path = base_path
        self.source_url = source_url
        PowerBIDataReader.__init__(self, base_path, get_globals())
        self.ltrc = LabelsToRegionChild()

    def get_powerbi_data(self):
        """
        Walk all downloaded date/revision dumps and return the
        datapoints extracted from each day's latest revision.
        """
        r = []
        for updated_date, rev_id, response_dict in self._iter_all_dates():
            subdir = f'{self.base_path}/{updated_date}-{rev_id}'
            print("PROCESSING:", subdir)

            # Only use most revision if there isn't
            # a newer revision ID for a given day!
            next_id = rev_id + 1
            next_subdir = f'{self.base_path}/{updated_date}-{next_id}'
            if exists(next_subdir):
                print(f"West Africa PowerBI ignoring {subdir}")
                continue

            r.extend(self._get_regions_data(updated_date, response_dict))
        return r

    def _to_int(self, i):
        # PowerBI long integers arrive as strings with a trailing "L";
        # pass non-strings through unchanged.
        if not isinstance(i, str):
            return i
        return int(i.rstrip('L'))

    def _get_updated_date(self, updated_date, response_dict):
        # Pull the update timestamp out of the deeply nested PowerBI
        # response and format it as YYYY_MM_DD.
        ts = response_dict['updated_date'][1]
        ts = ts['result']['data']['dsr']['DS'][0]['PH'][0]['DM0'][0]['M0']
        if ts < 1000:
            # FIXME!!
            # (extraction artifact removed here: a long "====" separator
            # line that is not valid Python)
            return None
        else:
            # Timestamp is in milliseconds since the epoch.
            return datetime.fromtimestamp(ts / 1000).strftime('%Y_%m_%d')

    def _get_regions_data(self, updated_date, response_dict):
        """
        Extract per-region datapoints from a single day's response.

        Column positions vary between dumps, so the datatype mapping is
        resolved against the response's "Select" descriptor each time.
        """
        r = []
        data = response_dict['country_data'][1]
        previous_value = None
        SOURCE_URL = 'https://app.powerbi.com/view?r=eyJrIjoiZTRkZDhmMDctM2NmZi00NjRkLTgzYzMtYzI1MDMzNWI3NTRhIiwidCI6IjBmOWUzNWRiLTU0NGYtNGY2MC1iZGNjLTVlYTQxNmU2ZGM3MCIsImMiOjh9'

        def get_index(name):
            # Find the column index whose descriptor name contains
            # `name` (case-insensitive); None if absent from this dump.
            for x, i_dict in enumerate(
                    data['result']['data']['descriptor']['Select']):
                i_name = i_dict['Name']
                if name.lower() in i_name.lower():
                    return x
            return None

        # French/accented column names (déc\u00e8s etc.) mapped to the
        # internal datatypes; commented-out entries are known columns
        # that aren't imported.
        mappings = {
            #'admin0Name',
            #'admin1Name',
            'cas_confirm': DataTypes.TOTAL,
            'd\u00e9c\u00e8s': DataTypes.STATUS_DEATHS,
            'en_traitement': DataTypes.STATUS_HOSPITALIZED,
            'Gueris': DataTypes.STATUS_RECOVERED,
            'Femmes': DataTypes.TOTAL_FEMALE,
            'Hommes': DataTypes.TOTAL_MALE,
            #'Contacts_suivis': ,
            'Tests_effectues': DataTypes.TESTS_TOTAL,
            'cas_confirm\u00e9s': DataTypes.TOTAL,
        }
        # Keep only columns present in this dump, paired with their index.
        mappings = {
            k: (v, get_index(k))
            for k, v in mappings.items()
            if get_index(k) is not None
        }

        #print(data['result']['data']['dsr']['DS'][0])
        region_dicts = data['result']['data']['dsr']['DS'][0]['PH'][1]['DM1']

        for region_dict in region_dicts:
            #print(region_dict, previous_value)
            # PowerBI rows are delta-encoded; decode against the
            # previous row's value.
            value, previous_value = self.process_powerbi_value(
                region_dict, previous_value, data)

            # Integer values in the first two columns are indexes into
            # the response's value dictionaries (country/region names).
            if isinstance(value[0], int):
                value[0] = data['result']['data']['dsr']['DS'][0][
                    'ValueDicts']['D0'][value[0]]
            if isinstance(value[1], int):
                value[1] = data['result']['data']['dsr']['DS'][0][
                    'ValueDicts']['D1'][value[1]]

            # Pad short rows out to the full 8 columns.
            while len(value) != 8:
                value.append(None)

            admin_0, admin_1 = value[:2]
            # Normalise a few country names to ISO 3166-1 alpha-2 codes.
            admin_0 = {
                'democratic republic of congo': 'cd',
                'republic of congo': 'cg',
                'guinea bissau': 'gw',
            }.get(admin_0.lower(), admin_0)
            #print(admin_0)

            for _, (datatype, index) in mappings.items():
                cases = value[index]
                if cases is not None:
                    r.append(
                        DataPoint(region_schema=Schemas.OCHA_ADMIN_1,
                                  region_parent=self.ltrc.get_by_label(
                                      Schemas.ADMIN_0, '', admin_0, admin_0),
                                  region_child=admin_1,
                                  datatype=datatype,
                                  value=int(cases),
                                  date_updated=updated_date,
                                  source_url=SOURCE_URL))
        return r
class JPCityData(URLBase):
    """City-level Japan COVID-19 data from the jag-japan.com map CSV."""

    SOURCE_URL = 'https://jag-japan.com/covid19map-readme/'
    SOURCE_DESCRIPTION = ''
    SOURCE_ID = 'jp_jag_japan'

    def __init__(self):
        # Only raw_data4.json is currently being updated,
        # so won't download the others every day
        URLBase.__init__(self,
            # TODO: SUPPORT TOKYO DATA AS WELL from !!!
            output_dir=get_overseas_dir() / 'jp_city_data' / 'data',
            urls_dict={
                'jg-jpn.csv': URL('https://dl.dropboxusercontent.com/s/6mztoeb6xf78g5w/COVID-19.csv', static_file=False),
            }
        )
        self.update()
        self.sdpf = StrictDataPointsFactory(
            region_mappings={
                (Schemas.JP_CITY, 'jp-13', 'niigata-shi konan-ku'): (Schemas.JP_CITY, 'jp-15', '15104')  # HACK (??)
            },
            mode=MODE_DEV
        )
        self._labels_to_region_child = LabelsToRegionChild()

    def get_datapoints(self):
        """Return all datapoints derived from the downloaded CSV."""
        r = []
        r.extend(self._get_from_json())
        return r

    def _get_from_json(self):
        """
        Parse the per-case CSV (one row ≈ one confirmed case) and
        aggregate into cumulative datapoints broken down by date, age,
        gender, prefecture and city.
        """
        # NOTE(review): r is a _StrictDataPoints from the factory;
        # its append() presumably accepts DataPoint keyword arguments
        # directly (unlike list.append) — confirm in the factory module.
        r = self.sdpf()

        # Per-day counters, each keyed by progressively more dimensions.
        by_date = Counter()
        by_age = Counter()
        by_prefecture = Counter()
        by_city = Counter()
        by_gender = Counter()
        by_gender_age = Counter()
        by_prefecture_gender = Counter()
        by_city_gender = Counter()
        by_prefecture_age = Counter()
        by_city_age_gender = Counter()
        by_prefecture_age_gender = Counter()

        f = self.get_file('jg-jpn.csv',
                          include_revision=True,
                          encoding='utf-8-sig')
        num_city = 0
        num_kyoto = 0

        for item in csv.DictReader(f):
            # Sample row (shown JSON-ified); keys are the Japanese CSV
            # headers, e.g. 確定日 = confirmation date, 年代 = age band,
            # 性別 = gender, 居住都道府県 = prefecture of residence,
            # 居住市区町村 = municipality of residence, 人数 = case count:
            # [
            #     {
            #         "通し": "1",
            #         "厚労省NO": "1",
            #         "無症状病原体保有者": "",
            #         "国内": "A-1",
            #         "チャーター便": "",
            #         "年代": "30",
            #         "性別": "男性",
            #         "確定日": "1/15/2020",
            #         "発症日": "1/3/2020",
            #         "受診都道府県": "神奈川県",
            #         "居住都道府県": "神奈川県",
            #         "居住管内": "",
            #         "居住市区町村": "",
            #         "キー": "神奈川県",
            #         "発表": "神奈川県",
            #         "都道府県内症例番号": "1",
            #         "市町村内症例番号": "",
            #         "ステータス": "退院",
            #         "備考": "",
            #         "ソース": "https://www.mhlw.go.jp/stf/newpage_08906.html",
            #         "ソース2": "https://www.pref.kanagawa.jp/docs/ga4/bukanshi/occurrence.html",
            #         "ソース3": "",
            #         "人数": "1",
            #         "累計": "1",
            #         "前日比": "1",
            #         "発症数": "0",
            #         "死者合計": "",
            #         "退院数累計": "1",
            #         "退院数": "1",
            #         "PCR検査実施人数": "",
            #         "PCR検査前日比": "",
            #         "職業_正誤確認用": "",
            #         "勤務先_正誤確認用": "",
            #         "Hospital Pref": "Kanagawa",
            #         "Residential Pref": "Kanagawa",
            #         "Release": "Kanagawa Prefecture",
            #         "Gender": "Male",
            #         "X": "139.642347",
            #         "Y": "35.447504",
            #         "確定日YYYYMMDD": "2020/1/15",
            #         "受診都道府県コード": "14",
            #         "居住都道府県コード": "14",
            #         "更新日時": "5/17/2020 13:42",
            #         "Field2": "",
            #         "Field4": "",
            #         "Field5": "",
            #         "Field6": "",
            #         "Field7": "",
            #         "Field8": "",
            #         "Field9": "",
            #         "Field10": ""
            #     },

            for k in item:
                item[k] = item[k].strip()

            # One row can represent multiple cases (人数 = case count).
            for xxx in range(int(item.get('人数', '').strip() or 1)):
                #print(item)
                #item = item['properties']

                if not item:
                    print("NOT ITEM:", item)
                    continue
                elif not item['確定日']:
                    # No confirmation date: row must be entirely blank.
                    print("NOT 確定日", item)
                    assert not ''.join(item.values()).strip(), item
                    continue  # WARNING!

                # Normalise the age band (年代) to "lo-hi" buckets.
                if item.get('年代') == '0-10' or item.get('年代') == '10歳未満' or item.get('年代') == '1歳未満':
                    agerange = '0-9'
                elif item.get('年代') in ('不明', '', None):
                    agerange = 'Unknown'
                elif item.get('年代') in ('90以上',):
                    agerange = '90+'
                elif item.get('年代') in ('100歳以上',):
                    agerange = '100+'
                else:
                    # e.g. "30代" (thirties) -> "30-39"
                    agerange = (
                        str(int(item['年代'].strip('代'))) + '-' +
                        str(int(item['年代'].strip('代')) + 9)
                    )

                # Map the gender field (性別), including several known
                # typos/variant spellings seen in the source data.
                gender = {
                    '男性': DataTypes.TOTAL_MALE,
                    '男性\xa0': DataTypes.TOTAL_MALE,
                    '女性\xa0': DataTypes.TOTAL_FEMALE,
                    '女性': DataTypes.TOTAL_FEMALE,
                    '⼥性': DataTypes.TOTAL_FEMALE,
                    '女|生': DataTypes.TOTAL_FEMALE,
                    '不明': None,
                    '惰性': DataTypes.TOTAL_MALE,  # Pretty sure this is a typo
                    '未満 女性': DataTypes.TOTAL_FEMALE,
                    '女児': DataTypes.TOTAL_FEMALE,
                    '男児': DataTypes.TOTAL_MALE,
                    '': None,
                    '非公表': None,
                    None: None,
                }[item['性別']]

                date_diagnosed = self.convert_date(item['確定日'],
                                                   formats=('%m/%d/%Y',))

                # May as well use English prefecture names to and allow the system to
                # auto-translate to ISO-3166-2 later
                region_parent = item['居住都道府県']
                if not region_parent:
                    assert item['居住都道府県コード'] == '#N/A', item

                if (
                    (
                        # region_parent == '奈良県' or
                        # region_parent == '和歌山県' or
                        # region_parent == '大阪府'
                        region_parent.startswith('京都') or
                        region_parent in ('福岡県', '沖縄県', '愛媛県', '神奈川県', '兵庫県', '愛知県', '高知県', '山梨県', '栃木県', '三重県', '長野県', '熊本県', '青森県', '茨城県', '静岡県', '福島県', '徳島県', '群馬県', '秋田県',)
                    ) and item['備考'] and not item['居住市区町村']
                ):
                    # Municipality missing but the remarks field (備考)
                    # is set: resolve the city via the bikou_map lookup.
                    print(region_parent, item)
                    region_child = bikou_map[item['備考']]
                else:
                    if item['備考'] and not item['居住市区町村']:
                        print("BIKOU!!!", region_parent, item['備考'], item)

                    # e.g. 中富良野町 will be different to the English 'Release' field
                    region_child = (
                        item.get('居住市区町村') or
                        region_parent.replace('市', '県') or
                        'unknown'  # Japanese only
                    )

                region_parent = region_parent.replace('市', '県')  # HACK!

                # Foreign countries of residence get bucketed as 'other'.
                if region_parent in ('中華人民共和国', 'アイルランド', 'スペイン', 'ジンバブエ共和国', '南アフリカ共和国', 'フィリピン', 'アメリカ', 'カナダ', 'イギリス', 'フランス', 'インドネシア', 'アフガニスタン',):
                    region_parent = 'other'
                elif region_parent in ('不明',):
                    # 不明 = unknown
                    region_parent = 'unknown'

                # Resolve labels to ISO 3166-2 / city codes where possible.
                region_parent = self._labels_to_region_child.get_by_label(
                    Schemas.ADMIN_1, 'JP', region_parent,
                    default=region_parent)
                region_child = city_map.get(region_child.strip().lower(),
                                            region_child)
                region_child = self._labels_to_region_child.get_by_label(
                    Schemas.JP_CITY, region_parent, region_child,
                    default=region_child)

                # Manual fixes for cities attributed to the wrong
                # prefecture in the source data.
                if region_parent == 'jp-13' and region_child == 'niigata-shi konan-ku':
                    region_parent = 'jp-15'
                elif region_parent == 'jp-10' and region_child == 'tochigi-shi':
                    region_parent = 'jp-09'
                elif region_parent == 'jp-12' and region_child == 'kitaibaraki-shi':
                    region_parent = 'jp-08'
                elif region_parent == 'jp-14' and region_child == 'nagoya-shi nishi-ku':
                    region_parent = 'jp-23'
                elif region_parent == 'jp-13' and region_child == '宮崎市':
                    region_parent = 'jp-45'
                elif region_parent == 'jp-17' and region_child == '富山市':
                    region_parent = 'jp-16'
                elif region_parent == 'jp-40' and region_child == '中津市':
                    region_parent = 'jp-44'
                elif region_parent == 'jp-46' and region_child == '上尾市':
                    region_parent = 'jp-11'
                elif region_parent == 'jp-18' and region_child == '坂出市':
                    region_parent = 'jp-37'
                elif region_parent == 'jp-28' and region_child == 'osaka-shi taisho-ku':
                    region_parent = 'jp-27'
                elif region_child == '吹田市':
                    region_parent = 'jp-27'
                elif region_child == '東京都':
                    continue
                elif region_child in (
                    '宮町', '畑野氏', '大網白里市', '⻄尾市', '春日部恣意',
                    'ふじみ野市', '神奈川県', '滋賀県', '山郷町',
                    '⻑久⼿市', '愛⻄市', '古河市', '大阪府',
                ):
                    # ???
                    print("**IGNORING:", item)
                    region_child = 'unknown'

                if region_parent == 'jp-26':
                    print("KYOTO!!!", region_child)
                    num_kyoto += 1

                # Maybe it's worth adding status info, but it can be vague e.g. "退院または死亡"
                # Occupation info is also present in many cases.
                by_date[date_diagnosed] += 1
                by_age[date_diagnosed, agerange] += 1
                by_prefecture[date_diagnosed, region_parent] += 1
                if gender is not None:
                    by_gender[date_diagnosed, gender] += 1
                    by_gender_age[date_diagnosed, gender, agerange] += 1
                    by_prefecture_gender[date_diagnosed, region_parent, gender] += 1
                    by_prefecture_age_gender[date_diagnosed, region_parent, agerange, gender] += 1
                by_prefecture_age[date_diagnosed, region_parent, agerange] += 1

                if region_parent == 'tokyo' and region_child.lower() == 'unknown':
                    # Will add region_child-level data
                    continue
                else:
                    by_city[date_diagnosed, region_parent, region_child] += 1
                    if gender is not None:
                        by_city_gender[date_diagnosed, region_parent, region_child, gender] += 1
                        by_city_age_gender[date_diagnosed, region_parent, region_child, agerange, gender] += 1

                if item.get('居住市区町村') and region_parent == 'jp-27':
                    num_city += 1

        # Country-level cumulative total.
        cumulative = 0
        for date, value in sorted(by_date.items()):
            cumulative += value
            r.append(
                region_schema=Schemas.ADMIN_0,
                region_child='Japan',
                datatype=DataTypes.TOTAL,
                value=cumulative,
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Country-level cumulative totals by age band.
        cumulative = Counter()
        for (date, agerange), value in sorted(by_age.items()):
            cumulative[agerange] += value
            r.append(
                region_schema=Schemas.ADMIN_0,
                region_child='Japan',
                datatype=DataTypes.TOTAL,
                agerange=agerange,
                value=cumulative[agerange],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Cumulative totals by prefecture.
        cumulative = Counter()
        for (date, prefecture), value in sorted(by_prefecture.items()):
            cumulative[prefecture] += value
            r.append(
                region_schema=Schemas.ADMIN_1,
                region_parent='Japan',
                region_child=prefecture,
                datatype=DataTypes.TOTAL,
                value=cumulative[prefecture],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Country-level cumulative totals by gender.
        cumulative = Counter()
        for (date, gender), value in sorted(by_gender.items()):
            cumulative[gender] += value
            r.append(
                region_schema=Schemas.ADMIN_0,
                region_child='Japan',
                datatype=gender,
                value=cumulative[gender],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Country-level cumulative totals by gender and age band.
        cumulative = Counter()
        for (date, gender, agerange), value in sorted(by_gender_age.items()):
            cumulative[gender, agerange] += value
            r.append(
                region_schema=Schemas.ADMIN_0,
                region_child='Japan',
                datatype=gender,
                agerange=agerange,
                # NOTE(review): the counter is incremented under key
                # (gender, agerange) but read here with key `gender`
                # alone — Counter returns 0 for missing keys, so this
                # value looks wrong; likely should be
                # cumulative[gender, agerange]. TODO confirm.
                value=cumulative[gender],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Cumulative totals by prefecture and gender.
        cumulative = Counter()
        for (date, prefecture, gender), value in sorted(by_prefecture_gender.items()):
            cumulative[prefecture, gender] += value
            r.append(
                region_schema=Schemas.ADMIN_1,
                region_parent='Japan',
                region_child=prefecture,
                datatype=gender,
                value=cumulative[prefecture, gender],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Cumulative totals by prefecture and age band.
        cumulative = Counter()
        for (date, prefecture, agerange), value in sorted(by_prefecture_age.items()):
            cumulative[prefecture, agerange] += value
            r.append(
                region_schema=Schemas.ADMIN_1,
                region_parent='Japan',
                region_child=prefecture,
                datatype=DataTypes.TOTAL,
                agerange=agerange,
                value=cumulative[prefecture, agerange],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Cumulative totals by prefecture, age band and gender.
        cumulative = Counter()
        for (date, prefecture, agerange, gender), value in sorted(by_prefecture_age_gender.items()):
            cumulative[prefecture, agerange, gender] += value
            r.append(
                region_schema=Schemas.ADMIN_1,
                region_parent='Japan',
                region_child=prefecture,
                datatype=gender,
                agerange=agerange,
                value=cumulative[prefecture, agerange, gender],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Cumulative totals by city.
        cumulative = Counter()
        for (date, prefecture, region_child), value in sorted(by_city.items()):
            cumulative[prefecture, region_child] += value
            r.append(
                region_schema=Schemas.JP_CITY,
                region_parent=prefecture,
                region_child=region_child,
                datatype=DataTypes.TOTAL,
                value=cumulative[prefecture, region_child],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )
        #print("***TOTAL SUM:", sum(cumulative.values()))

        # Cumulative totals by city and gender.
        cumulative = Counter()
        for (date, prefecture, region_child, gender), value in sorted(by_city_gender.items()):
            cumulative[prefecture, region_child, gender] += value
            r.append(
                region_schema=Schemas.JP_CITY,
                region_parent=prefecture,
                region_child=region_child,
                datatype=gender,
                value=cumulative[prefecture, region_child, gender],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        # Cumulative totals by city, age band and gender.
        cumulative = Counter()
        for (date, prefecture, region_child, agerange, gender), value in sorted(by_city_age_gender.items()):
            cumulative[prefecture, region_child, agerange, gender] += value
            r.append(
                region_schema=Schemas.JP_CITY,
                region_parent=prefecture,
                region_child=region_child,
                datatype=gender,
                agerange=agerange,
                value=cumulative[prefecture, region_child, agerange, gender],
                date_updated=date,
                source_url=self.SOURCE_URL,  # FIXME!!
            )

        return r
def __init__(self, schema, schema_dict):
    """
    Keep the schema identifier and its definition dict, and create a
    label-to-region resolver for later lookups.

    :param schema: schema identifier
    :param schema_dict: dict describing the schema
    """
    self.ltrc = LabelsToRegionChild()
    self.schema = schema
    self.schema_dict = schema_dict