Exemple #1
0
    def __get_labels_to_admin_1(self):
        """
        Build label-lookup tables from the world_geodata output files.

        Returns a 3-tuple of:
          * english: {(schema, parent or None, lowercased English label):
                      region_child}
          * non_english: {(schema, parent or None, label): region_child}
          * all_possible: set of (schema, parent.lower(), child.lower())
        """
        english = {}
        non_english = {}
        all_possible = set()

        output_dir = get_package_dir() / 'world_geodata' / 'output'

        for file_name in listdir(output_dir):
            with open(output_dir / file_name, 'r', encoding='utf-8') as f:
                data = json.loads(f.read())

            for schema_key, schema_dict in data.items():
                # HACK: these schemas are deliberately skipped
                if schema_key.upper() in ('FR_DEPARTMENT', 'LK_DISTRICT',
                                          'ES_MADRID_MUNICIPALITY'):
                    continue
                schema = Schemas(schema_key)

                for parent, parent_dict in schema_dict.items():
                    for child, child_item in parent_dict.items():
                        en_label = child_item['label']['en']
                        if en_label is not None:
                            english[schema, parent or None,
                                    en_label.strip().lower()] = child

                        for label in self.__iter_non_english_labels(
                                child_item['label']):
                            non_english[schema, parent or None,
                                        label] = child

                        all_possible.add(
                            (schema, parent.lower(), child.lower()))

        return english, non_english, all_possible
Exemple #2
0
    def get_schema_types(self):
        """
        Load schema_types.json and augment it with derived runtime fields
        (underlays, boundaries, listings, updated dates, output path),
        lowercasing each schema's ISO 3166 code along the way.
        """
        path = (get_package_dir() / 'covid_db' / 'datatypes' /
                'schema_types.json')
        with open(path, 'r', encoding='utf-8') as f:
            schema_types = json.loads(f.read())

        schema_types['underlays'] = self.__get_underlay_key()
        schema_types['boundaries'] = self.__get_boundaries()
        schema_types['listings'] = self.__get_listings()
        schema_types['updated_dates_by_datatype'] = \
            self.updated_dates_by_datatype

        # Normalize ISO 3166 codes to lowercase where present
        for schema_dict in schema_types['schemas'].values():
            iso_code = schema_dict['iso_3166']
            if iso_code:
                schema_dict['iso_3166'] = iso_code.lower()

        schema_types['output_path'] = \
            f'{self.time_format}-{self.revision_id}'

        return schema_types
 def create_indexes(self):
     """
     Create the DB indexes (run after bulk inserts to improve performance).
     """
     # BUGFIX: the SQL file was opened via open(...).read() and the
     # handle leaked; use a context manager so it is always closed.
     with open(get_package_dir() / 'covid_db' / 'indexes.sql',
               'r',
               encoding='utf-8') as f:
         sql = f.read()
     with self.conn:
         self.conn.executescript(sql)
         self.conn.commit()
 def __create_tables(self):
     """
     Create the datapoint tables from datapoints.sql.
     """
     # BUGFIX: close the SQL file deterministically (it was leaked via
     # open(...).read()), and close the connection even if the script
     # raises — `with conn` only controls the transaction, not closing.
     with open(get_package_dir() / 'covid_db' / 'datapoints.sql',
               'r',
               encoding='utf-8') as f:
         sql = f.read()
     conn = sqlite3.connect(self.path)
     try:
         with conn:
             conn.executescript(sql)
     finally:
         conn.close()
Exemple #5
0
def get_population_map():
    """
    Read geojson_pop.tsv and return a dict mapping
    (region_schema, region_parent, region_child) -> 2020 population (int).
    """
    pop_map = {}
    tsv_path = get_package_dir() / 'world_geodata' / 'geojson_pop.tsv'
    with open(tsv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            key = (row['region_schema'], row['region_parent'],
                   row['region_child'])
            pop_map[key] = int(row['pop_2020'])
    return pop_map
Exemple #6
0
def _get_mappings_to_iso_3166():
    """
    Read schema_mappings.csv (tab-delimited despite the .csv extension)
    and return {(original schema, original parent, original child):
                (schema, parent, child)}.
    """
    mappings = {}
    path = (get_package_dir() / 'covid_db' / 'datatypes' /
            'schema_mappings.csv')

    with open(path, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f, delimiter='\t'):
            from_key = (Schemas(row['original_schema'].strip()),
                        row['original_parent'].strip(),
                        row['original_child'].strip())
            to_value = (Schemas(row['schema'].strip()),
                        row['parent'].strip(),
                        row['child'].strip())
            mappings[from_key] = to_value

    return mappings
def get_worksheet():
    """
    Authorize against Google Sheets using the case_locations service
    credentials and return the first worksheet of the spreadsheet.
    """
    credentials_path = (get_package_dir() / 'case_locations' /
                        'credentials.json')
    client = pygsheets.authorize(service_file=str(credentials_path))
    spreadsheet = client.open_by_url(
        'https://docs.google.com/spreadsheets/d/'
        '1ddw3GqI4RsrphdjJcYX-llnQ-JTV_q2atOr1UToLbw0/edit#gid=0')
    return spreadsheet[0]
Exemple #8
0
def get_tokyo_cities_to_en_map():
    """
    Return a dict mapping Tokyo city names in Japanese to English,
    including variants with common suffix characters removed.
    """
    r = {}
    csv_path = (get_package_dir() / 'covid_crawlers' / 'se_asia' /
                'jp_tokyo_data' / 'tokyo_cities.csv')

    with open(csv_path, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f, delimiter='\t'):
            ja = row['ja'].strip()
            en = row['en'].strip()
            r[ja] = en
            # Also map names with ward/town/village/city/island suffix
            # chars removed (note rstrip strips any trailing run of
            # the character, not just a single occurrence).
            for suffix in '区町村市島':
                r[ja.rstrip(suffix)] = en
    return r
def get_population_dict(year):
    """
    Get populations for all countries for a given year from the World
    Bank data CSV.

    :param year: the year (int) whose population column to read
    :return: dict mapping lowercased ISO 3166-1 alpha-2 country codes
             to population (int)
    """
    # NOTE: the previous docstring documented parameters (`country`,
    # `from_year`, `to_year`) and a return shape that never existed.

    r = {}
    csv_path = (get_package_dir() /
                'other_data/world_bank_data/world_bank_popdata.csv')

    with open(csv_path, 'r', encoding='utf-8-sig') as f:
        for item in csv.DictReader(f):
            for k, v in item.items():
                # Only the year columns have numeric headers; skip others
                try:
                    column_year = int(k)
                except ValueError:
                    continue

                if column_year != year:
                    continue

                try:
                    data_item = get_data_item_by_code(item['Country Code'])
                except KeyError:
                    print("KeyError:", item['Country Code'])
                    continue

                if v:
                    r[data_item.iso3166.a2.lower()] = int(v)
                elif data_item.iso3166.a2 == 'ER':
                    # HACK: Eritrea is missing from the World Bank data
                    r[data_item.iso3166.a2.lower()] = 3213972
                else:
                    print("No data:", data_item.iso3166.a2)

    return r
def get_districts_map():
    """
    Return a dict mapping Thai district native names (with and without
    the เขต/อำเภอ prefixes) to their English names, for rows whose
    status is 'District' or 'City District'.
    """
    r = {}

    with open(get_package_dir() / 'covid_crawlers' / 'se_asia' / 'th_data' /
              'th_districts.csv',
              'r',
              encoding='utf-8') as f:

        for item in csv.DictReader(f, delimiter='\t'):
            if item['Status'].strip() not in ('District', 'City District'):
                continue

            native = item['Native'].strip()
            name = item['Name'].strip()

            # BUGFIX: the duplicate check previously looked up the
            # UNstripped native name while stripped keys are stored, so
            # it could miss clashes (and the assert message could itself
            # raise KeyError). Check the stripped key instead.
            assert r.get(native, name) == name, (item, r.get(native))

            r[native] = name
            # Bangkok districts carry the เขต prefix, others อำเภอ
            r[native.replace('เขต', '')] = name
            r[native.replace('อำเภอ', '')] = name

    return r
Exemple #11
0
def _get_data_items():
    """
    Return DataItem records for ISO 3166-2 subdivisions: first from the
    IP2LOCATION CSV, then any pycountry subdivisions not already present.
    """
    r = []
    csv_path = (get_package_dir() / 'misc_data_scripts' / 'other_data' /
                'iso_3166_2' / 'IP2LOCATION-ISO3166-2.CSV')

    added = set()
    # BUGFIX: the file handle was previously opened and never closed;
    # use a context manager.
    with open(csv_path, 'r', encoding='utf-8') as f:
        for item in csv.DictReader(f):
            r.append(DataItem(**item))
            added.add((r[-1].code, r[-1].subdivision_name))

    for subdivision in pycountry.subdivisions:
        # Don't duplicate entries the CSV already covered
        if (subdivision.code, subdivision.name) in added:
            continue

        r.append(
            DataItem(country_code=subdivision.country.alpha_2,
                     subdivision_name=subdivision.name,
                     code=subdivision.code))

    return r
Exemple #12
0
    def __get_boundaries(self):
        """
        Return {lowercased region child: boundary}, computed from the
        geodata in the admin_0/admin_1 world_geodata output files.
        """
        boundaries = {}
        output_dir = get_package_dir() / 'world_geodata' / 'output'

        for file_name in listdir(output_dir):
            # Only country-level (admin_0) and first-level (admin_1)
            # boundary files are considered
            if not (file_name == 'admin_0.json' or
                    file_name.startswith('admin_1')):
                continue

            with open(output_dir / file_name, 'r', encoding='utf-8') as f:
                data = json.loads(f.read())

            for schema_dict in data.values():
                for parent_dict in schema_dict.values():
                    for child, child_item in parent_dict.items():
                        geodata = [i[1] for i in child_item['geodata']]
                        boundaries[child.lower()] = \
                            self.__get_boundary(geodata)

        return boundaries
import csv
import datetime
import numpy as np
from os import listdir
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import ScalarFormatter
from matplotlib.font_manager import FontProperties
from _utility.get_package_dir import get_package_dir

# Directory holding the crawled AU data TSV output files
OUTPUT_CSV_DIR = (get_package_dir() / 'covid_crawlers' / 'oceania' /
                  'au_data' / 'output')
# Directory the rendered matplotlib graphs are written to
GRAPH_OUTPUT_DIR = (get_package_dir() / 'output_graphs' / 'output')


def read_csv(region_schema,
             datatype,
             agerange_filter=None,
             region_filter=None,
             value_filter=None,
             state_filter=None):

    if state_filter and not isinstance(state_filter, (list, tuple)):
        state_filter = (state_filter, )

    r = {}

    # Get the newest, based on binary sort order
    # (year->month->day->revision id)
    fnam = list(
        sorted([i for i in listdir(OUTPUT_CSV_DIR) if i.endswith('.tsv')],
import csv
from _utility.get_package_dir import get_package_dir

# CSV mapping US county names to FIPS codes (columns: fips,name,state)
PATH = get_package_dir(
) / 'covid_crawlers' / 'world' / 'world_jhu_data' / 'state_and_county_fips.csv'


def get_county_to_code_map():
    """
    Return a dict mapping (state, lowercased county name) -> FIPS code.

    Variants with common suffixes (' county', ' borough', ' census area',
    ' municipality', ' parish', ' city', ' city and borough') removed are
    added as additional keys.
    """
    r = {}

    with open(PATH, 'r', encoding='utf-8') as f:
        for d in csv.DictReader(f):
            # Columns: fips,name,state
            d['name'] = d['name'].lower()

            # BUGFIX: the original `assert not d['name'] in r` compared a
            # bare name string against (state, name) tuple keys, so the
            # duplicate check could never fire; test the real key.
            assert (d['state'], d['name']) not in r, d['name']

            r[d['state'], d['name']] = d['fips']
            # Each suffix is stripped from the original name
            # independently, exactly as before
            for suffix in (' county', ' borough', ' census area',
                           ' municipality', ' parish', ' city',
                           ' city and borough'):
                r[d['state'], d['name'].replace(suffix, '')] = d['fips']

    return r


if __name__ == '__main__':
    get_county_to_code_map()
Exemple #15
0
from pyquery import PyQuery as pq
from re import compile, IGNORECASE

from covid_crawlers.oceania.au_data.StateNewsBase import StateNewsBase, singledaystat, bothlistingandstat
from covid_db.datatypes.enums import Schemas, DataTypes
from covid_db.datatypes.DataPoint import DataPoint
from _utility.word_to_number import word_to_number
from _utility.get_package_dir import get_package_dir

TAS_BY_LGA = get_package_dir(
) / 'covid_crawlers' / 'oceania' / 'au_data' / 'tas' / 'tas_by_lga.json'
TAS_BY_THS = get_package_dir(
) / 'covid_crawlers' / 'oceania' / 'au_data' / 'tas' / 'tas_by_ths.tsv'


class TasNews(StateNewsBase):
    STATE_NAME = 'tas'

    SOURCE_ID = 'au_tas_press_releases'
    SOURCE_URL = 'https://coronavirus.tas.gov.au'
    SOURCE_DESCRIPTION = ''

    LISTING_URL = (
        'https://www.dhhs.tas.gov.au/news/2020',
        'https://www.coronavirus.tas.gov.au/',
        'https://coronavirus.tas.gov.au/media-releases',
        'https://coronavirus.tas.gov.au/media-releases?result_85500_result_page=2',
        'https://coronavirus.tas.gov.au/media-releases?result_85500_result_page=3',
        'https://coronavirus.tas.gov.au/media-releases?result_85500_result_page=4',
    )
    LISTING_HREF_SELECTOR = 'table.dhhs a, ' \
def _get_data_items():
    """
    Parse the Correlates of War country dataset (cow.csv, ';'-delimited,
    '#' comment lines skipped) into a list of COWItem instances.
    """
    r = []

    # BUGFIX: the file handle was previously opened and never closed;
    # read all rows inside a context manager instead.
    with open(get_package_dir() / 'misc_data_scripts' / 'other_data' /
              'iso_3166_1' / 'cow.csv',
              'r',
              encoding='utf-8') as f:
        rows = list(csv.DictReader(filter(lambda row: row[0] != '#', f),
                                   delimiter=';'))

    for item in rows:
        for k in item:
            item[k] = item[k].strip()

        r.append(
            COWItem(**dict(
                iso3166=ISO3166(a2=item['ISO3166A2'],
                                a3=item['ISO3166A3'],
                                n3=item['ISO3166N3']),
                fips104=item['FIPS104'],
                has_capital=item['HasCapital'],
                continent=item['continent'],
                subcontinent=item['subcontinent'],
                language=item['language'],
                population=int(item['population']),
                year=item['year'],
                conventional_abbreviation=item['conabbr'],
                international_dialing_code=item['ITU'],
                international_vehicle_code=item['IVC'],
                area=dict(land=item['land'],
                          water=item['water'],
                          lang_total=item['land_total']),
                coords=Coord(float(item['latitude']), float(
                    item['longitude'])),
                max_coords=Coord(float(item['maxlatitude']),
                                 float(item['maxlongitude'])),
                min_coords=Coord(float(item['minlatitude']),
                                 float(item['minlongitude'])),
                iso=dict(name=dict(en=item['ISOen_name'],
                                   en_romanized=item['ISOen_ro_name'],
                                   fr=item['ISOfr_name'],
                                   es=item['ISOes_name']),
                         proper=dict(en=item['ISOen_proper'],
                                     en_romanized=item['ISOen_ro_proper'],
                                     fr=item['ISOfr_proper']),
                         region=dict(region=item['ISOregion'],
                                     subregion=item['ISOsubregion'])),
                un=dict(en=item['UNen_capital'],
                        fr=item['UNfr_capital'],
                        es=item['UNes_capital'],
                        ru=item['UNru_capital'],
                        # Some rows have no UN capital coordinates
                        capital_coords=Coord(float(item['UNc_latitude']),
                                             float(item['UNc_longitude']))
                        if item['UNc_latitude'].strip() else None),
                ungegn=dict(
                    name=dict(en=item['UNGEGNen_name'],
                              fr=item['UNGEGNfr_name'],
                              es=item['UNGEGNes_name'],
                              ru=item['UNGEGNru_name'],
                              native_romanized=item['UNGEGNlc_ro_name']),
                    longname=dict(
                        en=item['UNGEGNen_longname'],
                        fr=item['UNGEGNfr_longname'],
                        es=item['UNGEGNes_longname'],
                        ru=item['UNGEGNru_longname'],
                        native_romanized=item['UNGEGNlc_ro_longname']),
                    capital_romanized=item['UNGEGNlc_capital'],
                ),
                bgn=dict(name=dict(en=item['BGN_name'],
                                   native_romanized=item['BGNlc_name']),
                         proper=dict(en=item['BGN_proper']),
                         longname=dict(en=item['BGN_longname'],
                                       en_romanized=item['BGNlc_longname']),
                         capital=item['BGN_capital'],
                         capital_coords=Coord(float(item['BGNc_latitude']),
                                              float(item['BGNc_longitude'])),
                         demonym=item['BGN_demonym'],
                         demonym_adjective=item['BGN_demomyn_adj']),
                pcgn=dict(name=item['PCGN_name'],
                          proper=item['PCGN_proper'],
                          longname=item['PCGN_longname']),
                fao=dict(name=item['FAOit_name'],
                         proper=item['FAOit_proper'],
                         longname=item['FAOit_longname']),
                eki=dict(name=item['EKI_name'],
                         longname=item['EKI_longname'],
                         capital=item['EKI_capital']),
                url=dict(url_gov=item['url_gov'],
                         url_stats=item['url_stats'],
                         url_gis=item['url_gis'],
                         url_post=item['url_post']))))

    return r
Exemple #17
0
            except KeyError:
                print("COUNTRY-LEVEL DATA NOT FOUND:", item)

        try:
            out[item.region_schema, item.region_parent,
                item.region_child] += get_pop(item.geometry)
            ok_indicator.setdefault(
                (item.region_schema, item.region_parent, item.region_child),
                True)
        except ValueError:
            import traceback
            traceback.print_exc()
            ok_indicator[item.region_schema, item.region_parent,
                         item.region_child] = False

print()

# Write the computed populations as a TSV, one row per region, with a
# flag recording whether the computation raised (no_exc).
out_path = get_package_dir() / 'world_geodata' / 'geojson_pop.tsv'
with open(out_path, 'w', encoding='utf-8') as f:
    f.write('region_schema\tregion_parent\tregion_child\tpop_2020\tno_exc\n')

    for key, pop in sorted(out.items()):
        region_schema, region_parent, region_child = key
        f.write(f'{region_schema}\t'
                f'{region_parent}\t'
                f'{region_child}\t'
                f'{round(pop)}\t'
                f'{ok_indicator[key]}'
                f'\n')
import csv
from _utility.get_package_dir import get_package_dir

HR_MAP_PATH = get_package_dir(
) / 'covid_crawlers' / 'americas' / 'ca_data' / 'hr_map.csv'

# Maps the province labels used in hr_map.csv to ISO 3166-2:CA codes;
# 'Repatriated' cases have no province and map to 'other'.
_province_map = {
    'Alberta': 'CA-AB',
    'BC': 'CA-BC',
    'Manitoba': 'CA-MB',
    'New Brunswick': 'CA-NB',
    'NL': 'CA-NL',
    'Nova Scotia': 'CA-NS',
    'Nunavut': 'CA-NU',
    'NWT': 'CA-NT',
    'Ontario': 'CA-ON',
    'PEI': 'CA-PE',
    'Quebec': 'CA-QC',
    'Saskatchewan': 'CA-SK',
    'Yukon': 'CA-YT',
    'Repatriated': 'other',
}


def _get_hr_map():
    uid_to_hr = {}
    hr_to_uid = {}

    with open(HR_MAP_PATH, 'r', encoding='utf-8') as f:
        for item in csv.DictReader(f):
            province = _province_map[item['Province']]
Exemple #19
0
import json
from _utility.get_package_dir import get_package_dir

with open(get_package_dir() / 'covid_db' / 'datatypes' / 'schema_types.json',
          'r',
          encoding='utf-8') as f:
    schema_types = json.loads(f.read())
    TimeSeriesKey, DataTypes.PERCENT, DataTypes.INTEGER, DataTypes.FLOATING_POINT
from misc_data_scripts.other_data import DataPoint
from misc_data_scripts.other_data import DateType
from covid_db.datatypes.enums import Schemas, DataTypes

from misc_data_scripts.other_data.abs_data.lga_to_state_and_name import \
    get_lga_to_state_and_name_dict
from _utility.normalize_locality_name import \
    normalize_locality_name
from _utility.get_package_dir import get_package_dir


lga_dict = get_lga_to_state_and_name_dict()


BASE_PATH = get_package_dir() / 'misc_data_scripts' / 'other_data' / 'abs_data' / 'stats'
BASE_EXCEL_PATH = get_package_dir() / 'misc_data_scripts' / 'other_data' / 'abs_data' / 'excel_stats'


class ExcelABSStats(UnderlayDataBase):
    IGNORE_KEYS = {
        'Age of Persons Born Overseas',
        'Business Entries',
        'Building Approvals',
        'Selected Government Pensions and Allowances',
        'Children Attending a Preschool Program',
        'Children Attending a Preschool Program (4 & 5 year olds)',
        'Children Enrolled in a Preschool Program (4 & 5 year olds)',
        'Age of Persons Born Overseas',
        'Religious Affiliation',
        'Religious Affiliation Persons Born Overseas',
import json
from os import listdir

from _utility.get_package_dir import get_global_subnational_covid_data_dir, get_package_dir
from world_geodata.get_population_map import get_population_map

GEOJSON_DIR = get_package_dir() / 'world_geodata' / 'output'


def output_geojson():
    """
    Write each schema's polygon and point GeoJSON files under the
    global subnational covid data directory.
    """
    def _write_json(path, data):
        # Create parent dirs as needed, then pretty-print the GeoJSON
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))

    geojson_data = _OutputGeoJSON().get_geojson_data()
    for k, (poly_geojson, point_geojson) in geojson_data.items():
        base_dir = get_global_subnational_covid_data_dir() / 'geojson'
        _write_json(base_dir / 'poly' / f'{k}.json', poly_geojson)
        _write_json(base_dir / 'point' / f'{k}.json', point_geojson)


class _OutputGeoJSON:
    def __init__(self):
        self._population_map = get_population_map()

    def get_geojson_data(self):
        r = {}
from re import compile
from pyquery import PyQuery as pq

from covid_crawlers.oceania.au_data.StateNewsBase import StateNewsBase, bothlistingandstat, singledaystat
from covid_db.datatypes.enums import Schemas, DataTypes
from covid_db.datatypes.DataPoint import DataPoint
from _utility.word_to_number import word_to_number
from _utility.get_package_dir import get_package_dir

OUTPUT_DIR = get_package_dir(
) / 'covid_crawlers' / 'oceania' / 'au_data' / 'sa' / 'output'


class SANews(StateNewsBase):
    STATE_NAME = 'sa'

    SOURCE_ID = 'au_sa_press_releases'
    SOURCE_URL = 'https://www.covid-19.sa.gov.au'
    SOURCE_DESCRIPTION = ''

    LISTING_URL = 'https://www.sahealth.sa.gov.au/wps/wcm/connect/Public+Content/SA+Health+Internet/About+us/News+and+media/all+media+releases/?mr-sort=date-desc&mr-pg=1'
    LISTING_HREF_SELECTOR = '.news a, .article-list-item a.arrow-link'
    STATS_BY_REGION_URL = 'https://www.sahealth.sa.gov.au/wps/wcm/connect/public+content/sa+health+internet/conditions/infectious+diseases/covid+2019/latest+updates/covid-19+cases+in+south+australia'

    def _get_date(self, href, html):
        #print("HREF:", href)
        try:
            # New format of updated SA website as of 23/4/2020
            date = pq(html)('.main-content p')[0]
            if '2020' in pq(date).text() or '2021' in pq(date).text():
                return self._extract_date_using_format(
Exemple #23
0
from data_export.output_csv_data import get_csv_data_for_source_id

# MONKEY PATCH: Reduce cherrpy json file output
_json._encode = json.JSONEncoder(separators=(',', ':')).iterencode

env = Environment(loader=FileSystemLoader('./templates'))

from covid_db.datatypes.enums import Schemas, DataTypes
from covid_db.SQLiteDataRevision import SQLiteDataRevision
from covid_db.SQLiteDataRevisions import SQLiteDataRevisions
from _utility.get_package_dir import get_package_dir
from covid_db.datatypes import date_fns
from covid_db.output_compressor.output_revision_datapoints_to_zip import output_revision_datapoints_to_zip
from _utility.normalize_locality_name import normalize_locality_name

OUTPUT_DIR = get_package_dir(
) / 'covid_crawlers' / 'oceania' / 'au_data' / 'output'
OUTPUT_GRAPHS_DIR = get_package_dir(
) / 'world_subnational_covid_crawler' / 'output_graphs' / 'output'
UPDATE_SCRIPT_PATH = get_package_dir() / 'output_data.py'
UPDATE_CASE_LOCS_PATH = get_package_dir(
) / 'case_locations' / 'update_spreadsheet.py'
mimetypes.types_map['.tsv'] = 'text/tab-separated-values'


class App(object):
    def __init__(self):
        self.revisions = SQLiteDataRevisions()
        _thread.start_new_thread(self.loop, ())

    def loop(self):
        powerbi_run_1st = False
import json
from polylabel import polylabel
from os.path import basename
from abc import ABC, abstractmethod
from covid_db.datatypes.schema_types import schema_types
from _utility.get_package_dir import get_package_dir

OUTPUT_DIR = get_package_dir() / 'world_geodata' / 'output'
DATA_DIR = get_package_dir() / 'world_geodata' / 'data'


class ProcessGeoJSONBase(ABC):
    def __init__(self, schema_name):
        self.schema_name = schema_name

    def output_json(self, in_paths, out_dir, pretty_print=False):
        r = {}

        for in_path in in_paths:
            fnam = basename(in_path)
            with open(in_path, 'r', encoding='utf-8') as f:
                geojson = json.loads(
                    f.read(),
                    # Limit the float precision to 3 digits
                    # after the decimal place to save space
                    parse_float=lambda x: round(float(x), 3))

            # Convert MultiPolygon's to single Polygon's
            features = []
            for feature in geojson['features']:
                if feature['geometry'] is None:
import csv
from glob import glob
from _utility.get_package_dir import get_package_dir

BASE_PATH = get_package_dir(
) / 'misc_data_scripts' / 'other_data' / 'abs_data' / 'lga'


def get_lga_to_state_and_name_dict():
    """
    Return {LGA code (int): (ISO 3166-2 state code, LGA name)} built
    from the ABS LGA CSV files.
    """
    # NOTE(review): 'Australian Capital Territory' maps to 'AU-NSW' and
    # 'Other Territories' to None in the original data — confirm ACT
    # should not be 'AU-ACT' before changing.
    state_dict = {
        'New South Wales': 'AU-NSW',
        'Australian Capital Territory': 'AU-NSW',
        'Northern Territory': 'AU-NT',
        'Other Territories': None,
        'Queensland': 'AU-QLD',
        'South Australia': 'AU-SA',
        'Tasmania': 'AU-TAS',
        'Victoria': 'AU-VIC',
        'Western Australia': 'AU-WA'
    }

    lga_map = {}
    for csv_path in glob(str(BASE_PATH / '*.csv')):
        with open(csv_path, 'r', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                lga_code = int(row['LGA_CODE_2016'])
                # Drop any parenthesized qualifier from the LGA name
                lga_name = row['LGA_NAME_2016'].split('(')[0].strip()
                lga_map[lga_code] = (state_dict[row['STATE_NAME_2016']],
                                     lga_name)
    return lga_map