Example #1
import os

import pandas as pd

from cowidev.utils.utils import get_project_dir
# get_soup, clean_count and extract_clean_date are project helpers; this
# import path is an assumption.
from cowidev.utils import get_soup, clean_count, extract_clean_date


def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Kenya.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "http://covidkenya.org/"

    soup = get_soup(source_url)

    element = soup.find("div", class_="elementor-element-b36fad5").find(
        class_="elementor-text-editor")
    cumulative_total = clean_count(element.text)

    date_raw = soup.select(".elementor-element-75168b2 p")[0].text
    date = extract_clean_date(
        date_raw,
        regex=r"\[Updated on ([A-Za-z]+ \d+) \[\d\d:\d\d\]",
        date_format="%B %d",
        replace_year=2021)

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [date],
            "Country": "Kenya",
            "Units": "samples tested",
            "Source URL": source_url,
            "Source label": "Kenya Ministry of Health",
        })

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
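
# A sketch of what extract_clean_date is assumed to do with the regex above:
#   re.search(r"\[Updated on ([A-Za-z]+ \d+) \[\d\d:\d\d\]",
#             "[Updated on July 5 [09:00]").group(1)            -> "July 5"
#   then datetime.strptime("July 5", "%B %d").replace(year=2021) -> 2021-07-05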
Example #2
# (Same imports as Example #1.)
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Lebanon.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://corona.ministryinfo.gov.lb/"

    soup = get_soup(source_url)

    element = soup.find("h1", class_="s-counter3")
    cumulative_total = clean_count(element.text)

    date_raw = soup.select(".last-update strong")[0].text
    date = extract_clean_date(date_raw, regex=r"([A-Za-z]+ \d+)", date_format="%b %d", replace_year=2021)

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Lebanon",
                "Units": "tests performed",
                "Source URL": source_url,
                "Source label": "Lebanon Ministry of Health",
            }
        )

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
Example #3
import os

import pandas as pd

from cowidev.utils.utils import get_project_dir
# get_soup, clean_count and localdate are project helpers; this import path
# is an assumption.
from cowidev.utils import get_soup, clean_count, localdate


def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Azerbaijan.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://koronavirusinfo.az/az/page/statistika/azerbaycanda-cari-veziyyet"

    soup = get_soup(source_url)

    element = soup.find_all("div", class_="gray_little_statistic")[5].find("strong")
    cumulative_total = clean_count(element.text)

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [localdate("Asia/Baku")],
            "Country": "Azerbaijan",
            "Units": "tests performed",
            "Source URL": source_url,
            "Source label": "Cabinet of Ministers of Azerbaijan",
        })

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
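
# localdate is assumed to return today's date in the given timezone, roughly:
#   datetime.now(pytz.timezone("Asia/Baku")).strftime("%Y-%m-%d")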
Example #4
# (Same imports as Example #3.)
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Nigeria.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "http://covid19.ncdc.gov.ng/"

    soup = get_soup(source_url)

    element = soup.find("div", class_="col-xl-3").find("span")
    cumulative_total = clean_count(element.text)

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Date": [localdate("Africa/Lagos")],
            "Cumulative total": cumulative_total,
            "Country": "Nigeria",
            "Units": "samples tested",
            "Source URL": source_url,
            "Source label": "Nigeria Centre for Disease Control",
        })

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
Example #5
import json
import os

import pandas as pd

from cowidev.utils.utils import get_project_dir
# get_soup is a project helper; this import path is an assumption.
from cowidev.utils import get_soup


def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Tunisia.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://onmne.tn"

    soup = get_soup(source_url)

    cumulative_total = json.loads(
        soup.find("span", class_="vcex-milestone-time").attrs["data-options"]
    )["endVal"]

    date = soup.select("p span")[0].text.replace("Chiffres clés mis à jour le ", "")
    date = pd.to_datetime(date, format="%d %B %Y").strftime("%Y-%m-%d")
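    # Note: "%B" matches month names in the current locale and the source dates
    # are French (e.g. "5 juillet 2021"), so this parse is assumed to run under
    # a French locale, e.g. locale.setlocale(locale.LC_TIME, "fr_FR.UTF-8").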

    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [Date],
            "Country": "Tunisia",
            "Units": "people tested",
            "Source URL": source_url,
            "Source label": "Tunisia Ministry of Health",
        })

        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
Example #6
# Method of a pipeline class (class definition and imports not shown).
def pipe_filter_locations(self, df: pd.DataFrame) -> pd.DataFrame:
    # Keep only locations present in the UN population file
    populations_path = os.path.join(
        get_project_dir(), "scripts", "input", "un", "population_latest.csv"
    )
    dfc = pd.read_csv(populations_path)
    df = df[df.location.isin(dfc.entity.unique())]
    return df
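
# The isin filter keeps only rows whose location appears in the population
# file: e.g. locations ["France", "Narnia"] checked against UN entities
# ["France", "Spain"] would retain only the "France" row.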
Example #7
import os

import pandas as pd

from cowidev.utils.utils import get_project_dir


def countries_missing(
    path_population: str = None,
    path_locations: str = None,
    ascending: bool = False,
    as_dict: bool = False,
):
    """Get countries currently not present in our dataset.

    Args:
        path_population (str, optional): Path to UN population csv file.
                                            Default value works if repo structure is left unmodified.
        path_locations (str, optional): Path to locations csv file.
                                        Default value works if repo structure is left unmodified.
        ascending (bool, optional): Set to True to sort results in ascending order. By default sorts in descending
                                    order.
        as_dict (bool, optional): Set to True for the return value to be shaped as a dictionary. Otherwise returns a
                                    DataFrame.
    """
    if not path_population:
        path_population = os.path.abspath(
            os.path.join(get_project_dir(), "scripts", "input", "un",
                         "population_latest.csv"))
    if not path_locations:
        path_locations = os.path.abspath(
            os.path.join(get_project_dir(), "public", "data", "vaccinations",
                         "locations.csv"))
    df_loc = pd.read_csv(path_locations, usecols=["location"])
    df_pop = pd.read_csv(path_population)
    df_pop = df_pop[df_pop.iso_code.apply(lambda x: isinstance(x, str) and len(x) == 3)]
    df_mis = df_pop.loc[~df_pop["entity"].isin(df_loc["location"]), ["entity", "population"]]
    # Sort
    if not ascending:
        df_mis = df_mis.sort_values(by="population", ascending=False)
    # Return data
    if as_dict:
        return df_mis.to_dict(orient="records")
    return df_mis
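

if __name__ == "__main__":
    # Hypothetical usage (default paths assume the standard repo layout):
    for row in countries_missing(as_dict=True)[:3]:
        print(row["entity"], row["population"])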
Example #8
import json
import os

import pandas as pd

from cowidev.utils.utils import get_project_dir

# MIN_RESPONSES: country-date-question observations with fewer than this
# many valid responses will be dropped. If None, no observations will be
# dropped.
MIN_RESPONSES = 500

# FREQ: temporal level at which to aggregate the individual survey
# responses, passed as the `freq` argument to
# pandas.Series.dt.to_period. Must be a valid pandas offset alias
# (e.g. "M" = month, "W" = week).
FREQ = "M"
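
# For example, with FREQ = "M", daily timestamps bucket to calendar months:
#   pd.Series(pd.to_datetime(["2020-01-03", "2020-01-28"])).dt.to_period("M")
# maps both entries to Period("2020-01").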

# ZERO_DAY: reference date for internal yearIsDay Grapher usage.
ZERO_DAY = "2020-01-21"

# File paths
PROJECT_DIR = get_project_dir()

INPUT_PATH = os.path.join(PROJECT_DIR, "scripts", "input", "yougov")
OUTPUT_PATH = os.path.join(PROJECT_DIR, "scripts", "grapher")
MAPPING_PATH = os.path.join(INPUT_PATH, "mapping.csv")
MAPPING_VALUES_PATH = os.path.join(INPUT_PATH, "mapped_values.json")

MAPPING = pd.read_csv(MAPPING_PATH, na_values=None)
MAPPING["label"] = MAPPING["label"].str.lower()
with open(MAPPING_VALUES_PATH, "r") as f:
    MAPPED_VALUES = json.load(f)


class YouGov:
    def __init__(self, output_path: str, debug: bool = False):
        self.source_url = "https://github.com/YouGov-Data/covid-19-tracker/raw/master"
Example #9
import os
from datetime import datetime

import pandas as pd

from cowidev.utils.utils import get_project_dir
# get_who_data and vaccines_comparison_with_who are project helpers used
# below; their import paths are not shown here.


def country_updates_summary(
    path_vaccinations: str = None,
    path_locations: str = None,
    path_automation_state: str = None,
    as_dict: bool = False,
    sortby_counts: bool = False,
    sortby_updatefreq: bool = False,
    who: bool = False,
    vaccines: bool = False,
):
    """Check last updated countries.

    It loads the content from locations.csv, vaccinations.csv and automation_state.csv to present results on the update
    frequency and timeline of all countries. By default, the countries are sorted from least to most recently updated.
    You can also sort them from least to most frequently updated ones by using argument `sortby_counts`.

    In Jupyter, it is recommended to add the following lines so that the DataFrame is fully shown:

    ```python
    import pandas as pd
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_colwidth', None)
    ```

    Args:
        path_vaccinations (str, optional): Path to vaccinations csv file.
                                            Default value works if repo structure is left unmodified.
        path_locations (str, optional): Path to locations csv file.
                                        Default value works if repo structure is left unmodified.
        path_automation_state (str, optional): Path to automation state csv file.
                                                Default value works if repo structure is left unmodified.
        as_dict (bool, optional): Set to True for the return value to be shaped as a dictionary. Otherwise returns a
                                    DataFrame.
        sortby_counts (bool, optional): Set to True to sort results from least to most frequently updated countries.
        sortby_updatefreq (bool, optional): Set to True to sort results by update frequency.
        who (bool, optional): Display WHO columns.
        vaccines (bool, optional): Add vaccine comparison columns (missing_in_who, missing_in_owid).

    Returns:
        Union[pd.DataFrame, dict]: List or DataFrame, where each row (or element) contains, among others, the fields:
                                    - 'last_observation_date': Last update date.
                                    - 'location': Country name.
                                    - 'source_website': Source used to retrieve last added data.
                                    - 'automated': True if country process is automated.
                                    - 'counts': Number of times the country has been updated.
    """
    # Get data paths
    if not path_vaccinations:
        path_vaccinations = os.path.abspath(
            os.path.join(
                get_project_dir(),
                "public",
                "data",
                "vaccinations",
                "vaccinations.csv",
            ))
    if not path_locations:
        path_locations = os.path.abspath(
            os.path.join(get_project_dir(), "public", "data", "vaccinations",
                         "locations.csv"))
    if not path_automation_state:
        path_automation_state = os.path.abspath(
            os.path.join(get_project_dir(), "scripts", "output",
                         "vaccinations", "automation_state.csv"))
    columns_output = [
        "location",
        "last_observation_date",
        "first_observation_date",
        "counts",
        "update_frequency",
        "num_observation_days",
        "source_website",
        "automated",
    ]
    # Read data
    df_vax = pd.read_csv(path_vaccinations)
    df_loc = pd.read_csv(path_locations)
    df_state = pd.read_csv(path_automation_state)
    df_who = get_who_data()
    # Get counts
    df_vax = df_vax.dropna(
        subset=["total_vaccinations", "people_vaccinated", "people_fully_vaccinated"],
        how="all",
    )
    df_vax = pd.DataFrame({
        "counts": df_vax.groupby("location").date.count().sort_values(),
        "first_observation_date": df_vax.groupby("location").date.min(),
    })
    # Merge data
    df = df_loc.merge(df_state, on="location")
    df = df.merge(df_vax, on="location")
    # Merge with WHO
    if who:
        df = df.merge(df_who, left_on="iso_code", right_on="ISO3", how="left")
        columns_output += ["reporting_to_WHO", "location_WHO"]
    # Additional fields
    num_observation_days = (
        datetime.now() - pd.to_datetime(df.first_observation_date)).dt.days + 1
    num_updates_per_observation_day = df.counts / num_observation_days

    df = df.assign(
        num_observation_days=num_observation_days,
        update_frequency=num_updates_per_observation_day,
    )
    # Sort data
    if sortby_updatefreq:
        sort_column = "update_frequency"
    elif sortby_counts:
        sort_column = "counts"
    else:
        sort_column = "last_observation_date"
    df = df.sort_values(by=sort_column)[columns_output]

    def _web_type(x):
        govs = [
            ".gov/",
            "gov.",
            ".gob.",
            ".moh.",
            ".gub.",
            ".go.",
            ".gouv.",
            "govern",
            ".govt",
            ".coronavirus2020.kz/",
            "thl.fi",
            ".gv.",
            "corona.nun.gl",
            "exploregov.ky",
            "covid19response.lc/",
            "corona.fo/",
            "103.247.238.92/webportal/",
            "data.public.lu/",
            "vaccinocovid.iss.sm/",
            "koronavirus.hr",
            "koronavirusinfo.az",
            "covid.is",
            "government.",
            "covid19ireland-geohive.hub.arcgis",
            "sacoronavirus.co.za",
            "covidodgovor.me",
            "experience.arcgis.com/experience/59226cacd2b441c7a939dca13f832112/",
            "guineasalud.org/estadisticas/",
            "bakuna.cw/",
            "laatjevaccineren.sr/",
            "coronavirus.bg/bg/statistika",
            "admin.ch",
            "folkhalsomyndigheten.se/",
            "covid19.ssi.dk/",
            "fhi.no/",
            "impfdashboard.de/",
            "covid-19.nczisk.sk",
            "opendata.digilugu.ee",
            ".mzcr.cz/",
            "ghanahealthservice.org/",
            "ccss.sa.cr/",
            "epistat.wiv-isp.be",
            "covidmaroc.ma",
            "experience.arcgis.com/experience/cab84dcfe0464c2a8050a78f817924ca",
            "gtmvigilanciacovid.shinyapps",
            "belta.by",
            "fohm.se",
            "moh.",
            "vaccines.ncdc.ge",
            "opendata.swiss",
        ]
        if "facebook." in x.lower():
            return "Facebook"
        elif "twitter." in x.lower():
            return "Twitter"
        elif "github." in x.lower() or "githubusercontent" in x.lower():
            return "GitHub"
        elif any(gov in x.lower() for gov in govs):
            return "Govern/Official"
        elif (".who.int" in x.lower()) or ("who.maps.arcgis.com" in x.lower()):
            return "WHO"
        elif ".pacificdata.org" in x.lower():
            return "SPC"
        elif "ecdc.europa." in x.lower():
            return "ECDC"
        elif "paho.org" in x.lower():
            return "PAHO"
        elif "africacdc.org" in x.lower():
            return "Africa CDC"
        else:
            return "Others"

    df = df.assign(web_type=df.source_website.apply(_web_type))

    if vaccines:
        df_vax = vaccines_comparison_with_who()
        df = df.merge(
            df_vax[["location", "missing_in_who", "missing_in_owid"]],
            on="location",
            how="left",
        )
    # Return data
    if as_dict:
        return df.to_dict(orient="records")
    return df
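

if __name__ == "__main__":
    # Hypothetical usage: least frequently updated countries first.
    df = country_updates_summary(sortby_updatefreq=True)
    print(df.head()[["location", "update_frequency", "web_type"]])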
Example #10
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_explorerizer
from ._parser import _parse_args


FILE_DS = os.path.join(
    get_project_dir(), "public", "data", "excess_mortality", "excess_mortality.csv"
)
FILE_EXPLORER = os.path.join(
    get_project_dir(), "public", "data", "internal", "megafile--excess-mortality.json"
)


def run_step(step: str):
    if step == "etl":
        run_etl(FILE_DS)
    elif step == "explorer-file":
        run_explorerizer(FILE_DS, FILE_EXPLORER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
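
# The runner is assumed to be invoked as a module CLI, e.g.
#   python -m <package.module> etl
# with _parse_args exposing a positional `step` argument (the exact module
# path is not shown in this example).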
Example #11
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_grapheriser, run_explorerizer, run_db_updater
from ._parser import _parse_args


FILE_DS = os.path.join(get_project_dir(), "public", "data", "variants", "covid-variants.csv")
FILE_GRAPHER = os.path.join(get_project_dir(), "scripts", "grapher", "COVID-19 - Variants.csv")
FILE_EXPLORER = os.path.join(get_project_dir(), "public", "data", "internal", "megafile--variants.json")


def run_step(step: str):
    if step == "etl":
        run_etl(FILE_DS)
    elif step == "grapher-file":
        run_grapheriser(FILE_DS, FILE_GRAPHER)
    elif step == "explorer-file":
        run_explorerizer(FILE_DS, FILE_EXPLORER)
    elif step == "grapher-db":
        run_db_updater(FILE_GRAPHER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
Example #12
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_explorerizer
from ._parser import _parse_args

FILE_DS = os.path.join(get_project_dir(), "public", "data", "excess_mortality",
                       "excess_mortality.csv")
FILE_EXPLORER = os.path.join(get_project_dir(), "public", "data", "internal",
                             "megafile--excess-mortality.json")


def run_step(step: str):
    if step == "etl":
        run_etl(FILE_DS)
    elif step == "explorer-file":
        run_explorerizer(FILE_DS, FILE_EXPLORER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
Example #13
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_grapheriser, run_db_updater
from ._parser import _parse_args

project_dir = get_project_dir()
FILE_DS = os.path.join("/tmp", "google-mobility.csv")
FILE_GRAPHER = os.path.join(project_dir, "scripts", "grapher",
                            "Google Mobility Trends (2020).csv")
FILE_COUNTRY_STD = os.path.join(project_dir, "scripts", "input", "gmobility",
                                "gmobility_country_standardized.csv")


def run_step(step: str):
    if step == "etl":
        run_etl(FILE_DS)
    elif step == "grapher-file":
        run_grapheriser(FILE_DS, FILE_COUNTRY_STD, FILE_GRAPHER)
    elif step == "grapher-db":
        run_db_updater(FILE_GRAPHER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
Example #14
import os

from cowidev.utils.utils import get_project_dir

from cowidev.vax.us_states.etl import run_etl
from cowidev.vax.us_states.grapher import run_grapheriser
from cowidev.vax.us_states._parser import _parse_args


FILE_DS = os.path.join(get_project_dir(), "public", "data", "vaccinations", "us_state_vaccinations.csv")
FILE_GRAPHER = os.path.join(get_project_dir(), "scripts", "grapher", "COVID-19 - United States vaccinations.csv")


def run_step(step: str):
    if step == "etl":
        run_etl(FILE_DS)
    elif step == "grapher-file":
        run_grapheriser(FILE_DS, FILE_GRAPHER)
    # elif step == "explorer-file":
    #     run_explorerizer(FILE_DS, FILE_EXPLORER)
    # elif step == "grapher-db":
    #     run_db_updater(FILE_GRAPHER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
Example #15
import os
import pandas as pd

from cowidev.utils.utils import get_project_dir

POPULATION = pd.read_csv(
    os.path.join(get_project_dir(), "scripts", "input", "un", "population_latest.csv"),
    usecols=["iso_code", "entity", "population"],
)
SOURCE_URL = "https://opendata.ecdc.europa.eu/covid19/hospitalicuadmissionrates/csv/data.csv"


def download_data():
    print("Downloading ECDC data…")
    df = pd.read_csv(
        SOURCE_URL,
        usecols=["country", "indicator", "date", "value", "year_week"])
    df = df.drop_duplicates()
    df = df.rename(columns={"country": "entity"})
    return df


def pipe_undo_100k(df):
    df = pd.merge(df, POPULATION, on="entity", how="left")
    assert df[df.population.isna()].shape[0] == 0, "Country missing from population file"
    df.loc[df["indicator"].str.contains(" per 100k"), "value"] = (
        df["value"].div(100000).mul(df["population"])
    )
    df.loc[:, "indicator"] = df["indicator"].str.replace(" per 100k", "")
    return df
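
# Sketch of the inversion: a rate of 12.5 per 100k in a country of 8,000,000
# people becomes 12.5 / 100000 * 8000000 = 1000 absolute admissions.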
Example #16
import json
import os
from glob import glob

import pandas as pd
import requests

from cowidev.utils.utils import get_project_dir
# The pipe_* transform helpers used below are project functions; their import
# paths are not shown here.


class USStatesETL:
    source_url: str = "https://covid.cdc.gov/covid-data-tracker/COVIDData/getAjaxData?id=vaccination_data"
    cdc_data_path: str = os.path.join(get_project_dir(), "scripts", "input",
                                      "cdc", "vaccinations")

    def extract(self):
        self._download_data()
        return self._read_data()

    def _download_data(self):
        data = json.loads(requests.get(self.source_url).content)
        df = pd.DataFrame.from_records(data["vaccination_data"])
        assert len(df) > 0
        df.to_csv(
            os.path.join(self.cdc_data_path, f"cdc_data_{df.Date.max()}.csv"),
            index=False,
        )

    def _read_data(self):
        files = glob(os.path.join(self.cdc_data_path, "cdc_data_*.csv"))
        data = [*map(self._read_file, files)]
        return pd.concat(data, ignore_index=True)

    def _read_file(self, filepath):
        df = pd.read_csv(filepath, na_values=[0.0, 0])
        # Each variable in variable_matching.keys() will be created from the
        # first matching column listed in variable_matching.values(), in order
        # of priority. If none of the columns is found, the variable is
        # created as pd.NA.
        variable_matching = {
            "total_distributed": ["Doses_Distributed"],
            "total_vaccinations": ["Doses_Administered"],
            "people_vaccinated": ["Administered_Dose1_Recip", "Administered_Dose1"],
            "people_fully_vaccinated": [
                "Series_Complete_Yes",
                "Administered_Dose2_Recip",
                "Administered_Dose2",
            ],
        }
        # Mapping
        for k, v in variable_matching.items():
            for cdc_variable in v:
                if cdc_variable in df.columns:
                    df = df.rename(columns={cdc_variable: k})
                    break
            if k not in df.columns:
                df[k] = pd.NA
        # Order columns
        df = df[["Date", "LongName", "Census2019"] +
                [*variable_matching.keys()]]
        return df

    def transform(self, df: pd.DataFrame):
        return (
            df.pipe(pipe_rename_cols)
            .pipe(pipe_per_capita)
            .pipe(pipe_smoothed)
            .pipe(pipe_usage)
            .drop(columns=["Census2019"])
            .pipe(pipe_monotonic_by_state)
            .sort_values(["location", "date"])
            .pipe(pipe_select_columns)
            .pipe(pipe_checks)
        )

    def load(self, df: pd.DataFrame, output_path: str) -> None:
        # Export data
        df.to_csv(output_path, index=False)

    def run(self, output_path: str):
        data = self.extract()
        df = self.transform(data)
        self.load(df, output_path)
Example #17
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_grapheriser, run_explorerizer, run_db_updater
from ._parser import _parse_args

FILE_DS = os.path.join(get_project_dir(), "public", "data", "variants",
                       "covid-variants.csv")
FILE_GRAPHER = os.path.join(get_project_dir(), "scripts", "grapher",
                            "COVID-19 - Variants.csv")
FILE_EXPLORER = os.path.join(get_project_dir(), "public", "data", "internal",
                             "megafile--variants.json")


def run_step(step: str):
    if step == "etl":
        run_etl(FILE_DS)
    elif step == "grapher-file":
        run_grapheriser(FILE_DS, FILE_GRAPHER)
    elif step == "explorer-file":
        run_explorerizer(FILE_DS, FILE_EXPLORER)
    elif step == "grapher-db":
        run_db_updater(FILE_GRAPHER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)