def main():
    """Scrape Kenya's cumulative test count and prepend a new row to the automated sheet."""
    sheet_path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Kenya.csv"
    )
    existing = pd.read_csv(sheet_path).sort_values(by="Date", ascending=False)

    source_url = "http://covidkenya.org/"
    page = get_soup(source_url)

    # Counter widget on the dashboard holding the cumulative figure.
    widget = page.find("div", class_="elementor-element-b36fad5").find(
        class_="elementor-text-editor"
    )
    cumulative_total = clean_count(widget.text)

    # Caption next to the counter, e.g. "[Updated on <Month day> [hh:mm]".
    caption = page.select(".elementor-element-75168b2 p")[0].text
    date = extract_clean_date(
        caption,
        regex=r"\[Updated on ([A-Za-z]+ \d+) \[\d\d:\d\d\]",
        date_format="%B %d",
        replace_year=2021,
    )

    # Only append when the site reports a value newer than anything on file.
    if cumulative_total > existing["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Kenya",
                "Units": "samples tested",
                "Source URL": source_url,
                "Source label": "Kenya Ministry of Health",
            }
        )
        pd.concat([row, existing], sort=False).to_csv(sheet_path, index=False)
def main():
    """Scrape Lebanon's cumulative test count and prepend a new row to the automated sheet."""
    sheet_path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Lebanon.csv"
    )
    existing = pd.read_csv(sheet_path).sort_values(by="Date", ascending=False)

    source_url = "https://corona.ministryinfo.gov.lb/"
    page = get_soup(source_url)

    # The headline counter carries the cumulative number of tests performed.
    counter = page.find("h1", class_="s-counter3")
    cumulative_total = clean_count(counter.text)

    # "Last update" banner, e.g. "<Mon day>"; year is not shown on the page.
    stamp = page.select(".last-update strong")[0].text
    date = extract_clean_date(
        stamp, regex=r"([A-Za-z]+ \d+)", date_format="%b %d", replace_year=2021
    )

    # Only append when the site reports a value newer than anything on file.
    if cumulative_total > existing["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Lebanon",
                "Units": "tests performed",
                "Source URL": source_url,
                "Source label": "Lebanon Ministry of Health",
            }
        )
        pd.concat([row, existing], sort=False).to_csv(sheet_path, index=False)
def main():
    """Scrape Azerbaijan's cumulative test count and prepend a new row to the automated sheet."""
    sheet_path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Azerbaijan.csv"
    )
    existing = pd.read_csv(sheet_path).sort_values(by="Date", ascending=False)

    source_url = "https://koronavirusinfo.az/az/page/statistika/azerbaycanda-cari-veziyyet"
    page = get_soup(source_url)

    # Sixth statistic tile on the page holds the tests-performed counter.
    counter = page.find_all("div", class_="gray_little_statistic")[5].find("strong")
    cumulative_total = clean_count(counter.text)

    # The page shows no explicit date, so stamp with today's date in Baku time.
    if cumulative_total > existing["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [localdate("Asia/Baku")],
                "Country": "Azerbaijan",
                "Units": "tests performed",
                "Source URL": source_url,
                "Source label": "Cabinet of Ministers of Azerbaijan",
            }
        )
        pd.concat([row, existing], sort=False).to_csv(sheet_path, index=False)
def main():
    """Scrape Nigeria's cumulative sample count and prepend a new row to the automated sheet."""
    sheet_path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Nigeria.csv"
    )
    existing = pd.read_csv(sheet_path).sort_values(by="Date", ascending=False)

    source_url = "http://covid19.ncdc.gov.ng/"
    page = get_soup(source_url)

    # First dashboard card's span holds the cumulative samples-tested counter.
    counter = page.find("div", class_="col-xl-3").find("span")
    cumulative_total = clean_count(counter.text)

    # The page shows no explicit date, so stamp with today's date in Lagos time.
    if cumulative_total > existing["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Date": [localdate("Africa/Lagos")],
                "Cumulative total": cumulative_total,
                "Country": "Nigeria",
                "Units": "samples tested",
                "Source URL": source_url,
                "Source label": "Nigeria Centre for Disease Control",
            }
        )
        pd.concat([row, existing], sort=False).to_csv(sheet_path, index=False)
def main():
    """Scrape Tunisia's cumulative people-tested count and prepend it to the automated sheet."""
    path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Tunisia.csv"
    )
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://onmne.tn"
    soup = get_soup(source_url)

    # The milestone widget stores its target value in a JSON blob under "endVal".
    cumulative_total = json.loads(
        soup.find("span", class_="vcex-milestone-time").attrs["data-options"]
    )["endVal"]

    # Caption reads "Chiffres clés mis à jour le <d month yyyy>".
    # NOTE(review): "%d %B %Y" parses month names per locale — presumably the
    # caption uses French month names; verify the runtime locale handles them.
    date_str = soup.select("p span")[0].text.replace("Chiffres clés mis à jour le ", "")
    date_str = pd.to_datetime(date_str, format="%d %B %Y").strftime("%Y-%m-%d")

    # Only append when the site reports a value newer than anything on file.
    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date_str],
                "Country": "Tunisia",
                "Units": "people tested",
                "Source URL": source_url,
                "Source label": "Tunisia Ministry of Health",
            }
        )
        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
def pipe_filter_locations(self, df: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows whose `location` appears in the UN population reference file."""
    populations_path = os.path.join(
        get_project_dir(), "scripts", "input", "un", "population_latest.csv"
    )
    known_entities = pd.read_csv(populations_path).entity.unique()
    return df[df.location.isin(known_entities)]
def countries_missing(
    path_population: str = None,
    path_locations: str = None,
    ascending: bool = False,
    as_dict: bool = False,
):
    """Get countries currently not present in our dataset.

    Args:
        path_population (str, optional): Path to UN population csv file. Default value works if repo
            structure is left unmodified.
        path_locations (str, optional): Path to locations csv file. Default value works if repo
            structure is left unmodified.
        ascending (bool, optional): Set to True to sort results by population in ascending order.
            By default sorts in descending order.
        as_dict (bool, optional): Set to True for the return value to be shaped as a dictionary.
            Otherwise returns a DataFrame.
    """
    # Resolve default repo-relative paths when none are given.
    if not path_population:
        path_population = os.path.abspath(
            os.path.join(get_project_dir(), "scripts", "input", "un", "population_latest.csv")
        )
    if not path_locations:
        path_locations = os.path.abspath(
            os.path.join(get_project_dir(), "public", "data", "vaccinations", "locations.csv")
        )
    df_loc = pd.read_csv(path_locations, usecols=["location"])
    df_pop = pd.read_csv(path_population)
    # Keep only rows with a valid 3-letter ISO code (drops aggregates/invalid rows).
    df_pop = df_pop[df_pop.iso_code.apply(lambda x: isinstance(x, str) and len(x) == 3)]
    df_mis = df_pop.loc[~df_pop["entity"].isin(df_loc["location"]), ["entity", "population"]]
    # Sort by population. Bug fix: previously ascending=True skipped sorting
    # entirely instead of sorting in ascending order as documented.
    df_mis = df_mis.sort_values(by="population", ascending=ascending)
    # Return data
    if as_dict:
        return df_mis.to_dict(orient="records")
    return df_mis
# MIN_RESPONSES: country-date-question observations with less than this # many valid responses will be dropped. If "None", no observations will # be dropped. MIN_RESPONSES = 500 # FREQ: temporal level at which to aggregate the individual survey # responses, passed as the `freq` argument to # pandas.Series.dt.to_period. Must conform to a valid Pandas offset # string (e.g. 'M' = "month", "W" = "week"). FREQ = 'M' # ZERO_DAY: reference date for internal yearIsDay Grapher usage. ZERO_DAY = "2020-01-21" # File paths PROJECT_DIR = get_project_dir() INPUT_PATH = os.path.join(PROJECT_DIR, "scripts", "input", "yougov") OUTPUT_PATH = os.path.join(PROJECT_DIR, "scripts", "grapher") MAPPING_PATH = os.path.join(INPUT_PATH, "mapping.csv") MAPPING_VALUES_PATH = os.path.join(INPUT_PATH, 'mapped_values.json') MAPPING = pd.read_csv(MAPPING_PATH, na_values=None) MAPPING['label'] = MAPPING['label'].str.lower() with open(MAPPING_VALUES_PATH, 'r') as f: MAPPED_VALUES = json.load(f) class YouGov: def __init__(self, output_path: str, debug: bool = False): self.source_url = "https://github.com/YouGov-Data/covid-19-tracker/raw/master"
def country_updates_summary(
    path_vaccinations: str = None,
    path_locations: str = None,
    path_automation_state: str = None,
    as_dict: bool = False,
    sortby_counts: bool = False,
    sortby_updatefreq: bool = False,
    who: bool = False,
    vaccines: bool = False,
):
    """Check last updated countries.

    It loads the content from locations.csv, vaccinations.csv and automation_state.csv to present
    results on the update frequency and timeline of all countries.

    By default, the countries are sorted from least to most recently updated. You can also sort them
    from least to most frequently updated ones by using argument `sortby_counts`.

    In Jupyter it is recommended to add the following lines to enable the DataFrame to be fully shown:

    ```python
    import pandas as pd
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_colwidth', None)
    ```

    Args:
        path_vaccinations (str, optional): Path to vaccinations csv file.
                                            Default value works if repo structure is left unmodified.
        path_locations (str, optional): Path to locations csv file.
                                        Default value works if repo structure is left unmodified.
        path_automation_state (str, optional): Path to automation state csv file.
                                                Default value works if repo structure is left unmodified.
        as_dict (bool, optional): Set to True for the return value to be shaped as a dictionary.
                                    Otherwise returns a DataFrame.
        sortby_counts (bool, optional): Set to True to sort results from least to most updated countries.
        who (bool, optional): Display WHO columns

    Returns:
        Union[pd.DataFrame, dict]: List or DataFrame, where each row (or element) contains five fields:
                                    - 'last_observation_date': Last update date.
                                    - 'location': Country name.
                                    - 'source_website': Source used to retrieve last added data.
                                    - 'automated': True if country process is automated.
                                    - 'counts': Number of times the country has been updated.
    """
    # Get data paths: fall back to repo-relative defaults when not provided.
    if not path_vaccinations:
        path_vaccinations = os.path.abspath(
            os.path.join(
                get_project_dir(),
                "public",
                "data",
                "vaccinations",
                "vaccinations.csv",
            ))
    if not path_locations:
        path_locations = os.path.abspath(
            os.path.join(get_project_dir(), "public", "data", "vaccinations", "locations.csv"))
    if not path_automation_state:
        path_automation_state = os.path.abspath(
            os.path.join(get_project_dir(), "scripts", "output", "vaccinations", "automation_state.csv"))
    # Columns reported in the final output, in display order.
    columns_output = [
        "location",
        "last_observation_date",
        "first_observation_date",
        "counts",
        "update_frequency",
        "num_observation_days",
        "source_website",
        "automated",
    ]
    # Read data
    df_vax = pd.read_csv(path_vaccinations)
    df_loc = pd.read_csv(path_locations)
    df_state = pd.read_csv(path_automation_state)
    df_who = get_who_data()
    # Get counts: only rows with at least one non-null vaccination metric count
    # as an "update".
    df_vax = df_vax.dropna(
        subset=[
            "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
        ],
        how="all",
    )
    df_vax = pd.DataFrame({
        "counts": df_vax.groupby("location").date.count().sort_values(),
        "first_observation_date": df_vax.groupby("location").date.min(),
    })
    # Merge data
    df = df_loc.merge(df_state, on="location")
    df = df.merge(df_vax, on="location")
    # Merge with WHO data (optional; adds reporting columns).
    if who:
        # print(df_who.columns)
        df = df.merge(df_who, left_on="iso_code", right_on="ISO3", how="left")
        columns_output += ["reporting_to_WHO", "location_WHO"]
    # Additional fields: observation window length and updates per day.
    num_observation_days = (
        datetime.now() - pd.to_datetime(df.first_observation_date)).dt.days + 1
    num_updates_per_observation_day = df.counts / num_observation_days
    df = df.assign(
        num_observation_days=num_observation_days,
        update_frequency=num_updates_per_observation_day,
    )
    # Sort data: update_frequency takes precedence over counts.
    if sortby_updatefreq:
        sort_column = "update_frequency"
    elif sortby_counts:
        sort_column = "counts"
    else:
        sort_column = "last_observation_date"
    df = df.sort_values(by=sort_column)[columns_output]

    def _web_type(x):
        # Classify a source URL into a coarse website category.
        # Substrings identifying government/official sources.
        govs = [
            ".gov/",
            "gov.",
            ".gob.",
            ".moh.",
            ".gub.",
            ".go.",
            ".gouv.",
            "govern",
            ".govt",
            ".coronavirus2020.kz/",
            "thl.fi",
            ".gv.",
            "corona.nun.gl",
            "exploregov.ky",
            "covid19response.lc/",
            "corona.fo/",
            "103.247.238.92/webportal/",
            "data.public.lu/",
            "vaccinocovid.iss.sm/",
            "koronavirus.hr",
            "koronavirusinfo.az",
            "covid.is",
            "government.",
            "covid19ireland-geohive.hub.arcgis",
            "sacoronavirus.co.za",
            "covidodgovor.me",
            "experience.arcgis.com/experience/59226cacd2b441c7a939dca13f832112/",
            "guineasalud.org/estadisticas/",
            "bakuna.cw/",
            "laatjevaccineren.sr/",
            "coronavirus.bg/bg/statistika",
            "admin.ch",
            "folkhalsomyndigheten.se/",
            "covid19.ssi.dk/",
            "fhi.no/",
            "impfdashboard.de/",
            "covid-19.nczisk.sk",
            "opendata.digilugu.ee",
            ".mzcr.cz/",
            "ghanahealthservice.org/",
            "ccss.sa.cr/",
            "epistat.wiv-isp.be",
            "covidmaroc.ma",
            "experience.arcgis.com/experience/cab84dcfe0464c2a8050a78f817924ca",
            "gtmvigilanciacovid.shinyapps",
            "belta.by",
            "fohm.se",
            "moh.",
            "vaccines.ncdc.ge",
            "opendata.swiss",
        ]
        if "facebook." in x.lower():
            return "Facebook"
        elif "twitter." in x.lower():
            return "Twitter"
        elif "github." in x.lower() or "githubusercontent" in x.lower():
            return "GitHub"
        elif any(gov in x.lower() for gov in govs):
            return "Govern/Official"
        elif (".who.int" in x.lower()) or ("who.maps.arcgis.com" in x.lower()):
            return "WHO"
        elif ".pacificdata.org" in x.lower():
            return "SPC"
        elif "ecdc.europa." in x.lower():
            return "ECDC"
        elif "paho.org" in x.lower():
            return "PAHO"
        elif "africacdc.org" in x.lower():
            return "Africa CDC"
        else:
            return "Others"

    df = df.assign(**{"web_type": df.source_website.apply(_web_type)})
    # Optionally flag vaccine mismatches against WHO's dataset.
    if vaccines:
        df_vax = vaccines_comparison_with_who()
        df = df.merge(
            df_vax[["location", "missing_in_who", "missing_in_owid"]],
            on="location",
            how="left",
        )
    # Return data
    if as_dict:
        return df.to_dict(orient="records")
    return df
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_explorerizer
from ._parser import _parse_args

# Canonical dataset and the explorer-ready JSON derived from it.
FILE_DS = os.path.join(
    get_project_dir(), "public", "data", "excess_mortality", "excess_mortality.csv"
)
FILE_EXPLORER = os.path.join(
    get_project_dir(), "public", "data", "internal", "megafile--excess-mortality.json"
)


def run_step(step: str):
    """Run the named pipeline stage; unrecognized names are a silent no-op."""
    actions = {
        "etl": lambda: run_etl(FILE_DS),
        "explorer-file": lambda: run_explorerizer(FILE_DS, FILE_EXPLORER),
    }
    action = actions.get(step)
    if action is not None:
        action()


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_grapheriser, run_explorerizer, run_db_updater
from ._parser import _parse_args

# Variants dataset and its grapher/explorer derivatives.
FILE_DS = os.path.join(get_project_dir(), "public", "data", "variants", "covid-variants.csv")
FILE_GRAPHER = os.path.join(get_project_dir(), "scripts", "grapher", "COVID-19 - Variants.csv")
FILE_EXPLORER = os.path.join(get_project_dir(), "public", "data", "internal", "megafile--variants.json")


def run_step(step: str):
    """Dispatch one stage of the variants pipeline by name; unknown names do nothing."""
    dispatch = {
        "etl": (run_etl, (FILE_DS,)),
        "grapher-file": (run_grapheriser, (FILE_DS, FILE_GRAPHER)),
        "explorer-file": (run_explorerizer, (FILE_DS, FILE_EXPLORER)),
        "grapher-db": (run_db_updater, (FILE_GRAPHER,)),
    }
    if step in dispatch:
        func, func_args = dispatch[step]
        func(*func_args)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_explorerizer
from ._parser import _parse_args

# Excess-mortality dataset and the explorer JSON built from it.
FILE_DS = os.path.join(get_project_dir(), "public", "data", "excess_mortality", "excess_mortality.csv")
FILE_EXPLORER = os.path.join(get_project_dir(), "public", "data", "internal", "megafile--excess-mortality.json")


def run_step(step: str):
    """Run a single named stage of the excess-mortality pipeline."""
    if step == "etl":
        run_etl(FILE_DS)
        return
    if step == "explorer-file":
        run_explorerizer(FILE_DS, FILE_EXPLORER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
import os

from cowidev.utils.utils import get_project_dir
from .etl import run_etl
from .grapher import run_grapheriser, run_db_updater
from ._parser import _parse_args

project_dir = get_project_dir()
# Intermediate download lives in /tmp; grapher artifacts live in the repo.
FILE_DS = os.path.join("/tmp", "google-mobility.csv")
FILE_GRAPHER = os.path.join(project_dir, "scripts", "grapher", "Google Mobility Trends (2020).csv")
FILE_COUNTRY_STD = os.path.join(
    project_dir, "scripts", "input", "gmobility", "gmobility_country_standardized.csv"
)


def run_step(step: str):
    """Run one stage of the Google-mobility pipeline; unknown names do nothing."""
    if step == "etl":
        run_etl(FILE_DS)
        return
    if step == "grapher-file":
        run_grapheriser(FILE_DS, FILE_COUNTRY_STD, FILE_GRAPHER)
        return
    if step == "grapher-db":
        run_db_updater(FILE_GRAPHER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
import os

from cowidev.utils.utils import get_project_dir
from cowidev.vax.us_states.etl import run_etl
from cowidev.vax.us_states.grapher import run_grapheriser
from cowidev.vax.us_states._parser import _parse_args

# US-states vaccination dataset and its grapher export.
FILE_DS = os.path.join(get_project_dir(), "public", "data", "vaccinations", "us_state_vaccinations.csv")
FILE_GRAPHER = os.path.join(get_project_dir(), "scripts", "grapher", "COVID-19 - United States vaccinations.csv")


def run_step(step: str):
    """Run one stage of the US-states vaccination pipeline.

    Only "etl" and "grapher-file" are implemented; other step names are a no-op.
    (Removed previously commented-out "explorer-file"/"grapher-db" branches.)
    """
    if step == "etl":
        run_etl(FILE_DS)
    elif step == "grapher-file":
        run_grapheriser(FILE_DS, FILE_GRAPHER)


if __name__ == "__main__":
    args = _parse_args()
    run_step(args.step)
import os

import pandas as pd

from cowidev.utils.utils import get_project_dir

# UN population reference, used to convert per-100k rates back to absolute counts.
POPULATION = pd.read_csv(
    os.path.join(get_project_dir(), "scripts", "input", "un", "population_latest.csv"),
    usecols=["iso_code", "entity", "population"],
)
SOURCE_URL = "https://opendata.ecdc.europa.eu/covid19/hospitalicuadmissionrates/csv/data.csv"


def download_data():
    """Download the ECDC hospital/ICU admission rates CSV and normalize its columns.

    Returns:
        pd.DataFrame: Deduplicated rows with `country` renamed to `entity`.
    """
    print("Downloading ECDC data…")
    df = pd.read_csv(
        SOURCE_URL, usecols=["country", "indicator", "date", "value", "year_week"])
    df = df.drop_duplicates()
    df = df.rename(columns={"country": "entity"})
    return df


def pipe_undo_100k(df):
    """Convert " per 100k" indicators back to absolute values using UN population.

    Args:
        df (pd.DataFrame): ECDC data with `entity`, `indicator`, `value` columns.

    Returns:
        pd.DataFrame: Same rows with per-100k values rescaled to absolute counts
            and the " per 100k" suffix stripped from `indicator`.

    Raises:
        AssertionError: If any entity is missing from the population file.
    """
    df = pd.merge(df, POPULATION, on="entity", how="left")
    # Every entity must resolve to a population, otherwise the rescale is wrong.
    assert not df["population"].isna().any(), "Country missing from population file"
    per_100k = df["indicator"].str.contains(" per 100k")
    df.loc[per_100k, "value"] = df["value"].div(100000).mul(df["population"])
    df.loc[:, "indicator"] = df["indicator"].str.replace(" per 100k", "")
    return df
class USStatesETL:
    """ETL for CDC US-state vaccination data: download, archive, combine, transform, export."""

    # CDC vaccination-data endpoint; payload is JSON with key "vaccination_data".
    source_url: str = "https://covid.cdc.gov/covid-data-tracker/COVIDData/getAjaxData?id=vaccination_data"
    # Directory where each downloaded CDC snapshot is archived as a CSV.
    cdc_data_path: str = os.path.join(get_project_dir(), "scripts", "input", "cdc", "vaccinations")

    def extract(self):
        """Download the latest CDC snapshot, then load all archived snapshots."""
        self._download_data()
        return self._read_data()

    def _download_data(self):
        # Fetch the JSON payload and archive it, stamped with the snapshot's max Date.
        data = json.loads(requests.get(self.source_url).content)
        df = pd.DataFrame.from_records(data["vaccination_data"])
        assert len(df) > 0
        df.to_csv(os.path.join(self.cdc_data_path, f"cdc_data_{df.Date.max()}.csv"), index=False)

    def _read_data(self):
        # Concatenate every archived snapshot into a single DataFrame.
        files = glob(os.path.join(self.cdc_data_path, "cdc_data_*.csv"))
        data = [*map(self._read_file, files)]
        return pd.concat(data, ignore_index=True)

    def _read_file(self, filepath):
        # Zeros are read as missing values (na_values) — presumably because the
        # CDC feed reports 0 for not-yet-populated metrics; confirm upstream.
        df = pd.read_csv(filepath, na_values=[0.0, 0])
        # Each variable present in VARIABLE_MATCHING.keys() will be created based on the variables in
        # VARIABLE_MATCHING.values() by order of priority. If none of the vars can be found, the variable
        # is created as pd.NA
        variable_matching = {
            "total_distributed": ["Doses_Distributed"],
            "total_vaccinations": ["Doses_Administered"],
            "people_vaccinated": ["Administered_Dose1_Recip", "Administered_Dose1"],
            "people_fully_vaccinated": [
                "Series_Complete_Yes",
                "Administered_Dose2_Recip",
                "Administered_Dose2",
            ],
        }
        # Mapping: rename the first matching CDC column; fall back to pd.NA.
        for k, v in variable_matching.items():
            for cdc_variable in v:
                if cdc_variable in df.columns:
                    df = df.rename(columns={cdc_variable: k})
                    break
            if k not in df.columns:
                df[k] = pd.NA
        # Order columns
        df = df[["Date", "LongName", "Census2019"] + [*variable_matching.keys()]]
        return df

    def transform(self, df: pd.DataFrame):
        """Apply the transformation pipeline: rename, per-capita, smoothing, usage,
        monotonicity enforcement, column selection and final checks."""
        return (df.pipe(pipe_rename_cols).pipe(pipe_per_capita).pipe(
            pipe_smoothed).pipe(pipe_usage).drop(columns=["Census2019"]).pipe(
                pipe_monotonic_by_state).sort_values(
                    ["location", "date"]).pipe(pipe_select_columns).pipe(pipe_checks))

    def load(self, df: pd.DataFrame, output_path: str) -> None:
        """Write the final dataset to `output_path` as CSV (no index)."""
        # Export data
        df.to_csv(output_path, index=False)

    def run(self, output_path: str):
        """Full pipeline: extract -> transform -> load."""
        data = self.extract()
        df = self.transform(data)
        self.load(df, output_path)