def _merge_primary_and_boosters(self, df_primary, df_boosters):
    """Outer-merge primary-course and booster series on ``vacc_date``.

    Both frames may arrive with dates in one of several raw formats; each
    frame is normalised to ISO (``YYYY-MM-DD``) and de-duplicated before
    the one-to-one merge.

    Args:
        df_primary: DataFrame with a ``vacc_date`` column (primary doses).
        df_boosters: DataFrame with a ``vacc_date`` column (booster doses).

    Returns:
        DataFrame: outer join of both frames on ``vacc_date``.
    """

    def _normalize_dates(df):
        # Reformat only when the dates are not already ISO (YYYY-MM-DD).
        if not df.vacc_date.str.match(r"\d{4}-\d{2}-\d{2}").all():
            try:
                df["vacc_date"] = clean_date_series(df.vacc_date, "%d-%b-%y")
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # propagate; fall back to the alternative raw format.
                df["vacc_date"] = clean_date_series(df.vacc_date, "%d/%m/%Y")
        # keep=False drops *every* row of an ambiguous date — required for the
        # one_to_one validation in the merge below to hold.
        return df.drop_duplicates(subset=["vacc_date"], keep=False)

    df_primary = _normalize_dates(df_primary)
    df_boosters = _normalize_dates(df_boosters)
    return pd.merge(
        df_primary, df_boosters, on="vacc_date", how="outer", validate="one_to_one"
    )
def _read_new(self):
    """Read the post-2020 testing sheet and derive the daily testing count.

    Returns:
        DataFrame with ``Date`` and ``Daily change in cumulative total``.
    """
    df = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/10c9jNi8VnV0YYCfV_7AZrzBY5l18dOFHEJMIJsP4THI/export?format=csv&gid=512078862",
        usecols=[
            "Date",
            "Total",
            "선별진료소(통합)",
            "의심신고 검사자 수",
            "임시선별검사소 검사건수",
            "수도권 임시선별검사소 검사건수",
            "비수도권 임시선별검사소",
        ],
    )
    df = df.assign(Date=clean_date_series(df["Date"], "%Y-%m-%d"))
    # 2021-04-21 < data < 2021-10-25; 'Number of testing at temporary screening
    # stations' (임시선별검사소 검사건수) = 'Number of inspections by temporary screening
    # and inspection centers in the metropolitan area' (수도권 임시선별검사소 검사건수)
    # + 'Non-Metropolitan Temporary Screening Center' (비수도권 임시선별검사소).
    # BUG FIX: `df.iloc[:, 4].fillna(..., inplace=True)` acts on a temporary
    # Series and can silently leave `df` unchanged — assign back explicitly.
    df.iloc[:, 4] = df.iloc[:, 4].fillna(df.iloc[:, 5] + df.iloc[:, 6])
    # 2020-12-17 < data < 2022-02-08; 'Number of testing at screening stations'
    # (Aggregate) (선별진료소(통합)) = 'Number of suspicious report testing'
    # (의심신고 검사자 수) + 'Number of testing at temporary screening stations'
    # (임시선별검사소 검사건수).
    df.iloc[:, 2] = df.iloc[:, 2].fillna(df.iloc[:, 3] + df.iloc[:, 4])
    # Use 'Total' after 2022-02-06.
    df.loc[df["Date"] > "2022-02-06", "선별진료소(통합)"] = df["Total"]
    df["Daily change in cumulative total"] = df["선별진료소(통합)"]
    return df[["Date", "Daily change in cumulative total"]]
def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
    """Rename columns, drop rows without a cumulative total, clean dates, add metadata."""
    renamed = self.pipe_rename_columns(df)
    # Keep only rows that actually report a cumulative total.
    with_totals = renamed[renamed["Cumulative total"].notna()]
    dated = with_totals.assign(
        Date=clean_date_series(with_totals.Date, "%d/%m/%Y")
    )
    return dated.pipe(self.pipe_metadata)
def _read_antigens(self):
    """Fetch the antigen table (index 4 of the Atlas connector payload) and clean it."""
    url = "https://atlas.jifo.co/api/connectors/425b93dc-c055-477c-b81a-5d4d9a1275f7"
    rows = request_json(url)["data"][4]
    header, *records = rows
    df = pd.DataFrame.from_records(records, columns=header)
    # The date column arrives with an empty-string header in the raw payload.
    df = df.assign(Date=clean_date_series(df[""], "%d/%m/%Y"))
    for column in ("Positivas", "Total Px Ag"):
        df[column] = df[column].apply(clean_count)
    return df
def _read_pcr(self):
    """Download cumulative PCR positives/negatives from datos.gov.co with clean dates."""
    raw = pd.read_csv(
        "https://www.datos.gov.co/resource/8835-5baf.csv",
        usecols=["fecha", "positivas_acumuladas", "negativas_acumuladas"],
    )
    # Drop the aggregate "Acumulado Feb" row and any rows missing a date.
    keep = (raw["fecha"] != "Acumulado Feb") & raw.fecha.notnull()
    raw = raw[keep]
    return raw.assign(
        Date=clean_date_series(raw["fecha"], "%Y-%m-%dT%H:%M:%S.%f")
    )
def _read_art(self):
    """Fetch weekly ART records from the data API and repair two known date typos."""
    endpoint = f"{self.base_url}1ee4d904-b17e-41de-a731-854578b036e6"
    records = request_json(endpoint)["result"]["records"]
    df = pd.DataFrame.from_records(records).drop(columns=["_id"])
    # Source typos: these two weeks were published with year 2022 instead of 2021.
    corrections = {"14/12/2022": "14/12/2021", "28/12/2022": "28/12/2021"}
    for wrong, right in corrections.items():
        df.loc[df["week_of"] == wrong, "week_of"] = right
    df["week_of"] = clean_date_series(df["week_of"], "%d/%m/%Y")
    return df
def pipe_clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
    """Clean the vaccination export: parse dates, tag location, rename columns,
    sort chronologically, and drop zero-total and duplicated-date rows."""
    column_map = {
        "Date": "date",
        "Location": "location",
        "Administered_Cumulative": "total_vaccinations",
        "Admin_Dose_1_Cumulative": "people_vaccinated",
        "Series_Complete_Cumulative": "people_fully_vaccinated",
        "Booster_Cumulative": "total_boosters",
    }
    df = df.assign(
        Date=clean_date_series(df.Date, format_input="%m/%d/%Y"),
        Location="United States",
    )
    df = df.drop(columns=["date_type"]).rename(columns=column_map)
    df = df.sort_values("date")
    # Keep rows with reported vaccinations; keep=False removes *all* rows of
    # any date that appears more than once.
    df = df[df.total_vaccinations > 0]
    return df.drop_duplicates(subset=["date"], keep=False)
def pipeline_manufacturer(self, df):
    """Reshape wide per-manufacturer daily doses into a long cumulative series.

    Args:
        df: wide DataFrame with a ``date`` column plus one column per vaccine.

    Returns:
        Long DataFrame with ``date``, ``vaccine``, ``total_vaccinations``
        (cumulative per vaccine) and ``location`` columns.

    Raises:
        ValueError: if the data contains vaccines not in ``VACCINE_MAPPING``.
    """
    df = df.melt("date", var_name="vaccine", value_name="total_vaccinations")
    df["date"] = clean_date_series(df["date"], "%d.%m.%Y")
    # Non-numeric entries become 0 so the cumulative sum stays well-defined.
    df["total_vaccinations"] = pd.to_numeric(
        df["total_vaccinations"], errors="coerce"
    ).fillna(0)
    df["total_vaccinations"] = (
        df.sort_values("date")
        .groupby("vaccine", as_index=False)["total_vaccinations"]
        .cumsum()
    )
    df["location"] = "Iceland"
    # Fail loudly on unknown vaccines. Explicit raise instead of `assert`:
    # asserts are stripped under `python -O`, silently skipping the check.
    if set(df["vaccine"].unique()) != set(VACCINE_MAPPING.keys()):
        raise ValueError(f"Vaccines present in data: {df['vaccine'].unique()}")
    return df.replace(VACCINE_MAPPING)
def _read_old(self):
    """Read the legacy testing series (data < 2020-12-18).

    Covers 'Number of suspicious report testing' (의심신고 검사자 수).

    Returns:
        DataFrame with ``Date`` and ``Daily change in cumulative total``,
        restricted to dates before 2020-12-18.
    """
    df = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/10c9jNi8VnV0YYCfV_7AZrzBY5l18dOFHEJMIJsP4THI/export?format=csv&gid=334130338",
        usecols=["DATE", "TOTAL_TEST"],
    )
    # Keep only rows whose totals parse as numbers, then clean the dates.
    df = df[pd.to_numeric(df["TOTAL_TEST"], errors="coerce").notnull()]
    df = df.assign(Date=clean_date_series(df["DATE"], "%Y-%m-%d"))
    # diff(periods=-1) = row minus the *next* row — presumably the sheet is
    # newest-first so this yields the daily increase; TODO confirm ordering.
    df["Daily change in cumulative total"] = (
        df["TOTAL_TEST"].astype("int32").diff(periods=-1)
    )
    # NOTE: removed a dead `clean_count` pass over TOTAL_TEST — that column is
    # not part of the returned selection, so the statement had no effect.
    return df[["Date", "Daily change in cumulative total"]].loc[
        df["Date"] < "2020-12-18"
    ]
def pipe_fill_gaps(self, df: pd.DataFrame):
    """Expand weekly rows into a daily series, forward-filling each week's value."""
    df["Date"] = pd.to_datetime(df["Date"])
    # The fill is only valid if the input is strictly weekly.
    day_gaps = df.Date.diff().iloc[1:].dt.days
    if not (day_gaps == 7).all():
        raise ValueError(
            "Not all values are separated by 7 days. Please check `Date` value!"
        )
    # Daily calendar covering the last reported week in full (+6 days).
    calendar = pd.Series(
        pd.date_range(df.Date.min(), df.Date.max() + timedelta(days=6)),
        name="Date",
    )
    daily = df.merge(calendar, how="outer", sort=True)
    daily = daily.fillna(method="ffill")
    daily["Date"] = clean_date_series(daily["Date"])
    return daily
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the ``date`` column from DD/MM/YYYY."""
    cleaned = clean_date_series(df["date"], "%d/%m/%Y")
    return df.assign(date=cleaned)
def pipe_date(self, df: pd.DataFrame):
    """Parse DD.MM.YYYY dates and order rows chronologically."""
    dated = df.assign(Date=clean_date_series(df["Date"], "%d.%m.%Y"))
    return dated.sort_values("Date")
def pipe_date(self, df: pd.DataFrame):
    """Run the already-ISO ``Date`` column through the standard cleaner."""
    cleaned = clean_date_series(df["Date"], "%Y-%m-%d")
    return df.assign(Date=cleaned)
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Convert the epoch-milliseconds ``Date`` column to clean dates."""
    cleaned = clean_date_series(df["Date"], unit="ms")
    return df.assign(Date=cleaned)
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Parse epoch-ms dates, sort chronologically, and drop every row of any
    date that occurs more than once (keep=False)."""
    dated = df.assign(date=clean_date_series(df.date, unit="ms"))
    dated = dated.sort_values("date")
    return dated.drop_duplicates(subset=["date"], keep=False)
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Normalise YYYY/MM/DD dates to the standard format."""
    cleaned = clean_date_series(df["Date"], "%Y/%m/%d")
    return df.assign(Date=cleaned)
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Derive the ``date`` column from the exported day column."""
    raw_dates = df["DAY(txn_date)-value"]
    return df.assign(date=clean_date_series(raw_dates))
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Parse the trailing-dot date format ``%Y.%m.%d.`` in place."""
    cleaned = clean_date_series(df["Date"], "%Y.%m.%d.")
    df["Date"] = cleaned
    return df
def pipe_date(self, df: pd.DataFrame):
    """Drop rows without a usable date, then normalise DD/MM/YYYY dates."""
    unusable = df.Date.isin(["Cannot specify date"])
    dated = df[~unusable]
    return dated.assign(Date=clean_date_series(dated.Date, "%d/%m/%Y"))
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Parse dates in day-abbreviated-month-two-digit-year form (``%d %b %y``)."""
    cleaned = clean_date_series(df["Date"], "%d %b %y")
    return df.assign(Date=cleaned)
def _read_pcr(self):
    """Fetch weekly PCR records from the data API and normalise ``week_of`` dates."""
    endpoint = f"{self.base_url}07cd6bfd-c73e-4aed-bc7b-55b13dd9e7c2"
    records = request_json(endpoint)["result"]["records"]
    df = pd.DataFrame.from_records(records).drop(columns=["_id"])
    df["week_of"] = clean_date_series(df["week_of"], "%d/%m/%Y")
    return df
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Normalise two-digit-year dates (``%d/%m/%y``)."""
    parsed = clean_date_series(df["Date"], "%d/%m/%y")
    return df.assign(Date=parsed)
def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
    """Full transform: rename columns, apply ``pipe_pr``, attach metadata, clean dates."""
    df = self.pipe_rename_columns(df)
    df = self.pipe_pr(df)
    df = self.pipe_metadata(df)
    return df.assign(Date=clean_date_series(df.Date))
def pipe_age_metadata(self, df: pd.DataFrame) -> pd.DataFrame:
    """Attach a normalised ``date`` column and this parser's ``location``."""
    return df.assign(
        date=clean_date_series(df.Date),
        location=self.location,
    )