def clean_raw_file(raw_path):
    """Take one file, check whether it fits the expected format, and clean it."""
    # TODO check what is up with that renaming thing - I think I don't need it

    # Check file name format
    f = raw_path.split("/")[-1][:-4]
    instance, table, year, month = f.split("_")
    assert table in [
        "main",
        "report",
    ], f"Unexpected data type in file name for {f}: correct format is [instance_datatype_YYYYMmm], e.g. new_main_2020Apr"
    assert instance in [
        "new",
        "old",
    ], f"Unexpected dhis2 instance in file name for {f}: correct format is [instance_datatype_YYYYMmm], e.g. new_main_2020Apr"

    # Import the file and get it to a standard format
    if table == "main":
        df = clean_add_indicators(raw_path, instance)
    else:
        df = get_reporting_data(raw_path, instance)

    renaming_dict = get_renaming_dict()
    df["dataElement"].replace(renaming_dict, inplace=True)

    # The file must contain exactly one period, and it must match the name
    assert df["year"].nunique() == 1, f"Data for several years found in file {f}"
    assert df["month"].nunique() == 1, f"Data for several months found in file {f}"
    assert int(df["year"].unique()[0]) == int(
        year
    ), f"Data from a different year than what file name indicates for {f}"
    assert (
        df["month"].unique()[0] == month
    ), f"Data from a different month than what file name indicates for {f}"

    make_note(f"data imported for file {f}", START_TIME)

    # Clean the formatted table
    df.reset_index(drop=True, inplace=True)
    # TODO Check this groupby for breakdown addition
    df = df.groupby(["dataElement", "orgUnit", "year", "month"],
                    as_index=False).agg({"value": "sum"})
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df[df["orgUnit"].isin(FACILITY_IDS)]
    if table == "report":
        # For reporting tables, only presence/absence matters
        df["value"] = (df["value"] > 0).astype("int")
    return df
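# Example of the naming convention enforced above (the path prefix is
# illustrative): "data/raw/new_main_2020Apr.csv" -> f = "new_main_2020Apr",
# which splits into instance="new", table="main", year="2020", month="Apr".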
def get_reporting_data(path, instance):
    month_dict = {
        "01": "Jan",
        "02": "Feb",
        "03": "Mar",
        "04": "Apr",
        "05": "May",
        "06": "Jun",
        "07": "Jul",
        "08": "Aug",
        "09": "Sep",
        "10": "Oct",
        "11": "Nov",
        "12": "Dec",
    }
    cols = [
        "orgunitlevel1",
        "orgunitlevel2",
        "orgunitlevel3",
        "orgunitlevel4",
        "orgunitlevel5",
        "organisationunitname",
        "organisationunitcode",
        "organisationunitdescription",
        "periodid",
        "periodname",
        "periodcode",
        "perioddescription",
    ]

    # Get the right data
    df = pd.read_csv(path, dtype="object")
    var_corr_rep = [el for el in VAR_CORR if el.get("domain") == "REPORT"]
    metrics = get_flat_list_json(var_corr_rep, instance)
    for x in metrics:
        df[x] = pd.to_numeric(df[x], errors="coerce")
    make_note(str(instance) + " reporting data loaded", START_TIME)

    # Formatting dates
    df["year"] = df["periodcode"].astype("str").apply(lambda x: x[:4])
    df["month"] = (df["periodcode"].astype("str").apply(
        lambda x: x[-2:]).replace(month_dict))
    df.rename(columns={"organisationunitid": "orgUnit"}, inplace=True)
    df.set_index(["orgUnit", "year", "month"], drop=True, inplace=True)

    # Dropping unused columns and renaming
    df.drop(cols, axis=1, inplace=True)
    df1 = df.copy().stack(dropna=False).reset_index()
    df1.rename(columns={0: "value", "level_3": "dataElement"}, inplace=True)
    df1["value"] = df1["value"].fillna(0).astype(int)
    return df1
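# Example for get_reporting_data (illustrative values): a periodcode of
# "202004" yields year="2020" and month="04" -> "Apr" via month_dict.
# stack(dropna=False) then melts the remaining wide metric columns into long
# (orgUnit, year, month, dataElement, value) rows, keeping missing cells as 0.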
def clean(raw_path):
    file_name = raw_path.split("/")[-1]
    make_note(f"Starting the cleaning process for {file_name}", START_TIME)
    clean_df = clean_raw_file(raw_path)
    make_note(f"Cleaning of raw file done for {file_name}", START_TIME)
    return clean_df
def compute_outliers_stack(pivot, policy, location):
    if policy == 'std':
        pivot_processed = replace_outliers(pivot, cutoff=3)
    elif policy == 'iqr':
        pivot_processed = replace_outliers_iqr(pivot, k=3)
    else:
        # Fail fast rather than hitting a NameError below
        raise ValueError(f"Unknown outlier policy: {policy}")
    stack = pivot_stack_post_process(pivot_processed)
    stack = add_info_and_format(stack, location)
    make_note(f'{policy} outlier exclusion process done', START_TIME)
    return stack
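# Illustrative sketch only: replace_outliers_iqr is defined elsewhere in this
# repo. Assuming it masks, per row, any value outside [Q1 - k*IQR, Q3 + k*IQR],
# a minimal pandas version could look like the function below (the underscore
# prefix marks it as a sketch, not the pipeline's implementation):
def _sketch_replace_outliers_iqr(pivot, k=3):
    q1 = pivot.quantile(0.25, axis=1)
    q3 = pivot.quantile(0.75, axis=1)
    iqr = q3 - q1
    too_low = pivot.lt(q1 - k * iqr, axis=0)
    too_high = pivot.gt(q3 + k * iqr, axis=0)
    # Outlying cells become NaN so the pivot keeps its shape for the
    # downstream pivot_stack_post_process step.
    return pivot.mask(too_low | too_high)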
def create_reporting_pivot(pivot, report, location):
    full_pivot = add_report_to_pivot(pivot, report, location)
    for c in full_pivot.columns:
        full_pivot[c] = (full_pivot[c] > 0).astype('int')
    report = add_report_columns(full_pivot)
    report.drop(columns=['expected_105_1_reporting',
                         'actual_105_1_reporting'], inplace=True)
    make_note('Reporting done', START_TIME)
    return report
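# Note on create_reporting_pivot: the column loop binarizes counts, e.g. a
# value of 12 becomes 1 and 0 stays 0, so each cell reads as "did this
# facility report this indicator in this month".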
def clean_add_indicators(file_path, instance):
    make_note(f"Creating additional indicators for {instance}", START_TIME)
    add_dict = get_variable_breakdown_dict(instance)
    dhis_df = get_data(file_path, instance)
    df = pd.DataFrame(columns=dhis_df.columns)
    for indicator in add_dict.keys():
        df = compute_indicators(dhis_df, df, indicator,
                                add_dict.get(indicator))
    df = process_date(df)
    return df
def map_to_temp(raw_path, mapping, clean_df):
    # `mapping` was previously named `map`, which shadowed the builtin
    f = raw_path.split("/")[-1]
    f_short = f[:-4]
    instance, table, year, month = f_short.split("_")
    map_dict = dict(zip(mapping.loc[:, 'indicatorname'],
                        mapping.loc[:, 'indicatorcode_out']))
    clean_df["dataElement"] = clean_df["dataElement"].map(map_dict)
    f_path = f"data/temp/{f_short}_clean.csv"
    clean_df[["orgUnit", "dataElement", "year", "month",
              "value"]].to_csv(f_path, index=False, header=False)
    make_note(f"Creation of temporary csv done for {f}", START_TIME)
    return f_path, year, month, table
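# map_to_temp writes the csv without a header (header=False); the assumption,
# based on the pg_* helpers used in the main block below, is that this file
# feeds a database bulk load rather than being read back with pandas.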
def process(main, report, location):
    make_note('Starting the data processing', START_TIME)
    pivot_outliers = pivot_stack(main)
    make_note('data pivot for outlier exclusion done', START_TIME)

    # outlier computations

    # out: raw values, no outlier exclusion
    outliers_stack = add_info_and_format(main, location)
    outliers = full_pivot_for_export(outliers_stack)
    stack = pd.DataFrame(columns=outliers_stack.columns)
    stack = add_to_final_stack(stack, outliers_stack, 'outliers')
    del outliers_stack
    outliers.to_csv(INDICATORS['out_data'])

    # std: values beyond 3 standard deviations excluded
    std_stack = compute_outliers_stack(pivot_outliers, 'std', location)
    std = full_pivot_for_export(std_stack)
    stack = add_to_final_stack(stack, std_stack, 'std')
    del std_stack
    std.to_csv(INDICATORS['std_data'])

    # iqr: values beyond 3 interquartile ranges excluded
    iqr_stack = compute_outliers_stack(pivot_outliers, 'iqr', location)
    iqr = full_pivot_for_export(iqr_stack)
    stack = add_to_final_stack(stack, iqr_stack, 'iqr')
    del iqr_stack
    iqr.to_csv(INDICATORS['iqr_data'])
    make_note('outlier exclusion done', START_TIME)

    # rep: reporting completeness
    report = create_reporting_pivot(outliers, report, location)
    report_stack = stack_reporting(report)
    stack = add_to_final_stack(stack, report_stack, 'report')
    del report_stack
    report.to_csv(INDICATORS['rep_data'])

    # stack: all four breakdowns combined in one tall table
    stack.to_csv(INDICATORS["tall_data"], index=False)
    make_note('breakdown in four tables done', START_TIME)
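# Illustrative sketch (assumption): add_to_final_stack is defined elsewhere;
# judging by its call sites it appends each breakdown's stack to the running
# tall table under a label. A minimal version, with the 'breakdown' column
# name being hypothetical, might look like:
def _sketch_add_to_final_stack(stack, new_stack, label):
    labeled = new_stack.copy()
    labeled["breakdown"] = label  # hypothetical column name
    return pd.concat([stack, labeled], ignore_index=True)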
load_dotenv(find_dotenv(), verbose=True)  # NOQA: E402

from src.db import adpter as db  # NOQA: E402
from src.api.ddi_dhis2 import Dhis  # NOQA: E402

START_TIME = datetime.now()

with open(INDICATORS["data_config"], "r", encoding="utf-8") as f:
    VAR_CORR = json.load(f)

if __name__ == "__main__":

    # init
    # db.pg_recreate_tables()
    make_note("Starting the pipeline", START_TIME)

    # Adding any new indicators / facilities to the lookup table
    db.pg_update_indicator(dataelements=VAR_CORR)
    db.pg_update_location(file_path=INDICATORS["name_district_map"])

    # Adding the population data
    cols = clean.clean_pop_to_temp(INDICATORS["pop"], INDICATORS["pop_perc"])
    db.pg_update_pop("data/temp/pop.csv", cols)

    # cleaning the data and writing it to the database file by file
    files = os.listdir(INDICATORS["raw_data"])