# Example #1
def clean_raw_file(raw_path):
    """Import one raw export file, validate name and contents, and return a clean table.

    Expected file name format: ``[instance]_[table]_[YYYYMmm].csv`` —
    e.g. ``new_main_2020Apr`` — where ``instance`` is ``new``/``old`` and
    ``table`` is ``main``/``report``.

    Parameters
    ----------
    raw_path : str
        Path ("/"-separated) to the raw csv file.

    Returns
    -------
    pandas.DataFrame
        Long-format data with columns [dataElement, orgUnit, year, month,
        value], restricted to known facilities; 'report' values are
        binarized to 0/1.

    Raises
    ------
    AssertionError
        If the file name or the file's year/month content is off.
    """

    # TODO(review): check what is up with that renaming thing - I think I don't need it

    # Check file name format ([:-4] strips the ".csv" extension)

    f = raw_path.split("/")[-1][:-4]
    instance, table, year, month = f.split("_")

    assert table in [
        "main",
        "report",
    ], f"Unexpected data type in file name for {f}: correct format is [instance_datatype_YYYYMmm], e.g. new_main_2020Apr"
    assert instance in [
        "new",
        "old",
    ], f"Unexpected dhis2 instance in file name for {f}: correct format is [instance_datatype_YYYYMmm], e.g. new_main_2020Apr"

    # Import file and get to standard format

    if table == "main":
        df = clean_add_indicators(raw_path, instance)
    else:
        df = get_reporting_data(raw_path, instance)
        renaming_dict = get_renaming_dict()
        df["dataElement"].replace(renaming_dict, inplace=True)

    # Each file must contain exactly one year and one month, and both must
    # agree with what the file name announces.
    assert df["year"].nunique(
    ) == 1, f"Data for several years found in file {f}"
    assert df["month"].nunique(
    ) == 1, f"Data for several months found in file {f}"

    assert int(df["year"].unique()[0]) == int(
        year
    ), f"Data from a different year than what file name indicates for {f}"
    assert (
        df["month"].unique()[0] == month
    ), f"Data from a different month than what file name indicates for {f}"

    make_note(f"data imported for file {f}", START_TIME)

    # cleaning formatted table

    df.reset_index(drop=True, inplace=True)

    # TODO(review): check this groupby for breakdown addition

    df = df.groupby(["dataElement", "orgUnit", "year", "month"],
                    as_index=False).agg({"value": "sum"})
    df["value"] = pd.to_numeric(df["value"], errors="coerce")

    # Keep only facilities that are part of the tracked set
    df = df[df["orgUnit"].isin(FACILITY_IDS)]

    # For reporting tables only presence/absence of a report matters
    if table == "report":
        df["value"] = (df["value"] > 0).astype("int")

    return df
# Example #2
def get_reporting_data(path, instance):
    """Load a reporting csv export and reshape it into a long table.

    Reads the csv at *path*, coerces the REPORT-domain metric columns to
    numeric, derives year/month from ``periodcode``, drops the org-unit /
    period descriptor columns, and stacks the remainder into rows of
    [orgUnit, year, month, dataElement, value] with NaNs filled as 0.
    """

    month_dict = {
        "01": "Jan",
        "02": "Feb",
        "03": "Mar",
        "04": "Apr",
        "05": "May",
        "06": "Jun",
        "07": "Jul",
        "08": "Aug",
        "09": "Sep",
        "10": "Oct",
        "11": "Nov",
        "12": "Dec",
    }

    # Descriptor columns that are not needed once the index is set
    drop_cols = [
        "orgunitlevel1",
        "orgunitlevel2",
        "orgunitlevel3",
        "orgunitlevel4",
        "orgunitlevel5",
        "organisationunitname",
        "organisationunitcode",
        "organisationunitdescription",
        "periodid",
        "periodname",
        "periodcode",
        "perioddescription",
    ]

    # Get the right data
    df = pd.read_csv(path, dtype="object")

    # Only REPORT-domain variables are relevant here
    var_corr_rep = [el for el in VAR_CORR if el.get("domain") == "REPORT"]
    metrics = get_flat_list_json(var_corr_rep, instance)
    for metric in metrics:
        df[metric] = pd.to_numeric(df[metric], errors="coerce")

    make_note(str(instance) + " reporting data loaded", START_TIME)

    # Formatting dates: periodcode is YYYYMM, split into year and month name
    period = df["periodcode"].astype("str")
    df["year"] = period.str[:4]
    df["month"] = period.str[-2:].replace(month_dict)

    df.rename(columns={"organisationunitid": "orgUnit"}, inplace=True)
    df.set_index(["orgUnit", "year", "month"], drop=True, inplace=True)

    # Drop descriptor columns, then stack the metric columns into rows
    df.drop(drop_cols, axis=1, inplace=True)
    stacked = df.stack(dropna=False).reset_index()
    stacked.rename(columns={0: "value", "level_3": "dataElement"},
                   inplace=True)
    stacked["value"] = stacked["value"].fillna(0).astype(int)

    return stacked
# Example #3
def clean(raw_path):
    """Run the cleaning pipeline for one raw file and return the clean DataFrame."""

    file_name = raw_path.split("/")[-1]
    make_note(f"Starting the cleaning process for {file_name}", START_TIME)

    cleaned = clean_raw_file(raw_path)
    make_note(f"Cleaning of raw file done for {file_name}", START_TIME)

    return cleaned
def compute_outliers_stack(pivot, policy, location):
    """Apply one outlier-exclusion policy to a pivot and return the formatted stack.

    Parameters
    ----------
    pivot : pandas.DataFrame
        Pivoted data to process.
    policy : str
        Either ``'std'`` (replace beyond 3 standard deviations) or
        ``'iqr'`` (replace beyond k=3 interquartile ranges).
    location : object
        Location info forwarded to add_info_and_format.

    Raises
    ------
    ValueError
        If *policy* is not one of the supported values (previously this
        fell through to an UnboundLocalError).
    """
    if policy == 'std':
        pivot_processed = replace_outliers(pivot, cutoff=3)
    elif policy == 'iqr':
        pivot_processed = replace_outliers_iqr(pivot, k=3)
    else:
        raise ValueError(
            f"Unknown outlier policy {policy!r}: expected 'std' or 'iqr'")

    stack = pivot_stack_post_process(pivot_processed)

    stack = add_info_and_format(stack, location)

    make_note(f'{policy} outlier exclusion process done', START_TIME)

    return stack
def create_reporting_pivot(pivot, report, location):
    """Build the binary reporting pivot and derive the final report table.

    Merges reporting info into the pivot, binarizes every column
    (any positive value counts as "reported"), adds the report columns,
    and drops the two intermediate 105-1 reporting columns.
    """

    full_pivot = add_report_to_pivot(pivot, report, location)

    # Binarize: each cell becomes 1 if a positive value was recorded, else 0
    for col in full_pivot.columns:
        full_pivot[col] = (full_pivot[col] > 0).astype('int')

    result = add_report_columns(full_pivot)
    result.drop(
        columns=['expected_105_1_reporting', 'actual_105_1_reporting'],
        inplace=True)

    make_note('Reporting done', START_TIME)

    return result
# Example #6
def clean_add_indicators(file_path, instance):
    """Compute the additional (breakdown) indicators for one instance's data.

    Loads the dhis data for *file_path*/*instance*, computes each indicator
    listed in the instance's breakdown dict, and returns the date-processed
    result.
    """

    make_note(f"Creating additional indicators for {instance}", START_TIME)

    add_dict = get_variable_breakdown_dict(instance)
    dhis_df = get_data(file_path, instance)

    # Accumulate computed indicators into an initially empty frame with
    # the same columns as the source data
    df = pd.DataFrame(columns=dhis_df.columns)
    for indicator, breakdown in add_dict.items():
        df = compute_indicators(dhis_df, df, indicator, breakdown)

    return process_date(df)
# Example #7
def map_to_temp(raw_path, map, clean_df):
    """Map indicator names to output codes and write a temporary csv.

    NOTE(review): the parameter name ``map`` shadows the builtin; it is
    kept unchanged for compatibility with existing keyword callers.

    Returns the temp file path plus the year, month and table parsed from
    the raw file name.
    """

    f = raw_path.split("/")[-1]
    f_short = f[:-4]  # strip ".csv"
    instance, table, year, month = f_short.split("_")

    # indicatorname -> indicatorcode_out lookup from the mapping table
    map_dict = {
        name: code
        for name, code in zip(map.loc[:, 'indicatorname'],
                              map.loc[:, 'indicatorcode_out'])
    }
    clean_df["dataElement"] = clean_df["dataElement"].map(map_dict)

    f_path = f"data/temp/{f_short}_clean.csv"
    columns = ["orgUnit", "dataElement", "year", "month", "value"]
    clean_df[columns].to_csv(f_path, index=False, header=False)

    make_note(f"Creation of temporary csv done for {f}", START_TIME)

    return f_path, year, month, table
def process(main, report, location):
    """Run the full processing stage: outlier exclusion (none/std/iqr),
    reporting pivot, and export of each table to csv plus one tall stack.

    Side effects: writes five csv files (out/std/iqr/rep/tall) to the paths
    configured in INDICATORS, and logs progress via make_note.
    """
    make_note('Starting the data processing', START_TIME)

    # Pivot used as input for both std and iqr outlier exclusion below
    pivot_outliers = pivot_stack(main)

    make_note('data pivot for outlier exclusion done', START_TIME)

    # outlier computations

    # NOTE(review): this uses `main` directly, not `pivot_outliers` —
    # presumably intentional ("outliers" = data with no exclusion applied),
    # but worth confirming.
    outliers_stack = add_info_and_format(main, location)

    # out

    outliers = full_pivot_for_export(outliers_stack)
    # `stack` accumulates all four variants; each *_stack is deleted after
    # being folded in to keep peak memory down
    stack = pd.DataFrame(columns=outliers_stack.columns)
    stack = add_to_final_stack(stack, outliers_stack, 'outliers')
    del outliers_stack
    outliers.to_csv(INDICATORS['out_data'])

    # std

    std_stack = compute_outliers_stack(pivot_outliers, 'std', location)
    std = full_pivot_for_export(std_stack)
    stack = add_to_final_stack(stack, std_stack, 'std')
    del std_stack
    std.to_csv(INDICATORS['std_data'])

    # iqr

    iqr_stack = compute_outliers_stack(pivot_outliers, 'iqr', location)
    iqr = full_pivot_for_export(iqr_stack)
    stack = add_to_final_stack(stack, iqr_stack, 'iqr')
    del iqr_stack
    iqr.to_csv(INDICATORS['iqr_data'])

    make_note('outlier exclusion done', START_TIME)

    # rep

    # Reporting is computed from the un-excluded pivot (`outliers`)
    report = create_reporting_pivot(outliers, report, location)
    report_stack = stack_reporting(report)
    stack = add_to_final_stack(stack, report_stack, 'report')
    del report_stack
    report.to_csv(INDICATORS['rep_data'])

    # stack

    stack.to_csv(INDICATORS["tall_data"], index=False)

    make_note('breakdown in four tables done', START_TIME)
# Example #9
# Load environment variables before importing project modules that read them
load_dotenv(find_dotenv(), verbose=True)  # NOQA: E402

# NOTE(review): "adpter" looks like a typo for "adapter", but the import path
# must match the module file on disk — confirm before renaming.
from src.db import adpter as db  # NOQA: E402
from src.api.ddi_dhis2 import Dhis  # NOQA: E402

# Pipeline start timestamp, passed to make_note() for elapsed-time logging
START_TIME = datetime.now()

# Data-element configuration loaded once at import time
with open(INDICATORS["data_config"], "r", encoding="utf-8") as f:
    VAR_CORR = json.load(f)

if __name__ == "__main__":

    # init
    # db.pg_recreate_tables()

    make_note("Starting the pipeline", START_TIME)

    # Adding any new indicators / facilities to the lookup table

    db.pg_update_indicator(dataelements=VAR_CORR)
    db.pg_update_location(file_path=INDICATORS["name_district_map"])

    # Adding the population data

    cols = clean.clean_pop_to_temp(INDICATORS["pop"], INDICATORS["pop_perc"])

    db.pg_update_pop("data/temp/pop.csv", cols)

    # cleaning the data and writing it to the database file by file

    files = os.listdir(INDICATORS["raw_data"])