Esempio n. 1
0
def process_inspection_results(
    inspection_results: List[dict],
    exclude: Union[list, set] = None,
    apply: List[Tuple] = None,
    drop: bool = True,
    verbose: bool = False,
) -> pd.DataFrame:
    """Flatten raw inspection results into a cleaned-up ``pd.DataFrame``.

    The nested result dictionaries are normalized into one row per result,
    with nested keys joined by ``"__"``. Column transformations are then
    applied, constant columns are optionally dropped and job/build
    durations are derived.

    :param inspection_results: non-empty sequence of inspection result
        dictionaries; the top-level keys of the first element decide which
        column groups get profiled
    :param exclude: top-level keys whose columns are never profiled (and
        therefore never dropped)
    :param apply: ``(regex, func)`` pairs; ``func`` is applied element-wise
        to every column whose name matches ``regex``. A datetime conversion
        for ``created|started_at|finished_at`` columns is always appended.
    :param drop: whether to drop the columns the profile reports as constant
    :param verbose: whether to print the rejected columns of each key
    :return: the processed frame. When normalization yields at most one row
        the normalized frame is returned as is, without transformations,
        dropping or duration columns.
    :raises ValueError: if ``inspection_results`` is empty
    """
    if not inspection_results:
        # The exception has to be raised, not returned: a returned exception
        # object would be handed to the caller in place of a DataFrame.
        raise ValueError("Empty iterable provided.")

    datetime_spec = ("created|started_at|finished_at", pd.to_datetime)
    apply = [datetime_spec] if apply is None else [*apply, datetime_spec]
    exclude = exclude or []

    # each row resembles InspectionResult
    df = json_normalize(inspection_results, sep="__")

    # With zero or one row every column would count as constant, so
    # profiling makes no sense; hand back the raw normalized frame.
    if len(df) <= 1:
        return df

    for regex, func in apply:
        for col in df.filter(regex=regex).columns:
            df[col] = df[col].apply(func)

    for key in inspection_results[0]:
        if key in exclude:
            continue

        # The key is used as a regex, so it selects every flattened column
        # whose name contains it.
        p = profile(df.filter(regex=key))

        # Columns with at most one distinct value carry no information.
        # The negative look-ahead removes labels containing "version" from
        # the rejected set, i.e. version columns are explicitly kept.
        rejected = (
            p.description_set["variables"]
            .query("distinct_count <= 1 & type != 'UNSUPPORTED'")
            .filter(regex="^((?!version).)*$", axis=0)
        )

        if verbose:
            print("Rejected columns: ", rejected.index)

        if drop:
            df.drop(rejected.index, axis=1, inplace=True)

    # NOTE: both expressions require the respective started_at/finished_at
    # columns to still be present in the frame at this point.
    df = df.eval(
        "status__job__duration   = status__job__finished_at   - status__job__started_at",
        engine="python"
    ).eval(
        "status__build__duration = status__build__finished_at - status__build__started_at",
        engine="python")

    return df
# %% {"init_cell": true, "hidden": true}
# Show the top-level keys of the first inspection result.
inspection_results[0].keys()

# %% [markdown] {"hidden": true}
# #### Status

# %% {"require": ["base/js/events", "datatables.net", "d3", "jupyter-datatables"], "hidden": true}
# Work on an explicit copy: ``filter`` returns a selection of ``df``, and
# assigning columns into such a selection triggers SettingWithCopyWarning.
df_status = df.filter(regex="status").copy()

# Convert the start/finish timestamp columns of the status frame to datetimes.
date_columns = df_status.filter(regex="started_at|finished_at").columns
for col in date_columns:
    df_status[col] = df[col].apply(pd.to_datetime)

# %% {"hidden": true}
# Profile the status columns; the bare ``p`` on the last line is the cell's
# displayed value.
p = profile(df_status)
p

# %% [markdown] {"hidden": true}
# According to the profiling, we can drop the columns that hold only a single constant value:

# %% {"hidden": true}
# Select the profiled variables that have at most one distinct value and
# whose type is not 'UNSUPPORTED'; the trailing expression displays them.
rejected = (
    p.description_set["variables"]
    .query("distinct_count <= 1 & type != 'UNSUPPORTED'")
)
rejected

# %% {"hidden": true}
# Remove the rejected (constant) columns from the full frame in place.
df.drop(columns=rejected.index, inplace=True)

# %% [markdown] {"hidden": true}