Python clean_df Examples

Programming Language: Python

Namespace/Package Name: twodii_datawarehouse.file_import.df_utils

Method/Function: clean_df

Examples at hotexamples.com: 8

Python clean_df - 8 examples found. These are the top-rated real-world Python examples of twodii_datawarehouse.file_import.df_utils.clean_df, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def parse_globaldata_power_extract(
    filepath,
    columns_name_list
):
    """Parse the single-sheet GlobalData power extract workbook.

    Every sheet of the Excel file at ``filepath`` is loaded without a
    header row, the workbook is required to hold exactly one sheet, and
    that sheet is passed to ``dfu.clean_df`` together with
    ``columns_name_list``. Two awkwardly named columns are then renamed
    when they are present.

    Args:
        filepath: Excel file location, forwarded to ``pd.read_excel``.
        columns_name_list: Forwarded unchanged to ``dfu.clean_df``.

    Returns:
        The object returned by ``dfu.clean_df`` with the renames applied.

    Raises:
        Exception: If the workbook does not contain exactly one sheet.
    """
    # sheet_name=None asks pandas for a dict holding every sheet.
    sheets = pd.read_excel(io=filepath, sheet_name=None, header=None)
    if len(sheets) != 1:
        # The message text (including its embedded newline and indentation)
        # is kept exactly as callers have always seen it.
        raise Exception(f"""Multiple sheets found in excel file,
                        but only one expected: {sheets.keys()}""")
    (only_sheet,) = sheets.values()

    cleaned = dfu.clean_df(
        df=only_sheet,
        columns_name_list=columns_name_list
    )

    # Table-specific renames, applied one at a time and only when the
    # source column exists.
    renames = (
        ("company_type_(private/_public..)", "company_type"),
        ("parent/subsidiary", "parent_subsidiary"),
    )
    for old_name, new_name in renames:
        if old_name not in cleaned.columns:
            continue
        cleaned = cleaned.rename(
            mapper={old_name: new_name},
            axis='columns',
            errors='raise'
        )
    return cleaned

Example #2

Show file

def parse_globaldata_power_purchase_agreements(
    filepath,
    columns_name_list
):
    """Parse the 'Power Purchase Agreements' sheet of a GlobalData workbook.

    All sheets of the Excel file at ``filepath`` are loaded without a
    header row; the sheet named ``'Power Purchase Agreements'`` is passed
    to ``dfu.clean_df`` together with ``columns_name_list``. When the
    cleaned result has an ``associated_plant_capacity_(mw)`` column, a
    constant ``associated_plant_capacity_unit`` column set to ``'MW'`` is
    added and the capacity column is renamed to
    ``associated_plant_capacity``.

    Args:
        filepath: Excel file location, forwarded to ``pd.read_excel``.
        columns_name_list: Forwarded unchanged to ``dfu.clean_df``.

    Returns:
        The object returned by ``dfu.clean_df`` with the adjustment above.

    Raises:
        KeyError: If the workbook has no 'Power Purchase Agreements' sheet.
    """
    # sheet_name=None asks pandas for a dict holding every sheet.
    sheets = pd.read_excel(io=filepath, sheet_name=None, header=None)
    ppa_sheet = sheets['Power Purchase Agreements']

    cleaned = dfu.clean_df(
        df=ppa_sheet,
        columns_name_list=columns_name_list
    )

    # Table-specific rename: move the unit out of the column name.
    capacity_column = "associated_plant_capacity_(mw)"
    if capacity_column not in cleaned.columns:
        return cleaned

    cleaned['associated_plant_capacity_unit'] = 'MW'
    return cleaned.rename(
        mapper={capacity_column: "associated_plant_capacity"},
        axis='columns',
        errors='raise'
    )

Example #3

Show file

def parse_globaldata_power_plants(
    filepath,
    columns_name_list
):
    """Parse the single-sheet GlobalData power plants workbook.

    Every sheet of the Excel file at ``filepath`` is loaded without a
    header row, the workbook is required to hold exactly one sheet, and
    that sheet is passed to ``dfu.clean_df`` together with
    ``columns_name_list``. Table-specific adjustments follow, each applied
    only when its source column is present:

    * ``*_capacity_(mw)`` columns get a sibling ``*_capacity_unit`` column
      set to ``'MW'`` and lose the ``_(mw)`` suffix;
    * ``owner_stake_(%)`` and ``efficiency_(%)`` become ``*_percentage``;
    * ``capex_usd_(million)`` is multiplied by 1e6 into ``capex_usd`` and
      the original column is dropped;
    * ``decommissioning_year_(actual/estimated)`` becomes
      ``decommissioning_year_status``.

    Args:
        filepath: Excel file location, forwarded to ``pd.read_excel``.
        columns_name_list: Forwarded unchanged to ``dfu.clean_df``.

    Returns:
        The object returned by ``dfu.clean_df`` with the adjustments above.

    Raises:
        Exception: If the workbook does not contain exactly one sheet.
    """
    # sheet_name=None asks pandas for a dict holding every sheet.
    sheets = pd.read_excel(io=filepath, sheet_name=None, header=None)
    if len(sheets) != 1:
        # The message text (including its embedded newline and indentation)
        # is kept exactly as callers have always seen it.
        raise Exception(f"""Multiple sheets found in excel file,
                        but only one expected: {sheets.keys()}""")
    (only_sheet,) = sheets.values()

    cleaned = dfu.clean_df(
        df=only_sheet,
        columns_name_list=columns_name_list
    )

    # (source column, renamed column, unit column). Order matters: it
    # fixes the order in which the unit columns are appended.
    capacity_columns = (
        ("total_capacity_(mw)", "total_capacity", "total_capacity_unit"),
        ("active_capacity_(mw)", "active_capacity", "active_capacity_unit"),
        ("pipeline_capacity_(mw)", "pipeline_capacity",
         "pipeline_capacity_unit"),
        ("discontinued_capacity_(mw)", "discontinued_capacity",
         "discontinued_capacity_unit"),
    )
    for source, target, unit_column in capacity_columns:
        if source in cleaned.columns:
            cleaned[unit_column] = 'MW'
            cleaned = cleaned.rename(
                mapper={source: target},
                axis='columns',
                errors='raise'
            )

    if "owner_stake_(%)" in cleaned.columns:
        cleaned = cleaned.rename(
            mapper={"owner_stake_(%)": "owner_stake_percentage"},
            axis='columns',
            errors='raise'
        )

    # Capex is reported in millions; store the plain USD amount instead.
    capex_millions = "capex_usd_(million)"
    if capex_millions in cleaned.columns:
        cleaned['capex_usd'] = cleaned[capex_millions] * 1e6
        cleaned = cleaned.drop(columns=capex_millions)

    trailing_renames = (
        ("efficiency_(%)", "efficiency_percentage"),
        ("decommissioning_year_(actual/estimated)",
         "decommissioning_year_status"),
    )
    for source, target in trailing_renames:
        if source in cleaned.columns:
            cleaned = cleaned.rename(
                mapper={source: target},
                axis='columns',
                errors='raise'
            )

    return cleaned

Example #4

Show file

def test_clean_df_empty_header_footer():
    """clean_df on the empty-header/small-footer fixture gives the plain table.

    Both the values and the column labels of the result are compared with
    the shared expected fixtures.
    """
    result = dfu.clean_df(
        df=df_test_small_empty_header_small_footer,
        columns_name_list=df_test_names,
    )
    expected_values = df_test_simple_no_header
    expected_columns = clean_column_names
    npt.assert_array_equal(result, expected_values)
    npt.assert_array_equal(result.columns, expected_columns)

Example #5

Show file

def test_clean_df_empty_middle_column_header():
    """clean_df on the empty-middle-column-header fixture gives the plain table.

    The call passes ``rows_to_search=25``; values and column labels of the
    result are compared with the shared expected fixtures.
    """
    result = dfu.clean_df(
        df=df_test_empty_middle_column_header,
        columns_name_list=df_test_names,
        rows_to_search=25,
    )
    expected_values = df_test_simple_no_header
    expected_columns = clean_column_names
    npt.assert_array_equal(result, expected_values)
    npt.assert_array_equal(result.columns, expected_columns)

Example #6

Show file

def test_clean_df_long_footer_value():
    """clean_df on the long-footer fixture gives the plain table.

    The call passes ``rows_to_search=25``; values and column labels of the
    result are compared with the shared expected fixtures.
    """
    result = dfu.clean_df(
        df=df_test_long_footer,
        columns_name_list=df_test_names,
        rows_to_search=25,
    )
    expected_values = df_test_simple_no_header
    expected_columns = clean_column_names
    npt.assert_array_equal(result, expected_values)
    npt.assert_array_equal(result.columns, expected_columns)

Example #7

Show file

def test_clean_df_simple_footer_mixed_nan():
    """clean_df on the mixed-footer fixture gives the plain table.

    Values and column labels of the result are compared with the shared
    expected fixtures.
    """
    result = dfu.clean_df(
        df=df_test_footer_mixed,
        columns_name_list=df_test_names,
    )
    expected_values = df_test_simple_no_header
    expected_columns = clean_column_names
    npt.assert_array_equal(result, expected_values)
    npt.assert_array_equal(result.columns, expected_columns)

Example #8

Show file

def test_clean_df_long_header_rows_none():
    """clean_df with ``rows_to_search=None`` gives the plain table.

    Values and column labels of the result are compared with the shared
    expected fixtures.
    """
    # NOTE(review): the test name says "long header" but the fixture used
    # is df_test_small_header -- confirm which one is intended.
    result = dfu.clean_df(
        df=df_test_small_header,
        columns_name_list=df_test_names,
        rows_to_search=None,
    )
    expected_values = df_test_simple_no_header
    expected_columns = clean_column_names
    npt.assert_array_equal(result, expected_values)
    npt.assert_array_equal(result.columns, expected_columns)