Ejemplos de strip_and_trim_whitespace en Python, ejemplos de util.strip_and_trim_whitespace en Python

Ejemplo n.º 1

0

Mostrar archivo

def process_2014_law_website_data() -> pd.DataFrame:
    """Read the raw 2014 law department settlement workbook, tidy it, save it.

    The Excel sheet is loaded, the hidden ``COMPTROLLER`` column is removed
    once it has been confirmed to be entirely empty, the three date columns
    are parsed with ``pd.to_datetime``, and the frame is passed through
    ``util.strip_and_trim_whitespace`` before being written out with
    ``util.save_df``.

    Returns:
        The formatted 2014 dataframe (the same data that was saved).

    Raises:
        AssertionError: if the loaded sheet is not 1172 rows by 13 columns,
            or if the ``COMPTROLLER`` column contains any non-null value.
    """
    excel_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2014_LAW_WEBSITE_DATA_EXCEL_FILE)

    # only the data region of the sheet is read: 3 leading rows and
    # 654 trailing rows are skipped, and header=1 selects the header row
    formatted_df = pd.read_excel(
        io=excel_path,
        sheet_name=RAW_C.RAW_2014_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=3,
        skipfooter=654,
    )
    # sanity check on the expected layout: 1172 rows, 13 columns
    assert formatted_df.shape == (1172, 13)

    # the hidden comptroller column must carry no data before it is dropped
    hidden_column = "COMPTROLLER"
    assert formatted_df[hidden_column].isna().all()
    formatted_df = formatted_df.drop(columns=[hidden_column])

    # parse the date columns (names include the newlines from the sheet)
    date_columns = ("EFFECTIVE DATE\n", "DATE TO\nCOMPTROLLER", "DUE DATE")
    for date_column in date_columns:
        formatted_df[date_column] = pd.to_datetime(formatted_df[date_column])

    # normalise whitespace, then persist the result
    formatted_df = util.strip_and_trim_whitespace(formatted_df)
    util.save_df(
        df=formatted_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2014_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return formatted_df

Ejemplo n.º 2

0

Mostrar archivo

def process_2021_law_website_data() -> pd.DataFrame:
    """Read the raw 2021 law department settlement workbook, tidy it, save it.

    The Excel sheet is loaded, columns that are entirely empty are discarded,
    ``DATE TO COMPTROLLER`` is parsed with ``pd.to_datetime``, and the frame
    is passed through ``util.strip_and_trim_whitespace`` before being written
    out with ``util.save_df``.

    Returns:
        The formatted 2021 dataframe (the same data that was saved).

    Raises:
        AssertionError: if, after dropping the empty columns, the frame is
            not 473 rows by 8 columns.
    """
    excel_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2021_LAW_WEBSITE_DATA_EXCEL_FILE)

    # 4 leading rows and 7 trailing rows are skipped; header=1 selects
    # the header row
    formatted_df = pd.read_excel(
        io=excel_path,
        sheet_name=RAW_C.RAW_2021_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=7,
    )

    # the reader yields some completely empty columns; discard them
    formatted_df = formatted_df.dropna(axis=1, how="all")
    # sanity check on the expected layout: 473 rows, 8 columns
    assert formatted_df.shape == (473, 8)

    date_column = "DATE TO COMPTROLLER"
    formatted_df[date_column] = pd.to_datetime(formatted_df[date_column])

    # normalise whitespace, then persist the result
    formatted_df = util.strip_and_trim_whitespace(formatted_df)
    util.save_df(
        df=formatted_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2021_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return formatted_df

Ejemplo n.º 3

0

Mostrar archivo

def save_csv_formatted_foia_cpd_payments_data() -> pd.DataFrame:
    """Format the raw 2004 to 2018 CPD payments FOIA workbook and save it.

    The Excel sheet is loaded, whitespace is normalised with
    ``util.strip_and_trim_whitespace``, ``DATE TO COMPTROLLER`` is parsed
    with ``pd.to_datetime``, and the result is written out with
    ``util.save_df``.

    Returns:
        The formatted CPD payments dataframe (the same data that was saved).
    """
    excel_path = DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
        RAW_C.RAW_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_EXCEL_FILE
    )

    # 4 leading rows and 3 trailing rows are skipped; header=1 selects
    # the header row
    payments_df = pd.read_excel(
        io=excel_path,
        sheet_name=RAW_C.RAW_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=3,
    )

    # whitespace is cleaned first, then the date column gets a proper dtype
    payments_df = util.strip_and_trim_whitespace(payments_df)
    date_column = "DATE TO COMPTROLLER"
    payments_df[date_column] = pd.to_datetime(payments_df[date_column])

    # persist the formatted frame as csv
    util.save_df(
        df=payments_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return payments_df

Ejemplo n.º 4

0

Mostrar archivo

def process_2013_law_website_data() -> pd.DataFrame:
    """Read the raw 2013 law department settlement workbook, tidy it, save it.

    The Excel sheet is loaded, the ``Unnamed: 8`` column is renamed to
    ``Hidden Column``, and the frame is passed through
    ``util.strip_and_trim_whitespace`` before being written out with
    ``util.save_df``.

    Returns:
        The formatted 2013 dataframe (the same data that was saved).

    Raises:
        AssertionError: if the loaded sheet is not 1068 rows by 9 columns.
    """
    excel_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2013_LAW_WEBSITE_DATA_EXCEL_FILE)

    # 4 leading rows and 4 trailing rows are skipped; header=1 selects
    # the header row
    formatted_df = pd.read_excel(
        io=excel_path,
        sheet_name=RAW_C.RAW_2013_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=4,
    )
    # sanity check on the expected layout: 1068 rows, 9 columns
    assert formatted_df.shape == (1068, 9)

    # give the header-less column an explicit name
    column_renames = {"Unnamed: 8": "Hidden Column"}
    formatted_df = formatted_df.rename(columns=column_renames)

    # normalise whitespace, then persist the result
    formatted_df = util.strip_and_trim_whitespace(formatted_df)
    util.save_df(
        df=formatted_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2013_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return formatted_df

Ejemplo n.º 5

0

Mostrar archivo

def save_csv_formatted_foia_pending_suits_data() -> pd.DataFrame:
    """Format the raw pending police lawsuits FOIA workbook and save it.

    The Excel sheet is loaded, whitespace is normalised with
    ``util.strip_and_trim_whitespace``, and the result is written out with
    ``util.save_df``.

    Returns:
        The formatted pending-suits dataframe (the same data that was saved).
    """
    excel_path = DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
        RAW_C.RAW_PENDING_POLICE_SUITS_FOIA_DATA_EXCEL_FILE
    )

    # no leading rows are skipped here; header=1 selects the header row
    # and the single trailing row is ignored
    pending_suits_df = pd.read_excel(
        io=excel_path,
        sheet_name=RAW_C.RAW_PENDING_POLICE_SUITS_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skipfooter=1,
    )

    # normalise whitespace before saving
    pending_suits_df = util.strip_and_trim_whitespace(pending_suits_df)

    # persist the formatted frame as csv
    # NOTE(review): the constant name below spells "FOTA" rather than "FOIA";
    # it is defined outside this block, so it is used as-is here.
    util.save_df(
        df=pending_suits_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_PENDING_POLICE_SUITS_FOTA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return pending_suits_df

Ejemplo n.º 6

0

Mostrar archivo

def save_csv_formatted_foia_tort_payments_data() -> pd.DataFrame:
    """Format the raw 2001 to 2007 tort payments FOIA workbook and save it.

    The Excel sheet is loaded, whitespace is normalised with
    ``util.strip_and_trim_whitespace``, the literal ``"NONE"`` in the two
    money columns is replaced by ``0`` before they are converted with
    ``pd.to_numeric``, ``DATE TO COMPTROLLER`` is parsed with
    ``pd.to_datetime``, and the result is written out with ``util.save_df``.

    Returns:
        The formatted tort payments dataframe (the same data that was saved).
    """
    # skip the first 4 rows; header=1 selects the header row
    raw_foia_tort_payments_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
    )

    # fix any whitespace issues
    raw_foia_tort_payments_df = util.strip_and_trim_whitespace(
        raw_foia_tort_payments_df,
    )

    # do some value specific replacements, then convert to numeric.
    # The replaced column is assigned back instead of calling
    # ``replace(inplace=True)`` on ``df[col]``: that chained in-place form
    # acts on a temporary object under pandas Copy-on-Write and would leave
    # "NONE" in the frame, which ``pd.to_numeric`` cannot parse.
    payment_replace_dict = {"NONE": 0}
    for money_col in ("PAYMENT AMOUNT ($)", "FEES & COSTS ($)"):
        raw_foia_tort_payments_df[money_col] = pd.to_numeric(
            raw_foia_tort_payments_df[money_col].replace(
                to_replace=payment_replace_dict,
            )
        )

    # now convert the date column to a proper dtype
    raw_foia_tort_payments_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_foia_tort_payments_df["DATE TO COMPTROLLER"]
    )

    # now save to csv
    util.save_df(
        df=raw_foia_tort_payments_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return raw_foia_tort_payments_df

Ejemplo n.º 7

0

Mostrar archivo

def process_2012_law_website_data() -> pd.DataFrame:
    """Read the raw 2012 law department settlement workbook, tidy it, save it.

    The sheet contains two marker rows, ``TORT`` at label 0 and ``NON-TORT``
    at label 909, in the ``CASE #`` column. A ``Tort Status`` column is
    derived from them, the marker rows are removed, and the frame is passed
    through ``util.strip_and_trim_whitespace`` before being written out with
    ``util.save_df``.

    Returns:
        The formatted 2012 dataframe (the same data that was saved).

    Raises:
        AssertionError: if the loaded sheet is not 919 rows by 8 columns, or
            if the marker rows are not where they are expected.
    """
    excel_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2012_LAW_WEBSITE_DATA_EXCEL_FILE)

    # 4 leading rows and 6 trailing rows are skipped; header=1 selects
    # the header row
    formatted_df = pd.read_excel(
        io=excel_path,
        sheet_name=RAW_C.RAW_2012_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=6,
    )
    # sanity check on the expected layout: 919 rows, 8 columns
    assert formatted_df.shape == (919, 8)

    # row labels of the two section marker rows
    tort_marker_row = 0
    non_tort_marker_row = 909
    assert formatted_df.loc[tort_marker_row, "CASE #"] == "TORT"
    assert formatted_df.loc[non_tort_marker_row, "CASE #"] == "NON-TORT"

    # label each section; the second assignment overwrites the shared
    # boundary row, which is dropped right afterwards anyway
    status_column = "Tort Status"
    formatted_df.loc[tort_marker_row:non_tort_marker_row, status_column] = "TORT"
    formatted_df.loc[non_tort_marker_row:, status_column] = "NON-TORT"

    formatted_df = formatted_df.drop(
        index=[tort_marker_row, non_tort_marker_row])

    # normalise whitespace, then persist the result
    formatted_df = util.strip_and_trim_whitespace(formatted_df)
    util.save_df(
        df=formatted_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2012_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return formatted_df

Ejemplo n.º 8

0

Mostrar archivo

def process_2008_law_website_data() -> pd.DataFrame:
    """Convert the raw 2008 law department settlement pdf to a dataframe.

    Pages 1-55 of the pdf are parsed with camelot. Each page's table is
    validated against the expected header text and shape, tagged with a
    ``Tort Status`` and its ``pdf_page_num``, has its jumbled fees / primary
    cause columns separated with ``FEE_AND_PRIM_CASE_PAT``, and has its money
    columns converted to integers. The per-page frames are then concatenated,
    the date column is parsed, dtypes are applied, whitespace is normalised
    with ``util.strip_and_trim_whitespace`` and the result is saved with
    ``util.save_df``.

    Returns:
        The formatted 2008 dataframe (the same data that was saved).

    Raises:
        AssertionError: if any page's header row, shape or marker rows do
            not match the hard-coded expectations.
    """
    # get the path to the pdf
    raw_2008_pdf_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2008_LAW_WEBSITE_DATA_PDF)
    # use camelot to convert the first 55 pages tables to dataframes
    tables = camelot.read_pdf(filepath=str(raw_2008_pdf_path), pages="1-55")
    # expected header text on the first page (extracted into a single cell)
    first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE \nAMOUNT \nCOSTS "
        "\nDEPARTMENT \nTO \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE "
        "\nINVOLVED \nDISPOSITION \nCOMPTROLLER \nTORT")
    # expected headers for all pages besides the first
    non_first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE "
        "\nDEPARTMENT \nTO \nAMOUNT \nCOSTS \nINVOLVED \nDISPOSITION "
        "\nCOMPTROLLER \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE")
    # define the dtypes; the datetime entry carries an explicit unit because
    # pandas >= 2.0 refuses a unit-less datetime64 in ``astype``
    raw_2008_df_col_types = collections.OrderedDict({
        "CASE #": str,
        "PAYEE": str,
        "PAYMENT AMOUNT($)": int,
        "FEES & COSTS($)": int,
        "PRIMARY CAUSE": str,
        "CITY DEPARTMENT INVOLVED": str,
        "DISPOSITION": str,
        "DATE TO COMPTROLLER": "datetime64[ns]",
        "Tort Status": str,
        "pdf_page_num": int,
    })
    raw_2008_df_cols = list(raw_2008_df_col_types)

    # special replacements for the payment amount column
    payment_amount_replacements = {
        "A \n5,694": "5694",
        "S \n76,000": "76000",
        "(181)": "-181",
        "(2,374)": "-2374",
    }
    last_page = 55

    # per-page frames are collected and concatenated once at the end;
    # ``DataFrame.append`` no longer exists in pandas >= 2.0
    page_frames = []
    for page_num, table in enumerate(tables, start=1):
        # extract the table as a dataframe
        table_df = table.df

        # the first row holds the whole header in its first cell:
        # check every cell besides first one on first row is empty string
        assert table_df.iloc[0].iloc[1:].eq("").all()
        if page_num == 1:
            assert table_df.iloc[0][0] == first_page_header_string
        else:
            assert table_df.iloc[0][0] == non_first_page_header_string
        # now drop that first row
        table_df = table_df.drop(index=[0])

        # the first page has its own expected shape
        if page_num == 1:
            assert table_df.shape == (44, 8)
            table_df["Tort Status"] = "TORT"
        # special rules for last page
        elif page_num == last_page:
            assert table_df.shape == (47, 8)
            # row 37 is the NON-TORT marker and row 47 is the totals row
            assert table_df[0].loc[37] == "NON-TORT"
            assert table_df.loc[47][0] == (
                "TOTAL JUDGMENT/VERDICTS & "
                "SETTLEMENTS \n129,670,864 \nTOTAL FEES AND COSTS \n6,903,180")
            # everything up to the marker is tort, everything after non-tort
            table_df.loc[:37, "Tort Status"] = "TORT"
            table_df.loc[37:, "Tort Status"] = "NON-TORT"
            # drop the non-tort label row and the totals row
            table_df = table_df.drop(index=[37, 47])
        else:
            # some pages hold 49 data rows instead of the usual 50
            if page_num in [12, 41, 47, 49, 51, 53]:
                assert table_df.shape == (49, 8)
            else:
                assert table_df.shape == (50, 8)
            table_df["Tort Status"] = "TORT"

        table_df["pdf_page_num"] = page_num

        # rename the columns
        table_df.columns = raw_2008_df_cols
        # fix the issue with the fees and primary cause column getting jumbled
        table_df[[
            "FEES & COSTS($)", "PRIMARY CAUSE"
        ]] = (table_df["FEES & COSTS($)"].str.cat(
            table_df["PRIMARY CAUSE"]).str.extract(FEE_AND_PRIM_CASE_PAT))

        # do special replacements and convert the numerical columns
        table_df["FEES & COSTS($)"] = (table_df["FEES & COSTS($)"].str.replace(
            ",", "").astype(int))
        # assign the replaced column back: ``replace(inplace=True)`` on
        # ``table_df[col]`` acts on a temporary under pandas Copy-on-Write
        table_df["PAYMENT AMOUNT($)"] = table_df["PAYMENT AMOUNT($)"].replace(
            payment_amount_replacements)
        table_df["PAYMENT AMOUNT($)"] = (
            table_df["PAYMENT AMOUNT($)"].str.replace(",", "").astype(int))

        page_frames.append(table_df)

    # stitch all the pages' tables together
    if page_frames:
        raw_2008_df = pd.concat(page_frames, ignore_index=True)
    else:
        # no tables extracted: fall back to an empty frame with the columns
        raw_2008_df = pd.DataFrame(columns=raw_2008_df_cols)

    # convert to datetime
    raw_2008_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_2008_df["DATE TO COMPTROLLER"])
    # fix dtypes
    raw_2008_df = raw_2008_df.astype(raw_2008_df_col_types)
    # do whitespace fixing
    raw_2008_df = util.strip_and_trim_whitespace(raw_2008_df)

    # save to csv
    util.save_df(
        df=raw_2008_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2008_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )

    return raw_2008_df

Ejemplo n.º 9

0

Mostrar archivo

def process_2009_law_website_data() -> pd.DataFrame:
    """Convert the raw 2009 law department settlement pdf to a dataframe.

    Pages 1-21 of the pdf are parsed with camelot. Each page's table is
    validated against the expected shape, tagged with a ``Tort Status`` and
    its ``pdf_page_num``, has its jumbled fees / primary cause columns
    separated with ``FEE_AND_PRIM_CASE_PAT``, and has its money columns
    converted to numbers. The per-page frames are then concatenated, the
    date column is parsed, dtypes are applied, whitespace is normalised with
    ``util.strip_and_trim_whitespace`` and the result is saved with
    ``util.save_df``.

    Payment amounts are parsed as floats per page and cast to ``int`` by the
    final ``astype``, so fractional amounts are not preserved.

    Returns:
        The formatted 2009 dataframe (the same data that was saved).

    Raises:
        AssertionError: if any page's header row, shape or marker rows do
            not match the hard-coded expectations.
    """
    # get the path to the pdf
    raw_2009_pdf_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2009_LAW_WEBSITE_DATA_PDF)
    # use camelot to convert the first 21 pages tables to dataframes
    tables = camelot.read_pdf(filepath=str(raw_2009_pdf_path), pages="1-21")
    # expected header text on the first page (extracted into a single cell)
    first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE \nAMOUNT \nCOSTS "
        "\nDEPARTMENT \nTO \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE "
        "\nINVOLVED \nDISPOSITION \nCOMPTROLLER \nTORT")

    # define the dtypes; the datetime entry carries an explicit unit because
    # pandas >= 2.0 refuses a unit-less datetime64 in ``astype``
    raw_2009_df_col_types = collections.OrderedDict({
        "CASE #": str,
        "PAYEE": str,
        "PAYMENT AMOUNT($)": int,
        "FEES & COSTS($)": int,
        "PRIMARY CAUSE": str,
        "CITY DEPARTMENT INVOLVED": str,
        "DISPOSITION": str,
        "DATE TO COMPTROLLER": "datetime64[ns]",
        "Tort Status": str,
        "pdf_page_num": int,
    })
    raw_2009_df_cols = list(raw_2009_df_col_types)

    # special replacements for the payment amount column
    # NOTE(review): every other parenthesised amount maps to a negative
    # number, but "($550.23)" maps to a positive "550.23" - confirm against
    # the source pdf whether the minus sign was dropped by mistake.
    payment_amount_replacements = {
        "(2,144)": "-2144",
        "(14,405)": "-14405",
        "(1,340)": "-1340",
        "(1,000)": "-1000",
        "(1,352)": "-1352",
        "(1,353)": "-1353",
        "(500)": "-500",
        "(1,175)": "-1175",
        "($550.23)": "550.23",
        "$620.00": "620",
        "$223.83": "223.83",
        "$450.00": "450",
        "$2,499.15": "2,499.15",
    }

    last_page = 21

    # per-page frames are collected and concatenated once at the end;
    # ``DataFrame.append`` no longer exists in pandas >= 2.0
    page_frames = []
    for page_num, table in enumerate(tables, start=1):
        # extract the table as a dataframe (copied so the camelot table
        # object itself is never modified)
        table_df = table.df.copy()

        # only the first page carries the header row, in a single cell
        if page_num == 1:
            # check every cell besides first one on first row is empty string
            assert table_df.iloc[0].iloc[1:].eq("").all()
            assert table_df.iloc[0][0] == first_page_header_string
            # now drop that first row
            table_df = table_df.drop(index=[0])
            assert table_df.shape == (46, 8)
            table_df["Tort Status"] = "TORT"
        # special rule for page 20 where there is a split of tort and non-tort
        elif page_num == 20:
            # row 25 is the NON-TORT marker; rows before it are tort and
            # rows after it are non-tort
            assert table_df.loc[25].iloc[0] == "NON-TORT"
            table_df = table_df.drop(index=[25])
            assert table_df.shape == (54, 8)
            table_df.loc[:25, "Tort Status"] = "TORT"
            table_df.loc[25:, "Tort Status"] = "NON-TORT"
        # special rules for last page
        elif page_num == last_page:
            assert table_df.shape == (47, 8)
            # check the last row is the sums
            assert table_df.loc[46][0] == (
                "TOTAL JUDGMENT/VERDICTS & "
                "SETTLEMENTS \n51,155,053 \nTOTAL FEES AND COSTS \n7,660,924 "
                "\nTOTAL JUDGMENT/VERDICTS, SETTLEMENTS, FEES AND COSTS \n58,815,977"
            )
            table_df["Tort Status"] = "NON-TORT"
            # drop the totals row
            table_df = table_df.drop(index=[46])
        else:
            assert table_df.shape == (55, 8)
            table_df["Tort Status"] = "TORT"
        # add a page number
        table_df["pdf_page_num"] = page_num

        # rename the columns
        table_df.columns = raw_2009_df_cols

        # specific value rename since the number was cutoff
        if page_num == 9:
            table_df.loc[13, "PAYMENT AMOUNT($)"] = "1395000"

        # fix the issue with the fees and primary cause column getting jumbled
        assert table_df["FEES & COSTS($)"].notna().all()
        table_df[[
            "FEES & COSTS($)", "PRIMARY CAUSE"
        ]] = (table_df["FEES & COSTS($)"].astype(str).str.cat(
            table_df["PRIMARY CAUSE"]).str.extract(FEE_AND_PRIM_CASE_PAT))

        # do special replacements and convert the numerical columns
        table_df["FEES & COSTS($)"] = (table_df["FEES & COSTS($)"].str.replace(
            ",", "").astype(int))
        # assign the replaced column back: ``replace(inplace=True)`` on
        # ``table_df[col]`` acts on a temporary under pandas Copy-on-Write
        table_df["PAYMENT AMOUNT($)"] = table_df["PAYMENT AMOUNT($)"].replace(
            payment_amount_replacements)
        table_df["PAYMENT AMOUNT($)"] = (
            table_df["PAYMENT AMOUNT($)"].str.replace(",", "").astype(float))

        page_frames.append(table_df)

    # stitch all the pages' tables together
    if page_frames:
        raw_2009_df = pd.concat(page_frames, ignore_index=True)
    else:
        # no tables extracted: fall back to an empty frame with the columns
        raw_2009_df = pd.DataFrame(columns=raw_2009_df_cols)

    # convert to datetime
    raw_2009_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_2009_df["DATE TO COMPTROLLER"])
    # fix dtypes
    raw_2009_df = raw_2009_df.astype(raw_2009_df_col_types)
    # do whitespace fixing
    raw_2009_df = util.strip_and_trim_whitespace(raw_2009_df)

    util.save_df(
        df=raw_2009_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2009_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )

    return raw_2009_df