def process_2021_law_website_data() -> pd.DataFrame:
    """Load the raw 2021 law department settlement spreadsheet, reshape it
    into a clean dataframe, write that out as a csv, and return it.
    """
    # the sheet carries 4 junk rows at the top and 7 at the bottom; the
    # first surviving row supplies the column headers
    settlements_2021 = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
            RAW_C.RAW_2021_LAW_WEBSITE_DATA_EXCEL_FILE),
        sheet_name=RAW_C.RAW_2021_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=7,
    )
    # the excel reader picks up some fully-empty columns; discard them
    settlements_2021.dropna(axis=1, inplace=True, how="all")
    # sanity check: 473 data rows across 8 columns
    assert settlements_2021.shape == (473, 8)
    # parse the date-bearing column(s) into real datetimes
    for date_col in ["DATE TO COMPTROLLER"]:
        settlements_2021[date_col] = pd.to_datetime(settlements_2021[date_col])
    # normalize stray whitespace in headers and string cells
    settlements_2021 = util.strip_and_trim_whitespace(settlements_2021)
    util.save_df(
        df=settlements_2021,
        file_name=RAW_C.RAW_CSV_FORMATTED_2021_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return settlements_2021
def process_2013_law_website_data() -> pd.DataFrame:
    """Load the raw 2013 law department settlement spreadsheet, reshape it
    into a clean dataframe, write that out as a csv, and return it.
    """
    # the sheet carries 4 junk rows at the top and 4 at the bottom; the
    # first surviving row supplies the column headers
    settlements_2013 = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
            RAW_C.RAW_2013_LAW_WEBSITE_DATA_EXCEL_FILE),
        sheet_name=RAW_C.RAW_2013_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=4,
    )
    # sanity check: 1068 data rows across 9 columns
    assert settlements_2013.shape == (1068, 9)
    # the trailing column has no header in the sheet; give it a name
    settlements_2013.rename(
        columns={"Unnamed: 8": "Hidden Column"}, inplace=True)
    # normalize stray whitespace in headers and string cells
    settlements_2013 = util.strip_and_trim_whitespace(settlements_2013)
    util.save_df(
        df=settlements_2013,
        file_name=RAW_C.RAW_CSV_FORMATTED_2013_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return settlements_2013
def process_2014_law_website_data() -> pd.DataFrame:
    """Load the raw 2014 law department settlement spreadsheet, reshape it
    into a clean dataframe, write that out as a csv, and return it.
    """
    # the sheet carries 3 junk rows at the top and 654 at the bottom; the
    # first surviving row supplies the column headers
    settlements_2014 = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
            RAW_C.RAW_2014_LAW_WEBSITE_DATA_EXCEL_FILE),
        sheet_name=RAW_C.RAW_2014_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=3,
        skipfooter=654,
    )
    # sanity check: 1172 data rows across 13 columns
    assert settlements_2014.shape == (1172, 13)
    # the COMPTROLLER column is hidden in the sheet and entirely empty;
    # verify that before discarding it
    assert settlements_2014["COMPTROLLER"].isna().all()
    settlements_2014.drop(columns=["COMPTROLLER"], inplace=True)
    # parse every date-bearing column (the embedded newlines in these
    # header names come straight from the spreadsheet)
    for date_col in ["EFFECTIVE DATE\n", "DATE TO\nCOMPTROLLER", "DUE DATE"]:
        settlements_2014[date_col] = pd.to_datetime(settlements_2014[date_col])
    # normalize stray whitespace in headers and string cells
    settlements_2014 = util.strip_and_trim_whitespace(settlements_2014)
    util.save_df(
        df=settlements_2014,
        file_name=RAW_C.RAW_CSV_FORMATTED_2014_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return settlements_2014
def save_csv_formatted_foia_cpd_payments_data() -> pd.DataFrame:
    """Load the raw unmodified 2004-2018 CPD payment spreadsheet, turn it
    into a workable dataframe, save that as a csv, and return it.
    """
    # the sheet carries 4 junk rows at the top and 3 at the bottom
    cpd_payments = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=3,
    )
    # normalize stray whitespace before converting dtypes
    cpd_payments = util.strip_and_trim_whitespace(cpd_payments)
    # parse the comptroller date into real datetimes
    cpd_payments["DATE TO COMPTROLLER"] = pd.to_datetime(
        cpd_payments["DATE TO COMPTROLLER"]
    )
    # persist the formatted frame as a csv
    util.save_df(
        df=cpd_payments,
        file_name=RAW_C.RAW_CSV_FORMATTED_CPD_PAYMENTS_2004_TO_2018_FOIA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return cpd_payments
def save_csv_formatted_quarterly_police_suit_disp_data() -> pd.DataFrame:
    """Loads the raw unmodified quarterly police lawsuits disposition data,
    changes it into a workable dataframe format, saves that as a csv then
    returns the dataframe as well.

    Returns:
        The formatted disposition dataframe (also written to csv).
    """
    # load the sheet which currently has multiple subtables in it
    unsplit_raw_foia_police_suits_disp_df = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_QUARTERLY_POLICE_SUIT_DISP_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_QUARTERLY_POLICE_SUIT_DISP_FOIA_DATA_EXCEL_SHEET,
    )
    # properly format the subtables into one df
    raw_foia_police_suits_disp_df = format_multitable_df(
        unsplit_raw_foia_police_suits_disp_df,
        header_row_value="Docket Number",
        subheading_col_name="Client Department",
    )
    # strip the leading "Client Department:" label from each value.
    # BUGFIX: this previously used str.lstrip("Client Department:"), but
    # lstrip removes a *set of characters*, not a prefix — it could also
    # chew off leading characters of the real department name (e.g. a
    # value starting with "Department ..."). The anchored regex removes
    # only the literal label.
    raw_foia_police_suits_disp_df["Client Department"] = (
        raw_foia_police_suits_disp_df["Client Department"]
        .str.replace(r"^Client Department:", "", regex=True)
        .str.strip()
    )
    # now save to csv
    util.save_df(
        df=raw_foia_police_suits_disp_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_QUARTERLY_POLICE_SUIT_DISP_FOIA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return raw_foia_police_suits_disp_df
def save_csv_formatted_foia_pending_suits_data() -> pd.DataFrame:
    """Load the raw unmodified pending police lawsuits spreadsheet, turn it
    into a workable dataframe, save that as a csv, and return it.
    """
    # headers live on the second row; the final row is a footer we drop
    pending_suits = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_PENDING_POLICE_SUITS_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_PENDING_POLICE_SUITS_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skipfooter=1,
    )
    # normalize stray whitespace in headers and string cells
    pending_suits = util.strip_and_trim_whitespace(pending_suits)
    # persist the formatted frame as a csv
    util.save_df(
        df=pending_suits,
        file_name=RAW_C.RAW_CSV_FORMATTED_PENDING_POLICE_SUITS_FOTA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return pending_suits
def save_csv_formatted_matter_disp_report_data() -> typing.List[pd.DataFrame]:
    """Loads the raw unmodified matter disposition data, changes each sheet
    into a workable dataframe format, saves each sheet as a csv, then
    returns a list of the dataframes as well.

    Returns:
        One formatted dataframe per sheet, ordered division / department /
        assignee (the order of ``sheet_save_list`` below).
    """
    import re  # local import: used only for safe prefix-pattern escaping

    output_list = []
    # sheet name, subheading col name and the output csv name
    sheet_save_list = [
        (
            RAW_C.RAW_QUARTERLY_MATTER_DISP_REPORT_BY_DIVISION_FOIA_DATA_EXCEL_SHEET,
            "Division",
            RAW_C.RAW_CSV_FORMATTED_MATTER_DISP_REPORT_BY_DIVISION_FOIA_DATA_CSV,
        ),
        (
            RAW_C.RAW_QUARTERLY_MATTER_DISP_REPORT_BY_DEPARTMENT_FOIA_DATA_EXCEL_SHEET,
            "Client Department",
            RAW_C.RAW_CSV_FORMATTED_MATTER_DISP_REPORT_BY_DEPARTMENT_FOIA_DATA_CSV,
        ),
        (
            RAW_C.RAW_QUARTERLY_MATTER_DISP_REPORT_BY_ASSIGNEE_FOIA_DATA_EXCEL_SHEET,
            "Main Assignee",
            RAW_C.RAW_CSV_FORMATTED_MATTER_DISP_REPORT_BY_ASSIGNEE_FOIA_DATA_CSV,
        ),
    ]
    for sheet_name, subtable_col_name, output_csv_name in sheet_save_list:
        unsplit_df = pd.read_excel(
            io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
                RAW_C.RAW_QUARTERLY_MATTER_DISP_REPORT_FOIA_DATA_EXCEL_FILE
            ),
            sheet_name=sheet_name,
        )
        # properly format the subtables into one df
        formatted_df = format_multitable_df(
            unsplit_df,
            header_row_value="Docket Number",
            subheading_col_name=subtable_col_name,
        )
        # strip the leading "<subheading>:" label from the subheading values.
        # BUGFIX: this previously used str.lstrip(subtable_col_name + ":"),
        # but lstrip removes a *set of characters*, not a prefix — it could
        # also chew off leading characters of the real value. The anchored,
        # escaped regex removes only the literal label.
        label_pattern = "^" + re.escape(subtable_col_name + ":")
        formatted_df[subtable_col_name] = (
            formatted_df[subtable_col_name]
            .str.replace(label_pattern, "", regex=True)
            .str.strip()
        )
        output_list.append(formatted_df)
        # now save to csv
        util.save_df(
            df=formatted_df,
            file_name=output_csv_name,
            save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
        )
    return output_list
def save_csv_formatted_foia_tort_payments_data() -> pd.DataFrame:
    """Load the raw unmodified 2001-2007 tort payment spreadsheet, turn it
    into a workable dataframe, save that as a csv, and return it.
    """
    # the sheet carries 4 junk rows at the top; headers follow immediately
    tort_payments = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_FOIA_DATA_DIR.joinpath(
            RAW_C.RAW_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_EXCEL_FILE
        ),
        sheet_name=RAW_C.RAW_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
    )
    # normalize stray whitespace before value fixes
    tort_payments = util.strip_and_trim_whitespace(tort_payments)
    # the money columns use the literal string "NONE" for zero payments;
    # swap that out, then coerce each column to a numeric dtype
    for money_col in ["PAYMENT AMOUNT ($)", "FEES & COSTS ($)"]:
        tort_payments[money_col].replace(
            to_replace={"NONE": 0},
            inplace=True,
        )
        tort_payments[money_col] = pd.to_numeric(tort_payments[money_col])
    # parse the comptroller date into real datetimes
    tort_payments["DATE TO COMPTROLLER"] = pd.to_datetime(
        tort_payments["DATE TO COMPTROLLER"]
    )
    # persist the formatted frame as a csv
    util.save_df(
        df=tort_payments,
        file_name=RAW_C.RAW_CSV_FORMATTED_TORT_PAYMENTS_2001_TO_2007_FOIA_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_FOIA_DATA_DIR,
    )
    return tort_payments
def process_2012_law_website_data() -> pd.DataFrame:
    """Load the raw 2012 law department settlement spreadsheet, reshape it
    into a clean dataframe, write that out as a csv, and return it.
    """
    # the sheet carries 4 junk rows at the top and 6 at the bottom; the
    # first surviving row supplies the column headers
    settlements_2012 = pd.read_excel(
        io=DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
            RAW_C.RAW_2012_LAW_WEBSITE_DATA_EXCEL_FILE),
        sheet_name=RAW_C.RAW_2012_LAW_WEBSITE_DATA_EXCEL_SHEET,
        header=1,
        skiprows=4,
        skipfooter=6,
    )
    # sanity check: 919 rows across 8 columns
    assert settlements_2012.shape == (919, 8)
    # rows 0 and 909 are section labels splitting tort from non-tort cases;
    # verify them, tag each section, then discard the label rows
    assert settlements_2012.loc[0, "CASE #"] == "TORT"
    assert settlements_2012.loc[909, "CASE #"] == "NON-TORT"
    settlements_2012.loc[0:909, "Tort Status"] = "TORT"
    settlements_2012.loc[909:, "Tort Status"] = "NON-TORT"
    settlements_2012.drop(index=[0, 909], inplace=True)
    # normalize stray whitespace in headers and string cells
    settlements_2012 = util.strip_and_trim_whitespace(settlements_2012)
    util.save_df(
        df=settlements_2012,
        file_name=RAW_C.RAW_CSV_FORMATTED_2012_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return settlements_2012
if val >= amount: return np.nan return val df['ENTRIES_PER_INTERVAL'] = df['ENTRIES_PER_INTERVAL'].apply(cap, amount=1000000) df['EXITS_PER_INTERVAL'] = df['EXITS_PER_INTERVAL'].apply(cap, amount=1000000) # add column for busyness which is entries + exits # this is now very easy given we have <ENTRIES/EXITS>_PER_INTERVAL df['BUSYNESS_PER_INTERVAL'] = df.ENTRIES_PER_INTERVAL + df.EXITS_PER_INTERVAL # add any date information that might be important for modeling # do now so we don't have to redo everytime we tweak our model df['MINUTE'] = df['DATETIME'].dt.minute df['HOUR'] = df['DATETIME'].dt.hour df['MONTH'] = df['DATETIME'].dt.month df['YEAR'] = df['DATETIME'].dt.year df['DAYOFWEEK'] = df['DATETIME'].dt.dayofweek df['DAYOFYEAR'] = df['DATETIME'].dt.dayofyear df['WEEKOFYEAR'] = df['DATETIME'].dt.weekofyear # TODO add month busyness? # TODO descriptive statistics???! # http://pandas.pydata.org/pandas-docs/stable/api.html#computations-descriptive-stats # remember, have to have them available at time of prediciton df.to_csv('augmented.csv') # save entire dataframe util.save_df(df, 'turnstile', 'augmented')
def process_2008_law_website_data() -> pd.DataFrame:
    """Loads the raw 2008 settlement data from the law department website,
    converts it from pdf to a pandas dataframe, then saves it as a csv and
    returns the dataframe from the function

    The pdf has one table per page (55 pages). Each page's header row comes
    back from camelot as a single cell, so it is asserted against a known
    string and dropped. Page shapes and the split between tort and non-tort
    rows on the last page are all hard-coded to this specific document.
    """
    # get the path to the pdf
    raw_2008_pdf_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2008_LAW_WEBSITE_DATA_PDF)
    # use camelot to convert the first 55 pages tables to dataframes
    tables = camelot.read_pdf(filepath=str(raw_2008_pdf_path), pages="1-55")
    # camelot collapses the first page's header row into one cell containing
    # exactly this string; used below to verify extraction worked
    first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE \nAMOUNT \nCOSTS "
        "\nDEPARTMENT \nTO \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE "
        "\nINVOLVED \nDISPOSITION \nCOMPTROLLER \nTORT")
    # expected headers for all pages besides the first
    non_first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE "
        "\nDEPARTMENT \nTO \nAMOUNT \nCOSTS \nINVOLVED \nDISPOSITION "
        "\nCOMPTROLLER \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE")
    # define the dtypes (OrderedDict so the keys double as the ordered
    # column names for the output frame)
    raw_2008_df_col_types = collections.OrderedDict({
        "CASE #": str,
        "PAYEE": str,
        "PAYMENT AMOUNT($)": int,
        "FEES & COSTS($)": int,
        "PRIMARY CAUSE": str,
        "CITY DEPARTMENT INVOLVED": str,
        "DISPOSITION": str,
        "DATE TO COMPTROLLER": np.datetime64,
        "Tort Status": str,
        "pdf_page_num": int,
    })
    raw_2008_df_cols = raw_2008_df_col_types.keys()
    # special replacements for the payment amount column: stray letters
    # from pdf extraction artifacts, and parenthesized (accounting-style)
    # negative amounts
    payment_amount_replacements = {
        "A \n5,694": "5694",
        "S \n76,000": "76000",
        "(181)": "-181",
        "(2,374)": "-2374",
    }
    # create an empty dataframe
    raw_2008_df = pd.DataFrame(columns=raw_2008_df_cols)
    last_page = 55
    # append all the pages' tables together
    for index, table in enumerate(tables):
        # camelot's table list is 0-indexed; pdf pages are 1-indexed
        page_num = index + 1
        # extract the table as a dataframe
        table_df = table.df
        # check every cell besides first one on first row is empty string
        assert table_df.iloc[0].iloc[1:].eq("").all()
        if page_num == 1:
            assert table_df.iloc[0][0] == first_page_header_string
        else:
            assert table_df.iloc[0][0] == non_first_page_header_string
        # now drop that first row
        table_df = table_df.drop(index=[0])
        # special shape on first pass
        # check the first row is just the header values in one cell
        if page_num == 1:
            assert table_df.shape == (44, 8)
            table_df["Tort Status"] = "TORT"
        # special rules for last page
        elif page_num == last_page:
            assert table_df.shape == (47, 8)
            # make everything after 37 non tort and everything before tort
            # (row 37 holds the "NON-TORT" section label, row 47 the totals)
            assert table_df[0].loc[37] == "NON-TORT"
            assert table_df.loc[47][0] == (
                "TOTAL JUDGMENT/VERDICTS & "
                "SETTLEMENTS \n129,670,864 \nTOTAL FEES AND COSTS \n6,903,180")
            table_df.loc[:37, "Tort Status"] = "TORT"
            table_df.loc[37:, "Tort Status"] = "NON-TORT"
            # drop the non tort lable row and the last one
            table_df = table_df.drop(index=[37, 47])
        else:
            # special shape on some pages with 50 rows instead of 51
            if page_num in [12, 41, 47, 49, 51, 53]:
                assert table_df.shape == (49, 8)
            else:
                assert table_df.shape == (50, 8)
            table_df["Tort Status"] = "TORT"
        # record which pdf page each row came from, for traceability
        table_df["pdf_page_num"] = page_num
        # rename the column
        table_df.columns = raw_2008_df_cols
        # fix the issue with the fees and primary cause column getting jumbled
        # by concatenating the two cells and re-splitting with the shared
        # regex (FEE_AND_PRIM_CASE_PAT, defined elsewhere in this module)
        table_df[[
            "FEES & COSTS($)", "PRIMARY CAUSE"
        ]] = (table_df["FEES & COSTS($)"].str.cat(
            table_df["PRIMARY CAUSE"]).str.extract(FEE_AND_PRIM_CASE_PAT))
        # do special replacements and convert the numerical columns
        table_df["FEES & COSTS($)"] = (table_df["FEES & COSTS($)"].str.replace(
            ",", "").astype(int))
        table_df["PAYMENT AMOUNT($)"].replace(
            payment_amount_replacements,
            inplace=True,
        )
        table_df["PAYMENT AMOUNT($)"] = (
            table_df["PAYMENT AMOUNT($)"].str.replace(",", "").astype(int))
        # now append
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — this
        # module presumably pins an older pandas; confirm before upgrading
        raw_2008_df = raw_2008_df.append(table_df, ignore_index=True)
    # convert to datetime
    raw_2008_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_2008_df["DATE TO COMPTROLLER"])
    # fix dtypes
    raw_2008_df = raw_2008_df.astype(raw_2008_df_col_types)
    # do whitespace fixing
    raw_2008_df = util.strip_and_trim_whitespace(raw_2008_df)
    # save to csv
    util.save_df(
        df=raw_2008_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2008_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return raw_2008_df
def process_2009_law_website_data() -> pd.DataFrame:
    """Loads the raw 2009 settlement data from the law department website,
    converts it from pdf to a pandas dataframe, then saves it as a csv and
    returns the dataframe

    The pdf has one table per page (21 pages). The first page's header row
    comes back from camelot as a single cell and is asserted against a
    known string. Page shapes, the tort / non-tort split on page 20, and
    the totals row on the last page are hard-coded to this document.
    """
    # get the path to the pdf
    raw_2009_pdf_path = DIR_C.RAW_UNMODIFIED_LAW_WEBSITE_DATA_DIR.joinpath(
        RAW_C.RAW_2009_LAW_WEBSITE_DATA_PDF)
    # use camelot to convert the first 21 pages tables to dataframes
    tables = camelot.read_pdf(filepath=str(raw_2009_pdf_path), pages="1-21")
    first_page_header_string = (
        "PAYMENT \nFEES & \nCITY \nDATE \nAMOUNT \nCOSTS "
        "\nDEPARTMENT \nTO \nCASE # \nPAYEE \n($) \n($) \nPRIMARY CAUSE "
        "\nINVOLVED \nDISPOSITION \nCOMPTROLLER \nTORT")
    # the first pages table has less rows than normal
    # (OrderedDict so the keys double as the ordered output column names)
    raw_2009_df_col_types = collections.OrderedDict({
        "CASE #": str,
        "PAYEE": str,
        "PAYMENT AMOUNT($)": int,
        "FEES & COSTS($)": int,
        "PRIMARY CAUSE": str,
        "CITY DEPARTMENT INVOLVED": str,
        "DISPOSITION": str,
        "DATE TO COMPTROLLER": np.datetime64,
        "Tort Status": str,
        "pdf_page_num": int,
    })
    raw_2009_df_cols = raw_2009_df_col_types.keys()
    # special replacements for the payment amount column: parenthesized
    # (accounting-style) negatives and values with stray currency symbols
    # NOTE(review): "($550.23)" maps to "550.23" — the parentheses elsewhere
    # mean negative, so this may be losing a sign; confirm against the pdf.
    # Also note the final astype to int (via raw_2009_df_col_types) will
    # truncate the decimal amounts introduced here — verify intended.
    payment_amount_replacements = {
        "(2,144)": "-2144",
        "(14,405)": "-14405",
        "(1,340)": "-1340",
        "(1,000)": "-1000",
        "(1,352)": "-1352",
        "(1,353)": "-1353",
        "(500)": "-500",
        "(1,175)": "-1175",
        "($550.23)": "550.23",
        "$620.00": "620",
        "$223.83": "223.83",
        "$450.00": "450",
        "$2,499.15": "2,499.15",
    }
    last_page = 21
    raw_2009_df = pd.DataFrame(columns=raw_2009_df_cols)
    # append all the pages' tables together
    for index, table in enumerate(tables):
        # camelot's table list is 0-indexed; pdf pages are 1-indexed
        page_num = index + 1
        # extract the table as a dataframe
        table_df = table.df.copy()
        # special shape on first pass
        # check the first row is just the header values in one cell
        if page_num == 1:
            # check every cell besides first one on first row is empty string
            assert table_df.iloc[0].iloc[1:].eq("").all()
            assert table_df.iloc[0][0] == first_page_header_string
            # now drop that first row
            table_df = table_df.drop(index=[0])
            assert table_df.shape == (46, 8)
            table_df["Tort Status"] = "TORT"
        # special rule for page 20 where there is a split of tort and non-tort
        elif page_num == 20:
            # row 25 holds the "NON-TORT" section label; drop it after
            # tagging the rows on either side of it
            assert table_df.loc[25].iloc[0] == "NON-TORT"
            table_df = table_df.drop(index=[25])
            assert table_df.shape == (54, 8)
            table_df.loc[:25, "Tort Status"] = "TORT"
            table_df.loc[25:, "Tort Status"] = "NON-TORT"
        # special rules for last page
        elif page_num == last_page:
            assert table_df.shape == (47, 8)
            # check the last row is the sums
            assert table_df.loc[46][0] == (
                "TOTAL JUDGMENT/VERDICTS & "
                "SETTLEMENTS \n51,155,053 \nTOTAL FEES AND COSTS \n7,660,924 "
                "\nTOTAL JUDGMENT/VERDICTS, SETTLEMENTS, FEES AND COSTS \n58,815,977"
            )
            table_df["Tort Status"] = "NON-TORT"
            # drop the non tort lable row and the last one
            table_df = table_df.drop(index=[46])
        else:
            assert table_df.shape == (55, 8)
            table_df["Tort Status"] = "TORT"
        # add a page number
        table_df["pdf_page_num"] = page_num
        # rename the column
        table_df.columns = raw_2009_df_cols
        # specific value rename since the number was cutoff
        if page_num == 9:
            table_df.loc[13, "PAYMENT AMOUNT($)"] = "1395000"
        # fix the issue with the fees and primary cause column getting jumbled
        # by concatenating the two cells and re-splitting with the shared
        # regex (FEE_AND_PRIM_CASE_PAT, defined elsewhere in this module)
        assert table_df["FEES & COSTS($)"].notna().all()
        table_df[[
            "FEES & COSTS($)", "PRIMARY CAUSE"
        ]] = (table_df["FEES & COSTS($)"].astype(str).str.cat(
            table_df["PRIMARY CAUSE"]).str.extract(FEE_AND_PRIM_CASE_PAT))
        # do special replacements and convert the numerical columns
        table_df["FEES & COSTS($)"] = (table_df["FEES & COSTS($)"].str.replace(
            ",", "").astype(int))
        table_df["PAYMENT AMOUNT($)"].replace(
            payment_amount_replacements,
            inplace=True,
        )
        table_df["PAYMENT AMOUNT($)"] = (
            table_df["PAYMENT AMOUNT($)"].str.replace(",", "").astype(float))
        # now append
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — this
        # module presumably pins an older pandas; confirm before upgrading
        raw_2009_df = raw_2009_df.append(table_df, ignore_index=True)
    # convert to datetime
    raw_2009_df["DATE TO COMPTROLLER"] = pd.to_datetime(
        raw_2009_df["DATE TO COMPTROLLER"])
    # fix dtypes
    raw_2009_df = raw_2009_df.astype(raw_2009_df_col_types)
    # do whitespace fixing
    raw_2009_df = util.strip_and_trim_whitespace(raw_2009_df)
    util.save_df(
        df=raw_2009_df,
        file_name=RAW_C.RAW_CSV_FORMATTED_2009_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
    )
    return raw_2009_df
def reformat_and_save(f):
    """Reformat the file ``f`` and save the result.

    Detects whether ``f`` uses the old or the new layout, logs which one
    was detected, reads it with the matching reader, and saves the
    resulting dataframe under the 'reformatted' name.

    Args:
        f: the file identifier passed through to is_old_format / read_old /
           read_new / util.save_df.
    """
    # BUGFIX: the original used Python 2 print statements, which are a
    # SyntaxError under Python 3; converted to print() calls.
    print('old' if is_old_format(f) else 'new', f)
    df = read_old(f) if is_old_format(f) else read_new(f)
    util.save_df(df, f, 'reformatted')
def clean_and_standardize_all_data() -> None:
    """Cleans, standardizes, and saves Law Website data from each year.

    Cleans, standardizes and saves the Law Website data for each year. It
    also saves a single file with all the years combined into one."""
    # list of tuples with (raw_csv, output_csv), ordered by document year
    # starting at 2008 (the enumerate below relies on this ordering)
    processing_list = [
        (
            RAW_C.RAW_CSV_FORMATTED_2008_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2008_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2009_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2009_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2010_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2010_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2011_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2011_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2012_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2012_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2013_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2013_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2014_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2014_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2015_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2015_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2016_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2016_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2017_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2017_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2018_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2018_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2019_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2019_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2020_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2020_LAW_WEBSITE_DATA_CSV,
        ),
        (
            RAW_C.RAW_CSV_FORMATTED_2021_LAW_WEBSITE_DATA_CSV,
            STAN_C.STANDARDIZED_2021_LAW_WEBSITE_DATA_CSV,
        ),
    ]
    rename_dict = STAN_C.LAW_WEBSITE_DATA_COL_STANDARDIZATION_RENAME_DICT
    output_dfs = []
    # enumerate from 2008 replaces the old manual doc_yr counter
    for doc_yr, (raw_csv, output_csv) in enumerate(processing_list, start=2008):
        raw_df = util.load_df(
            file_name=raw_csv,
            save_dir=DIR_C.RAW_CSV_FORMATTED_LAW_WEBSITE_DATA_DIR,
        )
        assert raw_df.columns.isin(
            rename_dict.keys()
        ).all(), f"Not all keys in {raw_csv} are in the rename dict!"
        # rename to the standardized column names
        standardized_df = raw_df.rename(columns=rename_dict)
        # standardize case number and extract relevant info
        standardized_df = case_num_parsing.standardize_case_num_info(standardized_df)
        # remove the stray trailing " 0931" from department names (included
        # one year for some reason).
        # BUGFIX: this previously used str.rstrip(" 0931"), but rstrip
        # removes a *character set* {space, 0, 9, 3, 1} — it could also
        # delete legitimate trailing digits or spaces from other department
        # names. The anchored regex removes only the literal suffix.
        standardized_df[STAN_C.CITY_DEPARTMENT_INVOLVED_COL] = standardized_df[
            STAN_C.CITY_DEPARTMENT_INVOLVED_COL
        ].str.replace(r"\s*0931$", "", regex=True)
        # if any numeric columns not numeric then fix them
        already_numeric_cols = standardized_df.select_dtypes("number").columns
        # remove any dollar signs, commas or whitespace from payment columns
        for money_col in [STAN_C.PAYMENT_AMOUNT_COL, STAN_C.FEES_AND_COSTS_COL]:
            # skip if already numeric
            if money_col in already_numeric_cols:
                continue
            standardized_df[money_col] = pd.to_numeric(
                standardized_df[money_col].str.replace(
                    pat=r"[\,\$\s]+", repl="", regex=True
                )
            )
        # tag each row with the document year it came from
        standardized_df[STAN_C.DATA_SOURCE_COL] = f"law_dept_website_{doc_yr}"
        # save the per-year output
        util.save_df(
            df=standardized_df,
            file_name=output_csv,
            save_dir=DIR_C.CLEANED_AND_STANDARDIZED_LAW_WEBSITE_DATA_DIR,
        )
        output_dfs.append(standardized_df)
    # finally combine every year into one file and save it
    all_yrs_output_df = pd.concat(output_dfs)
    util.save_df(
        df=all_yrs_output_df,
        file_name=STAN_C.STANDARDIZED_ALL_YEARS_LAW_WEBSITE_DATA_CSV,
        save_dir=DIR_C.CLEANED_AND_STANDARDIZED_LAW_WEBSITE_DATA_DIR,
    )