def send_sheet_fixes_to_postgres(sheet):
    """Read the Google Sheet, find rows marked as fixed, and push each fix
    into the low_accuracies table in Postgres.

    sheet: a worksheet whose columns A:P mirror low_accuracies
           (column headers in the first row).
    """
    sheet_records = sheet.batch_get(["A:P"])[0]
    # first record is the header row
    column_names = sheet_records.pop(0)
    sheet_df = pd.DataFrame(sheet_records, columns=column_names).rename(
        columns=sql_sheet_column_names_map)
    # keep only rows a human has marked fixed (sheet renders the value as "TRUE")
    sheet_df = sheet_df[sheet_df["fixed"] == "TRUE"]
    if len(sheet_df) == 0:
        myprint("Nobody has fixed any jobs in the Google Sheet.")
        return
    myprint(f"{len(sheet_df)} jobs have been fixed in the Google Sheet. Sending these changes to Postgres now.")
    # sheet values arrive as strings - coerce them to the types Postgres expects
    sheet_df["housing accuracy"] = pd.to_numeric(sheet_df["housing accuracy"])
    sheet_df["housing_lat"] = pd.to_numeric(sheet_df["housing_lat"])
    sheet_df["housing_long"] = pd.to_numeric(sheet_df["housing_long"])
    sheet_df["id"] = pd.to_numeric(sheet_df["id"])
    sheet_df["Date of run"] = pd.to_datetime(sheet_df["Date of run"])
    sheet_df["fixed"] = sheet_df["fixed"].astype('bool')
    # the statement is identical for every row, so build it once and reuse a
    # single connection instead of reconnecting on every iteration
    query = text(
        """UPDATE low_accuracies SET (fixed, housing_lat, housing_long, "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_STATE", "HOUSING_POSTAL_CODE", notes, housing_fixed_by) = (:fixed, :lat, :long, :address, :city, :state, :zip, :notes, :fixed_by) WHERE id = :id""")
    with engine.connect() as connection:
        for i, job in sheet_df.iterrows():
            connection.execute(query,
                               fixed=job["fixed"],
                               lat=job["housing_lat"],
                               long=job["housing_long"],
                               address=job["HOUSING_ADDRESS_LOCATION"],
                               city=job["HOUSING_CITY"],
                               state=job["HOUSING_STATE"],
                               zip=job["HOUSING_POSTAL_CODE"],
                               notes=job["notes"],
                               fixed_by=job["housing_fixed_by"],
                               id=job["id"])
def fix_previously_fixed():
    """Re-apply saved housing fixes to matching rows of low_accuracies.

    Joins low_accuracies to the previously_fixed table on the concatenation
    of the four housing address fields, then copies the stored fix into each
    matching low_accuracies row.
    """
    # full_address = concatenation of the housing address columns, matching
    # the initial_address key used when fixes were archived
    previously_fixed_query = """
        SELECT * FROM (
            (SELECT COALESCE("HOUSING_ADDRESS_LOCATION", '') || COALESCE("HOUSING_CITY", '') || COALESCE("HOUSING_STATE", '') || COALESCE("HOUSING_POSTAL_CODE", '') as full_address, id AS low_acc_id FROM low_accuracies) AS low_acc
            INNER JOIN
            (SELECT DISTINCT ON (initial_address) * FROM previously_fixed) AS prev_fixed
            ON prev_fixed.initial_address = low_acc.full_address)
        """
    previously_fixed_df = pd.read_sql(previously_fixed_query, con=engine)
    myprint(
        f"There are {len(previously_fixed_df)} rows in low_accuracies whose exact housing address column has been fixed before."
    )
    # the statement is identical for every row - build it once and reuse a
    # single connection rather than reconnecting per row
    update_query = text("""
        UPDATE low_accuracies SET
            "HOUSING_ADDRESS_LOCATION" = :address,
            "HOUSING_CITY" = :city,
            "HOUSING_POSTAL_CODE" = :zip,
            "HOUSING_STATE" = :state,
            "fixed" = :fixed,
            "housing accuracy" = :accuracy,
            "housing accuracy type" = :accuracy_type,
            "housing_fixed_by" = :fixed_by,
            "housing_lat" = :lat,
            "housing_long" = :long,
            "notes" = :notes
        WHERE "id" = :id
        """)
    with engine.connect() as connection:
        for i, fixed in previously_fixed_df.iterrows():
            connection.execute(update_query,
                               address=fixed["HOUSING_ADDRESS_LOCATION"],
                               city=fixed["HOUSING_CITY"],
                               zip=fixed["HOUSING_POSTAL_CODE"],
                               state=fixed["HOUSING_STATE"],
                               fixed=fixed["fixed"],
                               accuracy=fixed["housing accuracy"],
                               accuracy_type=fixed["housing accuracy type"],
                               fixed_by=fixed["housing_fixed_by"],
                               lat=fixed["housing_lat"],
                               long=fixed["housing_long"],
                               notes=fixed["notes"],
                               id=fixed["low_acc_id"])
    myprint("Successfully fixed all previously fixed rows in low accuracies.")
def add_data_as_table(path_to_data, table_name, excel_or_csv=""):
    """Load an excel or csv file and create a Postgres table from it.

    path_to_data: path of the file to load.
    table_name: name of the table to create.
    excel_or_csv: must be 'excel' or 'csv'.

    Raises ValueError if excel_or_csv is not one of the supported formats.
    """
    if excel_or_csv == "excel":
        data = pd.read_excel(path_to_data)
    elif excel_or_csv == "csv":
        data = pd.read_csv(path_to_data)
    else:
        # was a bare Exception whose message named a non-existent
        # "add_data_to_table" - use the real function name and a ValueError
        raise ValueError(
            "`excel_or_csv` parameter of add_data_as_table function must be either 'excel' or 'csv'."
        )
    myprint(f"Adding {len(data)} rows to {table_name}...")
    data.to_sql(table_name, engine, index=False)
    myprint(
        f"Successfully added the {excel_or_csv} file {path_to_data} as a table named {table_name}."
    )
def mark_inactive_dolH_as_fixed():
    """Mark inaccurate additional-housing (dol_h) rows of low_accuracies as
    fixed when the job they belong to is no longer active.

    Each dol_h row is matched by CASE_NUMBER against job_central plus the
    non-dol_h rows of low_accuracies; if the parent job's status is past
    "started", the dol_h row is updated to fixed = true.
    """
    acc_case_nums_statuses_df = pd.read_sql(
        """select "CASE_NUMBER", status from job_central""", con=engine)
    inacc_case_nums_statuses_df = pd.read_sql(
        """select "CASE_NUMBER", status from low_accuracies where "table" != 'dol_h'""",
        con=engine)
    # check that there are no duplicate case numbers
    case_nums_list = acc_case_nums_statuses_df["CASE_NUMBER"].tolist(
    ) + inacc_case_nums_statuses_df["CASE_NUMBER"].tolist()
    assert len(case_nums_list) == len(set(case_nums_list))
    # pd.concat instead of DataFrame.append (deprecated, removed in pandas 2.0)
    all_case_nums_statuses_df = pd.concat(
        [acc_case_nums_statuses_df, inacc_case_nums_statuses_df])
    inaccurate_additional_housings = pd.read_sql(
        """select "id", "CASE_NUMBER" from low_accuracies where "table" = 'dol_h'""",
        con=engine)
    myprint(
        f"There are {len(inaccurate_additional_housings)} inaccurate additional housing rows."
    )
    case_nums_with_no_matches = []
    num_fixed = 0
    # iterate over all inaccurate additional housing case numbers and ids. if a
    # row's corresponding row in job_central is inactive, update the row's
    # fixed column to be true
    for i, job in inaccurate_additional_housings.iterrows():
        job_case_num = job["CASE_NUMBER"]
        central_job = all_case_nums_statuses_df[
            all_case_nums_statuses_df["CASE_NUMBER"] == job_case_num]
        if len(central_job) > 1:
            # duplicates were asserted away above, so this should never fire
            myprint(
                f"{job_case_num} has {len(central_job)} matching rows in the dataframe with all non dol_h case numbers. This should be impossible."
            )
        elif len(central_job) == 0:
            myprint(
                f"{job_case_num} has no matching rows in the dataframe with all non dol_h case numbers."
            )
            case_nums_with_no_matches.append(job_case_num)
        else:
            job_status = central_job["status"].tolist()[0]
            # any status past "started" counts as inactive
            if job_status not in ["not yet started", "started"]:
                job_id = job["id"]
                make_query(f"""UPDATE low_accuracies SET fixed = true, worksite_fixed_by = 'inactive', housing_fixed_by = 'inactive' WHERE "id" = {job_id}""")
                num_fixed += 1
    # myprint(f"There are {len(case_nums_with_no_matches)} additional housing rows in low_accuracies without a matching central case number.")
    # myprint(f"There are {len(set(case_nums_with_no_matches))} additional housing unique case numbers in low_accuracies without a matching central case number.")
    myprint(
        f"{num_fixed} additional housing rows in low accuracies were marked as fixed."
    )
def overwrite_feature(username, password, new_df, old_feature_name):
    """Replace the contents of an existing ArcGIS feature layer with new_df.

    Writes new_df to a temporary csv, overwrites the first feature layer
    whose title matches old_feature_name, then deletes the csv.
    """
    gis = GIS(url='https://www.arcgis.com', username=username, password=password)
    # print("Logged in as " + str(gis.properties.user.username))
    csv_file_name = f"{old_feature_name}.csv"
    new_df.to_csv(csv_file_name, index=False)
    try:
        # get first search result
        old_jobs_item = gis.content.search(f"title: {old_feature_name}",
                                           'Feature Layer')[0]
        old_feature_layer = FeatureLayerCollection.fromitem(old_jobs_item)
        myprint(
            f"Overwriting feature layer.... there will now be {len(new_df)} features."
        )
        old_feature_layer.manager.overwrite(csv_file_name)
        myprint('Done overwriting feature layer.')
    finally:
        # clean up the temp csv even if the search or overwrite fails
        os.remove(csv_file_name)
def append_excels_to_their_tables():
    """Interactively append each DOL excel file to its Postgres table.

    Prompts for the fiscal year and quarter, reads the file->table mapping
    spreadsheet, and appends each listed file via append_excel_to_table.
    Failures are reported but do not stop the remaining files.
    """
    year = input("What year is it? (eg: 2020)\n").strip()
    quarter = int(input("What quarter is it? (enter 1, 2, 3, or 4)\n").strip())
    # fixed "othewise" typo in the confirmation prompt
    input(f"Ok, appending excel files for fiscal year {year}Q{quarter}. If this is correct press any key, otherwise press control + c to start over.")
    files_to_tables_map = pd.read_excel(os.path.join(os.getcwd(), '..', "excel_files/dol_table_file_mappings.xls"))
    myprint(f"Will be appending {len(files_to_tables_map)} files to their respective tables.")
    for i, pair in files_to_tables_map.iterrows():
        file_name, table_name = pair["file_name"].strip(), pair["table_name"].strip()
        myprint(f"Appending {file_name} to {table_name}.\n")
        try:
            append_excel_to_table(file_name, table_name, year, quarter)
            myprint("Success!")
        except Exception as error:
            # keep going so one bad file doesn't block the rest
            myprint(f"That didn't work, here's the error message:\n{str(error)}\n")
def add_worksites_to_postgres():
    """Interactively load an additional-worksites excel file into the
    additional_worksites table, then drop the previous quarter's rows.

    Prompts for the file name, fiscal year, and quarter; the file must live
    in the `dol_data` folder inside `scripts`.
    """
    file_path = "dol_data/" + input(
        "Check that the additional worksites file is in a folder named `dol_data` in the `scripts` folder. (If it isn't, exit this using control + c then re-run this script once you've done it.) Now enter the name of the file (this is case sensitive).\n"
    ).strip()
    year = input("What year is it? (eg: 2020)\n").strip()
    # fixed "What quarter it is?" grammar in the prompt
    quarter = int(input("What quarter is it? (enter 1, 2, 3, or 4)\n").strip())
    # fixed "othewise" typo in the confirmation prompt
    input(
        f"Ok, adding worksites from {file_path} for fiscal year {year}Q{quarter}. If this is correct press any key, otherwise press control + c to start over."
    )
    worksites = pd.read_excel(file_path)
    worksites = manage_worksites(worksites, year, quarter)
    myprint(f"Adding {len(worksites)} rows to additional_worksites table.")
    worksites.to_sql("additional_worksites",
                     engine,
                     if_exists='append',
                     index=False)
    # quarter is already an int, so the redundant int() casts are gone.
    # Q1 keeps the prior quarter's rows (they belong to the previous year).
    if quarter != 1:
        make_query(
            f"""DELETE FROM additional_worksites WHERE fy = '{year}Q{quarter - 1}'"""
        )
def append_excel_to_table(file_name, table_name, year, quarter):
    """Append one DOL excel file to a Postgres table, stamping every row
    with the fiscal year/quarter and clearing the previous quarter's rows."""
    table_already_exists = table_name in all_table_names
    if not table_already_exists:
        myprint(f"The table {table_name} doesn't exist yet, so it will be added.")
    elif check_for_new_columns(file_name, table_name):
        # unknown columns would make the append fail part-way through
        raise Exception(f"Since there are columns in {file_name} that aren't in {table_name}, we can't continue this process or it will fail. Either add the missing columns to PostgreSQL or change the column names in the excel file to match ones that already exist in Postgres.")
    excel_rows = pd.read_excel(f"dol_data/{file_name}")
    excel_rows["fy"] = f"{year}Q{quarter}"
    myprint(f"Adding {len(excel_rows)} rows from {file_name} to {table_name}.")
    excel_rows.to_sql(table_name, engine, if_exists='append', index=False)
    myprint("Done adding rows.")
    current_quarter = int(quarter)
    if current_quarter != 1:
        make_query(f"""DELETE FROM {table_name} WHERE fy = '{year}Q{current_quarter - 1}'""")
        # NOTE(review): this chunk begins mid-function - the lines below are the
        # tail of a helper whose start is above this view; it appears to return
        # four accuracy flags. Confirm against the full file.
        else:
            h2b_inaccurates_inaccurate = True
    return worksites_accurate, housings_accurate, h2a_inaccurates_inaccurate, h2b_inaccurates_inaccurate


# converters applied when reading the fixture spreadsheets; postal codes are
# forced to str (presumably to preserve leading zeros - TODO confirm)
type_conversions = {'fixed': bool, 'housing_fixed_by': str, 'worksite_fixed_by': str, "HOUSING_POSTAL_CODE": str, "WORKSITE_POSTAL_CODE": str}
accurate_new_jobs = pd.read_excel(os.path.join(os.getcwd(), '..', "excel_files/accurate_dol_geocoded.xlsx"), converters=type_conversions)
inaccurate_new_jobs = pd.read_excel(os.path.join(os.getcwd(), '..', "excel_files/inaccurate_dol_geocoded.xlsx"), converters=type_conversions)
accurate_old_jobs = pd.read_excel(os.path.join(os.getcwd(), '..', "excel_files/accurates_geocoded.xlsx"), converters=type_conversions)
inaccurate_old_jobs = pd.read_excel(os.path.join(os.getcwd(), '..', "excel_files/inaccurates_geocoded.xlsx"), converters=type_conversions)
myprint("Start of add housing test", is_red="red")
# geocode the housing addendum fixture for 2020 Q3 and split rows by accuracy
housing = pd.read_excel(os.path.join(os.getcwd(), '..', 'excel_files/housing_addendum.xlsx'))
accurate_housing, inaccurate_housing = geocode_manage_split_housing(housing, 2020, 3)


class TestAddHousing(unittest.TestCase):
    """Checks geocode_manage_split_housing against the housing addendum fixture."""

    # fixture is expected to split 9 accurate / 1 inaccurate, all tagged dol_h
    def test_length_and_table_column(self):
        self.assertEqual(len(accurate_housing), 9)
        self.assertEqual(len(inaccurate_housing), 1)
        self.assertTrue((accurate_housing["table"] == "dol_h").all() and (inaccurate_housing["table"] == "dol_h").all())

    # accurate rows: accuracy >= 0.7 with a good accuracy type, OR outside our states;
    # inaccurate rows: null/low accuracy, or a bad accuracy type
    def test_accuracies(self):
        accurates_are_accurate = (((accurate_housing["housing accuracy"] >= 0.7) & (~(accurate_housing["housing accuracy type"].isin(bad_accuracy_types)))) | (~accurate_housing["HOUSING_STATE"].str.lower().isin(helpers.our_states))).all()
        self.assertTrue(accurates_are_accurate)
        inaccurates_are_inaccurate = ((inaccurate_housing["housing accuracy"].isnull()) | (inaccurate_housing["housing accuracy"] < 0.7) | (inaccurate_housing["housing accuracy type"].isin(bad_accuracy_types))).all()
        self.assertTrue(inaccurates_are_inaccurate)

    # every row should be stamped with the fiscal year/quarter passed above
    def test_fy_column(self):
        self.assertTrue(all(accurate_housing["fy"] == "2020Q3"))
        self.assertTrue(all(inaccurate_housing["fy"] == "2020Q3"))
def send_fixes_to_postgres():
    """Move human-fixed rows out of low_accuracies.

    implement_fixes splits the fixed rows into: rows bound for job_central,
    rows bound for additional_housing, and failed fixes that stay in
    low_accuracies. Reusable fixes are archived in previously_fixed keyed by
    their original address.
    """
    fixed = pd.read_sql_query('select * from low_accuracies where fixed=true',
                              con=engine)
    if len(fixed) == 0:
        myprint("No jobs have been fixed.")
        return
    myprint(f"{len(fixed)} jobs have been fixed.")
    central, housing, failures = implement_fixes(fixed)
    assert len(central) + len(housing) + len(failures) == len(fixed)
    myprint(f"{len(central)} rows moving from low_accuracies to job_central.")
    myprint(
        f"{len(housing)} rows moving from low_accuracies to additional housing."
    )
    myprint(f"{len(failures)} failed fixes.")
    # saving appropriate fixes to previously_fixed table
    prev_fixed_columns = [
        "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_POSTAL_CODE",
        "HOUSING_STATE", "fixed", "housing accuracy", "housing accuracy type",
        "housing_fixed_by", "housing_lat", "housing_long", "notes"
    ]
    fixed_by_vals_to_save_fixes_for = ["coordinates", "address", "impossible"]
    central_for_prev_fixed_table = central[central["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    housing_for_prev_fixed_table = housing[housing["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    # pd.concat instead of deprecated DataFrame.append
    for_prev_fixed_table = pd.concat(
        [central_for_prev_fixed_table, housing_for_prev_fixed_table])
    if not for_prev_fixed_table.empty:
        myprint(
            f"There are {len(for_prev_fixed_table)} rows to add to the previously_fixed table."
        )
        # key each saved fix by the concatenation of its original address fields
        for_prev_fixed_table["initial_address"] = for_prev_fixed_table.apply(
            lambda job: handle_null(job["HOUSING_ADDRESS_LOCATION"]) +
            handle_null(job["HOUSING_CITY"]) + handle_null(job[
                "HOUSING_STATE"]) + handle_null(job["HOUSING_POSTAL_CODE"]),
            axis=1)
        for_prev_fixed_table = for_prev_fixed_table.drop_duplicates(
            subset='initial_address', keep="last")
        for_prev_fixed_table.to_sql("previously_fixed",
                                    engine,
                                    if_exists='append',
                                    index=False)
        myprint("All rows successfully added to the previously_fixed table.")
    else:
        myprint("No rows to add to the previously_fixed table.")
    # adding fixes to appropriate tables and deleting appropriate rows from low_accuracies
    central.to_sql('job_central', engine, if_exists='append', index=False)
    housing.to_sql('additional_housing', engine, if_exists='append', index=False)
    # BUG FIX: appending failures with to_sql collides with low_accuracies'
    # primary key (those ids already exist), so update the rows in place instead
    query = text(
        """UPDATE low_accuracies SET (fixed, housing_lat, housing_long, "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_STATE", "HOUSING_POSTAL_CODE", notes, housing_fixed_by) = (:fixed, :lat, :long, :address, :city, :state, :zip, :notes, :fixed_by) WHERE id = :id""")
    with engine.connect() as connection:
        for i, job in failures.iterrows():
            connection.execute(query,
                               fixed=job["fixed"],
                               lat=job["housing_lat"],
                               long=job["housing_long"],
                               address=job["HOUSING_ADDRESS_LOCATION"],
                               city=job["HOUSING_CITY"],
                               state=job["HOUSING_STATE"],
                               zip=job["HOUSING_POSTAL_CODE"],
                               notes=job["notes"],
                               fixed_by=job["housing_fixed_by"],
                               id=job["id"])
    make_query("delete from low_accuracies where fixed=true")
    myprint(
        f"Done implementing fixes. There were {len(failures)} failed fixes out of {len(fixed)} attempts."
    )
def overwrite_our_feature():
    """Rebuild the 'H2Data' ArcGIS feature layer from Postgres.

    Combines, restricted to our states: accurate H-2A jobs with housing
    coordinates, H-2A jobs without housing coordinates (mapped at their
    worksite), forestry H-2B jobs (mapped at their worksite), and
    additional_housing rows whose case number matches an H-2A job.
    """
    # get all accurate h2a jobs that are in one of our states and have housing coordinates
    h2a_df = pd.read_sql(
        """SELECT * FROM job_central WHERE "Visa type" = 'H-2A' AND LOWER("WORKSITE_STATE") IN ('texas', 'tx', 'kentucky', 'ky', 'tennessee', 'tn', 'arkansas', 'ar', 'louisiana', 'la', 'mississippi', 'ms', 'alabama', 'al') AND housing_lat IS NOT NUll AND housing_long IS NOT NULL""",
        con=engine)
    # get all h2a jobs from job_central that are in one of our states and do not have housing coordinates
    h2a_no_housing_df = pd.read_sql(
        """SELECT * FROM job_central WHERE "Visa type" = 'H-2A' AND LOWER("WORKSITE_STATE") IN ('texas', 'tx', 'kentucky', 'ky', 'tennessee', 'tn', 'arkansas', 'ar', 'louisiana', 'la', 'mississippi', 'ms', 'alabama', 'al') AND (housing_lat IS NUll OR housing_long IS NULL)""",
        con=engine)
    # for arcGIS map purposes because color-coding is based on this column
    h2a_df["TOTAL_OCCUPANCY"].fillna(600, inplace=True)
    h2a_no_housing_df["TOTAL_OCCUPANCY"].fillna(600, inplace=True)
    # get all forestry h2b jobs from job_central that are in one of our states
    forestry_h2b_in_our_states_df = pd.read_sql(
        """SELECT * FROM job_central WHERE "Visa type" = 'H-2B' AND "SOC_CODE" IN ('45-4011.00', '45-4011') AND LOWER("WORKSITE_STATE") IN ('texas', 'tx', 'kentucky', 'ky', 'tennessee', 'tn', 'arkansas', 'ar', 'louisiana', 'la', 'mississippi', 'ms', 'alabama', 'al') """,
        con=engine)
    # set housing coordinates of h2b jobs and h2a jobs without housing to their
    # worksite coordinates so that arcGIS will map them. Plain column copies
    # replace the old row-wise .apply, which did the same thing much slower.
    forestry_h2b_in_our_states_df["housing_lat"] = forestry_h2b_in_our_states_df["worksite_lat"]
    forestry_h2b_in_our_states_df["housing_long"] = forestry_h2b_in_our_states_df["worksite_long"]
    h2a_no_housing_df["housing_lat"] = h2a_no_housing_df["worksite_lat"]
    h2a_no_housing_df["housing_long"] = h2a_no_housing_df["worksite_long"]
    # combine h2a and forestry data (pd.concat instead of deprecated .append)
    h2a_and_h2b_df = pd.concat([h2a_df, forestry_h2b_in_our_states_df])
    h2a_housing_and_no_housing_and_h2b_df = pd.concat(
        [h2a_and_h2b_df, h2a_no_housing_df])
    # get all additional housing rows that are in one of our states and that have a matching case number in job_central
    additional_housing_df = pd.read_sql(
        """SELECT * FROM additional_housing WHERE "CASE_NUMBER" IN (SELECT "CASE_NUMBER" FROM job_central WHERE "Visa type" = 'H-2A' AND LOWER("WORKSITE_STATE") IN ('texas', 'tx', 'kentucky', 'ky', 'tennessee', 'tn', 'arkansas', 'ar', 'louisiana', 'la', 'mississippi', 'ms', 'alabama', 'al')) """,
        con=engine)
    myprint(f"There will be {len(h2a_df)} normal H2A jobs in the feature.")
    myprint(
        f"There will be {len(h2a_no_housing_df)} H2A jobs mapped using their worksites in the feature."
    )
    myprint(
        f"There will be {len(forestry_h2b_in_our_states_df)} forestry H2B jobs in the feature."
    )
    myprint(
        f"There will be {len(additional_housing_df)} additional housing rows in the feature."
    )
    # get columns that are in the h2a data but not the additional housing data
    # and add each one to the additional housing dataframe
    cols_only_in_h2a = set(h2a_df.columns) - set(additional_housing_df.columns)
    for column in cols_only_in_h2a:
        additional_housing_df[column] = None
    # for each additional housing row, find its matching row in job_central and
    # insert the data about that case number that is in job_central but not the
    # additional_housing row
    for i, row in additional_housing_df.iterrows():
        case_number = row["CASE_NUMBER"]
        job_in_h2a = h2a_df[h2a_df["CASE_NUMBER"] == case_number]
        if len(job_in_h2a) == 1:
            for column in cols_only_in_h2a:
                additional_housing_df.at[i, column] = get_value(
                    job_in_h2a, column)
        else:
            print_red_and_email(
                f"{case_number} is in additional_housing, so I looked for it in job_central, and found a number of matching rows not equal to 1.",
                "Overwriting ArcGIS Layer")
    # append completed additional_housing df to the h2a and forestry data
    full_layer = pd.concat(
        [h2a_housing_and_no_housing_and_h2b_df, additional_housing_df])
    overwrite_feature(ARCGIS_USERNAME, ARCGIS_PASSWORD, full_layer, 'H2Data')
def write_dataframe(sheet, df):
    """Write df (header row followed by values) to the given worksheet.

    NaNs are rendered as empty strings. Works on a filled copy so the
    caller's dataframe is not mutated (the old inplace fillna changed the
    argument as a side effect).
    """
    df = df.fillna("")
    sheet.update([df.columns.values.tolist()] + df.values.tolist())
    myprint(f"Wrote dataframe to {sheet}")
def open_sheet(client, file_name="", sheet_name=""):
    """Open spreadsheet file_name with the given client and return its
    worksheet named sheet_name."""
    spreadsheet = client.open(file_name)
    worksheet = spreadsheet.worksheet(sheet_name)
    myprint(f"Opened worksheet '{sheet_name}' in file '{file_name}'.")
    return worksheet
def update_database():
    """Fetch the latest Apify scraper run, normalize each job's keys and
    derived columns, archive the raw rows, then geocode and merge the new
    jobs into the main tables."""
    latest_jobs = requests.get(most_recent_run_url).json()
    # use these two lines if you're updating using a local csv file
    # latest_jobs = pd.read_csv("dataset_apify-dol-actor-AJaB_2022-05-03_22-00-16-286.csv").drop(columns=["Unnamed: 0"])
    # latest_jobs = latest_jobs.to_dict('records')
    if not latest_jobs:
        myprint("No new jobs.")
        return
    myprint(f"There are {len(latest_jobs)} new jobs.")

    # use this version of parse function if updating data using a local csv file rather than an apify scraper run
    # def parse(job):
    #     column_mappings_dict = column_name_mappings
    #     columns_names_dict = {"Section A": "Job Info", "Section C": "Place of Employment Info", "Section D":"Housing Info"}
    #     parsed_job = {}
    #     for key in job:
    #         if "Section A" in key or "Section C" in key or "Section D" in key:
    #             section = key.split("/")[0]
    #             key_name = key.replace(section, columns_names_dict[section])
    #             if key_name in column_mappings_dict:
    #                 parsed_job[column_mappings_dict[key_name]] = job[key]
    #             else:
    #                 parsed_job[key_name] = job[key]
    #         else:
    #             if key in column_mappings_dict:
    #                 parsed_job[column_mappings_dict[key]] = job[key]
    #             else:
    #                 parsed_job[key] = job[key]
    #
    #     return parsed_job

    # given a dictionary `job` from the scraper data, returns another dictionary
    # with keys modified to match those in postgres
    def parse(job):
        # dictionary mapping apify column names to postgres column names
        column_mappings_dict = column_name_mappings
        columns_names_dict = {
            "Section A": "Job Info",
            "Section C": "Place of Employment Info",
            "Section D": "Housing Info"
        }
        parsed_job = {}
        for key in job:
            if key not in ["Section D", "Section C", "Section A"]:
                if key in column_mappings_dict:
                    # use the postgres column name
                    parsed_job[column_mappings_dict[key]] = job[key]
                else:
                    parsed_job[key] = job[key]
            else:
                # section keys hold nested dicts - flatten as "<Section>/<field>"
                inner_dict = job[key]
                for inner_key in inner_dict:
                    key_name = columns_names_dict[key] + "/" + inner_key
                    if key_name in column_mappings_dict:
                        # use the postgres column name
                        parsed_job[column_mappings_dict[
                            key_name]] = inner_dict[inner_key]
                    else:
                        parsed_job[key_name] = inner_dict[inner_key]
        return parsed_job

    # get date of run for the last actor run
    date_of_run = requests.get(date_of_run_url).json()["data"]["finishedAt"]

    # adds calculated fields to the dictionary job
    def add_necessary_columns(job):
        # add source and date of run column
        job["Source"], job["table"] = "Apify", "central"
        job["Date of run"] = date_of_run
        if helpers.h2a_or_h2b(job) == "H-2A":
            job["Visa type"] = "H-2A"
            zip_code_columns = [
                "EMPLOYER_POSTAL_CODE", "WORKSITE_POSTAL_CODE",
                "Place of Employment Info/Postal Code", "HOUSING_POSTAL_CODE"
            ]
            if "TOTAL_WORKERS_H-2A_REQUESTED" in job:
                job["TOTAL_WORKERS_NEEDED"] = job[
                    "TOTAL_WORKERS_H-2A_REQUESTED"]
            # workers-to-housing ratio, only when both values are present/truthy
            workers_needed, occupancy = job["TOTAL_WORKERS_NEEDED"], job.get(
                "TOTAL_OCCUPANCY", None)
            if workers_needed and occupancy:
                if workers_needed > occupancy:
                    job["W to H Ratio"] = "W>H"
                elif workers_needed < occupancy:
                    job["W to H Ratio"] = "W<H"
                else:
                    job["W to H Ratio"] = "W=H"
            # fix case number if it's malformed in this way (sometimes they have been in the past)
            if job["CASE_NUMBER"][0] == "3":
                job["CASE_NUMBER"] = "H-" + job["CASE_NUMBER"]
        elif helpers.h2a_or_h2b(job) == "H-2B":
            job["Visa type"] = "H-2B"
            zip_code_columns = ["EMPLOYER_POSTAL_CODE", "WORKSITE_POSTAL_CODE"]
        else:
            # unknown visa type: nothing to normalize
            job["Visa type"], zip_code_columns = "", []
        # fix zip code columns
        for column in zip_code_columns:
            if column in job:
                fixed_zip_code = helpers.fix_zip_code(job[column])
                job[column] = fixed_zip_code
        return job

    # parse each job, add necessary columns to each job, drop duplicates case numbers
    parsed_jobs = [parse(job) for job in latest_jobs]
    full_jobs = [add_necessary_columns(job) for job in parsed_jobs]
    full_jobs_df = pd.DataFrame(full_jobs).drop_duplicates(
        subset="CASE_NUMBER", keep="last")

    # convert date columns to type datetime
    date_columns = [
        "RECEIVED_DATE", "EMPLOYMENT_END_DATE", "EMPLOYMENT_BEGIN_DATE",
        "Date of run"
    ]
    for column in date_columns:
        full_jobs_df[column] = pd.to_datetime(full_jobs_df[column],
                                              errors='coerce')

    # add data as is to raw_scraper_jobs table
    full_raw_jobs = full_jobs_df.drop(columns=["table"])
    full_raw_jobs.to_sql("raw_scraper_jobs",
                         engine,
                         if_exists="append",
                         index=False)
    myprint("Uploaded raw scraper jobs to PostgreSQL")

    # geocode, split by accuracy, merge existing data with new data
    new_accurate_jobs, new_inaccurate_jobs = helpers.geocode_and_split_by_accuracy(
        full_jobs_df)
    helpers.merge_all_data(new_accurate_jobs, new_inaccurate_jobs)
def update_database():
    """Fetch the latest Apify scraper run, normalize each job's keys and
    derived columns, archive the raw rows, then geocode and merge the new
    jobs into the main tables.

    NOTE(review): a second definition of update_database exists earlier in
    this file; this later one shadows it at import time - confirm intent.
    """
    latest_jobs = requests.get(most_recent_run_url).json()
    # use these two lines if you're updating using a local csv file
    # latest_jobs = pd.read_csv("Since_10_23_to_10_29.csv", encoding='unicode_escape').drop(columns=["Unnamed: 0"])
    # latest_jobs = latest_jobs.to_dict('records')
    if not latest_jobs:
        myprint("No new jobs.")
        return
    myprint(f"There are {len(latest_jobs)} new jobs.")

    # use this version of parse function if using a local csv file
    # def parse(job):
    #     column_mappings_dict = column_name_mappings
    #     columns_names_dict = {"Section A": "Job Info", "Section C": "Place of Employment Info", "Section D":"Housing Info"}
    #     parsed_job = {}
    #     for key in job:
    #         if "Section A" in key or "Section C" in key or "Section D" in key:
    #             section = key.split("/")[0]
    #             key_name = key.replace(section, columns_names_dict[section])
    #             if key_name in column_mappings_dict:
    #                 parsed_job[column_mappings_dict[key_name]] = job[key]
    #             else:
    #                 parsed_job[key_name] = job[key]
    #         else:
    #             if key in column_mappings_dict:
    #                 parsed_job[column_mappings_dict[key]] = job[key]
    #             else:
    #                 parsed_job[key] = job[key]
    #
    #     return parsed_job

    # returns a copy of the scraped job dict with keys renamed to match postgres
    def parse(job):
        column_mappings_dict = column_name_mappings
        columns_names_dict = {
            "Section A": "Job Info",
            "Section C": "Place of Employment Info",
            "Section D": "Housing Info"
        }
        parsed_job = {}
        for key in job:
            if key not in ["Section D", "Section C", "Section A"]:
                if key in column_mappings_dict:
                    # use the postgres column name
                    parsed_job[column_mappings_dict[key]] = job[key]
                else:
                    parsed_job[key] = job[key]
            else:
                # section keys hold nested dicts - flatten as "<Section>/<field>"
                inner_dict = job[key]
                for inner_key in inner_dict:
                    key_name = columns_names_dict[key] + "/" + inner_key
                    if key_name in column_mappings_dict:
                        parsed_job[column_mappings_dict[
                            key_name]] = inner_dict[inner_key]
                    else:
                        parsed_job[key_name] = inner_dict[inner_key]
        return parsed_job

    # finish time of the last actor run, used to stamp every job
    date_of_run = requests.get(date_of_run_url).json()["data"]["finishedAt"]

    # adds calculated fields to the dictionary job (mutates and returns it)
    def add_necessary_columns(job):
        # add source and date of run column
        job["Source"], job["table"] = "Apify", "central"
        job["Date of run"] = date_of_run
        # check if job is h2a
        if helpers.h2a_or_h2b(job) == "H-2A":
            job["Visa type"] = "H-2A"
            zip_code_columns = [
                "EMPLOYER_POSTAL_CODE", "WORKSITE_POSTAL_CODE",
                "Place of Employment Info/Postal Code", "HOUSING_POSTAL_CODE"
            ]
            if "TOTAL_WORKERS_H-2A_REQUESTED" in job:
                job["TOTAL_WORKERS_NEEDED"] = job[
                    "TOTAL_WORKERS_H-2A_REQUESTED"]
            # workers-to-housing ratio, only when both values are present/truthy
            workers_needed, occupancy = job["TOTAL_WORKERS_NEEDED"], job.get(
                "TOTAL_OCCUPANCY", None)
            if workers_needed and occupancy:
                if workers_needed > occupancy:
                    job["W to H Ratio"] = "W>H"
                elif workers_needed < occupancy:
                    job["W to H Ratio"] = "W<H"
                else:
                    job["W to H Ratio"] = "W=H"
            # repair case numbers missing their "H-" prefix
            if job["CASE_NUMBER"][0] == "3":
                job["CASE_NUMBER"] = "H-" + job["CASE_NUMBER"]
        # check if job is h2b
        elif helpers.h2a_or_h2b(job) == "H-2B":
            job["Visa type"] = "H-2B"
            zip_code_columns = ["EMPLOYER_POSTAL_CODE", "WORKSITE_POSTAL_CODE"]
        else:
            # unknown visa type: nothing to normalize
            job["Visa type"], zip_code_columns = "", []
        # fix zip code columns
        for column in zip_code_columns:
            if column in job:
                fixed_zip_code = helpers.fix_zip_code(job[column])
                job[column] = fixed_zip_code
        return job

    # parse each job, add all columns to each job, append this to raw scraper data and push back to postgres
    parsed_jobs = [parse(job) for job in latest_jobs]
    full_jobs = [add_necessary_columns(job) for job in parsed_jobs]
    full_jobs_df = pd.DataFrame(full_jobs).drop_duplicates(
        subset="CASE_NUMBER", keep="last")
    # coerce date columns; unparseable values become NaT
    date_columns = [
        "RECEIVED_DATE", "EMPLOYMENT_END_DATE", "EMPLOYMENT_BEGIN_DATE",
        "Date of run"
    ]
    for column in date_columns:
        full_jobs_df[column] = pd.to_datetime(full_jobs_df[column],
                                              errors='coerce')
    full_raw_jobs = full_jobs_df.drop(columns=["table"])
    full_raw_jobs.to_sql("raw_scraper_jobs",
                         engine,
                         if_exists="append",
                         index=False)
    myprint("Uploaded raw scraper jobs to PostgreSQL")

    # geocode, split by accuracy, get old data, merge old with new data, sort data
    new_accurate_jobs, new_inaccurate_jobs = helpers.geocode_and_split_by_accuracy(
        full_jobs_df)
    helpers.merge_all_data(new_accurate_jobs, new_inaccurate_jobs)
def send_fixes_to_postgres():
    """Move human-fixed rows out of low_accuracies.

    implement_fixes splits the fixed rows into: rows bound for job_central,
    rows bound for additional_housing, and failed fixes that stay in
    low_accuracies. Reusable fixes are archived in previously_fixed keyed by
    their original address.

    NOTE(review): an earlier definition of send_fixes_to_postgres exists in
    this file; this later one shadows it at import time - confirm intent.
    """
    fixed = pd.read_sql_query('select * from low_accuracies where fixed=true',
                              con=engine)
    if len(fixed) == 0:
        myprint("No jobs have been fixed.")
        return
    myprint(f"{len(fixed)} jobs have been fixed.")
    central, housing, failures = implement_fixes(fixed)
    assert len(central) + len(housing) + len(failures) == len(fixed)
    myprint(f"{len(central)} rows moving from low_accuracies to job_central.")
    myprint(
        f"{len(housing)} rows moving from low_accuracies to additional housing."
    )
    myprint(f"{len(failures)} failed fixes.")
    # saving appropriate fixes to previously_fixed table
    prev_fixed_columns = [
        "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_POSTAL_CODE",
        "HOUSING_STATE", "fixed", "housing accuracy", "housing accuracy type",
        "housing_fixed_by", "housing_lat", "housing_long", "notes"
    ]
    fixed_by_vals_to_save_fixes_for = ["coordinates", "address", "impossible"]
    central_for_prev_fixed_table = central[central["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    housing_for_prev_fixed_table = housing[housing["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    # pd.concat instead of deprecated DataFrame.append
    for_prev_fixed_table = pd.concat(
        [central_for_prev_fixed_table, housing_for_prev_fixed_table])
    if not for_prev_fixed_table.empty:
        myprint(
            f"There are {len(for_prev_fixed_table)} rows to add to the previously_fixed table."
        )
        # key each saved fix by the concatenation of its original address fields
        for_prev_fixed_table["initial_address"] = for_prev_fixed_table.apply(
            lambda job: handle_null(job["HOUSING_ADDRESS_LOCATION"]) +
            handle_null(job["HOUSING_CITY"]) + handle_null(job[
                "HOUSING_STATE"]) + handle_null(job["HOUSING_POSTAL_CODE"]),
            axis=1)
        for_prev_fixed_table = for_prev_fixed_table.drop_duplicates(
            subset='initial_address', keep="last")
        for_prev_fixed_table.to_sql("previously_fixed",
                                    engine,
                                    if_exists='append',
                                    index=False)
        myprint("All rows successfully added to the previously_fixed table.")
    else:
        myprint("No rows to add to the previously_fixed table.")
    # adding fixes to appropriate tables and deleting appropriate rows from low_accuracies
    central.to_sql('job_central', engine, if_exists='append', index=False)
    housing.to_sql('additional_housing', engine, if_exists='append', index=False)
    # adding failed fixes back to low_accuracies - can't just use to_sql because
    # of primary key. The statement is identical for every row, so build it once
    # and reuse one connection instead of reconnecting per row.
    query = text(
        """UPDATE low_accuracies SET (fixed, housing_lat, housing_long, "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_STATE", "HOUSING_POSTAL_CODE", notes, housing_fixed_by) = (:fixed, :lat, :long, :address, :city, :state, :zip, :notes, :fixed_by) WHERE id = :id""")
    with engine.connect() as connection:
        for i, job in failures.iterrows():
            connection.execute(query,
                               fixed=job["fixed"],
                               lat=job["housing_lat"],
                               long=job["housing_long"],
                               address=job["HOUSING_ADDRESS_LOCATION"],
                               city=job["HOUSING_CITY"],
                               state=job["HOUSING_STATE"],
                               zip=job["HOUSING_POSTAL_CODE"],
                               notes=job["notes"],
                               fixed_by=job["housing_fixed_by"],
                               id=job["id"])
    # all rows that had fixed as true are now either in job_central or in
    # low_accuracies as a new row with fixed_by as failed and fixed as false
    make_query("delete from low_accuracies where fixed=true")
    myprint(
        f"Done implementing fixes. There were {len(failures)} failed fixes out of {len(fixed)} attempts."
    )