def expand_abbreviations():
    """Replace two-letter state abbreviations (upper- or lower-case) with the
    full state name in every state column of every table that has one.

    Relies on the module-level `states_abbreviations` mapping of
    abbreviation -> full state name. Values come from that trusted constant,
    so the f-string interpolation below is not an injection risk.
    """
    # Every (table, column) pair that stores a state value. Order matches the
    # original one-query-per-pair sequence.
    state_columns = (
        ("job_central", "WORKSITE_STATE"),
        ("low_accuracies", "WORKSITE_STATE"),
        ("job_central", "HOUSING_STATE"),
        ("additional_housing", "HOUSING_STATE"),
        ("low_accuracies", "HOUSING_STATE"),
    )
    for abbreviation, state in states_abbreviations.items():
        for table, column in state_columns:
            make_query(
                f"""UPDATE {table} SET "{column}" = '{state}' WHERE "{column}" IN ('{abbreviation}', '{abbreviation.lower()}')"""
            )
def add_housing_to_postgres():
    """Interactively load an additional-housing Excel file for one fiscal
    quarter: geocode and split it into accurate/inaccurate rows, append those
    to additional_housing / low_accuracies, then (for quarters 2-4, after
    confirmation) delete the previous quarter's additional-housing rows.
    """
    file_path = "dol_data/" + input(
        "Put the additional housing file in a folder named `dol_data` in the `scripts` folder. Now enter the name of the file (this is case sensitive).\n"
    ).strip()
    year = input("What year is it? (eg: 2020)\n").strip()
    # BUG FIX: `quarter` was previously left as a string, so `quarter != 1`
    # was always True (str vs int) and `quarter - 1` below raised TypeError.
    # Cast to int, exactly as add_worksites_to_postgres does.
    quarter = int(input("What quarter it is? (enter 1, 2, 3, or 4)\n").strip())
    input(
        f"Ok, adding additional housing from {file_path} for fiscal year {year}Q{quarter}. If this is correct press any key, othewise press control + c to start over."
    )
    housing = pd.read_excel(file_path)
    accurate_housing, inaccurate_housing = geocode_manage_split_housing(
        housing, year, quarter)
    accurate_housing.to_sql("additional_housing",
                            engine,
                            if_exists='append',
                            index=False)
    inaccurate_housing.to_sql("low_accuracies",
                              engine,
                              if_exists='append',
                              index=False)
    if quarter != 1:
        response = input(
            f"Enter 'yes' or 'y' if you're ready to run the queries to delete the additional_housing rows from the previous quarter ({year}Q{quarter - 1}). You may want to check that adding the current quarter ({year}Q{quarter}) went well first. If it didn't you can always redo it, but geocoding may cost more because I won't be able to steal geocoding results from last quarter's additional housing rows."
        )
        if response in ["y", "yes"]:
            make_query(
                f"""DELETE FROM additional_housing WHERE fy = '{year}Q{quarter - 1}'"""
            )
            make_query(
                f"""DELETE FROM low_accuracies WHERE fy = '{year}Q{quarter - 1}' and "table" = 'dol_h'"""
            )
def mark_inactive_dolH_as_fixed():
    """Mark inaccurate additional-housing (dol_h) rows in low_accuracies as
    fixed when the job they belong to (matched by CASE_NUMBER against
    job_central plus the non-dol_h rows of low_accuracies) is no longer
    active (status not 'not yet started' / 'started').
    """
    acc_case_nums_statuses_df = pd.read_sql(
        """select "CASE_NUMBER", status from job_central""", con=engine)
    inacc_case_nums_statuses_df = pd.read_sql(
        """select "CASE_NUMBER", status from low_accuracies where "table" != 'dol_h'""",
        con=engine)

    # check that there are no duplicate case numbers
    case_nums_list = acc_case_nums_statuses_df["CASE_NUMBER"].tolist(
    ) + inacc_case_nums_statuses_df["CASE_NUMBER"].tolist()
    assert len(case_nums_list) == len(set(case_nums_list))

    # BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0 -- pd.concat is the drop-in replacement here.
    all_case_nums_statuses_df = pd.concat(
        [acc_case_nums_statuses_df, inacc_case_nums_statuses_df])

    inaccurate_additional_housings = pd.read_sql(
        """select "id", "CASE_NUMBER" from low_accuracies where "table" = 'dol_h'""",
        con=engine)
    myprint(
        f"There are {len(inaccurate_additional_housings)} inaccurate additional housing rows."
    )

    case_nums_with_no_matches = []
    num_fixed = 0
    # iterate over all inaccurate additional housing case numbers and ids. if
    # a row's corresponding row in job_central is inactive, update the row's
    # fixed column to be true
    for i, job in inaccurate_additional_housings.iterrows():
        job_case_num = job["CASE_NUMBER"]
        central_job = all_case_nums_statuses_df[
            all_case_nums_statuses_df["CASE_NUMBER"] == job_case_num]
        if len(central_job) > 1:
            # the assert above should make this branch unreachable
            myprint(
                f"{job_case_num} has {len(central_job)} matching rows in the dataframe with all non dol_h case numbers. This should be impossible."
            )
        elif len(central_job) == 0:
            myprint(
                f"{job_case_num} has no matching rows in the dataframe with all non dol_h case numbers."
            )
            case_nums_with_no_matches.append(job_case_num)
        else:
            job_status = central_job["status"].tolist()[0]
            if job_status not in ["not yet started", "started"]:
                # `id` comes straight from the database, so interpolating it
                # into the query is safe.
                job_id = job["id"]
                make_query(
                    f"""UPDATE low_accuracies SET fixed = true, worksite_fixed_by = 'inactive', housing_fixed_by = 'inactive' WHERE "id" = {job_id}"""
                )
                num_fixed += 1

    myprint(
        f"{num_fixed} additional housing rows in low accuracies were marked as fixed."
    )
def append_excel_to_table(file_name, table_name, year, quarter):
    """Append the rows of dol_data/<file_name> to the Postgres table
    <table_name>, tagging each row with fiscal year "<year>Q<quarter>", then
    (for quarters other than 1) delete that table's previous-quarter rows.

    Raises an Exception when the excel file has columns the existing table
    lacks, since the to_sql append would fail.
    """
    if table_name not in all_table_names:
        myprint(f"The table {table_name} doesn't exist yet, so it will be added.")
    elif check_for_new_columns(file_name, table_name):
        raise Exception(f"Since there are columns in {file_name} that aren't in {table_name}, we can't continue this process or it will fail. Either add the missing columns to PostgreSQL or change the column names in the excel file to match ones that already exist in Postgres.")

    new_rows = pd.read_excel(f"dol_data/{file_name}")
    new_rows["fy"] = f"{year}Q{quarter}"
    myprint(f"Adding {len(new_rows)} rows from {file_name} to {table_name}.")
    new_rows.to_sql(table_name, engine, if_exists='append', index=False)
    myprint("Done adding rows.")

    if int(quarter) != 1:
        make_query(f"""DELETE FROM {table_name} WHERE fy = '{year}Q{int(quarter) - 1}'""")
def _relation(self, id, join_on, join_to, level=None, featuretype=None,
              order_by=None, reverse=False, completely_within=False):
    # The following docstring will be included in the parents() and
    # children() docstrings to maintain consistency, since they both
    # delegate to this method.
    """
    Parameters
    ----------
    id : string or a Feature object

    level : None or int
        If `level=None` (default), then return all children regardless
        of level. If `level` is an integer, then constrain to just
        that level.
    {_method_doc}
    """
    # `id` may be a Feature; normalize it to its string ID. (`id` shadows
    # the builtin, but it is part of the public signature, so keep it.)
    if isinstance(id, Feature):
        id = id.id

    # SQL fragment joining features to relations; join_on/join_to choose
    # the traversal direction (e.g. children vs parents).
    other = '''
    JOIN relations
    ON relations.{join_on} = features.id
    WHERE relations.{join_to} = ?
    '''.format(**locals())
    args = [id]

    # Optional extra constraint on the relation level.
    level_clause = ''
    if level is not None:
        level_clause = 'relations.level = ?'
        args.append(level)

    # helpers.make_query assembles the final SELECT plus bound parameters
    # from the filter keyword arguments.
    query, args = helpers.make_query(
        args=args,
        other=other,
        extra=level_clause,
        featuretype=featuretype,
        order_by=order_by,
        reverse=reverse,
        completely_within=completely_within,
    )

    # modify _SELECT so that only unique results are returned
    query = query.replace("SELECT", "SELECT DISTINCT")
    for i in self._execute(query, args):
        yield self._feature_returner(**i)
def add_worksites_to_postgres():
    """Interactively load an additional-worksites Excel file for one fiscal
    quarter, append it to the additional_worksites table, and (for quarters
    other than 1) delete the previous quarter's rows.
    """
    file_path = "dol_data/" + input(
        "Check that the additional worksites file is in a folder named `dol_data` in the `scripts` folder. (If it isn't, exit this using control + c then re-run this script once you've done it.) Now enter the name of the file (this is case sensitive).\n"
    ).strip()
    year = input("What year is it? (eg: 2020)\n").strip()
    quarter = int(input("What quarter it is? (enter 1, 2, 3, or 4)\n").strip())
    # Last chance for the operator to bail out before any work happens.
    input(
        f"Ok, adding worksites from {file_path} for fiscal year {year}Q{quarter}. If this is correct press any key, othewise press control + c to start over."
    )

    frame = manage_worksites(pd.read_excel(file_path), year, quarter)
    myprint(f"Adding {len(frame)} rows to additional_worksites table.")
    frame.to_sql("additional_worksites",
                 engine,
                 if_exists='append',
                 index=False)

    if int(quarter) != 1:
        make_query(
            f"""DELETE FROM additional_worksites WHERE fy = '{year}Q{int(quarter) - 1}'"""
        )
def update_halfway_columns():
    """Recompute the halfway-point columns on job_central and on the
    'central' rows of low_accuracies.

    days_until_halfway: days from today to the midpoint of the employment
    period (NULL once the job has ended). is_halfway: TRUE once that
    midpoint has passed, NULL when days_until_halfway is NULL.
    """
    days_template = """
    UPDATE {table}
    SET days_until_halfway = CASE
        WHEN status != 'ended' THEN EXTRACT(DAY FROM ("EMPLOYMENT_BEGIN_DATE" + (("EMPLOYMENT_END_DATE" - "EMPLOYMENT_BEGIN_DATE") / 2)) - CURRENT_DATE)
        ELSE NULL
    END
    {where}"""
    halfway_template = """
    UPDATE {table}
    SET is_halfway = CASE
        WHEN days_until_halfway > 0 THEN FALSE
        WHEN days_until_halfway IS NULL THEN NULL
        ELSE TRUE
    END
    {where}"""
    # Only low_accuracies rows mirroring job_central carry these columns,
    # hence the extra WHERE clause for that table.
    targets = (
        ("job_central", ""),
        ("low_accuracies", """WHERE "table" = 'central'"""),
    )
    # days_until_halfway must be populated for both tables before the
    # is_halfway pass reads it.
    for template in (days_template, halfway_template):
        for table, where in targets:
            make_query(template.format(table=table, where=where))
def all_features(self, limit=None, strand=None, featuretype=None,
                 order_by=None, reverse=False, completely_within=False):
    """
    Returns an iterator of all :class:`Feature` objects in the database.

    Parameters
    ----------
    {_method_doc}
    """
    # Compose the SELECT statement and its bound parameters from the
    # filtering keyword arguments, then lazily stream each matching row
    # back as a Feature.
    select, params = helpers.make_query(
        args=[],
        limit=limit,
        strand=strand,
        featuretype=featuretype,
        order_by=order_by,
        reverse=reverse,
        completely_within=completely_within,
    )
    for row in self._execute(select, params):
        yield self._feature_returner(**row)
def features_of_type(self, featuretype, limit=None, strand=None,
                     order_by=None, reverse=False, completely_within=False):
    """
    Returns an iterator of :class:`gffutils.Feature` objects.

    Parameters
    ----------
    {_method_doc}
    """
    # Same pattern as all_features(), but constrained to one featuretype:
    # build the filtered SELECT, then yield each row as a Feature.
    select, params = helpers.make_query(
        args=[],
        limit=limit,
        featuretype=featuretype,
        order_by=order_by,
        reverse=reverse,
        strand=strand,
        completely_within=completely_within,
    )
    for row in self._execute(select, params):
        yield self._feature_returner(**row)
def update_status_columns(table_name):
    """Backfill missing employment dates from the requested dates, then
    derive the status column for every row of `table_name`.

    status is 'started' / 'not yet started' / 'ended', or NULL when either
    employment date is still missing after the backfill.
    """
    # Fall back to the requested dates where actual employment dates are
    # absent, so the status CASE below has something to compare against.
    date_backfills = (
        ("EMPLOYMENT_BEGIN_DATE", "REQUESTED_BEGIN_DATE"),
        ("EMPLOYMENT_END_DATE", "REQUESTED_END_DATE"),
    )
    for target_col, fallback_col in date_backfills:
        make_query(
            f"""UPDATE {table_name} SET "{target_col}" = "{fallback_col}" WHERE "{target_col}" IS null"""
        )

    make_query(f"""UPDATE {table_name} SET status = CASE
        WHEN ("EMPLOYMENT_BEGIN_DATE" IS null) OR ("EMPLOYMENT_END_DATE" IS null) THEN null
        WHEN ("EMPLOYMENT_BEGIN_DATE" <= CURRENT_DATE) AND (CURRENT_DATE <= "EMPLOYMENT_END_DATE") THEN 'started'
        WHEN ("EMPLOYMENT_BEGIN_DATE" > CURRENT_DATE) AND (CURRENT_DATE <= "EMPLOYMENT_END_DATE") THEN 'not yet started'
        ELSE 'ended'
    END""")
def set_test_database_state(accurates, inaccurates):
    """Reset the test database: wipe job_central and low_accuracies, load
    the given dataframes into them, and refresh the geocoding view."""
    for table in ("job_central", "low_accuracies"):
        make_query(f"DELETE FROM {table}")

    loads = ((accurates, "job_central"), (inaccurates, "low_accuracies"))
    for frame, table in loads:
        frame.to_sql(table,
                     engine,
                     if_exists='append',
                     index=False,
                     dtype=helpers.column_types)

    # the materialized view reads from the tables just repopulated
    make_query("REFRESH MATERIALIZED VIEW previously_geocoded")
def mark_inactive_central_as_fixed():
    """Mark every low_accuracies row whose job has ended as fixed, with both
    fixed-by columns set to 'inactive' (no point fixing finished jobs)."""
    update = """UPDATE low_accuracies SET fixed = true, worksite_fixed_by = 'inactive', housing_fixed_by = 'inactive' WHERE status = 'ended'"""
    make_query(update)
def send_fixes_to_postgres():
    """Move low_accuracies rows marked fixed=true into job_central or
    additional_housing, archive reusable fixes in the previously_fixed
    table, and re-insert failed fixes back into low_accuracies.
    """
    fixed = pd.read_sql_query('select * from low_accuracies where fixed=true',
                              con=engine)
    if len(fixed) == 0:
        myprint("No jobs have been fixed.")
        return
    myprint(f"{len(fixed)} jobs have been fixed.")

    central, housing, failures = implement_fixes(fixed)
    # every fixed row must land in exactly one of the three buckets
    assert len(central) + len(housing) + len(failures) == len(fixed)
    myprint(f"{len(central)} rows moving from low_accuracies to job_central.")
    myprint(
        f"{len(housing)} rows moving from low_accuracies to additional housing."
    )
    myprint(f"{len(failures)} failed fixes.")

    # saving appropriate fixes to previously_fixed table
    prev_fixed_columns = [
        "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_POSTAL_CODE",
        "HOUSING_STATE", "fixed", "housing accuracy", "housing accuracy type",
        "housing_fixed_by", "housing_lat", "housing_long", "notes"
    ]
    # only these kinds of manual fixes are worth reusing on future rows
    fixed_by_vals_to_save_fixes_for = ["coordinates", "address", "impossible"]
    central_for_prev_fixed_table = central[central["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    housing_for_prev_fixed_table = housing[housing["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    # BUG FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the
    # drop-in replacement.
    for_prev_fixed_table = pd.concat(
        [central_for_prev_fixed_table, housing_for_prev_fixed_table])

    if not for_prev_fixed_table.empty:
        myprint(
            f"There are {len(for_prev_fixed_table)} rows to add to the previously_fixed table."
        )
        for_prev_fixed_table["initial_address"] = for_prev_fixed_table.apply(
            lambda job: handle_null(job["HOUSING_ADDRESS_LOCATION"]) +
            handle_null(job["HOUSING_CITY"]) + handle_null(job[
                "HOUSING_STATE"]) + handle_null(job["HOUSING_POSTAL_CODE"]),
            axis=1)
        # keep only the most recent fix for each distinct source address
        for_prev_fixed_table = for_prev_fixed_table.drop_duplicates(
            subset='initial_address', keep="last")
        for_prev_fixed_table.to_sql("previously_fixed",
                                    engine,
                                    if_exists='append',
                                    index=False)
        myprint(f"All rows successfully added to the previously_fixed table.")
    else:
        myprint(f"No rows to add to the previously_fixed table.")

    # adding fixes to appropriate tables and deleting appropriate rows from low_accuracies
    central.to_sql('job_central', engine, if_exists='append', index=False)
    housing.to_sql('additional_housing',
                   engine,
                   if_exists='append',
                   index=False)
    failures.to_sql('low_accuracies', engine, if_exists='append', index=False)
    make_query("delete from low_accuracies where fixed=true")
    myprint(
        f"Done implementing fixes. There were {len(failures)} failed fixes out of {len(fixed)} attempts."
    )
def update_workers_and_occupancy_columns():
    """Normalize worker counts, then derive occupancy_minus_workers and the
    "W to H Ratio" column on both job_central and low_accuracies."""
    # merging TOTAL_WORKERS_H-2A_REQUESTED with TOTAL_WORKERS_NEEDED column -
    # this should happen automatically, but just in case something is missed
    merge_workers = """
    UPDATE {table}
    SET "TOTAL_WORKERS_NEEDED" = "TOTAL_WORKERS_H-2A_REQUESTED"
    WHERE "TOTAL_WORKERS_NEEDED" IS NULL
    """
    # updating occupancy_minus_workers workers column
    occupancy_diff = """
    UPDATE {table}
    SET occupancy_minus_workers = "TOTAL_OCCUPANCY" - "TOTAL_WORKERS_NEEDED"
    WHERE "TOTAL_OCCUPANCY" IS NOT NULL AND "TOTAL_WORKERS_NEEDED" IS NOT NULL
    """
    # updating W to H Ratio column
    ratio = """
    UPDATE {table}
    SET "W to H Ratio" = CASE
        WHEN "TOTAL_WORKERS_NEEDED" > "TOTAL_OCCUPANCY" THEN 'W>H'
        WHEN "TOTAL_WORKERS_NEEDED" < "TOTAL_OCCUPANCY" THEN 'W<H'
        ELSE 'W=H'
    END
    WHERE "TOTAL_OCCUPANCY" IS NOT NULL AND "TOTAL_WORKERS_NEEDED" IS NOT NULL
    """
    # the merge must run first so the derived columns see the merged counts
    for template in (merge_workers, occupancy_diff, ratio):
        for table in ("job_central", "low_accuracies"):
            make_query(template.format(table=table))
def send_fixes_to_postgres():
    """Move low_accuracies rows marked fixed=true into job_central or
    additional_housing, archive reusable fixes in previously_fixed, and
    write failed fixes back onto their existing low_accuracies rows
    (in place, to avoid primary-key collisions).
    """
    fixed = pd.read_sql_query('select * from low_accuracies where fixed=true',
                              con=engine)
    if len(fixed) == 0:
        myprint("No jobs have been fixed.")
        return
    myprint(f"{len(fixed)} jobs have been fixed.")

    central, housing, failures = implement_fixes(fixed)
    # every fixed row must land in exactly one of the three buckets
    assert len(central) + len(housing) + len(failures) == len(fixed)
    myprint(f"{len(central)} rows moving from low_accuracies to job_central.")
    myprint(
        f"{len(housing)} rows moving from low_accuracies to additional housing."
    )
    myprint(f"{len(failures)} failed fixes.")

    # saving appropriate fixes to previously_fixed table
    prev_fixed_columns = [
        "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_POSTAL_CODE",
        "HOUSING_STATE", "fixed", "housing accuracy", "housing accuracy type",
        "housing_fixed_by", "housing_lat", "housing_long", "notes"
    ]
    # only these kinds of manual fixes are worth reusing on future rows
    fixed_by_vals_to_save_fixes_for = ["coordinates", "address", "impossible"]
    central_for_prev_fixed_table = central[central["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    housing_for_prev_fixed_table = housing[housing["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    # BUG FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the
    # drop-in replacement.
    for_prev_fixed_table = pd.concat(
        [central_for_prev_fixed_table, housing_for_prev_fixed_table])

    if not for_prev_fixed_table.empty:
        myprint(
            f"There are {len(for_prev_fixed_table)} rows to add to the previously_fixed table."
        )
        for_prev_fixed_table["initial_address"] = for_prev_fixed_table.apply(
            lambda job: handle_null(job["HOUSING_ADDRESS_LOCATION"]) +
            handle_null(job["HOUSING_CITY"]) + handle_null(job[
                "HOUSING_STATE"]) + handle_null(job["HOUSING_POSTAL_CODE"]),
            axis=1)
        # keep only the most recent fix for each distinct source address
        for_prev_fixed_table = for_prev_fixed_table.drop_duplicates(
            subset='initial_address', keep="last")
        for_prev_fixed_table.to_sql("previously_fixed",
                                    engine,
                                    if_exists='append',
                                    index=False)
        myprint(f"All rows successfully added to the previously_fixed table.")
    else:
        myprint(f"No rows to add to the previously_fixed table.")

    # adding fixes to appropriate tables and deleting appropriate rows from low_accuracies
    central.to_sql('job_central', engine, if_exists='append', index=False)
    housing.to_sql('additional_housing',
                   engine,
                   if_exists='append',
                   index=False)

    # adding failed fixes back to low_accuracies - can't just use to_sql
    # because of primary key
    for i, job in failures.iterrows():
        query = text(
            """UPDATE low_accuracies SET (fixed, housing_lat, housing_long, "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_STATE", "HOUSING_POSTAL_CODE", notes, housing_fixed_by) = (:fixed, :lat, :long, :address, :city, :state, :zip, :notes, :fixed_by) WHERE id = :id"""
        )
        # NOTE(review): kwargs-style execute() is SQLAlchemy 1.x only; 2.0
        # requires passing a parameter dict -- confirm the pinned version.
        with engine.connect() as connection:
            connection.execute(query,
                               fixed=job["fixed"],
                               lat=job["housing_lat"],
                               long=job["housing_long"],
                               address=job["HOUSING_ADDRESS_LOCATION"],
                               city=job["HOUSING_CITY"],
                               state=job["HOUSING_STATE"],
                               zip=job["HOUSING_POSTAL_CODE"],
                               notes=job["notes"],
                               fixed_by=job["housing_fixed_by"],
                               id=job["id"])

    # all rows that had fixed as true are now either in job_central or in
    # low_accuracies as a new row with fixed_by as failed and fixed as false
    make_query("delete from low_accuracies where fixed=true")
    myprint(
        f"Done implementing fixes. There were {len(failures)} failed fixes out of {len(fixed)} attempts."
    )