def send_sheet_fixes_to_postgres(sheet):
    sheet_records = sheet.batch_get(["A:P"])[0]
    column_names = sheet_records.pop(0)
    sheet_df = pd.DataFrame(sheet_records, columns=column_names).rename(columns=sql_sheet_column_names_map)
    sheet_df = sheet_df[sheet_df["fixed"] == "TRUE"]

    if len(sheet_df) == 0:
        myprint("Nobody has fixed any jobs in the Google Sheet.")
        return
    myprint(f"{len(sheet_df)} jobs have been fixed in the Google Sheet. Sending these changes to Postgres now.")

    sheet_df["housing accuracy"] = pd.to_numeric(sheet_df["housing accuracy"])
    sheet_df["housing_lat"] = pd.to_numeric(sheet_df["housing_lat"])
    sheet_df["housing_long"] = pd.to_numeric(sheet_df["housing_long"])
    sheet_df["id"] = pd.to_numeric(sheet_df["id"])
    sheet_df["Date of run"] = pd.to_datetime(sheet_df["Date of run"])
    sheet_df["fixed"] = sheet_df["fixed"].astype('bool')

    for i, job in sheet_df.iterrows():
        query = text(
                """UPDATE low_accuracies SET
                    (fixed, housing_lat, housing_long, "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY",
                    "HOUSING_STATE", "HOUSING_POSTAL_CODE", notes, housing_fixed_by)
                     =
                     (:fixed, :lat, :long, :address, :city, :state, :zip, :notes, :fixed_by)
                     WHERE id = :id""")

        with engine.connect() as connection:
            connection.execute(query, fixed=job["fixed"], lat=job["housing_lat"], long=job["housing_long"],
                               address=job["HOUSING_ADDRESS_LOCATION"], city=job["HOUSING_CITY"],
                               state=job["HOUSING_STATE"], zip=job["HOUSING_POSTAL_CODE"],
                               notes=job["notes"], fixed_by=job["housing_fixed_by"], id=job["id"])
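
# Hedged usage sketch (illustrative names, not from the original source): with
# an authorized gspread client, the function above might be driven like this.
# client = gspread.service_account(filename="service_account.json")
# sheet = open_sheet(client, file_name="Low Accuracies", sheet_name="fixes")
# send_sheet_fixes_to_postgres(sheet)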
def fix_previously_fixed():
    previously_fixed_query = """
                            SELECT * FROM (
                            (SELECT
                            COALESCE("HOUSING_ADDRESS_LOCATION", '') || COALESCE("HOUSING_CITY", '') || COALESCE("HOUSING_STATE", '') || COALESCE("HOUSING_POSTAL_CODE", '') as full_address,
                            id AS low_acc_id
                            FROM low_accuracies) AS low_acc
                            INNER JOIN
                            (SELECT DISTINCT ON (initial_address) * FROM previously_fixed) AS prev_fixed
                            ON prev_fixed.initial_address = low_acc.full_address)
                            """

    previously_fixed_df = pd.read_sql(previously_fixed_query, con=engine)
    myprint(
        f"There are {len(previously_fixed_df)} rows in low_accuracies whose exact housing address column has been fixed before."
    )

    for i, fixed in previously_fixed_df.iterrows():
        update_query = text("""
                        UPDATE low_accuracies SET
                        "HOUSING_ADDRESS_LOCATION" = :address,
                        "HOUSING_CITY" = :city,
                        "HOUSING_POSTAL_CODE" = :zip,
                        "HOUSING_STATE" = :state,
                        "fixed" = :fixed,
                        "housing accuracy" = :accuracy,
                        "housing accuracy type" = :accuracy_type,
                        "housing_fixed_by" = :fixed_by,
                        "housing_lat" = :lat,
                        "housing_long" = :long,
                        "notes" = :notes
                        WHERE "id" = :id
                        """)

        with engine.connect() as connection:
            connection.execute(update_query,
                               address=fixed["HOUSING_ADDRESS_LOCATION"],
                               city=fixed["HOUSING_CITY"],
                               zip=fixed["HOUSING_POSTAL_CODE"],
                               state=fixed["HOUSING_STATE"],
                               fixed=fixed["fixed"],
                               accuracy=fixed["housing accuracy"],
                               accuracy_type=fixed["housing accuracy type"],
                               fixed_by=fixed["housing_fixed_by"],
                               lat=fixed["housing_lat"],
                               long=fixed["housing_long"],
                               notes=fixed["notes"],
                               id=fixed["low_acc_id"])

    myprint("Successfully fixed all previously fixed rows in low accuracies.")
def add_data_as_table(path_to_data, table_name, excel_or_csv=""):
    if excel_or_csv == "excel":
        data = pd.read_excel(path_to_data)
    elif excel_or_csv == "csv":
        data = pd.read_csv(path_to_data)
    else:
        raise Exception(
            "`excel_or_csv` parameter of add_data_to_table function must be either 'excel' or 'csv'."
        )

    myprint(f"Adding {len(data)} rows to {table_name}...")
    data.to_sql(table_name, engine, index=False)
    myprint(
        f"Successfully added the {excel_or_csv} file {path_to_data} as a table named {table_name}."
    )
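
# Hedged usage example (hypothetical path and table name, not from the original source):
# add_data_as_table("excel_files/previously_fixed.csv", "previously_fixed", excel_or_csv="csv")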
def mark_inactive_dolH_as_fixed():
    acc_case_nums_statuses_df = pd.read_sql(
        """select "CASE_NUMBER", status from job_central""", con=engine)
    inacc_case_nums_statuses_df = pd.read_sql(
        """select "CASE_NUMBER", status from low_accuracies where "table" != 'dol_h'""",
        con=engine)

    # check that there are no duplicate case numbers
    case_nums_list = acc_case_nums_statuses_df["CASE_NUMBER"].tolist(
    ) + inacc_case_nums_statuses_df["CASE_NUMBER"].tolist()
    assert len(case_nums_list) == len(set(case_nums_list))

    all_case_nums_statuses_df = acc_case_nums_statuses_df.append(
        inacc_case_nums_statuses_df)

    inaccurate_additional_housings = pd.read_sql(
        """select "id", "CASE_NUMBER" from low_accuracies where "table" = 'dol_h'""",
        con=engine)
    myprint(
        f"There are {len(inaccurate_additional_housings)} inaccurate additional housing rows."
    )

    case_nums_with_no_matches = []
    num_fixed = 0
    # iterate over all inaccurate additional housing case numbers and ids. if a row's corresponding row in job_central is inactive, update the row's fixed column to be true
    for i, job in inaccurate_additional_housings.iterrows():
        job_case_num = job["CASE_NUMBER"]
        central_job = all_case_nums_statuses_df[
            all_case_nums_statuses_df["CASE_NUMBER"] == job_case_num]

        if len(central_job) > 1:
            myprint(
                f"{job_case_num} has {len(central_job)} matching rows in the dataframe with all non dol_h case numbers. This should be impossible."
            )
        elif len(central_job) == 0:
            myprint(
                f"{job_case_num} has no matching rows in the dataframe with all non dol_h case numbers."
            )
            case_nums_with_no_matches.append(job_case_num)
        else:
            job_status = central_job["status"].tolist()[0]
            if job_status not in ["not yet started", "started"]:
                job_id = job["id"]
                make_query(f"""UPDATE low_accuracies SET
                                fixed = true,
                                worksite_fixed_by = 'inactive',
                                housing_fixed_by = 'inactive'
                                WHERE "id" = {job_id}""")
                num_fixed += 1

    # myprint(f"There are {len(case_nums_with_no_matches)} additional housing rows in low_accuracies without a matching central case number.")
    # myprint(f"There are {len(set(case_nums_with_no_matches))} additional housing unique case numbers in low_accuracies without a matching central case number.")
    myprint(
        f"{num_fixed} additional housing rows in low accuracies were marked as fixed."
    )
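
# Hedged sketch of the `make_query` helper used above (an assumption - the real
# implementation isn't shown in this excerpt): a thin wrapper that executes one
# raw SQL statement against the shared engine.
def make_query(query_string):
    with engine.connect() as connection:
        connection.execute(text(query_string))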
def overwrite_feature(username, password, new_df, old_feature_name):
    gis = GIS(url='https://www.arcgis.com',
              username=username,
              password=password)
    # print("Logged in as " + str(gis.properties.user.username))

    csv_file_name = f"{old_feature_name}.csv"
    new_df.to_csv(csv_file_name, index=False)

    # get the first search result
    old_jobs_item = gis.content.search(f"title: {old_feature_name}",
                                       'Feature Layer')[0]
    old_feature_layer = FeatureLayerCollection.fromitem(old_jobs_item)

    myprint(
        f"Overwriting feature layer.... there will now be {len(new_df)} features."
    )
    old_feature_layer.manager.overwrite(csv_file_name)
    myprint('Done overwriting feature layer.')

    os.remove(csv_file_name)
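
# Hedged usage example (placeholder credentials; 'H2Data' is the layer name used
# elsewhere in this code, the dataframe name is illustrative):
# overwrite_feature(ARCGIS_USERNAME, ARCGIS_PASSWORD, full_layer_df, "H2Data")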
def append_excels_to_their_tables():
    year = input("What year is it? (eg: 2020)\n").strip()
    quarter = int(input("What quarter is it? (enter 1, 2, 3, or 4)\n").strip())
    input(f"Ok, appending excel files for fiscal year {year}Q{quarter}. If this is correct press any key, othewise press control + c to start over.")

    files_to_tables_map = pd.read_excel(os.path.join(os.getcwd(), '..',  "excel_files/dol_table_file_mappings.xls"))
    myprint(f"Will be appending {len(files_to_tables_map)} files to their respective tables.")

    for i, pair in files_to_tables_map.iterrows():
        file_name, table_name = pair["file_name"].strip(), pair["table_name"].strip()
        myprint(f"Appending {file_name} to {table_name}.\n")
        try:
            append_excel_to_table(file_name, table_name, year, quarter)
            myprint(f"Success!")
        except Exception as error:
            myprint(f"That didn't work, here's the error message:\n{str(error)}\n")
def add_worksites_to_postgres():
    file_path = "dol_data/" + input(
        "Check that the additional worksites file is in a folder named `dol_data` in the `scripts` folder. (If it isn't, exit this using control + c then re-run this script once you've done it.)  Now enter the name of the file (this is case sensitive).\n"
    ).strip()
    year = input("What year is it? (eg: 2020)\n").strip()
    quarter = int(input("What quarter is it? (enter 1, 2, 3, or 4)\n").strip())
    input(
        f"Ok, adding worksites from {file_path} for fiscal year {year}Q{quarter}. If this is correct press any key, othewise press control + c to start over."
    )

    worksites = pd.read_excel(file_path)

    worksites = manage_worksites(worksites, year, quarter)
    myprint(f"Adding {len(worksites)} rows to additional_worksites table.")

    worksites.to_sql("additional_worksites",
                     engine,
                     if_exists='append',
                     index=False)

    if int(quarter) != 1:
        make_query(
            f"""DELETE FROM additional_worksites WHERE fy = '{year}Q{int(quarter) - 1}'"""
        )
def append_excel_to_table(file_name, table_name, year, quarter):
    if table_name in all_table_names:
        if check_for_new_columns(file_name, table_name):
            raise Exception(f"Since there are columns in {file_name} that aren't in {table_name}, we can't continue this process or it will fail. Either add the missing columns to PostgreSQL or change the column names in the excel file to match ones that already exist in Postgres.")
    else:
        myprint(f"The table {table_name} doesn't exist yet, so it will be added.")

    df = pd.read_excel(f"dol_data/{file_name}")
    df["fy"] = f"{year}Q{quarter}"

    myprint(f"Adding {len(df)} rows from {file_name} to {table_name}.")

    df.to_sql(table_name, engine, if_exists='append', index=False)
    myprint("Done adding rows.")

    if int(quarter) != 1:
        make_query(f"""DELETE FROM {table_name} WHERE fy = '{year}Q{int(quarter) - 1}'""")



type_conversions = {'fixed': bool, 'housing_fixed_by': str, 'worksite_fixed_by': str, "HOUSING_POSTAL_CODE": str, "WORKSITE_POSTAL_CODE": str}
accurate_new_jobs = pd.read_excel(os.path.join(os.getcwd(), '..',  "excel_files/accurate_dol_geocoded.xlsx"), converters=type_conversions)
inaccurate_new_jobs = pd.read_excel(os.path.join(os.getcwd(), '..',  "excel_files/inaccurate_dol_geocoded.xlsx"),  converters=type_conversions)
accurate_old_jobs = pd.read_excel(os.path.join(os.getcwd(), '..',  "excel_files/accurates_geocoded.xlsx"), converters=type_conversions)
inaccurate_old_jobs = pd.read_excel(os.path.join(os.getcwd(), '..',  "excel_files/inaccurates_geocoded.xlsx"),  converters=type_conversions)



myprint("Start of add housing test", is_red="red")
housing = pd.read_excel(os.path.join(os.getcwd(), '..', 'excel_files/housing_addendum.xlsx'))
accurate_housing, inaccurate_housing = geocode_manage_split_housing(housing, 2020, 3)
class TestAddHousing(unittest.TestCase):
    def test_length_and_table_column(self):
        self.assertEqual(len(accurate_housing), 9)
        self.assertEqual(len(inaccurate_housing), 1)
        self.assertTrue((accurate_housing["table"] == "dol_h").all() and (inaccurate_housing["table"] == "dol_h").all())
    def test_accuracies(self):
        accurates_are_accurate = (((accurate_housing["housing accuracy"] >= 0.7) & (~(accurate_housing["housing accuracy type"].isin(bad_accuracy_types)))) | (~accurate_housing["HOUSING_STATE"].str.lower().isin(helpers.our_states))).all()
        self.assertTrue(accurates_are_accurate)
        inaccurates_are_inaccurate = ((inaccurate_housing["housing accuracy"].isnull()) | (inaccurate_housing["housing accuracy"] < 0.7) | (inaccurate_housing["housing accuracy type"].isin(bad_accuracy_types))).all()
        self.assertTrue(inaccurates_are_inaccurate)
    def test_fy_column(self):
        self.assertTrue(all(accurate_housing["fy"] == "2020Q3"))
        self.assertTrue(all(inaccurate_housing["fy"] == "2020Q3"))
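
# Standard unittest entry point, assuming these tests are meant to be run directly.
if __name__ == "__main__":
    unittest.main()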
Example #10
def send_fixes_to_postgres():
    fixed = pd.read_sql_query('select * from low_accuracies where fixed=true',
                              con=engine)

    if len(fixed) == 0:
        myprint("No jobs have been fixed.")
        return

    myprint(f"{len(fixed)} jobs have been fixed.")
    central, housing, failures = implement_fixes(fixed)
    assert len(central) + len(housing) + len(failures) == len(fixed)
    myprint(f"{len(central)} rows moving from low_accuracies to job_central.")
    myprint(
        f"{len(housing)} rows moving from low_accuracies to additional housing."
    )
    myprint(f"{len(failures)} failed fixes.")

    # saving appropriate fixes to previously_fixed table
    prev_fixed_columns = [
        "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_POSTAL_CODE",
        "HOUSING_STATE", "fixed", "housing accuracy", "housing accuracy type",
        "housing_fixed_by", "housing_lat", "housing_long", "notes"
    ]
    fixed_by_vals_to_save_fixes_for = ["coordinates", "address", "impossible"]

    central_for_prev_fixed_table = central[central["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    housing_for_prev_fixed_table = housing[housing["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    for_prev_fixed_table = central_for_prev_fixed_table.append(
        housing_for_prev_fixed_table)

    if not for_prev_fixed_table.empty:
        myprint(
            f"There are {len(for_prev_fixed_table)} rows to add to the previously_fixed table."
        )
        for_prev_fixed_table["initial_address"] = for_prev_fixed_table.apply(
            lambda job: handle_null(job["HOUSING_ADDRESS_LOCATION"]) +
            handle_null(job["HOUSING_CITY"]) + handle_null(job[
                "HOUSING_STATE"]) + handle_null(job["HOUSING_POSTAL_CODE"]),
            axis=1)
        for_prev_fixed_table = for_prev_fixed_table.drop_duplicates(
            subset='initial_address', keep="last")
        for_prev_fixed_table.to_sql("previously_fixed",
                                    engine,
                                    if_exists='append',
                                    index=False)
        myprint(f"All rows successfully added to the previously_fixed table.")
    else:
        myprint(f"No rows to add to the previously_fixed table.")

    # adding fixes to appropriate tables and deleting appropriate rows from low_accuracies
    central.to_sql('job_central', engine, if_exists='append', index=False)
    housing.to_sql('additional_housing',
                   engine,
                   if_exists='append',
                   index=False)
    failures.to_sql('low_accuracies', engine, if_exists='append', index=False)
    make_query("delete from low_accuracies where fixed=true")

    myprint(
        f"Done implementing fixes. There were {len(failures)} failed fixes out of {len(fixed)} attempts."
    )
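
# Hedged sketch of the `handle_null` helper used above (an assumption - not
# shown in this excerpt): mirror the COALESCE(col, '') concatenation used in
# fix_previously_fixed so that null address parts contribute empty strings.
def handle_null(value):
    if pd.isnull(value):
        return ""
    return str(value)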
def overwrite_our_feature():

    # get all accurate h2a jobs that are in one of our states and have housing coordinates
    h2a_df = pd.read_sql("""SELECT * FROM job_central WHERE
                        "Visa type" = 'H-2A' AND
                        LOWER("WORKSITE_STATE") IN
                        ('texas', 'tx', 'kentucky', 'ky', 'tennessee', 'tn', 'arkansas', 'ar', 'louisiana', 'la', 'mississippi', 'ms', 'alabama', 'al') AND
                        housing_lat IS NOT NULL AND housing_long IS NOT NULL""",
                         con=engine)

    # get all h2a jobs from job_central that are in one of our states and do not have housing coordinates
    h2a_no_housing_df = pd.read_sql("""SELECT * FROM job_central WHERE
                                       "Visa type" = 'H-2A' AND
                                       LOWER("WORKSITE_STATE") IN
                                       ('texas', 'tx', 'kentucky', 'ky', 'tennessee', 'tn', 'arkansas', 'ar', 'louisiana', 'la', 'mississippi', 'ms', 'alabama', 'al') AND
                                       (housing_lat IS NULL OR housing_long IS NULL)""",
                                    con=engine)

    # for ArcGIS map purposes because color-coding is based on this column
    h2a_df["TOTAL_OCCUPANCY"].fillna(600, inplace=True)
    h2a_no_housing_df["TOTAL_OCCUPANCY"].fillna(600, inplace=True)

    # get all forestry h2b jobs from job_central that are in one of our states
    forestry_h2b_in_our_states_df = pd.read_sql(
        """SELECT * FROM job_central WHERE
                                                   "Visa type" = 'H-2B' AND
                                                   "SOC_CODE" IN ('45-4011.00', '45-4011') AND
                                                   LOWER("WORKSITE_STATE") IN
                                                   ('texas', 'tx', 'kentucky', 'ky', 'tennessee', 'tn', 'arkansas', 'ar', 'louisiana', 'la', 'mississippi', 'ms', 'alabama', 'al')
                                                    """,
        con=engine)

    # set housing coordinates of h2b jobs and h2a jobs without housing to their worksite coordinates so that ArcGIS will map them
    forestry_h2b_in_our_states_df[
        "housing_lat"] = forestry_h2b_in_our_states_df.apply(
            lambda job: job["worksite_lat"], axis=1)
    forestry_h2b_in_our_states_df[
        "housing_long"] = forestry_h2b_in_our_states_df.apply(
            lambda job: job["worksite_long"], axis=1)
    h2a_no_housing_df["housing_lat"] = h2a_no_housing_df.apply(
        lambda job: job["worksite_lat"], axis=1)
    h2a_no_housing_df["housing_long"] = h2a_no_housing_df.apply(
        lambda job: job["worksite_long"], axis=1)

    # combine h2a and forestry data
    h2a_and_h2b_df = h2a_df.append(forestry_h2b_in_our_states_df)
    h2a_housing_and_no_housing_and_h2b_df = h2a_and_h2b_df.append(
        h2a_no_housing_df)

    # get all additional housing rows that are in one of our states and that have a matching case number in job_central
    additional_housing_df = pd.read_sql(
        """SELECT * FROM additional_housing WHERE
                                           "CASE_NUMBER" IN
                                                (SELECT "CASE_NUMBER" FROM job_central WHERE
                                                "Visa type" = 'H-2A' AND
                                                LOWER("WORKSITE_STATE") IN
                                                ('texas', 'tx', 'kentucky', 'ky', 'tennessee', 'tn', 'arkansas', 'ar', 'louisiana', 'la', 'mississippi', 'ms', 'alabama', 'al'))
                                                 """,
        con=engine)

    myprint(f"There will be {len(h2a_df)} normal H2A jobs in the feature.")
    myprint(
        f"There will be {len(h2a_no_housing_df)} H2A jobs mapped using their worksites in the feature."
    )
    myprint(
        f"There will be {len(forestry_h2b_in_our_states_df)} forestry H2B jobs in the feature."
    )
    myprint(
        f"There will be {len(additional_housing_df)} additional housing rows in the feature."
    )

    # get columns that are in the h2a data but not the additional housing data and add each one to the additional housing dataframe
    cols_only_in_h2a = set(h2a_df.columns) - set(additional_housing_df.columns)
    for column in cols_only_in_h2a:
        additional_housing_df[column] = None

    # for each additional housing row, find its matching row in job_central and insert the data about that case number that is in job_central but not the additional_housing row
    for i, row in additional_housing_df.iterrows():
        case_number = row["CASE_NUMBER"]
        job_in_h2a = h2a_df[h2a_df["CASE_NUMBER"] == case_number]

        if len(job_in_h2a) == 1:
            for column in cols_only_in_h2a:
                additional_housing_df.at[i, column] = get_value(
                    job_in_h2a, column)
        else:
            print_red_and_email(
                f"{case_number} is in additional_housing, so I looked for it in job_central, and found a number of matching rows not equal to 1.",
                "Overwriting ArcGIS Layer")

    # append completed additional_housing df to the h2a and forestry data
    full_layer = h2a_housing_and_no_housing_and_h2b_df.append(
        additional_housing_df)

    overwrite_feature(ARCGIS_USERNAME, ARCGIS_PASSWORD, full_layer, 'H2Data')
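
# Hedged sketch of the `get_value` helper used above (an assumption - not shown
# in this excerpt): pull the single value for `column` out of a one-row
# dataframe, mirroring the `.tolist()[0]` idiom used elsewhere in this code.
def get_value(single_row_df, column):
    return single_row_df[column].tolist()[0]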
def write_dataframe(sheet, df):
    df.fillna("", inplace=True)
    sheet.update([df.columns.values.tolist()] + df.values.tolist())
    myprint(f"Wrote dataframe to {sheet}")
def open_sheet(client, file_name="", sheet_name=""):
    sheet = client.open(file_name).worksheet(sheet_name)
    myprint(f"Opened worksheet '{sheet_name}' in file '{file_name}'.")
    return sheet
Example #14
def update_database():
    latest_jobs = requests.get(most_recent_run_url).json()

    # use these two lines if you're updating using a local csv file
    # latest_jobs = pd.read_csv("dataset_apify-dol-actor-AJaB_2022-05-03_22-00-16-286.csv").drop(columns=["Unnamed: 0"])
    # latest_jobs = latest_jobs.to_dict('records')

    if not latest_jobs:
        myprint("No new jobs.")
        return
    myprint(f"There are {len(latest_jobs)} new jobs.")

    # use this version of parse function if updating data using a local csv file rather than an apify scraper run
    # def parse(job):
    #     column_mappings_dict = column_name_mappings
    #     columns_names_dict = {"Section A": "Job Info", "Section C": "Place of Employment Info", "Section D":"Housing Info"}
    #     parsed_job = {}
    #     for key in job:
    #         if "Section A" in key or "Section C" in key or "Section D" in key:
    #             section = key.split("/")[0]
    #             key_name = key.replace(section, columns_names_dict[section])
    #             if key_name in column_mappings_dict:
    #                 parsed_job[column_mappings_dict[key_name]] = job[key]
    #             else:
    #                 parsed_job[key_name] = job[key]
    #         else:
    #             if key in column_mappings_dict:
    #                 parsed_job[column_mappings_dict[key]] = job[key]
    #             else:
    #                 parsed_job[key] = job[key]
    #
    #     return parsed_job

    # given a dictionary `job` from the scraper data, returns another dictionary with keys modified to match those in postgres
    def parse(job):
        # dictionary mapping apify column names to postgres column names
        column_mappings_dict = column_name_mappings
        columns_names_dict = {
            "Section A": "Job Info",
            "Section C": "Place of Employment Info",
            "Section D": "Housing Info"
        }
        parsed_job = {}
        for key in job:
            if key not in ["Section D", "Section C", "Section A"]:
                if key in column_mappings_dict:
                    # use the postgres column name
                    parsed_job[column_mappings_dict[key]] = job[key]
                else:
                    parsed_job[key] = job[key]
            else:
                inner_dict = job[key]
                for inner_key in inner_dict:
                    key_name = columns_names_dict[key] + "/" + inner_key
                    if key_name in column_mappings_dict:
                        # use the postgres column name
                        parsed_job[column_mappings_dict[
                            key_name]] = inner_dict[inner_key]
                    else:
                        parsed_job[key_name] = inner_dict[inner_key]
        return parsed_job

    # get date of run for the last actor run
    date_of_run = requests.get(date_of_run_url).json()["data"]["finishedAt"]

    # adds calculated fields to the dictionary `job`
    def add_necessary_columns(job):
        # add source and date of run column
        job["Source"], job["table"] = "Apify", "central"
        job["Date of run"] = date_of_run

        if helpers.h2a_or_h2b(job) == "H-2A":
            job["Visa type"] = "H-2A"
            zip_code_columns = [
                "EMPLOYER_POSTAL_CODE", "WORKSITE_POSTAL_CODE",
                "Place of Employment Info/Postal Code", "HOUSING_POSTAL_CODE"
            ]

            if "TOTAL_WORKERS_H-2A_REQUESTED" in job:
                job["TOTAL_WORKERS_NEEDED"] = job[
                    "TOTAL_WORKERS_H-2A_REQUESTED"]
            workers_needed, occupancy = job["TOTAL_WORKERS_NEEDED"], job.get(
                "TOTAL_OCCUPANCY", None)
            if workers_needed and occupancy:
                if workers_needed > occupancy:
                    job["W to H Ratio"] = "W>H"
                elif workers_needed < occupancy:
                    job["W to H Ratio"] = "W<H"
                else:
                    job["W to H Ratio"] = "W=H"

            # fix case number if it's malformed in this way (sometimes they have been in the past)
            if job["CASE_NUMBER"][0] == "3":
                job["CASE_NUMBER"] = "H-" + job["CASE_NUMBER"]

        elif helpers.h2a_or_h2b(job) == "H-2B":
            job["Visa type"] = "H-2B"
            zip_code_columns = ["EMPLOYER_POSTAL_CODE", "WORKSITE_POSTAL_CODE"]

        else:
            job["Visa type"], zip_code_columns = "", []

        # fix zip code columns
        for column in zip_code_columns:
            if column in job:
                fixed_zip_code = helpers.fix_zip_code(job[column])
                job[column] = fixed_zip_code

        return job

    # parse each job, add necessary columns to each job, drop duplicate case numbers
    parsed_jobs = [parse(job) for job in latest_jobs]
    full_jobs = [add_necessary_columns(job) for job in parsed_jobs]
    full_jobs_df = pd.DataFrame(full_jobs).drop_duplicates(
        subset="CASE_NUMBER", keep="last")

    # convert date columns to type datetime
    date_columns = [
        "RECEIVED_DATE", "EMPLOYMENT_END_DATE", "EMPLOYMENT_BEGIN_DATE",
        "Date of run"
    ]
    for column in date_columns:
        full_jobs_df[column] = pd.to_datetime(full_jobs_df[column],
                                              errors='coerce')

    # add data as is to raw_scraper_jobs table
    full_raw_jobs = full_jobs_df.drop(columns=["table"])
    full_raw_jobs.to_sql("raw_scraper_jobs",
                         engine,
                         if_exists="append",
                         index=False)
    myprint("Uploaded raw scraper jobs to PostgreSQL")

    # geocode, split by accuracy, merge existing data with new data
    new_accurate_jobs, new_inaccurate_jobs = helpers.geocode_and_split_by_accuracy(
        full_jobs_df)
    helpers.merge_all_data(new_accurate_jobs, new_inaccurate_jobs)
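
# Illustrative example (made-up values) of how `parse` flattens a nested section:
# keys inside "Section D" become "Housing Info/<key>" and are then renamed to
# their Postgres column names when present in column_name_mappings.
# parse({"Section D": {"Postal Code": "38501"}})
#   -> {"Housing Info/Postal Code": "38501"}  (or the mapped Postgres name)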
Example #15
def update_database():
    latest_jobs = requests.get(most_recent_run_url).json()

    # use these two lines if you're updating using a local csv file
    # latest_jobs = pd.read_csv("Since_10_23_to_10_29.csv", encoding='unicode_escape').drop(columns=["Unnamed: 0"])
    # latest_jobs = latest_jobs.to_dict('records')

    if not latest_jobs:
        myprint("No new jobs.")
        return
    myprint(f"There are {len(latest_jobs)} new jobs.")

    # use this version of parse function if using a local csv file
    # def parse(job):
    #     column_mappings_dict = column_name_mappings
    #     columns_names_dict = {"Section A": "Job Info", "Section C": "Place of Employment Info", "Section D":"Housing Info"}
    #     parsed_job = {}
    #     for key in job:
    #         if "Section A" in key or "Section C" in key or "Section D" in key:
    #             section = key.split("/")[0]
    #             key_name = key.replace(section, columns_names_dict[section])
    #             if key_name in column_mappings_dict:
    #                 parsed_job[column_mappings_dict[key_name]] = job[key]
    #             else:
    #                 parsed_job[key_name] = job[key]
    #         else:
    #             if key in column_mappings_dict:
    #                 parsed_job[column_mappings_dict[key]] = job[key]
    #             else:
    #                 parsed_job[key] = job[key]
    #
    #     return parsed_job

    def parse(job):
        column_mappings_dict = column_name_mappings
        columns_names_dict = {
            "Section A": "Job Info",
            "Section C": "Place of Employment Info",
            "Section D": "Housing Info"
        }
        parsed_job = {}
        for key in job:
            if key not in ["Section D", "Section C", "Section A"]:
                if key in column_mappings_dict:
                    parsed_job[column_mappings_dict[key]] = job[key]
                else:
                    parsed_job[key] = job[key]
            else:
                inner_dict = job[key]
                for inner_key in inner_dict:
                    key_name = columns_names_dict[key] + "/" + inner_key
                    if key_name in column_mappings_dict:
                        parsed_job[column_mappings_dict[
                            key_name]] = inner_dict[inner_key]
                    else:
                        parsed_job[key_name] = inner_dict[inner_key]
        return parsed_job

    date_of_run = requests.get(date_of_run_url).json()["data"]["finishedAt"]

    def add_necessary_columns(job):
        # add source and date of run column
        job["Source"], job["table"] = "Apify", "central"
        job["Date of run"] = date_of_run
        # check if job is h2a
        if helpers.h2a_or_h2b(job) == "H-2A":
            job["Visa type"] = "H-2A"
            zip_code_columns = [
                "EMPLOYER_POSTAL_CODE", "WORKSITE_POSTAL_CODE",
                "Place of Employment Info/Postal Code", "HOUSING_POSTAL_CODE"
            ]
            if "TOTAL_WORKERS_H-2A_REQUESTED" in job:
                job["TOTAL_WORKERS_NEEDED"] = job[
                    "TOTAL_WORKERS_H-2A_REQUESTED"]
            workers_needed, occupancy = job["TOTAL_WORKERS_NEEDED"], job.get(
                "TOTAL_OCCUPANCY", None)
            if workers_needed and occupancy:
                if workers_needed > occupancy:
                    job["W to H Ratio"] = "W>H"
                elif workers_needed < occupancy:
                    job["W to H Ratio"] = "W<H"
                else:
                    job["W to H Ratio"] = "W=H"
            if job["CASE_NUMBER"][0] == "3":
                job["CASE_NUMBER"] = "H-" + job["CASE_NUMBER"]
        # check if job is h2b
        elif helpers.h2a_or_h2b(job) == "H-2B":
            job["Visa type"] = "H-2B"
            zip_code_columns = ["EMPLOYER_POSTAL_CODE", "WORKSITE_POSTAL_CODE"]
        else:
            job["Visa type"], zip_code_columns = "", []
        # fix zip code columns
        for column in zip_code_columns:
            if column in job:
                fixed_zip_code = helpers.fix_zip_code(job[column])
                job[column] = fixed_zip_code
        return job

    # parse each job, add all columns to each job, append this to raw scraper data and push back to postgres
    parsed_jobs = [parse(job) for job in latest_jobs]
    full_jobs = [add_necessary_columns(job) for job in parsed_jobs]
    full_jobs_df = pd.DataFrame(full_jobs).drop_duplicates(
        subset="CASE_NUMBER", keep="last")

    date_columns = [
        "RECEIVED_DATE", "EMPLOYMENT_END_DATE", "EMPLOYMENT_BEGIN_DATE",
        "Date of run"
    ]
    for column in date_columns:
        full_jobs_df[column] = pd.to_datetime(full_jobs_df[column],
                                              errors='coerce')

    full_raw_jobs = full_jobs_df.drop(columns=["table"])
    full_raw_jobs.to_sql("raw_scraper_jobs",
                         engine,
                         if_exists="append",
                         index=False)
    myprint("Uploaded raw scraper jobs to PostgreSQL")

    # geocode, split by accuracy, get old data, merge old with new data, sort data
    new_accurate_jobs, new_inaccurate_jobs = helpers.geocode_and_split_by_accuracy(
        full_jobs_df)

    helpers.merge_all_data(new_accurate_jobs, new_inaccurate_jobs)
Example #16
def send_fixes_to_postgres():
    fixed = pd.read_sql_query('select * from low_accuracies where fixed=true',
                              con=engine)

    if len(fixed) == 0:
        myprint("No jobs have been fixed.")
        return

    myprint(f"{len(fixed)} jobs have been fixed.")
    central, housing, failures = implement_fixes(fixed)
    assert len(central) + len(housing) + len(failures) == len(fixed)
    myprint(f"{len(central)} rows moving from low_accuracies to job_central.")
    myprint(
        f"{len(housing)} rows moving from low_accuracies to additional housing."
    )
    myprint(f"{len(failures)} failed fixes.")

    # saving appropriate fixes to previously_fixed table
    prev_fixed_columns = [
        "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY", "HOUSING_POSTAL_CODE",
        "HOUSING_STATE", "fixed", "housing accuracy", "housing accuracy type",
        "housing_fixed_by", "housing_lat", "housing_long", "notes"
    ]
    fixed_by_vals_to_save_fixes_for = ["coordinates", "address", "impossible"]

    central_for_prev_fixed_table = central[central["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    housing_for_prev_fixed_table = housing[housing["housing_fixed_by"].isin(
        fixed_by_vals_to_save_fixes_for)][prev_fixed_columns]
    for_prev_fixed_table = central_for_prev_fixed_table.append(
        housing_for_prev_fixed_table)

    if not for_prev_fixed_table.empty:
        myprint(
            f"There are {len(for_prev_fixed_table)} rows to add to the previously_fixed table."
        )
        for_prev_fixed_table["initial_address"] = for_prev_fixed_table.apply(
            lambda job: handle_null(job["HOUSING_ADDRESS_LOCATION"]) +
            handle_null(job["HOUSING_CITY"]) + handle_null(job[
                "HOUSING_STATE"]) + handle_null(job["HOUSING_POSTAL_CODE"]),
            axis=1)
        for_prev_fixed_table = for_prev_fixed_table.drop_duplicates(
            subset='initial_address', keep="last")
        for_prev_fixed_table.to_sql("previously_fixed",
                                    engine,
                                    if_exists='append',
                                    index=False)
        myprint(f"All rows successfully added to the previously_fixed table.")
    else:
        myprint(f"No rows to add to the previously_fixed table.")

    # adding fixes to appropriate tables and deleting appropriate rows from low_accuracies
    central.to_sql('job_central', engine, if_exists='append', index=False)
    housing.to_sql('additional_housing',
                   engine,
                   if_exists='append',
                   index=False)

    # adding failed fixes back to low_accuracies - can't just use to_sql because of primary key
    for i, job in failures.iterrows():
        query = text("""UPDATE low_accuracies SET
                    (fixed, housing_lat, housing_long, "HOUSING_ADDRESS_LOCATION", "HOUSING_CITY",
                    "HOUSING_STATE", "HOUSING_POSTAL_CODE", notes, housing_fixed_by)
                     =
                     (:fixed, :lat, :long, :address, :city, :state, :zip, :notes, :fixed_by)
                     WHERE id = :id""")

        with engine.connect() as connection:
            connection.execute(query,
                               fixed=job["fixed"],
                               lat=job["housing_lat"],
                               long=job["housing_long"],
                               address=job["HOUSING_ADDRESS_LOCATION"],
                               city=job["HOUSING_CITY"],
                               state=job["HOUSING_STATE"],
                               zip=job["HOUSING_POSTAL_CODE"],
                               notes=job["notes"],
                               fixed_by=job["housing_fixed_by"],
                               id=job["id"])

    # all rows that had fixed as true are now either in job_central or in low_accuracies as a new row with fixed_by as failed and fixed as false
    make_query("delete from low_accuracies where fixed=true")

    myprint(
        f"Done implementing fixes. There were {len(failures)} failed fixes out of {len(fixed)} attempts."
    )