def init_program():
    # Load env vars
    load_dotenv()

    # init logging
    program_start_time = utc_now()
    timezone = program_start_time.tzinfo
    logs_config()
    logging.info(f"Begin program run: {program_start_time} ({timezone} time)")

    # create or clean download dir
    if DIR_OUTPUT.is_dir():
        # delete files from previous run
        delete_dir_contents(DIR_OUTPUT)
    else:
        logging.info("Data directory doesn't exist - building")
        DIR_OUTPUT.mkdir()

    # Set pandas options
    pandas_opts()

    # this fixes a strange bug with botocore/moto not recognizing AWS credentials: https://github.com/spulec/moto/issues/1941
    os.environ["AWS_ACCESS_KEY_ID"] = os.environ.get("KEY_ID")
    os.environ["AWS_SECRET_ACCESS_KEY"] = os.environ.get("SECRET_KEY_ID")

    # set Altair themes
    alt.themes.register("spotlight", spotlight)
    alt.themes.enable("spotlight")

    return program_start_time
    def setUp(self) -> None:
        # start logging
        logs_config(paths["logs_config_test"])
        # delete previous test output files
        logging.info(f"Deleting {mock_dirs['payload_email']} if it exists")
        if mock_dirs["payload_email"].is_dir():
            rmtree(mock_dirs["payload_email"])
        # rebuild directory
        mock_dirs["payload_email"].mkdir(parents=True, exist_ok=True)

        # SET PANDAS OPTIONS FOR PRINT DISPLAY
        pd.set_option("display.max_columns", 20)
        pd.set_option("display.width", 2000)
        pd.set_option("display.max_rows", 700)
def main():

    # init logging
    scrape_start_time = datetime.now()
    logs_config()
    logging.info('Beginning scrape')

    # SET VARS
    # list of target filing IDs
    list_of_filing_ids = [
        "336344",  # samuel doctor
        "331887",  # phil heasley
        "332791",  # Andre Del Valle
        "209028"  # thomas murt
    ]
    sleep_time = 5

    # create or clean up PDF download dir
    if DIR_DATA.is_dir():
        # delete files from previous run
        delete_dir_contents(DIR_DATA)
    else:
        DIR_DATA.mkdir()

    # get pdf
    for filing_id in list_of_filing_ids:
        get_pdf(filing_id)
        # sleep in order to avoid overloading Ethics Commission website
        logging.info(f'Sleeping for {sleep_time} seconds...')
        time.sleep(sleep_time)
        logging.info('...Finished sleeping!')

    # complete
    scrape_end_time = datetime.now()
    scrape_duration = (scrape_end_time - scrape_start_time).total_seconds()
    logging.info(f'Total scrape time: {round(scrape_duration / 60, 2)} minutes')
    logging.info(
        f'Time per item scraped: {round(scrape_duration / len(list_of_filing_ids), 2)} sec'
    )
    logging.info("Scrape complete")
def main():

    # Load env vars
    load_dotenv()

    # Init logging
    logs_config()
    logging.info("Begin program")

    # delete data from previous runs
    clean_data()

    # get google sheet config
    with open(PATH_CONFIG_GSHEETS) as f:
        config_sheets = json.load(f)

    for document in config_sheets:
        sheets = document.get("sheets", [])
        document_name = document.get("document_name")
        document_id = document.get("document_id")
        move_s3 = document.get("move_s3", False)
        bucket_name = document.get("bucket_name", None)
        bucket_dest_dir = document.get("bucket_dest_dir", None)
        move_local = document.get("move_local", False)
        logging.info(f"Extracting files from document: {document_name}")
        for sheet in sheets:
            sheet_name = sheet["name"]
            output_filename = f"{sheet_name}.csv"
            download_sheet(sheet_name, document_id, sheet["gid"], output_filename)
            if move_s3:
                if bucket_name and bucket_dest_dir:
                    copy_to_s3(output_filename, bucket_name, bucket_dest_dir)
                else:
                    raise ValueError('Missing bucket_name or bucket_dest_dir')
            if move_local:
                copy_to_local(output_filename)
            logging.info("Sleeping to avoid google rate limiting issues...")
            sleep(10)
            logging.info("Waking!")
import unittest
from unittest import mock
from pathlib import Path
from shutil import rmtree
from dotenv import load_dotenv

# project modules
from modules.download import download_pdf
from modules.initialize import initialize_driver
from locations import dirs, paths, root_dir, test_dir
from logs.config.logging import logs_config

# LOGGING
logs_config(paths["logs_config_test"])

# ENV
load_dotenv(root_dir / ".dev.env")

# MOCK VARS
mock_dirs = {
    "pdfs": (test_dir / "output/pdfs").resolve()
}  # NOTE: paths must be resolved to absolute paths, otherwise you'll have problems

# MOCK FUNCS
@mock.patch.dict(dirs, mock_dirs, clear=True)
def initialize_test_driver():
    """ By mocking the directory paths we force webdriver to set
    test/output/pdfs as default download directory """
    return initialize_driver()
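
# Hedged sketch of a test case built on the pieces above. The docket URL and
# docket number passed to download_pdf are placeholders, and the sketch assumes
# download_pdf(driver, url, docketnum) saves the PDF into the mocked pdfs dir.
class TestDownloadPdf(unittest.TestCase):
    def setUp(self) -> None:
        # start each test with a clean download directory
        if mock_dirs["pdfs"].is_dir():
            rmtree(mock_dirs["pdfs"])
        mock_dirs["pdfs"].mkdir(parents=True, exist_ok=True)
        self.driver = initialize_test_driver()

    def tearDown(self) -> None:
        self.driver.quit()

    @mock.patch.dict(dirs, mock_dirs, clear=True)
    def test_download_pdf_saves_file(self):
        download_pdf(
            self.driver,
            "https://example.com/placeholder-docket",  # placeholder URL
            "CP-00-CR-0000000-2019",  # placeholder docket number
        )
        self.assertTrue(any(mock_dirs["pdfs"].iterdir()))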

def main():

    # load environ vars
    load_dotenv()

    # init logging
    logs_config()

    # create or clean temp dirs
    if DIR_DATA.is_dir():
        # delete files from previous run
        delete_dir_contents(DIR_DATA)
    else:
        DIR_DATA.mkdir()

    # init driver
    logging.info("Begin scrape")
    driver = initialize_driver()

    search_terms = [
        {
            "search_name": "House - paper forms",
            "input_fields": [
                {"field": "Template", "value": "SFI Template"},
                {"field": "BatesBatch", "value": "*HR*"},
                {"field": "Year", "value": "2019"},
            ],
        },
        {
            "search_name": "Senate - paper forms",
            "input_fields": [
                {"field": "Template", "value": "SFI Template"},
                {"field": "BatesBatch", "value": "*SN*"},
                {"field": "Year", "value": "2019"},
            ],
        },
        {
            "search_name": "House - web forms",
            "input_fields": [
                {"field": "Template", "value": "Web State of Financial Interests Form"},
                {"field": "03-05 State Entity", "value": "*rep*"},
                {"field": "07 Year", "value": "2019"},
            ],
        },
        {
            "search_name": "Senate - web forms",
            "input_fields": [
                {"field": "Template", "value": "Web State of Financial Interests Form"},
                {"field": "03-05 State Entity", "value": "*sen*"},
                {"field": "07 Year", "value": "2019"},
            ],
        },
    ]

    """
    TODO:
    
    > loop over each page of results:
        > loop over each row on page:
            > scrape page
    
    /// pseudo code ///
        
    driver open "https://www.ethicsrulings.pa.gov/WebLink/Search.aspx?dbid=0&repo=EthicsLF8"
    
    for search in search_terms:
        input search_terms 
        click submit
        
        # page search loop
        page_count = 1
        while True:
            wait for page results to load

            # row search loop
            row_count = 1
            while True:
                try:
                    click filer's name based on row_count
                    wait for filer page to load
                    scrape_page()
                    row_count += 1
                except NoSuchElementException:
                    logging.info("No more rows found")
                    break  # end row search loop
            
            try: 
                page_count += 1
                click page element based on page_count
            
            except NoSuchElementException:
                logging.info("no more pages found")
                break # end page search loop
            
    
    """

    # set vars - placeholder page IDs for testing only; remove once the loop above is implemented
    start_id = 3
    end_id = 5

    for page_id in range(start_id, end_id):
        scrape_page(driver, page_id)
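
# Hedged sketch of the TODO pseudocode in main() above: loop over every result
# page, then over every result row, scraping each filer. The locators (IDs,
# XPaths, link text) and the wait condition are placeholders, not the real
# WebLink markup, and the form-filling step is still a TODO.
def scrape_search_results(driver, search_terms):
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    driver.get(
        "https://www.ethicsrulings.pa.gov/WebLink/Search.aspx?dbid=0&repo=EthicsLF8"
    )
    for search in search_terms:
        logging.info(f"Running search: {search['search_name']}")
        # TODO: fill in each of search["input_fields"] and click submit

        page_count = 1
        while True:
            # wait for the results list to load (placeholder locator)
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "SearchResults"))
            )

            # row loop: click each filer's name until no more rows are found
            row_count = 1
            while True:
                try:
                    driver.find_element(
                        By.XPATH,
                        f"//table[@id='SearchResults']//tr[{row_count}]//a",
                    ).click()
                    scrape_page(driver, row_count)  # assumes scrape_page handles the loaded filer page
                    driver.back()
                    row_count += 1
                except NoSuchElementException:
                    logging.info("No more rows found")
                    break  # end row loop

            # page loop: advance to the next page of results, if any
            try:
                page_count += 1
                driver.find_element(By.LINK_TEXT, str(page_count)).click()
            except NoSuchElementException:
                logging.info("No more pages found")
                break  # end page loop
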
def main():

    ########################################################################
    #                                 SETUP
    ########################################################################

    # INIT LOGGING
    logs_config()

    # START TIME
    scrape_start_time = datetime.now()

    # GET ENV VARS
    county_list = json.loads(os.environ.get("COUNTY_LIST"))
    target_scrape_day = os.environ.get("TARGET_SCRAPE_DATE",
                                       "yesterday").lower()
    target_scrape_date = (misc.today_date() if target_scrape_day == "today"
                          else misc.yesterday_date())  # convert to date
    scrape_name = os.getenv("SCRAPE_NAME", "Cases Scrape")
    run_env = os.environ.get("ENV_FILE", "DEV")  # defaults to 'DEV'
    rest_api_enabled = os.getenv("REST_API_ENABLED") == "TRUE"
    move_to_s3_enabled = os.getenv("MOVE_TO_S3_ENABLED") == "TRUE"

    # REFORMAT COUNTY LIST
    # Counties must be in title case, otherwise we'll get errors during the scrape
    county_list = [x.title() for x in county_list]

    ########################################################################
    #                          START PROGRAM
    ########################################################################

    misc.print_title("pa court report")
    logging.info("##### PROGRAM START #####")
    logging.info(f"Scrape: {scrape_name}")
    logging.info(f"Running in {run_env} environment\n")

    ########################################################################
    #                          DELETE + CREATE
    ########################################################################

    # DELETE OLD FILES
    # If a temp folder remains from a previous scrape, delete it so stale files don't interfere with this run.
    misc.delete_folders_and_contents(temp_dir)

    # CREATE TEMP DIRECTORIES
    temp_subdirs = [
        dirs[directory] for directory in dirs
        if "/" + str(temp_dir.name) + "/" in str(dirs[directory])
    ]
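    # e.g. if dirs contains {"pdfs": <root>/temp/pdfs, "df_pkl": <root>/temp/df_pkl, ...}
    # (illustrative keys only), every entry whose path sits under temp_dir is
    # selected above and recreated below.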
    misc.create_folders(temp_subdirs)

    ########################################################################
    #                                 SCRAPE
    ########################################################################

    for count, county in enumerate(county_list):

        # SCRAPE UJS SEARCH RESULTS
        # We first get basic docket data from search results, like docket
        # numbers, filing dates, etc. and turn it into a list of dicts.
        docket_list = scrape(county, target_scrape_date)
        if docket_list:

            # DOWNLOAD PDF OF EACH DOCKET
            # Each case is associated with a PDF that has more data. We
            # extract info from those pdfs and add them to our dicts.
            driver = initialize.initialize_driver()
            for docket in docket_list:
                pdf_path = download.download_pdf(driver, docket["url"],
                                                 docket["docketnum"])
                text = convert.convert_pdf_to_text(pdf_path,
                                                   docket["docketnum"])

                # PARSE PDF TEXT
                parsed_data = parse_main(text)
                docket.update(parsed_data)
            driver.quit()

            # CONVERT DICT LIST INTO PANDAS DF
            df = export.convert_dict_into_df(docket_list, county)

            # SAVE BACKUP OF DF FOR DEBUGGING
            df.to_pickle(dirs["df_pkl"] / "df.pkl")

            # CONVERT DF TO CSV
            export.convert_df_to_csv(df)

            # CONVERT DF INTO HTML FOR EMAIL PAYLOAD
            county_intro = "{} in {} County:".format(df.shape[0],
                                                     county)  # count of cases
            html_df = export.convert_df_to_html(df)
            export.save_html_county_payload(county_intro, html_df)

            if count != len(county_list) - 1:
                sleep_after_scrape = 65
                logging.info(
                    f"Sleeping for {sleep_after_scrape} seconds after scrape in order to avoid "
                    f"overloading the UJS server")
                time.sleep(sleep_after_scrape)

        else:
            logging.info(f"No cases were found for {county} County")
            county_intro = f"No cases found for {county} County."
            export.save_html_county_payload(county_intro)

    ########################################################################
    #                        EXPORT & EMAIL FINAL PAYLOAD
    ########################################################################

    # END TIME
    scrape_end_time = datetime.now()

    # OPTIONAL: MOVE JSON FILE TO S3
    if move_to_s3_enabled:
        export.convert_csv_to_json(scrape_end_time, county_list)
        copy_file_to_s3_bucket()

    # OPTIONAL: UPLOAD DATA TO DATABASE
    if rest_api_enabled and paths["payload_csv"].is_file():
        upload.upload_to_rest_api()

    # SEND EMAIL WITH DOCKET DATA
    email.email_notification(scrape_start_time, scrape_end_time,
                             target_scrape_day, county_list)

    # CLOSE PROGRAM
    logging.info("Scrape completed at: {}".format(
        get_datetime_now_formatted()))
    logging.info("Closing program")