def test_convert_empty_dict_into_df(self):
    """Test that empty input still yields a dataframe."""
    empty_dict = {}
    df = convert_dict_into_df(empty_dict, "Dauphin")
    self.assertIsInstance(df, pd.DataFrame)
def test_convert_dict_into_df(self):
    """Test that result is a dataframe."""
    df = convert_dict_into_df(docket_list, "Dauphin")
    self.assertIsInstance(df, pd.DataFrame)
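# For reference, a minimal sketch of the `docket_list` fixture these tests
# consume. The field names below are assumptions for illustration only (the
# real fixture's schema may differ); `url` and `docketnum` are the keys the
# scraper code references.
docket_list = [
    {
        "docketnum": "MJ-00000-CR-0000000-2021",  # hypothetical docket number
        "url": "https://example.com/docket.pdf",  # placeholder URL
        "filing_date": "2021-01-01",  # hypothetical field
    }
]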
def test_html_file_is_created(self):
    """Test that an HTML file is generated."""
    # Create df
    df = convert_dict_into_df(docket_list, "Dauphin")
    # Create styled df
    styled_df = convert_df_to_html(df)
    # Wrap styled df with more HTML
    save_html_county_payload("This is an introduction for the email", styled_df)
    # Check that the HTML file has been created
    self.assertTrue(mock_paths["payload_email"].is_file())
def setUp(self) -> None:
    mock_dirs["payload_email"].mkdir(parents=True, exist_ok=True)
    # Create testing df
    df = convert_dict_into_df(docket_list, "Dauphin")
    self.styled_df = convert_df_to_html(df)
def setUp(self) -> None:
    # Make directory
    mock_dirs["payload_csv"].mkdir(parents=True, exist_ok=True)
    # Make testing df
    self.df = convert_dict_into_df(docket_list, "Dauphin")
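# A minimal sketch of the mock path fixtures the tests above assume. These
# names mirror how the tests use them; the actual fixture module may define
# them differently.
from pathlib import Path

mock_dirs = {
    "payload_email": Path("mock_temp/payload_email"),
    "payload_csv": Path("mock_temp/payload_csv"),
}
mock_paths = {
    "payload_email": mock_dirs["payload_email"] / "email_payload.html",
}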
def main():
    ########################################################################
    # SETUP
    ########################################################################

    # INIT LOGGING
    logs_config()

    # START TIME
    scrape_start_time = datetime.now()

    # GET ENV VARS
    county_list = json.loads(os.environ.get("COUNTY_LIST"))
    target_scrape_day = os.environ.get("TARGET_SCRAPE_DATE", "yesterday").lower()
    target_scrape_date = (
        misc.today_date() if target_scrape_day == "today" else misc.yesterday_date()
    )  # convert to date
    scrape_name = os.getenv("SCRAPE_NAME", "Cases Scrape")
    run_env = os.environ.get("ENV_FILE", "DEV")  # defaults to 'DEV'
    rest_api_enabled = os.getenv("REST_API_ENABLED") == "TRUE"
    move_to_s3_enabled = os.getenv("MOVE_TO_S3_ENABLED") == "TRUE"

    # REFORMAT COUNTY LIST
    # Counties must be in title case, otherwise we'll get errors during the scrape.
    county_list = [x.title() for x in county_list]

    ########################################################################
    # START PROGRAM
    ########################################################################

    misc.print_title("pa court report")
    logging.info("##### PROGRAM START #####")
    logging.info(f"Scrape: {scrape_name}")
    logging.info(f"Running in {run_env} environment\n")

    ########################################################################
    # DELETE + CREATE
    ########################################################################

    # DELETE OLD FILES
    # If a temp folder remains from a previous scrape, delete it so it
    # doesn't cause complications.
    misc.delete_folders_and_contents(temp_dir)

    # CREATE TEMP DIRECTORIES
    temp_subdirs = [
        dirs[directory]
        for directory in dirs
        if "/" + str(temp_dir.name) + "/" in str(dirs[directory])
    ]
    misc.create_folders(temp_subdirs)

    ########################################################################
    # SCRAPE
    ########################################################################

    for count, county in enumerate(county_list):

        # SCRAPE UJS SEARCH RESULTS
        # We first get basic docket data from search results, like docket
        # numbers and filing dates, and turn it into a list of dicts.
        docket_list = scrape(county, target_scrape_date)

        if docket_list:
            # DOWNLOAD PDF OF EACH DOCKET
            # Each case is associated with a PDF that has more data. We
            # extract info from those PDFs and add it to our dicts.
            driver = initialize.initialize_driver()
            for docket in docket_list:
                pdf_path = download.download_pdf(
                    driver, docket["url"], docket["docketnum"]
                )
                text = convert.convert_pdf_to_text(pdf_path, docket["docketnum"])

                # PARSE PDF TEXT
                parsed_data = parse_main(text)
                docket.update(parsed_data)
            driver.quit()

            # CONVERT DICT LIST INTO PANDAS DF
            df = export.convert_dict_into_df(docket_list, county)

            # SAVE BACKUP OF DF FOR DEBUGGING
            df.to_pickle(dirs["df_pkl"] / "df.pkl")

            # CONVERT DF TO CSV
            export.convert_df_to_csv(df)

            # CONVERT DF INTO HTML FOR EMAIL PAYLOAD
            county_intro = "{} in {} County:".format(df.shape[0], county)  # case count
            html_df = export.convert_df_to_html(df)
            export.save_html_county_payload(county_intro, html_df)

            if count != len(county_list) - 1:
                sleep_after_scrape = 65
                logging.info(
                    f"Sleeping for {sleep_after_scrape} seconds after scrape "
                    f"to avoid overloading the UJS server"
                )
                time.sleep(sleep_after_scrape)
        else:
            logging.info(f"No cases were found for {county} County")
            county_intro = f"No cases found for {county} County."
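            # Even when no cases turn up, save a short note into the email
            # payload so the county is still accounted for in the report.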
            export.save_html_county_payload(county_intro)

    ########################################################################
    # EXPORT & EMAIL FINAL PAYLOAD
    ########################################################################

    # END TIME
    scrape_end_time = datetime.now()

    # OPTIONAL: MOVE JSON FILE TO S3
    if move_to_s3_enabled:
        export.convert_csv_to_json(scrape_end_time, county_list)
        copy_file_to_s3_bucket()

    # OPTIONAL: UPLOAD DATA TO DATABASE
    if rest_api_enabled and paths["payload_csv"].is_file():
        upload.upload_to_rest_api()

    # SEND EMAIL WITH DOCKET DATA
    email.email_notification(
        scrape_start_time, scrape_end_time, target_scrape_day, county_list
    )

    # CLOSE PROGRAM
    logging.info("Scrape completed at: {}".format(get_datetime_now_formatted()))
    logging.info("Closing program")
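# For local runs, the environment that main() reads can be stubbed in Python
# before calling it. The variable names come from main() itself; the values
# below are illustrative only (COUNTY_LIST must be a JSON array).
import os

os.environ["COUNTY_LIST"] = '["dauphin", "york"]'
os.environ["TARGET_SCRAPE_DATE"] = "yesterday"  # or "today"
os.environ["SCRAPE_NAME"] = "Cases Scrape"
os.environ["ENV_FILE"] = "DEV"
os.environ["REST_API_ENABLED"] = "TRUE"  # any other value disables the REST upload
os.environ["MOVE_TO_S3_ENABLED"] = "TRUE"  # any other value skips the S3 copy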