import datetime
from typing import Any, Dict, List, Optional

import scrapers

# `logger`, `get_days_between_dates`, and `persist_parsed_cases` are module-level
# helpers defined elsewhere in this project.


def parse_settings_on_cloud(
    afterdate: str,
    beforedate: str,
    write_to_sheets=True,
    scraper: Optional[scrapers.FakeScraper] = None,
):
    """
    Same as `parse_settings()` (see below) but without the command line interface
    and showbrowser option. Outputs scraped results to a gsheet (Settings_scheduler)
    if `write_to_sheets` is True.
    """
    if scraper is None:
        scraper = scrapers.TravisScraper()
    logger.info(f"Parsing settings between {afterdate} and {beforedate}.")
    days_to_pull = get_days_between_dates(afterdate=afterdate, beforedate=beforedate)
    pulled_settings = scraper.make_setting_list(days_to_pull)
    # Persist settings only when using the real scraper, not the test scraper.
    if isinstance(scraper, scrapers.TravisScraper):
        import persist

        for setting in pulled_settings:
            persist.rest_setting(setting)
    # maybe make this cleaner in sql? future work
    if write_to_sheets:
        import gsheet

        gsheet.write_pulled_settings(pulled_settings)
def parse_all_from_parse_filings(
    case_nums: List[str],
    scraper: Optional[scrapers.FakeScraper] = None,
    db: bool = True,
    county: str = "travis",
    showbrowser: bool = False,
) -> List[Dict[str, Any]]:
    """
    Gets case details for each case number in `case_nums` and sends the data to PostgreSQL.
    Logs any case numbers for which getting data failed.
    """
    if not scraper:
        # Get the scraper corresponding to the lowercase command line entry for county.
        # Default to TravisScraper.
        county = county.lower()
        scraper = (
            scrapers.SCRAPER_NAMES[county]()
            if county in scrapers.SCRAPER_NAMES
            else scrapers.TravisScraper()
        )
    parsed_cases = []
    # Retry up to five times; return as soon as one attempt succeeds.
    for tries in range(1, 6):
        try:
            parsed_cases = scraper.make_case_list(ids_to_parse=case_nums)
            return parsed_cases
        except Exception as e:
            logger.error(
                f"Failed to parse hearings on attempt {tries}. Error message: {e}"
            )
    return parsed_cases
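For reference, the county lookup above assumes `scrapers.SCRAPER_NAMES` maps lowercase county names to scraper classes. The real mapping lives in the scrapers module and is not shown here; the shape below is an assumption for illustration only, based on the scrapers used elsewhere in these excerpts.

# Hypothetical shape of scrapers.SCRAPER_NAMES (assumed, for illustration only).
SCRAPER_NAMES = {
    "travis": TravisScraper,
    "williamson": WilliamsonScraper,
}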
def parse_filings_on_cloud(
    afterdate: datetime.date,
    beforedate: datetime.date,
    get_old_active=True,
    showbrowser=False,
    scraper: Optional[scrapers.FakeScraper] = None,
):
    """Parses filings without command line interface and outfile options."""
    logger.info(f"Parsing filings between {afterdate} and {beforedate}.")
    if not scraper:
        scraper = scrapers.TravisScraper(headless=not showbrowser)
    all_case_nums = scraper.get_all_case_nums(
        afterdate=afterdate, beforedate=beforedate
    )
    if get_old_active:
        from persist import get_old_active_case_nums

        all_case_nums += get_old_active_case_nums()
    # using dict to eliminate duplicates
    all_case_nums = list(dict.fromkeys(all_case_nums))
    logger.info(
        f"Found {len(all_case_nums)} case numbers (including old active ones)."
    )
    cases = parse_all_from_parse_filings(all_case_nums, scraper=scraper)
    # persist cases only if not using the test scraper
    if isinstance(scraper, scrapers.TravisScraper):
        persist_parsed_cases(cases)
    return cases
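A minimal usage sketch for the entry point above, assuming these functions are importable from a module named `parse_hearings` and that `scrapers.FakeScraper()` takes no constructor arguments (both are assumptions); with the fake scraper, nothing is persisted to the database.

import datetime

import scrapers
from parse_hearings import parse_filings_on_cloud  # module name is an assumption

cases = parse_filings_on_cloud(
    afterdate=datetime.date(2020, 6, 1),
    beforedate=datetime.date(2020, 6, 30),
    get_old_active=False,  # skip the lookup of old active case numbers
    scraper=scrapers.FakeScraper(),  # test scraper, so cases are not persisted
)
print(f"Parsed {len(cases)} cases.")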
def test_fetch_settings(self):
    scraper = scrapers.TravisScraper()
    settings = scraper.make_setting_list(days_to_pull=[date(2020, 9, 1)])
    assert any(case["case_number"] == "J1-CV-19-005480" for case in settings)
from datetime import date

import pytest

import scrapers
import config

config.county = "travis"

scraper = scrapers.TravisScraper()
williamson_scraper = scrapers.WilliamsonScraper()


class TestFetchFilingsPage:
    def test_fetch_filings_page(self):
        fetched = scraper.query_filings(
            afterdate=date(2020, 6, 1),
            beforedate=date(2020, 6, 30),
            case_num_prefix="J1-CV-20*",
        )
        assert "J1-CV-20-001773" in fetched


class TestFetchCaseNumbers:
    def test_fetch_case_numbers_requiring_split(self):
        """
        Test a date range requiring multiple pages of search results.
        The scraper will need to split this into multiple queries and combine the results.
        """
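The docstring above describes splitting one oversized query into several smaller ones and combining the results. The sketch below is a generic illustration of that technique, not the scraper's actual implementation; the `fetch` callable and the page limit of 200 are assumptions made for the example.

from datetime import date, timedelta
from typing import Callable, List


def split_query(
    afterdate: date,
    beforedate: date,
    fetch: Callable[[date, date], List[str]],
    page_limit: int = 200,
) -> List[str]:
    """Fetch a date range, splitting it in half whenever a query fills a full page."""
    results = fetch(afterdate, beforedate)
    if len(results) < page_limit or afterdate == beforedate:
        return results
    # A full page suggests truncated results: split the range and recurse.
    midpoint = afterdate + (beforedate - afterdate) // 2
    first = split_query(afterdate, midpoint, fetch, page_limit)
    second = split_query(midpoint + timedelta(days=1), beforedate, fetch, page_limit)
    return list(dict.fromkeys(first + second))  # combine, dropping duplicates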