def wrapped(*args, **kwargs):
    """Solve a reCAPTCHA on the current page, if present, then call ``method``.

    The first positional argument must expose a Selenium WebDriver as
    ``.driver``. When the page contains a ``.g-recaptcha`` element with a
    non-empty ``data-sitekey``, the key is sent to the Anticaptcha solver and
    the returned token is written into the ``.g-recaptcha-response`` element
    before ``method`` runs. Pages without a captcha go straight to ``method``.

    Args:
        *args: Positional arguments forwarded unchanged to ``method``.
        **kwargs: Keyword arguments forwarded unchanged to ``method``.

    Returns:
        Whatever ``method(*args, **kwargs)`` returns.

    Raises:
        CaptchaError: If a captcha is present but no API key is configured,
            or if the solver does not return a token.
    """
    driver = args[0].driver

    # Keep the try body to the single lookup whose absence we want to tolerate.
    try:
        captcha_div = driver.find_element_by_css_selector('.g-recaptcha')
    except selenium.common.exceptions.NoSuchElementException:
        site_key = None
    else:
        # get_attribute returns None when the attribute is missing.
        site_key = (captcha_div.get_attribute('data-sitekey') or '').strip()
        print(f're-captcha site-key: {site_key}')

    if site_key:
        # Build Configs outside the KeyError handler so the error message
        # below can always reference it.
        configs = Configs()
        try:
            anticaptcha_api_key = configs.captcha_service_api_key
        except KeyError as err:
            msg = f'CAPTCHA sites require Anticaptcha.com API key set in {configs.config_file_path}'
            raise CaptchaError(msg) from err

        site_url = driver.current_url
        solver = recaptchaV2Proxyless()
        solver.set_verbose(1)
        solver.set_key(anticaptcha_api_key)
        solver.set_website_url(site_url)
        solver.set_website_key(site_key)
        g_response = solver.solve_and_return_solution()
        if not g_response:
            # The solver signals failure with a falsy value (0); injecting
            # that into the page would only defer the failure.
            error_code = getattr(solver, 'error_code', 'unknown error')
            raise CaptchaError(f'Anticaptcha could not solve re-captcha: {error_code}')

        # Pass the token as a script argument instead of interpolating it
        # into the JavaScript source, so quotes or backslashes in the token
        # cannot break or alter the script.
        driver.execute_script(
            'document.querySelector(".g-recaptcha-response").innerHTML = arguments[0];',
            g_response,
        )
    return method(*args, **kwargs)
def search(place_id, case_number, case_numbers_file, with_browser):
    """Search court site."""
    settings = Configs()

    # File logging: DEBUG and above, appended to logfile.txt in the cache dir.
    cache_path = Path(settings.cache_dir)
    cache_path.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        filename=str(cache_path / "logfile.txt"),
        filemode="a",
        level=logging.DEBUG,
        format="%(asctime)s - %(name)-12s - %(message)s",
        datefmt="%m-%d %H:%M",
    )

    # Console logging: INFO and above, bare message text only.
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter("%(message)s"))
    logging.getLogger("").addHandler(stream_handler)
    log = logging.getLogger(__name__)

    # Resolve the runner class for this place and build it.
    runner_cls = _get_runner(place_id)
    runner = runner_cls(settings.cache_dir, settings.config_file_path, place_id)

    # A single case number wins; otherwise read one per line from the file.
    case_numbers = (
        [case_number]
        if case_number
        else [line.strip() for line in case_numbers_file]
    )

    # TODO: Restore catch-all try/except
    results = runner.search(case_numbers=case_numbers, headless=not with_browser)
    runner.cache_detail_pages(results)

    dstore = Datastore(settings.db_path)
    log.info("Adding {} results to {}".format(len(results), settings.db_path))
    records = []
    for item in results:
        # Place ID is required by the Case db table
        item.place_id = place_id
        records.append(item.standard_data)
    dstore.upsert(records)
def search(self, case_numbers=None, headless=True, **kwargs):
    """
    For a given scraper, executes the search, acquisition
    and processing of case info.

    Args:
        case_numbers (list<str>): List of case numbers to search
            (default: None, treated as an empty list)
        headless (boolean): Whether or not to run headless (default: True)
        **kwargs: Accepted for call compatibility; not used by this method.

    Returns:
        The value returned by ``Site.search`` (List of CaseInfo instances)
    """
    # Use a fresh list per call: a ``[]`` default would be one shared object
    # reused by every call that omits the argument.
    if case_numbers is None:
        case_numbers = []
    # The CAPTCHA API key is read from Configs.
    # NOTE(review): an earlier comment said env variable first, then config
    # file -- that lookup order is not visible here; confirm in Configs.
    configs = Configs()
    site = Site(self.place_id, captcha_api_key=configs.captcha_service_api_key)
    logger.info("Executing search for {}".format(self.place_id))
    return site.search(case_numbers=case_numbers, headless=headless)
import logging
import os
import traceback
from pathlib import Path

import click
from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup

from court_scraper.configs import Configs
from court_scraper.datastore import Datastore
from court_scraper.runner import Runner
from court_scraper.sites_meta import SitesMeta

# Module-level setup below runs as soon as this module is imported.
configs = Configs()
# Create the cache directory (and any missing parents) if it does not exist;
# the log file lives inside it.
cache_dir = Path(configs.cache_dir)
cache_dir.mkdir(parents=True, exist_ok=True)
log_file = str(cache_dir.joinpath('logfile.txt'))
# File logging: INFO and above, appended to log_file with a timestamp and
# logger name prefix.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)-12s - %(message)s',
                    datefmt='%m-%d %H:%M',
                    filename=log_file,
                    filemode='a')
# Console logging: INFO and above, bare message text only, attached to the
# root logger.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
logger = logging.getLogger(__name__)

@click.group()
def get_captcha_service_api_key():
    """Return ``captcha_service_api_key`` from a newly constructed ``Configs``."""
    return Configs().captcha_service_api_key