def parse_and_scrape_site(self, mod, full_crawl):
    """Run a full backscrape for the scraper module *mod*.

    Derives a short court identifier from the module's dotted name,
    then parses and scrapes every Site yielded for the module's
    back_scrape_iterable.

    NOTE(review): the full_crawl argument is ignored here; scrape_court
    is always called with full_crawl=True — presumably deliberate for
    backscrapes, but confirm.
    """
    module_name = mod.__name__
    # Last dotted component, text before the first underscore,
    # e.g. "...federal_appellate.ca1_u" -> "ca1" — TODO confirm shape.
    court_str = module_name.split(".")[-1].split("_")[0]
    logger.info('Using court_str: "%s"' % court_str)

    back_sites = site_yielder(mod.Site().back_scrape_iterable, mod)
    for back_site in back_sites:
        back_site.parse()
        self.scrape_court(back_site, full_crawl=True)
def parse_and_scrape_site(self, mod, full_crawl):
    """Parse and scrape every back-scrape site for a scraper module.

    The court identifier logged below is the final component of the
    module's dotted name, truncated at the first underscore.

    NOTE(review): full_crawl is unused — True is always forwarded to
    scrape_court; confirm this is intentional for backscrapes.
    """
    leaf = mod.__name__.split(".")[-1]
    court_str = leaf.partition("_")[0]
    logger.info('Using court_str: "%s"' % court_str)

    iterable = mod.Site().back_scrape_iterable
    for site in site_yielder(iterable, mod):
        site.parse()
        self.scrape_court(site, full_crawl=True)
def main():
    """Parse the command line, then scrape each requested court.

    Scrapes every court module named by -c/--courts, optionally looping
    forever (-d/--daemon), downloading binary documents
    (-b/--download_binaries), backscraping the historical corpus
    (--backscrape), and writing an HTML summary (-r/--report).

    Exits 0 on normal completion, 1 if stopped by SIGINT/SIGTERM.
    """
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = (
        "usage: %prog -c COURTID [-d|--daemon] [-b|--binaries] [-r|--report]\n\n"
        "To test ca1, downloading binaries, use: \n"
        " python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n"
        "To test all federal courts, omitting binaries, use: \n"
        " python %prog -c opinions.united_states.federal_appellate\n\n"
        # BUG FIX: the separator above was missing, so the usage text
        # previously rendered "...federal_appellatePassing the --report...".
        "Passing the --report option will generate an HTML report in "
        "the root directory after scrapers have run"
    )
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        "--courts",
        dest="court_id",
        metavar="COURTID",
        help=(
            "The court(s) to scrape and extract. This should be in "
            "the form of a python module or package import "
            "from the Juriscraper library, e.g. "
            '"juriscraper.opinions.united_states.federal.ca1" or '
            'simply "opinions" to do all opinions. If desired, '
            # BUG FIX: trailing space added; this previously rendered
            # "...to separatethe import path."
            "you can use slashes instead of dots to separate "
            "the import path."
        ),
    )
    parser.add_option(
        "-d",
        "--daemon",
        action="store_true",
        dest="daemonmode",
        default=False,
        help=(
            "Use this flag to turn on daemon "
            "mode, in which all courts requested "
            "will be scraped in turn, non-stop."
        ),
    )
    parser.add_option(
        "-b",
        "--download_binaries",
        action="store_true",
        dest="binaries",
        default=False,
        help=(
            "Use this flag if you wish to download the pdf, "
            "wpd, and doc files."
        ),
    )
    parser.add_option(
        "-v",
        "--verbosity",
        action="count",
        default=1,
        help="Increase output verbosity (e.g., -vv is more than -v).",
    )
    parser.add_option(
        "--backscrape",
        dest="backscrape",
        action="store_true",
        default=False,
        help="Download the historical corpus using the _download_backwards method.",
    )
    parser.add_option(
        "-r",
        "--report",
        action="store_true",
        default=False,
        help="Generate a report.html with the outcome of running the scrapers",
    )
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape
    generate_report = options.report

    # Set up the print function
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        # Emit verb_args[1] only when its level verb_args[0] clears the
        # threshold implied by the -v count (more -v => lower threshold).
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    # Per-court outcomes, keyed by module string; fed to the HTML report.
    results = {}

    if not court_id:
        parser.error("You must specify a court as a package or module.")
    else:
        court_id = court_id.replace("/", ".")
        if court_id.endswith(".py"):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error("Unable to import module or package. Aborting.")

        v_print(3, "Starting up the scraper.")
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            current_court = module_strings[i]
            results[current_court] = {"global_failure": False}
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, "The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit(".", 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__(
                "%s.%s" % (package, module), globals(), locals(), [module]
            )
            try:
                if backscrape:
                    for site in site_yielder(
                        mod.Site().back_scrape_iterable, mod
                    ):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(
                        3, "Sent %s request to: %s" % (site.method, site.url)
                    )
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    results[current_court]["scrape"] = scrape_court(
                        site, binaries
                    )
            except Exception:
                # Record the failure for the report, then move on to the
                # next court rather than killing the whole run.
                results[current_court][
                    "global_failure"
                ] = traceback.format_exc()
                results[current_court]["scrape"] = {}
                v_print(3, "*************!! CRAWLER DOWN !!****************")
                v_print(
                    3,
                    "*****scrape_court method failed on mod: %s*****"
                    % module_strings[i],
                )
                v_print(3, "*************!! ACTION NEEDED !!***************")
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = i == (num_courts - 1)
            # In daemon mode, wrap around to the first court forever.
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, "The scraper has stopped.")

    if generate_report:
        report_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../report.html")
        )
        v_print(3, "Generating HTML report at %s" % report_path)
        generate_scraper_report(report_path, results)

    sys.exit(0)
def main():
    """Parse the command line, then scrape each requested court.

    Scrapes every court module named by -c/--courts, optionally looping
    forever (-d/--daemon), downloading binary documents
    (-b/--download_binaries), and backscraping the historical corpus
    (--backscrape).

    Exits 0 on normal completion, 1 if stopped by SIGINT/SIGTERM.
    """
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
             'To test ca1, downloading binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
             'To test all federal courts, omitting binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate')
    parser = OptionParser(usage)
    parser.add_option(
        '-c', '--courts', dest='court_id', metavar="COURTID",
        help=('The court(s) to scrape and extract. This should be in '
              'the form of a python module or package import '
              'from the Juriscraper library, e.g. '
              '"juriscraper.opinions.united_states.federal.ca1" or '
              'simply "opinions" to do all opinions. If desired, '
              # BUG FIX: trailing space added; this previously rendered
              # "...to separatethe import path."
              'you can use slashes instead of dots to separate '
              'the import path.'))
    parser.add_option(
        '-d', '--daemon', action="store_true", dest='daemonmode',
        default=False,
        help=('Use this flag to turn on daemon '
              'mode, in which all courts requested '
              'will be scraped in turn, non-stop.'))
    parser.add_option(
        '-b', '--download_binaries', action='store_true', dest='binaries',
        default=False,
        help=('Use this flag if you wish to download the pdf, '
              'wpd, and doc files.'))
    parser.add_option(
        '-v', '--verbosity', action='count', default=1,
        help='Increase output verbosity (e.g., -vv is more than -v).')
    parser.add_option(
        '--backscrape', dest='backscrape', action='store_true',
        default=False,
        help='Download the historical corpus using the _download_backwards method.')
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape

    # Set up the print function
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        # Emit verb_args[1] only when its level verb_args[0] clears the
        # threshold implied by the -v count (more -v => lower threshold).
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    if not court_id:
        parser.error('You must specify a court as a package or module.')
    else:
        court_id = court_id.replace('/', '.')
        if court_id.endswith('.py'):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error('Unable to import module or package. Aborting.')

        v_print(3, 'Starting up the scraper.')
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, 'The scraper has stopped.')
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(), locals(), [module])
            try:
                if backscrape:
                    for site in site_yielder(
                            mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, 'Sent %s request to: %s'
                            % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    scrape_court(site, binaries)
            except Exception:
                # Report the failure loudly, then move on to the next
                # court rather than killing the whole run.
                v_print(3, '*************!! CRAWLER DOWN !!****************')
                v_print(3, '*****scrape_court method failed on mod: %s*****'
                        % module_strings[i])
                v_print(3, '*************!! ACTION NEEDED !!***************')
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = (i == (num_courts - 1))
            # In daemon mode, wrap around to the first court forever.
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, 'The scraper has stopped.')
    sys.exit(0)
def main():
    """Parse the command line, then scrape each requested court.

    Scrapes every court module named by -c/--courts, optionally looping
    forever (-d/--daemon), downloading binary documents
    (-b/--download_binaries), and backscraping the historical corpus
    (--backscrape).

    Exits 0 on normal completion, 1 if stopped by SIGINT/SIGTERM.
    """
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
             'To test ca1, downloading binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
             'To test all federal courts, omitting binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate')
    parser = OptionParser(usage)
    parser.add_option(
        '-c', '--courts', dest='court_id', metavar="COURTID",
        help=('The court(s) to scrape and extract. This should be in '
              'the form of a python module or package import '
              'from the Juriscraper library, e.g. '
              '"juriscraper.opinions.united_states.federal.ca1" or '
              'simply "opinions" to do all opinions. If desired, '
              # BUG FIX: trailing space added; this previously rendered
              # "...to separatethe import path."
              'you can use slashes instead of dots to separate '
              'the import path.'))
    parser.add_option(
        '-d', '--daemon', action="store_true", dest='daemonmode',
        default=False,
        help=('Use this flag to turn on daemon '
              'mode, in which all courts requested '
              'will be scraped in turn, non-stop.'))
    parser.add_option(
        '-b', '--download_binaries', action='store_true', dest='binaries',
        default=False,
        help=('Use this flag if you wish to download the pdf, '
              'wpd, and doc files.'))
    parser.add_option(
        '-v', '--verbosity', action='count', default=1,
        help='Increase output verbosity (e.g., -vv is more than -v).')
    parser.add_option(
        '--backscrape', dest='backscrape', action='store_true',
        default=False,
        help='Download the historical corpus using the _download_backwards method.')
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape

    # Set up the print function.
    # CONSISTENCY FIX: this version used Python 2 `print` statements while
    # the rest of the file uses print(); the single-argument call form
    # below behaves identically on Python 2 and is valid Python 3.
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        # Emit verb_args[1] only when its level verb_args[0] clears the
        # threshold implied by the -v count (more -v => lower threshold).
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    if not court_id:
        parser.error('You must specify a court as a package or module.')
    else:
        court_id = court_id.replace('/', '.')
        if court_id.endswith('.py'):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error('Unable to import module or package. Aborting.')

        v_print(3, 'Starting up the scraper.')
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, 'The scraper has stopped.')
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(), locals(), [module])
            try:
                if backscrape:
                    for site in site_yielder(
                            mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, 'Sent %s request to: %s'
                            % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    scrape_court(site, binaries)
            except Exception:
                # Report the failure loudly, then move on to the next
                # court rather than killing the whole run.
                v_print(3, '*************!! CRAWLER DOWN !!****************')
                v_print(3, '*****scrape_court method failed on mod: %s*****'
                        % module_strings[i])
                v_print(3, '*************!! ACTION NEEDED !!***************')
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = (i == (num_courts - 1))
            # In daemon mode, wrap around to the first court forever.
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, 'The scraper has stopped.')
    sys.exit(0)