def handle(self, *args, **options):
    court_id = options.get('court_id')
    if not court_id:
        raise CommandError('You must specify a court as a package or module')
    else:
        module_strings = build_module_list(court_id)
        if not len(module_strings):
            raise CommandError('Unable to import module or package. Aborting.')

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            package, module = module_strings[i].rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.back_scrape(mod)
            except Exception, e:
                msg = ('********!! CRAWLER DOWN !!***********\n'
                       '*****scrape_court method failed!*****\n'
                       '********!! ACTION NEEDED !!**********\n%s') % \
                    traceback.format_exc()
                logger.critical(msg)
            finally:
def handle(self, *args, **options):
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die
    # safely
    signal.signal(signal.SIGTERM, signal_handler)

    self.verbosity = int(options.get("verbosity", 1))
    daemon_mode = options.get("daemonmode", False)
    full_crawl = options.get("full_crawl", False)

    try:
        rate = int(options["rate"])
    except (ValueError, AttributeError, TypeError):
        rate = 30

    court_id = options.get("court_id")
    if not court_id:
        raise CommandError("You must specify a court as a package or "
                           "module.")
    else:
        module_strings = build_module_list(court_id)
        if not len(module_strings):
            raise CommandError("Unable to import module or package. "
                               "Aborting.")

        logger.info("Starting up the scraper.")
        num_courts = len(module_strings)
        wait = (rate * 60) / num_courts
        i = 0
        while i < num_courts:
            # this catches SIGTERM, so the code can be killed safely.
            if die_now:
                logger.info("The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit(".", 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            # noinspection PyBroadException
            try:
                self.parse_and_scrape_site(mod, full_crawl)
            except Exception, e:
                # noinspection PyBroadException
                try:
                    msg = (
                        "********!! CRAWLER DOWN !!***********\n"
                        "*****scrape_court method failed!*****\n"
                        "********!! ACTION NEEDED !!**********\n%s"
                        % traceback.format_exc()
                    )
                    logger.critical(msg)

                    # opinions.united_states.federal.ca9_u --> ca9
                    court_str = mod.Site.__module__.split(".")[-1].split("_")[0]
                    court = Court.objects.get(pk=court_str)
                    ErrorLog(log_level="CRITICAL",
                             court=court,
                             message=msg).save()
                except Exception, e:
                    # This is very important. Without this, an exception
                    # above will crash the caller.
                    pass
            finally:
def calculate_counts():
    """Grab the information for new documents over the past 30 days, and
    calculate the number of cases found for each court.

    Returns a list like so:
    [('ca1', date1, link), ('ca2', date2, link), ('ca3',...)]
    """
    thirty_days_ago = now() - timedelta(days=30)
    thirty_five_days_ago = now() - timedelta(days=35)
    cts_more_than_30_days = Court.objects \
        .filter(docket__documents__date_filed__gt=thirty_days_ago) \
        .annotate(count=Count('docket__documents__pk')) \
        .values('pk', 'count')

    # Needed because annotation calls above don't return courts with no new
    # opinions
    all_active_courts = Court.objects.filter(has_opinion_scraper=True) \
        .values_list('pk', flat=True).order_by('position')

    # Reformat the results into dicts...
    cts_more_than_30_days = _make_query_dict(cts_more_than_30_days)

    # Combine everything
    most_recent_opinions = []
    recently_dying_courts = []
    mod_list = importer.build_module_list('juriscraper.opinions')
    mod_dict = {}
    for v in mod_list:
        court = v.rsplit('.')[-1]
        mod_dict[court] = v

    for court in all_active_courts:
        if cts_more_than_30_days.get(court, 0) == 0:
            # No results in newer than 35 days. Get date of most recent
            # item.
            date_filed = Document.objects.filter(docket__court_id=court)\
                .order_by('-date_filed')[0].date_filed
            try:
                mod = __import__(
                    mod_dict[court],
                    globals(),
                    locals(),
                    [mod_dict[court].rsplit('.')[0]],
                )
                url = mod.Site().url
                method = mod.Site().method
            except KeyError:
                # Happens when multiple scrapers for single court.
                url = ""
                method = "Unknown"
            if thirty_five_days_ago.date() < date_filed < \
                    thirty_days_ago.date():
                recently_dying_courts.append((court, date_filed, method, url))
            most_recent_opinions.append((court, date_filed, method, url))

    # Sort by date (index 1)
    most_recent_opinions.sort(key=itemgetter(1), reverse=True)

    return most_recent_opinions, recently_dying_courts
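# `_make_query_dict` is called above but not defined in this excerpt. Judging
# from the `.values('pk', 'count')` queryset it receives and the
# `.get(court, 0)` lookup that follows, it presumably maps each court pk to
# its count. A minimal sketch under that assumption, not the project's
# verbatim helper:

def _make_query_dict(query_list):
    """Flatten rows shaped like {'pk': ..., 'count': ...} into {pk: count}."""
    return dict((row['pk'], row['count']) for row in query_list)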
def handle(self, *args, **options):
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die
    # safely
    signal.signal(signal.SIGTERM, signal_handler)

    module_strings = build_module_list(options['court_id'])
    if not len(module_strings):
        raise CommandError('Unable to import module or package. Aborting.')

    logger.info("Starting up the scraper.")
    num_courts = len(module_strings)
    wait = (options['rate'] * 60) / num_courts
    i = 0
    while i < num_courts:
        # this catches SIGTERM, so the code can be killed safely.
        if die_now:
            logger.info("The scraper has stopped.")
            sys.exit(1)

        package, module = module_strings[i].rsplit('.', 1)
        mod = __import__(
            "%s.%s" % (package, module),
            globals(),
            locals(),
            [module]
        )
        # noinspection PyBroadException
        try:
            self.parse_and_scrape_site(mod, options['full_crawl'])
        except Exception, e:
            # noinspection PyBroadException
            try:
                msg = ('********!! CRAWLER DOWN !!***********\n'
                       '*****scrape_court method failed!*****\n'
                       '********!! ACTION NEEDED !!**********\n%s' %
                       traceback.format_exc())
                logger.critical(msg)

                # opinions.united_states.federal.ca9_u --> ca9
                court_str = mod.Site.__module__.split('.')[-1].split('_')[0]
                court = Court.objects.get(pk=court_str)
                ErrorLog(
                    log_level='CRITICAL',
                    court=court,
                    message=msg
                ).save()
            except Exception, e:
                # This is very important. Without this, an exception
                # above will crash the caller.
                pass
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die
    # safely
    signal.signal(signal.SIGTERM, signal_handler)

    module_strings = build_module_list(options["court_id"])
    if not len(module_strings):
        raise CommandError("Unable to import module or package. Aborting.")

    logger.info("Starting up the scraper.")
    num_courts = len(module_strings)
    wait = (options["rate"] * 60) / num_courts
    i = 0
    while i < num_courts:
        # this catches SIGTERM, so the code can be killed safely.
        if die_now:
            logger.info("The scraper has stopped.")
            sys.exit(1)

        package, module = module_strings[i].rsplit(".", 1)
        mod = __import__(
            f"{package}.{module}", globals(), locals(), [module]
        )
        try:
            self.parse_and_scrape_site(mod, options["full_crawl"])
        except Exception as e:
            capture_exception(e)

        last_court_in_list = i == (num_courts - 1)
        daemon_mode = options["daemon"]
        if last_court_in_list:
            if not daemon_mode:
                break
            else:
                logger.info(
                    "All jurisdictions done. Looping back to "
                    "the beginning because daemon mode is enabled."
                )
                i = 0
        else:
            i += 1
        time.sleep(wait)

    logger.info("The scraper has stopped.")
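# The `handle` and `main` variants in this section register `signal_handler`
# for SIGTERM and poll a module-level `die_now` flag, but neither is defined
# in the excerpt. A minimal sketch of how such a pair could look; the names
# come from the calls above, while the bodies are an assumption rather than
# the project's actual implementation:

import logging
import signal

logger = logging.getLogger(__name__)

die_now = False


def signal_handler(signal_number, frame):
    # Flip the flag so the scraping loop can exit at a safe point instead of
    # being killed mid-scrape; the loops above check `die_now` at the top of
    # each iteration.
    global die_now
    logger.info("Caught signal %s; shutting down after the current court.",
                signal_number)
    die_now = True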
def handle(self, *args, **options):
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die
    # safely
    signal.signal(signal.SIGTERM, signal_handler)

    module_strings = build_module_list(options['court_id'])
    if not len(module_strings):
        raise CommandError('Unable to import module or package. Aborting.')

    logger.info("Starting up the scraper.")
    num_courts = len(module_strings)
    wait = (options['rate'] * 60) / num_courts
    i = 0
    while i < num_courts:
        # this catches SIGTERM, so the code can be killed safely.
        if die_now:
            logger.info("The scraper has stopped.")
            sys.exit(1)

        package, module = module_strings[i].rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        # noinspection PyBroadException
        try:
            self.parse_and_scrape_site(mod, options['full_crawl'])
        except Exception, e:
            # noinspection PyBroadException
            try:
                msg = ('********!! CRAWLER DOWN !!***********\n'
                       '*****scrape_court method failed!*****\n'
                       '********!! ACTION NEEDED !!**********\n%s' %
                       traceback.format_exc())
                logger.critical(msg)

                # opinions.united_states.federal.ca9_u --> ca9
                court_str = mod.Site.__module__.split('.')[-1].split(
                    '_')[0]
                court = Court.objects.get(pk=court_str)
                ErrorLog(log_level='CRITICAL',
                         court=court,
                         message=msg).save()
            except Exception, e:
                # This is very important. Without this, an exception
                # above will crash the caller.
                pass
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    count = len([s for s in module_strings if 'backscraper' not in s])
    print "Testing {count} scrapers against their example files:".format(
        count=count)
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(' %s ' % module_string)
            sys.stdout.flush()  # Makes sure the output prints before the error message.
            paths = glob.glob('%s_example*' % module_string.replace('.', '/'))
            self.assertTrue(
                paths,
                "No example file found for: %s!" %
                module_string.rsplit('.', 1)[1])
            t1 = time.time()
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site()
                site.url = path
                # Forces a local GET
                site.method = 'LOCAL'
                # do-nothing function, b/c we don't want to iterate over
                # items in a DeferringList. Otherwise, this function is
                # called as part of the parse() function.
                site._clean_attributes = lambda *a: None
                site.parse()
            t2 = time.time()
            if t2 - t1 > 2:
                msg = " - WARNING: Slow scraper!"
            else:
                msg = ' - OK'
            print '(%0.1f seconds%s)' % ((t2 - t1), msg)
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    count = len([s for s in module_strings if 'backscraper' not in s])
    print "Testing {count} scrapers against their example files:".format(
        count=count)
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(' %s ' % module_string)
            sys.stdout.flush()  # Makes sure the output prints before the error message.
            paths = glob.glob(
                '%s_example*' % module_string.replace('.', '/'))
            self.assertTrue(paths,
                            "No example file found for: %s!" %
                            module_string.rsplit('.', 1)[1])
            t1 = time.time()
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site()
                site.url = path
                # Forces a local GET
                site.method = 'LOCAL'
                # do-nothing function, b/c we don't want to iterate over
                # items in a DeferringList. Otherwise, this function is
                # called as part of the parse() function.
                site._clean_attributes = lambda *a: None
                site.parse()
            t2 = time.time()
            if t2 - t1 > 2:
                msg = " - WARNING: Slow scraper!"
            else:
                msg = ' - OK'
            print '(%0.1f seconds%s)' % ((t2 - t1), msg)
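# The `__import__("%s.%s" % (package, module), globals(), locals(), [module])`
# idiom used throughout these snippets relies on a non-empty fromlist so that
# the call returns the leaf module rather than the top-level package. For
# reference, the same lookup can be written more directly with importlib; this
# is an illustration, not how the code above is written:

import importlib


def import_scraper(module_string):
    # e.g. "juriscraper.opinions.united_states.federal_appellate.ca1"
    return importlib.import_module(module_string)

# mod = import_scraper(module_string)
# site = mod.Site()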
def main():
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
             'To test ca1, downloading binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
             'To test all federal courts, omitting binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate')
    parser = OptionParser(usage)
    parser.add_option('-c', '--courts', dest='court_id', metavar="COURTID",
                      help=('The court(s) to scrape and extract. This should be in '
                            'the form of a python module or package import '
                            'from the Juriscraper library, e.g. '
                            '"juriscraper.opinions.united_states.federal.ca1" or '
                            'simply "opinions" to do all opinions. If desired, '
                            'you can use slashes instead of dots to separate'
                            'the import path.'))
    parser.add_option('-d', '--daemon', action="store_true", dest='daemonmode',
                      default=False,
                      help=('Use this flag to turn on daemon '
                            'mode, in which all courts requested '
                            'will be scraped in turn, non-stop.'))
    parser.add_option('-b', '--download_binaries', action='store_true',
                      dest='binaries', default=False,
                      help=('Use this flag if you wish to download the pdf, '
                            'wpd, and doc files.'))
    parser.add_option('-v', '--verbosity', action='count', default=1,
                      help='Increase output verbosity (e.g., -vv is more than -v).')
    parser.add_option('--backscrape', dest='backscrape', action='store_true',
                      default=False,
                      help='Download the historical corpus using the _download_backwards method.')
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape

    # Set up the print function
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    if not court_id:
        parser.error('You must specify a court as a package or module.')
    else:
        court_id = court_id.replace('/', '.')
        if court_id.endswith('.py'):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error('Unable to import module or package. Aborting.')

        v_print(3, 'Starting up the scraper.')
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, 'The scraper has stopped.')
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            try:
                if backscrape:
                    for site in site_yielder(mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, 'Sent %s request to: %s' % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    scrape_court(site, binaries)
            except Exception:
                v_print(3, '*************!! CRAWLER DOWN !!****************')
                v_print(3, '*****scrape_court method failed on mod: %s*****' %
                        module_strings[i])
                v_print(3, '*************!! ACTION NEEDED !!***************')
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = (i == (num_courts - 1))
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, 'The scraper has stopped.')
    sys.exit(0)
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die
    # safely
    signal.signal(signal.SIGTERM, signal_handler)

    module_strings = build_module_list(options["court_id"])
    if not len(module_strings):
        raise CommandError("Unable to import module or package. Aborting.")

    logger.info("Starting up the scraper.")
    num_courts = len(module_strings)
    wait = (options["rate"] * 60) / num_courts
    i = 0
    while i < num_courts:
        # this catches SIGTERM, so the code can be killed safely.
        if die_now:
            logger.info("The scraper has stopped.")
            sys.exit(1)

        package, module = module_strings[i].rsplit(".", 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        # noinspection PyBroadException
        try:
            self.parse_and_scrape_site(mod, options["full_crawl"])
        except Exception as e:
            # noinspection PyBroadException
            try:
                msg = ("********!! CRAWLER DOWN !!***********\n"
                       "*****scrape_court method failed!*****\n"
                       "********!! ACTION NEEDED !!**********\n%s" %
                       traceback.format_exc())
                logger.critical(msg)

                # opinions.united_states.federal.ca9_u --> ca9
                court_str = mod.Site.__module__.split(".")[-1].split(
                    "_")[0]
                court = Court.objects.get(pk=court_str)
                ErrorLog(log_level="CRITICAL",
                         court=court,
                         message=msg).save()
            except Exception as e:
                # This is very important. Without this, an exception
                # above will crash the caller.
                pass
        finally:
            time.sleep(wait)
            last_court_in_list = i == (num_courts - 1)
            if last_court_in_list and options["daemon"]:
                # Start over...
                logger.info("All jurisdictions done. Looping back to "
                            "the beginning because daemon mode is enabled.")
                i = 0
            else:
                i += 1

    logger.info("The scraper has stopped.")
    sys.exit(0)
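# In the `handle` variants above, `rate` appears to be the number of minutes a
# full pass over all requested courts should take, so the per-court pause is
# that budget divided evenly. A worked example of the arithmetic; the values
# are illustrative, with 30 being the fallback rate used earlier in this
# section:

rate = 30        # minutes for one full pass over every court
num_courts = 15  # e.g. the federal appellate scrapers
wait = (rate * 60) / num_courts
print(wait)      # 120 seconds between courts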
def calculate_counts():
    """Grab the information for new documents over the past 30 days, and
    calculate the number of cases found for each court.

    Returns a list like so:
    [('ca1', date1, link), ('ca2', date2, link), ('ca3',...)]
    """
    thirty_days_ago = now() - timedelta(days=30)
    thirty_five_days_ago = now() - timedelta(days=35)
    cts_more_than_30_days = Court.objects \
        .filter(docket__documents__date_filed__gt=thirty_days_ago) \
        .annotate(count=Count('docket__documents__pk')) \
        .values('pk', 'count')

    # Needed because annotation calls above don't return courts with no new
    # opinions
    all_active_courts = Court.objects.filter(has_opinion_scraper=True) \
        .values_list('pk', flat=True).order_by('position')

    # Reformat the results into dicts...
    cts_more_than_30_days = _make_query_dict(cts_more_than_30_days)

    # Combine everything
    most_recent_opinions = []
    recently_dying_courts = []
    mod_list = importer.build_module_list('juriscraper.opinions')
    mod_dict = {}
    for v in mod_list:
        court = v.rsplit('.')[-1]
        mod_dict[court] = v

    for court in all_active_courts:
        if cts_more_than_30_days.get(court, 0) == 0:
            # No results in newer than 35 days. Get date of most recent
            # item.
            date_filed = Document.objects.filter(docket__court_id=court)\
                .order_by('-date_filed')[0].date_filed
            try:
                mod = __import__(
                    mod_dict[court],
                    globals(),
                    locals(),
                    [mod_dict[court].rsplit('.')[0]],
                )
                url = mod.Site().url
                method = mod.Site().method
            except KeyError:
                # Happens when multiple scrapers for single court.
                url = ""
                method = "Unknown"
            if thirty_five_days_ago.date() < date_filed < \
                    thirty_days_ago.date():
                recently_dying_courts.append(
                    (court, date_filed, method, url)
                )
            most_recent_opinions.append(
                (court, date_filed, method, url)
            )

    # Sort by date (index 1)
    most_recent_opinions.sort(key=itemgetter(1), reverse=True)

    return most_recent_opinions, recently_dying_courts
def main():
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = ('usage: %prog -c COURTID [-d|--daemon] [-b|--binaries]\n\n'
             'To test ca1, downloading binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n'
             'To test all federal courts, omitting binaries, use: \n'
             ' python %prog -c opinions.united_states.federal_appellate')
    parser = OptionParser(usage)
    parser.add_option('-c', '--courts', dest='court_id', metavar="COURTID",
                      help=('The court(s) to scrape and extract. This should be in '
                            'the form of a python module or package import '
                            'from the Juriscraper library, e.g. '
                            '"juriscraper.opinions.united_states.federal.ca1" or '
                            'simply "opinions" to do all opinions. If desired, '
                            'you can use slashes instead of dots to separate'
                            'the import path.'))
    parser.add_option('-d', '--daemon', action="store_true", dest='daemonmode',
                      default=False,
                      help=('Use this flag to turn on daemon '
                            'mode, in which all courts requested '
                            'will be scraped in turn, non-stop.'))
    parser.add_option('-b', '--download_binaries', action='store_true',
                      dest='binaries', default=False,
                      help=('Use this flag if you wish to download the pdf, '
                            'wpd, and doc files.'))
    parser.add_option('-v', '--verbosity', action='count', default=1,
                      help='Increase output verbosity (e.g., -vv is more than -v).')
    parser.add_option('--backscrape', dest='backscrape', action='store_true',
                      default=False,
                      help='Download the historical corpus using the _download_backwards method.')
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape

    # Set up the print function
    print "Verbosity is set to: %s" % options.verbosity

    def _v_print(*verb_args):
        if verb_args[0] > (3 - options.verbosity):
            print verb_args[1]

    global v_print
    v_print = _v_print

    if not court_id:
        parser.error('You must specify a court as a package or module.')
    else:
        court_id = court_id.replace('/', '.')
        if court_id.endswith('.py'):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error('Unable to import module or package. Aborting.')

        v_print(3, 'Starting up the scraper.')
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, 'The scraper has stopped.')
                sys.exit(1)

            package, module = module_strings[i].rsplit('.', 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            try:
                if backscrape:
                    for site in site_yielder(mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, 'Sent %s request to: %s' % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    scrape_court(site, binaries)
            except Exception:
                v_print(3, '*************!! CRAWLER DOWN !!****************')
                v_print(3, '*****scrape_court method failed on mod: %s*****' %
                        module_strings[i])
                v_print(3, '*************!! ACTION NEEDED !!***************')
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = (i == (num_courts - 1))
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, 'The scraper has stopped.')
    sys.exit(0)
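# `site_yielder` is used for --backscrape runs above but is not part of this
# excerpt. Given that the iterable comes from `Site().back_scrape_iterable`
# and the --backscrape help text mentions `_download_backwards`, a plausible
# sketch is a generator that builds one Site per element and points it at that
# slice of the historical corpus before yielding it; this is an assumption,
# not the library's verbatim code:

def site_yielder(iterable, mod):
    for item in iterable:
        site = mod.Site()
        # `_download_backwards` configures the site for one historical target,
        # e.g. a date range or a results-page number.
        site._download_backwards(item)
        yield site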
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    num_scrapers = len([s for s in module_strings
                        if 'backscraper' not in s])
    msg = "Testing {count} scrapers against their example files:"
    print(msg.format(count=num_scrapers))
    max_len_mod_string = max(len(mod) for mod in module_strings
                             if 'backscraper' not in mod) + 2
    num_example_files = 0
    num_warnings = 0
    cnt = CaseNameTweaker()
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(
                ' %s ' % module_string.ljust(max_len_mod_string)
            )
            sys.stdout.flush()
            # module_parts:
            # [0]  - "juriscraper"
            # [1]  - "opinions" or "oral_args"
            # ...  - rest of the path
            # [-1] - module name
            module_parts = module_string.split('.')
            example_path = os.path.join(
                "tests", "examples", module_parts[1],
                "united_states", module_parts[-1],
            )
            paths = glob.glob('%s_example*' % example_path)
            self.assertTrue(
                paths,
                "No example file found for: %s! \n\nThe test looked in: "
                "%s" % (
                    module_string.rsplit('.', 1)[1],
                    os.path.join(os.getcwd(), example_path),
                ))
            num_example_files += len(paths)
            t1 = time.time()
            num_tests = len(paths)
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site(cnt=cnt)
                site.url = path
                # Forces a local GET
                site.method = 'LOCAL'
                site.parse()
            t2 = time.time()

            max_speed = 15
            warn_speed = 1
            speed = t2 - t1
            msg = ''
            if speed > max_speed:
                if sys.gettrace() is None:
                    # Only do this if we're not debugging. Debuggers make
                    # things slower and breakpoints make things stop.
                    raise SlownessException(
                        "This scraper took {speed}s to test, which is more "
                        "than the allowed speed of {max_speed}s. "
                        "Please speed it up for tests to pass.".format(
                            speed=speed, max_speed=max_speed,
                        ))
            elif speed > warn_speed:
                msg = ' - WARNING: SLOW SCRAPER'
                num_warnings += 1
            else:
                msg = ''
            print('(%s test(s) in %0.1f seconds%s)' %
                  (num_tests, speed, msg))

    print("\n{num_scrapers} scrapers tested successfully against "
          "{num_example_files} example files, with {num_warnings} "
          "speed warnings.".format(
              num_scrapers=num_scrapers,
              num_example_files=num_example_files,
              num_warnings=num_warnings,))
    if num_warnings:
        print("\nAt least one speed warning was triggered during the "
              "tests. If this is due to a slow scraper you wrote, we "
              "suggest attempting to speed it up, as it will be slow "
              "both in production and while running tests. This is "
              "currently a warning, but may raise a failure in the "
              "future as performance requirements are tightened.")
    else:
        # Someday, this line of code will be run. That day is not today.
        print("\nNo speed warnings detected. That's great, keep up the " \
              "good work!")
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    num_scrapers = len([s for s in module_strings
                        if 'backscraper' not in s])
    msg = "Testing {count} scrapers against their example files:"
    print(msg.format(count=num_scrapers))
    max_len_mod_string = max(len(mod) for mod in module_strings
                             if 'backscraper' not in mod) + 2
    num_example_files = 0
    num_warnings = 0
    cnt = CaseNameTweaker()
    json_compare_extension = '.compare.json'
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(
                ' %s ' % module_string.ljust(max_len_mod_string)
            )
            sys.stdout.flush()
            # module_parts:
            # [0]  - "juriscraper"
            # [1]  - "opinions" or "oral_args"
            # ...  - rest of the path
            # [-1] - module name
            module_parts = module_string.split('.')
            example_path = os.path.join(
                "tests", "examples", module_parts[1],
                "united_states", module_parts[-1],
            )
            paths = [path for path in glob.glob('%s_example*' % example_path)
                     if not path.endswith(json_compare_extension)]
            self.assertTrue(
                paths,
                "No example file found for: %s! \n\nThe test looked in: "
                "%s" % (
                    module_string.rsplit('.', 1)[1],
                    os.path.join(os.getcwd(), example_path),
                ))
            num_example_files += len(paths)
            t1 = time.time()
            num_tests = len(paths)
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site(cnt=cnt)
                site.url = path
                # Forces a local GET
                site.method = 'LOCAL'
                site.parse()

                # Now validate that the parsed result is as we expect
                json_path = '%s%s' % (path.rsplit('.', 1)[0],
                                      json_compare_extension)
                json_data = json.loads(site.to_json(), encoding='utf-8')
                if os.path.isfile(json_path):
                    # Compare result with corresponding json file
                    example_file = path.rsplit('/', 1)[1]
                    compare_file = json_path.rsplit('/', 1)[1]
                    with open(json_path, 'r') as input_file:
                        fixture_json = json.load(input_file)
                        self.assertEqual(
                            len(fixture_json),
                            len(json_data),
                            msg="Fixture and scraped data have different "
                                "lengths: expected %s and scraped %s (%s)" % (
                                    len(fixture_json),
                                    len(json_data),
                                    module_string
                                )
                        )
                        for i, item in enumerate(fixture_json):
                            self.assertEqual(
                                fixture_json[i],
                                json_data[i],
                            )
                else:
                    # Generate corresponding json file if it doesn't
                    # already exist. This should only happen once
                    # when adding a new example html file.
                    with open(json_path, 'w') as json_example:
                        json.dump(json_data, json_example, indent=2)
            t2 = time.time()

            max_speed = 15
            warn_speed = 1
            speed = t2 - t1
            msg = ''
            if speed > max_speed:
                if sys.gettrace() is None and not IS_TRAVIS:
                    # Only do this if we're not debugging. Debuggers make
                    # things slower and breakpoints make things stop.
                    raise SlownessException(
                        "This scraper took {speed}s to test, which is more "
                        "than the allowed speed of {max_speed}s. "
                        "Please speed it up for tests to pass.".format(
                            speed=speed, max_speed=max_speed,
                        ))
            elif speed > warn_speed:
                msg = ' - WARNING: SLOW SCRAPER'
                num_warnings += 1
            else:
                msg = ''
            print('(%s test(s) in %0.1f seconds%s)' %
                  (num_tests, speed, msg))

    print("\n{num_scrapers} scrapers tested successfully against "
          "{num_example_files} example files, with {num_warnings} "
          "speed warnings.".format(
              num_scrapers=num_scrapers,
              num_example_files=num_example_files,
              num_warnings=num_warnings,))
    if num_warnings:
        print("\nAt least one speed warning was triggered during the "
              "tests. If this is due to a slow scraper you wrote, we "
              "suggest attempting to speed it up, as it will be slow "
              "both in production and while running tests. This is "
              "currently a warning, but may raise a failure in the "
              "future as performance requirements are tightened.")
    else:
        # Someday, this line of code will be run. That day is not today.
        print("\nNo speed warnings detected. That's great, keep up the " \
              "good work!")
def main():
    global die_now

    # this line is used for handling SIGTERM (CTRL+4), so things can die safely
    signal.signal(signal.SIGTERM, signal_handler)

    usage = (
        "usage: %prog -c COURTID [-d|--daemon] [-b|--binaries] [-r|--report]\n\n"
        "To test ca1, downloading binaries, use: \n"
        " python %prog -c opinions.united_states.federal_appellate.ca1 -b\n\n"
        "To test all federal courts, omitting binaries, use: \n"
        " python %prog -c opinions.united_states.federal_appellate"
        "Passing the --report option will generate an HTML report in "
        "the root directory after scrapers have run")
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        "--courts",
        dest="court_id",
        metavar="COURTID",
        help=("The court(s) to scrape and extract. This should be in "
              "the form of a python module or package import "
              "from the Juriscraper library, e.g. "
              '"juriscraper.opinions.united_states.federal.ca1" or '
              'simply "opinions" to do all opinions. If desired, '
              "you can use slashes instead of dots to separate"
              "the import path."),
    )
    parser.add_option(
        "-d",
        "--daemon",
        action="store_true",
        dest="daemonmode",
        default=False,
        help=("Use this flag to turn on daemon "
              "mode, in which all courts requested "
              "will be scraped in turn, non-stop."),
    )
    parser.add_option(
        "-b",
        "--download_binaries",
        action="store_true",
        dest="binaries",
        default=False,
        help=("Use this flag if you wish to download the pdf, "
              "wpd, and doc files."),
    )
    parser.add_option(
        "-v",
        "--verbosity",
        action="count",
        default=1,
        help="Increase output verbosity (e.g., -vv is more than -v).",
    )
    parser.add_option(
        "--backscrape",
        dest="backscrape",
        action="store_true",
        default=False,
        help="Download the historical corpus using the _download_backwards method.",
    )
    parser.add_option(
        "-r",
        "--report",
        action="store_true",
        default=False,
        help="Generate a report.html with the outcome of running the scrapers",
    )
    (options, args) = parser.parse_args()

    daemon_mode = options.daemonmode
    binaries = options.binaries
    court_id = options.court_id
    backscrape = options.backscrape
    generate_report = options.report

    # Set up the print function
    print("Verbosity is set to: %s" % options.verbosity)

    def _v_print(*verb_args):
        if verb_args[0] > (3 - options.verbosity):
            print(verb_args[1])

    global v_print
    v_print = _v_print

    results = {}

    if not court_id:
        parser.error("You must specify a court as a package or module.")
    else:
        court_id = court_id.replace("/", ".")
        if court_id.endswith(".py"):
            court_id = court_id[:-3]

        module_strings = build_module_list(court_id)
        if len(module_strings) == 0:
            parser.error("Unable to import module or package. Aborting.")

        v_print(3, "Starting up the scraper.")
        num_courts = len(module_strings)
        i = 0
        while i < num_courts:
            current_court = module_strings[i]
            results[current_court] = {"global_failure": False}
            # this catches SIGINT, so the code can be killed safely.
            if die_now:
                v_print(3, "The scraper has stopped.")
                sys.exit(1)

            package, module = module_strings[i].rsplit(".", 1)
            v_print(3, "Current court: %s.%s" % (package, module))

            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            try:
                if backscrape:
                    for site in site_yielder(mod.Site().back_scrape_iterable, mod):
                        site.parse()
                        scrape_court(site, binaries)
                else:
                    site = mod.Site()
                    v_print(3, "Sent %s request to: %s" % (site.method, site.url))
                    if site.uses_selenium:
                        v_print(3, "Selenium will be used.")
                    site.parse()
                    results[current_court]["scrape"] = scrape_court(
                        site, binaries)
            except Exception:
                results[current_court][
                    "global_failure"] = traceback.format_exc()
                results[current_court]["scrape"] = {}
                v_print(3, "*************!! CRAWLER DOWN !!****************")
                v_print(
                    3,
                    "*****scrape_court method failed on mod: %s*****" %
                    module_strings[i],
                )
                v_print(3, "*************!! ACTION NEEDED !!***************")
                v_print(3, traceback.format_exc())
                i += 1
                continue

            last_court_in_list = i == (num_courts - 1)
            if last_court_in_list and daemon_mode:
                i = 0
            else:
                i += 1

    v_print(3, "The scraper has stopped.")

    if generate_report:
        report_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../report.html"))
        v_print(3, "Generating HTML report at %s" % report_path)
        generate_scraper_report(report_path, results)

    sys.exit(0)
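# For reference, the `results` mapping handed to `generate_scraper_report`
# above is keyed by the court module string; each entry records either the
# scrape outcome or the traceback of a global failure. Its shape, as built in
# main() above (the values shown here are illustrative):

example_results = {
    "opinions.united_states.federal_appellate.ca1": {
        "global_failure": False,  # or a traceback string when the crawler died
        "scrape": {},             # whatever scrape_court(site, binaries) returned
    },
}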
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    num_scrapers = len([s for s in module_strings
                        if 'backscraper' not in s])
    max_len_mod_string = max(len(mod) for mod in module_strings
                             if 'backscraper' not in mod) + 2
    num_example_files = 0
    num_warnings = 0
    cnt = CaseNameTweaker()
    json_compare_extension = '.compare.json'
    json_compare_files_generated = []
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(
                ' %s ' % module_string.ljust(max_len_mod_string)
            )
            sys.stdout.flush()
            # module_parts:
            # [0]  - "juriscraper"
            # [1]  - "opinions" or "oral_args"
            # ...  - rest of the path
            # [-1] - module name
            module_parts = module_string.split('.')
            example_path = os.path.join(
                "tests", "examples", module_parts[1],
                "united_states", module_parts[-1],
            )
            paths = [path for path in glob.glob('%s_example*' % example_path)
                     if not path.endswith(json_compare_extension)]
            self.assertTrue(
                paths,
                "No example file found for: %s! \n\nThe test looked in: "
                "%s" % (
                    module_string.rsplit('.', 1)[1],
                    os.path.join(os.getcwd(), example_path),
                ))
            num_example_files += len(paths)
            t1 = time.time()
            num_tests = len(paths)
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site(cnt=cnt)
                site.url = path
                # Forces a local GET
                site.enable_test_mode()
                site.parse()

                # Now validate that the parsed result is as we expect
                json_path = '%s%s' % (path.rsplit('.', 1)[0],
                                      json_compare_extension)
                json_data = json.loads(site.to_json(), encoding='utf-8')
                if os.path.isfile(json_path):
                    # Compare result with corresponding json file
                    example_file = path.rsplit('/', 1)[1]
                    compare_file = json_path.rsplit('/', 1)[1]
                    with open(json_path, 'r') as input_file:
                        fixture_json = json.load(input_file)
                        self.assertEqual(
                            len(fixture_json),
                            len(json_data),
                            msg="Fixture and scraped data have different "
                                "lengths: expected %s and scraped %s (%s)" % (
                                    len(fixture_json),
                                    len(json_data),
                                    module_string
                                )
                        )
                        for i, item in enumerate(fixture_json):
                            self.assertEqual(
                                fixture_json[i],
                                json_data[i],
                            )
                else:
                    # Generate corresponding json file if it doesn't
                    # already exist. This should only happen once
                    # when adding a new example html file.
                    warn_generated_compare_file(json_path)
                    json_compare_files_generated.append(json_path)
                    with open(json_path, 'w') as json_example:
                        json.dump(json_data, json_example, indent=2)
            t2 = time.time()
            duration = t2 - t1
            warning_msg = warn_or_crash_slow_parser(t2 - t1)
            if warning_msg:
                num_warnings += 1

            print('(%s test(s) in %0.1f seconds)' % (num_tests, duration))

    print("\n{num_scrapers} scrapers tested successfully against "
          "{num_example_files} example files, with {num_warnings} "
          "speed warnings.".format(
              num_scrapers=num_scrapers,
              num_example_files=num_example_files,
              num_warnings=num_warnings,))
    if json_compare_files_generated:
        msg = 'Generated compare file(s) during test, please review before proceeding. ' \
              'If the data looks good, run tests again, then be sure to include ' \
              'the new compare file(s) in your commit: %s'
        self.fail(msg % ', '.join(json_compare_files_generated))
    if num_warnings:
        print("\nAt least one speed warning was triggered during the "
              "tests. If this is due to a slow scraper you wrote, we "
              "suggest attempting to speed it up, as it will be slow "
              "both in production and while running tests. This is "
              "currently a warning, but may raise a failure in the "
              "future as performance requirements are tightened.")
    else:
        # Someday, this line of code will be run. That day is not today.
        print("\nNo speed warnings detected. That's great, keep up the " \
              "good work!")
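# `warn_or_crash_slow_parser` above replaces the inline speed check used in
# the earlier test versions (the max_speed/warn_speed logic with a
# debugger/CI escape hatch). A sketch consistent with that inline logic,
# assuming the same thresholds and reusing the `IS_TRAVIS` flag and
# `SlownessException` already present in those tests; this is not necessarily
# the exact helper shipped with the test suite:

import sys


def warn_or_crash_slow_parser(duration, max_duration=15, warn_duration=1):
    msg = ''
    if duration > max_duration:
        if sys.gettrace() is None and not IS_TRAVIS:
            # Only crash when not debugging; debuggers and breakpoints make
            # everything slower.
            raise SlownessException(
                "This scraper took %ss to test, which is more than the "
                "allowed speed of %ss. Please speed it up for tests to "
                "pass." % (duration, max_duration))
    elif duration > warn_duration:
        msg = ' - WARNING: SLOW SCRAPER'
    return msg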
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    num_scrapers = len([s for s in module_strings
                        if 'backscraper' not in s])
    print "Testing {count} scrapers against their example files:".format(
        count=num_scrapers)
    max_len_mod_string = max(len(mod) for mod in module_strings
                             if 'backscraper' not in mod) + 2
    num_example_files = 0
    num_warnings = 0
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(
                ' %s ' % module_string.ljust(max_len_mod_string)
            )
            sys.stdout.flush()
            paths = glob.glob(
                '%s_example*' % module_string.replace('.', '/'))
            self.assertTrue(paths,
                            "No example file found for: %s!" %
                            module_string.rsplit('.', 1)[1])
            num_example_files += len(paths)
            t1 = time.time()
            num_tests = len(paths)
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site()
                site.url = path
                # Forces a local GET
                site.method = 'LOCAL'
                site.parse()
            t2 = time.time()

            max_speed = 10
            warn_speed = 1
            speed = t2 - t1
            if speed > max_speed:
                raise SlownessException(
                    "This scraper took {speed}s to test, which is more "
                    "than the allowed speed of {max_speed}s. "
                    "Please speed it up for tests to pass.".format(
                        speed=speed, max_speed=max_speed,
                    ))
            elif speed > warn_speed:
                msg = ' - WARNING: SLOW SCRAPER'
                num_warnings += 1
            else:
                msg = ''
            print '(%s test(s) in %0.1f seconds%s)' % (
                num_tests, speed, msg
            )

    print ("\n{num_scrapers} scrapers tested successfully against "
           "{num_example_files} example files, with {num_warnings} "
           "speed warnings.".format(
               num_scrapers=num_scrapers,
               num_example_files=num_example_files,
               num_warnings=num_warnings,
           ))
    if num_warnings:
        print ("\nAt least one speed warning was triggered during the "
               "tests. If this is due to a slow scraper you wrote, we "
               "suggest attempting to speed it up, as it will be slow "
               "both in production and while running tests. This is "
               "currently a warning, but may raise a failure in the "
               "future as performance requirements are tightened.")
    else:
        # Someday, this line of code will be run. That day is not today.
        print "\nNo speed warnings detected. That's great, keep up the " \
              "good work!"
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    num_scrapers = len([s for s in module_strings
                        if 'backscraper' not in s])
    msg = "Testing {count} scrapers against their example files:"
    print(msg.format(count=num_scrapers))
    max_len_mod_string = max(len(mod) for mod in module_strings
                             if 'backscraper' not in mod) + 2
    num_example_files = 0
    num_warnings = 0
    cnt = CaseNameTweaker()
    json_compare_extension = '.compare.json'
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(),
                         locals(),
                         [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(
                ' %s ' % module_string.ljust(max_len_mod_string)
            )
            sys.stdout.flush()
            # module_parts:
            # [0]  - "juriscraper"
            # [1]  - "opinions" or "oral_args"
            # ...  - rest of the path
            # [-1] - module name
            module_parts = module_string.split('.')
            example_path = os.path.join(
                "tests", "examples", module_parts[1],
                "united_states", module_parts[-1],
            )
            paths = [path for path in glob.glob('%s_example*' % example_path)
                     if not path.endswith(json_compare_extension)]
            self.assertTrue(
                paths,
                "No example file found for: %s! \n\nThe test looked in: "
                "%s" % (
                    module_string.rsplit('.', 1)[1],
                    os.path.join(os.getcwd(), example_path),
                ))
            num_example_files += len(paths)
            t1 = time.time()
            num_tests = len(paths)
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site(cnt=cnt)
                site.url = path
                # Forces a local GET
                site.method = 'LOCAL'
                site.parse()

                # Now validate that the parsed result is as we expect
                json_path = '%s%s' % (path.rsplit('.', 1)[0],
                                      json_compare_extension)
                json_data = site.to_json()
                if os.path.isfile(json_path):
                    # Compare result with corresponding json file
                    example_file = path.rsplit('/', 1)[1]
                    compare_file = json_path.rsplit('/', 1)[1]
                    error = ('The result of parsing ' + example_file +
                             ' does not match the expected data in ' +
                             compare_file + '. Either the later has ' +
                             'bad data or recent changes to this scraper ' +
                             'are incompatible with the ' + example_file +
                             ' use case. PARSED JSON: ' + json_data)
                    with open(json_path, 'r') as input_file:
                        self.assertEqual(input_file.read(), json_data, error)
                else:
                    # Generate corresponding json file if it doesn't
                    # already exist. This should only happen once
                    # when adding a new example html file.
                    with open(json_path, 'w') as json_example:
                        json_example.write(json_data)
            t2 = time.time()

            max_speed = 15
            warn_speed = 1
            speed = t2 - t1
            msg = ''
            if speed > max_speed:
                if sys.gettrace() is None and not IS_TRAVIS:
                    # Only do this if we're not debugging. Debuggers make
                    # things slower and breakpoints make things stop.
                    raise SlownessException(
                        "This scraper took {speed}s to test, which is more "
                        "than the allowed speed of {max_speed}s. "
                        "Please speed it up for tests to pass.".format(
                            speed=speed, max_speed=max_speed,
                        ))
            elif speed > warn_speed:
                msg = ' - WARNING: SLOW SCRAPER'
                num_warnings += 1
            else:
                msg = ''
            print('(%s test(s) in %0.1f seconds%s)' %
                  (num_tests, speed, msg))

    print("\n{num_scrapers} scrapers tested successfully against "
          "{num_example_files} example files, with {num_warnings} "
          "speed warnings.".format(
              num_scrapers=num_scrapers,
              num_example_files=num_example_files,
              num_warnings=num_warnings,))
    if num_warnings:
        print("\nAt least one speed warning was triggered during the "
              "tests. If this is due to a slow scraper you wrote, we "
              "suggest attempting to speed it up, as it will be slow "
              "both in production and while running tests. This is "
              "currently a warning, but may raise a failure in the "
              "future as performance requirements are tightened.")
    else:
        # Someday, this line of code will be run. That day is not today.
        print("\nNo speed warnings detected. That's great, keep up the " \
              "good work!")