# Shared imports for the test helpers below. The helpers are collected from
# several test modules (which is why some names repeat across variants), and
# each is a method of a unittest.TestCase subclass: `self` is the test case
# instance. Project-specific names such as check_if_logged_in_page,
# warn_or_crash_slow_parser, DocketReport, LASCSearch, CaseNameTweaker,
# build_module_list, and warn_generated_compare_file are assumed to be
# importable from the surrounding codebase.
import fnmatch
import glob
import json
import os
import sys
import time


def parse_files(self, path_root, file_ext):
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, file_ext):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        t1 = time.time()
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        with open(path, 'rb') as f:
            text = f.read()
        result = check_if_logged_in_page(text)
        if not os.path.exists(json_path):
            with open(json_path, 'w') as f:
                print("Creating new file at %s" % json_path)
                json.dump(result, f, indent=2, sort_keys=True)
            continue
        with open(json_path) as f:
            j = json.load(f)
        self.assertEqual(j, result)
        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=0.5)
        sys.stdout.write("✓\n")
def parse_files(self, path_root, file_ext, test_class):
    """Can we do a simple query and parse?"""
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, file_ext):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        t1 = time.time()
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split(".")[0]
        json_path = os.path.join(dirname, "%s.json" % filename_sans_ext)
        court = filename_sans_ext.split("_")[0]
        report = test_class(court)
        with open(path, "r") as f:
            report._parse_text(f.read())
        # Does the metadata function work too? It usually, but not always,
        # gets called by report.data.
        try:
            _ = report.metadata
        except AttributeError:
            # Some reports don't have this method.
            pass
        data = report.data
        if not os.path.exists(json_path):
            with open(json_path, "w") as f:
                print("Creating new file at %s" % json_path)
                json.dump(data, f, indent=2, sort_keys=True)
            continue
        # Round-trip through JSON so the types match what json.load returns.
        data = json.loads(json.dumps(data, sort_keys=True))
        with open(json_path) as f:
            j = json.load(f)
        with self.subTest("Parsing PACER", file=filename, klass=test_class):
            self.assertEqual(j, data)
        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=2)
        sys.stdout.write("✓\n")
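# Usage sketch for the helper above; an illustration, not code from the
# source. The test class name and fixture directory are hypothetical, and
# the sketch assumes parse_files is defined at module level so it can be
# bound to a TestCase as a method. DocketReport is the report class that
# juriscraper exposes via `from juriscraper.pacer import DocketReport`.
import unittest

from juriscraper.pacer import DocketReport


class DocketParseTest(unittest.TestCase):
    """Hypothetical driver for the parse_files helper above."""

    parse_files = parse_files  # bind the module-level helper as a method

    def test_dockets(self):
        # Hypothetical fixture root: each <court>_<name>.html file is
        # compared against a sibling <court>_<name>.json fixture, and the
        # court id is taken from the filename prefix before the first "_".
        self.parse_files("tests/examples/pacer/dockets", "*.html",
                         test_class=DocketReport)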
def run_parsers_on_path(self, path):
    """Test all the parsers on a given local path

    :param path: The path where you can find the files
    """
    file_paths = glob.glob(path)
    file_paths.sort()
    path_max_len = max(len(path) for path in file_paths) + 2
    for i, path in enumerate(file_paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        t1 = time.time()
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split(".")[0]
        json_path = os.path.join(
            dirname, "%s_result.json" % filename_sans_ext
        )
        lasc = LASCSearch(session=None)
        with open(path, "rb") as f:
            data = json.load(f)
        clean_data = lasc._parse_case_data(data)
        if not os.path.isfile(json_path):
            # First time testing this docket
            bar = "*" * 50
            print(
                "\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                "\n\n %s\n\n"
                "Please test the data in this file before assuming "
                "everything worked.\n%s\n" % (bar, json_path, bar)
            )
            with open(json_path, "w") as f:
                json.dump(clean_data, f, indent=2, sort_keys=True)
            continue
        with open(json_path) as f:
            j = json.load(f)
        self.assertEqual(j, clean_data)
        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=1)
        sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
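# Usage sketch (hypothetical): `path` is handed straight to glob.glob, so
# the caller chooses the wildcard. The directory below is illustrative, and
# the pattern would need to avoid matching the generated *_result.json
# fixtures, e.g. via a more specific glob or a separate results directory.
#
#     self.run_parsers_on_path(
#         os.path.join("tests", "examples", "lasc", "*.json")
#     )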
def parse_files(self, path_root, file_ext, test_class,
                initialize_with_court=True):
    """Can we do a simple query and parse?"""
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, file_ext):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        t1 = time.time()
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split('.')[0]
        json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
        if initialize_with_court:
            court = filename_sans_ext.split('_')[0]
            report = test_class(court)
        else:
            report = test_class()
        with open(path, 'rb') as f:
            report._parse_text(f.read().decode('utf-8'))
        data = report.data
        if not os.path.exists(json_path):
            with open(json_path, 'w') as f:
                print("Creating new file at %s" % json_path)
                json.dump(data, f, indent=2, sort_keys=True)
            continue
        with open(json_path) as f:
            j = json.load(f)
        self.assertEqual(j, data)
        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=2)
        sys.stdout.write("✓\n")
def run_parsers_on_path(
    self,
    path_root,
    required_fields=["date_filed", "case_name", "docket_number"],
):
    """Test all the parsers, faking the network query."""
    paths = []
    for root, dirnames, filenames in os.walk(path_root):
        for filename in fnmatch.filter(filenames, "*.html"):
            paths.append(os.path.join(root, filename))
    paths.sort()
    path_max_len = max(len(path) for path in paths) + 2
    for i, path in enumerate(paths):
        sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
        t1 = time.time()
        dirname, filename = os.path.split(path)
        filename_sans_ext = filename.split(".")[0]
        json_path = os.path.join(dirname, "%s.json" % filename_sans_ext)
        court = filename_sans_ext.split("_")[0]
        report = DocketReport(court)
        with open(path, "rb") as f:
            report._parse_text(f.read().decode("utf-8"))
        data = report.data
        if data != {}:
            # If the docket is a valid docket, make sure some required
            # fields are populated.
            for field in required_fields:
                self.assertTrue(
                    data[field],
                    msg="Unable to find truthy value for field %s" % field,
                )
            self.assertEqual(data["court_id"], court)
            # Party-specific tests...
            for party in data["parties"]:
                self.assertTrue(
                    party.get("name", False),
                    msg="Every party must have a name attribute. Did not "
                    "get a value for:\n\n%s" % party,
                )
                # Protect against effed up adversary proceedings cases that
                # don't parse properly. See: cacb, 2:08-ap-01570-BB
                self.assertNotIn("----", party["name"])
        if not os.path.isfile(json_path):
            bar = "*" * 50
            print(
                "\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                "\n\n %s\n\n"
                "Please test the data in this file before assuming "
                "everything worked.\n%s\n" % (bar, json_path, bar)
            )
            with open(json_path, "w") as f:
                json.dump(data, f, indent=2, sort_keys=True)
            # self.assertFalse(True)
            continue
        with open(json_path) as f:
            j = json.load(f)
        if j != {}:
            # Compare docket entries and parties first, for easier
            # debugging, then compare whole objects to be sure.
            self.assertEqual(j["docket_entries"], data["docket_entries"])
            self.assertEqual(j["parties"], data["parties"])
        self.assertEqual(j, data)
        t2 = time.time()
        duration = t2 - t1
        warn_or_crash_slow_parser(duration, max_duration=1)
        sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
def test_scrape_all_example_files(self):
    """Finds all the $module_example* files and tests them with the sample
    scraper.
    """
    module_strings = build_module_list('juriscraper')
    num_scrapers = len([s for s in module_strings
                        if 'backscraper' not in s])
    max_len_mod_string = max(len(mod) for mod in module_strings
                             if 'backscraper' not in mod) + 2
    num_example_files = 0
    num_warnings = 0
    cnt = CaseNameTweaker()
    json_compare_extension = '.compare.json'
    json_compare_files_generated = []
    for module_string in module_strings:
        package, module = module_string.rsplit('.', 1)
        mod = __import__("%s.%s" % (package, module),
                         globals(), locals(), [module])
        if 'backscraper' not in module_string:
            sys.stdout.write(
                ' %s ' % module_string.ljust(max_len_mod_string)
            )
            sys.stdout.flush()
            # module_parts:
            #   [0]  - "juriscraper"
            #   [1]  - "opinions" or "oral_args"
            #   ...  - rest of the path
            #   [-1] - module name
            module_parts = module_string.split('.')
            example_path = os.path.join(
                "tests", "examples", module_parts[1],
                "united_states", module_parts[-1],
            )
            paths = [
                path for path in glob.glob('%s_example*' % example_path)
                if not path.endswith(json_compare_extension)
            ]
            self.assertTrue(
                paths,
                "No example file found for: %s! \n\nThe test looked in: "
                "%s" % (
                    module_string.rsplit('.', 1)[1],
                    os.path.join(os.getcwd(), example_path),
                ))
            num_example_files += len(paths)
            t1 = time.time()
            num_tests = len(paths)
            for path in paths:
                # This loop allows multiple example files per module
                if path.endswith('~'):
                    # Text editor backup: Not interesting.
                    continue
                site = mod.Site(cnt=cnt)
                site.url = path
                # Forces a local GET
                site.enable_test_mode()
                site.parse()
                # Now validate that the parsed result is as we expect
                json_path = '%s%s' % (path.rsplit('.', 1)[0],
                                      json_compare_extension)
                json_data = json.loads(site.to_json())
                if os.path.isfile(json_path):
                    # Compare the parse against the corresponding JSON file
                    with open(json_path, 'r') as input_file:
                        fixture_json = json.load(input_file)
                    self.assertEqual(
                        len(fixture_json),
                        len(json_data),
                        msg="Fixture and scraped data have different "
                            "lengths: expected %s and scraped %s (%s)" % (
                                len(fixture_json),
                                len(json_data),
                                module_string,
                            )
                    )
                    for i, item in enumerate(fixture_json):
                        self.assertEqual(fixture_json[i], json_data[i])
                else:
                    # Generate the corresponding JSON file if it doesn't
                    # already exist. This should only happen once, when a
                    # new example HTML file is added.
                    warn_generated_compare_file(json_path)
                    json_compare_files_generated.append(json_path)
                    with open(json_path, 'w') as json_example:
                        json.dump(json_data, json_example, indent=2)
            t2 = time.time()
            duration = t2 - t1
            warning_msg = warn_or_crash_slow_parser(duration)
            if warning_msg:
                num_warnings += 1
            print('(%s test(s) in %0.1f seconds)' % (num_tests, duration))

    print("\n{num_scrapers} scrapers tested successfully against "
          "{num_example_files} example files, with {num_warnings} "
          "speed warnings.".format(
              num_scrapers=num_scrapers,
              num_example_files=num_example_files,
              num_warnings=num_warnings,
          ))
    if json_compare_files_generated:
        msg = ('Generated compare file(s) during test; please review them '
               'before proceeding. If the data looks good, run the tests '
               'again, then be sure to include the new compare file(s) in '
               'your commit: %s')
        self.fail(msg % ', '.join(json_compare_files_generated))
    if num_warnings:
        print("\nAt least one speed warning was triggered during the "
              "tests. If this is due to a slow scraper you wrote, we "
              "suggest attempting to speed it up, as it will be slow "
              "both in production and while running tests. This is "
              "currently a warning, but may raise a failure in the "
              "future as performance requirements are tightened.")
    else:
        # Someday, this line of code will be run. That day is not today.
        print("\nNo speed warnings detected. That's great, keep up the "
              "good work!")
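# Every helper above leans on warn_or_crash_slow_parser, which is not shown
# in this file. The sketch below is inferred purely from the call sites: it
# must accept a duration and a max_duration keyword (defaulting to something
# sensible for the bare call above) and return something truthy (the warning
# message) when the parse was slow. The escalation policy is an assumption;
# the real helper in the codebase may behave differently.
def warn_or_crash_slow_parser(duration, max_duration=1):
    if duration <= max_duration:
        return None
    msg = "SLOW PARSER: %0.1fs (max allowed: %0.1fs)" % (duration,
                                                         max_duration)
    # Assumed policy: warn on moderate overruns, but crash when a parse is
    # wildly over budget so regressions can't hide behind warnings.
    if duration > max_duration * 10:
        raise AssertionError(msg)
    sys.stdout.write("\n%s " % msg)
    return msg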