Example #1
    def parse_files(self, path_root, file_ext):
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, file_ext):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):
            t1 = time.time()
            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)

            with open(path, 'rb') as f:
                text = f.read()

            result = check_if_logged_in_page(text)

            if not os.path.exists(json_path):
                with open(json_path, 'w') as f:
                    print("Creating new file at %s" % json_path)
                    json.dump(result, f, indent=2, sort_keys=True)
                continue
            with open(json_path) as f:
                j = json.load(f)
                self.assertEqual(j, result)

            t2 = time.time()
            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=0.5)

            sys.stdout.write("✓\n")
Example #2
    def parse_files(self, path_root, file_ext):
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, file_ext):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):
            t1 = time.time()
            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)

            with open(path, 'rb') as f:
                text = f.read()

            result = check_if_logged_in_page(text)

            if not os.path.exists(json_path):
                with open(json_path, 'w') as f:
                    print("Creating new file at %s" % json_path)
                    json.dump(result, f, indent=2, sort_keys=True)
                continue
            with open(json_path) as f:
                j = json.load(f)
                self.assertEqual(j, result)
            t2 = time.time()
            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=0.5)

            sys.stdout.write("✓\n")
Example #3
    def parse_files(self, path_root, file_ext, test_class):
        """Can we do a simple query and parse?"""
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, file_ext):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):
            t1 = time.time()
            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split(".")[0]
            json_path = os.path.join(dirname, "%s.json" % filename_sans_ext)

            court = filename_sans_ext.split("_")[0]
            report = test_class(court)
            with open(path, "r") as f:
                report._parse_text(f.read())

            # Does the metadata function work too? It usually, but not always,
            # gets called by report.data
            try:
                _ = report.metadata
            except AttributeError:
                # Some reports don't have this method.
                pass
            data = report.data
            if not os.path.exists(json_path):
                with open(json_path, "w") as f:
                    print("Creating new file at %s" % json_path)
                    json.dump(data, f, indent=2, sort_keys=True)
                continue
            # Round-trip through json so Python-only types are normalized
            # (e.g. tuples become lists) before comparing to the fixture.
            data = json.loads(json.dumps(data, sort_keys=True))
            with open(json_path) as f:
                j = json.load(f)
                with self.subTest("Parsing PACER",
                                  file=filename,
                                  klass=test_class):
                    self.assertEqual(j, data)
            t2 = time.time()
            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=2)

            sys.stdout.write("✓\n")
Example #4
    def run_parsers_on_path(self, path):
        """Test all the parsers on a given local path

        :param path: The path where you can find the files
        """
        file_paths = glob.glob(path)
        file_paths.sort()
        path_max_len = max(len(path) for path in file_paths) + 2
        for i, path in enumerate(file_paths):
            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            t1 = time.time()
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split(".")[0]
            json_path = os.path.join(
                dirname, "%s_result.json" % filename_sans_ext
            )

            lasc = LASCSearch(session=None)
            with open(path, "rb") as f:
                data = json.load(f)
                clean_data = lasc._parse_case_data(data)

            if not os.path.isfile(json_path):
                # First time testing this docket
                bar = "*" * 50
                print(
                    "\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                    "\n\n  %s\n\n"
                    "Please test the data in this file before assuming "
                    "everything worked.\n%s\n" % (bar, json_path, bar)
                )
                with open(json_path, "w") as f:
                    json.dump(clean_data, f, indent=2, sort_keys=True)
                    continue

            with open(json_path) as f:
                j = json.load(f)
                self.assertEqual(j, clean_data)

            t2 = time.time()
            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=1)
            sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
Example #5
    def parse_files(self,
                    path_root,
                    file_ext,
                    test_class,
                    initialize_with_court=True):
        """Can we do a simple query and parse?"""
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, file_ext):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):
            t1 = time.time()
            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)

            if initialize_with_court:
                court = filename_sans_ext.split('_')[0]
                report = test_class(court)
            else:
                report = test_class()
            with open(path, 'rb') as f:
                report._parse_text(f.read().decode('utf-8'))
            data = report.data
            if not os.path.exists(json_path):
                with open(json_path, 'w') as f:
                    print("Creating new file at %s" % json_path)
                    json.dump(data, f, indent=2, sort_keys=True)
                continue
            with open(json_path) as f:
                j = json.load(f)
                self.assertEqual(j, data)
            t2 = time.time()
            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=2)

            sys.stdout.write("✓\n")
Example #6
    def parse_files(self, path_root, file_ext, test_class,
                    initialize_with_court=True):
        """Can we do a simple query and parse?"""
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, file_ext):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):
            t1 = time.time()
            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)

            if initialize_with_court:
                court = filename_sans_ext.split('_')[0]
                report = test_class(court)
            else:
                report = test_class()
            # Open in binary mode so the explicit decode works under Python 3
            # (text mode already yields str, which has no .decode()).
            with open(path, 'rb') as f:
                report._parse_text(f.read().decode('utf-8'))
            data = report.data
            if not os.path.exists(json_path):
                with open(json_path, 'w') as f:
                    print("Creating new file at %s" % json_path)
                    json.dump(data, f, indent=2, sort_keys=True)
                continue
            with open(json_path) as f:
                j = json.load(f)
                self.assertEqual(j, data)
            t2 = time.time()
            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=2)

            sys.stdout.write("✓\n")
Example #7
    def run_parsers_on_path(
        self,
        path_root,
        required_fields=["date_filed", "case_name", "docket_number"],
    ):
        """Test all the parsers, faking the network query."""
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, "*.html"):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):

            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            t1 = time.time()
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split(".")[0]
            json_path = os.path.join(dirname, "%s.json" % filename_sans_ext)
            court = filename_sans_ext.split("_")[0]

            report = DocketReport(court)
            with open(path, "rb") as f:
                report._parse_text(f.read().decode("utf-8"))
            data = report.data

            if data != {}:
                # If the docket is a valid docket, make sure some required
                # fields are populated.
                for field in required_fields:
                    self.assertTrue(
                        data[field],
                        msg="Unable to find truthy value for field %s" % field,
                    )

                self.assertEqual(data["court_id"], court)

                # Party-specific tests...
                for party in data["parties"]:
                    self.assertTrue(
                        party.get("name", False),
                        msg="Every party must have a name attribute. Did not "
                        "get a value for:\n\n%s" % party,
                    )
                    # Protect against effed up adversary proceedings cases that
                    # don't parse properly. See: cacb, 2:08-ap-01570-BB
                    self.assertNotIn("----", party["name"])

            if not os.path.isfile(json_path):
                bar = "*" * 50
                print(
                    "\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                    "\n\n  %s\n\n"
                    "Please test the data in this file before assuming "
                    "everything worked.\n%s\n" % (bar, json_path, bar)
                )
                with open(json_path, "w") as f:
                    json.dump(data, f, indent=2, sort_keys=True)
                    # self.assertFalse(True)
                    continue

            with open(json_path) as f:
                j = json.load(f)
                if j != {}:
                    # Compare docket entries and parties first, for easier
                    # debugging, then compare whole objects to be sure.
                    self.assertEqual(
                        j["docket_entries"], data["docket_entries"]
                    )
                    self.assertEqual(j["parties"], data["parties"])
                self.assertEqual(j, data)
            t2 = time.time()

            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=1)
            sys.stdout.write("✓ - %0.1fs\n" % (t2 - t1))
Example #8
    def test_scrape_all_example_files(self):
        """Finds all the $module_example* files and tests them with the sample
        scraper.
        """

        module_strings = build_module_list('juriscraper')
        num_scrapers = len([s for s in module_strings
                            if 'backscraper' not in s])
        max_len_mod_string = max(len(mod) for mod in module_strings
                                 if 'backscraper' not in mod) + 2
        num_example_files = 0
        num_warnings = 0
        cnt = CaseNameTweaker()
        json_compare_extension = '.compare.json'
        json_compare_files_generated = []
        for module_string in module_strings:
            package, module = module_string.rsplit('.', 1)
            mod = __import__("%s.%s" % (package, module),
                             globals(),
                             locals(),
                             [module])
            if 'backscraper' not in module_string:
                sys.stdout.write(
                    '  %s ' % module_string.ljust(max_len_mod_string)
                )
                sys.stdout.flush()
                # module_parts:
                # [0]  - "juriscraper"
                # [1]  - "opinions" or "oral_args"
                # ...  - rest of the path
                # [-1] - module name
                module_parts = module_string.split('.')
                example_path = os.path.join(
                    "tests", "examples", module_parts[1],
                    "united_states", module_parts[-1],
                )
                paths = [path for path in glob.glob('%s_example*' % example_path)
                         if not path.endswith(json_compare_extension)]
                self.assertTrue(
                    paths,
                    "No example file found for: %s! \n\nThe test looked in: "
                    "%s" % (
                        module_string.rsplit('.', 1)[1],
                        os.path.join(os.getcwd(), example_path),
                    ))
                num_example_files += len(paths)
                t1 = time.time()
                num_tests = len(paths)
                for path in paths:
                    # This loop allows multiple example files per module
                    if path.endswith('~'):
                        # Text editor backup: Not interesting.
                        continue
                    site = mod.Site(cnt=cnt)
                    site.url = path
                    # Forces a local GET
                    site.enable_test_mode()
                    site.parse()
                    # Now validate that the parsed result is as we expect
                    json_path = '%s%s' % (path.rsplit('.', 1)[0], json_compare_extension)
                    json_data = json.loads(site.to_json())
                    if os.path.isfile(json_path):
                        # Compare result with corresponding json file
                        example_file = path.rsplit('/', 1)[1]
                        compare_file = json_path.rsplit('/', 1)[1]
                        with open(json_path, 'r') as input_file:
                            fixture_json = json.load(input_file)
                            self.assertEqual(
                                len(fixture_json),
                                len(json_data),
                                msg="Fixture and scraped data have different "
                                    "lengths: expected %s and scraped %s (%s)" % (
                                    len(fixture_json),
                                    len(json_data),
                                    module_string
                                )
                            )
                            for i, item in enumerate(fixture_json):
                                self.assertEqual(
                                    fixture_json[i],
                                    json_data[i],
                                )

                    else:
                        # Generate corresponding json file if it doesn't
                        # already exist. This should only happen once
                        # when adding a new example html file.
                        warn_generated_compare_file(json_path)
                        json_compare_files_generated.append(json_path)
                        with open(json_path, 'w') as json_example:
                            json.dump(json_data, json_example, indent=2)
                t2 = time.time()
                duration = t2 - t1
                warning_msg = warn_or_crash_slow_parser(duration)
                if warning_msg:
                    num_warnings += 1

                print('(%s test(s) in %0.1f seconds)' %
                      (num_tests, duration))

        print("\n{num_scrapers} scrapers tested successfully against "
              "{num_example_files} example files, with {num_warnings} "
              "speed warnings.".format(
                  num_scrapers=num_scrapers,
                  num_example_files=num_example_files,
                  num_warnings=num_warnings,))
        if json_compare_files_generated:
            msg = 'Generated compare file(s) during test, please review before proceeding. ' \
                  'If the data looks good, run tests again, then be sure to include ' \
                  'the new compare file(s) in your commit: %s'
            self.fail(msg % ', '.join(json_compare_files_generated))
        if num_warnings:
            print("\nAt least one speed warning was triggered during the "
                   "tests. If this is due to a slow scraper you wrote, we "
                   "suggest attempting to speed it up, as it will be slow "
                   "both in production and while running tests. This is "
                   "currently a warning, but may raise a failure in the "
                   "future as performance requirements are tightened.")
        else:
            # Someday, this line of code will be run. That day is not today.
            print("\nNo speed warnings detected. That's great, keep up the " \
                  "good work!")
Example #9
    def run_parsers_on_path(self, path_root,
                            required_fields=[
                                'date_filed', 'case_name', 'docket_number']):
        """Test all the parsers, faking the network query."""
        paths = []
        for root, dirnames, filenames in os.walk(path_root):
            for filename in fnmatch.filter(filenames, '*.html'):
                paths.append(os.path.join(root, filename))
        paths.sort()
        path_max_len = max(len(path) for path in paths) + 2
        for i, path in enumerate(paths):

            sys.stdout.write("%s. Doing %s" % (i, path.ljust(path_max_len)))
            t1 = time.time()
            dirname, filename = os.path.split(path)
            filename_sans_ext = filename.split('.')[0]
            json_path = os.path.join(dirname, '%s.json' % filename_sans_ext)
            court = filename_sans_ext.split('_')[0]

            report = DocketReport(court)
            # Open in binary mode so the explicit decode works under Python 3.
            with open(path, 'rb') as f:
                report._parse_text(f.read().decode('utf-8'))
            data = report.data

            if data != {}:
                # If the docket is a valid docket, make sure some required
                # fields are populated.
                for field in required_fields:
                    self.assertTrue(
                        data[field],
                        msg="Unable to find truthy value for field %s" % field,
                    )

                self.assertEqual(data['court_id'], court)

                # Party-specific tests...
                for party in data['parties']:
                    self.assertTrue(
                        party.get('name', False),
                        msg="Every party must have a name attribute. Did not "
                            "get a value for:\n\n%s" % party
                    )
                    # Protect against effed up adversary proceedings cases that
                    # don't parse properly. See: cacb, 2:08-ap-01570-BB
                    self.assertNotIn('----', party['name'])

            if not os.path.isfile(json_path):
                bar = "*" * 50
                print("\n\n%s\nJSON FILE DID NOT EXIST. CREATING IT AT:"
                      "\n\n  %s\n\n"
                      "Please test the data in this file before assuming "
                      "everything worked.\n%s\n" % (bar, json_path, bar))
                with open(json_path, 'w') as f:
                    json.dump(data, f, indent=2, sort_keys=True)
                    #self.assertFalse(True)
                    continue

            with open(json_path) as f:
                j = json.load(f)
                if j != {}:
                    # Compare docket entries and parties first, for easier
                    # debugging, then compare whole objects to be sure.
                    self.assertEqual(j['docket_entries'], data['docket_entries'])
                    self.assertEqual(j['parties'], data['parties'])
                self.assertEqual(j, data)
            t2 = time.time()

            duration = t2 - t1
            warn_or_crash_slow_parser(duration, max_duration=1)
            sys.stdout.write("✓ - %0.1fs\n" % (t2-t1))