def test_ingest_opinions_from_scraper(self) -> None:
    """Can we successfully ingest opinions at a high level?"""
    site = test_opinion_scraper.Site()
    site.method = "LOCAL"
    parsed_site = site.parse()
    cl_scrape_opinions.Command().scrape_court(
        parsed_site, full_crawl=True, ocr_available=False
    )

    opinions = Opinion.objects.all()
    count = opinions.count()
    self.assertTrue(
        count == 6,
        "Should have 6 test opinions, not %s" % count,
    )

def test_abort_on_unchanged_court_website(self):
    """Similar to the above, but we create a UrlHash object before
    checking if it exists.
    """
    site = test_opinion_scraper.Site()
    site.hash = "this is a dummy hash code string"

    for dup_checker in self.dup_checkers:
        UrlHash(id=site.url, sha1=site.hash).save()
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if dup_checker.full_crawl:
            self.assertFalse(
                abort, "DupChecker says to abort during a full crawl."
            )
        else:
            self.assertTrue(
                abort,
                "DupChecker says not to abort on a court that's been "
                "crawled before with the same hash",
            )

        dup_checker.url_hash.delete()

def test_abort_when_new_court_website(self):
    """Tests what happens when a new website is discovered."""
    site = test_opinion_scraper.Site()
    site.hash = "this is a dummy hash code string"

    for dup_checker in self.dup_checkers:
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if dup_checker.full_crawl:
            self.assertFalse(
                abort, "DupChecker says to abort during a full crawl."
            )
        else:
            self.assertFalse(
                abort,
                "DupChecker says to abort on a court that's never been "
                "crawled before.",
            )

        # The checking function creates UrlHash objects, which we must
        # delete as part of cleanup.
        dup_checker.url_hash.delete()

def test_press_on_with_an_empty_database(self):
    """Does the DupChecker press on when the database is empty?"""
    site = test_opinion_scraper.Site()
    site.hash = "this is a dummy hash code string"

    for dup_checker in self.dup_checkers:
        onwards = dup_checker.press_on(
            Opinion,
            now(),
            now() - timedelta(days=1),
            lookup_value="content",
            lookup_by="sha1",
        )
        if dup_checker.full_crawl:
            self.assertTrue(
                onwards,
                "DupChecker says to abort during a full crawl. This should "
                "never happen.",
            )
        elif dup_checker.full_crawl is False:
            count = Opinion.objects.all().count()
            self.assertTrue(
                onwards,
                "DupChecker says to abort on dups when the database has %s "
                "Documents." % count,
            )

def test_abort_on_changed_court_website(self):
    """Similar to the above, but we create a UrlHash object, then check
    it against a *different* hash.
    """
    site = test_opinion_scraper.Site()
    site.hash = "this is a dummy hash code string"

    for dup_checker in self.dup_checkers:
        UrlHash(pk=site.url, sha1=site.hash).save()
        abort = dup_checker.abort_by_url_hash(
            site.url, "this is a *different* hash!"
        )
        if dup_checker.full_crawl:
            self.assertFalse(
                abort, "DupChecker says to abort during a full crawl."
            )
        else:
            self.assertFalse(
                abort,
                "DupChecker says to abort on a court where the hash has "
                "changed.",
            )

        dup_checker.url_hash.delete()

def test_parsing_xml_opinion_site_to_site_object(self):
    """Does a basic parse of a site reveal the right number of items?"""
    site = test_opinion_scraper.Site().parse()
    self.assertEqual(len(site.case_names), 6)