def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(
        self):
    content = "this is dummy content that we hash"
    content_hash = hashlib.sha1(content).hexdigest()
    for dup_checker in self.dup_checkers:
        docket = Docket(court=self.court)
        docket.save()
        doc = Document(sha1=content_hash, docket=docket)
        doc.save(index=False)

        # Note that the next case occurs prior to the current one
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Document,
            now(),
            now() - timedelta(days=1),
            lookup_value=content_hash,
            lookup_by='sha1',
        )
        if dup_checker.full_crawl:
            self.assertEqual(
                onwards,
                'CONTINUE',
                'DupChecker says to %s during a full crawl.' % onwards)
        else:
            self.assertEqual(
                onwards,
                'BREAK',
                "DupChecker says to %s but there should be a duplicate in "
                "the database. dup_count is %s, and dup_threshold is %s" %
                (onwards, dup_checker.dup_count, dup_checker.dup_threshold))
        doc.delete()

def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
    # Set the dup_threshold to zero for this test
    self.dup_checkers = [
        DupChecker(self.court, full_crawl=True, dup_threshold=0),
        DupChecker(self.court, full_crawl=False, dup_threshold=0),
    ]
    content = "this is dummy content that we hash"
    content_hash = hashlib.sha1(content).hexdigest()
    for dup_checker in self.dup_checkers:
        # Create a document, then use the dup_checker to see if it exists.
        docket = Docket(court=self.court)
        docket.save()
        doc = Document(sha1=content_hash, docket=docket)
        doc.save(index=False)
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Document,
            now(),
            now(),
            lookup_value=content_hash,
            lookup_by="sha1",
        )
        if dup_checker.full_crawl:
            self.assertEqual(
                onwards,
                "CONTINUE",
                "DupChecker says to %s during a full crawl." % onwards)
        else:
            self.assertEqual(
                onwards,
                "BREAK",
                "DupChecker says to %s but there should be a duplicate in "
                "the database. dup_count is %s, and dup_threshold is %s" %
                (onwards, dup_checker.dup_count, dup_checker.dup_threshold),
            )
        doc.delete()

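# A minimal sketch, not the real DupChecker, of the decision rule the two
# tests above imply: a full crawl always continues, while an incremental
# crawl breaks once it has seen more duplicates than dup_threshold allows.
# The helper name and signature here are hypothetical, for illustration only.
def _dup_checker_decision(full_crawl, dup_found, dup_count, dup_threshold):
    if full_crawl:
        # A full crawl re-visits everything, so duplicates never stop it.
        return 'CONTINUE'
    if dup_found:
        dup_count += 1
        if dup_count > dup_threshold:
            # Enough duplicates seen: assume the rest is already in the DB.
            return 'BREAK'
        return 'CONTINUE'
    # Fresh content; keep going as normal.
    return 'CARRY_ON'
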
class SolrTestCase(TestCase):
    """A generic class that contains the setUp and tearDown functions for
    inheriting children.
    """
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Set up a testing core in Solr and swap it in
        self.core_name = '%s.test-%s' % (self.__module__, time.time())
        create_solr_core(self.core_name)
        swap_solr_core('collection1', self.core_name)
        self.si = sunburnt.SolrInterface(settings.SOLR_URL, mode='rw')

        # Add two documents to the index, but don't extract their contents
        self.site = test_scraper.Site().parse()
        cite_counts = (4, 6)
        for i in range(0, 2):
            cite = Citation(case_name=self.site.case_names[i],
                            docket_number=self.site.docket_numbers[i],
                            neutral_cite=self.site.neutral_citations[i],
                            federal_cite_one=self.site.west_citations[i])
            cite.save(index=False)
            self.doc = Document(
                date_filed=self.site.case_dates[i],
                court=self.court,
                citation=cite,
                precedential_status=self.site.precedential_statuses[i],
                citation_count=cite_counts[i],
                nature_of_suit=self.site.nature_of_suit[i],
                judges=self.site.judges[i])
            self.doc.save()
        self.expected_num_results = 2

    def tearDown(self):
        self.doc.delete()
        swap_solr_core(self.core_name, 'collection1')
        delete_solr_core(self.core_name)

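# A minimal sketch, with a hypothetical class name, of how a child class is
# intended to build on SolrTestCase: it simply inherits, and the setUp and
# tearDown above run around each of its tests, spinning a scratch Solr core
# up and down. The assertion below only checks what setUp itself guarantees.
class ExampleSolrTest(SolrTestCase):
    def test_setup_indexed_two_documents(self):
        self.assertEqual(Document.objects.count(),
                         self.expected_num_results)
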
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    site = test_opinion_scraper.Site().parse()
    test_strings = [
        'supreme',
        'intelligence',
        'indiana',
        'reagan',
        'indiana',
        'fidelity',
    ]
    for i in range(0, len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
        cf = ContentFile(content)
        extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            case_name=site.case_names[i],
            court=self.court,
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[i],
            citation=cite,
            docket=docket,
        )
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()

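# For orientation, a rough sketch of the kind of content sniffing that
# get_extension (used above) presumably performs: detect the mimetype of the
# raw bytes and map it to a file extension. The real implementation lives in
# the codebase; the python-magic dependency and the mapping table here are
# assumptions for illustration.
import magic

MIMETYPE_TO_EXTENSION = {
    'application/pdf': '.pdf',
    'application/msword': '.doc',
    'application/vnd.wordperfect': '.wpd',
    'text/html': '.html',
    'text/plain': '.txt',
}

def get_extension_sketch(content):
    mime = magic.from_buffer(content, mime=True)
    return MIMETYPE_TO_EXTENSION.get(mime, '.txt')
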
class ViewDocumentTest(TestCase):
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            docket_number=site.docket_numbers[0],
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0],
        )
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)

    def tearDown(self):
        self.doc.delete()

    def test_simple_url_check_for_document(self):
        """Does the page load properly?"""
        response = self.client.get('/opinion/1/asdf/')
        self.assertEqual(response.status_code, 200)
        self.assertIn('Tarrant', response.content)