def test_save_old_opinion(self):
    """Can we save opinions older than 1900?"""
    court = Court.objects.get(pk='test')

    cite = Citation(case_name=u"Blah")
    cite.save(index=True)
    docket = Docket(
        case_name=u"Blah",
        court=court,
    )
    docket.save()
    d = Document(
        citation=cite,
        docket=docket,
        date_filed=datetime.date(1899, 1, 1),
    )

    try:
        cf = ContentFile(StringIO.StringIO('blah').read())
        d.local_path.save('file_name.pdf', cf, save=False)
        d.save(index=True)
    except ValueError:
        raise ValueError("Unable to save a case older than 1900. Did you "
                         "try to use `strftime`...again?")
def test_solr_ingestion_and_deletion(self):
    """Do items get added to the Solr index when they are ingested?"""
    site = test_opinion_scraper.Site().parse()
    path = os.path.join(settings.INSTALL_ROOT, 'alert',
                        site.download_urls[0])  # a simple PDF
    with open(path) as f:
        content = f.read()
        cf = ContentFile(content)
        extension = get_extension(content)
    cite = Citation()
    cite.save(index=False)
    docket = Docket(
        court=self.court,
        case_name=site.case_names[0],
    )
    docket.save()
    doc = Document(
        date_filed=site.case_dates[0],
        docket=docket,
        citation=cite,
    )
    file_name = trunc(site.case_names[0].lower(), 75) + extension
    doc.local_path.save(file_name, cf, save=False)
    doc.save(index=False)
    extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
    response = self.si.raw_query(
        **{'q': 'supreme', 'caller': 'scraper_test'}).execute()
    count = response.result.numFound
    self.assertEqual(
        count,
        1,
        "There were %s items found when there should have been 1" % count)
class BulkDataTest(TestCase):
    fixtures = ['test_court.json']
    tmp_data_dir = '/tmp/bulk-dir/'

    def setUp(self):
        c1 = Citation(case_name=u"foo")
        c1.save(index=False)
        docket = Docket(
            case_name=u'foo',
            court=Court.objects.get(pk='test'),
        )
        docket.save()
        # Must be more than a year old for all tests to be runnable.
        last_month = now().date() - timedelta(days=400)
        self.doc = Document(citation=c1, docket=docket,
                            date_filed=last_month)
        self.doc.save(index=False)

        # Scrape the audio "site" and add its contents
        site = test_oral_arg_scraper.Site().parse()
        OralArgumentCommand().scrape_court(site, full_crawl=True)

    def tearDown(self):
        Document.objects.all().delete()
        Docket.objects.all().delete()
        Citation.objects.all().delete()
        shutil.rmtree(self.tmp_data_dir)

    @override_settings(BULK_DATA_DIR=tmp_data_dir)
    def test_make_all_bulk_files(self):
        """Can we successfully generate all bulk files?"""
        Command().execute()
def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(self):
    content = "this is dummy content that we hash"
    content_hash = hashlib.sha1(content).hexdigest()
    for dup_checker in self.dup_checkers:
        docket = Docket(court=self.court)
        docket.save()
        doc = Document(sha1=content_hash, docket=docket)
        doc.save(index=False)

        # Note that the next case occurs prior to the current one
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Document,
            now(),
            now() - timedelta(days=1),
            lookup_value=content_hash,
            lookup_by='sha1'
        )
        if dup_checker.full_crawl:
            self.assertEqual(
                onwards,
                'CONTINUE',
                'DupChecker says to %s during a full crawl.' % onwards)
        else:
            self.assertEqual(
                onwards,
                'BREAK',
                "DupChecker says to %s but there should be a duplicate in "
                "the database. dup_count is %s, and dup_threshold is %s" %
                (onwards, dup_checker.dup_count, dup_checker.dup_threshold)
            )
        doc.delete()
def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
    # Set the dup_threshold to zero for this test
    self.dup_checkers = [
        DupChecker(self.court, full_crawl=True, dup_threshold=0),
        DupChecker(self.court, full_crawl=False, dup_threshold=0),
    ]
    content = "this is dummy content that we hash"
    content_hash = hashlib.sha1(content).hexdigest()
    for dup_checker in self.dup_checkers:
        # Create a document, then use the dup_checker to see if it exists.
        docket = Docket(court=self.court)
        docket.save()
        doc = Document(sha1=content_hash, docket=docket)
        doc.save(index=False)
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Document,
            now(),
            now(),
            lookup_value=content_hash,
            lookup_by="sha1"
        )
        if dup_checker.full_crawl:
            self.assertEqual(
                onwards,
                "CONTINUE",
                "DupChecker says to %s during a full crawl." % onwards)
        else:
            self.assertEqual(
                onwards,
                "BREAK",
                "DupChecker says to %s but there should be a duplicate in "
                "the database. dup_count is %s, and dup_threshold is %s" %
                (onwards, dup_checker.dup_count, dup_checker.dup_threshold),
            )
        doc.delete()
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    site = test_scraper.Site().parse()
    test_strings = ['supreme', 'intelligence', 'indiana', 'reagan',
                    'indiana', 'fidelity']
    for i in range(0, len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation(case_name=site.case_names[i])
        cite.save(index=False)
        doc = Document(date_filed=site.case_dates[i],
                       court=self.court,
                       citation=cite)
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()
def associate_meta_data_to_objects(self, site, i, court, sha1_hash):
    """Takes the meta data from the scraper and associates it with objects.

    Returns the created objects.
    """
    cite = Citation(case_name=site.case_names[i])
    if site.docket_numbers:
        cite.docket_number = site.docket_numbers[i]
    if site.neutral_citations:
        cite.neutral_cite = site.neutral_citations[i]
    if site.west_citations:
        cite.federal_cite_one = site.west_citations[i]
    if site.west_state_citations:
        cite.west_state_cite = site.west_state_citations[i]

    docket = Docket(
        case_name=site.case_names[i],
        court=court,
    )

    doc = Document(source='C',
                   sha1=sha1_hash,
                   date_filed=site.case_dates[i],
                   download_url=site.download_urls[i],
                   precedential_status=site.precedential_statuses[i])
    if site.judges:
        doc.judges = site.judges[i]
    if site.nature_of_suit:
        doc.nature_of_suit = site.nature_of_suit[i]

    return cite, docket, doc
class ViewDocumentTest(TestCase):
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_scraper.Site().parse()
        cite = Citation(case_name=site.case_names[0],
                        docket_number=site.docket_numbers[0],
                        neutral_cite=site.neutral_citations[0],
                        federal_cite_one=site.west_citations[0])
        cite.save(index=False)
        self.doc = Document(date_filed=site.case_dates[0],
                            court=self.court,
                            citation=cite,
                            precedential_status=site.precedential_statuses[0])
        self.doc.save(index=False)

    def tearDown(self):
        self.doc.delete()

    def test_simple_url_check_for_document(self):
        """Does the page load properly?"""
        response = self.client.get('/test/2/asdf/')
        self.assertEqual(response.status_code, 200)
        self.assertIn('Tarrant', response.content)
def test_pagerank_calculation(self):
    """Create a few Documents and fake citation relations among them, then
    run the pagerank algorithm. Check whether this simple case can get the
    correct result.
    """
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Create 3 documents with their citations
    c1, c2, c3 = Citation(case_name=u"c1"), Citation(case_name=u"c2"), \
        Citation(case_name=u"c3")
    c1.save(index=False)
    c2.save(index=False)
    c3.save(index=False)
    d1, d2, d3 = Document(date_filed=date.today()), \
        Document(date_filed=date.today()), Document(date_filed=date.today())
    d1.citation, d2.citation, d3.citation = c1, c2, c3
    doc_list = [d1, d2, d3]
    for d in doc_list:
        d.court = self.court
        d.citation.save(index=False)
        d.save(index=False)

    # Create a simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d1.save(index=False)
    d2.save(index=False)
    d3.save(index=False)

    # Calculate the pagerank of these 3 documents
    comm = Command()
    self.verbosity = 1
    comm.do_pagerank(chown=False)

    # Read in the pagerank file, converting it to a dict
    pr_values_from_file = {}
    with open(get_data_dir_location() + "external_pagerank") as f:
        for line in f:
            pk, value = line.split('=')
            pr_values_from_file[pk] = float(value.strip())

    # Verify whether the answer is correct, based on calculations in Gephi
    answers = {
        '1': 0.387790,
        '2': 0.214811,
        '3': 0.397400,
    }
    for key, value in answers.iteritems():
        self.assertTrue(
            abs(pr_values_from_file[key] - value) < 0.0001,
            msg="The answer for item %s was %s when it should have been %s" %
                (key, pr_values_from_file[key], answers[key])
        )
def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the meta
    data.

    Return a document object for saving in the database
    """
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = \
        get_html_from_raw_text(raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        # We clear this field later, putting the value into html_lawbox.
        html=clean_html_str,
        date_filed=get_date_filed(clean_html_tree, citations=citations,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )
    cite = Citation(docket_number=get_docket_number(
        clean_html_tree, case_path=case_path, court=court))
    docket = Docket(
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket

    return doc
def import_mayer(case_path):
    """Open the file, get its contents, convert to XML and extract the meta
    data.

    Return a document object for saving in the database
    """
    # These lines define the trees and cleaned HTML used throughout the rest
    # of the function; without them the references to clean_html_tree,
    # complete_html_tree and clean_html_str below fail with a NameError.
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = \
        get_html_from_raw_text(raw_text)
    tree = html.parse(case_path)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        # We clear this field later, putting the value into html_lawbox.
        html=clean_html_str,
        date_filed=get_date_filed(clean_html_tree, citations=citations,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )
    cite = Citation()
    docket = Docket(
        docket_number=get_docket_number(
            clean_html_tree,
            case_path=case_path,
            court=court
        ),
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket

    return doc
def setUp(self):
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')
    self.client = Client()

    # Add a document to the index
    site = test_opinion_scraper.Site().parse()
    cite = Citation(
        neutral_cite=site.neutral_citations[0],
        federal_cite_one=site.west_citations[0]
    )
    cite.save(index=False)
    docket = Docket(
        docket_number=site.docket_numbers[0],
        court=self.court,
        case_name=site.case_names[0],
    )
    docket.save()
    self.doc = Document(
        date_filed=site.case_dates[0],
        citation=cite,
        docket=docket,
        precedential_status=site.precedential_statuses[0],
    )
    self.doc.save(index=False)
def setUp(self):
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')
    self.client = Client()

    # Set up a testing core in Solr and swap it in
    self.core_name = '%s.test-%s' % (self.__module__, time.time())
    create_solr_core(self.core_name)
    swap_solr_core('collection1', self.core_name)
    self.si = sunburnt.SolrInterface(settings.SOLR_URL, mode='rw')

    # Add two documents to the index, but don't extract their contents
    self.site = test_scraper.Site().parse()
    cite_counts = (4, 6)
    for i in range(0, 2):
        cite = Citation(case_name=self.site.case_names[i],
                        docket_number=self.site.docket_numbers[i],
                        neutral_cite=self.site.neutral_citations[i],
                        federal_cite_one=self.site.west_citations[i])
        cite.save(index=False)
        self.doc = Document(date_filed=self.site.case_dates[i],
                            court=self.court,
                            citation=cite,
                            precedential_status=self.site.precedential_statuses[i],
                            citation_count=cite_counts[i],
                            nature_of_suit=self.site.nature_of_suit[i],
                            judges=self.site.judges[i])
        self.doc.save()

    self.expected_num_results = 2
def setUp(self): c1 = Citation(case_name=u"foo") c1.save(index=False) docket = Docket( case_name=u'foo', court=Court.objects.get(pk='test'), ) docket.save() # Must be more than a year old for all tests to be runnable. last_month = now().date() - timedelta(days=400) self.doc = Document(citation=c1, docket=docket, date_filed=last_month) self.doc.save(index=False) # Scrape the audio "site" and add its contents site = test_oral_arg_scraper.Site().parse() OralArgumentCommand().scrape_court(site, full_crawl=True)
def create_stub(citations):
    """Creates a stub document with the bare minimum of meta data."""
    cite = Citation()
    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)
    # TODO: We can use the court information in the citation here. Failure to
    #       do so will mean that our URLs will later change -- something we
    #       wish to avoid.
    stub_doc = Document(
        is_stub_document=True,
        sha1='!',
        court=None,
        citation=cite,
    )
    stub_doc.save(index=False)
    return stub_doc
def setUp(self):
    self.court = Court.objects.get(pk='test')

    # Create 3 documents with their citations and dockets
    c1, c2, c3 = Citation(case_name=u"c1"), Citation(case_name=u"c2"), \
        Citation(case_name=u"c3")
    c1.save(index=False)
    c2.save(index=False)
    c3.save(index=False)
    docket1 = Docket(
        case_name=u"c1",
        court=self.court,
    )
    docket2 = Docket(
        case_name=u"c2",
        court=self.court,
    )
    docket3 = Docket(
        case_name=u"c3",
        court=self.court,
    )
    docket1.save()
    docket2.save()
    docket3.save()
    d1, d2, d3 = Document(date_filed=date.today()), \
        Document(date_filed=date.today()), Document(date_filed=date.today())
    d1.citation, d2.citation, d3.citation = c1, c2, c3
    d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
    doc_list = [d1, d2, d3]
    for d in doc_list:
        d.citation.save(index=False)
        d.save(index=False)

    # Create a simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d1.save(index=False)
    d2.save(index=False)
    d3.save(index=False)
class ViewDocumentTest(TestCase):
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            docket_number=site.docket_numbers[0],
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)

    def tearDown(self):
        self.doc.delete()

    def test_simple_url_check_for_document(self):
        """Does the page load properly?"""
        response = self.client.get('/opinion/1/asdf/')
        self.assertEqual(response.status_code, 200)
        self.assertIn('Tarrant', response.content)
def setUp(self): c1 = Citation(case_name=u"foo") c1.save(index=False) docket = Docket( case_name=u'foo', court=Court.objects.get(pk='test'), ) docket.save() # Must be more than a year old for all tests to be runnable. last_month = now().date() - timedelta(days=400) self.doc = Document( citation=c1, docket=docket, date_filed=last_month ) self.doc.save(index=False) # Scrape the audio "site" and add its contents site = test_oral_arg_scraper.Site().parse() OralArgumentCommand().scrape_court(site, full_crawl=True)
def test_updating_the_docket_when_the_citation_case_name_changes(self):
    """Makes sure that the docket changes when the citation does."""
    court = Court.objects.get(pk='test')
    original_case_name = u'original case name'
    new_case_name = u'new case name'

    cite = Citation(case_name=original_case_name)
    cite.save(index=False)
    docket = Docket(
        case_name=original_case_name,
        court=court,
    )
    docket.save()
    Document(
        citation=cite,
        docket=docket,
    ).save(index=False)

    cite.case_name = new_case_name
    cite.save(index=False)

    changed_docket = Docket.objects.get(pk=docket.pk)
    self.assertEqual(changed_docket.case_name, new_case_name)
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    site = test_opinion_scraper.Site().parse()
    test_strings = [
        'supreme',
        'intelligence',
        'indiana',
        'reagan',
        'indiana',
        'fidelity'
    ]
    for i in range(0, len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            case_name=site.case_names[i],
            court=self.court,
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[i],
            citation=cite,
            docket=docket,
        )
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()
def test_pagerank_calculation(self):
    """Create a few Documents and fake citation relations among them, then
    run the pagerank algorithm. Check whether this simple case can get the
    correct result.
    """
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Create 3 documents with their citations and dockets
    c1, c2, c3 = Citation(case_name=u"c1"), Citation(case_name=u"c2"), \
        Citation(case_name=u"c3")
    c1.save(index=False)
    c2.save(index=False)
    c3.save(index=False)
    docket1 = Docket(
        case_name=u"c1",
        court=self.court,
    )
    docket2 = Docket(
        case_name=u"c2",
        court=self.court,
    )
    docket3 = Docket(
        case_name=u"c3",
        court=self.court,
    )
    # The dockets must be saved before they can be attached to documents.
    docket1.save()
    docket2.save()
    docket3.save()
    d1, d2, d3 = Document(date_filed=date.today()), \
        Document(date_filed=date.today()), Document(date_filed=date.today())
    d1.citation, d2.citation, d3.citation = c1, c2, c3
    d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
    doc_list = [d1, d2, d3]
    for d in doc_list:
        d.citation.save(index=False)
        d.save(index=False)

    # Create a simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d1.save(index=False)
    d2.save(index=False)
    d3.save(index=False)

    # Calculate the pagerank of these 3 documents
    comm = Command()
    self.verbosity = 1
    comm.do_pagerank(chown=False)

    # Read in the pagerank file, converting it to a dict
    pr_values_from_file = {}
    with open(get_data_dir_location() + "external_pagerank") as f:
        for line in f:
            pk, value = line.split('=')
            pr_values_from_file[pk] = float(value.strip())

    # Verify whether the answer is correct, based on calculations in Gephi
    answers = {
        '1': 0.387790,
        '2': 0.214811,
        '3': 0.397400,
    }
    for key, value in answers.iteritems():
        self.assertTrue(
            abs(pr_values_from_file[key] - value) < 0.0001,
            msg="The answer for item %s was %s when it should have been "
                "%s" % (
                    key,
                    pr_values_from_file[key],
                    answers[key],
                ))
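# A hedged sanity check, not part of the project code: if networkx happens to
# be installed, the expected pagerank values in the test above can be
# reproduced directly. The graph mirrors the citing relation built in the
# test (1 cites 2 and 3; 2 cites 3; 3 cites 1), edges run from the citing
# document to the cited one, and the conventional damping factor of 0.85 is
# assumed. The helper name below is illustrative only.
def _pagerank_sanity_check():
    import networkx as nx

    graph = nx.DiGraph()
    graph.add_edges_from([(1, 2), (1, 3), (2, 3), (3, 1)])
    # Returns roughly {1: 0.387790, 2: 0.214811, 3: 0.397400}, matching the
    # Gephi-derived `answers` dict in the test.
    return nx.pagerank(graph, alpha=0.85)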
def scrape_court(site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(site.court_id, full_crawl=full_crawl)
    abort = dup_checker.abort_by_hash(site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site._get_cookies())
            clean_content = site._cleanup_content(r.content)
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data. Need to convert unicode to binary
            # before hashing.
            if type(clean_content) == unicode:
                hash_content = clean_content.encode('utf-8')
            else:
                hash_content = clean_content
            sha1_hash = hashlib.sha1(hash_content).hexdigest()
            if court_str == 'nev' and \
                    site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1 sums
                # every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                # Make a citation
                cite = Citation(case_name=site.case_names[i])
                if site.docket_numbers:
                    cite.docket_number = site.docket_numbers[i]
                if site.neutral_citations:
                    cite.neutral_cite = site.neutral_citations[i]
                if site.west_citations:
                    cite.federal_cite_one = site.west_citations[i]
                if site.west_state_citations:
                    cite.west_state_cite = site.west_state_citations[i]

                # Make the document object
                doc = Document(source='C',
                               sha1=sha1_hash,
                               date_filed=site.case_dates[i],
                               court=court,
                               download_url=site.download_urls[i],
                               precedential_status=site.precedential_statuses[i])

                # Make and associate the file object
                try:
                    cf = ContentFile(clean_content)
                    extension = get_extension(r.content)
                    # See issue #215 for why this must be lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + \
                        extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = 'Unable to save binary to disk. Deleted ' \
                          'document: %s.\n%s' % \
                          (cite.case_name, traceback.format_exc())
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                if site.judges:
                    doc.judges = site.judges[i]
                if site.nature_of_suit:
                    doc.nature_of_suit = site.nature_of_suit[i]

                # Save everything, but don't update Solr index yet
                cite.save(index=False)
                doc.citation = cite
                doc.save(index=False)

                # Extract the contents asynchronously.
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc %s: %s" %
                            (doc.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(vol_tree,
                                                 case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
def setUp(self):
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Set up testing cores in Solr and swap them in
    self.core_name_opinion = '%s.opinion-test-%s' % \
        (self.__module__, time.time())
    self.core_name_audio = '%s.audio-test-%s' % \
        (self.__module__, time.time())
    create_solr_core(self.core_name_opinion)
    create_solr_core(
        self.core_name_audio,
        schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                            'audio_schema.xml'),
        instance_dir='/usr/local/solr/example/solr/audio',
    )
    swap_solr_core('collection1', self.core_name_opinion)
    swap_solr_core('audio', self.core_name_audio)
    self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                             mode='rw')
    self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                           mode='rw')

    # Add three documents and three audio files to the index, but don't
    # extract their contents
    self.site_opinion = test_opinion_scraper.Site().parse()
    self.site_audio = test_oral_arg_scraper.Site().parse()
    cite_counts = (4, 6, 8)
    self.docs = {}
    for i in range(0, 3):
        cite = Citation(
            case_name=self.site_opinion.case_names[i],
            docket_number=self.site_opinion.docket_numbers[i],
            neutral_cite=self.site_opinion.neutral_citations[i],
            federal_cite_one=self.site_opinion.west_citations[i],
        )
        cite.save(index=False)
        docket = Docket(
            case_name=self.site_opinion.case_names[i],
            court=self.court,
        )
        docket.save()
        self.docs[i] = Document(
            date_filed=self.site_opinion.case_dates[i],
            citation=cite,
            docket=docket,
            precedential_status=self.site_opinion.precedential_statuses[i],
            citation_count=cite_counts[i],
            nature_of_suit=self.site_opinion.nature_of_suit[i],
            judges=self.site_opinion.judges[i],
        )
        self.docs[i].save()

    # Create citations between the documents
    # 0 ---cites--> 1, 2
    # 1 ---cites--> 2
    # 2 ---cites--> 0
    self.docs[0].cases_cited.add(self.docs[1].citation)
    self.docs[0].cases_cited.add(self.docs[2].citation)
    self.docs[1].cases_cited.add(self.docs[2].citation)
    self.docs[2].cases_cited.add(self.docs[0].citation)
    for doc in self.docs.itervalues():
        doc.save()

    # Scrape the audio "site" and add its contents
    site = test_oral_arg_scraper.Site().parse()
    Command().scrape_court(site, full_crawl=True)

    self.expected_num_results_opinion = 3
    self.expected_num_results_audio = 2
    self.si_opinion.commit()
    self.si_audio.commit()