def associate_meta_data_to_objects(self, site, i, court, sha1_hash):
    """Associate the scraped metadata for item *i* with model objects.

    Returns a (cite, docket, doc) tuple of unsaved model instances.
    """
    cite = Citation(case_name=site.case_names[i])
    # Optional citation fields: only set when the scraper provided a
    # non-empty list for them.
    optional_cite_fields = (
        ('docket_number', site.docket_numbers),
        ('neutral_cite', site.neutral_citations),
        ('federal_cite_one', site.west_citations),
        ('west_state_cite', site.west_state_citations),
    )
    for attr, values in optional_cite_fields:
        if values:
            setattr(cite, attr, values[i])

    docket = Docket(
        case_name=site.case_names[i],
        court=court,
    )

    doc = Document(
        source='C',
        sha1=sha1_hash,
        date_filed=site.case_dates[i],
        download_url=site.download_urls[i],
        precedential_status=site.precedential_statuses[i],
    )
    if site.judges:
        doc.judges = site.judges[i]
    if site.nature_of_suit:
        doc.nature_of_suit = site.nature_of_suit[i]

    return cite, docket, doc
def test_save_old_opinion(self):
    """Can we save opinions older than 1900?"""
    court = Court.objects.get(pk='test')
    cite = Citation(case_name=u"Blah")
    cite.save(index=True)
    docket = Docket(case_name=u"Blah", court=court)
    docket.save()
    doc = Document(
        citation=cite,
        docket=docket,
        date_filed=datetime.date(1899, 1, 1),
    )
    try:
        content_file = ContentFile(StringIO.StringIO('blah').read())
        doc.local_path.save('file_name.pdf', content_file, save=False)
        doc.save(index=True)
    except ValueError:
        # Dates before 1900 historically broke strftime-based code paths.
        raise ValueError("Unable to save a case older than 1900. Did you "
                         "try to use `strftime`...again?")
def setUp(self):
    # Handy variables used throughout the tests.
    self.court = Court.objects.get(pk='test')
    self.client = Client()

    # Add a single scraped document to the index.
    site = test_opinion_scraper.Site().parse()
    cite = Citation(
        docket_number=site.docket_numbers[0],
        neutral_cite=site.neutral_citations[0],
        federal_cite_one=site.west_citations[0],
    )
    cite.save(index=False)
    docket = Docket(case_name=site.case_names[0], court=self.court)
    docket.save()
    self.doc = Document(
        date_filed=site.case_dates[0],
        citation=cite,
        docket=docket,
        precedential_status=site.precedential_statuses[0],
    )
    self.doc.save(index=False)
def test_solr_ingestion_and_deletion(self):
    """Do items get added to the Solr index when they are ingested?"""
    site = test_opinion_scraper.Site().parse()
    # The first download URL points at a simple PDF fixture on disk.
    path = os.path.join(settings.INSTALL_ROOT, 'alert',
                        site.download_urls[0])
    with open(path) as f:
        content = f.read()
    cf = ContentFile(content)
    extension = get_extension(content)

    # Build the minimal object graph: citation + docket -> document.
    cite = Citation()
    cite.save(index=False)
    docket = Docket(court=self.court, case_name=site.case_names[0])
    docket.save()
    doc = Document(
        date_filed=site.case_dates[0],
        docket=docket,
        citation=cite,
    )
    file_name = trunc(site.case_names[0].lower(), 75) + extension
    doc.local_path.save(file_name, cf, save=False)
    doc.save(index=False)

    # Content extraction is what pushes the item into Solr.
    extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
    response = self.si.raw_query(
        q='supreme',
        caller='scraper_test',
    ).execute()
    count = response.result.numFound
    self.assertEqual(
        count,
        1,
        "There were %s items found when there should have been 1" % count)
def setUp(self):
    self.court = Court.objects.get(pk='test')

    # Create three documents, each with its own citation and docket.
    docs = []
    for name in (u"c1", u"c2", u"c3"):
        cite = Citation(case_name=name)
        cite.save(index=False)
        docket = Docket(case_name=name, court=self.court)
        docket.save()
        doc = Document(date_filed=date.today())
        doc.citation = cite
        doc.docket = docket
        doc.citation.save(index=False)
        doc.save(index=False)
        docs.append(doc)
    d1, d2, d3 = docs

    # Simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1.
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1

    for doc in (d1, d2, d3):
        doc.save(index=False)
def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the meta
    data.

    Return a document object for saving in the database
    """
    # NOTE(review): the file handle is never closed — presumably acceptable
    # for a one-shot import script, but a `with` block would be safer.
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = get_html_from_raw_text(
        raw_text)

    # Fingerprint the cleaned HTML for later duplicate detection.
    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        # we clear this field later, putting the value into html_lawbox.
        html= clean_html_str,
        date_filed=get_date_filed(clean_html_tree, citations=citations,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )
    cite = Citation(docket_number=get_docket_number(
        clean_html_tree, case_path=case_path, court=court))
    docket = Docket(
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    # Wire up the relations; nothing is saved here — the caller persists.
    doc.citation = cite
    doc.docket = docket

    return doc
def setUp(self):
    cite = Citation(case_name=u"foo")
    cite.save(index=False)
    docket = Docket(
        case_name=u'foo',
        court=Court.objects.get(pk='test'),
    )
    docket.save()
    # The document must be more than a year old for all tests to be
    # runnable.
    filed = now().date() - timedelta(days=400)
    self.doc = Document(citation=cite, docket=docket, date_filed=filed)
    self.doc.save(index=False)

    # Scrape the audio "site" and ingest its contents.
    OralArgumentCommand().scrape_court(
        test_oral_arg_scraper.Site().parse(), full_crawl=True)
def test_updating_the_docket_when_the_citation_case_name_changes(self):
    """Makes sure that the docket changes when the citation does."""
    court = Court.objects.get(pk='test')
    old_name = u'original case name'
    new_name = u'new case name'

    cite = Citation(case_name=old_name)
    cite.save(index=False)
    docket = Docket(case_name=old_name, court=court)
    docket.save()
    doc = Document(citation=cite, docket=docket)
    doc.save(index=False)

    # Renaming the citation should cascade to its docket.
    cite.case_name = new_name
    cite.save(index=False)

    refreshed_docket = Docket.objects.get(pk=docket.pk)
    self.assertEqual(refreshed_docket.case_name, new_name)
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    site = test_opinion_scraper.Site().parse()
    # One expected substring per scraped fixture, in site order.
    test_strings = [
        'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
        'fidelity'
    ]
    for i in range(len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
        cf = ContentFile(content)
        extension = get_extension(content)

        cite = Citation()
        cite.save(index=False)
        docket = Docket(case_name=site.case_names[i], court=self.court)
        docket.save()
        doc = Document(
            date_filed=site.case_dates[i],
            citation=cite,
            docket=docket,
        )
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)

        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        # HTML and WordPerfect extractions land in .html; everything else
        # (PDF/OCR, etc.) lands in .plain_text.
        extracted = (doc.html if extension in ['.html', '.wpd']
                     else doc.plain_text)
        self.assertIn(test_strings[i], extracted.lower())
        doc.delete()
def setUp(self):
    """Spin up throwaway Solr cores and load opinion and audio fixtures."""
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Set up testing cores in Solr and swap them in.  The core names carry
    # a timestamp so parallel test runs cannot collide.
    self.core_name_opinion = '%s.opinion-test-%s' % \
                             (self.__module__, time.time())
    self.core_name_audio = '%s.audio-test-%s' % \
                           (self.__module__, time.time())
    create_solr_core(self.core_name_opinion)
    create_solr_core(
        self.core_name_audio,
        schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                            'audio_schema.xml'),
        instance_dir='/usr/local/solr/example/solr/audio',
    )
    swap_solr_core('collection1', self.core_name_opinion)
    swap_solr_core('audio', self.core_name_audio)
    self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                             mode='rw')
    self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                           mode='rw')

    # Add three documents and three audio files to the index, but don't
    # extract their contents
    self.site_opinion = test_opinion_scraper.Site().parse()
    self.site_audio = test_oral_arg_scraper.Site().parse()
    cite_counts = (4, 6, 8)
    self.docs = {}
    for i in range(0, 3):
        cite = Citation(
            case_name=self.site_opinion.case_names[i],
            docket_number=self.site_opinion.docket_numbers[i],
            neutral_cite=self.site_opinion.neutral_citations[i],
            federal_cite_one=self.site_opinion.west_citations[i],
        )
        cite.save(index=False)
        docket = Docket(
            case_name=self.site_opinion.case_names[i],
            court=self.court,
        )
        docket.save()
        self.docs[i] = Document(
            date_filed=self.site_opinion.case_dates[i],
            citation=cite,
            docket=docket,
            precedential_status=self.site_opinion.precedential_statuses[i],
            citation_count=cite_counts[i],
            nature_of_suit=self.site_opinion.nature_of_suit[i],
            judges=self.site_opinion.judges[i],
        )
        # Indexed save (no index=False): these should reach Solr.
        self.docs[i].save()

    # Create citations between the documents
    # 0 ---cites--> 1, 2
    # 1 ---cites--> 2
    # 2 ---cites--> 0
    self.docs[0].cases_cited.add(self.docs[1].citation)
    self.docs[0].cases_cited.add(self.docs[2].citation)
    self.docs[1].cases_cited.add(self.docs[2].citation)
    self.docs[2].cases_cited.add(self.docs[0].citation)
    # Re-save so the citation relations make it into the index too.
    for doc in self.docs.itervalues():
        doc.save()

    # Scrape the audio "site" and add its contents
    site = test_oral_arg_scraper.Site().parse()
    Command().scrape_court(site, full_crawl=True)

    self.expected_num_results_opinion = 3
    self.expected_num_results_audio = 2
    self.si_opinion.commit()
    self.si_audio.commit()
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        # Return (lxml tree, cleaned body text) for a local path or a URL.
        if location.startswith('/'):
            with open(location) as f:
                # Fake out a requests response so both branches expose a
                # .content attribute.
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    # The volume index lives alongside the case file.
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        # Anonymization found sensitive content; block both the document
        # and its docket from crawlers as of today.
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    # Persist, then re-attach so the FK ids are set before doc.save().
    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc
def test_pagerank_calculation(self):
    """Create a few Documents and fake citation relation among them, then
    run the pagerank algorithm. Check whether this simple case can get the
    correct result.
    """
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Create 3 documents with their citations and dockets.
    c1, c2, c3 = Citation(case_name=u"c1"), Citation(
        case_name=u"c2"), Citation(case_name=u"c3")
    c1.save(index=False)
    c2.save(index=False)
    c3.save(index=False)
    docket1 = Docket(
        case_name=u"c1",
        court=self.court,
    )
    docket2 = Docket(
        case_name=u"c2",
        court=self.court,
    )
    docket3 = Docket(
        case_name=u"c3",
        court=self.court,
    )
    # BUG FIX: the dockets were never saved before being attached to the
    # documents (the otherwise-identical setUp elsewhere in this file does
    # save them), leaving the documents' docket FKs unset.
    docket1.save()
    docket2.save()
    docket3.save()

    d1, d2, d3 = Document(date_filed=date.today()), Document(
        date_filed=date.today()), Document(date_filed=date.today())
    d1.citation, d2.citation, d3.citation = c1, c2, c3
    d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
    doc_list = [d1, d2, d3]
    for d in doc_list:
        d.citation.save(index=False)
        d.save(index=False)

    # Create simple citing relation: 1 cites 2 and 3; 2 cites 3;
    # 3 cites 1.
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d1.save(index=False)
    d2.save(index=False)
    d3.save(index=False)

    # Calculate the pagerank of these 3 documents.
    comm = Command()
    self.verbosity = 1
    comm.do_pagerank(chown=False)

    # Read in the pagerank file, converting to a dict.
    pr_values_from_file = {}
    with open(get_data_dir_location() + "external_pagerank") as f:
        for line in f:
            pk, value = line.split('=')
            pr_values_from_file[pk] = float(value.strip())

    # Verify whether the answer is correct, based on calculations in Gephi.
    answers = {
        '1': 0.387790,
        '2': 0.214811,
        '3': 0.397400,
    }
    for key, value in answers.iteritems():
        # BUG FIX: the original asserted ``abs(x) - value < 0.0001``,
        # which also passes whenever x is merely *smaller* than the
        # expected value. The tolerance must wrap the whole difference.
        self.assertTrue(
            abs(pr_values_from_file[key] - value) < 0.0001,
            msg="The answer for item %s was %s when it should have been "
                "%s" % (key, pr_values_from_file[key], answers[key],))