def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(
        self):
    content = "this is dummy content that we hash"
    content_hash = hashlib.sha1(content).hexdigest()
    for dup_checker in self.dup_checkers:
        docket = Docket(court=self.court)
        docket.save()
        doc = Document(sha1=content_hash, docket=docket)
        doc.save(index=False)
        # Note that the next case occurs prior to the current one
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Document,
            now(),
            now() - timedelta(days=1),
            lookup_value=content_hash,
            lookup_by='sha1',
        )
        if dup_checker.full_crawl:
            self.assertEqual(
                onwards,
                'CONTINUE',
                'DupChecker says to %s during a full crawl.' % onwards)
        else:
            self.assertEqual(
                onwards,
                'BREAK',
                "DupChecker says to %s but there should be a duplicate in "
                "the database. dup_count is %s, and dup_threshold is %s" %
                (onwards, dup_checker.dup_count, dup_checker.dup_threshold))
        doc.delete()
def test_save_old_opinion(self):
    """Can we save opinions older than 1900?"""
    court = Court.objects.get(pk='test')
    cite = Citation(case_name=u"Blah")
    cite.save(index=True)
    docket = Docket(
        case_name=u"Blah",
        court=court,
    )
    docket.save()
    d = Document(
        citation=cite,
        docket=docket,
        date_filed=datetime.date(1899, 1, 1),
    )
    try:
        cf = ContentFile(StringIO.StringIO('blah').read())
        d.local_path.save('file_name.pdf', cf, save=False)
        d.save(index=True)
    except ValueError:
        raise ValueError("Unable to save a case older than 1900. Did you "
                         "try to use `strftime`...again?")
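# A minimal illustration of the pitfall the test above guards against: on
# Python 2, strftime() refuses dates before 1900, while isoformat() is safe
# for building things like file paths. (A hypothetical demo, not part of
# the test suite.)
import datetime

old = datetime.date(1899, 1, 1)
print old.isoformat()         # '1899-01-01' -- always works
try:
    old.strftime('%Y/%m/%d')  # raises ValueError on Python 2
except ValueError as e:
    print 'strftime failed: %s' % e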
def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
    # Set the dup_threshold to zero for this test
    self.dup_checkers = [
        DupChecker(self.court, full_crawl=True, dup_threshold=0),
        DupChecker(self.court, full_crawl=False, dup_threshold=0),
    ]
    content = "this is dummy content that we hash"
    content_hash = hashlib.sha1(content).hexdigest()
    for dup_checker in self.dup_checkers:
        # Create a document, then use the dup_checker to see if it exists.
        docket = Docket(court=self.court)
        docket.save()
        doc = Document(sha1=content_hash, docket=docket)
        doc.save(index=False)
        onwards = dup_checker.should_we_continue_break_or_carry_on(
            Document,
            now(),
            now(),
            lookup_value=content_hash,
            lookup_by='sha1',
        )
        if dup_checker.full_crawl:
            self.assertEqual(
                onwards,
                'CONTINUE',
                'DupChecker says to %s during a full crawl.' % onwards)
        else:
            self.assertEqual(
                onwards,
                'BREAK',
                "DupChecker says to %s but there should be a duplicate in "
                "the database. dup_count is %s, and dup_threshold is %s" %
                (onwards, dup_checker.dup_count, dup_checker.dup_threshold))
        doc.delete()
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    site = test_opinion_scraper.Site().parse()
    test_strings = [
        'supreme',
        'intelligence',
        'indiana',
        'reagan',
        'indiana',
        'fidelity',
    ]
    for i in range(0, len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
        cf = ContentFile(content)
        extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            case_name=site.case_names[i],
            court=self.court,
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[i],
            citation=cite,
            docket=docket,
        )
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()
def setUp(self):
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')
    self.client = Client()

    # Add a document to the index
    site = test_opinion_scraper.Site().parse()
    cite = Citation(
        neutral_cite=site.neutral_citations[0],
        federal_cite_one=site.west_citations[0],
    )
    cite.save(index=False)
    docket = Docket(
        docket_number=site.docket_numbers[0],
        court=self.court,
        case_name=site.case_names[0],
    )
    docket.save()
    self.doc = Document(
        date_filed=site.case_dates[0],
        citation=cite,
        docket=docket,
        precedential_status=site.precedential_statuses[0],
    )
    self.doc.save(index=False)
def test_solr_ingestion_and_deletion(self):
    """Do items get added to the Solr index when they are ingested?"""
    site = test_opinion_scraper.Site().parse()
    path = os.path.join(settings.INSTALL_ROOT, 'alert',
                        site.download_urls[0])  # a simple PDF
    with open(path) as f:
        content = f.read()
    cf = ContentFile(content)
    extension = get_extension(content)
    cite = Citation()
    cite.save(index=False)
    docket = Docket(
        court=self.court,
        case_name=site.case_names[0],
    )
    docket.save()
    doc = Document(
        date_filed=site.case_dates[0],
        docket=docket,
        citation=cite,
    )
    file_name = trunc(site.case_names[0].lower(), 75) + extension
    doc.local_path.save(file_name, cf, save=False)
    doc.save(index=False)
    extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
    response = self.si.raw_query(**{
        'q': 'supreme',
        'caller': 'scraper_test',
    }).execute()
    count = response.result.numFound
    self.assertEqual(
        count,
        1,
        "There were %s items found when there should have been 1" % count)
def associate_meta_data_to_objects(self, site, i, court, sha1_hash):
    """Takes the meta data from the scraper and associates it with objects.

    Returns the created objects.
    """
    cite = Citation(case_name=site.case_names[i])
    if site.neutral_citations:
        cite.neutral_cite = site.neutral_citations[i]
    if site.west_citations:
        cite.federal_cite_one = site.west_citations[i]
    if site.west_state_citations:
        cite.west_state_cite = site.west_state_citations[i]

    docket = Docket()
    if site.docket_numbers:
        docket.docket_number = site.docket_numbers[i]
    docket.case_name = site.case_names[i]
    docket.court = court

    doc = Document(
        source='C',
        sha1=sha1_hash,
        date_filed=site.case_dates[i],
        download_url=site.download_urls[i],
        precedential_status=site.precedential_statuses[i],
    )
    if site.judges:
        doc.judges = site.judges[i]
    if site.nature_of_suit:
        doc.nature_of_suit = site.nature_of_suit[i]
    return cite, docket, doc
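# A minimal sketch of how a caller (e.g. a scrape_court loop -- hypothetical
# here) might wire together and persist the three unsaved objects returned
# above. The `index=False` flags follow the convention used elsewhere in
# this module; the real scraper likely adds dup-checking around this.
cite, docket, doc = self.associate_meta_data_to_objects(
    site, i, court, sha1_hash)
cite.save(index=False)  # save the FK targets first...
docket.save()
doc.citation = cite     # ...then point the Document at them
doc.docket = docket
doc.save(index=False)   # defer indexing until content is extracted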
def setUp(self):
    c1 = Citation(case_name=u"foo")
    c1.save(index=False)
    docket = Docket(
        case_name=u'foo',
        court=Court.objects.get(pk='test'),
    )
    docket.save()
    # Must be more than a year old for all tests to be runnable.
    last_month = now().date() - timedelta(days=400)
    self.doc = Document(
        citation=c1,
        docket=docket,
        date_filed=last_month,
    )
    self.doc.save(index=False)

    # Scrape the audio "site" and add its contents
    site = test_oral_arg_scraper.Site().parse()
    OralArgumentCommand().scrape_court(site, full_crawl=True)
def test_updating_the_docket_when_the_citation_case_name_changes(self):
    """Makes sure that the docket changes when the citation does."""
    court = Court.objects.get(pk='test')
    original_case_name = u'original case name'
    new_case_name = u'new case name'
    cite = Citation(case_name=original_case_name)
    cite.save(index=False)
    docket = Docket(
        case_name=original_case_name,
        court=court,
    )
    docket.save()
    Document(
        citation=cite,
        docket=docket,
    ).save(index=False)

    cite.case_name = new_case_name
    cite.save(index=False)
    changed_docket = Docket.objects.get(pk=docket.pk)
    self.assertEqual(changed_docket.case_name, new_case_name)
def setUp(self):
    self.court = Court.objects.get(pk='test')

    # Create 3 documents with their citations and dockets
    c1, c2, c3 = (Citation(case_name=u"c1"),
                  Citation(case_name=u"c2"),
                  Citation(case_name=u"c3"))
    c1.save(index=False)
    c2.save(index=False)
    c3.save(index=False)
    docket1 = Docket(
        case_name=u"c1",
        court=self.court,
    )
    docket2 = Docket(
        case_name=u"c2",
        court=self.court,
    )
    docket3 = Docket(
        case_name=u"c3",
        court=self.court,
    )
    docket1.save()
    docket2.save()
    docket3.save()
    d1, d2, d3 = (Document(date_filed=date.today()),
                  Document(date_filed=date.today()),
                  Document(date_filed=date.today()))
    d1.citation, d2.citation, d3.citation = c1, c2, c3
    d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
    doc_list = [d1, d2, d3]
    for d in doc_list:
        d.citation.save(index=False)
        d.save(index=False)

    # Create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d1.save(index=False)
    d2.save(index=False)
    d3.save(index=False)
def test_citation_matching(self):
    """Creates a few documents that contain specific citations, then
    attempts to find and match those citations.

    This becomes a bit of an integration test, which is likely fine.
    """
    # Set up a document
    c1 = models.Citation(federal_cite_one=u'1 Yeates 1 (test 1795)')
    c1.save(index=False)
    docket1 = Docket(
        case_name=u"Lissner v. Saad",
        court=self.court,
    )
    docket1.save()
    d1 = models.Document(
        date_filed=date(1795, 6, 9),
        citation=c1,
        docket=docket1,
        precedential_status='Published',
    )
    d1.save(index=True)

    # Reference d1 from the text of another document
    c2 = models.Citation()
    c2.save(index=False)
    docket2 = Docket(
        case_name=u"Reference to Lissner v. Saad",
        court=self.court,
    )
    docket2.save()
    d2 = models.Document(
        date_filed=date(1982, 6, 9),
        docket=docket2,
        citation=c2,
        plain_text=u"1 Yeates 1",
    )
    d2.save(index=True)

    # Do a commit, or else citations can't be found in the index.
    self.si_opinion.commit()

    update_document(d2)  # Updates d1's citation count in a Celery task
    d1 = models.Document.objects.get(pk=1)  # cache-bust d1
    self.assertEqual(
        d1.citation_count,
        1,
        msg=u"d1 was not updated by a citation found in d2. Count was: %s"
            % d1.citation_count)

    d1.delete()
    d2.delete()
def import_law_box_case(case_path):
    """Open the file, get its contents, convert to XML and extract the
    meta data.

    Return a document object for saving in the database
    """
    raw_text = open(case_path).read()
    clean_html_tree, complete_html_tree, clean_html_str, body_text = \
        get_html_from_raw_text(raw_text)

    sha1 = hashlib.sha1(clean_html_str).hexdigest()
    citations = get_citations_from_tree(complete_html_tree, case_path)
    judges = get_judge(clean_html_tree, case_path)
    court = get_court_object(clean_html_tree, citations, case_path, judges)

    doc = Document(
        source='L',
        sha1=sha1,
        # We clear this field later, putting the value into html_lawbox.
        html=clean_html_str,
        date_filed=get_date_filed(clean_html_tree, citations=citations,
                                  case_path=case_path, court=court),
        precedential_status=get_precedential_status(),
        judges=judges,
        download_url=case_path,
    )
    cite = Citation(docket_number=get_docket_number(
        clean_html_tree, case_path=case_path, court=court))
    docket = Docket(
        case_name=get_case_name(complete_html_tree, case_path),
        court=court,
    )

    # Necessary for dup_finder.
    path = '//p/text()'
    doc.body_text = ' '.join(clean_html_tree.xpath(path))

    # Add the dict of citations to the object as its attributes.
    citations_as_dict = map_citations_to_models(citations)
    for k, v in citations_as_dict.iteritems():
        setattr(cite, k, v)

    doc.citation = cite
    doc.docket = docket
    return doc
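# A minimal sketch of how import_law_box_case might be driven over a batch
# of files. The glob pattern is a hypothetical stand-in; the real importer
# likely adds dup-checking and error handling around this loop.
import glob

for case_path in glob.iglob('/var/lawbox/dump/*.html'):
    doc = import_law_box_case(case_path)
    # The returned Document and its related objects are unsaved; persist
    # them in FK-dependency order, deferring indexing as the tests do.
    doc.citation.save(index=False)
    doc.docket.save()
    doc.save(index=False)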
def associate_meta_data_to_objects(site, i, court, sha1_hash):
    audio_file = Audio(
        source='C',
        sha1=sha1_hash,
        case_name=site.case_names[i],
        date_argued=site.case_dates[i],
        download_url=site.download_urls[i],
        processing_complete=False,
    )
    if site.judges:
        audio_file.judges = site.judges[i]
    if site.docket_numbers:
        audio_file.docket_number = site.docket_numbers[i]

    docket = Docket(
        case_name=site.case_names[i],
        court=court,
    )
    return docket, audio_file
def test_pagerank_calculation(self):
    """Create a few Documents and fake citation relations among them, then
    run the pagerank algorithm. Check whether this simple case gets the
    correct result.
    """
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Create 3 documents with their citations and dockets
    c1, c2, c3 = (Citation(case_name=u"c1"),
                  Citation(case_name=u"c2"),
                  Citation(case_name=u"c3"))
    c1.save(index=False)
    c2.save(index=False)
    c3.save(index=False)
    docket1 = Docket(
        case_name=u"c1",
        court=self.court,
    )
    docket2 = Docket(
        case_name=u"c2",
        court=self.court,
    )
    docket3 = Docket(
        case_name=u"c3",
        court=self.court,
    )
    # The dockets must be saved before Documents can point at them.
    docket1.save()
    docket2.save()
    docket3.save()
    d1, d2, d3 = (Document(date_filed=date.today()),
                  Document(date_filed=date.today()),
                  Document(date_filed=date.today()))
    d1.citation, d2.citation, d3.citation = c1, c2, c3
    d1.docket, d2.docket, d3.docket = docket1, docket2, docket3
    doc_list = [d1, d2, d3]
    for d in doc_list:
        d.citation.save(index=False)
        d.save(index=False)

    # Create simple citing relation: 1 cites 2 and 3; 2 cites 3; 3 cites 1
    d1.cases_cited.add(d2.citation)
    d2.citation_count += 1
    d2.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d3.cases_cited.add(d1.citation)
    d1.citation_count += 1
    d1.cases_cited.add(d3.citation)
    d3.citation_count += 1
    d1.save(index=False)
    d2.save(index=False)
    d3.save(index=False)

    # Calculate the pagerank of these 3 documents
    comm = Command()
    self.verbosity = 1
    comm.do_pagerank(chown=False)

    # Read in the pagerank file, converting it to a dict
    pr_values_from_file = {}
    with open(get_data_dir_location() + "external_pagerank") as f:
        for line in f:
            pk, value = line.split('=')
            pr_values_from_file[pk] = float(value.strip())

    # Verify that the answers are correct, based on calculations in Gephi
    answers = {
        '1': 0.387790,
        '2': 0.214811,
        '3': 0.397400,
    }
    for key, value in answers.iteritems():
        self.assertTrue(
            abs(pr_values_from_file[key] - value) < 0.0001,
            msg="The answer for item %s was %s when it should have been "
                "%s" % (key, pr_values_from_file[key], answers[key]))
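# A quick cross-check of the Gephi numbers used in the test above (a sketch,
# assuming networkx is available; it is not a dependency of the test suite).
# The citation graph built there is 1->2, 1->3, 2->3, 3->1, and standard
# PageRank with damping 0.85 reproduces the expected values.
import networkx as nx

g = nx.DiGraph([('1', '2'), ('1', '3'), ('2', '3'), ('3', '1')])
print nx.pagerank(g, alpha=0.85)
# -> {'1': 0.3877..., '2': 0.2148..., '3': 0.3974...}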
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found,
    it's downloaded and saved.

    The number of items that can be concurrently saved is determined by
    the number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("Attempting to add item at: %s" % item['url'])
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except:
            logger.info("Unable to get item at: %s" % item['url'])
            queue.task_done()
            continue

        if msg:
            logger.warn(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("Item already exists, moving to next item.")
            queue.task_done()
        else:
            # New item, onwards!
            logger.info('Adding new document found at: %s' % item['url'])
            audio_file = Audio(
                source='H',
                sha1=sha1_hash,
                case_name=item['case_name'],
                date_argued=item['date_argued'],
                download_url=item['url'],
                processing_complete=False,
            )
            if item['judges']:
                audio_file.judges = item['judges']
            if item['docket_number']:
                audio_file.docket_number = item['docket_number']

            court = Court.objects.get(pk=item['court_code'])
            docket = Docket(
                case_name=item['case_name'],
                court=court,
            )

            # Make and associate the file object
            try:
                cf = ContentFile(r.content)
                extension = get_extension(r.content)
                if extension not in ['.mp3', '.wma']:
                    extension = '.' + item['url'].rsplit('.', 1)[1]
                # See bitbucket issue #215 for why this must be
                # lower-cased.
                file_name = trunc(item['case_name'].lower(), 75) + extension
                audio_file.local_path_original_file.save(file_name, cf,
                                                         save=False)
            except:
                msg = ('Unable to save binary to disk. Deleted document: '
                       '%s.\n%s' % (item['case_name'],
                                    traceback.format_exc()))
                logger.critical(msg)
                queue.task_done()
                continue

            docket.save()
            audio_file.docket = docket
            audio_file.save(index=False)
            random_delay = random.randint(0, 3600)
            process_audio_file.apply_async((audio_file.pk,),
                                           countdown=random_delay)
            logger.info("Successfully added audio file %s: %s" %
                        (audio_file.pk, audio_file.case_name))
            # Mark the work item finished so queue.join() can return.
            queue.task_done()
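# A minimal sketch of how download_and_save workers might be launched. The
# thread count and the `items_to_download` iterable of metadata dicts are
# assumptions for illustration; the real command populates the queue from
# scraped metadata.
import threading
import Queue

queue = Queue.Queue()
for _ in range(4):  # four concurrent downloaders
    t = threading.Thread(target=download_and_save)
    t.daemon = True  # let the process exit once the queue is drained
    t.start()

for item in items_to_download:  # hypothetical iterable of item dicts
    queue.put(item)
queue.join()  # block until every item has been task_done()'d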
def setUp(self):
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Set up testing cores in Solr and swap them in
    self.core_name_opinion = '%s.opinion-test-%s' % \
        (self.__module__, time.time())
    self.core_name_audio = '%s.audio-test-%s' % \
        (self.__module__, time.time())
    create_solr_core(self.core_name_opinion)
    create_solr_core(
        self.core_name_audio,
        schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                            'audio_schema.xml'),
        instance_dir='/usr/local/solr/example/solr/audio',
    )
    swap_solr_core('collection1', self.core_name_opinion)
    swap_solr_core('audio', self.core_name_audio)
    self.si_opinion = sunburnt.SolrInterface(
        settings.SOLR_OPINION_URL, mode='rw')
    self.si_audio = sunburnt.SolrInterface(
        settings.SOLR_AUDIO_URL, mode='rw')

    # Add three documents and three audio files to the index, but don't
    # extract their contents
    self.site_opinion = test_opinion_scraper.Site().parse()
    self.site_audio = test_oral_arg_scraper.Site().parse()
    cite_counts = (4, 6, 8)
    self.docs = {}
    for i in range(0, 3):
        cite = Citation(
            case_name=self.site_opinion.case_names[i],
            docket_number=self.site_opinion.docket_numbers[i],
            neutral_cite=self.site_opinion.neutral_citations[i],
            federal_cite_one=self.site_opinion.west_citations[i],
        )
        cite.save(index=False)
        docket = Docket(
            case_name=self.site_opinion.case_names[i],
            court=self.court,
        )
        docket.save()
        self.docs[i] = Document(
            date_filed=self.site_opinion.case_dates[i],
            citation=cite,
            docket=docket,
            precedential_status=self.site_opinion.precedential_statuses[i],
            citation_count=cite_counts[i],
            nature_of_suit=self.site_opinion.nature_of_suit[i],
            judges=self.site_opinion.judges[i],
        )
        self.docs[i].save()

    # Create citations between the documents
    # 0 ---cites--> 1, 2
    # 1 ---cites--> 2
    # 2 ---cites--> 0
    self.docs[0].cases_cited.add(self.docs[1].citation)
    self.docs[0].cases_cited.add(self.docs[2].citation)
    self.docs[1].cases_cited.add(self.docs[2].citation)
    self.docs[2].cases_cited.add(self.docs[0].citation)
    for doc in self.docs.itervalues():
        doc.save()

    # Scrape the audio "site" and add its contents
    site = test_oral_arg_scraper.Site().parse()
    Command().scrape_court(site, full_crawl=True)

    self.expected_num_results_opinion = 3
    self.expected_num_results_audio = 2
    self.si_opinion.commit()
    self.si_audio.commit()
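# Because setUp swaps live Solr cores, a matching tearDown is needed so
# later tests see the original cores again. A minimal sketch, assuming a
# delete_solr_core helper exists alongside create_solr_core and
# swap_solr_core (an assumption; the project's actual cleanup may differ):
def tearDown(self):
    # Swap the original cores back into place...
    swap_solr_core(self.core_name_opinion, 'collection1')
    swap_solr_core(self.core_name_audio, 'audio')
    # ...then drop the temporary test cores.
    delete_solr_core(self.core_name_opinion)  # hypothetical helper
    delete_solr_core(self.core_name_audio)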
def import_resource_org_item(case_location):
    """Using the path to a case, import it, gathering all needed meta data.

    Path is any valid URI that the requests library can handle.
    """
    def get_file(location):
        if location.startswith('/'):
            with open(location) as f:
                r = requests.Session()
                r.content = f.read()
        else:
            r = requests.get(location)
        return fromstring(r.content), get_clean_body_content(r.content)

    # Get trees and text for the opinion itself and for the index page
    # that links to it. Each has useful data.
    case_tree, case_text = get_file(case_location)
    vol_location = case_location.rsplit('/', 1)[-2] + '/index.html'
    vol_tree, vol_text = get_file(vol_location)

    html, blocked = anonymize(get_case_body(case_tree))

    case_location_relative = case_location.rsplit('/', 1)[1]
    case_name, status = get_case_name_and_status(
        vol_tree, case_location_relative)
    cite = Citation(
        case_name=case_name,
        docket_number=get_docket_number(case_location),
        federal_cite_one=get_west_cite(vol_tree, case_location_relative),
    )
    docket = Docket(
        court=Court.objects.get(pk=get_court_id(case_tree)),
        case_name=case_name,
    )
    doc = Document(
        date_filed=get_date_filed(vol_tree, case_location_relative),
        source='R',
        sha1=hashlib.sha1(case_text).hexdigest(),
        citation=cite,
        docket=docket,
        download_url=case_location,
        html=html,
        precedential_status=status,
    )
    if blocked:
        doc.blocked = True
        docket.blocked = True
        doc.date_blocked = datetime.date.today()
        docket.date_blocked = datetime.date.today()

    cite.save()
    docket.save()
    doc.docket = docket
    doc.citation = cite
    doc.save()

    # Update the citation graph
    from alert.citations.tasks import update_document_by_id
    update_document_by_id(doc.pk)

    return doc