def test_solr_ingestion_and_deletion(self):
    """Do items get added to the Solr index when they are ingested?"""
    site = test_opinion_scraper.Site().parse()
    path = os.path.join(settings.INSTALL_ROOT, 'alert',
                        site.download_urls[0])  # a simple PDF
    with open(path) as f:
        content = f.read()
        cf = ContentFile(content)
        extension = get_extension(content)
    cite = Citation()
    cite.save(index=False)
    docket = Docket(
        court=self.court,
        case_name=site.case_names[0],
    )
    docket.save()
    doc = Document(
        date_filed=site.case_dates[0],
        docket=docket,
        citation=cite,
    )
    file_name = trunc(site.case_names[0].lower(), 75) + extension
    doc.local_path.save(file_name, cf, save=False)
    doc.save(index=False)
    extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
    response = self.si.raw_query(**{
        'q': 'supreme',
        'caller': 'scraper_test',
    }).execute()
    count = response.result.numFound
    self.assertEqual(
        count,
        1,
        "There were %s items found when there should have been 1" % count
    )
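# Hypothetical companion check (not in the original test): the test's name
# mentions deletion, so a follow-up assertion might delete the ingested
# document and confirm the Solr count drops back to zero. A minimal sketch,
# assuming the same self.si raw_query interface used above:
def assert_removed_from_index(self, doc):
    doc.delete()
    response = self.si.raw_query(**{
        'q': 'supreme',
        'caller': 'scraper_test',
    }).execute()
    self.assertEqual(
        response.result.numFound,
        0,
        "Document was not removed from the Solr index."
    )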
def fixer(simulate=False, verbose=False):
    """OCR documents that lack content"""
    # docs = queryset_generator(Document.objects.filter(source='C', plain_text=''))
    # docs = Document.objects.raw('''select "pk" from "Document" where "source" = 'C' and "plain_text" ~ '^[[:space:]]*$' ''')
    docs = Document.objects.raw(
        '''select "pk" from "Document"
           where "source" = 'C' and
                 "plain_text" = 'Unable to extract document content.' '''
    )
    for doc in docs:
        if verbose:
            print "Fixing document number %s: %s" % (doc.pk, doc)
        if not simulate:
            # Extract the contents asynchronously.
            extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
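# Hypothetical usage sketch (not in the original module): fixer() is the kind
# of maintenance helper that is usually wired to a small command-line entry
# point so it can be dry-run first. A minimal sketch, assuming fixer() lives
# in this module; the flag names are illustrative only:
if __name__ == '__main__':
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('--simulate', action='store_true', default=False,
                      help="Report what would be fixed without enqueuing OCR tasks.")
    parser.add_option('--verbose', action='store_true', default=False,
                      help="Print each document as it is processed.")
    options, args = parser.parse_args()
    fixer(simulate=options.simulate, verbose=options.verbose)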
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?
    """
    site = test_scraper.Site().parse()
    test_strings = ['supreme', 'intelligence', 'indiana', 'reagan',
                    'indiana', 'fidelity']
    for i in range(0, len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation(case_name=site.case_names[i])
        cite.save(index=False)
        doc = Document(date_filed=site.case_dates[i],
                       court=self.court,
                       citation=cite)
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?
    """
    site = test_opinion_scraper.Site().parse()
    test_strings = [
        'supreme',
        'intelligence',
        'indiana',
        'reagan',
        'indiana',
        'fidelity',
    ]
    for i in range(0, len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            case_name=site.case_names[i],
            court=self.court,
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[i],
            citation=cite,
            docket=docket,
        )
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(
                site.download_urls[i],
                site._get_cookies(),
                method=site.method
            )
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data
            sha1_hash = hashlib.sha1(r.content).hexdigest()
            if court_str == 'nev' and site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    Document,
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                cite, docket, doc = self.associate_meta_data_to_objects(
                    site, i, court, sha1_hash)

                # Make and associate the file object
                try:
                    cf = ContentFile(r.content)
                    extension = get_extension(r.content)
                    # See bitbucket issue #215 for why this must be
                    # lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = ('Unable to save binary to disk. Deleted document: %s.\n%s' %
                           (site.case_names[i], traceback.format_exc()))
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                # Save everything, but don't update Solr index yet
                self.save_everything(cite, docket, doc, index=False)
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc {pk}: {name}".format(
                    pk=doc.pk,
                    name=site.case_names[i]
                ))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled opinions." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
def scrape_court(site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split('.')[-1].split('_')[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(site.court_id, full_crawl=full_crawl)
    abort = dup_checker.abort_by_hash(site.hash)
    if not abort:
        for i in range(0, len(site.case_names)):
            msg, r = get_binary_content(site.download_urls[i],
                                        site._get_cookies())
            clean_content = site._cleanup_content(r.content)
            if msg:
                logger.warn(msg)
                ErrorLog(log_level='WARNING', court=court,
                         message=msg).save()
                continue

            current_date = site.case_dates[i]
            try:
                next_date = site.case_dates[i + 1]
            except IndexError:
                next_date = None

            # Make a hash of the data. Need to convert unicode to binary
            # before hashing.
            if type(clean_content) == unicode:
                hash_content = clean_content.encode('utf-8')
            else:
                hash_content = clean_content
            sha1_hash = hashlib.sha1(hash_content).hexdigest()
            if court_str == 'nev' and site.precedential_statuses[i] == 'Unpublished':
                # Nevada's non-precedential cases have different SHA1
                # sums every time.
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=site.download_urls[i],
                    lookup_by='download_url'
                )
            else:
                onwards = dup_checker.should_we_continue_break_or_carry_on(
                    current_date,
                    next_date,
                    lookup_value=sha1_hash,
                    lookup_by='sha1'
                )

            if onwards == 'CONTINUE':
                # It's a duplicate, but we haven't hit any thresholds yet.
                continue
            elif onwards == 'BREAK':
                # It's a duplicate, and we hit a date or dup_count threshold.
                dup_checker.update_site_hash(sha1_hash)
                break
            elif onwards == 'CARRY_ON':
                # Not a duplicate, carry on
                logger.info('Adding new document found at: %s' %
                            site.download_urls[i])
                dup_checker.reset()

                # Make a citation
                cite = Citation(case_name=site.case_names[i])
                if site.docket_numbers:
                    cite.docket_number = site.docket_numbers[i]
                if site.neutral_citations:
                    cite.neutral_cite = site.neutral_citations[i]
                if site.west_citations:
                    cite.federal_cite_one = site.west_citations[i]
                if site.west_state_citations:
                    cite.west_state_cite = site.west_state_citations[i]

                # Make the document object
                doc = Document(source='C',
                               sha1=sha1_hash,
                               date_filed=site.case_dates[i],
                               court=court,
                               download_url=site.download_urls[i],
                               precedential_status=site.precedential_statuses[i])

                # Make and associate the file object
                try:
                    cf = ContentFile(clean_content)
                    extension = get_extension(r.content)
                    # See issue #215 for why this must be lower-cased.
                    file_name = trunc(site.case_names[i].lower(), 75) + extension
                    doc.local_path.save(file_name, cf, save=False)
                except:
                    msg = ('Unable to save binary to disk. Deleted document: %s.\n%s' %
                           (cite.case_name, traceback.format_exc()))
                    logger.critical(msg)
                    ErrorLog(log_level='CRITICAL', court=court,
                             message=msg).save()
                    download_error = True
                    continue

                if site.judges:
                    doc.judges = site.judges[i]
                if site.nature_of_suit:
                    doc.nature_of_suit = site.nature_of_suit[i]

                # Save everything, but don't update Solr index yet
                cite.save(index=False)
                doc.citation = cite
                doc.save(index=False)

                # Extract the contents asynchronously.
                extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))

                logger.info("Successfully added doc %s: %s" %
                            (doc.pk, site.case_names[i]))

        # Update the hash if everything finishes properly.
        logger.info("%s: Successfully crawled." % site.court_id)
        if not download_error and not full_crawl:
            # Only update the hash if no errors occurred.
            dup_checker.update_site_hash(site.hash)
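# Hypothetical driver (not in the original): scrape_court() expects an already
# parsed, juriscraper-style Site object, so a caller might look roughly like
# this. The dotted module paths passed in are illustrative assumptions:
def scrape_courts(court_module_names, full_crawl=False):
    for name in court_module_names:
        # e.g. 'opinions.united_states.federal_appellate.ca9'
        mod = __import__(name, globals(), locals(), ['Site'])
        site = mod.Site().parse()
        scrape_court(site, full_crawl=full_crawl)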