def test_html_extension(self):
    """Both HTML fixtures should be detected as ``.html``."""
    for fixture in ("opinion_html.html", "not_wpd.html"):
        with open(os.path.join(self.path, fixture), "rb") as fh:
            raw = fh.read()
        self.assertEqual(get_extension(raw), ".html")
def make_objects(self, item, court, sha1_hash, content):
    """Associate scraped metadata with unsaved model objects.

    :param item: dict of scraped values for one audio item.
    :param court: the Court the item belongs to.
    :param sha1_hash: SHA1 digest of the downloaded content.
    :param content: raw binary content of the download.
    :return: ``(docket, audio_file, error)`` — ``error`` is True when the
        binary could not be written to disk.
    """
    blocked = item["blocked_statuses"]
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = item.get(
        "case_name_shorts"
    ) or self.cnt.make_case_name_short(item["case_names"])

    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        date_argued=item["case_dates"],
        source=Docket.SCRAPER,
    )

    audio_file = Audio(
        judges=item.get("judges", ""),
        source="C",
        case_name=item["case_names"],
        case_name_short=case_name_short,
        sha1=sha1_hash,
        download_url=item["download_urls"],
        blocked=blocked,
        date_blocked=date_blocked,
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        if extension not in [".mp3", ".wma"]:
            # Fall back to the URL's suffix when sniffing fails.
            extension = (
                "." + item["download_urls"].lower().rsplit(".", 1)[1]
            )
        # See bitbucket issue #215 for why this must be
        # lower-cased.
        file_name = trunc(item["case_names"].lower(), 75) + extension
        audio_file.file_with_date = docket.date_argued
        audio_file.local_path_original_file.save(file_name, cf, save=False)
    except Exception:
        # Bug fix: was a bare ``except:``, which also swallowed
        # SystemExit/KeyboardInterrupt. Keep the best-effort behavior
        # but only for real exceptions.
        msg = (
            "Unable to save binary to disk. Deleted audio file: %s.\n "
            "%s" % (item["case_names"], traceback.format_exc())
        )
        logger.critical(msg.encode("utf-8"))
        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
        error = True

    return docket, audio_file, error
def make_objects(self, item, court, sha1_hash, content):
    """Associate scraped metadata with unsaved model objects.

    :param item: dict of scraped values for one audio item.
    :param court: the Court the item belongs to.
    :param sha1_hash: SHA1 digest of the downloaded content.
    :param content: raw binary content of the download.
    :return: ``(docket, audio_file, error)`` — ``error`` is True when the
        binary could not be written to disk.
    """
    blocked = item['blocked_statuses']
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = (item.get('case_name_shorts') or
                       self.cnt.make_case_name_short(item['case_names']))

    docket = Docket(
        docket_number=item.get('docket_numbers', ''),
        case_name=item['case_names'],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        date_argued=item['case_dates'],
        source=Docket.SCRAPER,
    )

    audio_file = Audio(
        judges=item.get('judges', ''),
        source='C',
        case_name=item['case_names'],
        case_name_short=case_name_short,
        sha1=sha1_hash,
        download_url=item['download_urls'],
        blocked=blocked,
        date_blocked=date_blocked,
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        if extension not in ['.mp3', '.wma']:
            # Fall back to the URL's suffix when sniffing fails.
            extension = '.' + item['download_urls'].lower().rsplit('.', 1)[1]
        # See bitbucket issue #215 for why this must be
        # lower-cased.
        file_name = trunc(item['case_names'].lower(), 75) + extension
        audio_file.file_with_date = docket.date_argued
        audio_file.local_path_original_file.save(file_name, cf, save=False)
    except Exception:
        # Bug fix: was a bare ``except:``, which also swallowed
        # SystemExit/KeyboardInterrupt.
        msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
              '%s' % (item['case_names'], traceback.format_exc())
        logger.critical(msg.encode('utf-8'))
        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
        error = True

    return docket, audio_file, error
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    expected_strings = [
        'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
        'fidelity',
    ]
    for op, needle in zip(Opinion.objects.all(), expected_strings):
        ext = get_extension(op.local_path.file.read())
        op = extract_doc_content(op.pk, callback=subtask(extract_by_ocr))
        # HTML-ish formats land in .html; everything else in .plain_text.
        haystack = op.html if ext in ['.html', '.wpd'] else op.plain_text
        self.assertIn(needle, haystack.lower())
def make_objects(
    item: Dict[str, Any],
    court: Court,
    sha1_hash: str,
    content: str,
) -> Tuple[Docket, Audio]:
    """Bundle scraped metadata into unsaved Docket and Audio objects."""
    blocked = item["blocked_statuses"]
    date_blocked = date.today() if blocked else None

    case_name_short = (
        item.get("case_name_shorts")
        or cnt.make_case_name_short(item["case_names"])
    )

    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        date_argued=item["case_dates"],
        source=item.get("source") or Docket.SCRAPER,
    )

    audio_file = Audio(
        judges=item.get("judges", ""),
        source=item.get("cluster_source") or "C",
        case_name=item["case_names"],
        case_name_short=case_name_short,
        sha1=sha1_hash,
        download_url=item["download_urls"],
        blocked=blocked,
        date_blocked=date_blocked,
    )

    content_file = ContentFile(content)
    extension = get_extension(content)
    if extension not in (".mp3", ".wma"):
        # Fall back to the (lower-cased) suffix of the download URL.
        extension = "." + item["download_urls"].lower().rsplit(".", 1)[1]
    file_name = trunc(item["case_names"].lower(), 75) + extension

    audio_file.file_with_date = docket.date_argued
    audio_file.local_path_original_file.save(
        file_name, content_file, save=False
    )
    return docket, audio_file
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?"""
    expected_strings = [
        "supreme",
        "intelligence",
        "indiana",
        "reagan",
        "indiana",
        "fidelity",
    ]
    for op, needle in zip(Opinion.objects.all(), expected_strings):
        ext = get_extension(op.local_path.file.read())
        extract_doc_content(op.pk, do_ocr=True)
        op.refresh_from_db()
        # HTML-ish formats land in .html; everything else in .plain_text.
        haystack = op.html if ext in [".html", ".wpd"] else op.plain_text
        self.assertIn(needle, haystack.lower())
def make_objects(self, item, court, sha1_hash, content):
    """Takes the meta data from the scraper and associates it with objects.

    Returns the created objects.

    :param item: dict of scraped values for one opinion.
    :param court: the Court the item belongs to.
    :param sha1_hash: SHA1 digest of the downloaded content.
    :param content: raw binary content of the download.
    :return: ``(docket, opinion, cluster, citations, error)`` — ``error``
        is True when the binary could not be written to disk.
    """
    blocked = item['blocked_statuses']
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = (item.get('case_name_shorts') or
                       self.cnt.make_case_name_short(item['case_names']))

    docket = Docket(
        docket_number=item.get('docket_numbers', ''),
        case_name=item['case_names'],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        source=Docket.SCRAPER,
    )

    west_cite_str = item.get('west_citations', '')
    state_cite_str = item.get('west_state_citations', '')
    neutral_cite_str = item.get('neutral_citations', '')
    cluster = OpinionCluster(
        judges=item.get('judges', ''),
        date_filed=item['case_dates'],
        date_filed_is_approximate=item['date_filed_is_approximate'],
        case_name=item['case_names'],
        case_name_short=case_name_short,
        source='C',
        precedential_status=item['precedential_statuses'],
        nature_of_suit=item.get('nature_of_suit', ''),
        blocked=blocked,
        date_blocked=date_blocked,
        # These three fields are replaced below.
        federal_cite_one=west_cite_str,
        state_cite_one=state_cite_str,
        neutral_cite=neutral_cite_str,
        syllabus=item.get('summaries', ''),
    )
    citations = []
    cite_types = [
        (west_cite_str, Citation.WEST),
        (state_cite_str, Citation.STATE),
        (neutral_cite_str, Citation.NEUTRAL),
    ]
    for cite_str, cite_type in cite_types:
        if cite_str:
            citations.append(make_citation(cite_str, cluster, cite_type))

    opinion = Opinion(
        type='010combined',
        sha1=sha1_hash,
        download_url=item['download_urls'],
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        file_name = trunc(item['case_names'].lower(), 75) + extension
        opinion.file_with_date = cluster.date_filed
        opinion.local_path.save(file_name, cf, save=False)
    except Exception:
        # Bug fix: was a bare ``except:``, which also swallowed
        # SystemExit/KeyboardInterrupt.
        msg = ('Unable to save binary to disk. Deleted '
               'item: %s.\n %s' %
               (item['case_names'], traceback.format_exc()))
        logger.critical(msg.encode('utf-8'))
        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
        error = True

    return docket, opinion, cluster, citations, error
def make_objects(
    item: Dict[str, Union[str, Any]],
    court: Court,
    sha1_hash: str,
    content: bytes,
) -> Tuple[Docket, Opinion, OpinionCluster, List[Citation]]:
    """Takes the meta data from the scraper and associates it with objects.

    Returns the created objects.
    """
    blocked = item["blocked_statuses"]
    date_blocked = date.today() if blocked else None

    case_name_short = (
        item.get("case_name_shorts")
        or cnt.make_case_name_short(item["case_names"])
    )

    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        source=item.get("source") or Docket.SCRAPER,
    )

    cluster = OpinionCluster(
        judges=item.get("judges", ""),
        date_filed=item["case_dates"],
        date_filed_is_approximate=item["date_filed_is_approximate"],
        case_name=item["case_names"],
        case_name_short=case_name_short,
        source=item.get("cluster_source") or "C",
        precedential_status=item["precedential_statuses"],
        nature_of_suit=item.get("nature_of_suit", ""),
        blocked=blocked,
        date_blocked=date_blocked,
        syllabus=item.get("summaries", ""),
    )

    # Build a Citation object for each non-empty citation string.
    cite_types = [
        (item.get("west_citations", ""), Citation.WEST),
        (item.get("west_state_citations", ""), Citation.STATE),
        (item.get("neutral_citations", ""), Citation.NEUTRAL),
    ]
    citations = [
        make_citation(cite_str, cluster, cite_type)
        for cite_str, cite_type in cite_types
        if cite_str
    ]

    opinion = Opinion(
        type=Opinion.COMBINED,
        sha1=sha1_hash,
        download_url=item["download_urls"],
    )

    content_file = ContentFile(content)
    extension = get_extension(content)
    file_name = trunc(item["case_names"].lower(), 75) + extension
    opinion.file_with_date = cluster.date_filed
    opinion.local_path.save(file_name, content_file, save=False)

    return docket, opinion, cluster, citations
def test_doc_extension(self):
    """A Word fixture is detected as ``.doc``."""
    fixture = os.path.join(self.path, "opinion_doc.doc")
    with open(fixture, "rb") as fh:
        raw = fh.read()
    self.assertEqual(get_extension(raw), ".doc")
def test_pdf_extension(self):
    """A text-based PDF fixture is detected as ``.pdf``."""
    fixture = os.path.join(self.path, "opinion_pdf_text_based.pdf")
    with open(fixture, "rb") as fh:
        raw = fh.read()
    self.assertEqual(get_extension(raw), ".pdf")
def test_wpd_extension(self):
    """A WordPerfect fixture is detected as ``.wpd``.

    Bug fix: the fixture was opened in text mode (``"r"``) while every
    sibling extension test opens in binary mode and feeds raw bytes to
    ``get_extension()``. Text mode risks a decode error (or mangled
    bytes on Python 3) for this binary format.
    """
    with open(os.path.join(self.path, "opinion_wpd.wpd"), "rb") as f:
        data = f.read()
    self.assertEqual(get_extension(data), ".wpd")
def make_objects(item, court, sha1_hash, content):
    """Takes the meta data from the scraper and associates it with objects.

    Returns the created objects.

    :param item: dict of scraped values for one opinion.
    :param court: the Court the item belongs to.
    :param sha1_hash: SHA1 digest of the downloaded content.
    :param content: raw binary content of the download.
    :return: ``(docket, opinion, cluster, citations, error)`` — ``error``
        is True when the binary could not be written to disk.
    """
    blocked = item["blocked_statuses"]
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = item.get("case_name_shorts") or cnt.make_case_name_short(
        item["case_names"])

    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        source=item.get("source") or Docket.SCRAPER,
    )

    west_cite_str = item.get("west_citations", "")
    state_cite_str = item.get("west_state_citations", "")
    neutral_cite_str = item.get("neutral_citations", "")
    cluster = OpinionCluster(
        judges=item.get("judges", ""),
        date_filed=item["case_dates"],
        date_filed_is_approximate=item["date_filed_is_approximate"],
        case_name=item["case_names"],
        case_name_short=case_name_short,
        source=item.get("cluster_source") or "C",
        precedential_status=item["precedential_statuses"],
        nature_of_suit=item.get("nature_of_suit", ""),
        blocked=blocked,
        date_blocked=date_blocked,
        syllabus=item.get("summaries", ""),
    )
    citations = []
    cite_types = [
        (west_cite_str, Citation.WEST),
        (state_cite_str, Citation.STATE),
        (neutral_cite_str, Citation.NEUTRAL),
    ]
    for cite_str, cite_type in cite_types:
        if cite_str:
            citations.append(make_citation(cite_str, cluster, cite_type))

    opinion = Opinion(
        type=Opinion.COMBINED,
        sha1=sha1_hash,
        download_url=item["download_urls"],
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        file_name = trunc(item["case_names"].lower(), 75) + extension
        opinion.file_with_date = cluster.date_filed
        opinion.local_path.save(file_name, cf, save=False)
    except Exception:
        # Bug fix: was a bare ``except:``, which also swallowed
        # SystemExit/KeyboardInterrupt.
        msg = "Unable to save binary to disk. Deleted item: %s.\n %s" % (
            item["case_names"],
            traceback.format_exc(),
        )
        logger.critical(msg.encode("utf-8"))
        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
        error = True

    return docket, opinion, cluster, citations, error
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found,
    it's downloaded and saved.

    The number of items that can be concurrently saved is determined by
    the number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("%s: Attempting to add item at: %s" %
                    (threading.current_thread().name, item['url']))
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except Exception:
            # Bug fix: previously a bare ``except:`` that did
            # ``queue.task_done()`` but fell through, so the code below
            # referenced the unassigned names ``msg``/``r``.
            logger.info("%s: Unable to get item at: %s" %
                        (threading.current_thread().name, item['url']))
            queue.task_done()
            continue

        if msg:
            # ``logger.warn`` is a deprecated alias for ``warning``.
            logger.warning(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("%s: Item already exists, moving to next item." %
                        threading.current_thread().name)
            queue.task_done()
            continue

        # New item, onwards!
        logger.info('%s: Adding new document found at: %s' %
                    (threading.current_thread().name, item['url']))
        audio_file = Audio(
            source='H',
            sha1=sha1_hash,
            case_name=item['case_name'],
            download_url=item['url'],
            processing_complete=False,
        )
        if item['judges']:
            audio_file.judges = item['judges']

        court = Court.objects.get(pk=item['court_code'])
        docket = Docket(
            case_name=item['case_name'],
            court=court,
            date_argued=item['date_argued'],
        )
        if item['docket_number']:
            # Bug fix: this was ``audio_file.docket.docket_number = ...``
            # before any docket was attached to the Audio object, which
            # raises and never persisted the number. Set it on the docket
            # that is saved below instead.
            docket.docket_number = item['docket_number']

        # Make and associate the file object
        try:
            cf = ContentFile(r.content)
            extension = get_extension(r.content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['url'].rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_name'].lower(), 75) + extension
            audio_file.local_path_original_file.save(file_name, cf,
                                                     save=False)
        except Exception:
            # Bug fix: was a bare ``except:`` with no ``continue``, so a
            # failed download was saved to the database anyway.
            msg = 'Unable to save binary. Deleted document: %s.\n%s' % \
                  (item['case_name'], traceback.format_exc())
            logger.critical(msg)
            queue.task_done()
            continue

        docket.save()
        audio_file.docket = docket
        audio_file.save(index=False)

        # Spread the expensive audio processing out over the next hour to
        # avoid hammering the worker all at once.
        random_delay = random.randint(0, 3600)
        process_audio_file.apply_async(
            (audio_file.pk,),
            countdown=random_delay
        )

        logger.info("%s: Successfully added audio file %s: %s" %
                    (threading.current_thread().name, audio_file.pk,
                     audio_file.case_name))