def hash_item(item):
    """Hash an RSS item. Item should be a dict at this stage"""
    # Stringify, normalizing dates to strings.
    item_j = json.dumps(item, sort_keys=True, default=str)
    item_hash = sha1(item_j)
    return item_hash
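# All of the snippets in this section call a shared sha1() helper instead of
# using hashlib directly. The sketch below shows one plausible shape for that
# helper (accepting bytes-like content and returning a hex digest string); it
# is an illustrative assumption, not the project's confirmed implementation.
import hashlib


def sha1(content):
    """Return the SHA-1 hex digest of the given content."""
    return hashlib.sha1(content).hexdigest()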
"tennworkcompapp": Court.objects.get(pk="tennworkcompapp"), } for case in tn_corpus: if case["label"] == skip_until: ready = True if not ready: continue logging.info("Processing label:%s for case:%s", case["label"], case["title"]) pdf_path = glob("%s/%s/*.pdf" % (os.path.dirname(filepath.name), case["label"]))[0] with open(pdf_path, "rb") as p: pdf_data = p.read() sha1_hash = sha1(force_bytes(pdf_data)) ops = Opinion.objects.filter(sha1=sha1_hash) if len(ops) > 0: op = ops[0] logging.warn("Document already in database. See: %s at %s" % (op.get_absolute_url(), op.cluster.case_name)) docket, opinion, cluster, citations, error = make_objects( make_item(case), courts[case["court"]], sha1_hash, pdf_data, ) save_everything( items={
def scrape_court(
    self,
    site,
    full_crawl: bool = False,
    backscrape: bool = False,
) -> None:
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split(".")[-1].split("_")[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    abort = dup_checker.abort_by_url_hash(site.url, site.hash)
    if abort:
        return

    if site.cookies:
        logger.info("Using cookies: %s" % site.cookies)
    for i, item in enumerate(site):
        msg, r = get_binary_content(
            item["download_urls"],
            site.cookies,
            method=site.method,
        )
        if msg:
            logger.warning(msg)
            ErrorLog(log_level="WARNING", court=court, message=msg).save()
            continue

        content = site.cleanup_content(r.content)

        current_date = item["case_dates"]
        try:
            next_date = site[i + 1]["case_dates"]
        except IndexError:
            next_date = None

        # request.content is sometimes a str, sometimes unicode, so
        # force it all to be bytes, pleasing hashlib.
        sha1_hash = sha1(force_bytes(content))
        onwards = dup_checker.press_on(
            Audio,
            current_date,
            next_date,
            lookup_value=sha1_hash,
            lookup_by="sha1",
        )
        if dup_checker.emulate_break:
            break

        if onwards:
            # Not a duplicate, carry on
            logger.info(
                "Adding new document found at: %s"
                % item["download_urls"].encode()
            )
            dup_checker.reset()

            docket, audio_file = make_objects(
                item, court, sha1_hash, content
            )
            save_everything(
                items={"docket": docket, "audio_file": audio_file},
                index=False,
                backscrape=backscrape,
            )
            process_audio_file.apply_async(
                (audio_file.pk,), countdown=random.randint(0, 3600)
            )

            logger.info(
                "Successfully added audio file {pk}: {name}".format(
                    pk=audio_file.pk,
                    name=item["case_names"].encode(),
                )
            )

    # Update the hash if everything finishes properly.
    logger.info("%s: Successfully crawled oral arguments." % site.court_id)
    if not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
def scrape_court(self, site, full_crawl=False):
    download_error = False
    # Get the court object early for logging
    # opinions.united_states.federal.ca9_u --> ca9
    court_str = site.court_id.split(".")[-1].split("_")[0]
    court = Court.objects.get(pk=court_str)

    dup_checker = DupChecker(court, full_crawl=full_crawl)
    if dup_checker.abort_by_url_hash(site.url, site.hash):
        return

    if site.cookies:
        logger.info("Using cookies: %s" % site.cookies)
    for i, item in enumerate(site):
        msg, r = get_binary_content(
            item["download_urls"],
            site.cookies,
            site._get_adapter_instance(),
            method=site.method,
        )
        if msg:
            logger.warning(msg)
            ErrorLog(log_level="WARNING", court=court, message=msg).save()
            continue

        content = site.cleanup_content(r.content)

        current_date = item["case_dates"]
        try:
            next_date = site[i + 1]["case_dates"]
        except IndexError:
            next_date = None

        # request.content is sometimes a str, sometimes unicode, so
        # force it all to be bytes, pleasing hashlib.
        sha1_hash = sha1(force_bytes(content))
        if (
            court_str == "nev"
            and item["precedential_statuses"] == "Unpublished"
        ):
            # Nevada's non-precedential cases have different SHA1 sums
            # every time.
            lookup_params = {
                "lookup_value": item["download_urls"],
                "lookup_by": "download_url",
            }
        else:
            lookup_params = {
                "lookup_value": sha1_hash,
                "lookup_by": "sha1",
            }

        proceed = dup_checker.press_on(
            Opinion, current_date, next_date, **lookup_params
        )
        if dup_checker.emulate_break:
            break
        if not proceed:
            continue

        # Not a duplicate, carry on
        logger.info(
            "Adding new document found at: %s"
            % item["download_urls"].encode("utf-8")
        )
        dup_checker.reset()

        docket, opinion, cluster, citations, error = self.make_objects(
            item, court, sha1_hash, content
        )
        if error:
            download_error = True
            continue

        self.save_everything(
            items={
                "docket": docket,
                "opinion": opinion,
                "cluster": cluster,
                "citations": citations,
            },
            index=False,
        )
        extract_doc_content.delay(
            opinion.pk,
            do_ocr=True,
            citation_jitter=True,
        )

        logger.info(
            "Successfully added doc {pk}: {name}".format(
                pk=opinion.pk,
                name=item["case_names"].encode("utf-8"),
            )
        )

    # Update the hash if everything finishes properly.
    logger.info("%s: Successfully crawled opinions." % site.court_id)
    if not download_error and not full_crawl:
        # Only update the hash if no errors occurred.
        dup_checker.update_site_hash(site.hash)
def import_disclosure(
    self, data: Dict[str, Union[str, int, list]]
) -> None:
    """Import disclosures into Courtlistener

    :param data: The disclosure information to process
    :return: None
    """
    # Check download_filepath to see if it has been processed before.
    if has_been_extracted(data):
        logger.info(f"Document already extracted and saved: {data['id']}.")
        return

    interface = make_redis_interface("CACHE")
    disclosure_key = make_disclosure_key(data["id"])
    newly_enqueued = create_redis_semaphore(
        interface,
        disclosure_key,
        ttl=60 * 60 * 12,
    )
    if not newly_enqueued:
        logger.info(f"Process is already running {data['id']}.")
        return

    # Generate PDF content from our three paths
    year = int(data["year"])
    person_id = data["person_id"]

    logger.info(
        f"Processing row {data['id']} for person {person_id} "
        f"in year {year}"
    )

    # Check if we've already extracted
    disclosure_url = get_aws_url(data)
    was_previously_pdfed = has_been_pdfed(disclosure_url)
    pdf_response = generate_or_download_disclosure_as_pdf(
        data, was_previously_pdfed
    )
    pdf_bytes = pdf_response.content

    if pdf_response.status_code != 200:
        logger.info("PDF generation failed.")
        return

    if was_previously_pdfed:
        disclosure = get_disclosure_from_pdf_path(disclosure_url)
    else:
        logger.info("PDF generated successfully.")

        # Sha1 hash - Check for duplicates
        sha1_hash = sha1(pdf_bytes)
        in_system = check_if_in_system(sha1_hash)
        if in_system:
            logger.info("PDF already in system.")
            interface.delete(disclosure_key)
            return

        # Return page count - 0 indicates a failure of some kind. Like PDF
        # not actually present on aws.
        pg_count = get_page_count(pdf_bytes)
        if not pg_count:
            logger.info(f"PDF failed for disclosure {data['id']}.")
            interface.delete(disclosure_key)
            return

        # Save Financial Disclosure here to AWS and move onward
        disclosure = FinancialDisclosure(
            year=year,
            page_count=pg_count,
            person=Person.objects.get(id=person_id),
            sha1=sha1_hash,
            has_been_extracted=False,
            download_filepath=data.get("url")
            if data.get("url")
            else data.get("urls")[0],
        )

        # Save and upload PDF
        disclosure.filepath.save(
            f"{disclosure.person.slug}-disclosure.{year}.pdf",
            ContentFile(pdf_bytes),
        )
        logger.info(
            f"Uploaded to https://{settings.AWS_S3_CUSTOM_DOMAIN}/"
            f"{disclosure.filepath}"
        )

    # Extract content from PDF
    content = extract_content(
        pdf_bytes=pdf_bytes, disclosure_type=data["disclosure_type"]
    )
    if not content:
        logger.info("Failed extraction!")
        interface.delete(disclosure_key)
        return

    # Save PDF content
    save_disclosure(extracted_data=content, disclosure=disclosure)

    # Remove disclosure ID in redis for completed disclosure
    interface.delete(disclosure_key)