def add_or_update(self, *items):
    """
    Given an item, adds it to the index, or updates it if it's
    already in the index.
    """
    self.stdout.write("Adding or updating item(s): %s\n" % list(items))
    add_items_to_solr(items, self.type)

def save_iquery_to_docket(
    self,
    iquery_data: Dict[str, str],
    d: Docket,
    tag_names: Optional[List[str]],
    add_to_solr: bool = False,
) -> Optional[int]:
    """Merge iquery results into a docket

    :param self: The celery task calling this function
    :param iquery_data: The data from a successful iquery response
    :param d: A docket object to work with
    :param tag_names: Tags to add to the items
    :param add_to_solr: Whether to save the completed docket to solr
    :return: The pk of the docket if successful. Else, None.
    """
    d = update_docket_metadata(d, iquery_data)
    try:
        d.save()
        add_bankruptcy_data_to_docket(d, iquery_data)
    except IntegrityError as exc:
        msg = "Integrity error while saving iquery response."
        if self.request.retries == self.max_retries:
            logger.warning(msg)
            return
        logger.info("%s Retrying.", msg)
        raise self.retry(exc=exc)

    add_tags_to_objs(tag_names, [d])
    if add_to_solr:
        add_items_to_solr([d.pk], "search.Docket")
    logger.info("Created/updated docket: %s" % d)
    return d.pk

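# Hedged usage sketch, not from the codebase: inside a bound Celery task that
# has already fetched and parsed an iquery page, the merge-and-index step
# might look like this. The helper name, variable names, and the tag are
# illustrative assumptions; only save_iquery_to_docket comes from the code
# above, and the Docket import path is assumed.
from cl.search.models import Docket  # assumed model location

def merge_iquery_result(task, iquery_data, docket_pk):
    """Call save_iquery_to_docket on behalf of the bound task `task`."""
    docket = Docket.objects.get(pk=docket_pk)
    return save_iquery_to_docket(
        task,                                 # the bound Celery task (self)
        iquery_data,                          # dict parsed from the iquery page
        docket,
        tag_names=["example-iquery-import"],  # hypothetical tag
        add_to_solr=True,                     # index the merged docket
    )
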
def update_docket_info_iquery(self, d_pk):
    """Fetch the iquery (case query) page for a docket from PACER, merge the
    results into the docket, and index it in Solr.

    :param self: The celery task calling this function
    :param d_pk: The pk of the Docket to update
    """
    cookies = get_or_cache_pacer_cookies(
        "pacer_scraper",
        settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    s = PacerSession(
        cookies=cookies,
        username=settings.PACER_USERNAME,
        password=settings.PACER_PASSWORD,
    )
    d = Docket.objects.get(pk=d_pk)
    report = CaseQuery(map_cl_to_pacer_id(d.court_id), s)
    try:
        report.query(d.pacer_case_id)
    except (requests.Timeout, requests.RequestException) as exc:
        logger.warning(
            "Timeout or unknown RequestException on iquery crawl. "
            "Trying again if retries not exceeded."
        )
        if self.request.retries == self.max_retries:
            return
        raise self.retry(exc=exc)

    d = update_docket_metadata(d, report.data)
    d.save()
    add_bankruptcy_data_to_docket(d, report.data)
    add_items_to_solr([d.pk], "search.Docket")

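# Hedged sketch: queueing update_docket_info_iquery for many dockets. Its use
# of self.request.retries and self.retry implies it is registered as a bound
# Celery task, so .apply_async should be available; the court filter, helper
# name, and Docket import path are illustrative assumptions.
from cl.search.models import Docket  # assumed model location

def enqueue_iquery_refresh(court_id: str) -> None:
    """Queue an iquery refresh for every docket in a court."""
    pks = Docket.objects.filter(court_id=court_id).values_list("pk", flat=True)
    for d_pk in pks.iterator():
        update_docket_info_iquery.apply_async(args=(d_pk,))
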
def _update_index():
    # For now, until some model/api issues are worked out for Audio
    # objects, we'll avoid using the cl_update_index command and do
    # this the hard way using tasks
    opinion_keys = Opinion.objects.values_list('pk', flat=True)
    add_items_to_solr(opinion_keys, 'search.Opinion', force_commit=True)

    audio_keys = Audio.objects.values_list('pk', flat=True)
    add_items_to_solr(audio_keys, 'audio.Audio', force_commit=True)

def save(self, index=True, force_commit=False, *args, **kwargs):
    """
    Overrides the normal save method, but provides integration with the
    bulk files and with Solr indexing.

    :param index: Should the item be added to the Solr index?
    :param force_commit: Should a commit be performed in solr after
    indexing it?
    """
    super(Audio, self).save(*args, **kwargs)
    if index:
        from cl.search.tasks import add_items_to_solr

        add_items_to_solr([self.pk], "audio.Audio", force_commit)

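# Hedged sketch: Audio.save() indexes each object individually, so a bulk
# import can pass index=False and then index everything in one batched call,
# mirroring the add_items_to_solr usage elsewhere in these snippets. The
# helper name and import paths are assumptions.
from cl.audio.models import Audio             # assumed model location
from cl.search.tasks import add_items_to_solr

def bulk_save_audio(audio_objects):
    """Save many Audio objects, deferring Solr indexing to one batch."""
    pks = []
    for audio in audio_objects:
        audio.save(index=False)   # skip the per-item Solr update
        pks.append(audio.pk)
    # Single batched update with a commit so the items become searchable.
    add_items_to_solr(pks, "audio.Audio", force_commit=True)
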
def setUp(self) -> None:
    super(IndexedSolrTestCase, self).setUp()
    obj_types = {
        "audio.Audio": Audio,
        "search.Opinion": Opinion,
        "people_db.Person": Person,
    }
    for obj_name, obj_type in obj_types.items():
        if obj_name == "people_db.Person":
            items = obj_type.objects.filter(is_alias_of=None)
            ids = [item.pk for item in items if item.is_judge]
        else:
            ids = obj_type.objects.all().values_list("pk", flat=True)
        add_items_to_solr(ids, obj_name, force_commit=True)

def reprocess_item(self, metadata_only=False, index=True):
    """Reprocess the RSS feed

    :param metadata_only: If True, only do the metadata, not the docket
    entries.
    :param index: Whether to save to Solr (note that nothing will be sent
    to Solr when doing metadata only, since no entries are modified).
    """
    from cl.recap_rss.tasks import merge_rss_feed_contents
    from cl.search.tasks import add_items_to_solr

    rss_feed = PacerRssFeed(map_cl_to_pacer_id(self.court_id))
    rss_feed._parse_text(self.file_contents)
    response = merge_rss_feed_contents(
        rss_feed.data, self.court_id, metadata_only
    )
    if index:
        add_items_to_solr(
            response.get("rds_for_solr", []), "search.RECAPDocument"
        )

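# Hedged sketch: re-running the RSS merge over stored feed objects via the
# method above. The queryset argument and helper name are illustrative; the
# metadata-only behavior is taken from the docstring above.
def reprocess_feeds(feed_queryset, metadata_only=False):
    """Reprocess each saved RSS feed item in the queryset."""
    for feed in feed_queryset.iterator():
        # In metadata-only mode nothing is sent to Solr, because no docket
        # entries are modified.
        feed.reprocess_item(metadata_only=metadata_only)
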
def process_citations(data, debug):
    """Walk through the citations and add them one at a time."""
    updated_ids = set()
    for index, item in data.iterrows():
        logger.info(
            "\nAdding citation from %s to %s" % (item["citing"], item["cited"])
        )
        try:
            cite = OpinionsCited.objects.get(
                citing_opinion_id=item["citing"],
                cited_opinion_id=item["cited"],
            )
            msg = "Citation already exists. Doing nothing:\n"
        except OpinionsCited.DoesNotExist:
            cite = OpinionsCited(
                citing_opinion_id=item["citing"],
                cited_opinion_id=item["cited"],
            )
            msg = "Created new citation:\n"
            if not debug:
                cite.save()
                updated_ids.add(cite.citing_opinion.pk)
        try:
            logger.info(
                " %s"
                " %s: %s\n"
                " From: %s\n"
                " To: %s\n"
                % (msg, cite.pk, cite, cite.citing_opinion, cite.cited_opinion)
            )
        except Opinion.DoesNotExist:
            logger.warning(
                " Unable to create citation. Underlying Opinion doesn't "
                "exist."
            )

    logger.info("\nUpdating Solr...")
    if not debug:
        add_items_to_solr(updated_ids, "search.Opinion")
    logger.info("Done.")

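# Hedged sketch: process_citations reads item["citing"] and item["cited"] via
# data.iterrows(), which implies a pandas DataFrame (or similar) with those
# two columns of Opinion PKs. The values and file name below are placeholders.
import pandas as pd

data = pd.DataFrame(
    {
        "citing": [1001, 1001, 1002],  # PKs of the citing Opinions
        "cited": [2001, 2002, 2003],   # PKs of the cited Opinions
    }
)
# Or load the same two columns from disk, e.g.:
# data = pd.read_csv("citations.csv")  # placeholder path
process_citations(data, debug=True)  # debug=True logs without saving
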
def process_recap_pdf(self, pk):
    """Process an uploaded PDF from the RECAP API endpoint and save it to the
    database.

    :param pk: The PK of the processing queue item you want to work on.
    :return: A RECAPDocument object that was created or updated.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)

    if pq.attachment_number is None:
        document_type = RECAPDocument.PACER_DOCUMENT
    else:
        document_type = RECAPDocument.ATTACHMENT

    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))
    try:
        if pq.pacer_case_id:
            rd = RECAPDocument.objects.get(
                docket_entry__docket__pacer_case_id=pq.pacer_case_id,
                pacer_doc_id=pq.pacer_doc_id,
            )
        else:
            # Sometimes we don't have the case ID from PACER. Try to make
            # this work anyway.
            rd = RECAPDocument.objects.get(pacer_doc_id=pq.pacer_doc_id)
    except (RECAPDocument.DoesNotExist, RECAPDocument.MultipleObjectsReturned):
        try:
            d = Docket.objects.get(
                pacer_case_id=pq.pacer_case_id, court_id=pq.court_id
            )
        except Docket.DoesNotExist as exc:
            # No Docket and no RECAPDocument. Do a retry. Hopefully
            # the docket will be in place soon (it could be in a
            # different upload task that hasn't yet been processed).
            logger.warning(
                "Unable to find docket for processing queue '%s'. "
                "Retrying if max_retries is not exceeded." % pq
            )
            error_message = "Unable to find docket for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(
                    pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
                )
                raise self.retry(exc=exc)
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to save '%s'" % pq
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None

        # Got the Docket, attempt to get/create the DocketEntry, and then
        # create the RECAPDocument
        try:
            de = DocketEntry.objects.get(
                docket=d, entry_number=pq.document_number
            )
        except DocketEntry.DoesNotExist as exc:
            logger.warning(
                "Unable to find docket entry for processing "
                "queue '%s'." % pq
            )
            msg = "Unable to find docket entry for item."
            if (self.request.retries == self.max_retries) or pq.debug:
                mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
                return None
            else:
                mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
                raise self.retry(exc=exc)
        else:
            # If we're here, we've got the docket and docket
            # entry, but were unable to find the document by
            # pacer_doc_id. This happens when pacer_doc_id is
            # missing, for example. ∴, try to get the document
            # from the docket entry.
            try:
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    document_type=document_type,
                )
            except (
                RECAPDocument.DoesNotExist,
                RECAPDocument.MultipleObjectsReturned,
            ):
                # Unable to find it. Make a new item.
                rd = RECAPDocument(
                    docket_entry=de,
                    pacer_doc_id=pq.pacer_doc_id,
                    document_type=document_type,
                )

    rd.document_number = pq.document_number
    rd.attachment_number = pq.attachment_number

    # Do the file, finally.
    try:
        content = pq.filepath_local.read()
    except IOError as exc:
        msg = "Internal processing error (%s: %s)." % (exc.errno, exc.strerror)
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    new_sha1 = sha1(content)
    existing_document = all(
        [
            rd.sha1 == new_sha1,
            rd.is_available,
            rd.filepath_local and os.path.isfile(rd.filepath_local.path),
        ]
    )
    if not existing_document:
        # Different sha1, it wasn't available, or it's missing from disk.
        # Move the new file over from the processing queue storage.
        cf = ContentFile(content)
        file_name = get_document_filename(
            rd.docket_entry.docket.court_id,
            rd.docket_entry.docket.pacer_case_id,
            rd.document_number,
            rd.attachment_number,
        )
        if not pq.debug:
            rd.filepath_local.save(file_name, cf, save=False)

            # Do page count and extraction
            extension = rd.filepath_local.path.split(".")[-1]
            rd.page_count = get_page_count(rd.filepath_local.path, extension)
            rd.file_size = rd.filepath_local.size

        rd.ocr_status = None
        rd.is_available = True
        rd.sha1 = new_sha1
        rd.date_upload = now()

    if not pq.debug:
        try:
            rd.save()
        except (IntegrityError, ValidationError):
            msg = "Duplicate key on unique_together constraint"
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            rd.filepath_local.delete(save=False)
            return None

    if not existing_document and not pq.debug:
        extract_recap_pdf(rd.pk)
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(
        pq,
        d_id=rd.docket_entry.docket_id,
        de_id=rd.docket_entry_id,
        rd_id=rd.pk,
    )
    mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True)
    return rd

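# Hedged sketch: process_recap_pdf is driven entirely by a ProcessingQueue
# row, and its retry logic implies it is registered as a bound Celery task, so
# an upload handler would typically just queue it by PK. The helper name is
# illustrative.
def queue_recap_pdf(pq_pk: int) -> None:
    """Queue an uploaded RECAP PDF for processing by its ProcessingQueue PK."""
    # The task retries itself while the docket or docket entry is missing,
    # so a fire-and-forget dispatch is sufficient here.
    process_recap_pdf.delay(pq_pk)
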
def process_recap_attachment(self, pk, tag_names=None):
    """Process an uploaded attachment page from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to
    work on
    :param tag_names: A list of tag names to add to all items created or
    modified in this function.
    :return: None
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    att_page = AttachmentPage(map_cl_to_pacer_id(pq.court_id))
    # Read as bytes, then decode, so this works under Python 3.
    with open(pq.filepath_local.path, "rb") as f:
        text = f.read().decode("utf-8")
    att_page._parse_text(text)
    att_data = att_page.data
    logger.info("Parsing completed for item %s" % pq)

    if att_data == {}:
        # Bad attachment page.
        msg = "Not a valid attachment page upload."
        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
        self.request.chain = None
        return None

    if pq.pacer_case_id in ["undefined", "null"]:
        # Bad data from the client. Fix it with parsed data.
        pq.pacer_case_id = att_data.get("pacer_case_id")
        pq.save()

    # Merge the contents of the data into CL.
    try:
        params = {
            "pacer_doc_id": att_data["pacer_doc_id"],
            "docket_entry__docket__court": pq.court,
        }
        if pq.pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pq.pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned:
        # Unclear how to proceed and we don't want to associate this data
        # with the wrong case. We must punt.
        msg = (
            "Too many documents found when attempting to associate "
            "attachment data"
        )
        mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
        return None
    except RECAPDocument.DoesNotExist as exc:
        msg = "Could not find docket to associate with attachment metadata"
        if (self.request.retries == self.max_retries) or pq.debug:
            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
            return None
        else:
            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
            raise self.retry(exc=exc)

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if att_data["document_number"] is None:
        # Bankruptcy attachment page. Use the document number from the
        # main doc.
        att_data["document_number"] = main_rd.document_number

    rds_created = []
    if not pq.debug:
        # Save the old HTML to the docket entry.
        pacer_file = PacerHtmlFiles(
            content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
        )
        pacer_file.filepath.save(
            "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
            ContentFile(text),
        )

        # Create/update the attachment items.
        tags = []
        if tag_names:
            for tag_name in tag_names:
                tag, _ = Tag.objects.get_or_create(name=tag_name)
                tags.append(tag)
        for attachment in att_data["attachments"]:
            if all(
                [
                    attachment["attachment_number"],
                    # Missing on sealed items.
                    attachment.get("pacer_doc_id", False),
                    # Missing on some restricted docs (see Juriscraper)
                    attachment["page_count"] is not None,
                    attachment["description"],
                ]
            ):
                rd, created = RECAPDocument.objects.update_or_create(
                    docket_entry=de,
                    document_number=att_data["document_number"],
                    attachment_number=attachment["attachment_number"],
                    document_type=RECAPDocument.ATTACHMENT,
                )
                if created:
                    rds_created.append(rd)
                needs_save = False
                for field in ["description", "pacer_doc_id"]:
                    if attachment[field]:
                        setattr(rd, field, attachment[field])
                        needs_save = True

                # Only set page_count and file_size if they're blank, in
                # case we got the real value by measuring.
                if rd.page_count is None:
                    rd.page_count = attachment["page_count"]
                if rd.file_size is None and attachment["file_size_str"]:
                    try:
                        rd.file_size = convert_size_to_bytes(
                            attachment["file_size_str"]
                        )
                    except ValueError:
                        pass

                if needs_save:
                    rd.save()
                if tags:
                    for tag in tags:
                        tag.tag_object(rd)

                # Do *not* do this async — that can cause race conditions.
                add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_pq_successful(pq, d_id=de.docket_id, de_id=de.pk)
    process_orphan_documents(
        rds_created, pq.court_id, main_rd.docket_entry.docket.date_filed
    )
    changed = mark_ia_upload_needed(de.docket)
    if changed:
        de.docket.save()

def merge_attachment_page_data(
    court: Court,
    pacer_case_id: int,
    pacer_doc_id: int,
    document_number: int,
    text: str,
    attachment_dicts: List[Dict[str, Union[int, str]]],
    debug: bool = False,
) -> Tuple[List[RECAPDocument], DocketEntry]:
    """Merge attachment page data into the docket

    :param court: The court object we're working with
    :param pacer_case_id: A PACER case ID
    :param pacer_doc_id: A PACER document ID
    :param document_number: The docket entry number
    :param text: The text of the attachment page
    :param attachment_dicts: A list of Juriscraper-parsed dicts for each
    attachment.
    :param debug: Whether to do saves during this process.
    :return: A list of RECAPDocuments modified or created during the process,
    and the DocketEntry object associated with the RECAPDocuments
    :raises: RECAPDocument.MultipleObjectsReturned, RECAPDocument.DoesNotExist
    """
    try:
        params = {
            "pacer_doc_id": pacer_doc_id,
            "docket_entry__docket__court": court,
        }
        if pacer_case_id:
            params["docket_entry__docket__pacer_case_id"] = pacer_case_id
        main_rd = RECAPDocument.objects.get(**params)
    except RECAPDocument.MultipleObjectsReturned as exc:
        # Unclear how to proceed and we don't want to associate this data
        # with the wrong case. We must punt.
        raise exc
    except RECAPDocument.DoesNotExist as exc:
        # Can't find the docket to associate with the attachment metadata.
        # It may be possible to go look for orphaned documents at this stage
        # and to then add them here, as we do when adding dockets. This need
        # is particularly acute for those that get free look emails and then
        # go to the attachment page.
        raise exc

    # We got the right item. Update/create all the attachments for
    # the docket entry.
    de = main_rd.docket_entry
    if document_number is None:
        # Bankruptcy attachment page. Use the document number from the
        # main doc.
        document_number = main_rd.document_number

    if debug:
        return [], de

    # Save the old HTML to the docket entry.
    pacer_file = PacerHtmlFiles(
        content_object=de, upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE
    )
    pacer_file.filepath.save(
        "attachment_page.html",  # Irrelevant b/c UUIDFileSystemStorage
        ContentFile(text),
    )

    # Create/update the attachment items.
    rds_created = []
    rds_affected = []
    for attachment in attachment_dicts:
        sanity_checks = [
            attachment["attachment_number"],
            # Missing on sealed items.
            attachment.get("pacer_doc_id", False),
            # Missing on some restricted docs (see Juriscraper)
            attachment["page_count"] is not None,
            attachment["description"],
        ]
        if not all(sanity_checks):
            continue

        rd, created = RECAPDocument.objects.update_or_create(
            docket_entry=de,
            document_number=document_number,
            attachment_number=attachment["attachment_number"],
            document_type=RECAPDocument.ATTACHMENT,
        )
        if created:
            rds_created.append(rd)
        rds_affected.append(rd)

        for field in ["description", "pacer_doc_id"]:
            if attachment[field]:
                setattr(rd, field, attachment[field])

        # Only set page_count and file_size if they're blank, in case
        # we got the real value by measuring.
        if rd.page_count is None:
            rd.page_count = attachment["page_count"]
        if rd.file_size is None and attachment["file_size_str"]:
            try:
                rd.file_size = convert_size_to_bytes(
                    attachment["file_size_str"]
                )
            except ValueError:
                pass
        rd.save()

        # Do *not* do this async — that can cause race conditions.
        add_items_to_solr([rd.pk], "search.RECAPDocument")

    mark_ia_upload_needed(de.docket, save_docket=True)
    process_orphan_documents(
        rds_created, court.pk, main_rd.docket_entry.docket.date_filed
    )
    return rds_affected, de

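# Hedged sketch: feeding merge_attachment_page_data from a freshly scraped
# attachment page. The parsing step mirrors process_recap_attachment above;
# the helper name and the raw-HTML argument are illustrative assumptions.
from juriscraper.pacer import AttachmentPage

def merge_scraped_attachment_page(court, html_text):
    """Parse attachment-page HTML and merge it into the matching docket."""
    att_page = AttachmentPage(map_cl_to_pacer_id(court.pk))
    att_page._parse_text(html_text)
    att_data = att_page.data
    return merge_attachment_page_data(
        court,
        att_data["pacer_case_id"],
        att_data["pacer_doc_id"],
        att_data["document_number"],
        html_text,
        att_data["attachments"],
    )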