def lookup_and_save(new, debug=False):
    """Merge new docket info into the database.

    Start by attempting to lookup an existing Docket. If that's not found,
    create a new one. Either way, merge all the attributes of `new` into the
    Docket found, and then save the Docket.

    Returns None if an error occurs, else, return the new or updated Docket.

    :param new: A Docket-like object whose attributes are merged into the
        Docket that is found or created.
    :param debug: If True, merge in memory but skip the final save.
    """
    # First, the most specific lookup: by PACER case ID and court.
    try:
        d = Docket.objects.get(pacer_case_id=new.pacer_case_id,
                               court=new.court)
    except (Docket.DoesNotExist, Docket.MultipleObjectsReturned):
        d = None

    if d is None:
        # Fall back to docket number + court, newest first.
        ds = Docket.objects.filter(
            docket_number=new.docket_number,
            court=new.court,
        ).order_by('-date_filed')
        count = ds.count()
        if count < 1:
            # Can't find it by pacer_case_id or docket_number. Make a new
            # item.
            d = Docket(source=Docket.RECAP)
        elif count == 1:
            # Nailed it!
            d = ds[0]
        elif count > 1:
            # Too many dockets returned. Disambiguate.
            logger.error("Got multiple results while attempting save.")

            # Bug fix: the comprehension below previously iterated with `d`,
            # shadowing (and, under Python 2 scoping, rebinding) the outer
            # docket variable. Renamed to `docket` to avoid the shadow.
            def is_different(docket):
                # True when the candidate has a pacer_case_id that differs
                # from the one we're trying to save.
                return (docket.pacer_case_id and
                        docket.pacer_case_id != new.pacer_case_id)

            if all([is_different(docket) for docket in ds]):
                # All the dockets found match on docket number, but have
                # different pacer_case_ids. This means that the docket has
                # multiple pacer_case_ids in PACER, and we should mirror that
                # in CL by creating a new docket for the new item.
                d = Docket(source=Docket.RECAP)
            else:
                # Just use the most recent docket. Looking at the data, this
                # is OK. Nearly all of these are dockets associated with
                # clusters that can be merged (however, that's a project for
                # clusters).
                d = ds[0]

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER

    # NOTE(review): this copies *every* attribute of `new`, which would
    # include model-internal state (e.g. `_state`, `id`) if `new` is a saved
    # Django model instance — confirm `new` is an unsaved/plain value object
    # before relying on this merge.
    for attr, v in new.__dict__.items():
        setattr(d, attr, v)

    if not debug:
        d.save()
        logger.info("Saved as Docket %s: https://www.courtlistener.com%s" %
                    (d.pk, d.get_absolute_url()))
    return d
def process_recap_docket(self, pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :returns: A dict of the form:

        {
            // The PK of the docket that's created or updated
            'docket_pk': 22,
            // A boolean indicating whether a new docket entry or recap
            // document was created (implying a Solr needs updating).
            'needs_solr_update': True,
        }

    This value is a dict so that it can be ingested in a Celery chain.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')

    if 'History/Documents' in text:
        # Prior to 1.1.8, we did not separate docket history reports into
        # their own upload_type. Alas, we still have some old clients around,
        # so we need to handle those clients here.
        pq.upload_type = pq.DOCKET_HISTORY_REPORT
        pq.save()
        process_recap_docket_history_report(pk)
        self.request.callbacks = None
        return None

    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see
        # Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        self.request.callbacks = None
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity. Note that pacer_case_id is required for Docket
    # uploads.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number'],
                    'pacer_case_id': None}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            self.request.callbacks = None
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pq.pacer_case_id,
            court_id=pq.court_id
        )

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug mode: report success without persisting anything.
        mark_pq_successful(pq, d_id=d.pk)
        self.request.callbacks = None
        return {'docket_pk': d.pk, 'needs_solr_update': False}

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries & documents
    rds_created = []
    needs_solr_update = False
    for docket_entry in docket_data['docket_entries']:
        try:
            de, de_created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq)
            )
            continue

        if de_created:
            needs_solr_update = True

        # Then make the RECAPDocument object. Try to find it. If we do,
        # update the pacer_doc_id field if it's blank. If we can't find it,
        # create it or throw an error.
        params = {
            'docket_entry': de,
            # No attachments when uploading dockets.
            'document_type': RECAPDocument.PACER_DOCUMENT,
            'document_number': docket_entry['document_number'],
        }
        try:
            rd = RECAPDocument.objects.get(**params)
        except RECAPDocument.DoesNotExist:
            rd = RECAPDocument.objects.create(
                pacer_doc_id=docket_entry['pacer_doc_id'],
                is_available=False,
                **params
            )
            rds_created.append(rd)
        except RECAPDocument.MultipleObjectsReturned:
            # Bug fix: this message previously read "number'%s'" (missing
            # space before the quoted value).
            logger.error(
                "Multiple recap documents found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq)
            )
            continue
        else:
            # Backfill a blank pacer_doc_id from the queue item. Bug fix:
            # previously this assignment was made in memory but never saved,
            # so the backfill was silently discarded.
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id
            rd.save()

    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, pq.court_id, d.date_filed)
    mark_pq_successful(pq, d_id=d.pk)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def process_recap_docket(pk):
    """Process an uploaded docket from the RECAP API endpoint.

    :param pk: The primary key of the processing queue item you want to work
    on.
    :return: The docket that's created or updated, or None on failure.
    """
    pq = ProcessingQueue.objects.get(pk=pk)
    mark_pq_status(pq, '', pq.PROCESSING_IN_PROGRESS)
    logger.info("Processing RECAP item (debug is: %s): %s" % (pq.debug, pq))

    report = DocketReport(map_cl_to_pacer_id(pq.court_id))
    text = pq.filepath_local.read().decode('utf-8')
    report._parse_text(text)
    docket_data = report.data
    logger.info("Parsing completed of item %s" % pq)

    if docket_data == {}:
        # Not really a docket. Some sort of invalid document (see
        # Juriscraper).
        msg = "Not a valid docket upload."
        mark_pq_status(pq, msg, pq.INVALID_CONTENT)
        return None

    # Merge the contents of the docket into CL. Attempt several lookups of
    # decreasing specificity.
    d = None
    for kwargs in [{'pacer_case_id': pq.pacer_case_id,
                    'docket_number': docket_data['docket_number']},
                   {'pacer_case_id': pq.pacer_case_id},
                   {'docket_number': docket_data['docket_number']}]:
        try:
            d = Docket.objects.get(court_id=pq.court_id, **kwargs)
            break
        except Docket.DoesNotExist:
            continue
        except Docket.MultipleObjectsReturned:
            msg = "Too many dockets found when trying to look up '%s'" % pq
            mark_pq_status(pq, msg, pq.PROCESSING_FAILED)
            return None

    if d is None:
        # Couldn't find it. Make a new one.
        d = Docket(source=Docket.RECAP,
                   pacer_case_id=pq.pacer_case_id,
                   court_id=pq.court_id)

    # Add RECAP as a source if it's not already.
    if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
        d.source = Docket.RECAP_AND_SCRAPER
    elif d.source == Docket.COLUMBIA:
        d.source = Docket.COLUMBIA_AND_RECAP
    elif d.source == Docket.COLUMBIA_AND_SCRAPER:
        d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    update_docket_metadata(d, docket_data)

    if pq.debug:
        # Debug mode: report success without persisting anything.
        mark_pq_successful(pq, d_id=d.pk)
        return d

    d.save()

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(text),
    )

    # Docket entries
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                })
        except DocketEntry.MultipleObjectsReturned:
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue

        # Then make the RECAPDocument object. Try to find it. If we do,
        # update the pacer_doc_id field if it's blank. If we can't find it,
        # create it or throw an error.
        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # logger.warn is a deprecated alias; use warning.
                logger.warning(
                    "Creating new document with pacer_doc_id of '%s' "
                    "violates unique constraint on pacer_doc_id field." %
                    docket_entry['pacer_doc_id'])
                continue
        except RECAPDocument.MultipleObjectsReturned:
            # Bug fix: this message previously read "number'%s'" (missing
            # space before the quoted value).
            logger.error(
                "Multiple recap documents found for document entry number "
                "'%s' while processing '%s'" %
                (docket_entry['document_number'], pq))
            continue
        else:
            # Backfill a blank pacer_doc_id from the queue item. Bug fix:
            # previously this assignment was made in memory but never saved,
            # so the backfill was silently discarded.
            rd.pacer_doc_id = rd.pacer_doc_id or pq.pacer_doc_id
            rd.save()

    add_parties_and_attorneys(d, docket_data['parties'])
    mark_pq_successful(pq, d_id=d.pk)
    return d