def process_docket_data(d, filepath, report_type):
    """Process docket data from a saved file and merge it into a docket.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or
    docket history report data.
    :param report_type: An UPLOAD_TYPE value saying which kind of report
    the file contains.
    :return: The pk of the updated docket, or None if the report parsed
    to no data.
    :raises NotImplementedError: If report_type is not a supported type.
    """
    # Imported at call time, presumably to avoid a circular import with
    # cl.recap.tasks.
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    else:
        # Previously an unknown report_type fell through this chain and
        # crashed later with a NameError on `report`. Fail clearly instead.
        raise NotImplementedError("Unknown report type: %s" % report_type)
    with open(filepath, 'r') as f:
        # Py2-style: read bytes and decode to unicode explicitly.
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if not data:
        # Nothing parsed; leave the docket untouched.
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        # Only these report types include party data.
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def test_new_lacks_terminated_entities_old_has_them(self):
    """When the old data has terminated parties that the new data lacks,
    do we keep the terminated ones while still disassociating extraneous,
    non-terminated parties missing from the new data?
    """
    # A terminated attorney that the new data doesn't mention.
    terminated_attorney = Attorney.objects.create(name="Robert Mueller")
    Role.objects.create(
        docket=self.d,
        attorney=terminated_attorney,
        party=self.p,
        role=Role.TERMINATED,
        date_action=date(2018, 3, 16),
    )

    # A terminated party that the new data doesn't mention.
    terminated_party = Party.objects.create(name='Zainab Ahmad')
    PartyType.objects.create(
        docket=self.d,
        party=terminated_party,
        name="plaintiff",
        date_terminated=date(2018, 11, 4),
    )

    # Strip termination info from the new data.
    self.new_mccarthy_data['date_terminated'] = None
    add_parties_and_attorneys(self.d, self.new_party_data)

    # Three parties: Powell and McCarthy from the new data, plus Ahmad
    # kept from the old because she is terminated. The extraneous party
    # has been removed; terminated parties have not.
    self.assertEqual(self.d.parties.count(), 3)

    # Powell now has two attorneys, Robert Mueller and self.a. The rest
    # are extraneous or lack attorneys.
    self.assertEqual(Role.objects.filter(docket=self.d).count(), 2)
def reprocess_docket_data(d, filepath, report_type):
    """Reprocess docket data that we already have.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or
    docket history report data.
    :param report_type: Whether it's a docket (DOCKET) or a docket history
    report (DOCKET_HISTORY_REPORT).
    :return: The pk of the updated docket, or None if the report parsed
    to no data.
    :raises NotImplementedError: If report_type is not a supported type.
    """
    # Imported at call time, presumably to avoid a circular import with
    # cl.recap.tasks.
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys
    if report_type == DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    else:
        # Previously an unknown report_type fell through this chain and
        # crashed later with a NameError on `report`. Fail clearly instead.
        raise NotImplementedError("Unknown report type: %s" % report_type)
    with open(filepath, 'r') as f:
        # Py2-style: read bytes and decode to unicode explicitly.
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if not data:
        # Nothing parsed; leave the docket untouched.
        return None
    update_docket_metadata(d, data)
    d.save()
    add_docket_entries(d, data['docket_entries'])
    if report_type == DOCKET:
        # Only full docket reports contain party data.
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def test_new_has_terminated_entities(self):
    """When the scraped data has terminated entities, is all of the
    existing data updated?
    """
    add_parties_and_attorneys(self.d, self.new_party_data)

    # Only Powell and McCarthy should remain on the docket, implying the
    # extraneous party was disassociated.
    self.assertEqual(self.d.parties.count(), 2)

    # A single role survives: Powell's attorney. Everything else is
    # extraneous or has no attorneys.
    self.assertEqual(Role.objects.filter(docket=self.d).count(), 1)
def test_new_lacks_terminated_entities_old_lacks_too(self):
    """With no terminated entities on either side, is all of the existing
    data updated?
    """
    self.new_mccarthy_data['date_terminated'] = None
    add_parties_and_attorneys(self.d, self.new_party_data)

    # Only Powell and McCarthy should remain on the docket, implying the
    # extraneous party was disassociated.
    self.assertEqual(self.d.parties.count(), 2)

    # A single role survives: Powell's attorney. Everything else is
    # extraneous or has no attorneys.
    self.assertEqual(Role.objects.filter(docket=self.d).count(), 1)
def process_docket_data(d, filepath, report_type):
    """Process docket data from a saved file and merge it into a docket.

    :param d: A docket object to work on.
    :param filepath: The path to a saved HTML file containing docket or
    docket history report data.
    :param report_type: An UPLOAD_TYPE value saying which kind of report
    the file contains.
    :return: The pk of the updated docket, or None if the report parsed
    to no data.
    :raises NotImplementedError: If report_type is not a supported type.
    """
    # Imported at call time, presumably to avoid a circular import with
    # cl.recap.tasks.
    from cl.recap.tasks import update_docket_metadata, add_docket_entries, \
        add_parties_and_attorneys, update_docket_appellate_metadata
    if report_type == UPLOAD_TYPE.DOCKET:
        report = DocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
        report = DocketHistoryReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.APPELLATE_DOCKET:
        report = AppellateDocketReport(map_cl_to_pacer_id(d.court_id))
    elif report_type == UPLOAD_TYPE.IA_XML_FILE:
        report = InternetArchive()
    elif report_type == UPLOAD_TYPE.CASE_REPORT_PAGE:
        report = CaseQuery(map_cl_to_pacer_id(d.court_id))
    else:
        # Previously an unknown report_type fell through this chain and
        # crashed later with a NameError on `report`. Fail clearly instead.
        raise NotImplementedError("Unknown report type: %s" % report_type)
    with open(filepath, 'r') as f:
        # Py2-style: read bytes and decode to unicode explicitly.
        text = f.read().decode('utf-8')
    report._parse_text(text)
    data = report.data
    if not data:
        # Nothing parsed; leave the docket untouched.
        return None
    update_docket_metadata(d, data)
    d, og_info = update_docket_appellate_metadata(d, data)
    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()
    # Some report types (e.g. case query pages) have no docket entries.
    if data.get('docket_entries'):
        add_docket_entries(d, data['docket_entries'])
    if report_type in (UPLOAD_TYPE.DOCKET, UPLOAD_TYPE.APPELLATE_DOCKET,
                       UPLOAD_TYPE.IA_XML_FILE):
        # Only these report types include party data.
        add_parties_and_attorneys(d, data['parties'])
    return d.pk
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    NOTE(review): other chunks of this file define newer variants of this
    same function (cookies/tag_names signatures) — confirm which one is
    actually live before editing.

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the docket pk and a 'needs_solr_update' flag.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    # Look up an existing docket; both "no match" and "multiple matches"
    # leave d = None so find_docket_object() is consulted after parsing.
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            # Several candidate dockets matched; keep the oldest one.
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # Rebinds `tag` from a name (str) to a Tag instance; the Tag
        # object is what gets passed to add_docket_entries below.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, needs_solr_update = add_docket_entries(
        d, docket_data['docket_entries'], tag=tag)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
def get_appellate_docket_by_docket_number(self, docket_number, court_id,
                                          cookies, tag_names=None, **kwargs):
    """Get a docket by docket number, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    NOTE(review): uses self.request.retries / self.retry — this looks like
    a bound Celery task; confirm the decorator in the surrounding file.

    :param docket_number: The docket number of the case.
    :param court_id: A courtlistener/PACER appellate court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :param tag_names: The tag name that should be stored with the item in
    the DB, if desired.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the docket pk and a 'content_updated' flag, or
    None on failure.
    """
    s = PacerSession(cookies=cookies)
    report = AppellateDocketReport(court_id, s)
    logging_id = "%s - %s" % (court_id, docket_number)
    logger.info("Querying docket report %s", logging_id)

    try:
        report.query(docket_number, **kwargs)
    except requests.RequestException as e:
        logger.warning("Problem getting docket %s", logging_id)
        if self.request.retries == self.max_retries:
            # Out of retries: break the chain and give up.
            self.request.callbacks = None
            return None
        raise self.retry(exc=e)

    docket_data = report.data
    logger.info('Querying and parsing complete for %s', logging_id)

    if docket_data == {}:
        # Nothing parsed; break the chain and give up.
        logger.info("Unable to find docket: %s", logging_id)
        self.request.callbacks = None
        return None

    # Look up an existing docket; "no match" and "multiple matches" both
    # leave d = None so find_docket_object() is consulted next.
    try:
        d = Docket.objects.get(
            docket_number=docket_number,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is None:
        d, count = find_docket_object(court_id, docket_number,
                                      docket_number)
        if count > 1:
            # Several candidate dockets matched; keep the oldest one.
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d, og_info = update_docket_appellate_metadata(d, docket_data)
    if not d.pacer_case_id:
        # Fall back to the docket number when no PACER case id is known.
        d.pacer_case_id = docket_number

    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info

    d.save()
    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Save the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.APPELLATE_DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
def get_docket_by_pacer_case_id(self, data, court_id, cookies,
                                tag_names=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in
                  PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid
                  lookups if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies
    of a logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the
    item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict indicating if we need to update Solr.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        # A None payload ends any chained tasks rather than crashing them.
        logger.info("Empty data argument. Terminating "
                    "chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    if data.get('docket_pk') is not None:
        # Caller already knows the docket; skip the lookup.
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        # "No match" and "multiple matches" both leave d = None so
        # find_docket_object() is consulted after parsing.
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except Docket.DoesNotExist:
            d = None
        except Docket.MultipleObjectsReturned:
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    if not docket_data:
        # Nothing parsed; break the chain and give up.
        logger.info("No valid docket data for %s.%s", court_id,
                    pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            # Several candidate dockets matched; keep the oldest one.
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    NOTE(review): this appears to be the oldest of several variants of
    this function in the file (it merges docket entries inline rather
    than via add_docket_entries); confirm which variant is live.

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: The updated Docket, or None on a multiple-match error.
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))
    # Look up an existing docket; "no match" and "multiple matches" both
    # leave d = None and are retried with a broader query below.
    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)

    # Short-circuit keeps first_missing_id from being read when d is None.
    if d is not None and first_missing_id > 1:
        # We don't have to get the whole thing!
        kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" %
                (court_id, pacer_case_id))

    # Merge the contents into CL.
    try:
        if d is None:
            # Broader lookup: match on either the PACER case id or the
            # parsed docket number within this court.
            d = Docket.objects.get(
                Q(pacer_case_id=pacer_case_id) |
                Q(docket_number=docket_data['docket_number']),
                court_id=court_id,
            )
        # Add RECAP as a source if it's not already.
        if d.source in [Docket.DEFAULT, Docket.SCRAPER]:
            d.source = Docket.RECAP_AND_SCRAPER
        elif d.source == Docket.COLUMBIA:
            d.source = Docket.COLUMBIA_AND_RECAP
        elif d.source == Docket.COLUMBIA_AND_SCRAPER:
            d.source = Docket.COLUMBIA_AND_RECAP_AND_SCRAPER
    except Docket.DoesNotExist:
        # No docket anywhere; make a fresh one (saved below).
        d = Docket(
            source=Docket.RECAP,
            pacer_case_id=pacer_case_id,
            court_id=court_id
        )
    except Docket.MultipleObjectsReturned:
        logger.error("Too many dockets returned when trying to look up "
                     "'%s.%s'" % (court_id, pacer_case_id))
        return None

    update_docket_metadata(d, docket_data)
    d.save()
    if tag is not None:
        # Rebinds `tag` from a name (str) to a Tag instance.
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    # Merge each parsed docket entry and its main PACER document.
    for docket_entry in docket_data['docket_entries']:
        try:
            de, created = DocketEntry.objects.update_or_create(
                docket=d,
                entry_number=docket_entry['document_number'],
                defaults={
                    'description': docket_entry['description'],
                    'date_filed': docket_entry['date_filed'],
                }
            )
        except DocketEntry.MultipleObjectsReturned:
            # Ambiguous entry; log and skip it rather than guessing.
            logger.error(
                "Multiple docket entries found for document entry number "
                "'%s' while processing '%s.%s'" %
                (docket_entry['document_number'], court_id, pacer_case_id)
            )
            continue
        else:
            if tag is not None:
                de.tags.add(tag)

        try:
            rd = RECAPDocument.objects.get(
                docket_entry=de,
                # No attachments when uploading dockets.
                document_type=RECAPDocument.PACER_DOCUMENT,
                document_number=docket_entry['document_number'],
            )
        except RECAPDocument.DoesNotExist:
            try:
                rd = RECAPDocument.objects.create(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                    pacer_doc_id=docket_entry['pacer_doc_id'],
                    is_available=False,
                )
            except IntegrityError:
                # Race condition. The item was created after our get
                # failed; fetch the winner.
                rd = RECAPDocument.objects.get(
                    docket_entry=de,
                    # No attachments when uploading dockets.
                    document_type=RECAPDocument.PACER_DOCUMENT,
                    document_number=docket_entry['document_number'],
                )
        except RECAPDocument.MultipleObjectsReturned:
            logger.error(
                "Multiple recap documents found for document entry "
                "number: '%s', docket: %s" %
                (docket_entry['document_number'], d)
            )
            continue

        # Keep an existing pacer_doc_id; only fill it in when missing.
        # NOTE(review): rd is never save()d after this assignment, so for
        # pre-existing rows the new pacer_doc_id may not be persisted —
        # confirm whether that's intentional.
        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry['pacer_doc_id']
        if tag is not None:
            rd.tags.add(tag)

    add_parties_and_attorneys(d, docket_data['parties'])
    logger.info("Created/updated docket: %s" % d)
    return d