def merge_rss_feed_contents(rss_feed, court_pk, feed_status_pk):
    """Merge the RSS feed contents into CourtListener

    :param rss_feed: A PacerRssFeed object that has already queried the feed.
    :param court_pk: The CourtListener court ID.
    :param feed_status_pk: The CL ID for the RSS status object.
    :returns: A dict with two keys: d_pks_to_alert, a list of (docket PK,
    start_time) pairs for the alerts to send, and rds_for_solr, the PKs of
    all the RECAPDocuments created during the processing.
    """
    start_time = now()
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)
    rss_feed.parse()
    logger.info("%s: Got %s results to merge." % (feed_status.court_id,
                                                  len(rss_feed.data)))

    # RSS feeds are a list of normal Juriscraper docket objects.
    all_rds_created = []
    d_pks_to_alert = []
    for docket in rss_feed.data:
        item_hash = hash_item(docket)
        if is_cached(item_hash):
            continue

        with transaction.atomic():
            cached_ok = cache_hash(item_hash)
            if not cached_ok:
                # The item is already in the cache, ergo it's getting
                # processed in another thread/process and we had a race
                # condition.
                continue

            d, docket_count = find_docket_object(
                court_pk, docket['pacer_case_id'], docket['docket_number'])
            if docket_count > 1:
                logger.info("Found %s dockets during lookup. Choosing "
                            "oldest." % docket_count)
                d = d.earliest('date_created')

            add_recap_source(d)
            update_docket_metadata(d, docket)
            if not d.pacer_case_id:
                d.pacer_case_id = docket['pacer_case_id']
            d.save()
            rds_created, content_updated = add_docket_entries(
                d, docket['docket_entries'])

        if content_updated and docket_count > 0:
            newly_enqueued = enqueue_docket_alert(d.pk)
            if newly_enqueued:
                d_pks_to_alert.append((d.pk, start_time))
        all_rds_created.extend([rd.pk for rd in rds_created])

    logger.info("%s: Sending %s new RECAP documents to Solr for indexing and "
                "sending %s dockets for alerts.", feed_status.court_id,
                len(all_rds_created), len(d_pks_to_alert))
    return {'d_pks_to_alert': d_pks_to_alert,
            'rds_for_solr': all_rds_created}
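
# A minimal usage sketch for the task above: in a feed-scraping pipeline,
# merge_rss_feed_contents would typically sit in a Celery chain between a
# task that queries the feed and tasks that send alerts and index results.
# This assumes the functions in this module are registered as Celery tasks;
# check_if_feed_changed and send_docket_alerts are hypothetical placeholder
# names for the producer/consumer tasks, not definitions from this module.
def _example_rss_merge_chain(court_pk, feed_status_pk, date_last_built):
    from celery import chain

    return chain(
        # Returns a queried PacerRssFeed, which Celery prepends to the args
        # of the next task in the chain.
        check_if_feed_changed.s(court_pk, feed_status_pk, date_last_built),
        merge_rss_feed_contents.s(court_pk, feed_status_pk),
        # Consumes the {'d_pks_to_alert': ..., 'rds_for_solr': ...} dict.
        send_docket_alerts.s(),
    ).apply_async()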
def get_docket_by_pacer_case_id(self, pacer_case_id, court_id, session,
                                tag=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param pacer_case_id: The internal case ID of the item in PACER.
    :param court_id: A courtlistener court ID.
    :param session: A valid PacerSession object.
    :param tag: The tag name that should be stored with the item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    """
    report = DocketReport(map_cl_to_pacer_id(court_id), session)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    if tag is not None:
        tag, _ = Tag.objects.get_or_create(name=tag)
        d.tags.add(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d, upload_type=DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, needs_solr_update = add_docket_entries(
        d, docket_data['docket_entries'], tag=tag)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'needs_solr_update': bool(rds_created or needs_solr_update),
    }
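
# A hedged example of invoking the session-based variant above (a
# cookies-based variant with the same name appears below). It assumes the
# function is registered with @app.task(bind=True), since its signature
# takes self. Extra keyword args flow through **kwargs to
# DocketReport.query(); show_parties_and_counsel and
# show_terminated_parties are standard Juriscraper query options. The IDs
# and tag are made up for illustration.
def _example_session_docket_fetch(pacer_session):
    return get_docket_by_pacer_case_id.delay(
        '56120', 'dcd', pacer_session,
        tag='my-scrape',
        show_parties_and_counsel=True,
        show_terminated_parties=True,
    )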
def get_appellate_docket_by_docket_number(self, docket_number, court_id,
                                          cookies, tag_names=None, **kwargs):
    """Get a docket by docket number, CL court ID, and a collection of
    kwargs that can be passed to the AppellateDocketReport query.

    For details of acceptable parameters, see AppellateDocketReport.query()

    :param docket_number: The docket number of the case.
    :param court_id: A courtlistener/PACER appellate court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of
    a logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the
    item in the DB, if desired.
    :param kwargs: A variety of keyword args to pass to
    AppellateDocketReport.query().
    :return: A dict with the docket PK and whether content was updated.
    """
    s = PacerSession(cookies=cookies)
    report = AppellateDocketReport(court_id, s)
    logging_id = "%s - %s" % (court_id, docket_number)
    logger.info("Querying docket report %s", logging_id)

    try:
        report.query(docket_number, **kwargs)
    except requests.RequestException as e:
        logger.warning("Problem getting docket %s", logging_id)
        if self.request.retries == self.max_retries:
            self.request.callbacks = None
            return None
        raise self.retry(exc=e)

    docket_data = report.data
    logger.info('Querying and parsing complete for %s', logging_id)

    if docket_data == {}:
        logger.info("Unable to find docket: %s", logging_id)
        self.request.callbacks = None
        return None

    try:
        d = Docket.objects.get(
            docket_number=docket_number,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    if d is None:
        d, count = find_docket_object(court_id, docket_number, docket_number)
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d, og_info = update_docket_appellate_metadata(d, docket_data)
    if not d.pacer_case_id:
        d.pacer_case_id = docket_number

    if og_info is not None:
        og_info.save()
        d.originating_court_information = og_info
    d.save()

    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Save the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.APPELLATE_DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
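
# A sketch of chaining the appellate task above into a follow-up step,
# assuming it is registered as a bound Celery task (the body uses
# self.retry and self.request). add_or_update_recap_docket is a
# hypothetical placeholder for whatever task consumes the returned
# {'docket_pk': ..., 'content_updated': ...} dict; the docket number and
# court are made up.
def _example_appellate_chain(cookies):
    from celery import chain

    return chain(
        get_appellate_docket_by_docket_number.s(
            '17-1234', 'ca1', cookies, tag_names=['appellate-project']),
        add_or_update_recap_docket.s(),
    ).apply_async()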
def get_docket_by_pacer_case_id(self, data, court_id, cookies,
                                tag_names=None, **kwargs):
    """Get a docket by PACER case id, CL court ID, and a collection of
    kwargs that can be passed to the DocketReport query.

    For details of acceptable parameters, see DocketReport.query()

    :param data: A dict containing:
        Required: 'pacer_case_id': The internal case ID of the item in
        PACER.
        Optional: 'docket_pk': The ID of the docket to work on to avoid
        lookups if it's known in advance.
    :param court_id: A courtlistener court ID.
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of
    a logged-in PACER user.
    :param tag_names: A list of tag names that should be stored with the
    item in the DB.
    :param kwargs: A variety of keyword args to pass to DocketReport.query().
    :return: A dict with the docket PK and whether content was updated.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = DocketReport(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying docket report %s.%s" % (court_id, pacer_case_id))

    if data.get('docket_pk') is not None:
        d = Docket.objects.get(pk=data['docket_pk'])
    else:
        try:
            d = Docket.objects.get(
                pacer_case_id=pacer_case_id,
                court_id=court_id,
            )
        except Docket.DoesNotExist:
            d = None
        except Docket.MultipleObjectsReturned:
            d = None

    if d is not None:
        first_missing_id = get_first_missing_de_number(d)
        if first_missing_id > 1:
            # We don't have to get the whole thing!
            kwargs.setdefault('doc_num_start', first_missing_id)

    report.query(pacer_case_id, **kwargs)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id,
                    pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.DOCKET)
    pacer_file.filepath.save(
        'docket.html',  # We only care about the ext w/UUIDFileSystemStorage
        ContentFile(report.response.text),
    )

    rds_created, content_updated = add_docket_entries(
        d, docket_data['docket_entries'], tags=tags)
    add_parties_and_attorneys(d, docket_data['parties'])
    process_orphan_documents(rds_created, d.court_id, d.date_filed)
    logger.info("Created/updated docket: %s" % d)
    return {
        'docket_pk': d.pk,
        'content_updated': bool(rds_created or content_updated),
    }
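
# The cookies-based variant above takes its first argument as a dict so it
# can sit downstream of a task that resolves the pacer_case_id (a None
# result short-circuits the chain). A hedged example calling it directly
# with a literal dict; the IDs and tag are illustrative only, and the task
# is assumed to be registered with @app.task(bind=True).
def _example_docket_by_case_id(cookies):
    return get_docket_by_pacer_case_id.delay(
        {'pacer_case_id': '56120'}, 'dcd', cookies,
        tag_names=['my-project'],
    )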
def do_case_query_by_pacer_case_id(self, data, court_id, cookies,
                                   tag_names=None):
    """Run a case query (iquery.pl) on a case and save the data

    :param data: A dict containing at least the following: {
        'pacer_case_id': The internal pacer case ID for the item.
    }
    :param court_id: A courtlistener court ID
    :param cookies: A requests.cookies.RequestsCookieJar with the cookies of
    a logged-in PACER user.
    :param tag_names: A list of tag names to associate with the docket when
    saving it in the DB.
    :return: A dict with the pacer_case_id and docket_pk values.
    """
    s = PacerSession(cookies=cookies)
    if data is None:
        logger.info("Empty data argument. Terminating chains and exiting.")
        self.request.callbacks = None
        return

    pacer_case_id = data.get('pacer_case_id')
    report = CaseQuery(map_cl_to_pacer_id(court_id), s)
    logger.info("Querying case report %s.%s" % (court_id, pacer_case_id))

    try:
        d = Docket.objects.get(
            pacer_case_id=pacer_case_id,
            court_id=court_id,
        )
    except Docket.DoesNotExist:
        d = None
    except Docket.MultipleObjectsReturned:
        d = None

    report.query(pacer_case_id)
    docket_data = report.data
    logger.info("Querying and parsing complete for %s.%s" % (court_id,
                                                             pacer_case_id))

    if not docket_data:
        logger.info("No valid docket data for %s.%s", court_id,
                    pacer_case_id)
        self.request.callbacks = None
        return

    # Merge the contents into CL.
    if d is None:
        d, count = find_docket_object(court_id, pacer_case_id,
                                      docket_data['docket_number'])
        if count > 1:
            d = d.earliest('date_created')

    add_recap_source(d)
    update_docket_metadata(d, docket_data)
    d.save()

    tags = []
    if tag_names is not None:
        for tag_name in tag_names:
            tag, _ = Tag.objects.get_or_create(name=tag_name)
            tag.tag_object(d)
            tags.append(tag)

    # Add the HTML to the docket in case we need it someday.
    pacer_file = PacerHtmlFiles(content_object=d,
                                upload_type=UPLOAD_TYPE.CASE_REPORT_PAGE)
    pacer_file.filepath.save(
        # We only care about the ext w/UUIDFileSystemStorage
        'case_report.html',
        ContentFile(report.response.text),
    )

    logger.info("Created/updated docket: %s" % d)
    return {
        'pacer_case_id': pacer_case_id,
        'docket_pk': d.pk,
    }
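
# A sketch for the iquery.pl task above. The case query report carries only
# case metadata (note there is no add_docket_entries call), so this is a
# cheap way to create or refresh a bare Docket; the returned dict keeps both
# IDs available to downstream tasks. Assumes registration as a bound Celery
# task; the IDs and tag are made up.
def _example_case_query(cookies):
    return do_case_query_by_pacer_case_id.delay(
        {'pacer_case_id': '56120'}, 'dcd', cookies,
        tag_names=['iquery-sweep'],
    )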