def reprocess_item(self, metadata_only=False, index=True):
    """Reprocess the RSS feed

    :param metadata_only: If True, only do the metadata, not the docket
    entries.
    :param index: Whether to save to Solr (note that none will be sent
    when doing metadata only since no entries are modified).
    """
    from cl.recap_rss.tasks import merge_rss_feed_contents
    from cl.search.tasks import add_items_to_solr

    rss_feed = PacerRssFeed(map_cl_to_pacer_id(self.court_id))
    rss_feed._parse_text(self.file_contents)
    response = merge_rss_feed_contents(
        rss_feed.data, self.court_id, metadata_only
    )
    if index:
        add_items_to_solr(
            response.get("rds_for_solr", []), "search.RECAPDocument"
        )
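A minimal usage sketch, assuming reprocess_item() is a method on the RssFeedData model that appears later in this section; the import path, queryset filter, and iterator() call are illustrative only, not part of the method above.

def reprocess_court_feeds(court_id, metadata_only=False):
    """Reprocess every stored RSS feed for a single court (illustrative)."""
    from cl.recap_rss.models import RssFeedData  # assumed import path

    for feed in RssFeedData.objects.filter(court_id=court_id).iterator():
        # Skip Solr indexing when only metadata is touched; per the
        # docstring above, no documents are sent to Solr in that case.
        feed.reprocess_item(metadata_only=metadata_only,
                            index=not metadata_only)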
def test_rss_feed_ingestion(self):
    """Can we ingest RSS feeds without creating duplicates?"""
    court_id = 'scotus'
    rss_feed = PacerRssFeed(court_id)
    rss_feed.is_bankruptcy = True  # Needed because we say SCOTUS above.
    with open(self.make_path('rss_sample_unnumbered_mdb.xml'), 'rb') as f:
        text = f.read().decode('utf-8')
    rss_feed._parse_text(text)
    docket = rss_feed.data[0]
    d, docket_count = find_docket_object(
        court_id, docket['pacer_case_id'], docket['docket_number']
    )
    update_docket_metadata(d, docket)
    d.save()
    self.assertEqual(docket_count, 0)

    expected_count = 1
    add_docket_entries(d, docket['docket_entries'])
    self.assertEqual(d.docket_entries.count(), expected_count)

    # Adding the same entries a second time must not create duplicates.
    add_docket_entries(d, docket['docket_entries'])
    self.assertEqual(d.docket_entries.count(), expected_count)
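The test above depends on add_docket_entries() being idempotent: applying the same entries twice must not create new rows. A minimal sketch of that technique using Django's update_or_create; the DocketEntry model, its field names, and the import path are assumptions for illustration, not the project's actual implementation.

def add_docket_entries_sketch(docket, entries):
    """Merge RSS entries into a docket without creating duplicates."""
    from cl.search.models import DocketEntry  # assumed import path

    for entry in entries:
        # Keying on the docket and entry number makes a second pass with
        # the same data a no-op apart from refreshing the description.
        DocketEntry.objects.update_or_create(
            docket=docket,
            entry_number=entry['document_number'],  # assumed field names
            defaults={'description': entry['description']},
        )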
def check_if_feed_changed(self, court_pk, feed_status_pk, date_last_built):
    """Check if the feed changed

    For now, we do this in a very simple way, by using the lastBuildDate
    field and checking if it differs from the last time we checked. One
    thing that makes this approach suboptimal is that we know the
    `lastBuildDate` field varies around the time that the feeds are
    actually...um, built. For example, we've seen the same feed with two
    different values for this field around the time that it is built. When
    this happens, the two values tend to be off by about a minute or so.

    If we were being very careful and really optimizing when we crawled
    these feeds, this would cause us trouble because we'd detect a change
    in this field when the actual data hadn't changed. But because we only
    crawl the feeds at most once every five minutes, and because the gaps
    we've observed in this field tend to only be about one minute, we can
    get away with this.

    Other solutions/thoughts we can consider later:

     - If the difference between two lastBuildDate values is less than two
       minutes assume it's the same feed.
     - Use hashing of the feed to determine if it has changed.

    One other oddity here is that we use regex parsing to grab the
    lastBuildDate value. This is because parsing the feed properly can
    take several seconds for a big feed.

    :param court_pk: The CL ID for the court object.
    :param feed_status_pk: The CL ID for the status object.
    :param date_last_built: The last time the court was scraped.
    """
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)
    rss_feed = PacerRssFeed(map_cl_to_pacer_id(court_pk))
    try:
        rss_feed.query()
    except requests.RequestException as exc:
        logger.warning("Network error trying to get RSS feed at %s" %
                       rss_feed.url)
        abort_or_retry(self, feed_status, exc)
        return

    content = rss_feed.response.content
    if not content:
        try:
            raise Exception("Empty RSS document returned by PACER: %s" %
                            feed_status.court_id)
        except Exception as exc:
            logger.warning(str(exc))
            abort_or_retry(self, feed_status, exc)
            return

    current_build_date = get_last_build_date(content)
    if current_build_date:
        alert_on_staleness(current_build_date, feed_status.court_id,
                           rss_feed.url)
        feed_status.date_last_build = current_build_date
        feed_status.save()
    else:
        try:
            raise Exception("No last build date in RSS document returned by "
                            "PACER: %s" % feed_status.court_id)
        except Exception as exc:
            logger.warning(str(exc))
            abort_or_retry(self, feed_status, exc)
            return

    # Only check for early abortion during partial crawls.
    if date_last_built == current_build_date and not feed_status.is_sweep:
        logger.info(
            "%s: Feed has not changed since %s. Aborting.",
            feed_status.court_id,
            date_last_built,
        )
        # Abort. Nothing has changed here.
        self.request.chain = None
        mark_status(feed_status, RssFeedStatus.UNCHANGED)
        return

    logger.info("%s: Feed changed or doing a sweep. Moving on to the "
                "merge." % feed_status.court_id)
    rss_feed.parse()
    logger.info("%s: Got %s results to merge." % (feed_status.court_id,
                                                  len(rss_feed.data)))

    # Update RSS entry types in Court table
    update_entry_types(court_pk, rss_feed.feed.feed.description)

    # Save the feed to the DB
    feed_data = RssFeedData(court_id=court_pk)
    try:
        feed_data.filepath.save("rss.xml.bz2",
                                ContentFile(bz2.compress(content)))
    except OSError as exc:
        if exc.errno == errno.EIO:
            abort_or_retry(self, feed_status, exc)
        else:
            raise exc

    return rss_feed.data
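A minimal sketch of the regex shortcut described in the docstring, assuming the feed carries an RFC 822 date inside a <lastBuildDate> element; the real get_last_build_date() may differ. The point is that a targeted search over the raw bytes avoids a full XML parse of a large feed.

import re
from email.utils import parsedate_to_datetime

_BUILD_DATE_RE = re.compile(rb'<lastBuildDate>(?P<date>[^<]+)</lastBuildDate>')

def get_last_build_date_sketch(content):
    """Pull lastBuildDate out of raw feed bytes without parsing the XML."""
    match = _BUILD_DATE_RE.search(content)
    if match is None:
        return None
    try:
        return parsedate_to_datetime(match.group('date').decode().strip())
    except (TypeError, ValueError):
        # Malformed or missing date; callers treat None as "no build date".
        return None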
def check_if_feed_changed(self, court_pk, feed_status_pk, date_last_built):
    """Check if the feed changed

    For now, we do this in a very simple way, by using the lastBuildDate
    field and checking if it differs from the last time we checked. One
    thing that makes this approach suboptimal is that we know the
    `lastBuildDate` field varies around the time that the feeds are
    actually...um, built. For example, we've seen the same feed with two
    different values for this field around the time that it is built. When
    this happens, the two values tend to be off by about a minute or so.

    If we were being very careful and really optimizing when we crawled
    these feeds, this would cause us trouble because we'd detect a change
    in this field when the actual data hadn't changed. But because we only
    crawl the feeds at most once every five minutes, and because the gaps
    we've observed in this field tend to only be about one minute, we can
    get away with this.

    Other solutions/thoughts we can consider later:

     - If the difference between two lastBuildDate values is less than two
       minutes assume it's the same feed.
     - Use hashing of the feed to determine if it has changed.

    One other oddity here is that we use regex parsing to grab the
    lastBuildDate value. This is because parsing the feed properly can
    take several seconds for a big feed.

    :param court_pk: The CL ID for the court object.
    :param feed_status_pk: The CL ID for the status object.
    :param date_last_built: The last time the court was scraped.
    """
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)
    rss_feed = PacerRssFeed(map_cl_to_pacer_id(court_pk))
    try:
        rss_feed.query()
    except requests.RequestException as exc:
        logger.warning("Network error trying to get RSS feed at %s" %
                       rss_feed.url)
        abort_or_retry(self, feed_status, exc)
        return
    else:
        if not rss_feed.response.content:
            try:
                raise Exception("Empty RSS document returned by PACER: %s" %
                                feed_status.court_id)
            except Exception as exc:
                logger.warning(str(exc))
                abort_or_retry(self, feed_status, exc)
                return

    current_build_date = get_last_build_date(rss_feed.response.content)
    if not current_build_date:
        try:
            raise Exception("No last build date in RSS document returned by "
                            "PACER: %s" % feed_status.court_id)
        except Exception as exc:
            logger.warning(str(exc))
            abort_or_retry(self, feed_status, exc)
            return

    # Only check for early abortion during partial crawls.
    if not feed_status.is_sweep:
        # Get the last time this feed was pulled successfully
        if date_last_built == current_build_date:
            logger.info("%s: Feed has not changed since %s. Aborting." % (
                feed_status.court_id, date_last_built))
            # Abort. Nothing has changed here.
            self.request.chain = None
            mark_status(feed_status, RssFeedStatus.UNCHANGED)
            return

    logger.info("%s: Feed changed or doing a sweep. Moving on to the "
                "merge." % feed_status.court_id)
    feed_status.date_last_build = current_build_date
    feed_status.save()

    rss_feed.parse()
    logger.info("%s: Got %s results to merge." % (feed_status.court_id,
                                                  len(rss_feed.data)))
    return rss_feed.data
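A minimal sketch of the hashing alternative listed in the docstring: compare a digest of the feed body instead of lastBuildDate. The last_hash attribute on RssFeedStatus is an assumption for illustration; the model used above does not necessarily define it.

import hashlib
import re

_BUILD_DATE_ELEMENT_RE = re.compile(rb'<lastBuildDate>[^<]*</lastBuildDate>')

def feed_has_changed(content, feed_status):
    """Return True if the feed body differs from the last stored digest."""
    # Drop the lastBuildDate element first, since the docstring notes it
    # jitters around build time even when the entries are unchanged.
    stable_content = _BUILD_DATE_ELEMENT_RE.sub(b'', content)
    current_hash = hashlib.sha256(stable_content).hexdigest()
    if current_hash == getattr(feed_status, 'last_hash', None):  # assumed field
        return False
    feed_status.last_hash = current_hash
    feed_status.save()
    return True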