Example #1
    def reprocess_item(self, metadata_only=False, index=True):
        """Reprocess the RSS feed

        :param metadata_only: If True, only do the metadata, not the docket
        entries.
        :param index: Whether to save the results to Solr (note that nothing
        will be sent when doing metadata only, since no entries are modified).
        """
        from cl.recap_rss.tasks import merge_rss_feed_contents
        from cl.search.tasks import add_items_to_solr

        rss_feed = PacerRssFeed(map_cl_to_pacer_id(self.court_id))
        rss_feed._parse_text(self.file_contents)
        response = merge_rss_feed_contents(rss_feed.data, self.court_id,
                                           metadata_only)
        if index:
            add_items_to_solr(response.get("rds_for_solr", []),
                              "search.RECAPDocument")
Example #2
    def test_rss_feed_ingestion(self):
        """Can we ingest RSS feeds without creating duplicates?"""
        court_id = 'scotus'
        rss_feed = PacerRssFeed(court_id)
        rss_feed.is_bankruptcy = True  # Needed because we say SCOTUS above.
        with open(self.make_path('rss_sample_unnumbered_mdb.xml'),
                  encoding='utf-8') as f:
            text = f.read()
        rss_feed._parse_text(text)
        docket = rss_feed.data[0]
        d, docket_count = find_docket_object(court_id, docket['pacer_case_id'],
                                             docket['docket_number'])
        update_docket_metadata(d, docket)
        d.save()
        self.assertEqual(docket_count, 0)

        expected_count = 1
        add_docket_entries(d, docket['docket_entries'])
        self.assertEqual(d.docket_entries.count(), expected_count)
        add_docket_entries(d, docket['docket_entries'])
        self.assertEqual(d.docket_entries.count(), expected_count)
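
The deduplication this test relies on comes from add_docket_entries keying
each entry on a stable identifier. A rough sketch of that upsert pattern
using Django's update_or_create; the field names are assumptions for
illustration, and this is not the real implementation:

def add_docket_entries_sketch(docket, entries):
    # Upsert each entry by a stable key so that re-running ingestion on the
    # same feed cannot create duplicate rows. The 'document_number' and
    # 'description' keys are assumed here.
    for entry in entries:
        docket.docket_entries.update_or_create(
            entry_number=entry["document_number"],
            defaults={"description": entry.get("description", "")},
        )
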
Example #3
def check_if_feed_changed(self, court_pk, feed_status_pk, date_last_built):
    """Check if the feed changed

    For now, we do this in a very simple way, by using the lastBuildDate field
    and checking if it differs from the last time we checked. One thing that
    makes this approach suboptimal is that we know the `lastBuildDate` field
    varies around the time that the feeds are actually...um, built. For
    example, we've seen the same feed with two different values for this field
    around the time that it is built. When this happens, the two values tend to
    be off by about a minute or so.

    If we were being very careful and really optimizing when we crawled these
    feeds, this would cause us trouble because we'd detect a change in this
    field when the actual data hadn't changed. But because we only crawl the
    feeds at most once every five minutes, and because the gaps we've observed
    in this field tend to only be about one minute, we can get away with this.

    Other solutions/thoughts we can consider later:

     - If the difference between two lastBuildDate values is less than two
       minutes assume it's the same feed.
     - Use hashing of the feed to determine if it has changed.

    One other oddity here is that we use regex parsing to grab the
    lastBuildDate value. This is because parsing the feed properly can take
    several seconds for a big feed.

    :param court_pk: The CL ID for the court object.
    :param feed_status_pk: The CL ID for the status object.
    :param date_last_built: The last time the court was scraped.
    """
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)
    rss_feed = PacerRssFeed(map_cl_to_pacer_id(court_pk))
    try:
        rss_feed.query()
    except requests.RequestException as exc:
        logger.warning("Network error trying to get RSS feed at %s" %
                       rss_feed.url)
        abort_or_retry(self, feed_status, exc)
        return

    content = rss_feed.response.content
    if not content:
        try:
            raise Exception("Empty RSS document returned by PACER: %s" %
                            feed_status.court_id)
        except Exception as exc:
            logger.warning(str(exc))
            abort_or_retry(self, feed_status, exc)
            return

    current_build_date = get_last_build_date(content)
    if current_build_date:
        alert_on_staleness(current_build_date, feed_status.court_id,
                           rss_feed.url)
        feed_status.date_last_build = current_build_date
        feed_status.save()
    else:
        try:
            raise Exception("No last build date in RSS document returned by "
                            "PACER: %s" % feed_status.court_id)
        except Exception as exc:
            logger.warning(str(exc))
            abort_or_retry(self, feed_status, exc)
            return

    # Check for an early abort, but only during partial crawls.
    if date_last_built == current_build_date and not feed_status.is_sweep:
        logger.info(
            "%s: Feed has not changed since %s. Aborting.",
            feed_status.court_id,
            date_last_built,
        )
        # Abort. Nothing has changed here.
        self.request.chain = None
        mark_status(feed_status, RssFeedStatus.UNCHANGED)
        return

    logger.info("%s: Feed changed or doing a sweep. Moving on to the merge." %
                feed_status.court_id)
    rss_feed.parse()
    logger.info("%s: Got %s results to merge." %
                (feed_status.court_id, len(rss_feed.data)))

    # Update RSS entry types in Court table
    update_entry_types(court_pk, rss_feed.feed.feed.description)

    # Save the feed to the DB
    feed_data = RssFeedData(court_id=court_pk)
    try:
        feed_data.filepath.save("rss.xml.bz2",
                                ContentFile(bz2.compress(content)))
    except OSError as exc:
        if exc.errno == errno.EIO:
            abort_or_retry(self, feed_status, exc)
        else:
            raise

    return rss_feed.data
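
The docstring above mentions grabbing the lastBuildDate value with a regex
rather than a full feed parse. A minimal sketch of that idea, assuming
well-formed PACER RSS; the real get_last_build_date helper may differ:

import re
from dateutil import parser as date_parser

def get_last_build_date_sketch(content):
    # Pull the first <lastBuildDate> element without parsing the whole
    # (possibly multi-megabyte) feed, then parse just that timestamp.
    if isinstance(content, bytes):
        content = content.decode("utf-8", errors="ignore")
    match = re.search(r"<lastBuildDate>([^<]+)</lastBuildDate>", content)
    if match is None:
        return None
    return date_parser.parse(match.group(1))
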
Example #4
def check_if_feed_changed(self, court_pk, feed_status_pk, date_last_built):
    """Check if the feed changed

    For now, we do this in a very simple way, by using the lastBuildDate field
    and checking if it differs from the last time we checked. One thing that
    makes this approach suboptimal is that we know the `lastBuildDate` field
    varies around the time that the feeds are actually...um, built. For
    example, we've seen the same feed with two different values for this field
    around the time that it is built. When this happens, the two values tend to
    be off by about a minute or so.

    If we were being very careful and really optimizing when we crawled these
    feeds, this would cause us trouble because we'd detect a change in this
    field when the actual data hadn't changed. But because we only crawl the
    feeds at most once every five minutes, and because the gaps we've observed
    in this field tend to only be about one minute, we can get away with this.

    Other solutions/thoughts we can consider later:

     - If the difference between two lastBuildDate values is less than two
       minutes assume it's the same feed.
     - Use hashing of the feed to determine if it has changed.

    One other oddity here is that we use regex parsing to grab the
    lastBuildDate value. This is because parsing the feed properly can take
    several seconds for a big feed.

    :param court_pk: The CL ID for the court object.
    :param feed_status_pk: The CL ID for the status object.
    :param date_last_built: The last time the court was scraped.
    """
    feed_status = RssFeedStatus.objects.get(pk=feed_status_pk)
    rss_feed = PacerRssFeed(map_cl_to_pacer_id(court_pk))
    try:
        rss_feed.query()
    except requests.RequestException as exc:
        logger.warning("Network error trying to get RSS feed at %s" %
                       rss_feed.url)
        abort_or_retry(self, feed_status, exc)
        return
    else:
        if not rss_feed.response.content:
            try:
                raise Exception("Empty RSS document returned by PACER: %s" %
                                feed_status.court_id)
            except Exception as exc:
                logger.warning(str(exc))
                abort_or_retry(self, feed_status, exc)
                return

    current_build_date = get_last_build_date(rss_feed.response.content)
    if not current_build_date:
        try:
            raise Exception("No last build date in RSS document returned by "
                            "PACER: %s" % feed_status.court_id)
        except Exception as exc:
            logger.warning(str(exc))
            abort_or_retry(self, feed_status, exc)
            return

    # Check for an early abort, but only during partial crawls.
    if not feed_status.is_sweep:
        # Get the last time this feed was pulled successfully
        if date_last_built == current_build_date:
            logger.info("%s: Feed has not changed since %s. Aborting.",
                        feed_status.court_id, date_last_built)
            # Abort. Nothing has changed here.
            self.request.chain = None
            mark_status(feed_status, RssFeedStatus.UNCHANGED)
            return

    logger.info("%s: Feed changed or doing a sweep. Moving on to the merge." %
                feed_status.court_id)
    feed_status.date_last_build = current_build_date
    feed_status.save()

    rss_feed.parse()
    logger.info("%s: Got %s results to merge." % (feed_status.court_id,
                                                  len(rss_feed.data)))

    return rss_feed.data
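
The docstring also floats hashing the feed as an alternative change check. A
sketch of that approach, assuming a hypothetical last_hash field on
RssFeedStatus (not present in the code above):

import hashlib

def feed_changed_by_hash(content, feed_status):
    # Compare a digest of the raw feed bytes against the digest stored on
    # the previous crawl; only hit the database when the feed changed.
    current_hash = hashlib.sha256(content).hexdigest()
    if current_hash == feed_status.last_hash:  # last_hash is hypothetical
        return False
    feed_status.last_hash = current_hash
    feed_status.save()
    return True
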