Example 1
    def start_scrape(self, scrape_type):
        """Start new scrape session / query against corrections site

        Retrieves first docket item, enqueues task for initial search
        page scrape to start the new scraping session.

        Args:
            scrape_type: (ScrapeType) The type of scrape to start

        Returns:
            N/A

        """
        docket_item = self.iterate_docket_item(scrape_type)
        scrape_key = ScrapeKey(self.get_region().region_code, scrape_type)
        # Ensure that the topic and subscription are created on start.
        pubsub_helper.create_topic_and_subscription(scrape_key, BATCH_PUBSUB_TYPE)
        if not docket_item:
            logging.error(
                "Found no %s docket items for %s, shutting down.",
                scrape_type,
                self.get_region().region_code,
            )
            sessions.close_session(scrape_key)
            return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )
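A minimal usage sketch, assuming the ingestor returned by the regions helper in the later examples is a scraper exposing this method; the "us_xx" region code is a placeholder and imports are omitted, as in the snippets above:

    # Hypothetical call site: look up the ingestor for a region (as in the
    # _stop_scraper examples below) and kick off a background scrape.
    region_scraper = regions.get_region("us_xx").get_ingestor()
    region_scraper.start_scrape(constants.ScrapeType.BACKGROUND)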
Example 2
    def _stop_scraper(region: str):
        logging.info("Trying to stop scraper for region [%s].", region)
        for scrape_type in scrape_types:
            key = ScrapeKey(region_code=region, scrape_type=scrape_type)
            session = sessions.get_current_session(key)
            if not session:
                logging.info(
                    "No [%s] scrape to stop for region: [%s]", scrape_type,
                    region)
                continue

            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_type,
                                                     respect_is_stoppable)
            if was_stopped:
                closed_sessions = sessions.close_session(key)
                for closed_session in closed_sessions:
                    sessions.update_phase(closed_session,
                                          scrape_phase.ScrapePhase.PERSIST)
                if next_phase:
                    logging.info("Enqueueing %s for region [%s].",
                                 next_phase, region)
                    ScraperCloudTaskManager().create_scraper_phase_task(
                        region_code=region,
                        url=next_phase_url)
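Taken in isolation, the close-and-advance step above reduces to the following sketch; "us_xx" is a placeholder region code, and the call pattern is only what the snippets show (close_session returns the sessions it closed, each of which is then moved to the PERSIST phase):

    # Session bookkeeping only, no scraper involved.
    key = ScrapeKey(region_code="us_xx", scrape_type=constants.ScrapeType.BACKGROUND)
    for closed_session in sessions.close_session(key):
        sessions.update_phase(closed_session, scrape_phase.ScrapePhase.PERSIST)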
Example 3
    def test_close_session(self, mock_datetime, mock_client, mock_query):
        mock_datetime.now.return_value = fixed_now

        key = datastore.key.Key('session', 'key', project=0)
        session = ScrapeSession.new(
            key, start=fixed_now, scrape_type=constants.ScrapeType.SNAPSHOT,
            region='us_sd', phase=scrape_phase.ScrapePhase.SCRAPE,
        )

        wire_sessions_to_query(mock_client, mock_query, [session])
        session.end_time = fixed_now

        scrape_key = ScrapeKey("us_sd", constants.ScrapeType.SNAPSHOT)
        assert to_entities(sessions.close_session(scrape_key)) == \
            to_entities([session])

        mock_client.return_value.put.assert_called_with(session.to_entity())
Example 4
    def _stop_scraper(region: str):
        closed_sessions = []
        for scrape_type in scrape_types:
            closed_sessions.extend(
                sessions.close_session(ScrapeKey(region, scrape_type)))
        for session in closed_sessions:
            sessions.update_phase(session, scrape_phase.ScrapePhase.PERSIST)
        if not closed_sessions:
            return

        was_stopped = False
        try:
            logging.info("Stopping scraper for region [%s].", region)
            region_scraper = regions.get_region(region).get_ingestor()
            was_stopped = region_scraper.stop_scrape(scrape_types,
                                                     respect_is_stoppable)
        finally:
            if next_phase and was_stopped:
                logging.info("Enqueueing %s for region [%s].",
                             next_phase, region)
                queues.enqueue_scraper_phase(region_code=region,
                                             url=next_phase_url)
Example 5
    def test_close_session_nothing_current(self, _mock_client):
        scrape_key = ScrapeKey("us_sd", constants.ScrapeType.BACKGROUND)
        assert not sessions.close_session(scrape_key)
Example 6
    def resume_scrape(self, scrape_type):
        """Resume a stopped scrape from where it left off

        Starts the scraper up again at the same place (roughly) as it had been
        stopped previously. This allows for cron jobs to start/stop scrapers at
        different times of day.

        Args:
            scrape_type: (ScrapeType) Type of scraping to resume

        Returns:
            N/A
        """
        # Note: None of the current scrapers support resumes, so this function
        # doesn't fully work. For instance, content is thrown away.
        if scrape_type is constants.ScrapeType.BACKGROUND:
            # Background scrape

            # In most scrapers, background scrapes will use
            # short-lived docket items. However, some background
            # scrapes use only one docket item to run a giant scrape,
            # which may run for months. Limitations in GAE Pull Queues
            # make it difficult to keep track of a leased task for
            # that long, so we don't try. Resuming a background scrape
            # simply resumes from session data, and the task stays in
            # the docket un-leased. It will get deleted the next time
            # we start a new background scrape.

            recent_sessions = sessions.get_recent_sessions(
                ScrapeKey(self.get_region().region_code, scrape_type)
            )

            last_scraped = None
            for session in recent_sessions:
                if session.last_scraped:
                    last_scraped = session.last_scraped
                    break

            if last_scraped:
                content = last_scraped.split(", ")
            else:
                logging.error(
                    "No earlier session with last_scraped found; " "cannot resume."
                )
                return

        else:
            # Snapshot scrape

            # Get an item from the docket and continue from there. These queries
            # are very quick, so we don't bother trying to resume the same task
            # we left off on.

            content = self.iterate_docket_item(scrape_type)
            if not content:
                sessions.close_session(
                    ScrapeKey(self.get_region().region_code, scrape_type)
                )
                return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )
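A matching sketch for the resume path, under the same assumptions as the start_scrape sketch above (the regions lookup comes from the stop examples; "us_xx" is a placeholder):

    # Hypothetical resume path: look up the region's ingestor again and pick
    # the interrupted background scrape back up from session data.
    region_scraper = regions.get_region("us_xx").get_ingestor()
    region_scraper.resume_scrape(constants.ScrapeType.BACKGROUND)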