Example #1
    def start_scrape(self, scrape_type):
        """Start new scrape session / query against corrections site

        Retrieves the first docket item and enqueues a task for the initial
        search page scrape to start the new scraping session.

        Args:
            scrape_type: (ScrapeType) The type of scrape to start

        Returns:
            N/A

        """
        docket_item = self.iterate_docket_item(scrape_type)
        scrape_key = ScrapeKey(self.get_region().region_code, scrape_type)
        # Ensure that the topic and subscription are created on start.
        pubsub_helper.create_topic_and_subscription(scrape_key, BATCH_PUBSUB_TYPE)
        if not docket_item:
            logging.error(
                "Found no %s docket items for %s, shutting down.",
                scrape_type,
                self.get_region().region_code,
            )
            sessions.close_session(scrape_key)
            return

        self.add_task(
            self.get_initial_task_method(),
            QueueRequest(
                scrape_type=scrape_type,
                scraper_start_time=datetime.now(),
                next_task=self.get_initial_task(),
            ),
        )
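A minimal, self-contained sketch of the same start-up flow, with every collaborator stubbed out; the stand-ins below (get_docket_item, create_pubsub, close_session, add_task) and the "us_xx" region code are assumptions for illustration, not the real recidiviz APIs:

import logging
from datetime import datetime

def start_scrape(region_code, scrape_type, get_docket_item,
                 create_pubsub, close_session, add_task):
    docket_item = get_docket_item(scrape_type)
    scrape_key = (region_code, scrape_type)
    # Ensure the topic and subscription are created on start.
    create_pubsub(scrape_key)
    if not docket_item:
        # Nothing to work on: log, tear down the session, and stop.
        logging.error("Found no %s docket items for %s, shutting down.",
                      scrape_type, region_code)
        close_session(scrape_key)
        return
    # Otherwise hand the first unit of work to the task queue.
    add_task("initial-task", {"scrape_type": scrape_type,
                              "scraper_start_time": datetime.now()})

# Exercising the sketch with trivial stubs:
start_scrape("us_xx", "background",
             get_docket_item=lambda st: {"name": ("SMITH", "")},
             create_pubsub=lambda key: None,
             close_session=lambda key: None,
             add_task=lambda method, req: print(method, req))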
Example #2
def load_background_target_list(scrape_key: ScrapeKey, name_file: str,
                                query_name: Optional[Tuple[str, str]]):
    """Load background scrape docket items, from name file.

    Iterates over a CSV of common names, loading a docket item for the scraper
    to search for each one. We load batches of lines into memory at a time,
    until the entire file has been processed.

    If a name was provided in the initial request, will attempt to only load
    names from the index of that name in the file onward, allowing for
    'resuming' a background scrape from a specific point if there were
    problems. If the provided name is not found in the name file at all, a
    single docket item will be created to search for that name only.

    Args:
        scrape_key: (ScrapeKey) Scrape key
        name_file: (string) Name of the name file to be loaded
        query_name: User-provided name, in the form of a tuple (surname,
            given names). Empty strings if not provided.

    Returns:
        N/A
    """
    futures = []
    # If a query is provided then the names aren't relevant until we find the
    # query name, so `should_write_names` starts as False. If no query is
    # provided then all names should be written.
    should_write_names = not bool(query_name)

    pubsub_helper.create_topic_and_subscription(scrape_key,
                                                pubsub_type=PUBSUB_TYPE)

    with open(name_file, "r") as csvfile:
        names_reader = csv.reader(csvfile)

        for row in names_reader:
            if not row:
                continue

            name = (row[0], "") if len(row) == 1 else tuple(row)
            if not should_write_names:
                # Check to see if this is the `query_name` and if so mark that
                # all further names should be written to the docket.
                should_write_names = name == query_name

            if should_write_names:
                futures.append(_add_to_query_docket(scrape_key, name))

    # The query string was not found, add it as a separate docket item.
    if not should_write_names:
        logging.info(
            "Couldn't find user-provided name [%s] in name list, "
            "adding one-off docket item for the name instead.",
            str(query_name),
        )
        futures.append(_add_to_query_docket(scrape_key, query_name))

    for future in futures:
        future.result()
    logging.info("Finished loading background target list to docket.")
Exemple #3
0
    def test_get_new_docket_item_no_matching_items(self):
        write_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        read_key = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

        pubsub_helper.create_topic_and_subscription(write_key,
                                                    docket.PUBSUB_TYPE)
        docket.add_to_query_docket(write_key, get_payload()).result()

        docket_item = docket.get_new_docket_item(read_key,
                                                 return_immediately=True)
        assert not docket_item
Exemple #4
0
    def test_purge_query_docket_nothing_matching(self):
        scrape_key_purge = ScrapeKey(REGIONS[0],
                                     constants.ScrapeType.BACKGROUND)
        scrape_key_add = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

        pubsub_helper.create_topic_and_subscription(scrape_key_add,
                                                    docket.PUBSUB_TYPE)
        docket.add_to_query_docket(scrape_key_add, get_payload()).result()

        docket.purge_query_docket(scrape_key_purge)
        assert not docket.get_new_docket_item(scrape_key_purge,
                                              return_immediately=True)
Exemple #5
0
    def test_add_to_query_docket_background(self):
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        pubsub_helper.create_topic_and_subscription(scrape_key,
                                                    docket.PUBSUB_TYPE)

        docket.add_to_query_docket(scrape_key, get_payload()[0]).result()
        docket.add_to_query_docket(scrape_key, get_payload()[1]).result()

        items = [
            docket.get_new_docket_item(scrape_key),
            docket.get_new_docket_item(scrape_key),
        ]
        assert len(items) == 2

        for i, item in enumerate(items):
            assert item.message.data.decode() == json.dumps(get_payload()[i])