def batch_delete_ingest_infos_for_region(region: str) -> None:
    """Batch deletes ingest infos for a particular region.

    Args:
        region: (string) Region to delete ingest infos for
    """
    results = _get_ingest_info_entities_for_region(region)
    # The Datastore limit for entity writes in one call is 500. Therefore,
    # divide the entities to delete into chunks of 500.
    if len(results) > 500:
        list_of_chunks = _divide_into_chunks(results=results, chunk_size=500)
        for chunk in list_of_chunks:
            try:
                retry_grpc(
                    NUM_GRPC_RETRIES,
                    ds().delete_multi,
                    [chunk_item.key for chunk_item in chunk],
                )
            except Exception as e:
                raise DatastoreBatchDeleteError(region) from e
    else:
        try:
            retry_grpc(
                NUM_GRPC_RETRIES, ds().delete_multi, [result.key for result in results]
            )
        except Exception as e:
            raise DatastoreBatchDeleteError(region) from e
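The `_divide_into_chunks` helper used above is not part of this listing; a minimal sketch, assuming it simply yields fixed-size slices of the result list (the name and behavior are inferred, not confirmed):

def _divide_into_chunks_sketch(results, chunk_size):
    # Yield successive slices of at most `chunk_size` entities so that each
    # delete_multi call stays under Datastore's 500-entity write limit.
    for i in range(0, len(results), chunk_size):
        yield results[i:i + chunk_size]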
Example #2
def create_session(scrape_key: ScrapeKey) -> ScrapeSession:
    """Creates a new session to allow starting the given scraper.

    Should be called prior to any specific scraping tasks for the region. Ends
    any open sessions for the given scraper and creates a brand new one.

    Args:
        scrape_key: (ScrapeKey) The scraper to setup a new session for
    """
    logging.info("Creating new scrape session for: [%s]", scrape_key)

    # TODO(#1598): We already skip starting a session if a session already
    # exists so we should be able to remove this. We could move the skip to here
    close_session(scrape_key)
    new_session = ScrapeSession.new(ds().key(SCRAPE_SESSION_KIND),
                                    scrape_type=scrape_key.scrape_type,
                                    region=scrape_key.region_code,
                                    phase=scrape_phase.ScrapePhase.START)

    retry_grpc(
        NUM_GRPC_RETRIES,
        ds().put,
        new_session.to_entity()
    )
    return new_session
Example #3
def retry_with_create(scrape_key, fn, pubsub_type):
    try:
        result = retry_grpc(NUM_GRPC_RETRIES, fn)
    except exceptions.NotFound:
        create_topic_and_subscription(scrape_key, pubsub_type=pubsub_type)
        result = retry_grpc(NUM_GRPC_RETRIES, fn)
    return result
def write_ingest_info(
    region: str, task_hash: int, session_start_time: datetime, ingest_info: IngestInfo
) -> BatchIngestInfoData:
    """Writes a new ingest info for a given region.

    Args:
        region: (string) The region the ingest info is getting added for
        task_hash: (int) the hash of the task associated with the ingest info
        session_start_time: (datetime) The start time of the scraper that got
          the ingest info
        ingest_info: (IngestInfo) The ingest info data
    """
    logging.info(
        "Writing a new ingest info (with %d people) for region: [%s]",
        len(ingest_info.get_all_people()),
        region,
    )

    new_ingest_info_entity = _DatastoreIngestInfo.new(
        key=ds().key(INGEST_INFO_KIND),
        session_start_time=session_start_time,
        region=region,
        ingest_info=ingest_info,
        task_hash=task_hash,
    ).to_entity()

    try:
        retry_grpc(NUM_GRPC_RETRIES, ds().put, new_ingest_info_entity)
    except Exception as e:
        raise DatastoreWriteIngestInfoError(ingest_info, region) from e

    return _DatastoreIngestInfo.get_batch_ingest_info_data(new_ingest_info_entity)
Example #5
def create_topic_and_subscription(scrape_key, pubsub_type):
    topic_path = get_topic_path(scrape_key, pubsub_type)
    try:
        logging.info("Creating pubsub topic: '%s'", topic_path)
        retry_grpc(NUM_GRPC_RETRIES,
                   get_publisher().create_topic,
                   name=topic_path)
    except exceptions.AlreadyExists:
        logging.info("Topic already exists")

    # There is sometimes a race condition where the topic does not exist yet,
    # which causes creating the subscription to fail.
    time.sleep(1)
    subscription_path = get_subscription_path(scrape_key, pubsub_type)
    try:
        logging.info("Creating pubsub subscription: '%s'", subscription_path)
        retry_grpc(
            NUM_GRPC_RETRIES,
            get_subscriber().create_subscription,
            name=subscription_path,
            topic=topic_path,
            ack_deadline_seconds=ACK_DEADLINE_SECONDS,
        )
    except exceptions.AlreadyExists:
        logging.info("Subscription already exists")
Example #6
def update_phase(session: ScrapeSession, phase: scrape_phase.ScrapePhase):
    """Updates the phase of the session to the given phase."""
    #  TODO(#1665): remove once dangling PERSIST session investigation
    #   is complete.
    logging.info("Updating phase from %s to %s", session.phase, phase)
    session.phase = phase
    retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())
def write_error(region: str, session_start_time: datetime, error: str,
                trace_id: Optional[str],
                task_hash: int) -> BatchIngestInfoData:
    """Writes a new ingest info for a given region.


       Args:
           region: (string) The region the ingest info is getting added for
           session_start_time: (datetime) The start time of the scraper that
            got the ingest info
           error: (string) the error message
           trace_id: (string) the trace id used to debug
           ingest_info: (IngestInfo) The ingest info data
           task_hash: (int) the hash of the task associated with the error
       """
    logging.info("Writing a new failure for region: [%s]", region)

    new_ingest_info_entity = _DatastoreIngestInfo.new(
        key=ds().key(INGEST_INFO_KIND),
        session_start_time=session_start_time,
        region=region,
        task_hash=task_hash,
        error=error,
        trace_id=trace_id).to_entity()

    try:
        retry_grpc(NUM_GRPC_RETRIES, ds().put, new_ingest_info_entity)
    except Exception as e:
        raise DatastoreErrorWriteError(error, region) from e

    return _DatastoreIngestInfo.get_batch_ingest_info_data(
        new_ingest_info_entity)
Example #8
    def delete_task(self, task: task_pb2.Task) -> None:
        try:
            retry_grpc(self.NUM_GRPC_RETRIES,
                       self.client.delete_task,
                       name=task.name)
        except exceptions.NotFound as e:
            logging.debug('Task not found: [%s]', e)
Example #9
def close_session(scrape_key: ScrapeKey) -> List[ScrapeSession]:
    """Closes any open session for the given scraper.

    Resets relevant session info after a scraping session is concluded and
    before another one starts, including updating the ScrapeSession entity for
    the old session.

    Args:
        scrape_key: (ScrapeKey) The scraper to clean up session info for

    Returns: list of the scrape sessions which were closed
    """
    # TODO(#1598): Much of our code assumes there is only one open session (i.e.
    # all sessions go to the same batch persistence queue). Other code
    # specifically supports there being multiple open sessions, we should
    # reconcile this.
    open_sessions = get_sessions(scrape_key.region_code,
                                 include_closed=False,
                                 scrape_type=scrape_key.scrape_type)

    closed_sessions = []
    for session in open_sessions:
        session.end = datetime.now()
        retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())
        closed_sessions.append(session)

    return closed_sessions
Example #10
def retry_with_create(scrape_key: ScrapeKey, fn: Callable[..., ReturnType],
                      pubsub_type: str) -> ReturnType:
    try:
        result = retry_grpc(NUM_GRPC_RETRIES, fn)
    except exceptions.NotFound:
        create_topic_and_subscription(scrape_key, pubsub_type=pubsub_type)
        result = retry_grpc(NUM_GRPC_RETRIES, fn)
    return result
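A hypothetical usage sketch of retry_with_create: because retry_grpc invokes the callable with no extra arguments here, the gRPC call is wrapped with functools.partial first (the pull arguments below are illustrative assumptions, not taken from this codebase):

from functools import partial

def pull_with_retries(scrape_key: ScrapeKey, pubsub_type: str):
    # Bind the subscription arguments up front so retry_with_create can call
    # the function with no arguments; on NotFound it recreates the topic and
    # subscription and retries once more.
    pull_fn = partial(get_subscriber().pull,
                      subscription=get_subscription_path(scrape_key,
                                                         pubsub_type),
                      max_messages=10)
    return retry_with_create(scrape_key, pull_fn, pubsub_type)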
    def initialize_cloud_task_queue(self, queue_config: queue_pb2.Queue):
        """
        Initializes a task queue with the given config. If a queue with the
        given name already exists, it is updated to have the given config. If
        it does not exist, it will be created by this function.
        """
        # Creates the queue if it does not exist, or updates it to have the
        # given config.
        retry_grpc(self.NUM_GRPC_RETRIES, self.client.update_queue,
                   queue_config)
    def create_task(self,
                    *,
                    task_id: str,
                    queue_name: str,
                    relative_uri: str,
                    body: Optional[Dict[str, str]] = None,
                    schedule_delay_seconds: int = 0,
                    http_method: HttpMethod = HttpMethod.POST):
        """Creates a task with the given details.

        Args:
            task_id: Id of the task to include in the task name
            queue_name: The queue on which to schedule the task
            schedule_delay_seconds: The number of seconds by which to delay the
                scheduling of the given task.
            relative_uri: The relative uri to hit.
            body: Dictionary of values that will be converted to JSON and
            included in the request.
            http_method: The method for this request (i.e. GET or POST)
        """
        task_name = self.format_task_path(queue_name, task_id)

        schedule_timestamp = None
        if schedule_delay_seconds > 0:
            schedule_time_sec = \
                int(datetime.utcnow().timestamp()) + schedule_delay_seconds
            schedule_timestamp = \
                timestamp_pb2.Timestamp(seconds=schedule_time_sec)

        task_builder = ProtobufBuilder(task_pb2.Task).update_args(
            name=task_name,
            app_engine_http_request={
                'relative_uri': relative_uri,
            },
        )

        if schedule_timestamp:
            task_builder.update_args(schedule_time=schedule_timestamp, )

        if http_method is not None:
            task_builder.update_args(app_engine_http_request={
                'http_method': http_method.value,
            }, )

        if http_method in (HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH):
            if body is None:
                body = {}
            task_builder.update_args(
                app_engine_http_request={'body': json.dumps(body).encode()}, )

        task = task_builder.build()

        logging.info("Queueing task to queue [%s]: [%s]", queue_name,
                     task.name)

        retry_grpc(self.NUM_GRPC_RETRIES, self.client.create_task,
                   self.format_queue_path(queue_name), task)
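A hypothetical call to the create_task method above, on an instance of the (unnamed here) client class; the queue name, URI, and body are made-up values for illustration:

cloud_tasks_client.create_task(
    task_id=str(uuid.uuid4()),
    queue_name='scraper-queue',      # assumed queue name
    relative_uri='/scraper/work',    # assumed worker endpoint
    body={'region': 'us_ny'},
    schedule_delay_seconds=60,
    http_method=HttpMethod.POST,
)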
    def test_retry_grpc_raises(self) -> None:
        fn = MagicMock()
        # Always a GOAWAY error
        fn.side_effect = GO_AWAY_ERROR

        with self.assertRaises(exceptions.InternalServerError):
            retry_grpc(3, fn, 1, b=2)

        fn.assert_has_calls([call(1, b=2)] * 4)
    def test_retry_grpc_raises_deadline_exceeded(self) -> None:
        fn = MagicMock()
        # Always a DEADLINE_EXCEEDED error
        fn.side_effect = DEADLINE_EXCEEDED_ERROR

        with self.assertRaises(exceptions.InternalServerError):
            retry_grpc(3, fn, 1, b=2)

        fn.assert_has_calls([call(1, b=2)] * 4)
    def test_retry_grpc_raises_no_goaway(self) -> None:
        fn = MagicMock()
        # Always a different error
        fn.side_effect = OTHER_ERROR

        with self.assertRaises(exceptions.InternalServerError):
            retry_grpc(3, fn, 1, b=2)

        fn.assert_has_calls([call(1, b=2)])
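retry_grpc itself is not included in this listing. Based only on the behavior the tests above exercise (retrying on GOAWAY and DEADLINE_EXCEEDED internal server errors, re-raising other errors immediately, and giving up after the given number of retries), a minimal sketch could look like the following; the marker strings and structure are assumptions, not the real implementation:

from typing import Any, Callable
from google.api_core import exceptions

_RETRIABLE_MARKERS = ('GOAWAY', 'Deadline Exceeded')  # assumed markers

def retry_grpc_sketch(num_retries: int, fn: Callable[..., Any],
                      *args: Any, **kwargs: Any) -> Any:
    # One initial attempt plus up to `num_retries` retries; only errors that
    # look like transient gRPC failures (per the markers above) are retried.
    for attempt in range(num_retries + 1):
        try:
            return fn(*args, **kwargs)
        except exceptions.InternalServerError as e:
            retriable = any(marker in str(e) for marker in _RETRIABLE_MARKERS)
            if not retriable or attempt == num_retries:
                raise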
Example #16
def purge_scrape_tasks(*, region_code: str, queue_name: str):
    """Purge scrape tasks for a given region from its queue.

    Args:
        region_code: `str` region code.
        queue_name: `str` queue name.
    """
    for task in list_scrape_tasks(region_code=region_code,
                                  queue_name=queue_name):
        try:
            retry_grpc(NUM_GRPC_RETRIES, client().delete_task, task.name)
        except exceptions.NotFound as e:
            logging.debug('Task not found: [%s]', e)
Example #17
def add_docket_item_to_session(docket_ack_id: str, session: ScrapeSession) -> bool:
    """Adds docket item to the given session

    Args:
        docket_ack_id: (string) Id used to ack the docket message
        session: (ScrapeSession) The session to add to

    Returns:
        True if successful otherwise False
    """
    session.docket_ack_id = docket_ack_id
    retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())
    return True
Example #18
def remove_docket_item_from_session(session: ScrapeSession) -> str:
    """Removes the docket item from the session.

    Args:
        session: (ScrapeSession) The session to remove from

    Returns:
        Id used to ack the docket message
    """
    docket_ack_id = session.docket_ack_id
    session.docket_ack_id = None
    retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())
    return docket_ack_id
Example #19
def update_phase(session: ScrapeSession, phase: scrape_phase.ScrapePhase):
    """Updates the phase of the session to the given phase."""
    #  TODO(#1665): remove once dangling PERSIST session investigation
    #   is complete.
    logging.info("Updating phase from %s to %s", session.phase, phase)

    previous_phase = session.phase

    session.phase = phase
    retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())

    if previous_phase == scrape_phase.ScrapePhase.RELEASE and \
       phase == scrape_phase.ScrapePhase.DONE:
        jid = regions.get_region(session.region).jurisdiction_id
        store_scraper_success(ScraperSuccess(), jid)
Example #20
    def test_retry_grpc_no_raise(self):
        fn = MagicMock()
        # Two GOAWAY errors, 1 DEADLINE_EXCEEDED, then works
        fn.side_effect = [GO_AWAY_ERROR] * 2 + [DEADLINE_EXCEEDED_ERROR] + [3]

        result = retry_grpc(3, fn, 1, b=2)

        self.assertEqual(result, 3)
        fn.assert_has_calls([call(1, b=2)] * 3)
Example #21
def write_scraped_record(*args, **kwds):
    """Passes any arguments to create a scraped record and writes it to
    datastore

    Should be prior to any specific scraping tasks for the region. Ends any open
    sessions for the given scraper and creates a brand new one.

    Args:
        scrape_key: (ScrapeKey) The scraper to setup a new session for
    """
    new_record = ScrapedRecord.new(ds().key(SCRAPED_RECORD_KIND),
                                   *args, **kwds)

    try:
        retry_grpc(NUM_GRPC_RETRIES, ds().put, new_record.to_entity())
    except Exception as e:
        logging.warning("Couldn't persist ScrapedRecord entry, "
                        "record_id: %s\n%s", new_record['record_id'], e)
Example #22
def enqueue_scraper_phase(*, region_code, url):
    """Add a task to trigger the next phase of a scrape.

    This triggers the phase at the given url for an individual region, passing
    the `region_code` as a url parameter. For example, this can trigger stopping
    a scraper or inferring release for a particular region.
    """
    task = tasks.types.Task(
        app_engine_http_request={
            'http_method': 'GET',
            'relative_uri': '{url}?region={region_code}'.format(
                url=url, region_code=region_code),
        })
    retry_grpc(NUM_GRPC_RETRIES,
               client().create_task, format_queue_path(SCRAPER_PHASE_QUEUE),
               task)
Example #23
def list_tasks_with_prefix(path_prefix: str,
                           queue_name: str) -> List[tasks.types.Task]:
    """List tasks for the given queue with the given task path prefix."""
    return [
        task for task in retry_grpc(NUM_GRPC_RETRIES,
                                    client().list_tasks,
                                    format_queue_path(queue_name))
        if task.name.startswith(path_prefix)
    ]
Example #24
def create_scrape_task(*, region_code, queue_name, url, body):
    """Create a scrape task in a queue.

    Args:
        region_code: `str` region code.
        queue_name: `str` queue name.
        url: `str` App Engine worker url.
        body: `dict` task body to be passed to worker.
    """
    task = tasks.types.Task(name=format_scrape_task_path(
        queue_name, region_code, uuid.uuid4()),
                            app_engine_http_request={
                                'relative_uri': url,
                                'body': json.dumps(body).encode()
                            })

    retry_grpc(NUM_GRPC_RETRIES,
               client().create_task, format_queue_path(queue_name), task)
Example #25
def get_sessions(
    region_code: str,
    include_open: bool = True,
    include_closed: bool = True,
    most_recent_only: bool = False,
    scrape_type: Optional[constants.ScrapeType] = None,
) -> Iterator[ScrapeSession]:
    """Retrieves scrape sessions.

    Retrieves some combination of scrape session entities based on the arguments
    provided.

    If both `include_open` and `include_closed` are False the returned generator
    will be empty.

    Args:
        region_code: (string) Region code to fetch sessions for
        include_open: (bool) Return open sessions
        include_closed: (bool) Return closed sessions
        most_recent_only: (bool) Only return the most recent session entity
        scrape_type: (ScrapeType) Only return sessions of this scrape type

    Returns:
        A generator of ScrapeSessions
    """
    session_query = ds().query(kind=SCRAPE_SESSION_KIND)
    session_query.add_filter("region", "=", region_code)
    session_query.order = ["-start"]

    if not include_closed:
        if include_open:
            session_query.add_filter("end", "=", None)
        else:
            return (_ for _ in ())

    if scrape_type:
        session_query.add_filter("scrape_type", "=", scrape_type.value)

    limit = None
    # If `include_open` is not set we have to filter after fetching the results
    # so we cannot limit to the first result.
    if most_recent_only and include_open:
        limit = 1

    results = retry_grpc(NUM_GRPC_RETRIES, session_query.fetch, limit=limit)

    # Datastore doesn't allow an inequality filter on `end` because we are
    # sorting on `start`, so we have to do the filter after we get the results.
    if not include_open:
        results = (result for result in results if result.get("end"))
        if most_recent_only:
            first_result = next(results, None)
            results = [first_result] if first_result else []

    return _sessions_from_entities(results)
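A hypothetical usage sketch: fetching only the most recent open session for a region, roughly what a helper like get_current_session (referenced elsewhere in this listing) would need. The region code and scrape type below are illustrative assumptions:

open_sessions = get_sessions('us_ny',
                             include_closed=False,
                             most_recent_only=True,
                             scrape_type=constants.ScrapeType.BACKGROUND)
current_session = next(iter(open_sessions), None)
if current_session is None:
    logging.info("No open session found for region [%s]", 'us_ny')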
Example #26
def create_bq_task(table_name: str, module: str, url: str):
    """Create a BigQuery table export path.

    Args:
        table_name: Cloud SQL table to export to BQ. Must be defined in
            the *_TABLES_TO_EXPORT for the given schema.
        module: The module of the table being exported, either 'county' or
            'state'.
        url: App Engine worker URL.
    """
    body = {'table_name': table_name, 'module': module}
    task_id = '{}-{}-{}-{}'.format(table_name, module,
                                   str(datetime.date.today()), uuid.uuid4())
    task_name = format_task_path(BIGQUERY_QUEUE, task_id)
    task = tasks.types.Task(name=task_name,
                            app_engine_http_request={
                                'relative_uri': url,
                                'body': json.dumps(body).encode()
                            })
    retry_grpc(NUM_GRPC_RETRIES,
               client().create_task, format_queue_path(BIGQUERY_QUEUE), task)
Example #27
    def list_tasks_with_prefix(
            self,
            queue_name: str,
            task_id_prefix: str,
    ) -> List[task_pb2.Task]:
        """List tasks for the given queue with the given task path prefix."""

        task_name_prefix = self.format_task_path(queue_name, task_id_prefix)
        return [task for task in retry_grpc(self.NUM_GRPC_RETRIES,
                                            self.client.list_tasks,
                                            parent=self.format_queue_path(queue_name))
                if task.name.startswith(task_name_prefix)]
Example #28
def update_session(last_scraped: str, scrape_key: ScrapeKey) -> bool:
    """Updates ScrapeSession entity with most recently successfully scraped
    person.

    Updates the most recent open session entity with the person. This allows us
    to pause and resume long-lived scrapes without losing our place.

    Args:
        last_scraped: (String) Name of the last successfully scraped person
        scrape_key: (ScrapeKey) The scraper to update session info for

    Returns:
        True if successful
        False if not
    """
    current_session = get_current_session(scrape_key)

    if not current_session:
        logging.error("No open sessions found to update.")
        return False

    current_session.last_scraped = last_scraped
    retry_grpc(NUM_GRPC_RETRIES, ds().put, current_session.to_entity())
    return True
Example #29
def already_scraped_record(region_code, record_id, start):
    """Checks datastore to see if a matching record already exists.

    Args:
        region_code: Code for the region
        record_id: Id for the record
        start: Time to check if the record is newer than

    Returns:
        True if a record exists otherwise False
    """
    record_query = ds().query(kind=SCRAPED_RECORD_KIND)
    record_query.add_filter('region', '=', region_code)
    record_query.add_filter('record_id', '=', record_id)
    record_query.add_filter('created_on', '>', start)
    return bool(next(retry_grpc(NUM_GRPC_RETRIES, record_query.fetch), None))
def _get_ingest_info_entities_for_region(
        region: str,
        session_start_time: Optional[datetime] = None) -> List[datastore.Entity]:
    logging.info(
        "Getting ingest info entities for region: [%s] and "
        "session_start_time: [%s]", region, session_start_time)
    session_query = ds().query(kind=INGEST_INFO_KIND)
    session_query.add_filter('region', '=', region)
    if session_start_time:
        session_query.add_filter('session_start_time', '=', session_start_time)

    results = None
    try:
        results = retry_grpc(NUM_GRPC_RETRIES, session_query.fetch, limit=None)

    except Exception as e:
        raise DatastoreBatchGetError(region) from e

    return list(results)