def batch_delete_ingest_infos_for_region(region: str) -> None:
    """Batch deletes ingest infos for a particular region.

    Args:
        region: (string) Region to delete ingest infos for
    """
    results = _get_ingest_info_entities_for_region(region)

    # The Datastore limit for entity writes in one call is 500. Therefore,
    # divide the entities to delete into chunks of 500.
    if len(results) > 500:
        list_of_chunks = _divide_into_chunks(results=results, chunk_size=500)
        for chunk in list_of_chunks:
            try:
                retry_grpc(
                    NUM_GRPC_RETRIES,
                    ds().delete_multi,
                    [chunk_item.key for chunk_item in chunk],
                )
            except Exception as e:
                raise DatastoreBatchDeleteError(region) from e
    else:
        try:
            retry_grpc(
                NUM_GRPC_RETRIES,
                ds().delete_multi,
                [result.key for result in results],
            )
        except Exception as e:
            raise DatastoreBatchDeleteError(region) from e
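# `_divide_into_chunks` is referenced above but not defined in this section.
# A minimal sketch of what it plausibly does, assuming it simply slices the
# result list into fixed-size pieces (the name and keyword-only signature are
# taken from the call site above; the body is an illustration, not
# necessarily the real helper):
def _divide_into_chunks(
        *, results: List[datastore.Entity],
        chunk_size: int) -> List[List[datastore.Entity]]:
    # Slice the full result list into consecutive chunks of at most
    # `chunk_size` entities each.
    return [results[i:i + chunk_size]
            for i in range(0, len(results), chunk_size)]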
def create_session(scrape_key: ScrapeKey) -> ScrapeSession:
    """Creates a new session to allow starting the given scraper.

    Should be called prior to any specific scraping tasks for the region.
    Ends any open sessions for the given scraper and creates a brand new one.

    Args:
        scrape_key: (ScrapeKey) The scraper to set up a new session for
    """
    logging.info("Creating new scrape session for: [%s]", scrape_key)
    # TODO(#1598): We already skip starting a session if a session already
    # exists, so we should be able to remove this. We could move the skip to
    # here.
    close_session(scrape_key)
    new_session = ScrapeSession.new(
        ds().key(SCRAPE_SESSION_KIND),
        scrape_type=scrape_key.scrape_type,
        region=scrape_key.region_code,
        phase=scrape_phase.ScrapePhase.START,
    )
    retry_grpc(NUM_GRPC_RETRIES, ds().put, new_session.to_entity())
    return new_session
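# Hypothetical usage sketch (the ScrapeKey constructor arguments are assumed
# from how `region_code` and `scrape_type` are read above):
#
#     key = ScrapeKey("us_ny", constants.ScrapeType.BACKGROUND)
#     session = create_session(key)
#     # ... run scraping tasks, then close_session(key) when finished.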
def write_ingest_info(
    region: str, task_hash: int, session_start_time: datetime, ingest_info: IngestInfo
) -> BatchIngestInfoData:
    """Writes a new ingest info for a given region.

    Args:
        region: (string) The region the ingest info is getting added for
        task_hash: (int) The hash of the task associated with the ingest info
        session_start_time: (datetime) The start time of the scraper that got
            the ingest info
        ingest_info: (IngestInfo) The ingest info data
    """
    logging.info(
        "Writing a new ingest info (with %d people) for region: [%s]",
        len(ingest_info.get_all_people()),
        region,
    )
    new_ingest_info_entity = _DatastoreIngestInfo.new(
        key=ds().key(INGEST_INFO_KIND),
        session_start_time=session_start_time,
        region=region,
        ingest_info=ingest_info,
        task_hash=task_hash,
    ).to_entity()
    try:
        retry_grpc(NUM_GRPC_RETRIES, ds().put, new_ingest_info_entity)
    except Exception as e:
        raise DatastoreWriteIngestInfoError(ingest_info, region) from e
    return _DatastoreIngestInfo.get_batch_ingest_info_data(new_ingest_info_entity)
def create_topic_and_subscription(scrape_key, pubsub_type):
    topic_path = get_topic_path(scrape_key, pubsub_type)
    try:
        logging.info("Creating pubsub topic: '%s'", topic_path)
        retry_grpc(NUM_GRPC_RETRIES, get_publisher().create_topic, name=topic_path)
    except exceptions.AlreadyExists:
        logging.info("Topic already exists")

    # A race condition sometimes exists where the topic doesn't yet exist,
    # causing subscription creation to fail.
    time.sleep(1)

    subscription_path = get_subscription_path(scrape_key, pubsub_type)
    try:
        logging.info("Creating pubsub subscription: '%s'", subscription_path)
        retry_grpc(
            NUM_GRPC_RETRIES,
            get_subscriber().create_subscription,
            name=subscription_path,
            topic=topic_path,
            ack_deadline_seconds=ACK_DEADLINE_SECONDS,
        )
    except exceptions.AlreadyExists:
        logging.info("Subscription already exists")
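# `get_topic_path` and `get_subscription_path` are referenced above but not
# shown in this section. A plausible sketch of the former, assuming it
# delegates to the standard Pub/Sub path helper; the project lookup and the
# topic id format are assumptions for illustration:
def get_topic_path(scrape_key: ScrapeKey, pubsub_type: str) -> str:
    # PublisherClient.topic_path builds "projects/{project}/topics/{topic}".
    return get_publisher().topic_path(
        project_id(),  # hypothetical helper returning the GCP project id
        '{}.{}-{}'.format(pubsub_type, scrape_key.region_code,
                          scrape_key.scrape_type.value))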
def write_error(
    region: str,
    session_start_time: datetime,
    error: str,
    trace_id: Optional[str],
    task_hash: int,
) -> BatchIngestInfoData:
    """Writes a new error for a given region.

    Args:
        region: (string) The region the error is getting added for
        session_start_time: (datetime) The start time of the scraper that hit
            the error
        error: (string) The error message
        trace_id: (string) The trace id used to debug
        task_hash: (int) The hash of the task associated with the error
    """
    logging.info("Writing a new failure for region: [%s]", region)
    new_ingest_info_entity = _DatastoreIngestInfo.new(
        key=ds().key(INGEST_INFO_KIND),
        session_start_time=session_start_time,
        region=region,
        task_hash=task_hash,
        error=error,
        trace_id=trace_id,
    ).to_entity()
    try:
        retry_grpc(NUM_GRPC_RETRIES, ds().put, new_ingest_info_entity)
    except Exception as e:
        raise DatastoreErrorWriteError(error, region) from e
    return _DatastoreIngestInfo.get_batch_ingest_info_data(new_ingest_info_entity)
def delete_task(self, task: task_pb2.Task) -> None:
    try:
        retry_grpc(self.NUM_GRPC_RETRIES, self.client.delete_task, name=task.name)
    except exceptions.NotFound as e:
        logging.debug('Task not found: [%s]', e)
def close_session(scrape_key: ScrapeKey) -> List[ScrapeSession]:
    """Closes any open session for the given scraper.

    Resets relevant session info after a scraping session is concluded and
    before another one starts, including updating the ScrapeSession entity
    for the old session.

    Args:
        scrape_key: (ScrapeKey) The scraper to clean up session info for

    Returns:
        List of the scrape sessions which were closed
    """
    # TODO(#1598): Much of our code assumes there is only one open session
    # (i.e. all sessions go to the same batch persistence queue). Other code
    # specifically supports there being multiple open sessions; we should
    # reconcile this.
    open_sessions = get_sessions(
        scrape_key.region_code,
        include_closed=False,
        scrape_type=scrape_key.scrape_type,
    )

    closed_sessions = []
    for session in open_sessions:
        session.end = datetime.now()
        retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())
        closed_sessions.append(session)
    return closed_sessions
def retry_with_create(
    scrape_key: ScrapeKey, fn: Callable[..., ReturnType], pubsub_type: str
) -> ReturnType:
    try:
        result = retry_grpc(NUM_GRPC_RETRIES, fn)
    except exceptions.NotFound:
        create_topic_and_subscription(scrape_key, pubsub_type=pubsub_type)
        result = retry_grpc(NUM_GRPC_RETRIES, fn)
    return result
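# Hypothetical usage sketch: `retry_grpc` invokes `fn` with no arguments
# here, so callers would bind arguments up front, e.g. with
# functools.partial. The pull signature below is an illustrative assumption:
#
#     from functools import partial
#
#     response = retry_with_create(
#         scrape_key,
#         partial(get_subscriber().pull,
#                 get_subscription_path(scrape_key, pubsub_type),
#                 max_messages=1),
#         pubsub_type)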
def initialize_cloud_task_queue(self, queue_config: queue_pb2.Queue):
    """Initializes a task queue with the given config.

    If a queue with the given name already exists, it is updated to have the
    given config. If it does not exist, it will be created by this function.
    """
    # update_queue creates the queue if it does not exist, or updates it to
    # have the given config.
    retry_grpc(self.NUM_GRPC_RETRIES, self.client.update_queue, queue_config)
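# Hypothetical usage sketch (the queue name and limits are made up; the
# Queue, RateLimits, and RetryConfig messages are part of the Cloud Tasks v2
# API, and `manager` stands in for an instance of the surrounding class):
#
#     queue = queue_pb2.Queue(
#         name=manager.format_queue_path('example-queue'),
#         rate_limits=queue_pb2.RateLimits(max_dispatches_per_second=1),
#         retry_config=queue_pb2.RetryConfig(max_attempts=5))
#     manager.initialize_cloud_task_queue(queue)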
def create_task(
    self,
    *,
    task_id: str,
    queue_name: str,
    relative_uri: str,
    body: Optional[Dict[str, str]] = None,
    schedule_delay_seconds: int = 0,
    http_method: HttpMethod = HttpMethod.POST,
):
    """Creates a task with the given details.

    Args:
        task_id: Id of the task to include in the task name
        queue_name: The queue on which to schedule the task
        relative_uri: The relative uri to hit
        body: Dictionary of values that will be converted to JSON and
            included in the request
        schedule_delay_seconds: The number of seconds by which to delay the
            scheduling of the given task
        http_method: The method for this request (i.e. GET or POST)
    """
    task_name = self.format_task_path(queue_name, task_id)

    schedule_timestamp = None
    if schedule_delay_seconds > 0:
        schedule_time_sec = (
            int(datetime.utcnow().timestamp()) + schedule_delay_seconds)
        schedule_timestamp = timestamp_pb2.Timestamp(seconds=schedule_time_sec)

    task_builder = ProtobufBuilder(task_pb2.Task).update_args(
        name=task_name,
        app_engine_http_request={
            'relative_uri': relative_uri,
        },
    )

    if schedule_timestamp:
        task_builder.update_args(schedule_time=schedule_timestamp)

    if http_method is not None:
        task_builder.update_args(
            app_engine_http_request={
                'http_method': http_method.value,
            },
        )

    if http_method in (HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH):
        if body is None:
            body = {}
        task_builder.update_args(
            app_engine_http_request={'body': json.dumps(body).encode()},
        )

    task = task_builder.build()

    logging.info("Queueing task to queue [%s]: [%s]", queue_name, task.name)
    retry_grpc(
        self.NUM_GRPC_RETRIES,
        self.client.create_task,
        self.format_queue_path(queue_name),
        task,
    )
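# Hypothetical usage sketch (queue and endpoint names are made up, and
# `manager` stands in for an instance of the surrounding class):
#
#     manager.create_task(
#         task_id='us_ny-{}'.format(uuid.uuid4()),
#         queue_name='example-queue',
#         relative_uri='/example/handler',
#         body={'region': 'us_ny'},
#         schedule_delay_seconds=60)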
def test_retry_grpc_raises(self) -> None:
    fn = MagicMock()
    # Always a GOAWAY error
    fn.side_effect = GO_AWAY_ERROR

    with self.assertRaises(exceptions.InternalServerError):
        retry_grpc(3, fn, 1, b=2)

    # Expect the initial call plus all 3 retries.
    fn.assert_has_calls([call(1, b=2)] * 4)
def test_retry_grpc_raises_deadline_exceeded(self) -> None:
    fn = MagicMock()
    # Always a DEADLINE_EXCEEDED error
    fn.side_effect = DEADLINE_EXCEEDED_ERROR

    with self.assertRaises(exceptions.InternalServerError):
        retry_grpc(3, fn, 1, b=2)

    fn.assert_has_calls([call(1, b=2)] * 4)
def test_retry_grpc_raises_no_goaway(self) -> None:
    fn = MagicMock()
    # Always a different error
    fn.side_effect = OTHER_ERROR

    with self.assertRaises(exceptions.InternalServerError):
        retry_grpc(3, fn, 1, b=2)

    fn.assert_has_calls([call(1, b=2)])
def purge_scrape_tasks(*, region_code: str, queue_name: str):
    """Purge scrape tasks for a given region from its queue.

    Args:
        region_code: `str` region code.
        queue_name: `str` queue name.
    """
    for task in list_scrape_tasks(region_code=region_code, queue_name=queue_name):
        try:
            retry_grpc(NUM_GRPC_RETRIES, client().delete_task, task.name)
        except exceptions.NotFound as e:
            logging.debug('Task not found: [%s]', e)
def add_docket_item_to_session(docket_ack_id: str, session: ScrapeSession) -> bool:
    """Adds a docket item to the given session.

    Args:
        docket_ack_id: (string) Id used to ack the docket message
        session: (ScrapeSession) The session to add to

    Returns:
        True if successful, otherwise False
    """
    session.docket_ack_id = docket_ack_id
    retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())
    return True
def remove_docket_item_from_session(session: ScrapeSession) -> str:
    """Removes the docket item from the session.

    Args:
        session: (ScrapeSession) The session to remove from

    Returns:
        Id used to ack the docket message
    """
    docket_ack_id = session.docket_ack_id
    session.docket_ack_id = None
    retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())
    return docket_ack_id
def update_phase(session: ScrapeSession, phase: scrape_phase.ScrapePhase):
    """Updates the phase of the session to the given phase."""
    # TODO(#1665): remove once dangling PERSIST session investigation
    # is complete.
    logging.info("Updating phase from %s to %s", session.phase, phase)

    previous_phase = session.phase
    session.phase = phase
    retry_grpc(NUM_GRPC_RETRIES, ds().put, session.to_entity())
    if (previous_phase == scrape_phase.ScrapePhase.RELEASE
            and phase == scrape_phase.ScrapePhase.DONE):
        jid = regions.get_region(session.region).jurisdiction_id
        store_scraper_success(ScraperSuccess(), jid)
def test_retry_grpc_no_raise(self):
    fn = MagicMock()
    # Two GOAWAY errors, one DEADLINE_EXCEEDED, then works
    fn.side_effect = [GO_AWAY_ERROR] * 2 + [DEADLINE_EXCEEDED_ERROR] + [3]

    result = retry_grpc(3, fn, 1, b=2)

    self.assertEqual(result, 3)
    fn.assert_has_calls([call(1, b=2)] * 3)
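# `retry_grpc` itself is not defined in this section. Based on the tests
# above (GOAWAY and DEADLINE_EXCEEDED internal errors are retried up to
# `num_retries` times, i.e. num_retries + 1 total calls, while other errors
# propagate immediately), a minimal sketch might look like the following;
# the message matching and backoff details are assumptions for illustration:
def retry_grpc(num_retries: int, fn: Callable[..., ReturnType],
               *args: Any, **kwargs: Any) -> ReturnType:
    for attempt in range(num_retries + 1):
        try:
            return fn(*args, **kwargs)
        except exceptions.InternalServerError as e:
            # Only retry transient http2-level failures; re-raise anything
            # else, and re-raise once the final attempt has failed.
            retryable = 'GOAWAY' in str(e) or 'DEADLINE_EXCEEDED' in str(e)
            if not retryable or attempt == num_retries:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff (assumed)
    raise AssertionError('unreachable')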
def write_scraped_record(*args, **kwds):
    """Creates a ScrapedRecord from the given arguments and writes it to
    datastore.

    Args:
        *args: Positional arguments to pass through to ScrapedRecord.new
        **kwds: Keyword arguments to pass through to ScrapedRecord.new
    """
    new_record = ScrapedRecord.new(ds().key(SCRAPED_RECORD_KIND), *args, **kwds)
    try:
        retry_grpc(NUM_GRPC_RETRIES, ds().put, new_record.to_entity())
    except Exception as e:
        logging.warning(
            "Couldn't persist ScrapedRecord entry, record_id: %s\n%s",
            new_record['record_id'], e)
def enqueue_scraper_phase(*, region_code, url):
    """Add a task to trigger the next phase of a scrape.

    This triggers the phase at the given url for an individual region,
    passing the `region_code` as a url parameter. For example, this can
    trigger stopping a scraper or inferring release for a particular region.
    """
    task = tasks.types.Task(
        app_engine_http_request={
            'http_method': 'GET',
            'relative_uri': '{url}?region={region_code}'.format(
                url=url, region_code=region_code),
        }
    )
    retry_grpc(
        NUM_GRPC_RETRIES,
        client().create_task,
        format_queue_path(SCRAPER_PHASE_QUEUE),
        task,
    )
def list_tasks_with_prefix(path_prefix: str, queue_name: str) -> List[tasks.types.Task]:
    """List tasks for the given queue with the given task path prefix."""
    return [
        task
        for task in retry_grpc(
            NUM_GRPC_RETRIES, client().list_tasks, format_queue_path(queue_name))
        if task.name.startswith(path_prefix)
    ]
def create_scrape_task(*, region_code, queue_name, url, body):
    """Create a scrape task in a queue.

    Args:
        region_code: `str` region code.
        queue_name: `str` queue name.
        url: `str` App Engine worker url.
        body: `dict` task body to be passed to worker.
    """
    task = tasks.types.Task(
        name=format_scrape_task_path(queue_name, region_code, uuid.uuid4()),
        app_engine_http_request={
            'relative_uri': url,
            'body': json.dumps(body).encode(),
        },
    )
    retry_grpc(
        NUM_GRPC_RETRIES,
        client().create_task,
        format_queue_path(queue_name),
        task,
    )
def get_sessions(
    region_code: str,
    include_open: bool = True,
    include_closed: bool = True,
    most_recent_only: bool = False,
    scrape_type: Optional[constants.ScrapeType] = None,
) -> Iterator[ScrapeSession]:
    """Retrieves scrape sessions.

    Retrieves some combination of scrape session entities based on the
    arguments provided. If both `include_open` and `include_closed` are
    False, the returned generator will be empty.

    Args:
        region_code: (string) Region code to fetch sessions for
        include_open: (bool) Return open sessions
        include_closed: (bool) Return closed sessions
        most_recent_only: (bool) Only return the most recent session entity
        scrape_type: (string) Only return sessions of this scrape type

    Returns:
        A generator of ScrapeSessions
    """
    session_query = ds().query(kind=SCRAPE_SESSION_KIND)
    session_query.add_filter("region", "=", region_code)
    session_query.order = ["-start"]

    if not include_closed:
        if include_open:
            session_query.add_filter("end", "=", None)
        else:
            return (_ for _ in ())

    if scrape_type:
        session_query.add_filter("scrape_type", "=", scrape_type.value)

    limit = None
    # If `include_open` is not set we have to filter after fetching the
    # results, so we cannot limit to the first result.
    if most_recent_only and include_open:
        limit = 1

    results = retry_grpc(NUM_GRPC_RETRIES, session_query.fetch, limit=limit)

    # Datastore doesn't allow an inequality filter on `end` because we are
    # sorting on `start`, so we have to do the filter after we get the
    # results.
    if not include_open:
        results = (result for result in results if result.get("end"))

    if most_recent_only:
        first_result = next(results, None)
        results = [first_result] if first_result else []

    return _sessions_from_entities(results)
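# `get_current_session` (used by update_session below) is not defined in
# this section. A plausible sketch in terms of get_sessions, assuming it
# returns the most recent open session for the scraper, or None:
def get_current_session(scrape_key: ScrapeKey) -> Optional[ScrapeSession]:
    return next(
        get_sessions(scrape_key.region_code,
                     include_closed=False,
                     most_recent_only=True,
                     scrape_type=scrape_key.scrape_type),
        None)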
def create_bq_task(table_name: str, module: str, url: str):
    """Create a BigQuery table export task.

    Args:
        table_name: Cloud SQL table to export to BQ. Must be defined in the
            *_TABLES_TO_EXPORT for the given schema.
        module: The module of the table being exported, either 'county' or
            'state'.
        url: App Engine worker URL.
    """
    body = {'table_name': table_name, 'module': module}
    task_id = '{}-{}-{}-{}'.format(
        table_name, module, str(datetime.date.today()), uuid.uuid4())
    task_name = format_task_path(BIGQUERY_QUEUE, task_id)
    task = tasks.types.Task(
        name=task_name,
        app_engine_http_request={
            'relative_uri': url,
            'body': json.dumps(body).encode(),
        },
    )
    retry_grpc(
        NUM_GRPC_RETRIES,
        client().create_task,
        format_queue_path(BIGQUERY_QUEUE),
        task,
    )
def list_tasks_with_prefix(
    self,
    queue_name: str,
    task_id_prefix: str,
) -> List[task_pb2.Task]:
    """List tasks for the given queue with the given task path prefix."""
    task_name_prefix = self.format_task_path(queue_name, task_id_prefix)
    return [
        task
        for task in retry_grpc(
            self.NUM_GRPC_RETRIES,
            self.client.list_tasks,
            parent=self.format_queue_path(queue_name),
        )
        if task.name.startswith(task_name_prefix)
    ]
def update_session(last_scraped: str, scrape_key: ScrapeKey) -> bool:
    """Updates the ScrapeSession entity with the most recently successfully
    scraped person.

    Updates the most recent open session entity with the person. This allows
    us to pause and resume long-lived scrapes without losing our place.

    Args:
        last_scraped: (String) Name of the last successfully scraped person
        scrape_key: (ScrapeKey) The scraper to update session info for

    Returns:
        True if successful, False if not
    """
    current_session = get_current_session(scrape_key)

    if not current_session:
        logging.error("No open sessions found to update.")
        return False

    current_session.last_scraped = last_scraped
    retry_grpc(NUM_GRPC_RETRIES, ds().put, current_session.to_entity())
    return True
def already_scraped_record(region_code, record_id, start):
    """Checks datastore to see if a matching record already exists.

    Args:
        region_code: Code for the region
        record_id: Id for the record
        start: Time to check if the record is newer than

    Returns:
        True if a record exists, otherwise False
    """
    record_query = ds().query(kind=SCRAPED_RECORD_KIND)
    record_query.add_filter('region', '=', region_code)
    record_query.add_filter('record_id', '=', record_id)
    record_query.add_filter('created_on', '>', start)
    return bool(next(retry_grpc(NUM_GRPC_RETRIES, record_query.fetch), None))
def _get_ingest_info_entities_for_region(
    region: str, session_start_time: Optional[datetime] = None
) -> List[datastore.Entity]:
    logging.info(
        "Getting ingest info entities for region: [%s] and "
        "session_start_time: [%s]",
        region,
        session_start_time,
    )
    session_query = ds().query(kind=INGEST_INFO_KIND)
    session_query.add_filter('region', '=', region)
    if session_start_time:
        session_query.add_filter('session_start_time', '=', session_start_time)

    results = None
    try:
        results = retry_grpc(NUM_GRPC_RETRIES, session_query.fetch, limit=None)
    except Exception as e:
        raise DatastoreBatchGetError(region) from e

    return list(results)