async def create(
    app,
    db: Database,  # BORROWED
    inst_coll_manager: InstanceCollectionManager,
    resource_manager: CloudResourceManager,
    machine_name_prefix: str,
    config: JobPrivateInstanceManagerConfig,
    task_manager: aiotools.BackgroundTaskManager,
):
    """Construct a JobPrivateInstanceManager and load its existing instances.

    The manager is built from the supplied arguments. Every row of
    ``instances`` (joined with ``instances_free_cores_mcpu``) that has
    ``removed = 0`` and whose ``inst_coll`` equals the manager's name is then
    converted with ``Instance.from_record`` and added to the manager.

    Returns the populated manager.
    """
    jpim = JobPrivateInstanceManager(
        app,
        db,
        inst_coll_manager,
        resource_manager,
        machine_name_prefix,
        config,
        task_manager,
    )
    log.info(f'initializing {jpim}')

    # The collection name is passed as a query parameter, not interpolated.
    query_args = (jpim.name,)
    rows = db.select_and_fetchall(
        ''' SELECT instances.*, instances_free_cores_mcpu.free_cores_mcpu FROM instances INNER JOIN instances_free_cores_mcpu ON instances.name = instances_free_cores_mcpu.name WHERE removed = 0 AND inst_coll = %s; ''',
        query_args,
    )
    async for row in rows:
        instance = Instance.from_record(app, jpim, row)
        jpim.add_instance(instance)

    return jpim
async def create(
    app,
    db: Database,  # BORROWED
    inst_coll_manager: InstanceCollectionManager,
    resource_manager: CloudResourceManager,
    machine_name_prefix: str,
    config: PoolConfig,
    async_worker_pool: AsyncWorkerPool,  # BORROWED
    task_manager: aiotools.BackgroundTaskManager,
) -> 'Pool':
    """Construct a Pool and load its existing instances.

    The pool is built from the supplied arguments. Every row of ``instances``
    (joined with ``instances_free_cores_mcpu``) that has ``removed = 0`` and
    whose ``inst_coll`` equals the pool's name is then converted with
    ``Instance.from_record`` and added to the pool.

    Returns the populated pool.
    """
    pool = Pool(
        app,
        db,
        inst_coll_manager,
        resource_manager,
        machine_name_prefix,
        config,
        async_worker_pool,
        task_manager,
    )
    log.info(f'initializing {pool}')

    # The collection name is passed as a query parameter, not interpolated.
    query_args = (pool.name,)
    rows = db.select_and_fetchall(
        ''' SELECT instances.*, instances_free_cores_mcpu.free_cores_mcpu FROM instances INNER JOIN instances_free_cores_mcpu ON instances.name = instances_free_cores_mcpu.name WHERE removed = 0 AND inst_coll = %s; ''',
        query_args,
    )
    async for row in rows:
        instance = Instance.from_record(app, pool, row)
        pool.add_instance(instance)

    return pool
async def on_startup(app):
    """Create the shared services and store them on ``app``.

    In order, this sets up: the thread pool (``'blocking_pool'``), the
    Kubernetes cache, the database handle, the worker settings and identifiers
    read from the ``globals`` table, the resource names, the Google compute
    and logging clients, the state-change events, the log store, the instance
    pool and finally the scheduler.
    """
    pool = concurrent.futures.ThreadPoolExecutor()
    app['blocking_pool'] = pool

    kube.config.load_incluster_config()
    app['k8s_cache'] = K8sCache(kube.client.CoreV1Api(), refresh_time=5)

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    row = await db.select_and_fetchone(
        ''' SELECT worker_type, worker_cores, worker_disk_size_gb, instance_id, internal_token FROM globals; '''
    )
    # Worker settings are copied onto the app under the same keys.
    for worker_key in ('worker_type', 'worker_cores', 'worker_disk_size_gb'):
        app[worker_key] = row[worker_key]

    instance_id = row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id
    app['internal_token'] = row['internal_token']

    app['resources'] = [
        resource_row['resource']
        async for resource_row in db.select_and_fetchall('SELECT resource FROM resources;')
    ]

    machine_name_prefix = f'batch-worker-{DEFAULT_NAMESPACE}-'

    # Both the aiogoogle clients and the log store read the same key file.
    gsa_key_file = '/gsa-key/key.json'

    aiogoogle_credentials = aiogoogle.Credentials.from_file(gsa_key_file)
    app['compute_client'] = aiogoogle.ComputeClient(
        PROJECT, credentials=aiogoogle_credentials
    )

    # The project-wide logging quota is 60 requests/minute. The event loop
    # sleeps 15s per iteration, i.e. at most 4 iterations/minute. One
    # iteration may issue several logging requests, so the two figures are
    # not directly comparable. The limit deliberately leaves quota for other
    # users of the logging API (the web console, test deployments, etc.).
    logging_rate_limit = RateLimit(10, 60)
    app['logging_client'] = aiogoogle.LoggingClient(
        credentials=aiogoogle_credentials, rate_limit=logging_rate_limit
    )

    # One asyncio.Event per state-change signal, created in this order.
    for event_key in (
        'scheduler_state_changed',
        'cancel_ready_state_changed',
        'cancel_running_state_changed',
    ):
        app[event_key] = asyncio.Event()

    credentials = google.oauth2.service_account.Credentials.from_service_account_file(
        gsa_key_file
    )
    app['log_store'] = LogStore(
        BATCH_BUCKET_NAME,
        WORKER_LOGS_BUCKET_NAME,
        instance_id,
        pool,
        credentials=credentials,
    )

    # The instance pool is stored on the app before it is initialized.
    inst_pool = InstancePool(app, machine_name_prefix)
    app['inst_pool'] = inst_pool
    await inst_pool.async_init()

    # The scheduler is stored on the app only after it has initialized.
    scheduler = Scheduler(app)
    await scheduler.async_init()
    app['scheduler'] = scheduler
async def on_startup(app):
    """Create the shared services and background tasks and store them on ``app``.

    In order, this sets up: the background task manager, the thread pool
    (``'blocking_pool'``), the Kubernetes cache, the database handle, the
    identifiers read from the ``globals`` table, the resource names, the
    Google compute and logging clients, the state-change signals, the async
    worker pool, the log store, the zone monitor, the instance collection
    manager, the canceller and the GCE event monitor. It then schedules the
    periodic background tasks on the task manager.
    """
    app['task_manager'] = aiotools.BackgroundTaskManager()

    pool = concurrent.futures.ThreadPoolExecutor()
    app['blocking_pool'] = pool

    kube.config.load_incluster_config()
    app['k8s_cache'] = K8sCache(kube.client.CoreV1Api(), refresh_time=5)

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    row = await db.select_and_fetchone(
        ''' SELECT instance_id, internal_token FROM globals; '''
    )

    instance_id = row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id

    app['internal_token'] = row['internal_token']
    app['batch_headers'] = {'Authorization': f'Bearer {row["internal_token"]}'}

    app['resources'] = [
        resource_row['resource']
        async for resource_row in db.select_and_fetchall('SELECT resource FROM resources;')
    ]

    # Both the aiogoogle clients and the log store read the same key file.
    gsa_key_file = '/gsa-key/key.json'

    aiogoogle_credentials = aiogoogle.Credentials.from_file(gsa_key_file)
    app['compute_client'] = aiogoogle.ComputeClient(
        PROJECT, credentials=aiogoogle_credentials
    )

    # The project-wide logging quota is 60 requests/minute. The event loop
    # sleeps 15s per iteration, i.e. at most 4 iterations/minute. One
    # iteration may issue several logging requests, so the two figures are
    # not directly comparable. The limit deliberately leaves quota for other
    # users of the logging API (the web console, test deployments, etc.).
    logging_rate_limit = RateLimit(10, 60)
    app['logging_client'] = aiogoogle.LoggingClient(
        credentials=aiogoogle_credentials,
        rate_limit=logging_rate_limit,
    )

    # The scheduler signal is a Notice; the cancellation signals are Events.
    app['scheduler_state_changed'] = Notice()
    for event_key in (
        'cancel_ready_state_changed',
        'cancel_creating_state_changed',
        'cancel_running_state_changed',
    ):
        app[event_key] = asyncio.Event()

    app['async_worker_pool'] = AsyncWorkerPool(100, queue_size=100)

    credentials = google.oauth2.service_account.Credentials.from_service_account_file(
        gsa_key_file
    )
    app['log_store'] = LogStore(
        BATCH_BUCKET_NAME, instance_id, pool, credentials=credentials
    )

    # Each component below is stored on the app before it is initialized.
    zone_monitor = ZoneMonitor(app)
    app['zone_monitor'] = zone_monitor
    await zone_monitor.async_init()

    # The configs are not stored on the app; they are only passed to the
    # instance collection manager's initializer.
    inst_coll_configs = InstanceCollectionConfigs(app)
    await inst_coll_configs.async_init()

    inst_coll_manager = InstanceCollectionManager(app, MACHINE_NAME_PREFIX)
    app['inst_coll_manager'] = inst_coll_manager
    await inst_coll_manager.async_init(inst_coll_configs)

    canceller = Canceller(app)
    app['canceller'] = canceller
    await canceller.async_init()

    gce_event_monitor = GCEEventMonitor(app, MACHINE_NAME_PREFIX)
    app['gce_event_monitor'] = gce_event_monitor
    await gce_event_monitor.async_init()

    app['check_incremental_error'] = None
    app['check_resource_aggregation_error'] = None

    schedule = app['task_manager'].ensure_future

    # NOTE(review): only the two invariant checks are gated on the flag; the
    # remaining periodic tasks are scheduled unconditionally -- confirm.
    if HAIL_SHOULD_CHECK_INVARIANTS:
        for invariant_check in (check_incremental, check_resource_aggregation):
            schedule(periodically_call(10, invariant_check, app, db))

    for period, periodic_fn in (
        (10, monitor_billing_limits),
        (10, cancel_fast_failing_batches),
        (60, scheduling_cancelling_bump),
    ):
        schedule(periodically_call(period, periodic_fn, app))