async def create(app):
    """Build a Canceller and start its background cancellation loops.

    Returns the new Canceller; its loops run on its own task manager.
    """
    canceller = Canceller(app)

    # Each cancellation loop is (loop name, wake-up event, loop body);
    # run_if_changed re-runs the body whenever the event fires.
    cancel_loops = (
        ('cancel_cancelled_ready_jobs_loop',
         canceller.cancel_ready_state_changed,
         canceller.cancel_cancelled_ready_jobs_loop_body),
        ('cancel_cancelled_creating_jobs_loop',
         canceller.cancel_creating_state_changed,
         canceller.cancel_cancelled_creating_jobs_loop_body),
        ('cancel_cancelled_running_jobs_loop',
         canceller.cancel_running_state_changed,
         canceller.cancel_cancelled_running_jobs_loop_body),
    )
    for loop_name, state_changed, loop_body in cancel_loops:
        canceller.task_manager.ensure_future(
            retry_long_running(loop_name, run_if_changed, state_changed, loop_body))

    # Periodic sweep for orphaned attempts, once a minute.
    canceller.task_manager.ensure_future(
        periodically_call(60, canceller.cancel_orphaned_attempts_loop_body))

    return canceller
async def async_init(self):
    """Load this collection's live instances from the database and start
    the instance-creation and job-scheduling background loops."""
    log.info(f'initializing {self}')

    await super().async_init()

    # Re-adopt every non-removed instance recorded for this collection.
    records = self.db.select_and_fetchall(
        'SELECT * FROM instances WHERE removed = 0 AND inst_coll = %s;',
        (self.name, ))
    async for record in records:
        self.add_instance(Instance.from_record(self.app, self, record))

    self.task_manager.ensure_future(
        retry_long_running(
            'create_instances_loop',
            run_if_changed,
            self.create_instances_state_changed,
            self.create_instances_loop_body,
        ))
    self.task_manager.ensure_future(
        retry_long_running(
            'schedule_jobs_loop',
            run_if_changed,
            self.scheduler_state_changed,
            self.schedule_jobs_loop_body,
        ))
    # Nudge the scheduler every 15s even if no state-change event fires.
    self.task_manager.ensure_future(periodically_call(15, self.bump_scheduler))
async def async_init(self):
    """Start the cancellation background loops on this object's task manager."""
    # (loop name, wake-up event, loop body) for each job state we cancel.
    for loop_name, state_event, loop_body in (
        ('cancel_cancelled_ready_jobs_loop',
         self.cancel_ready_state_changed,
         self.cancel_cancelled_ready_jobs_loop_body),
        ('cancel_cancelled_creating_jobs_loop',
         self.cancel_creating_state_changed,
         self.cancel_cancelled_creating_jobs_loop_body),
        ('cancel_cancelled_running_jobs_loop',
         self.cancel_running_state_changed,
         self.cancel_cancelled_running_jobs_loop_body),
    ):
        self.task_manager.ensure_future(
            retry_long_running(loop_name, run_if_changed, state_event, loop_body))

    # Periodic sweep for orphaned attempts, once a minute.
    self.task_manager.ensure_future(
        periodically_call(60, self.cancel_orphaned_attempts_loop_body))
async def on_startup(app):
    """App startup: initialize the database, Google clients, shared state,
    and the polling / billing / monitoring background loops."""
    db = Database()
    await db.async_init()
    app['db'] = db

    app['client_session'] = httpx.client_session()

    # One credential object shared by both Google clients.
    credentials = aiogoogle.GoogleCredentials.from_file(
        '/billing-monitoring-gsa-key/key.json')
    app['bigquery_client'] = aiogoogle.GoogleBigQueryClient(
        'broad-ctsa', credentials=credentials)
    compute_client = aiogoogle.GoogleComputeClient(
        PROJECT, credentials=credentials)
    app['compute_client'] = compute_client

    query_billing_event = asyncio.Event()
    app['query_billing_event'] = query_billing_event

    # Expand each configured region into its list of zones.
    region_info = {
        name: await compute_client.get(f'/regions/{name}')
        for name in BATCH_GCP_REGIONS
    }
    app['zones'] = [
        url_basename(zone)
        for region in region_info.values()
        for zone in region['zones']
    ]

    task_manager = aiotools.BackgroundTaskManager()
    app['task_manager'] = task_manager
    task_manager.ensure_future(
        retry_long_running('polling_loop', polling_loop, app))
    task_manager.ensure_future(
        retry_long_running('query_billing_loop', run_if_changed_idempotent,
                           query_billing_event, query_billing_body, app))
    task_manager.ensure_future(periodically_call(60, monitor_disks, app))
    task_manager.ensure_future(periodically_call(60, monitor_instances, app))
def __init__(
    self,
    app,
    db: Database,  # BORROWED
    inst_coll_manager: InstanceCollectionManager,
    resource_manager: CloudResourceManager,
    machine_name_prefix: str,
    config: JobPrivateInstanceManagerConfig,
    task_manager: aiotools.BackgroundTaskManager,
):
    """Initialize the job-private-instance collection and start its
    instance-creation and scheduling background loops."""
    super().__init__(
        db,
        inst_coll_manager,
        resource_manager,
        config.cloud,
        config.name,
        machine_name_prefix,
        is_pool=False,
        max_instances=config.max_instances,
        max_live_instances=config.max_live_instances,
        task_manager=task_manager,
    )
    self.app = app

    # Instance creation is woken by the app-wide scheduler notice;
    # job scheduling has its own local event.
    global_notice: Notice = self.app['scheduler_state_changed']
    self.create_instances_state_changed = global_notice.subscribe()
    self.scheduler_state_changed = asyncio.Event()

    self.async_worker_pool: AsyncWorkerPool = app['async_worker_pool']
    self.exceeded_shares_counter = ExceededSharesCounter()
    self.boot_disk_size_gb = config.boot_disk_size_gb

    task_manager.ensure_future(
        retry_long_running(
            'create_instances_loop',
            run_if_changed,
            self.create_instances_state_changed,
            self.create_instances_loop_body,
        ))
    task_manager.ensure_future(
        retry_long_running(
            'schedule_jobs_loop',
            run_if_changed,
            self.scheduler_state_changed,
            self.schedule_jobs_loop_body,
        ))
    # Nudge the scheduler every 15s even with no state-change event.
    task_manager.ensure_future(periodically_call(15, self.bump_scheduler))
async def on_startup(app):
    """Batch-driver startup: wire up the database, Kubernetes and cloud
    clients, shared events, the cloud driver, the canceller, and the
    periodic maintenance loops."""
    task_manager = aiotools.BackgroundTaskManager()
    app['task_manager'] = task_manager

    app['client_session'] = httpx.client_session()

    # In-cluster Kubernetes access plus a cache in front of the API.
    kubernetes_asyncio.config.load_incluster_config()
    app['k8s_client'] = kubernetes_asyncio.client.CoreV1Api()
    app['k8s_cache'] = K8sCache(app['k8s_client'])

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    row = await db.select_and_fetchone('''
SELECT instance_id, internal_token, frozen FROM globals;
''')
    instance_id = row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id
    app['internal_token'] = row['internal_token']
    app['batch_headers'] = {'Authorization': f'Bearer {row["internal_token"]}'}
    app['frozen'] = row['frozen']

    # Notice/events that wake the scheduling and cancellation loops.
    app['scheduler_state_changed'] = Notice()
    app['cancel_ready_state_changed'] = asyncio.Event()
    app['cancel_creating_state_changed'] = asyncio.Event()
    app['cancel_running_state_changed'] = asyncio.Event()

    app['async_worker_pool'] = AsyncWorkerPool(100, queue_size=100)

    credentials_file = '/gsa-key/key.json'
    fs = get_cloud_async_fs(credentials_file=credentials_file)
    app['file_store'] = FileStore(fs, BATCH_STORAGE_URI, instance_id)

    inst_coll_configs = await InstanceCollectionConfigs.create(db)

    app['driver'] = await get_cloud_driver(
        app, db, MACHINE_NAME_PREFIX, DEFAULT_NAMESPACE, inst_coll_configs,
        credentials_file, task_manager)

    app['canceller'] = await Canceller.create(app)

    app['check_incremental_error'] = None
    app['check_resource_aggregation_error'] = None

    # Expensive consistency checks are opt-in.
    if HAIL_SHOULD_CHECK_INVARIANTS:
        task_manager.ensure_future(
            periodically_call(10, check_incremental, app, db))
        task_manager.ensure_future(
            periodically_call(10, check_resource_aggregation, app, db))

    task_manager.ensure_future(
        periodically_call(10, monitor_billing_limits, app))
    task_manager.ensure_future(
        periodically_call(10, cancel_fast_failing_batches, app))
    task_manager.ensure_future(
        periodically_call(60, scheduling_cancelling_bump, app))
    task_manager.ensure_future(periodically_call(15, monitor_system, app))
async def on_startup(app):
    """App startup: initialize the database, Kubernetes and Google clients,
    shared events, instance-collection managers, and background loops."""
    app['task_manager'] = aiotools.BackgroundTaskManager()

    # Thread pool for blocking work; reused below by LogStore.
    pool = concurrent.futures.ThreadPoolExecutor()
    app['blocking_pool'] = pool

    # In-cluster Kubernetes access with a 5s-refresh cache in front of it.
    kube.config.load_incluster_config()
    k8s_client = kube.client.CoreV1Api()
    k8s_cache = K8sCache(k8s_client, refresh_time=5)
    app['k8s_cache'] = k8s_cache

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    row = await db.select_and_fetchone('''
SELECT instance_id, internal_token FROM globals;
''')
    instance_id = row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id
    app['internal_token'] = row['internal_token']
    app['batch_headers'] = {'Authorization': f'Bearer {row["internal_token"]}'}

    # Materialize the known resource names from the database.
    resources = db.select_and_fetchall('SELECT resource FROM resources;')
    app['resources'] = [record['resource'] async for record in resources]

    # One GSA credential object shared by the Google clients.
    aiogoogle_credentials = aiogoogle.Credentials.from_file(
        '/gsa-key/key.json')
    compute_client = aiogoogle.ComputeClient(PROJECT,
                                             credentials=aiogoogle_credentials)
    app['compute_client'] = compute_client

    logging_client = aiogoogle.LoggingClient(
        credentials=aiogoogle_credentials,
        # The project-wide logging quota is 60 request/m. The event
        # loop sleeps 15s per iteration, so the max rate is 4
        # iterations/m. Note, the event loop could make multiple
        # logging requests per iteration, so these numbers are not
        # quite comparable. I didn't want to consume the entire quota
        # since there will be other users of the logging API (us at
        # the web console, test deployments, etc.)
        rate_limit=RateLimit(10, 60),
    )
    app['logging_client'] = logging_client

    # Notice/events that wake the scheduling and cancellation loops.
    scheduler_state_changed = Notice()
    app['scheduler_state_changed'] = scheduler_state_changed
    cancel_ready_state_changed = asyncio.Event()
    app['cancel_ready_state_changed'] = cancel_ready_state_changed
    cancel_creating_state_changed = asyncio.Event()
    app['cancel_creating_state_changed'] = cancel_creating_state_changed
    cancel_running_state_changed = asyncio.Event()
    app['cancel_running_state_changed'] = cancel_running_state_changed

    async_worker_pool = AsyncWorkerPool(100, queue_size=100)
    app['async_worker_pool'] = async_worker_pool

    # LogStore uses a separate oauth2 credential and the blocking pool.
    credentials = google.oauth2.service_account.Credentials.from_service_account_file(
        '/gsa-key/key.json')
    log_store = LogStore(BATCH_BUCKET_NAME, instance_id, pool,
                         credentials=credentials)
    app['log_store'] = log_store

    # These components are initialized in dependency order:
    # zone monitor -> configs -> instance-collection manager.
    zone_monitor = ZoneMonitor(app)
    app['zone_monitor'] = zone_monitor
    await zone_monitor.async_init()

    inst_coll_configs = InstanceCollectionConfigs(app)
    await inst_coll_configs.async_init()

    inst_coll_manager = InstanceCollectionManager(app, MACHINE_NAME_PREFIX)
    app['inst_coll_manager'] = inst_coll_manager
    await inst_coll_manager.async_init(inst_coll_configs)

    canceller = Canceller(app)
    app['canceller'] = canceller
    await canceller.async_init()

    gce_event_monitor = GCEEventMonitor(app, MACHINE_NAME_PREFIX)
    app['gce_event_monitor'] = gce_event_monitor
    await gce_event_monitor.async_init()

    app['check_incremental_error'] = None
    app['check_resource_aggregation_error'] = None

    # Expensive consistency checks are opt-in.
    if HAIL_SHOULD_CHECK_INVARIANTS:
        app['task_manager'].ensure_future(
            periodically_call(10, check_incremental, app, db))
        app['task_manager'].ensure_future(
            periodically_call(10, check_resource_aggregation, app, db))

    app['task_manager'].ensure_future(
        periodically_call(10, monitor_billing_limits, app))
    app['task_manager'].ensure_future(
        periodically_call(10, cancel_fast_failing_batches, app))
    app['task_manager'].ensure_future(
        periodically_call(60, scheduling_cancelling_bump, app))
async def create(
    app,
    db: Database,  # BORROWED
    machine_name_prefix: str,
    namespace: str,
    inst_coll_configs: InstanceCollectionConfigs,
    credentials_file: str,
    task_manager: aiotools.BackgroundTaskManager,  # BORROWED
) -> 'GCPDriver':
    """Construct a GCPDriver: build the GCP clients, monitors, billing and
    resource managers, all instance collections, then start the driver's
    periodic maintenance loops."""
    gcp_config = get_gcp_config()
    project = gcp_config.project
    zone = gcp_config.zone
    regions = gcp_config.regions

    compute_client = aiogoogle.GoogleComputeClient(
        project, credentials_file=credentials_file)
    activity_logs_client = aiogoogle.GoogleLoggingClient(
        credentials_file=credentials_file,
        # The project-wide logging quota is 60 request/m. The event
        # loop sleeps 15s per iteration, so the max rate is 4
        # iterations/m. Note, the event loop could make multiple
        # logging requests per iteration, so these numbers are not
        # quite comparable. I didn't want to consume the entire quota
        # since there will be other users of the logging API (us at
        # the web console, test deployments, etc.)
        rate_limit=RateLimit(10, 60),
    )

    zone_monitor = await ZoneMonitor.create(compute_client, regions, zone)
    billing_manager = await GCPBillingManager.create(db)
    inst_coll_manager = InstanceCollectionManager(db, machine_name_prefix,
                                                  zone_monitor)
    resource_manager = GCPResourceManager(project, compute_client,
                                          billing_manager)

    # Create all pools and the job-private-instance manager concurrently;
    # gather preserves argument order, so jpim is the first result.
    pool_coros = [
        Pool.create(
            app,
            db,
            inst_coll_manager,
            resource_manager,
            machine_name_prefix,
            config,
            app['async_worker_pool'],
            task_manager,
        )
        for pool_name, config in inst_coll_configs.name_pool_config.items()
    ]
    jpim, *_ = await asyncio.gather(
        JobPrivateInstanceManager.create(
            app,
            db,
            inst_coll_manager,
            resource_manager,
            machine_name_prefix,
            inst_coll_configs.jpim_config,
            task_manager,
        ),
        *pool_coros,
    )

    driver = GCPDriver(
        db,
        machine_name_prefix,
        compute_client,
        activity_logs_client,
        project,
        namespace,
        zone_monitor,
        inst_coll_manager,
        jpim,
        billing_manager,
    )

    task_manager.ensure_future(
        periodically_call(15, driver.process_activity_logs))
    task_manager.ensure_future(
        periodically_call(60, zone_monitor.update_region_quotas))
    task_manager.ensure_future(
        periodically_call(60, driver.delete_orphaned_disks))
    task_manager.ensure_future(
        periodically_call(300, billing_manager.refresh_resources))

    return driver
async def create(
    app,
    db: Database,  # BORROWED
    machine_name_prefix: str,
    namespace: str,
    inst_coll_configs: InstanceCollectionConfigs,
    credentials_file: str,
    task_manager: aiotools.BackgroundTaskManager,  # BORROWED
) -> 'AzureDriver':
    """Construct an AzureDriver: build the Azure clients, monitors, billing
    and resource managers, all instance collections, then start the driver's
    periodic cleanup loops."""
    azure_config = get_azure_config()
    subscription_id = azure_config.subscription_id
    resource_group = azure_config.resource_group
    region = azure_config.region
    regions = [region]

    with open(os.environ['HAIL_SSH_PUBLIC_KEY'], encoding='utf-8') as f:
        ssh_public_key = f.read()

    arm_client = aioazure.AzureResourceManagerClient(
        subscription_id, resource_group, credentials_file=credentials_file)
    compute_client = aioazure.AzureComputeClient(
        subscription_id, resource_group, credentials_file=credentials_file)
    resources_client = aioazure.AzureResourcesClient(
        subscription_id, credentials_file=credentials_file)
    network_client = aioazure.AzureNetworkClient(
        subscription_id, resource_group, credentials_file=credentials_file)
    pricing_client = aioazure.AzurePricingClient()

    region_monitor = await RegionMonitor.create(region)
    billing_manager = await AzureBillingManager.create(db, pricing_client,
                                                       regions)
    inst_coll_manager = InstanceCollectionManager(db, machine_name_prefix,
                                                  region_monitor)
    resource_manager = AzureResourceManager(subscription_id, resource_group,
                                            ssh_public_key, arm_client,
                                            compute_client, billing_manager)

    # Create all pools and the job-private-instance manager concurrently;
    # gather preserves argument order, so jpim is the first result.
    pool_coros = [
        Pool.create(
            app,
            db,
            inst_coll_manager,
            resource_manager,
            machine_name_prefix,
            config,
            app['async_worker_pool'],
            task_manager,
        )
        for pool_name, config in inst_coll_configs.name_pool_config.items()
    ]
    jpim, *_ = await asyncio.gather(
        JobPrivateInstanceManager.create(
            app,
            db,
            inst_coll_manager,
            resource_manager,
            machine_name_prefix,
            inst_coll_configs.jpim_config,
            task_manager,
        ),
        *pool_coros,
    )

    driver = AzureDriver(
        db,
        machine_name_prefix,
        arm_client,
        compute_client,
        resources_client,
        network_client,
        pricing_client,
        subscription_id,
        resource_group,
        namespace,
        region_monitor,
        inst_coll_manager,
        jpim,
        billing_manager,
    )

    task_manager.ensure_future(
        periodically_call(60, driver.delete_orphaned_nics))
    task_manager.ensure_future(
        periodically_call(60, driver.delete_orphaned_public_ips))
    task_manager.ensure_future(
        periodically_call(60, driver.delete_completed_deployments))
    task_manager.ensure_future(
        periodically_call(300,
                          billing_manager.refresh_resources_from_retail_prices))

    return driver