Example #1
def get_compute_client(credentials_file: Optional[str] = None):
    if credentials_file is None:
        credentials_file = '/gsa-key/key.json'

    cloud = get_global_config()['cloud']

    if cloud == 'azure':
        azure_config = get_azure_config()
        return aioazure.AzureComputeClient(azure_config.subscription_id,
                                           azure_config.resource_group)

    assert cloud == 'gcp', cloud
    project = get_gcp_config().project
    return aiogoogle.GoogleComputeClient(project,
                                         credentials_file=credentials_file)
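A minimal usage sketch for the factory above, assuming the usual imports are in scope (`from typing import Optional`, Hail's `aiogoogle`/`aioazure` client modules, and the cloud-config helpers). The `get` call mirrors Example #2 below; `close` is an assumption about the client's cleanup API:

# Hypothetical caller built on the factory defined above.
async def list_zone_instances(zone: str):
    client = get_compute_client()
    try:
        # Illustrative request path; Example #2 uses client.get the same way.
        return await client.get(f'/zones/{zone}/instances')
    finally:
        await client.close()  # assumed cleanup method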
Example #2
async def on_startup(app):
    db = Database()
    await db.async_init()
    app['db'] = db
    app['client_session'] = httpx.client_session()

    aiogoogle_credentials = aiogoogle.GoogleCredentials.from_file(
        '/billing-monitoring-gsa-key/key.json')

    bigquery_client = aiogoogle.GoogleBigQueryClient(
        'broad-ctsa', credentials=aiogoogle_credentials)
    app['bigquery_client'] = bigquery_client

    compute_client = aiogoogle.GoogleComputeClient(
        PROJECT, credentials=aiogoogle_credentials)
    app['compute_client'] = compute_client

    query_billing_event = asyncio.Event()
    app['query_billing_event'] = query_billing_event

    region_info = {
        name: await compute_client.get(f'/regions/{name}')
        for name in BATCH_GCP_REGIONS
    }
    zones = [url_basename(z) for r in region_info.values() for z in r['zones']]
    app['zones'] = zones

    app['task_manager'] = aiotools.BackgroundTaskManager()

    app['task_manager'].ensure_future(
        retry_long_running('polling_loop', polling_loop, app))

    app['task_manager'].ensure_future(
        retry_long_running('query_billing_loop', run_if_changed_idempotent,
                           query_billing_event, query_billing_body, app))

    app['task_manager'].ensure_future(periodically_call(
        60, monitor_disks, app))
    app['task_manager'].ensure_future(
        periodically_call(60, monitor_instances, app))
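A hook like this is registered on an aiohttp application, usually paired with a cleanup hook that tears everything down in reverse order. A minimal wiring sketch; `shutdown()` and `async_close()` are assumed method names, not confirmed by the example:

from aiohttp import web

async def on_cleanup(app):
    # Reverse of on_startup: stop background loops before closing the
    # clients and sessions they depend on.
    app['task_manager'].shutdown()      # assumed method name
    await app['client_session'].close()
    await app['db'].async_close()       # assumed method name

def make_app() -> web.Application:
    app = web.Application()
    app.on_startup.append(on_startup)
    app.on_cleanup.append(on_cleanup)
    return app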
Example #3
    def __init__(self, name, zone, project, instance_name, size_in_gb,
                 mount_path):
        assert size_in_gb >= 10
        # The disk name must be 63 characters or less; see the `name` field
        # documentation at
        # https://cloud.google.com/compute/docs/reference/rest/v1/disks#resource:-disk
        assert len(name) <= 63

        self.compute_client = aiogoogle.GoogleComputeClient(
            project,
            credentials=aiogoogle.GoogleCredentials.from_file(
                '/worker-key.json'))
        self.name = name
        self.zone = zone
        self.project = project
        self.instance_name = instance_name
        self.size_in_gb = size_in_gb
        self.mount_path = mount_path

        self._created = False
        self._attached = False

        self.disk_path = f'/dev/disk/by-id/google-{self.name}'
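The `disk_path` works because GCE exposes attached persistent disks under `/dev/disk/by-id/google-<device-name>`, so this assumes the disk is attached with its device name set to the disk name. A hypothetical instantiation, assuming the class is named `Disk` (the snippet omits the class header):

# Illustrative values only.
disk = Disk(name='batch-disk-1a2b3c', zone='us-central1-a',
            project='my-project', instance_name='worker-0',
            size_in_gb=10, mount_path='/io')
# The constructor only records state; the _created and _attached flags
# suggest the actual create/attach calls live in separate async methods.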
Example #4
async def on_startup(app):
    app['task_manager'] = aiotools.BackgroundTaskManager()

    app['client_session'] = httpx.client_session()

    kube.config.load_incluster_config()
    k8s_client = kube.client.CoreV1Api()
    k8s_cache = K8sCache(k8s_client, refresh_time=5)
    app['k8s_cache'] = k8s_cache

    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    row = await db.select_and_fetchone(
        '''
SELECT instance_id, internal_token, frozen FROM globals;
'''
    )

    instance_id = row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id

    app['internal_token'] = row['internal_token']

    app['batch_headers'] = {'Authorization': f'Bearer {row["internal_token"]}'}

    app['frozen'] = row['frozen']

    resources = db.select_and_fetchall('SELECT resource, rate FROM resources;')
    app['resource_rates'] = {record['resource']: record['rate'] async for record in resources}

    aiogoogle_credentials = aiogoogle.GoogleCredentials.from_file('/gsa-key/key.json')
    compute_client = aiogoogle.GoogleComputeClient(PROJECT, credentials=aiogoogle_credentials)
    app['compute_client'] = compute_client

    logging_client = aiogoogle.GoogleLoggingClient(
        credentials=aiogoogle_credentials,
        # The project-wide logging quota is 60 request/m.  The event
        # loop sleeps 15s per iteration, so the max rate is 4
        # iterations/m.  Note, the event loop could make multiple
        # logging requests per iteration, so these numbers are not
        # quite comparable.  We don't want to consume the entire quota,
        # since there will be other users of the logging API (the web
        # console, test deployments, etc.).
        rate_limit=RateLimit(10, 60),
    )
    app['logging_client'] = logging_client

    scheduler_state_changed = Notice()
    app['scheduler_state_changed'] = scheduler_state_changed

    cancel_ready_state_changed = asyncio.Event()
    app['cancel_ready_state_changed'] = cancel_ready_state_changed

    cancel_creating_state_changed = asyncio.Event()
    app['cancel_creating_state_changed'] = cancel_creating_state_changed

    cancel_running_state_changed = asyncio.Event()
    app['cancel_running_state_changed'] = cancel_running_state_changed

    async_worker_pool = AsyncWorkerPool(100, queue_size=100)
    app['async_worker_pool'] = async_worker_pool

    # Note: this re-reads the same key file loaded above; the storage
    # filesystem gets its own credentials object.
    credentials = aiogoogle.GoogleCredentials.from_file('/gsa-key/key.json')
    fs = aiogoogle.GoogleStorageAsyncFS(credentials=credentials)
    app['file_store'] = FileStore(fs, BATCH_BUCKET_NAME, instance_id)

    zone_monitor = ZoneMonitor(app)
    app['zone_monitor'] = zone_monitor
    await zone_monitor.async_init()

    inst_coll_configs = InstanceCollectionConfigs(app)
    await inst_coll_configs.async_init()

    inst_coll_manager = InstanceCollectionManager(app, MACHINE_NAME_PREFIX)
    app['inst_coll_manager'] = inst_coll_manager
    await inst_coll_manager.async_init(inst_coll_configs)

    canceller = Canceller(app)
    app['canceller'] = canceller
    await canceller.async_init()

    gce_event_monitor = GCEEventMonitor(app, MACHINE_NAME_PREFIX)
    app['gce_event_monitor'] = gce_event_monitor
    await gce_event_monitor.async_init()

    app['check_incremental_error'] = None
    app['check_resource_aggregation_error'] = None

    if HAIL_SHOULD_CHECK_INVARIANTS:
        app['task_manager'].ensure_future(periodically_call(10, check_incremental, app, db))
        app['task_manager'].ensure_future(periodically_call(10, check_resource_aggregation, app, db))

    app['task_manager'].ensure_future(periodically_call(10, monitor_billing_limits, app))

    app['task_manager'].ensure_future(periodically_call(10, cancel_fast_failing_batches, app))

    app['task_manager'].ensure_future(periodically_call(60, scheduling_cancelling_bump, app))

    app['task_manager'].ensure_future(periodically_call(15, monitor_system, app))
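Most of the background work above is scheduled through `periodically_call(interval, fn, *args)`, whose implementation isn't shown. A minimal sketch of what such a helper plausibly looks like; the real Hail helper likely adds logging and error handling:

import asyncio

async def periodically_call(interval: float, fn, *args):
    # Sketch only: await fn, then sleep `interval` seconds, forever.
    while True:
        await fn(*args)
        await asyncio.sleep(interval)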
Example #5
    @staticmethod
    async def create(
            app,
            db: Database,  # BORROWED
            machine_name_prefix: str,
            namespace: str,
            inst_coll_configs: InstanceCollectionConfigs,
            credentials_file: str,
            task_manager: aiotools.BackgroundTaskManager,  # BORROWED
    ) -> 'GCPDriver':
        gcp_config = get_gcp_config()
        project = gcp_config.project
        zone = gcp_config.zone
        regions = gcp_config.regions

        compute_client = aiogoogle.GoogleComputeClient(
            project, credentials_file=credentials_file)

        activity_logs_client = aiogoogle.GoogleLoggingClient(
            credentials_file=credentials_file,
            # The project-wide logging quota is 60 request/m.  The event
            # loop sleeps 15s per iteration, so the max rate is 4
            # iterations/m.  Note, the event loop could make multiple
            # logging requests per iteration, so these numbers are not
            # quite comparable.  We don't want to consume the entire quota,
            # since there will be other users of the logging API (the web
            # console, test deployments, etc.).
            rate_limit=RateLimit(10, 60),
        )

        zone_monitor = await ZoneMonitor.create(compute_client, regions, zone)
        billing_manager = await GCPBillingManager.create(db)
        inst_coll_manager = InstanceCollectionManager(db, machine_name_prefix,
                                                      zone_monitor)
        resource_manager = GCPResourceManager(project, compute_client,
                                              billing_manager)

        create_pools_coros = [
            Pool.create(
                app,
                db,
                inst_coll_manager,
                resource_manager,
                machine_name_prefix,
                config,
                app['async_worker_pool'],
                task_manager,
            ) for pool_name, config in
            inst_coll_configs.name_pool_config.items()
        ]

        jpim, *_ = await asyncio.gather(
            JobPrivateInstanceManager.create(
                app,
                db,
                inst_coll_manager,
                resource_manager,
                machine_name_prefix,
                inst_coll_configs.jpim_config,
                task_manager,
            ), *create_pools_coros)

        driver = GCPDriver(
            db,
            machine_name_prefix,
            compute_client,
            activity_logs_client,
            project,
            namespace,
            zone_monitor,
            inst_coll_manager,
            jpim,
            billing_manager,
        )

        task_manager.ensure_future(
            periodically_call(15, driver.process_activity_logs))
        task_manager.ensure_future(
            periodically_call(60, zone_monitor.update_region_quotas))
        task_manager.ensure_future(
            periodically_call(60, driver.delete_orphaned_disks))
        task_manager.ensure_future(
            periodically_call(300, billing_manager.refresh_resources))

        return driver
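One design note on the `asyncio.gather` call above: the `JobPrivateInstanceManager` coroutine and all the `Pool.create` coroutines run concurrently, and `jpim, *_ =` keeps only the first result by position, presumably because the pools register themselves with the `InstanceCollectionManager`. The unpacking idiom in isolation:

import asyncio

async def main():
    async def make(v):
        return v
    # gather preserves argument order, so the first result can be
    # unpacked by position and the rest discarded.
    first, *rest = await asyncio.gather(make(1), make(2), make(3))
    assert first == 1 and rest == [2, 3]

asyncio.run(main())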