Beispiel #1
0
 def setUp(self):
     """Set up a per-test remote path, a routed filesystem, and a memory client.

     The test path is built from the ``batch/remote_tmpdir`` user-config
     value followed by ``memory-tests/`` and a freshly generated UUID.
     """
     token = uuid.uuid4()
     remote_tmpdir = get_user_config().get('batch', 'remote_tmpdir')
     # NOTE(review): the config value is concatenated as-is, so it is
     # presumably expected to end with a path separator -- confirm.
     self.test_path = f'{remote_tmpdir}memory-tests/{token}'

     # Route through a single Google Storage filesystem.
     google_fs = aiogoogle.GoogleStorageAsyncFS(project=PROJECT)
     self.fs = RouterAsyncFS('gs', filesystems=[google_fs])
     self.client = BlockingMemoryClient(fs=self.fs)

     # Starts out empty for every test.
     self.temp_files = set()
Beispiel #2
0
def get_cloud_async_fs(credentials_file: Optional[str] = None) -> AsyncFS:
    """Return an async filesystem for the cloud named in the global config.

    Args:
        credentials_file: Path to the credentials file. When ``None``,
            ``'/gsa-key/key.json'`` is used.

    Returns:
        An ``aioazure.AzureAsyncFS`` when the configured cloud is
        ``'azure'``, otherwise an ``aiogoogle.GoogleStorageAsyncFS``.

    Raises:
        AssertionError: If the configured cloud is neither ``'azure'`` nor
            ``'gcp'``.
    """
    key_path = '/gsa-key/key.json' if credentials_file is None else credentials_file

    cloud = get_global_config()['cloud']

    if cloud != 'azure':
        # Only two clouds are supported; anything else is a configuration error.
        assert cloud == 'gcp', cloud
        return aiogoogle.GoogleStorageAsyncFS(credentials_file=key_path)

    # The Azure filesystem names this keyword ``credential_file`` (singular).
    return aioazure.AzureAsyncFS(credential_file=key_path)
Beispiel #3
0
async def on_startup(app):
    """Populate ``app`` with shared clients and start the GitHub polling loop.

    Stores the following entries on ``app``: ``fs``, ``client_session``,
    ``github_client``, ``batch_client`` and ``task_manager``, then schedules
    ``github_polling_loop`` (wrapped by ``retry_long_running``) on the task
    manager.
    """
    # Google Storage filesystem authenticated with the benchmark key file.
    gcs_credentials = aiogoogle.GoogleCredentials.from_file('/benchmark-gsa-key/key.json')
    app['fs'] = aiogoogle.GoogleStorageAsyncFS(credentials=gcs_credentials)

    # The GitHub client reuses the shared HTTP session.
    session = httpx.client_session()
    app['client_session'] = session
    app['github_client'] = gidgethub.aiohttp.GitHubAPI(
        session,
        'hail-is/hail',
        oauth_token=oauth_token,
    )

    app['batch_client'] = await bc.BatchClient.create(billing_project='benchmark')

    # Start the polling loop as a background task.
    task_manager = aiotools.BackgroundTaskManager()
    app['task_manager'] = task_manager
    polling_loop = retry_long_running('github_polling_loop', github_polling_loop, app)
    task_manager.ensure_future(polling_loop)
Beispiel #4
0
async def on_startup(app):
    """Populate ``app`` with shared state and schedule periodic background work.

    In order, this:

    1. creates the background task manager and the HTTP client session;
    2. loads the in-cluster Kubernetes config and builds a ``K8sCache``;
    3. initializes the database and copies the ``globals`` row
       (``instance_id``, ``internal_token``, ``frozen``) and the
       ``resources`` table (as ``resource_rates``) onto ``app``;
    4. builds the Google compute and logging clients and the file store;
    5. creates the scheduler/cancellation notification objects and the
       async worker pool;
    6. constructs and initializes the zone monitor, instance collection
       manager, canceller and GCE event monitor;
    7. schedules the periodic monitoring tasks on the task manager.

    Components are handed ``app`` itself, so the order of the assignments
    below is significant and is kept as-is.
    """
    task_manager = aiotools.BackgroundTaskManager()
    app['task_manager'] = task_manager

    app['client_session'] = httpx.client_session()

    # Kubernetes access, wrapped in a cache.
    kube.config.load_incluster_config()
    app['k8s_cache'] = K8sCache(kube.client.CoreV1Api(), refresh_time=5)

    # Database connection.
    db = Database()
    await db.async_init(maxsize=50)
    app['db'] = db

    # Instance-wide settings stored in the ``globals`` table.
    row = await db.select_and_fetchone(
        '''
SELECT instance_id, internal_token, frozen FROM globals;
'''
    )
    instance_id = row['instance_id']
    log.info(f'instance_id {instance_id}')
    app['instance_id'] = instance_id
    app['internal_token'] = row['internal_token']
    app['batch_headers'] = {'Authorization': f'Bearer {row["internal_token"]}'}
    app['frozen'] = row['frozen']

    # Map each resource name to its rate.
    resource_rates = {}
    async for record in db.select_and_fetchall('SELECT resource, rate FROM resources;'):
        resource_rates[record['resource']] = record['rate']
    app['resource_rates'] = resource_rates

    # Google API clients sharing one set of credentials.
    aiogoogle_credentials = aiogoogle.GoogleCredentials.from_file('/gsa-key/key.json')
    app['compute_client'] = aiogoogle.GoogleComputeClient(PROJECT, credentials=aiogoogle_credentials)

    # The project-wide logging quota is 60 requests/minute.  The event
    # loop sleeps 15s per iteration, so the max rate is 4 iterations per
    # minute.  The event loop could make multiple logging requests per
    # iteration, so these numbers are not quite comparable.  The limit
    # deliberately leaves part of the quota for other users of the logging
    # API (the web console, test deployments, etc.).
    logging_rate_limit = RateLimit(10, 60)
    app['logging_client'] = aiogoogle.GoogleLoggingClient(
        credentials=aiogoogle_credentials,
        rate_limit=logging_rate_limit,
    )

    # Notification primitives for the scheduler and the cancellation loops.
    app['scheduler_state_changed'] = Notice()
    app['cancel_ready_state_changed'] = asyncio.Event()
    app['cancel_creating_state_changed'] = asyncio.Event()
    app['cancel_running_state_changed'] = asyncio.Event()

    app['async_worker_pool'] = AsyncWorkerPool(100, queue_size=100)

    # The file store gets its own credentials object, loaded from the same key file.
    file_store_credentials = aiogoogle.GoogleCredentials.from_file('/gsa-key/key.json')
    file_store_fs = aiogoogle.GoogleStorageAsyncFS(credentials=file_store_credentials)
    app['file_store'] = FileStore(file_store_fs, BATCH_BUCKET_NAME, instance_id)

    # Each component is registered on ``app`` before its async_init runs.
    app['zone_monitor'] = ZoneMonitor(app)
    await app['zone_monitor'].async_init()

    # The configs object is only passed to the manager; it is not stored on ``app``.
    inst_coll_configs = InstanceCollectionConfigs(app)
    await inst_coll_configs.async_init()

    app['inst_coll_manager'] = InstanceCollectionManager(app, MACHINE_NAME_PREFIX)
    await app['inst_coll_manager'].async_init(inst_coll_configs)

    app['canceller'] = Canceller(app)
    await app['canceller'].async_init()

    app['gce_event_monitor'] = GCEEventMonitor(app, MACHINE_NAME_PREFIX)
    await app['gce_event_monitor'].async_init()

    app['check_incremental_error'] = None
    app['check_resource_aggregation_error'] = None

    # Periodic background tasks.  The first argument to periodically_call is
    # presumably the period in seconds -- confirm against its definition.
    spawn = task_manager.ensure_future

    if HAIL_SHOULD_CHECK_INVARIANTS:
        spawn(periodically_call(10, check_incremental, app, db))
        spawn(periodically_call(10, check_resource_aggregation, app, db))

    spawn(periodically_call(10, monitor_billing_limits, app))
    spawn(periodically_call(10, cancel_fast_failing_batches, app))
    spawn(periodically_call(60, scheduling_cancelling_bump, app))
    spawn(periodically_call(15, monitor_system, app))