def __init__(self, *, name: Optional[str] = None,
             backend: Optional[ServiceBackend] = None,
             image: Optional[str] = None,
             cpus_per_job: Optional[Union[int, str]] = None,
             wait_on_exit: bool = True,
             cleanup_bucket: bool = True,
             project: Optional[str] = None):
    self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
    self.backend = backend or ServiceBackend()
    if not isinstance(self.backend, ServiceBackend):
        raise ValueError(f'BatchPoolExecutor is not compatible with {type(backend)}')
    self.batches: List[Batch] = []
    self.directory = self.backend.remote_tmpdir + f'batch-pool-executor/{self.name}/'
    self.inputs = self.directory + 'inputs/'
    self.outputs = self.directory + 'outputs/'
    self.fs = RouterAsyncFS('file', gcs_kwargs={'project': project})
    self.futures: List[BatchPoolFuture] = []
    self.finished_future_count = 0
    self._shutdown = False
    version = sys.version_info
    if image is None:
        if version.major != 3 or version.minor not in (6, 7, 8):
            raise ValueError(
                f'You must specify an image if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})')
        self.image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'
    else:
        self.image = image
    self.cpus_per_job = cpus_per_job
    self.cleanup_bucket = cleanup_bucket
    self.wait_on_exit = wait_on_exit
def init_service(billing_project: str = None,
                 bucket: str = None,
                 log=None,
                 quiet=False,
                 append=False,
                 tmpdir=None,
                 local_tmpdir=None,
                 default_reference='GRCh37',
                 global_seed=6348563392232659379,
                 skip_logging_configuration=False,
                 *,
                 disable_progress_bar=True):
    from hail.backend.service_backend import ServiceBackend
    backend = ServiceBackend(billing_project, bucket,
                             skip_logging_configuration=skip_logging_configuration,
                             disable_progress_bar=disable_progress_bar)

    log = _get_log(log)
    if tmpdir is None:
        tmpdir = 'gs://' + backend.bucket + '/tmp/hail/' + secret_alnum_string()
    assert tmpdir.startswith('gs://')
    local_tmpdir = _get_local_tmpdir(local_tmpdir)

    HailContext(log, quiet, append, tmpdir, local_tmpdir, default_reference,
                global_seed, backend)
async def gs_filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS(
                'file', filesystems=[LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()])
        else:
            assert request.param.endswith('gs')
            fs = GoogleStorageAsyncFS()
        async with fs:
            test_storage_uri = os.environ['HAIL_TEST_STORAGE_URI']
            protocol = 'gs://'
            assert test_storage_uri[:len(protocol)] == protocol
            base = f'{test_storage_uri}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
async def filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS(
                'file', [LocalAsyncFS(thread_pool), GoogleStorageAsyncFS()])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        else:
            fs = GoogleStorageAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            else:
                assert request.param.endswith('gs')
                bucket = os.environ['HAIL_TEST_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
def __init__(self, *, name: Optional[str] = None,
             backend: Optional[ServiceBackend] = None,
             image: Optional[str] = None,
             cpus_per_job: Optional[Union[int, str]] = None,
             wait_on_exit: bool = True,
             cleanup_bucket: bool = True,
             project: Optional[str] = None):
    self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
    self.backend = backend or ServiceBackend()
    if not isinstance(self.backend, ServiceBackend):
        raise ValueError(f'BatchPoolExecutor is not compatible with {type(backend)}')
    self.batches: List[Batch] = []
    bucket: str = self.backend._bucket_name
    self.directory = f'gs://{bucket}/batch-pool-executor/{self.name}/'
    self.inputs = self.directory + 'inputs/'
    self.outputs = self.directory + 'outputs/'
    self.gcs = GCS(blocking_pool=concurrent.futures.ThreadPoolExecutor(),
                   project=project)
    self.futures: List[BatchPoolFuture] = []
    self.finished_future_count = 0
    self._shutdown = False
    version = sys.version_info
    self.image = image or f'hailgenetics/python-dill:{version.major}.{version.minor}'
    self.cpus_per_job = cpus_per_job or 1
    self.cleanup_bucket = cleanup_bucket
    self.wait_on_exit = wait_on_exit
async def init_service(billing_project: Optional[str] = None,
                       remote_tmpdir: Optional[str] = None,
                       log=None,
                       quiet=False,
                       append=False,
                       tmpdir=None,
                       local_tmpdir=None,
                       default_reference='GRCh37',
                       global_seed=6348563392232659379,
                       skip_logging_configuration=False,
                       *,
                       disable_progress_bar=True):
    from hail.backend.service_backend import ServiceBackend
    backend = await ServiceBackend.create(billing_project=billing_project,
                                          remote_tmpdir=remote_tmpdir,
                                          skip_logging_configuration=skip_logging_configuration,
                                          disable_progress_bar=disable_progress_bar)

    log = _get_log(log)
    if tmpdir is None:
        tmpdir = backend.remote_tmpdir + 'tmp/hail/' + secret_alnum_string()
    local_tmpdir = _get_local_tmpdir(local_tmpdir)

    await HailContext.async_create(log, quiet, append, tmpdir, local_tmpdir,
                                   default_reference, global_seed, backend)
def TemporaryFilename(*,
                      prefix: str = '',
                      suffix: str = '',
                      dir: Optional[str] = None) -> _TemporaryFilenameManager:
    """A context manager which produces a temporary filename that is deleted when the context
    manager exits.

    Warning
    -------

    The filename is generated randomly and is extraordinarily unlikely to already exist, but this
    function does not satisfy the strict requirements of Python's :class:`.TemporaryFilename`.

    Examples
    --------

    >>> with TemporaryFilename() as f:  # doctest: +SKIP
    ...     open(f, 'w').write('hello hail')
    ...     print(open(f).read())
    hello hail

    Returns
    -------
    :class:`._TemporaryFilenameManager`
    """
    if dir is None:
        dir = tmp_dir()
    if not dir.endswith('/'):
        dir = dir + '/'
    return _TemporaryFilenameManager(
        current_backend().fs,
        dir + prefix + secret_alnum_string(10) + suffix)
async def create_copy_paste_token(db, session_id, max_age_secs=300):
    copy_paste_token = secret_alnum_string()
    await db.just_execute(
        "INSERT INTO copy_paste_tokens (id, session_id, max_age_secs) VALUES(%s, %s, %s);",
        (copy_paste_token, session_id, max_age_secs),
    )
    return copy_paste_token
def _new_job_resource_file(self, source, value=None):
    if value is None:
        value = secret_alnum_string(5)
    jrf = JobResourceFile(value)
    jrf._add_source(source)
    self._resource_map[jrf._uid] = jrf  # pylint: disable=no-member
    return jrf
def __init__(self, file_store, batch_id):
    self.file_store = file_store
    self.batch_id = batch_id
    self.token = secret_alnum_string(16)
    self._data_bytes = bytearray()
    self._offsets_bytes = bytearray()
    self._n_elements = 0
async def post_create_resource(request, userdata):  # pylint: disable=unused-argument
    db = request.app['db']
    storage_client = request.app['storage_client']

    checked_csrf = False
    attachments = {}
    post = {}
    reader = aiohttp.MultipartReader(request.headers, request.content)
    while True:
        part = await reader.next()  # pylint: disable=not-callable
        if not part:
            break
        if part.name == '_csrf':
            # check csrf token
            # form fields are delivered in order, the _csrf hidden field should appear first
            # https://stackoverflow.com/questions/7449861/multipart-upload-form-is-order-guaranteed
            token1 = request.cookies.get('_csrf')
            token2 = await part.text()
            if token1 is None or token2 is None or token1 != token2:
                log.info('request made with invalid csrf tokens')
                raise web.HTTPUnauthorized()
            checked_csrf = True
        elif part.name == 'file':
            if not checked_csrf:
                raise web.HTTPUnauthorized()
            filename = part.filename
            if not filename:
                continue
            attachment_id = secret_alnum_string()
            async with await storage_client.insert_object(BUCKET, f'atgu/attachments/{attachment_id}') as f:
                while True:
                    chunk = await part.read_chunk()
                    if not chunk:
                        break
                    await f.write(chunk)
            attachments[attachment_id] = filename
        else:
            post[part.name] = await part.text()

    if not checked_csrf:
        raise web.HTTPUnauthorized()

    now = time_msecs()
    id = await db.execute_insertone(
        '''
INSERT INTO `atgu_resources` (`time_created`, `title`, `description`, `contents`, `tags`, `attachments`, `time_updated`)
VALUES (%s, %s, %s, %s, %s, %s, %s);
''',
        (now, post['title'], post['description'], post['contents'], post['tags'], json.dumps(attachments), now),
    )

    return web.HTTPFound(deploy_config.external_url('atgu', f'/resources/{id}'))
async def _async_execute_untimed(self, ir):
    token = secret_alnum_string()
    with TemporaryDirectory(ensure_exists=False) as dir:
        async def create_inputs():
            with self.fs.open(dir + '/in', 'wb') as infile:
                write_int(infile, ServiceBackend.EXECUTE)
                write_str(infile, tmp_dir())
                write_str(infile, self.billing_project)
                write_str(infile, self.bucket)
                write_str(infile, self.render(ir))
                write_str(infile, token)

        async def create_batch():
            batch_attributes = self.batch_attributes
            if 'name' not in batch_attributes:
                batch_attributes = {**batch_attributes, 'name': 'execute(...)'}
            bb = self.async_bc.create_batch(token=token, attributes=batch_attributes)

            j = bb.create_jvm_job([
                'is.hail.backend.service.ServiceBackendSocketAPI2',
                os.environ['HAIL_SHA'],
                os.environ['HAIL_JAR_URL'],
                batch_attributes['name'],
                dir + '/in',
                dir + '/out',
            ], mount_tokens=True)
            return (j, await bb.submit(disable_progress_bar=self.disable_progress_bar))

        _, (j, b) = await asyncio.gather(create_inputs(), create_batch())
        status = await b.wait(disable_progress_bar=self.disable_progress_bar)
        if status['n_succeeded'] != 1:
            raise ValueError(f'batch failed {status} {await j.log()}')

        with self.fs.open(dir + '/out', 'rb') as outfile:
            success = read_bool(outfile)
            if success:
                s = read_str(outfile)
                try:
                    resp = json.loads(s)
                except json.decoder.JSONDecodeError as err:
                    raise ValueError(f'could not decode {s}') from err
            else:
                jstacktrace = read_str(outfile)
                raise FatalError(jstacktrace)

        typ = dtype(resp['type'])
        if typ == tvoid:
            x = None
        else:
            x = typ._convert_from_json_na(resp['value'])

        return x
async def local_filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        async with LocalAsyncFS(thread_pool) as fs:
            base = f'/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
def setUpClass(cls):
    cls.remote_tmpdir = os.environ['HAIL_TEST_STORAGE_URI']
    if cls.remote_tmpdir[-1] == '/':
        cls.remote_tmpdir = cls.remote_tmpdir[:-1]

    local_tmpdir = _get_local_tmpdir(None)
    local_tmpdir = local_tmpdir[len('file://'):]
    cls.local_dir = os.path.join(local_tmpdir, secret_alnum_string(5))
    os.makedirs(cls.local_dir)
    with open(os.path.join(cls.local_dir, 'randomBytes'), 'wb') as f:
        f.write(secrets.token_bytes(2048))
def _new_resource_group(self, source, mappings, root=None):
    assert isinstance(mappings, dict)
    if root is None:
        root = secret_alnum_string(5)
    d = {}
    new_resource_map = {}
    for name, code in mappings.items():
        if not isinstance(code, str):
            raise BatchException(f"value for name '{name}' is not a string. Found '{type(code)}' instead.")
        r = self._new_job_resource_file(source=source, value=eval(f'f"""{code}"""'))  # pylint: disable=W0123
        d[name] = r
        new_resource_map[r._uid] = r  # pylint: disable=no-member

    self._resource_map.update(new_resource_map)
    rg = _resource.ResourceGroup(source, root, **d)
    self._resource_map.update({rg._uid: rg})
    return rg
def __init__(self, sema: asyncio.Semaphore, fs: 'GoogleStorageAsyncFS', dest_url: str, num_parts: int):
    self._sema = sema
    self._fs = fs
    self._dest_url = dest_url
    self._num_parts = num_parts
    bucket, dest_name = fs._get_bucket_name(dest_url)
    self._bucket = bucket
    self._dest_name = dest_name

    # compute dest_dirname so gs://{bucket}/{dest_dirname}file
    # refers to a file in dest_dirname with no double slashes
    dest_dirname = os.path.dirname(dest_name)
    if dest_dirname:
        dest_dirname = dest_dirname + '/'
    self._dest_dirname = dest_dirname

    self._token = secret_alnum_string()
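The comment in that constructor explains why dest_dirname is either empty or ends in a single slash. A minimal standalone sketch of the same invariant, using a hypothetical helper name and only os.path.dirname semantics:

import os

def _dest_dirname(dest_name: str) -> str:
    # mirror the constructor: dirname plus a trailing slash, or '' for a bare object name
    d = os.path.dirname(dest_name)
    return d + '/' if d else ''

# f'gs://{bucket}/{_dest_dirname(name)}part0' never contains a double slash:
assert _dest_dirname('a/b/part') == 'a/b/'   # -> gs://bucket/a/b/part0
assert _dest_dirname('part') == ''           # -> gs://bucket/part0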
async def async_request(self, endpoint, **data):
    data['token'] = secret_alnum_string()
    session = await self.session()
    async with session.ws_connect(f'{self.url}/api/v1alpha/{endpoint}') as socket:
        await socket.send_str(json.dumps(data))
        response = await socket.receive()
        await socket.send_str('bye')
        if response.type == aiohttp.WSMsgType.ERROR:
            raise ValueError(f'bad response: {endpoint}; {data}; {response}')
        if response.type in (aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSED):
            warnings.warn(f'retrying after losing connection {endpoint}; {data}; {response}')
            raise TransientError()
        assert response.type == aiohttp.WSMsgType.TEXT
        result = json.loads(response.data)
        if result['status'] != 200:
            raise FatalError(f'Error from server: {result["value"]}')
        return result['value']
async def test_compose():
    bucket = os.environ['HAIL_TEST_BUCKET']
    token = secret_alnum_string()

    part_data = [b'a', b'bb', b'ccc']

    async with StorageClient() as client:
        for i, b in enumerate(part_data):
            async with await client.insert_object(bucket, f'{token}/{i}') as f:
                await f.write(b)
        await client.compose(bucket, [f'{token}/{i}' for i in range(len(part_data))], f'{token}/combined')

        expected = b''.join(part_data)
        async with await client.get_object(bucket, f'{token}/combined') as f:
            actual = await f.read()
        assert actual == expected
async def filesystem(request) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, str]]:
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        fs: AsyncFS
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file', filesystems=[
                LocalAsyncFS(thread_pool),
                GoogleStorageAsyncFS(),
                S3AsyncFS(thread_pool),
                AzureAsyncFS()
            ])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        elif request.param.endswith('gs'):
            fs = GoogleStorageAsyncFS()
        elif request.param.endswith('s3'):
            fs = S3AsyncFS(thread_pool)
        else:
            assert request.param.endswith('hail-az')
            fs = AzureAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            elif request.param.endswith('gs'):
                bucket = os.environ['HAIL_TEST_GCS_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'
            elif request.param.endswith('s3'):
                bucket = os.environ['HAIL_TEST_S3_BUCKET']
                base = f's3://{bucket}/tmp/{token}/'
            else:
                assert request.param.endswith('hail-az')
                account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
                container = os.environ['HAIL_TEST_AZURE_CONTAINER']
                base = f'hail-az://{account}/{container}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
def load_references_from_dataset(self, path):
    token = secret_alnum_string()
    with TemporaryDirectory(ensure_exists=False) as dir:
        with self.fs.open(dir + '/in', 'wb') as infile:
            write_int(infile, ServiceBackend.LOAD_REFERENCES_FROM_DATASET)
            write_str(infile, tmp_dir())
            write_str(infile, self.billing_project)
            write_str(infile, self.bucket)
            write_str(infile, path)

        batch_attributes = self.batch_attributes
        if 'name' not in batch_attributes:
            batch_attributes = {**batch_attributes, 'name': 'load_references_from_dataset(...)'}
        bb = self.bc.create_batch(token=token, attributes=batch_attributes)

        j = bb.create_jvm_job([
            'is.hail.backend.service.ServiceBackendSocketAPI2',
            os.environ['HAIL_SHA'],
            os.environ['HAIL_JAR_URL'],
            batch_attributes['name'],
            dir + '/in',
            dir + '/out',
        ], mount_tokens=True)
        b = bb.submit(disable_progress_bar=self.disable_progress_bar)
        status = b.wait(disable_progress_bar=self.disable_progress_bar)
        if status['n_succeeded'] != 1:
            raise ValueError(f'batch failed {status} {j.log()}')

        with self.fs.open(dir + '/out', 'rb') as outfile:
            success = read_bool(outfile)
            if success:
                s = read_str(outfile)
                try:
                    # FIXME: do we not have to parse the result?
                    return json.loads(s)
                except json.decoder.JSONDecodeError as err:
                    raise ValueError(f'could not decode {s}') from err
            else:
                jstacktrace = read_str(outfile)
                raise FatalError(jstacktrace)
def TemporaryDirectory(*,
                       prefix: str = '',
                       suffix: str = '',
                       dir: Optional[str] = None,
                       ensure_exists: bool = True) -> _TemporaryDirectoryManager:
    """A context manager which produces a temporary directory name that is recursively deleted
    when the context manager exits.

    If the filesystem has a notion of directories, then we ensure the directory exists.

    Warning
    -------

    The directory name is generated randomly and is extraordinarily unlikely to already exist, but
    this function does not satisfy the strict requirements of Python's :class:`.TemporaryDirectory`.

    Examples
    --------

    >>> with TemporaryDirectory() as dir:  # doctest: +SKIP
    ...     open(f'{dir}/hello', 'w').write('hello hail')
    ...     print(open(f'{dir}/hello').read())
    hello hail

    Returns
    -------
    :class:`._TemporaryDirectoryManager`
    """
    if dir is None:
        dir = tmp_dir()
    if not dir.endswith('/'):
        dir = dir + '/'
    dirname = dir + prefix + secret_alnum_string(10) + suffix
    fs = current_backend().fs
    if ensure_exists:
        fs.mkdir(dirname)
    return _TemporaryDirectoryManager(fs, dirname)
def blockmatrix_type(self, bmir):
    token = secret_alnum_string()
    with TemporaryDirectory(ensure_exists=False) as dir:
        with self.fs.open(dir + '/in', 'wb') as infile:
            write_int(infile, ServiceBackend.BLOCK_MATRIX_TYPE)
            write_str(infile, tmp_dir())
            write_str(infile, self.render(bmir))

        batch_attributes = self.batch_attributes
        if 'name' not in batch_attributes:
            batch_attributes = {**batch_attributes, 'name': 'blockmatrix_type(...)'}
        bb = self.bc.create_batch(token=token, attributes=batch_attributes)

        j = bb.create_jvm_job([
            'is.hail.backend.service.ServiceBackendSocketAPI2',
            os.environ['HAIL_SHA'],
            os.environ['HAIL_JAR_URL'],
            batch_attributes['name'],
            dir + '/in',
            dir + '/out',
        ], mount_tokens=True)
        b = bb.submit(disable_progress_bar=self.disable_progress_bar)
        status = b.wait(disable_progress_bar=self.disable_progress_bar)
        if status['n_succeeded'] != 1:
            raise ValueError(f'batch failed {status} {j.log()}')

        with self.fs.open(dir + '/out', 'rb') as outfile:
            success = read_bool(outfile)
            if success:
                s = read_str(outfile)
                try:
                    return tblockmatrix._from_json(json.loads(s))
                except json.decoder.JSONDecodeError as err:
                    raise ValueError(f'could not decode {s}') from err
            else:
                jstacktrace = read_str(outfile)
                raise FatalError(jstacktrace)
async def schedule_loop_body(self):
    if self.app['frozen']:
        log.info(f'not scheduling any jobs for {self.pool}; batch is frozen')
        return True

    log.info(f'schedule {self.pool}: starting')
    start = time_msecs()
    n_scheduled = 0

    user_resources = await self.compute_fair_share()

    total = sum(resources['allocated_cores_mcpu'] for resources in user_resources.values())
    if not total:
        log.info(f'schedule {self.pool}: no allocated cores')
        should_wait = True
        return should_wait
    user_share = {
        user: max(int(300 * resources['allocated_cores_mcpu'] / total + 0.5), 20)
        for user, resources in user_resources.items()
    }

    async def user_runnable_jobs(user, remaining):
        async for batch in self.db.select_and_fetchall(
            '''
SELECT batches.id, batches_cancelled.id IS NOT NULL AS cancelled, userdata, user, format_version
FROM batches
LEFT JOIN batches_cancelled ON batches.id = batches_cancelled.id
WHERE user = %s AND `state` = 'running';
''',
            (user,),
            "user_runnable_jobs__select_running_batches",
        ):
            async for record in self.db.select_and_fetchall(
                '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1 AND inst_coll = %s
LIMIT %s;
''',
                (batch['id'], self.pool.name, remaining.value),
                "user_runnable_jobs__select_ready_always_run_jobs",
            ):
                record['batch_id'] = batch['id']
                record['userdata'] = batch['userdata']
                record['user'] = batch['user']
                record['format_version'] = batch['format_version']
                yield record
            if not batch['cancelled']:
                async for record in self.db.select_and_fetchall(
                    '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND inst_coll = %s AND cancelled = 0
LIMIT %s;
''',
                    (batch['id'], self.pool.name, remaining.value),
                    "user_runnable_jobs__select_ready_jobs_batch_not_cancelled",
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record

    waitable_pool = WaitableSharedPool(self.async_worker_pool)

    should_wait = True
    for user, resources in user_resources.items():
        allocated_cores_mcpu = resources['allocated_cores_mcpu']
        if allocated_cores_mcpu == 0:
            continue

        scheduled_cores_mcpu = 0
        share = user_share[user]

        remaining = Box(share)
        async for record in user_runnable_jobs(user, remaining):
            batch_id = record['batch_id']
            job_id = record['job_id']
            id = (batch_id, job_id)
            attempt_id = secret_alnum_string(6)
            record['attempt_id'] = attempt_id

            if scheduled_cores_mcpu + record['cores_mcpu'] > allocated_cores_mcpu:
                if random.random() > self.exceeded_shares_counter.rate():
                    self.exceeded_shares_counter.push(True)
                    self.scheduler_state_changed.set()
                    break
                self.exceeded_shares_counter.push(False)

            instance = self.pool.get_instance(user, record['cores_mcpu'])
            if instance:
                instance.adjust_free_cores_in_memory(-record['cores_mcpu'])
                scheduled_cores_mcpu += record['cores_mcpu']
                n_scheduled += 1

                async def schedule_with_error_handling(app, record, id, instance):
                    try:
                        await schedule_job(app, record, instance)
                    except Exception:
                        log.info(f'scheduling job {id} on {instance} for {self.pool}', exc_info=True)

                await waitable_pool.call(schedule_with_error_handling, self.app, record, id, instance)

            remaining.value -= 1
            if remaining.value <= 0:
                should_wait = False
                break

    await waitable_pool.wait()

    end = time_msecs()
    log.info(f'schedule: attempted to schedule {n_scheduled} jobs in {end - start}ms for {self.pool}')

    return should_wait
async def schedule_loop_body(self):
    log.info(f'schedule {self.pool}: starting')
    start = time_msecs()
    n_scheduled = 0

    user_resources = await self.compute_fair_share()

    total = sum(resources['allocated_cores_mcpu'] for resources in user_resources.values())
    if not total:
        log.info(f'schedule {self.pool}: no allocated cores')
        should_wait = True
        return should_wait
    user_share = {
        user: max(int(300 * resources['allocated_cores_mcpu'] / total + 0.5), 20)
        for user, resources in user_resources.items()
    }

    async def user_runnable_jobs(user, remaining):
        async for batch in self.db.select_and_fetchall(
            '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
            (user,),
            timer_description=f'in schedule {self.pool}: get {user} running batches',
        ):
            async for record in self.db.select_and_fetchall(
                '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1 AND inst_coll = %s
LIMIT %s;
''',
                (batch['id'], self.pool.name, remaining.value),
                timer_description=f'in schedule {self.pool}: get {user} batch {batch["id"]} runnable jobs (1)',
            ):
                record['batch_id'] = batch['id']
                record['userdata'] = batch['userdata']
                record['user'] = batch['user']
                record['format_version'] = batch['format_version']
                yield record
            if not batch['cancelled']:
                async for record in self.db.select_and_fetchall(
                    '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND inst_coll = %s AND cancelled = 0
LIMIT %s;
''',
                    (batch['id'], self.pool.name, remaining.value),
                    timer_description=f'in schedule {self.pool}: get {user} batch {batch["id"]} runnable jobs (2)',
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record

    waitable_pool = WaitableSharedPool(self.async_worker_pool)

    def get_instance(user, cores_mcpu):
        i = self.pool.healthy_instances_by_free_cores.bisect_key_left(cores_mcpu)
        while i < len(self.pool.healthy_instances_by_free_cores):
            instance = self.pool.healthy_instances_by_free_cores[i]
            assert cores_mcpu <= instance.free_cores_mcpu
            return instance
            i += 1
        histogram = collections.defaultdict(int)
        for instance in self.pool.healthy_instances_by_free_cores:
            histogram[instance.free_cores_mcpu] += 1
        log.info(f'schedule {self.pool}: no viable instances for {cores_mcpu}: {histogram}')
        return None

    should_wait = True
    for user, resources in user_resources.items():
        allocated_cores_mcpu = resources['allocated_cores_mcpu']
        if allocated_cores_mcpu == 0:
            continue

        scheduled_cores_mcpu = 0
        share = user_share[user]

        log.info(f'schedule {self.pool}: user-share: {user}: {allocated_cores_mcpu} {share}')

        remaining = Box(share)
        async for record in user_runnable_jobs(user, remaining):
            batch_id = record['batch_id']
            job_id = record['job_id']
            id = (batch_id, job_id)
            attempt_id = secret_alnum_string(6)
            record['attempt_id'] = attempt_id

            if scheduled_cores_mcpu + record['cores_mcpu'] > allocated_cores_mcpu:
                if random.random() > self.exceeded_shares_counter.rate():
                    self.exceeded_shares_counter.push(True)
                    self.scheduler_state_changed.set()
                    break
                self.exceeded_shares_counter.push(False)

            instance = get_instance(user, record['cores_mcpu'])
            if instance:
                instance.adjust_free_cores_in_memory(-record['cores_mcpu'])
                scheduled_cores_mcpu += record['cores_mcpu']
                n_scheduled += 1
                should_wait = False

                async def schedule_with_error_handling(app, record, id, instance):
                    try:
                        await schedule_job(app, record, instance)
                    except Exception:
                        log.info(f'scheduling job {id} on {instance} for {self.pool}', exc_info=True)

                await waitable_pool.call(schedule_with_error_handling, self.app, record, id, instance)

            remaining.value -= 1
            if remaining.value <= 0:
                break

    await waitable_pool.wait()

    end = time_msecs()
    log.info(f'schedule: scheduled {n_scheduled} jobs in {end - start}ms for {self.pool}')

    return should_wait
def _new_python_result(self, source, value=None) -> _resource.PythonResult:
    if value is None:
        value = secret_alnum_string(5)
    jrf = _resource.PythonResult(value, source)
    self._resource_map[jrf._uid] = jrf  # pylint: disable=no-member
    return jrf
import os
import datetime

from hailtop.utils import secret_alnum_string

HAIL_BENCHMARK_BUCKET_NAME = os.environ['HAIL_BENCHMARK_BUCKET_NAME']

INSTANCE_ID = os.environ.get('INSTANCE_ID')
if INSTANCE_ID is None:
    INSTANCE_ID = secret_alnum_string(12)

BENCHMARK_RESULTS_PATH = f'gs://{HAIL_BENCHMARK_BUCKET_NAME}/benchmark-test/{INSTANCE_ID}'

START_POINT = os.environ.get('START_POINT')
if START_POINT is None:
    now = datetime.datetime.now()
    start_point = now - datetime.timedelta(days=1)
    START_POINT = start_point.strftime("%Y-%m-%dT%H:%M:%SZ")
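For reference, the default START_POINT computed above is the current time minus one day rendered with "%Y-%m-%dT%H:%M:%SZ"; the instant below is illustrative only:

# datetime.datetime(2021, 6, 2, 12, 0) - datetime.timedelta(days=1)
# start_point.strftime("%Y-%m-%dT%H:%M:%SZ")  ->  '2021-06-01T12:00:00Z'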
def push(self, success: bool):
    token = secret_alnum_string(6)
    self._global_counter.push(token, success)
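This push() is paired with rate() in the scheduling loops in this section (schedule_loop_body and create_instances_loop_body below). A minimal sketch of that back-off pattern, assuming only that the counter exposes rate() -> float and push(success: bool) as at those call sites:

import random

def backoff_on_exceeded_share(counter, over_share: bool) -> bool:
    """Return True if the caller should stop scheduling this user for now."""
    if not over_share:
        return False
    if random.random() > counter.rate():
        counter.push(True)    # record the stop, as the schedulers do before break
        return True
    counter.push(False)       # record that scheduling continued past the share
    return False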
async def create_instances_loop_body(self):
    log.info(f'create_instances for {self}: starting')
    start = time_msecs()
    n_instances_created = 0

    user_resources = await self.compute_fair_share()

    total = sum(resources['n_allocated_jobs'] for resources in user_resources.values())
    if not total:
        log.info(f'create_instances {self}: no allocated jobs')
        should_wait = True
        return should_wait
    user_share = {
        user: max(int(300 * resources['n_allocated_jobs'] / total + 0.5), 20)
        for user, resources in user_resources.items()
    }

    async def user_runnable_jobs(user, remaining):
        async for batch in self.db.select_and_fetchall(
            '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
            (user,),
            timer_description=f'in create_instances {self}: get {user} running batches',
        ):
            async for record in self.db.select_and_fetchall(
                '''
SELECT jobs.job_id, jobs.spec, jobs.cores_mcpu,
  COALESCE(SUM(instances.state IS NOT NULL AND (instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 1 AND jobs.inst_coll = %s
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu
HAVING live_attempts = 0
LIMIT %s;
''',
                (batch['id'], self.name, remaining.value),
                timer_description=f'in create_instances {self}: get {user} batch {batch["id"]} runnable jobs (1)',
            ):
                record['batch_id'] = batch['id']
                record['userdata'] = batch['userdata']
                record['user'] = batch['user']
                record['format_version'] = batch['format_version']
                yield record
            if not batch['cancelled']:
                async for record in self.db.select_and_fetchall(
                    '''
SELECT jobs.job_id, jobs.spec, jobs.cores_mcpu,
  COALESCE(SUM(instances.state IS NOT NULL AND (instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 0 AND jobs.inst_coll = %s AND cancelled = 0
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu
HAVING live_attempts = 0
LIMIT %s;
''',
                    (batch['id'], self.name, remaining.value),
                    timer_description=f'in create_instances {self}: get {user} batch {batch["id"]} runnable jobs (2)',
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record

    waitable_pool = WaitableSharedPool(self.async_worker_pool)

    should_wait = True
    for user, resources in user_resources.items():
        n_allocated_instances = resources['n_allocated_jobs']
        if n_allocated_instances == 0:
            continue

        n_user_instances_created = 0

        share = user_share[user]
        log.info(f'create_instances {self}: user-share: {user}: {share}')

        remaining = Box(share)
        async for record in user_runnable_jobs(user, remaining):
            batch_id = record['batch_id']
            job_id = record['job_id']
            id = (batch_id, job_id)
            attempt_id = secret_alnum_string(6)
            record['attempt_id'] = attempt_id

            if n_user_instances_created >= n_allocated_instances:
                if random.random() > self.exceeded_shares_counter.rate():
                    self.exceeded_shares_counter.push(True)
                    self.scheduler_state_changed.set()
                    break
                self.exceeded_shares_counter.push(False)

            n_instances_created += 1
            n_user_instances_created += 1
            should_wait = False

            log.info(f'creating job private instance for job {id}')

            async def create_instance_with_error_handling(batch_id, job_id, attempt_id, record, id):
                try:
                    batch_format_version = BatchFormatVersion(record['format_version'])
                    spec = json.loads(record['spec'])
                    machine_spec = batch_format_version.get_spec_machine_spec(spec)
                    instance, resources = await self.create_instance(batch_id, job_id, machine_spec)
                    await mark_job_creating(self.app, batch_id, job_id, attempt_id, instance, time_msecs(), resources)
                except Exception:
                    log.info(f'creating job private instance for job {id}', exc_info=True)

            await waitable_pool.call(create_instance_with_error_handling, batch_id, job_id, attempt_id, record, id)

            remaining.value -= 1
            if remaining.value <= 0:
                break

    await waitable_pool.wait()

    end = time_msecs()
    log.info(f'create_instances: created instances for {n_instances_created} jobs in {end - start}ms for {self}')

    await asyncio.sleep(15)  # ensure we don't create more instances than GCE limit

    return should_wait
async def post_edit_resource(request, userdata):  # pylint: disable=unused-argument
    db = request.app['db']
    storage_client = request.app['storage_client']
    id = int(request.match_info['id'])
    old_record = await db.select_and_fetchone(
        '''
SELECT attachments FROM atgu_resources
WHERE id = %s;
''',
        (id),
    )
    if not old_record:
        raise web.HTTPNotFound()

    old_attachments = json.loads(old_record['attachments'])

    checked_csrf = False
    attachments = {}
    post = {}
    reader = aiohttp.MultipartReader(request.headers, request.content)
    while True:
        part = await reader.next()  # pylint: disable=not-callable
        if not part:
            break
        if part.name == '_csrf':
            # check csrf token
            token1 = request.cookies.get('_csrf')
            token2 = await part.text()
            if token1 is None or token2 is None or token1 != token2:
                log.info('request made with invalid csrf tokens')
                raise web.HTTPUnauthorized()
            checked_csrf = True
        elif part.name == 'attachment':
            if not checked_csrf:
                raise web.HTTPUnauthorized()
            attachment_id = await part.text()
            assert attachment_id in old_attachments
            attachments[attachment_id] = old_attachments[attachment_id]
        elif part.name == 'file':
            filename = part.filename
            if not filename:
                continue
            attachment_id = secret_alnum_string()
            async with await storage_client.insert_object(BUCKET, f'atgu/attachments/{attachment_id}') as f:
                while True:
                    chunk = await part.read_chunk()
                    if not chunk:
                        break
                    await f.write(chunk)
            attachments[attachment_id] = filename
        else:
            post[part.name] = await part.text()

    if not checked_csrf:
        raise web.HTTPUnauthorized()

    now = time_msecs()
    await db.execute_update(
        '''
UPDATE atgu_resources SET
  title = %s,
  description = %s,
  contents = %s,
  tags = %s,
  attachments = %s,
  time_updated = %s
WHERE id = %s
''',
        (post['title'], post['description'], post['contents'], post['tags'], json.dumps(attachments), now, id),
    )

    return web.HTTPFound(deploy_config.external_url('atgu', f'/resources/{id}'))
def read_input_group(self, **kwargs):
    """
    Create a new resource group representing a mapping of identifier to
    input resource files.

    Examples
    --------

    Read a binary PLINK file:

    >>> b = Batch()
    >>> bfile = b.read_input_group(bed="data/example.bed",
    ...                            bim="data/example.bim",
    ...                            fam="data/example.fam")
    >>> j = b.new_job()
    >>> j.command(f"plink --bfile {bfile} --geno --make-bed --out {j.geno}")
    >>> j.command(f"wc -l {bfile.fam}")
    >>> j.command(f"wc -l {bfile.bim}")
    >>> b.run()

    Read a FASTA file and its index (file extensions matter!):

    >>> fasta = b.read_input_group(**{'fasta': 'data/example.fasta',
    ...                               'fasta.idx': 'data/example.fasta.idx'})

    Create a resource group where the identifiers don't match the file extensions:

    >>> rg = b.read_input_group(foo='data/foo.txt',
    ...                         bar='data/bar.txt')

    `rg.foo` and `rg.bar` will not have the `.txt` file extension and
    instead will be `{root}.foo` and `{root}.bar` where `{root}` is a random
    identifier.

    Notes
    -----
    The identifier is used to refer to a specific resource file. For example,
    given the resource group `rg`, you can use the attribute notation
    `rg.identifier` or the get item notation `rg[identifier]`.

    The file extensions for each file are derived from the identifier. This
    is equivalent to `"{root}.identifier"` from
    :meth:`.Job.declare_resource_group`. We are planning on adding flexibility
    to incorporate more complicated extensions in the future such as `.vcf.bgz`.
    For now, use :meth:`.ResourceFile.add_extension` to add an extension to a
    resource file.

    Parameters
    ----------
    kwargs: :obj:`dict` of :obj:`str` to :obj:`str`
        Keyword arguments where the name/key is the identifier and the value
        is the file path.

    Returns
    -------
    :class:`.ResourceGroup`
    """
    root = secret_alnum_string(5)
    new_resources = {
        name: self._new_input_resource_file(file, value=f'{root}/{os.path.basename(file)}')
        for name, file in kwargs.items()
    }
    rg = ResourceGroup(None, root, **new_resources)
    self._resource_map.update({rg._uid: rg})
    return rg