Example #1
 def __init__(self,
              *,
              name: Optional[str] = None,
              backend: Optional[ServiceBackend] = None,
              image: Optional[str] = None,
              cpus_per_job: Optional[Union[int, str]] = None,
              wait_on_exit: bool = True,
              cleanup_bucket: bool = True,
              project: Optional[str] = None):
     self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
     self.backend = backend or ServiceBackend()
     if not isinstance(self.backend, ServiceBackend):
         raise ValueError(
             f'BatchPoolExecutor is not compatible with {type(backend)}')
     self.batches: List[Batch] = []
     self.directory = self.backend.remote_tmpdir + f'batch-pool-executor/{self.name}/'
     self.inputs = self.directory + 'inputs/'
     self.outputs = self.directory + 'outputs/'
     self.fs = RouterAsyncFS('file', gcs_kwargs={'project': project})
     self.futures: List[BatchPoolFuture] = []
     self.finished_future_count = 0
     self._shutdown = False
     version = sys.version_info
     if image is None:
         if version.major != 3 or version.minor not in (6, 7, 8):
             raise ValueError(
                 f'You must specify an image if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})'
             )
         self.image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'
     else:
         self.image = image
     self.cpus_per_job = cpus_per_job
     self.cleanup_bucket = cleanup_bucket
     self.wait_on_exit = wait_on_exit
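Every snippet on this page uses secret_alnum_string from hailtop.utils, which returns a random alphanumeric token (the optional argument is the desired length). A minimal sketch of how the constructor above turns that token into a unique name and scratch layout; the remote temporary directory here is made up:

from hailtop.utils import secret_alnum_string

# Fallback name when `name` is None: a short random suffix keeps concurrent executors distinct.
name = 'BatchPoolExecutor-' + secret_alnum_string(4)   # e.g. 'BatchPoolExecutor-aB3k'

# Stand-in for backend.remote_tmpdir; the executor appends its own layout to it.
remote_tmpdir = 'gs://example-bucket/tmp/'
directory = remote_tmpdir + f'batch-pool-executor/{name}/'
inputs = directory + 'inputs/'
outputs = directory + 'outputs/'
print(directory)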
Example #2
def init_service(billing_project: str = None,
                 bucket: str = None,
                 log=None,
                 quiet=False,
                 append=False,
                 tmpdir=None,
                 local_tmpdir=None,
                 default_reference='GRCh37',
                 global_seed=6348563392232659379,
                 skip_logging_configuration=False,
                 *,
                 disable_progress_bar=True):
    from hail.backend.service_backend import ServiceBackend
    backend = ServiceBackend(
        billing_project,
        bucket,
        skip_logging_configuration=skip_logging_configuration,
        disable_progress_bar=disable_progress_bar)

    log = _get_log(log)
    if tmpdir is None:
        tmpdir = 'gs://' + backend.bucket + '/tmp/hail/' + secret_alnum_string()
    assert tmpdir.startswith('gs://')
    local_tmpdir = _get_local_tmpdir(local_tmpdir)

    HailContext(log, quiet, append, tmpdir, local_tmpdir, default_reference,
                global_seed, backend)
Example #3
async def gs_filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file',
                               filesystems=[
                                   LocalAsyncFS(thread_pool),
                                   GoogleStorageAsyncFS()
                               ])
        else:
            assert request.param.endswith('gs')
            fs = GoogleStorageAsyncFS()
        async with fs:
            test_storage_uri = os.environ['HAIL_TEST_STORAGE_URI']
            protocol = 'gs://'
            assert test_storage_uri[:len(protocol)] == protocol
            base = f'{test_storage_uri}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example #4
async def filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        if request.param.startswith('router/'):
            fs = RouterAsyncFS(
                'file', [LocalAsyncFS(thread_pool),
                         GoogleStorageAsyncFS()])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        else:
            fs = GoogleStorageAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            else:
                assert request.param.endswith('gs')
                bucket = os.environ['HAIL_TEST_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example #5
 def __init__(self,
              *,
              name: Optional[str] = None,
              backend: Optional[ServiceBackend] = None,
              image: Optional[str] = None,
              cpus_per_job: Optional[Union[int, str]] = None,
              wait_on_exit: bool = True,
              cleanup_bucket: bool = True,
              project: Optional[str] = None):
     self.name = name or "BatchPoolExecutor-" + secret_alnum_string(4)
     self.backend = backend or ServiceBackend()
     if not isinstance(self.backend, ServiceBackend):
         raise ValueError(
             f'BatchPoolExecutor is not compatible with {type(backend)}')
     self.batches: List[Batch] = []
     bucket: str = self.backend._bucket_name
     self.directory = f'gs://{bucket}/batch-pool-executor/{self.name}/'
     self.inputs = self.directory + 'inputs/'
     self.outputs = self.directory + 'outputs/'
     self.gcs = GCS(blocking_pool=concurrent.futures.ThreadPoolExecutor(),
                    project=project)
     self.futures: List[BatchPoolFuture] = []
     self.finished_future_count = 0
     self._shutdown = False
     version = sys.version_info
     self.image = image or f'hailgenetics/python-dill:{version.major}.{version.minor}'
     self.cpus_per_job = cpus_per_job or 1
     self.cleanup_bucket = cleanup_bucket
     self.wait_on_exit = wait_on_exit
Example #6
async def init_service(billing_project: Optional[str] = None,
                       remote_tmpdir: Optional[str] = None,
                       log=None,
                       quiet=False,
                       append=False,
                       tmpdir=None,
                       local_tmpdir=None,
                       default_reference='GRCh37',
                       global_seed=6348563392232659379,
                       skip_logging_configuration=False,
                       *,
                       disable_progress_bar=True):
    from hail.backend.service_backend import ServiceBackend
    backend = await ServiceBackend.create(
        billing_project=billing_project,
        remote_tmpdir=remote_tmpdir,
        skip_logging_configuration=skip_logging_configuration,
        disable_progress_bar=disable_progress_bar)

    log = _get_log(log)
    if tmpdir is None:
        tmpdir = backend.remote_tmpdir + 'tmp/hail/' + secret_alnum_string()
    local_tmpdir = _get_local_tmpdir(local_tmpdir)

    await HailContext.async_create(log, quiet, append, tmpdir, local_tmpdir,
                                   default_reference, global_seed, backend)
Example #7
def TemporaryFilename(*,
                      prefix: str = '',
                      suffix: str = '',
                      dir: Optional[str] = None) -> _TemporaryFilenameManager:
    """A context manager which produces a temporary filename that is deleted when the context manager exits.

    Warning
    -------

    The filename is generated randomly and is extraordinarily unlikely to already exist, but this
    function does not satisfy the strict requirements of Python's :class:`tempfile.NamedTemporaryFile`.

    Examples
    --------

    >>> with TemporaryFilename() as f:  # doctest: +SKIP
    ...     open(f, 'w').write('hello hail')
    ...     print(open(f).read())
    hello hail

    Returns
    -------
    :class:`._TemporaryFilenameManager`

    """
    if dir is None:
        dir = tmp_dir()
    if not dir.endswith('/'):
        dir = dir + '/'
    return _TemporaryFilenameManager(
        current_backend().fs, dir + prefix + secret_alnum_string(10) + suffix)
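The doctest above only exercises the defaults. A hedged sketch of the prefix, suffix, and dir parameters, assuming Hail has been initialized so current_backend().fs is available; the bucket path is made up:

with TemporaryFilename(prefix='reads-', suffix='.bam',
                       dir='gs://example-bucket/scratch') as path:
    # path has the shape gs://example-bucket/scratch/reads-<10 random characters>.bam
    ...
# on exit the manager deletes the file, if one was ever written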
Example #8
async def create_copy_paste_token(db, session_id, max_age_secs=300):
    copy_paste_token = secret_alnum_string()
    await db.just_execute(
        "INSERT INTO copy_paste_tokens (id, session_id, max_age_secs) VALUES(%s, %s, %s);",
        (copy_paste_token, session_id, max_age_secs),
    )
    return copy_paste_token
Example #9
 def _new_job_resource_file(self, source, value=None):
     if value is None:
         value = secret_alnum_string(5)
     jrf = JobResourceFile(value)
     jrf._add_source(source)
     self._resource_map[jrf._uid] = jrf  # pylint: disable=no-member
     return jrf
Example #10
    def __init__(self, file_store, batch_id):
        self.file_store = file_store
        self.batch_id = batch_id
        self.token = secret_alnum_string(16)

        self._data_bytes = bytearray()
        self._offsets_bytes = bytearray()
        self._n_elements = 0
Example #11
File: atgu.py Project: saponas/hail
async def post_create_resource(request, userdata):  # pylint: disable=unused-argument
    db = request.app['db']
    storage_client = request.app['storage_client']

    checked_csrf = False
    attachments = {}
    post = {}
    reader = aiohttp.MultipartReader(request.headers, request.content)
    while True:
        part = await reader.next()  # pylint: disable=not-callable
        if not part:
            break
        if part.name == '_csrf':
            # check csrf token
            # form fields are delivered in order; the _csrf hidden field should appear first
            # https://stackoverflow.com/questions/7449861/multipart-upload-form-is-order-guaranteed
            token1 = request.cookies.get('_csrf')
            token2 = await part.text()
            if token1 is None or token2 is None or token1 != token2:
                log.info('request made with invalid csrf tokens')
                raise web.HTTPUnauthorized()
            checked_csrf = True
        elif part.name == 'file':
            if not checked_csrf:
                raise web.HTTPUnauthorized()
            filename = part.filename
            if not filename:
                continue
            attachment_id = secret_alnum_string()
            async with await storage_client.insert_object(
                    BUCKET, f'atgu/attachments/{attachment_id}') as f:
                while True:
                    chunk = await part.read_chunk()
                    if not chunk:
                        break
                    await f.write(chunk)
            attachments[attachment_id] = filename
        else:
            post[part.name] = await part.text()

    if not checked_csrf:
        raise web.HTTPUnauthorized()

    now = time_msecs()
    id = await db.execute_insertone(
        '''
INSERT INTO `atgu_resources` (`time_created`, `title`, `description`, `contents`, `tags`, `attachments`, `time_updated`)
VALUES (%s, %s, %s, %s, %s, %s, %s);
''',
        (now, post['title'], post['description'], post['contents'],
         post['tags'], json.dumps(attachments), now),
    )

    return web.HTTPFound(deploy_config.external_url('atgu',
                                                    f'/resources/{id}'))
Example #12
    async def _async_execute_untimed(self, ir):
        token = secret_alnum_string()
        with TemporaryDirectory(ensure_exists=False) as dir:
            async def create_inputs():
                with self.fs.open(dir + '/in', 'wb') as infile:
                    write_int(infile, ServiceBackend.EXECUTE)
                    write_str(infile, tmp_dir())
                    write_str(infile, self.billing_project)
                    write_str(infile, self.bucket)
                    write_str(infile, self.render(ir))
                    write_str(infile, token)

            async def create_batch():
                batch_attributes = self.batch_attributes
                if 'name' not in batch_attributes:
                    batch_attributes = {**batch_attributes, 'name': 'execute(...)'}
                bb = self.async_bc.create_batch(token=token, attributes=batch_attributes)

                j = bb.create_jvm_job([
                    'is.hail.backend.service.ServiceBackendSocketAPI2',
                    os.environ['HAIL_SHA'],
                    os.environ['HAIL_JAR_URL'],
                    batch_attributes['name'],
                    dir + '/in',
                    dir + '/out',
                ], mount_tokens=True)
                return (j, await bb.submit(disable_progress_bar=self.disable_progress_bar))

            _, (j, b) = await asyncio.gather(create_inputs(), create_batch())

            status = await b.wait(disable_progress_bar=self.disable_progress_bar)
            if status['n_succeeded'] != 1:
                raise ValueError(f'batch failed {status} {await j.log()}')


            with self.fs.open(dir + '/out', 'rb') as outfile:
                success = read_bool(outfile)
                if success:
                    s = read_str(outfile)
                    try:
                        resp = json.loads(s)
                    except json.decoder.JSONDecodeError as err:
                        raise ValueError(f'could not decode {s}') from err
                else:
                    jstacktrace = read_str(outfile)
                    raise FatalError(jstacktrace)

        typ = dtype(resp['type'])
        if typ == tvoid:
            x = None
        else:
            x = typ._convert_from_json_na(resp['value'])

        return x
Example #13
async def local_filesystem(request):
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        async with LocalAsyncFS(thread_pool) as fs:
            base = f'/tmp/{token}/'
            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example #14
    def setUpClass(cls):
        cls.remote_tmpdir = os.environ['HAIL_TEST_STORAGE_URI']
        if cls.remote_tmpdir[-1] == '/':
            cls.remote_tmpdir = cls.remote_tmpdir[:-1]

        local_tmpdir = _get_local_tmpdir(None)
        local_tmpdir = local_tmpdir[len('file://'):]
        cls.local_dir = os.path.join(local_tmpdir, secret_alnum_string(5))

        os.makedirs(cls.local_dir)

        with open(os.path.join(cls.local_dir, 'randomBytes'), 'wb') as f:
            f.write(secrets.token_bytes(2048))
Example #15
    def _new_resource_group(self, source, mappings, root=None):
        assert isinstance(mappings, dict)
        if root is None:
            root = secret_alnum_string(5)
        d = {}
        new_resource_map = {}
        for name, code in mappings.items():
            if not isinstance(code, str):
                raise BatchException(f"value for name '{name}' is not a string. Found '{type(code)}' instead.")
            r = self._new_job_resource_file(source=source, value=eval(f'f"""{code}"""'))  # pylint: disable=W0123
            d[name] = r
            new_resource_map[r._uid] = r  # pylint: disable=no-member

        self._resource_map.update(new_resource_map)
        rg = _resource.ResourceGroup(source, root, **d)
        self._resource_map.update({rg._uid: rg})
        return rg
Example #16
    def __init__(self, sema: asyncio.Semaphore, fs: 'GoogleStorageAsyncFS', dest_url: str, num_parts: int):
        self._sema = sema
        self._fs = fs
        self._dest_url = dest_url
        self._num_parts = num_parts
        bucket, dest_name = fs._get_bucket_name(dest_url)
        self._bucket = bucket
        self._dest_name = dest_name

        # compute dest_dirname so gs://{bucket}/{dest_dirname}file
        # refers to a file in dest_dirname with no double slashes
        dest_dirname = os.path.dirname(dest_name)
        if dest_dirname:
            dest_dirname = dest_dirname + '/'
        self._dest_dirname = dest_dirname

        self._token = secret_alnum_string()
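The dest_dirname comment above encodes a no-double-slash convention for part names. A small standalone sketch of just that computation, with invented bucket and object names:

import os

for dest_name in ('a/b/combined', 'combined'):
    dest_dirname = os.path.dirname(dest_name)
    if dest_dirname:
        dest_dirname = dest_dirname + '/'
    # gs://{bucket}/{dest_dirname}part-0 never contains a double slash after the bucket
    print(f'gs://example-bucket/{dest_dirname}part-0')
# prints gs://example-bucket/a/b/part-0, then gs://example-bucket/part-0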
Example #17
 async def async_request(self, endpoint, **data):
     data['token'] = secret_alnum_string()
     session = await self.session()
     async with session.ws_connect(f'{self.url}/api/v1alpha/{endpoint}') as socket:
         await socket.send_str(json.dumps(data))
         response = await socket.receive()
         await socket.send_str('bye')
         if response.type == aiohttp.WSMsgType.ERROR:
             raise ValueError(f'bad response: {endpoint}; {data}; {response}')
         if response.type in (aiohttp.WSMsgType.CLOSE,
                              aiohttp.WSMsgType.CLOSED):
             warnings.warn(f'retrying after losing connection {endpoint}; {data}; {response}')
             raise TransientError()
         assert response.type == aiohttp.WSMsgType.TEXT
         result = json.loads(response.data)
         if result['status'] != 200:
             raise FatalError(f'Error from server: {result["value"]}')
         return result['value']
Example #18
async def test_compose():
    bucket = os.environ['HAIL_TEST_BUCKET']
    token = secret_alnum_string()

    part_data = [b'a', b'bb', b'ccc']

    async with StorageClient() as client:
        for i, b in enumerate(part_data):
            async with await client.insert_object(bucket, f'{token}/{i}') as f:
                await f.write(b)
        await client.compose(bucket,
                             [f'{token}/{i}' for i in range(len(part_data))],
                             f'{token}/combined')

        expected = b''.join(part_data)
        async with await client.get_object(bucket, f'{token}/combined') as f:
            actual = await f.read()
        assert actual == expected
Example #19
async def filesystem(
        request) -> AsyncIterator[Tuple[asyncio.Semaphore, AsyncFS, str]]:
    token = secret_alnum_string()

    with ThreadPoolExecutor() as thread_pool:
        fs: AsyncFS
        if request.param.startswith('router/'):
            fs = RouterAsyncFS('file',
                               filesystems=[
                                   LocalAsyncFS(thread_pool),
                                   GoogleStorageAsyncFS(),
                                   S3AsyncFS(thread_pool),
                                   AzureAsyncFS()
                               ])
        elif request.param == 'file':
            fs = LocalAsyncFS(thread_pool)
        elif request.param.endswith('gs'):
            fs = GoogleStorageAsyncFS()
        elif request.param.endswith('s3'):
            fs = S3AsyncFS(thread_pool)
        else:
            assert request.param.endswith('hail-az')
            fs = AzureAsyncFS()
        async with fs:
            if request.param.endswith('file'):
                base = f'/tmp/{token}/'
            elif request.param.endswith('gs'):
                bucket = os.environ['HAIL_TEST_GCS_BUCKET']
                base = f'gs://{bucket}/tmp/{token}/'
            elif request.param.endswith('s3'):
                bucket = os.environ['HAIL_TEST_S3_BUCKET']
                base = f's3://{bucket}/tmp/{token}/'
            else:
                assert request.param.endswith('hail-az')
                account = os.environ['HAIL_TEST_AZURE_ACCOUNT']
                container = os.environ['HAIL_TEST_AZURE_CONTAINER']
                base = f'hail-az://{account}/{container}/tmp/{token}/'

            await fs.mkdir(base)
            sema = asyncio.Semaphore(50)
            async with sema:
                yield (sema, fs, base)
                await fs.rmtree(sema, base)
            assert not await fs.isdir(base)
Example #20
    def load_references_from_dataset(self, path):
        token = secret_alnum_string()
        with TemporaryDirectory(ensure_exists=False) as dir:
            with self.fs.open(dir + '/in', 'wb') as infile:
                write_int(infile, ServiceBackend.LOAD_REFERENCES_FROM_DATASET)
                write_str(infile, tmp_dir())
                write_str(infile, self.billing_project)
                write_str(infile, self.bucket)
                write_str(infile, path)

            batch_attributes = self.batch_attributes
            if 'name' not in batch_attributes:
                batch_attributes = {**batch_attributes, 'name': 'load_references_from_dataset(...)'}
            bb = self.bc.create_batch(token=token, attributes=batch_attributes)

            j = bb.create_jvm_job([
                'is.hail.backend.service.ServiceBackendSocketAPI2',
                os.environ['HAIL_SHA'],
                os.environ['HAIL_JAR_URL'],
                batch_attributes['name'],
                dir + '/in',
                dir + '/out',
            ], mount_tokens=True)
            b = bb.submit(disable_progress_bar=self.disable_progress_bar)
            status = b.wait(disable_progress_bar=self.disable_progress_bar)
            if status['n_succeeded'] != 1:
                raise ValueError(f'batch failed {status} {j.log()}')


            with self.fs.open(dir + '/out', 'rb') as outfile:
                success = read_bool(outfile)
                if success:
                    s = read_str(outfile)
                    try:
                        # FIXME: do we not have to parse the result?
                        return json.loads(s)
                    except json.decoder.JSONDecodeError as err:
                        raise ValueError(f'could not decode {s}') from err
                else:
                    jstacktrace = read_str(outfile)
                    raise FatalError(jstacktrace)
Example #21
def TemporaryDirectory(
        *,
        prefix: str = '',
        suffix: str = '',
        dir: Optional[str] = None,
        ensure_exists: bool = True) -> _TemporaryDirectoryManager:
    """A context manager which produces a temporary directory name that is recursively deleted when the context manager exits.

    If the filesystem has a notion of directories, then we ensure the directory exists.

    Warning
    -------

    The directory name is generated randomly and is extraordinarily unlikely to already exist, but
    this function does not satisfy the strict requirements of Python's :class:`.TemporaryDirectory`.

    Examples
    --------

    >>> with TemporaryDirectory() as dir:  # doctest: +SKIP
    ...     open(f'{dir}/hello', 'w').write('hello hail')
    ...     print(open(f'{dir}/hello').read())
    hello hail

    Returns
    -------
    :class:`._TemporaryDirectoryManager`

    """
    if dir is None:
        dir = tmp_dir()
    if not dir.endswith('/'):
        dir = dir + '/'
    dirname = dir + prefix + secret_alnum_string(10) + suffix
    fs = current_backend().fs
    if ensure_exists:
        fs.mkdir(dirname)
    return _TemporaryDirectoryManager(fs, dirname)
Example #22
    def blockmatrix_type(self, bmir):
        token = secret_alnum_string()
        with TemporaryDirectory(ensure_exists=False) as dir:
            with self.fs.open(dir + '/in', 'wb') as infile:
                write_int(infile, ServiceBackend.BLOCK_MATRIX_TYPE)
                write_str(infile, tmp_dir())
                write_str(infile, self.render(bmir))

            batch_attributes = self.batch_attributes
            if 'name' not in batch_attributes:
                batch_attributes = {**batch_attributes, 'name': 'blockmatrix_type(...)'}
            bb = self.bc.create_batch(token=token, attributes=batch_attributes)

            j = bb.create_jvm_job([
                'is.hail.backend.service.ServiceBackendSocketAPI2',
                os.environ['HAIL_SHA'],
                os.environ['HAIL_JAR_URL'],
                batch_attributes['name'],
                dir + '/in',
                dir + '/out',
            ], mount_tokens=True)
            b = bb.submit(disable_progress_bar=self.disable_progress_bar)
            status = b.wait(disable_progress_bar=self.disable_progress_bar)
            if status['n_succeeded'] != 1:
                raise ValueError(f'batch failed {status} {j.log()}')


            with self.fs.open(dir + '/out', 'rb') as outfile:
                success = read_bool(outfile)
                if success:
                    s = read_str(outfile)
                    try:
                        return tblockmatrix._from_json(json.loads(s))
                    except json.decoder.JSONDecodeError as err:
                        raise ValueError(f'could not decode {s}') from err
                else:
                    jstacktrace = read_str(outfile)
                    raise FatalError(jstacktrace)
Example #23
    async def schedule_loop_body(self):
        if self.app['frozen']:
            log.info(
                f'not scheduling any jobs for {self.pool}; batch is frozen')
            return True

        log.info(f'schedule {self.pool}: starting')
        start = time_msecs()
        n_scheduled = 0

        user_resources = await self.compute_fair_share()

        total = sum(resources['allocated_cores_mcpu']
                    for resources in user_resources.values())
        if not total:
            log.info(f'schedule {self.pool}: no allocated cores')
            should_wait = True
            return should_wait
        user_share = {
            user:
            max(int(300 * resources['allocated_cores_mcpu'] / total + 0.5), 20)
            for user, resources in user_resources.items()
        }

        async def user_runnable_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT batches.id, batches_cancelled.id IS NOT NULL AS cancelled, userdata, user, format_version
FROM batches
LEFT JOIN batches_cancelled
       ON batches.id = batches_cancelled.id
WHERE user = %s AND `state` = 'running';
''',
                (user, ),
                    "user_runnable_jobs__select_running_batches",
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1 AND inst_coll = %s
LIMIT %s;
''',
                    (batch['id'], self.pool.name, remaining.value),
                        "user_runnable_jobs__select_ready_always_run_jobs",
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record
                if not batch['cancelled']:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND inst_coll = %s AND cancelled = 0
LIMIT %s;
''',
                        (batch['id'], self.pool.name, remaining.value),
                            "user_runnable_jobs__select_ready_jobs_batch_not_cancelled",
                    ):
                        record['batch_id'] = batch['id']
                        record['userdata'] = batch['userdata']
                        record['user'] = batch['user']
                        record['format_version'] = batch['format_version']
                        yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        should_wait = True
        for user, resources in user_resources.items():
            allocated_cores_mcpu = resources['allocated_cores_mcpu']
            if allocated_cores_mcpu == 0:
                continue

            scheduled_cores_mcpu = 0
            share = user_share[user]

            remaining = Box(share)
            async for record in user_runnable_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)
                attempt_id = secret_alnum_string(6)
                record['attempt_id'] = attempt_id

                if scheduled_cores_mcpu + record[
                        'cores_mcpu'] > allocated_cores_mcpu:
                    if random.random() > self.exceeded_shares_counter.rate():
                        self.exceeded_shares_counter.push(True)
                        self.scheduler_state_changed.set()
                        break
                    self.exceeded_shares_counter.push(False)

                instance = self.pool.get_instance(user, record['cores_mcpu'])
                if instance:
                    instance.adjust_free_cores_in_memory(-record['cores_mcpu'])
                    scheduled_cores_mcpu += record['cores_mcpu']
                    n_scheduled += 1

                    async def schedule_with_error_handling(
                            app, record, id, instance):
                        try:
                            await schedule_job(app, record, instance)
                        except Exception:
                            log.info(
                                f'scheduling job {id} on {instance} for {self.pool}',
                                exc_info=True)

                    await waitable_pool.call(schedule_with_error_handling,
                                             self.app, record, id, instance)

                remaining.value -= 1
                if remaining.value <= 0:
                    should_wait = False
                    break

        await waitable_pool.wait()

        end = time_msecs()
        log.info(
            f'schedule: attempted to schedule {n_scheduled} jobs in {end - start}ms for {self.pool}'
        )

        return should_wait
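The user_share expression above caps one scheduling pass at roughly 300 jobs, splits that budget proportionally to each user's allocated cores, and floors every share at 20. A worked example with invented allocations:

user_resources = {
    'alice': {'allocated_cores_mcpu': 16000},
    'bob': {'allocated_cores_mcpu': 2000},
}
total = sum(r['allocated_cores_mcpu'] for r in user_resources.values())  # 18000
user_share = {
    user: max(int(300 * r['allocated_cores_mcpu'] / total + 0.5), 20)
    for user, r in user_resources.items()
}
print(user_share)  # {'alice': 267, 'bob': 33}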
Example #24
File: pool.py Project: saponas/hail
    async def schedule_loop_body(self):
        log.info(f'schedule {self.pool}: starting')
        start = time_msecs()
        n_scheduled = 0

        user_resources = await self.compute_fair_share()

        total = sum(resources['allocated_cores_mcpu']
                    for resources in user_resources.values())
        if not total:
            log.info(f'schedule {self.pool}: no allocated cores')
            should_wait = True
            return should_wait
        user_share = {
            user:
            max(int(300 * resources['allocated_cores_mcpu'] / total + 0.5), 20)
            for user, resources in user_resources.items()
        }

        async def user_runnable_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
                (user, ),
                    timer_description=
                    f'in schedule {self.pool}: get {user} running batches',
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 1 AND inst_coll = %s
LIMIT %s;
''',
                    (batch['id'], self.pool.name, remaining.value),
                        timer_description=
                        f'in schedule {self.pool}: get {user} batch {batch["id"]} runnable jobs (1)',
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record
                if not batch['cancelled']:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT job_id, spec, cores_mcpu
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
WHERE batch_id = %s AND state = 'Ready' AND always_run = 0 AND inst_coll = %s AND cancelled = 0
LIMIT %s;
''',
                        (batch['id'], self.pool.name, remaining.value),
                            timer_description=
                            f'in schedule {self.pool}: get {user} batch {batch["id"]} runnable jobs (2)',
                    ):
                        record['batch_id'] = batch['id']
                        record['userdata'] = batch['userdata']
                        record['user'] = batch['user']
                        record['format_version'] = batch['format_version']
                        yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        def get_instance(user, cores_mcpu):
            i = self.pool.healthy_instances_by_free_cores.bisect_key_left(
                cores_mcpu)
            while i < len(self.pool.healthy_instances_by_free_cores):
                instance = self.pool.healthy_instances_by_free_cores[i]
                assert cores_mcpu <= instance.free_cores_mcpu
                return instance
                i += 1
            histogram = collections.defaultdict(int)
            for instance in self.pool.healthy_instances_by_free_cores:
                histogram[instance.free_cores_mcpu] += 1
            log.info(
                f'schedule {self.pool}: no viable instances for {cores_mcpu}: {histogram}'
            )
            return None

        should_wait = True
        for user, resources in user_resources.items():
            allocated_cores_mcpu = resources['allocated_cores_mcpu']
            if allocated_cores_mcpu == 0:
                continue

            scheduled_cores_mcpu = 0
            share = user_share[user]

            log.info(
                f'schedule {self.pool}: user-share: {user}: {allocated_cores_mcpu} {share}'
            )

            remaining = Box(share)
            async for record in user_runnable_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)
                attempt_id = secret_alnum_string(6)
                record['attempt_id'] = attempt_id

                if scheduled_cores_mcpu + record[
                        'cores_mcpu'] > allocated_cores_mcpu:
                    if random.random() > self.exceeded_shares_counter.rate():
                        self.exceeded_shares_counter.push(True)
                        self.scheduler_state_changed.set()
                        break
                    self.exceeded_shares_counter.push(False)

                instance = get_instance(user, record['cores_mcpu'])
                if instance:
                    instance.adjust_free_cores_in_memory(-record['cores_mcpu'])
                    scheduled_cores_mcpu += record['cores_mcpu']
                    n_scheduled += 1
                    should_wait = False

                    async def schedule_with_error_handling(
                            app, record, id, instance):
                        try:
                            await schedule_job(app, record, instance)
                        except Exception:
                            log.info(
                                f'scheduling job {id} on {instance} for {self.pool}',
                                exc_info=True)

                    await waitable_pool.call(schedule_with_error_handling,
                                             self.app, record, id, instance)

                remaining.value -= 1
                if remaining.value <= 0:
                    break

        await waitable_pool.wait()

        end = time_msecs()
        log.info(
            f'schedule: scheduled {n_scheduled} jobs in {end - start}ms for {self.pool}'
        )

        return should_wait
Example #25
 def _new_python_result(self, source, value=None) -> _resource.PythonResult:
     if value is None:
         value = secret_alnum_string(5)
     jrf = _resource.PythonResult(value, source)
     self._resource_map[jrf._uid] = jrf  # pylint: disable=no-member
     return jrf
Example #26
import os
import datetime
from hailtop.utils import secret_alnum_string

HAIL_BENCHMARK_BUCKET_NAME = os.environ['HAIL_BENCHMARK_BUCKET_NAME']

INSTANCE_ID = os.environ.get('INSTANCE_ID')
if INSTANCE_ID is None:
    INSTANCE_ID = secret_alnum_string(12)

BENCHMARK_RESULTS_PATH = f'gs://{HAIL_BENCHMARK_BUCKET_NAME}/benchmark-test/{INSTANCE_ID}'

START_POINT = os.environ.get('START_POINT')
if START_POINT is None:
    now = datetime.datetime.now()
    start_point = now - datetime.timedelta(days=1)
    START_POINT = start_point.strftime("%Y-%m-%dT%H:%M:%SZ")
Example #27
 def push(self, success: bool):
     token = secret_alnum_string(6)
     self._global_counter.push(token, success)
Example #28
    async def create_instances_loop_body(self):
        log.info(f'create_instances for {self}: starting')
        start = time_msecs()
        n_instances_created = 0

        user_resources = await self.compute_fair_share()

        total = sum(resources['n_allocated_jobs']
                    for resources in user_resources.values())
        if not total:
            log.info(f'create_instances {self}: no allocated jobs')
            should_wait = True
            return should_wait
        user_share = {
            user: max(int(300 * resources['n_allocated_jobs'] / total + 0.5),
                      20)
            for user, resources in user_resources.items()
        }

        async def user_runnable_jobs(user, remaining):
            async for batch in self.db.select_and_fetchall(
                    '''
SELECT id, cancelled, userdata, user, format_version
FROM batches
WHERE user = %s AND `state` = 'running';
''',
                (user, ),
                    timer_description=
                    f'in create_instances {self}: get {user} running batches',
            ):
                async for record in self.db.select_and_fetchall(
                        '''
SELECT jobs.job_id, jobs.spec, jobs.cores_mcpu, COALESCE(SUM(instances.state IS NOT NULL AND
  (instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_inst_coll_cancelled)
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 1 AND jobs.inst_coll = %s
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu
HAVING live_attempts = 0
LIMIT %s;
''',
                    (batch['id'], self.name, remaining.value),
                        timer_description=
                        f'in create_instances {self}: get {user} batch {batch["id"]} runnable jobs (1)',
                ):
                    record['batch_id'] = batch['id']
                    record['userdata'] = batch['userdata']
                    record['user'] = batch['user']
                    record['format_version'] = batch['format_version']
                    yield record
                if not batch['cancelled']:
                    async for record in self.db.select_and_fetchall(
                            '''
SELECT jobs.job_id, jobs.spec, jobs.cores_mcpu, COALESCE(SUM(instances.state IS NOT NULL AND
  (instances.state = 'pending' OR instances.state = 'active')), 0) as live_attempts
FROM jobs FORCE INDEX(jobs_batch_id_state_always_run_cancelled)
LEFT JOIN attempts ON jobs.batch_id = attempts.batch_id AND jobs.job_id = attempts.job_id
LEFT JOIN instances ON attempts.instance_name = instances.name
WHERE jobs.batch_id = %s AND jobs.state = 'Ready' AND always_run = 0 AND jobs.inst_coll = %s AND cancelled = 0
GROUP BY jobs.job_id, jobs.spec, jobs.cores_mcpu
HAVING live_attempts = 0
LIMIT %s;
''',
                        (batch['id'], self.name, remaining.value),
                            timer_description=
                            f'in create_instances {self}: get {user} batch {batch["id"]} runnable jobs (2)',
                    ):
                        record['batch_id'] = batch['id']
                        record['userdata'] = batch['userdata']
                        record['user'] = batch['user']
                        record['format_version'] = batch['format_version']
                        yield record

        waitable_pool = WaitableSharedPool(self.async_worker_pool)

        should_wait = True
        for user, resources in user_resources.items():
            n_allocated_instances = resources['n_allocated_jobs']
            if n_allocated_instances == 0:
                continue

            n_user_instances_created = 0

            share = user_share[user]

            log.info(f'create_instances {self}: user-share: {user}: {share}')

            remaining = Box(share)
            async for record in user_runnable_jobs(user, remaining):
                batch_id = record['batch_id']
                job_id = record['job_id']
                id = (batch_id, job_id)
                attempt_id = secret_alnum_string(6)
                record['attempt_id'] = attempt_id

                if n_user_instances_created >= n_allocated_instances:
                    if random.random() > self.exceeded_shares_counter.rate():
                        self.exceeded_shares_counter.push(True)
                        self.scheduler_state_changed.set()
                        break
                    self.exceeded_shares_counter.push(False)

                n_instances_created += 1
                n_user_instances_created += 1
                should_wait = False

                log.info(f'creating job private instance for job {id}')

                async def create_instance_with_error_handling(
                        batch_id, job_id, attempt_id, record, id):
                    try:
                        batch_format_version = BatchFormatVersion(
                            record['format_version'])
                        spec = json.loads(record['spec'])
                        machine_spec = batch_format_version.get_spec_machine_spec(
                            spec)
                        instance, resources = await self.create_instance(
                            batch_id, job_id, machine_spec)
                        await mark_job_creating(self.app, batch_id, job_id,
                                                attempt_id, instance,
                                                time_msecs(), resources)
                    except Exception:
                        log.info(f'creating job private instance for job {id}',
                                 exc_info=True)

                await waitable_pool.call(create_instance_with_error_handling,
                                         batch_id, job_id, attempt_id, record,
                                         id)

                remaining.value -= 1
                if remaining.value <= 0:
                    break

        await waitable_pool.wait()

        end = time_msecs()
        log.info(
            f'create_instances: created instances for {n_instances_created} jobs in {end - start}ms for {self}'
        )

        await asyncio.sleep(
            15)  # ensure we don't create more instances than GCE limit

        return should_wait
Example #29
File: atgu.py Project: saponas/hail
async def post_edit_resource(request, userdata):  # pylint: disable=unused-argument
    db = request.app['db']
    storage_client = request.app['storage_client']
    id = int(request.match_info['id'])
    old_record = await db.select_and_fetchone(
        '''
SELECT attachments FROM atgu_resources
WHERE id = %s;
''',
        (id),
    )
    if not old_record:
        raise web.HTTPNotFound()

    old_attachments = json.loads(old_record['attachments'])

    checked_csrf = False
    attachments = {}
    post = {}
    reader = aiohttp.MultipartReader(request.headers, request.content)
    while True:
        part = await reader.next()  # pylint: disable=not-callable
        if not part:
            break
        if part.name == '_csrf':
            # check csrf token
            token1 = request.cookies.get('_csrf')
            token2 = await part.text()
            if token1 is None or token2 is None or token1 != token2:
                log.info('request made with invalid csrf tokens')
                raise web.HTTPUnauthorized()
            checked_csrf = True
        elif part.name == 'attachment':
            if not checked_csrf:
                raise web.HTTPUnauthorized()
            attachment_id = await part.text()
            assert attachment_id in old_attachments
            attachments[attachment_id] = old_attachments[attachment_id]
        elif part.name == 'file':
            filename = part.filename
            if not filename:
                continue
            attachment_id = secret_alnum_string()
            async with await storage_client.insert_object(
                    BUCKET, f'atgu/attachments/{attachment_id}') as f:
                while True:
                    chunk = await part.read_chunk()
                    if not chunk:
                        break
                    await f.write(chunk)
            attachments[attachment_id] = filename
        else:
            post[part.name] = await part.text()

    if not checked_csrf:
        raise web.HTTPUnauthorized()

    now = time_msecs()
    await db.execute_update(
        '''
UPDATE atgu_resources SET
title = %s,
description = %s,
contents = %s,
tags = %s,
attachments = %s,
time_updated = %s
WHERE id = %s
''',
        (post['title'], post['description'], post['contents'], post['tags'],
         json.dumps(attachments), now, id),
    )

    return web.HTTPFound(deploy_config.external_url('atgu',
                                                    f'/resources/{id}'))
Example #30
    def read_input_group(self, **kwargs):
        """
        Create a new resource group representing a mapping of identifier to
        input resource files.

        Examples
        --------

        Read a binary PLINK file:

        >>> b = Batch()
        >>> bfile = b.read_input_group(bed="data/example.bed",
        ...                            bim="data/example.bim",
        ...                            fam="data/example.fam")
        >>> j = b.new_job()
        >>> j.command(f"plink --bfile {bfile} --geno --make-bed --out {j.geno}")
        >>> j.command(f"wc -l {bfile.fam}")
        >>> j.command(f"wc -l {bfile.bim}")
        >>> b.run()

        Read a FASTA file and its index (file extensions matter!):

        >>> fasta = b.read_input_group(**{'fasta': 'data/example.fasta',
        ...                               'fasta.idx': 'data/example.fasta.idx'})

        Create a resource group where the identifiers don't match the file extensions:

        >>> rg = b.read_input_group(foo='data/foo.txt',
        ...                         bar='data/bar.txt')

        `rg.foo` and `rg.bar` will not have the `.txt` file extension and
        instead will be `{root}.foo` and `{root}.bar` where `{root}` is a random
        identifier.

        Notes
        -----
        The identifier is used to refer to a specific resource file. For example,
        given the resource group `rg`, you can use the attribute notation
        `rg.identifier` or the get item notation `rg[identifier]`.

        The file extensions for each file are derived from the identifier.
        This is equivalent to `"{root}.identifier"` from
        :meth:`.Job.declare_resource_group`. We are planning on adding flexibility
        to incorporate more complicated extensions in the future such as `.vcf.bgz`.
        For now, use :meth:`.ResourceFile.add_extension` to add an extension to a
        resource file.

        Parameters
        ----------
        kwargs: :obj:`dict` of :obj:`str` to :obj:`str`
            Keyword arguments where the name/key is the identifier and the value
            is the file path.

        Returns
        -------
        :class:`.ResourceGroup`
        """

        root = secret_alnum_string(5)
        new_resources = {
            name: self._new_input_resource_file(
                file, value=f'{root}/{os.path.basename(file)}')
            for name, file in kwargs.items()
        }
        rg = ResourceGroup(None, root, **new_resources)
        self._resource_map.update({rg._uid: rg})
        return rg
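A short sketch of the two access notations described in the Notes above, reusing the docstring's example paths (b.run() is omitted here):

b = Batch()
rg = b.read_input_group(bed='data/example.bed',
                        bim='data/example.bim',
                        fam='data/example.fam')
j = b.new_job()
j.command(f'wc -l {rg.fam}')      # attribute notation
j.command(f'wc -l {rg["bim"]}')   # item notation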