import concurrent.futures
import os
import unittest
import uuid

# GCS, BlockingMemoryClient, get_user_config and async_to_blocking are
# provided by the surrounding package; their imports are not shown here.


class Tests(unittest.TestCase):
    def setUp(self):
        bucket_name = get_user_config().get('batch', 'bucket')
        token = uuid.uuid4()
        self.test_path = f'gs://{bucket_name}/memory-tests/{token}'
        self.fs = GCS(concurrent.futures.ThreadPoolExecutor(), project=os.environ['PROJECT'])
        self.client = BlockingMemoryClient(fs=self.fs)
        self.temp_files = set()

    def tearDown(self):
        async_to_blocking(self.fs.delete_gs_files(self.test_path))
        self.client.close()

    def add_temp_file_from_string(self, name: str, str_value: bytes):
        handle = f'{self.test_path}/{name}'
        self.fs._write_gs_file_from_string(handle, str_value)
        return handle

    def test_non_existent(self):
        for _ in range(3):
            self.assertIsNone(self.client._get_file_if_exists(f'{self.test_path}/nonexistent'))

    def test_small_write_around(self):
        cases = [('empty_file', b''), ('null', b'\0'), ('small', b'hello world')]
        for file, data in cases:
            # write directly to GCS (around the memory service), then poll
            # the memory client until the cache is populated
            handle = self.add_temp_file_from_string(file, data)
            expected = self.fs._read_binary_gs_file(handle)
            self.assertEqual(expected, data)
            i = 0
            cached = self.client._get_file_if_exists(handle)
            while cached is None and i < 10:
                cached = self.client._get_file_if_exists(handle)
                i += 1
            self.assertEqual(cached, expected)

    def test_small_write_through(self):
        cases = [('empty_file2', b''), ('null2', b'\0'), ('small2', b'hello world')]
        for file, data in cases:
            # write through the memory client, then read back from its cache
            filename = f'{self.test_path}/{file}'
            self.client.write_file(filename, data)
            cached = self.client._get_file_if_exists(filename)
            self.assertEqual(cached, data)
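# A minimal runner sketch for the tests above (an addition, not part of the
# original module). It assumes PROJECT is set in the environment, that the
# user config has a 'batch'/'bucket' entry naming a GCS bucket writable by
# the active credentials, and that a memory service is reachable.

if __name__ == '__main__':
    unittest.main()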
import base64
import json

# GCS, retry_transient_errors and DEFAULT_NAMESPACE are provided by the
# surrounding package.


async def get_or_add_user(app, userdata):
    # Lazily build and cache a per-user GCS client, keyed by username, from
    # the user's GSA key stored in a Kubernetes secret.
    users = app['users']
    username = userdata['username']
    if username not in users:
        k8s_client = app['k8s_client']
        gsa_key_secret = await retry_transient_errors(
            k8s_client.read_namespaced_secret,
            userdata['gsa_key_secret_name'],
            DEFAULT_NAMESPACE,
            _request_timeout=5.0)
        gsa_key = base64.b64decode(gsa_key_secret.data['key.json']).decode()
        users[username] = {'fs': GCS(blocking_pool=app['thread_pool'], key=json.loads(gsa_key))}
    return users[username]
def __init__(self, gcs_project=None, fs=None, deploy_config=None,
             session=None, headers=None, _token=None):
    if not deploy_config:
        self._deploy_config = get_deploy_config()
    else:
        self._deploy_config = deploy_config
    self.url = self._deploy_config.base_url('memory')
    self._session = session
    if fs is None:
        fs = GCS(blocking_pool=concurrent.futures.ThreadPoolExecutor(), project=gcs_project)
    self._fs = fs
    self._headers = {}
    if headers:
        self._headers.update(headers)
    if _token:
        self._headers['Authorization'] = f'Bearer {_token}'
import asyncio

# GCS and SpecWriter are provided by the surrounding package.


class LogStore:
    def __init__(self, batch_logs_bucket_name, worker_logs_bucket_name, instance_id,
                 blocking_pool, *, project=None, credentials=None):
        self.batch_logs_bucket_name = batch_logs_bucket_name
        self.worker_logs_bucket_name = worker_logs_bucket_name
        self.instance_id = instance_id
        self.worker_logs_root = f'gs://{worker_logs_bucket_name}/batch/logs/{instance_id}/worker'
        self.batch_logs_root = f'gs://{batch_logs_bucket_name}/batch/logs/{instance_id}/batch'
        self.gcs = GCS(blocking_pool, project=project, credentials=credentials)

    def worker_log_path(self, machine_name, log_file):
        # this has to match worker startup-script
        return f'{self.worker_logs_root}/{machine_name}/{log_file}'

    def batch_log_dir(self, batch_id):
        return f'{self.batch_logs_root}/{batch_id}'

    def log_path(self, format_version, batch_id, job_id, attempt_id, task):
        if not format_version.has_attempt_in_log_path():
            return f'{self.batch_log_dir(batch_id)}/{job_id}/{task}/log'
        return f'{self.batch_log_dir(batch_id)}/{job_id}/{attempt_id}/{task}/log'

    async def read_log_file(self, format_version, batch_id, job_id, attempt_id, task):
        path = self.log_path(format_version, batch_id, job_id, attempt_id, task)
        return await self.gcs.read_gs_file(path)

    async def write_log_file(self, format_version, batch_id, job_id, attempt_id, task, data):
        path = self.log_path(format_version, batch_id, job_id, attempt_id, task)
        return await self.gcs.write_gs_file_from_string(path, data)

    async def delete_batch_logs(self, batch_id):
        await self.gcs.delete_gs_files(self.batch_log_dir(batch_id))

    def status_path(self, batch_id, job_id, attempt_id):
        return f'{self.batch_log_dir(batch_id)}/{job_id}/{attempt_id}/status.json'

    async def read_status_file(self, batch_id, job_id, attempt_id):
        path = self.status_path(batch_id, job_id, attempt_id)
        return await self.gcs.read_gs_file(path)

    async def write_status_file(self, batch_id, job_id, attempt_id, status):
        path = self.status_path(batch_id, job_id, attempt_id)
        return await self.gcs.write_gs_file_from_string(path, status)

    async def delete_status_file(self, batch_id, job_id, attempt_id):
        path = self.status_path(batch_id, job_id, attempt_id)
        return await self.gcs.delete_gs_file(path)

    def specs_dir(self, batch_id, token):
        return f'{self.batch_logs_root}/{batch_id}/bunch/{token}'

    def specs_path(self, batch_id, token):
        return f'{self.specs_dir(batch_id, token)}/specs'

    def specs_index_path(self, batch_id, token):
        return f'{self.specs_dir(batch_id, token)}/specs.idx'

    async def read_spec_file(self, batch_id, token, start_job_id, job_id):
        idx_path = self.specs_index_path(batch_id, token)
        idx_start, idx_end = SpecWriter.get_index_file_offsets(job_id, start_job_id)
        offsets = await self.gcs.read_binary_gs_file(idx_path, start=idx_start, end=idx_end)
        spec_path = self.specs_path(batch_id, token)
        spec_start, spec_end = SpecWriter.get_spec_file_offsets(offsets)
        return await self.gcs.read_gs_file(spec_path, start=spec_start, end=spec_end)

    async def write_spec_file(self, batch_id, token, data_bytes, offsets_bytes):
        idx_path = self.specs_index_path(batch_id, token)
        write1 = self.gcs.write_gs_file_from_string(
            idx_path, offsets_bytes, content_type='application/octet-stream')
        specs_path = self.specs_path(batch_id, token)
        write2 = self.gcs.write_gs_file_from_string(specs_path, data_bytes)
        await asyncio.gather(write1, write2)

    async def delete_spec_file(self, batch_id, token):
        await self.gcs.delete_gs_files(self.specs_dir(batch_id, token))
import asyncio
import logging

# GCS, SpecWriter, BatchFormatVersion and BATCH_FORMAT_VERSION are provided
# by the surrounding package.

log = logging.getLogger('log_store')  # logger name is illustrative


class LogStore:
    def __init__(self, batch_logs_bucket_name, instance_id, blocking_pool, *,
                 project=None, credentials=None):
        self.batch_logs_bucket_name = batch_logs_bucket_name
        self.instance_id = instance_id
        self.batch_logs_root = f'gs://{batch_logs_bucket_name}/batch/logs/{instance_id}/batch'
        self.gcs = GCS(blocking_pool, project=project, credentials=credentials)
        log.info(f'BATCH_LOGS_ROOT {self.batch_logs_root}')
        format_version = BatchFormatVersion(BATCH_FORMAT_VERSION)
        log.info(
            f'EXAMPLE BATCH_JOB_LOGS_PATH {self.log_path(format_version, 1, 1, "abc123", "main")}'
        )

    def batch_log_dir(self, batch_id):
        return f'{self.batch_logs_root}/{batch_id}'

    def log_path(self, format_version, batch_id, job_id, attempt_id, task):
        if not format_version.has_attempt_in_log_path():
            return f'{self.batch_log_dir(batch_id)}/{job_id}/{task}/log'
        return f'{self.batch_log_dir(batch_id)}/{job_id}/{attempt_id}/{task}/log'

    async def read_log_file(self, format_version, batch_id, job_id, attempt_id, task):
        path = self.log_path(format_version, batch_id, job_id, attempt_id, task)
        return await self.gcs.read_gs_file(path)

    async def write_log_file(self, format_version, batch_id, job_id, attempt_id, task, data):
        path = self.log_path(format_version, batch_id, job_id, attempt_id, task)
        return await self.gcs.write_gs_file_from_string(path, data)

    async def delete_batch_logs(self, batch_id):
        await self.gcs.delete_gs_files(self.batch_log_dir(batch_id))

    def status_path(self, batch_id, job_id, attempt_id):
        return f'{self.batch_log_dir(batch_id)}/{job_id}/{attempt_id}/status.json'

    async def read_status_file(self, batch_id, job_id, attempt_id):
        path = self.status_path(batch_id, job_id, attempt_id)
        return await self.gcs.read_gs_file(path)

    async def write_status_file(self, batch_id, job_id, attempt_id, status):
        path = self.status_path(batch_id, job_id, attempt_id)
        return await self.gcs.write_gs_file_from_string(path, status)

    async def delete_status_file(self, batch_id, job_id, attempt_id):
        path = self.status_path(batch_id, job_id, attempt_id)
        return await self.gcs.delete_gs_file(path)

    def specs_dir(self, batch_id, token):
        return f'{self.batch_logs_root}/{batch_id}/bunch/{token}'

    def specs_path(self, batch_id, token):
        return f'{self.specs_dir(batch_id, token)}/specs'

    def specs_index_path(self, batch_id, token):
        return f'{self.specs_dir(batch_id, token)}/specs.idx'

    async def read_spec_file(self, batch_id, token, start_job_id, job_id):
        idx_path = self.specs_index_path(batch_id, token)
        idx_start, idx_end = SpecWriter.get_index_file_offsets(job_id, start_job_id)
        offsets = await self.gcs.read_binary_gs_file(idx_path, start=idx_start, end=idx_end)
        spec_path = self.specs_path(batch_id, token)
        spec_start, spec_end = SpecWriter.get_spec_file_offsets(offsets)
        return await self.gcs.read_gs_file(spec_path, start=spec_start, end=spec_end)

    async def write_spec_file(self, batch_id, token, data_bytes, offsets_bytes):
        idx_path = self.specs_index_path(batch_id, token)
        write1 = self.gcs.write_gs_file_from_string(
            idx_path, offsets_bytes, content_type='application/octet-stream')
        specs_path = self.specs_path(batch_id, token)
        write2 = self.gcs.write_gs_file_from_string(specs_path, data_bytes)
        await asyncio.gather(write1, write2)

    async def delete_spec_file(self, batch_id, token):
        await self.gcs.delete_gs_files(self.specs_dir(batch_id, token))
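# A minimal usage sketch for LogStore, not part of the original module. The
# bucket, instance id and project names below are placeholders, and
# _FormatVersion is a stand-in exposing only the one method log_path
# consults; real callers pass a BatchFormatVersion. Constructing GCS and
# reading files requires live Google Cloud credentials.
import concurrent.futures


class _FormatVersion:
    def has_attempt_in_log_path(self):
        return True


async def _demo():
    pool = concurrent.futures.ThreadPoolExecutor()
    log_store = LogStore('my-batch-logs', 'batch-instance-1', pool, project='my-project')
    # Path construction is pure string formatting and needs no credentials:
    print(log_store.log_path(_FormatVersion(), batch_id=1, job_id=1,
                             attempt_id='abc123', task='main'))
    # Reading the log back would hit GCS:
    # log_text = await log_store.read_log_file(_FormatVersion(), 1, 1, 'abc123', 'main')


asyncio.run(_demo())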