import argparse
import asyncio
import random
import struct
import time

import numpy as np

import dbuf.client

# `utils` (providing `grouped` and `unzip`) and the `write`/`read` coroutines
# are defined elsewhere in this project.


async def main():
    parser = argparse.ArgumentParser(description='distributed buffer scale test')
    parser.add_argument('cluster_leader', type=str, help='cluster leader name, e.g. dbuf-0.dbuf')
    parser.add_argument('n', type=int, help='number of clients')
    parser.add_argument('bufsize', type=int, help='bufsize in MB')
    parser.add_argument('size', type=int, help='number of bytes to send per request')
    parser.add_argument('reqs', type=int, help='number of requests to send')
    args = parser.parse_args()

    n = args.n
    max_bufsize = args.bufsize * 1024 * 1024

    print('dbuf scale test')
    print(args)

    async with dbuf.client.DBufClient(args.cluster_leader,
                                      max_bufsize=max_bufsize,
                                      rng=random.Random(0)) as client:
        print('creating session')
        await client.create()

        def bytearray_with_index(i):
            # tag each payload with its index so mismatches can be identified
            b = bytearray(args.size)
            struct.pack_into('l', b, 0, i)
            return b

        print('creating data')
        data = [bytearray_with_index(i) for i in range(n * args.reqs)]
        # one group of `reqs` payloads per client
        data_for_worker = list(utils.grouped(args.reqs, data))

        print('starting test')
        start = time.time()
        keys, times = utils.unzip(await asyncio.gather(
            *[write(data_for_worker[i], args, client) for i in range(n)]))
        end = time.time()
        duration = end - start
        print(f'write aggregate-throughput: '
              f'{n * args.size * args.reqs / duration / 1024 / 1024 / 1024 : 0.3f} GiB/s')

        # shuffle keys and data together so reads hit servers in random order
        keys = [x for xs in keys for x in xs]
        indices = list(range(len(keys)))
        np.random.shuffle(indices)
        keys = [keys[i] for i in indices]
        data = [data[i] for i in indices]
        keys = list(utils.grouped(args.reqs, keys))
        data = list(utils.grouped(args.reqs, data))

        start = time.time()
        data2, times = utils.unzip(await asyncio.gather(
            *[read(args, keys[i], client) for i in range(n)]))
        end = time.time()
        duration = end - start
        print(f'read aggregate-throughput: '
              f'{n * args.size * args.reqs / duration / 1024 / 1024 / 1024 : 0.3f} GiB/s')

        await client.delete()

        assert len(data) == len(data2), f'{len(data)} {len(data2)}'
        assert data == data2, [
            (i, j,
             struct.unpack_from("l", x) if len(x) > 8 else x,
             struct.unpack_from("l", y) if len(y) > 8 else y)
            for i, (xs, ys) in enumerate(zip(data, data2))
            for j, (x, y) in enumerate(zip(xs, ys))
            if x != y]

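# The scale test above also leans on `utils.unzip`. A minimal sketch inferred
# from the call sites, where `unzip` splits a sequence of pairs into two
# lists (an assumption, not the project's actual implementation):
def unzip(pairs):
    # Split an iterable of (a, b) pairs into a pair of lists.
    lefts, rights = [], []
    for a, b in pairs:
        lefts.append(a)
        rights.append(b)
    return lefts, rights
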
async def submit(self):
    if self._submitted:
        raise ValueError("cannot submit an already submitted batch")
    self._submitted = True

    batch_spec = {'n_jobs': len(self._job_specs)}
    if self.attributes:
        batch_spec['attributes'] = self.attributes
    if self.callback:
        batch_spec['callback'] = self.callback

    b_resp = await self._client._post('/api/v1alpha/batches/create', json=batch_spec)
    b = await b_resp.json()
    log.info(f'created batch {b["id"]}')
    batch = Batch(self._client, b['id'], self.attributes)

    # submit job specs in groups of at most `job_array_size`, with at most
    # two groups in flight at a time
    await bounded_gather(
        *[functools.partial(self._submit_job, batch.id, specs)
          for specs in grouped(job_array_size, self._job_specs)],
        parallelism=2)

    await self._client._patch(f'/api/v1alpha/batches/{batch.id}/close')
    log.info(f'closed batch {b["id"]}')

    for j in self._jobs:
        j._job = j._job._submit(batch)

    self._job_specs = []
    self._jobs = []
    self._job_idx = 0

    return batch

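# `bounded_gather` above runs the submit partials with a concurrency cap of
# two. A minimal sketch matching that call shape, assuming a simple
# semaphore-based implementation (the project's real helper may differ):
import asyncio


async def bounded_gather(*pfs, parallelism=10):
    # Run the zero-argument coroutine factories, at most `parallelism` at a
    # time, and return their results in submission order.
    sem = asyncio.Semaphore(parallelism)

    async def run(pf):
        async with sem:
            return await pf()

    return await asyncio.gather(*[run(pf) for pf in pfs])
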
def test_grouped_size_0_groups_9_elements():
    try:
        list(grouped(0, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    except ValueError:
        pass
    else:
        assert False

async def fetch_prices(pricing_client: aioazure.AzurePricingClient,
                       regions: List[str]) -> List[AzurePrice]:
    # Azure appears to limit the length of an OData filter request, so split
    # the machine-type list into groups of eight and issue one query per group.
    vm_coros = [
        vm_prices_by_region(pricing_client, region, machine_types)
        for region in regions
        for machine_types in grouped(8, azure_valid_machine_types)
    ]
    disk_coros = [managed_disk_prices_by_region(pricing_client, region)
                  for region in regions]
    prices = await asyncio.gather(*vm_coros, *disk_coros)
    return flatten(prices)

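# `flatten` above merges the per-coroutine price lists into one list. A
# minimal sketch, assuming it removes exactly one level of nesting:
def flatten(xss):
    # Concatenate a list of lists into a single flat list.
    return [x for xs in xss for x in xs]
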
def test_benchmark_lookalike_workflow(self):
    b = self.batch()

    setup_jobs = []
    for i in range(10):
        j = b.new_job(f'setup_{i}').cpu(0.1)
        j.command(f'echo "foo" > {j.ofile}')
        setup_jobs.append(j)

    jobs = []
    for i in range(500):
        j = b.new_job(f'create_file_{i}').cpu(0.1)
        j.command(f'echo {setup_jobs[i % len(setup_jobs)].ofile} > {j.ofile}')
        j.command(f'echo "bar" >> {j.ofile}')
        jobs.append(j)

    combine = b.new_job('combine_output').cpu(0.1)
    # concatenate outputs in bounded groups so no single command exceeds the
    # arg_max() limit; each command must reference its own group, not all jobs
    for job_group in grouped(arg_max(), jobs):
        combine.command(f'cat {" ".join(shq(j.ofile) for j in job_group)} >> {combine.ofile}')
    b.write_output(combine.ofile, f'{self.gcs_output_dir}/pipeline_benchmark_test.txt')

def main(df_x_path, df_y_path, output_path, python_image):
    backend = hb.ServiceBackend()
    b = hb.Batch(backend=backend, name='rf-loo', default_python_image=python_image)

    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)

    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)

    indices = local_df_y.index.to_list()
    results = [None] * len(indices)

    inputs = []
    for i, window in enumerate(indices):
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results[i] = result
            continue
        inputs.append((window, i, checkpoint))

    # run ten random-forest calls per Python job to amortize job overhead
    for input_group in grouped(10, inputs):
        j = b.new_python_job()
        for window, i, checkpoint in input_group:
            result = j.call(random_forest, df_x_input, df_y_input, window)
            tsv_result = j.call(as_tsv, result)
            tsv_result = tsv_result.as_str()
            b.write_output(tsv_result, checkpoint)
            results[i] = tsv_result

    output = hb.concatenate(b, results)
    b.write_output(output, output_path)

    b.run(wait=False)
    backend.close()

def test_benchmark_lookalike_workflow(self):
    p = self.pipeline()

    setup_tasks = []
    for i in range(10):
        t = p.new_task(f'setup_{i}').cpu(0.1)
        t.command(f'echo "foo" > {t.ofile}')
        setup_tasks.append(t)

    tasks = []
    for i in range(500):
        t = p.new_task(f'create_file_{i}').cpu(0.1)
        t.command(f'echo {setup_tasks[i % len(setup_tasks)].ofile} > {t.ofile}')
        t.command(f'echo "bar" >> {t.ofile}')
        tasks.append(t)

    combine = p.new_task('combine_output').cpu(0.1)
    for task_group in grouped(arg_max(), tasks):
        combine.command(f'cat {" ".join(shq(t.ofile) for t in task_group)} >> {combine.ofile}')
    p.write_output(combine.ofile, f'{gcs_output_dir}/pipeline_benchmark_test.txt')

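# Both benchmark tests above cap how many output files go into one `cat`
# command via `arg_max()`. A plausible sketch only (an assumption: the real
# helper may use a fixed constant or compute this differently), derived from
# the POSIX ARG_MAX byte limit as a rough proxy for a safe argument count:
import os


def arg_max():
    # Kernel command-line byte limit, divided by a generous per-path estimate,
    # as an upper bound on arguments per command. Hypothetical implementation.
    return os.sysconf('SC_ARG_MAX') // 512
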
def test_grouped_size_3_groups_7_elements():
    actual = list(grouped(3, ['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr', 'stu']))
    expected = [['abc', 'def', 'ghi'], ['jkl', 'mno', 'pqr'], ['stu']]
    assert actual == expected

def test_grouped_size_2_groups_5_elements():
    actual = list(grouped(2, ['abc', 'def', 'ghi', 'jkl', 'mno']))
    expected = [['abc', 'def'], ['ghi', 'jkl'], ['mno']]
    assert actual == expected

def test_grouped_size_1_groups_1_elements():
    actual = list(grouped(1, [0]))
    expected = [[0]]
    assert actual == expected

def test_grouped_size_2_groups_1_elements():
    actual = list(grouped(2, [1]))
    expected = [[1]]
    assert actual == expected

def test_grouped_size_3_groups_0_elements():
    actual = list(grouped(3, []))
    expected = []
    assert actual == expected

def test_grouped_size_5_groups_9_elements():
    actual = list(grouped(5, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    expected = [[1, 2, 3, 4, 5], [6, 7, 8, 9]]
    assert actual == expected

def test_grouped_size_1_groups_9_elements():
    actual = list(grouped(1, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    expected = [[1], [2], [3], [4], [5], [6], [7], [8], [9]]
    assert actual == expected

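# Every snippet above funnels work through `grouped`. A minimal sketch that
# satisfies all of the tests in this section (an assumption about the real
# helper, which may instead be implemented by slicing lists):
def grouped(n, xs):
    # Yield successive lists of at most n elements from xs; the final group
    # may be shorter. Group sizes below one are rejected, matching the
    # ValueError test above; raising inside the generator is sufficient there
    # because list() iterates it and triggers the check.
    if n < 1:
        raise ValueError(f'group size must be at least 1, got {n}')
    group = []
    for x in xs:
        group.append(x)
        if len(group) == n:
            yield group
            group = []
    if group:
        yield group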