import argparse
import asyncio
import random
import struct
import time

import numpy as np

import dbuf.client

# `utils` (providing `grouped` and `unzip`) and the `write`/`read` coroutines
# are defined elsewhere in this project.


async def main():
    parser = argparse.ArgumentParser(description='distributed buffer scale test')
    parser.add_argument('cluster_leader', type=str, help='cluster leader name, e.g. dbuf-0.dbuf')
    parser.add_argument('n', type=int, help='number of clients')
    parser.add_argument('bufsize', type=int, help='bufsize in MB')
    parser.add_argument('size', type=int, help='number of bytes to send per request')
    parser.add_argument('reqs', type=int, help='number of requests to send')
    args = parser.parse_args()

    n = args.n
    max_bufsize = args.bufsize * 1024 * 1024

    print('dbuf scale test')
    print(args)

    async with dbuf.client.DBufClient(args.cluster_leader,
                                      max_bufsize=max_bufsize,
                                      rng=random.Random(0)) as client:
        print('creating session')
        await client.create()

        def bytearray_with_index(i):
            # tag each payload with its index so mismatches can be identified
            b = bytearray(args.size)
            struct.pack_into('l', b, 0, i)
            return b

        print('creating data')
        data = [bytearray_with_index(i) for i in range(n * args.reqs)]
        # one group of `reqs` payloads per client
        data_for_worker = list(utils.grouped(args.reqs, data))

        print('starting test')
        start = time.time()
        keys, times = utils.unzip(await asyncio.gather(
            *[write(data_for_worker[i], args, client) for i in range(n)]))
        end = time.time()
        duration = end - start
        print(f'write aggregate-throughput: '
              f'{n * args.size * args.reqs / duration / 1024 / 1024 / 1024 : 0.3f} GiB/s')

        # shuffle keys and data together so reads hit servers in random order
        keys = [x for xs in keys for x in xs]
        indices = list(range(len(keys)))
        np.random.shuffle(indices)
        keys = [keys[i] for i in indices]
        data = [data[i] for i in indices]
        keys = list(utils.grouped(args.reqs, keys))
        data = list(utils.grouped(args.reqs, data))

        start = time.time()
        data2, times = utils.unzip(await asyncio.gather(
            *[read(args, keys[i], client) for i in range(n)]))
        end = time.time()
        duration = end - start
        print(f'read aggregate-throughput: '
              f'{n * args.size * args.reqs / duration / 1024 / 1024 / 1024 : 0.3f} GiB/s')

        await client.delete()

        assert len(data) == len(data2), f'{len(data)} {len(data2)}'
        assert data == data2, [
            (i, j,
             struct.unpack_from("l", x) if len(x) > 8 else x,
             struct.unpack_from("l", y) if len(y) > 8 else y)
            for i, (xs, ys) in enumerate(zip(data, data2))
            for j, (x, y) in enumerate(zip(xs, ys))
            if x != y]

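# The scale test above also leans on `utils.unzip`. A minimal sketch inferred
# from the call sites, where `unzip` splits a sequence of pairs into two
# lists (an assumption, not the project's actual implementation):
def unzip(pairs):
    # Split an iterable of (a, b) pairs into a pair of lists.
    lefts, rights = [], []
    for a, b in pairs:
        lefts.append(a)
        rights.append(b)
    return lefts, rights
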
async def submit(self):
    if self._submitted:
        raise ValueError("cannot submit an already submitted batch")
    self._submitted = True

    batch_spec = {'n_jobs': len(self._job_specs)}
    if self.attributes:
        batch_spec['attributes'] = self.attributes
    if self.callback:
        batch_spec['callback'] = self.callback

    b_resp = await self._client._post('/api/v1alpha/batches/create', json=batch_spec)
    b = await b_resp.json()
    log.info(f'created batch {b["id"]}')
    batch = Batch(self._client, b['id'], self.attributes)

    # submit job specs in groups of at most `job_array_size`, with at most
    # two groups in flight at a time
    await bounded_gather(
        *[functools.partial(self._submit_job, batch.id, specs)
          for specs in grouped(job_array_size, self._job_specs)],
        parallelism=2)

    await self._client._patch(f'/api/v1alpha/batches/{batch.id}/close')
    log.info(f'closed batch {b["id"]}')

    for j in self._jobs:
        j._job = j._job._submit(batch)

    self._job_specs = []
    self._jobs = []
    self._job_idx = 0

    return batch

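# `bounded_gather` above runs the submit partials with a concurrency cap of
# two. A minimal sketch matching that call shape, assuming a simple
# semaphore-based implementation (the project's real helper may differ):
import asyncio


async def bounded_gather(*pfs, parallelism=10):
    # Run the zero-argument coroutine factories, at most `parallelism` at a
    # time, and return their results in submission order.
    sem = asyncio.Semaphore(parallelism)

    async def run(pf):
        async with sem:
            return await pf()

    return await asyncio.gather(*[run(pf) for pf in pfs])
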
def test_grouped_size_0_groups_9_elements():
    try:
        list(grouped(0, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    except ValueError:
        pass
    else:
        assert False

async def fetch_prices(pricing_client: aioazure.AzurePricingClient,
                       regions: List[str]) -> List[AzurePrice]:
    # Azure appears to limit the length of an OData filter request, so split
    # the machine-type list into groups of eight and issue one query per group.
    vm_coros = [
        vm_prices_by_region(pricing_client, region, machine_types)
        for region in regions
        for machine_types in grouped(8, azure_valid_machine_types)
    ]
    disk_coros = [managed_disk_prices_by_region(pricing_client, region)
                  for region in regions]
    prices = await asyncio.gather(*vm_coros, *disk_coros)
    return flatten(prices)

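# `flatten` above merges the per-coroutine price lists into one list. A
# minimal sketch, assuming it removes exactly one level of nesting:
def flatten(xss):
    # Concatenate a list of lists into a single flat list.
    return [x for xs in xss for x in xs]
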
def test_benchmark_lookalike_workflow(self):
    b = self.batch()

    setup_jobs = []
    for i in range(10):
        j = b.new_job(f'setup_{i}').cpu(0.1)
        j.command(f'echo "foo" > {j.ofile}')
        setup_jobs.append(j)

    jobs = []
    for i in range(500):
        j = b.new_job(f'create_file_{i}').cpu(0.1)
        j.command(f'echo {setup_jobs[i % len(setup_jobs)].ofile} > {j.ofile}')
        j.command(f'echo "bar" >> {j.ofile}')
        jobs.append(j)

    combine = b.new_job('combine_output').cpu(0.1)
    # concatenate outputs in bounded groups so no single command exceeds the
    # arg_max() limit; each command must reference its own group, not all jobs
    for job_group in grouped(arg_max(), jobs):
        combine.command(f'cat {" ".join(shq(j.ofile) for j in job_group)} >> {combine.ofile}')
    b.write_output(combine.ofile, f'{self.gcs_output_dir}/pipeline_benchmark_test.txt')

def main(df_x_path, df_y_path, output_path, python_image):
    backend = hb.ServiceBackend()
    b = hb.Batch(backend=backend, name='rf-loo', default_python_image=python_image)

    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)

    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)

    indices = local_df_y.index.to_list()
    results = [None] * len(indices)

    inputs = []
    for i, window in enumerate(indices):
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results[i] = result
            continue
        inputs.append((window, i, checkpoint))

    # run ten random-forest calls per Python job to amortize job overhead
    for input_group in grouped(10, inputs):
        j = b.new_python_job()
        for window, i, checkpoint in input_group:
            result = j.call(random_forest, df_x_input, df_y_input, window)
            tsv_result = j.call(as_tsv, result)
            tsv_result = tsv_result.as_str()
            b.write_output(tsv_result, checkpoint)
            results[i] = tsv_result

    output = hb.concatenate(b, results)
    b.write_output(output, output_path)

    b.run(wait=False)
    backend.close()

def test_benchmark_lookalike_workflow(self):
    p = self.pipeline()

    setup_tasks = []
    for i in range(10):
        t = p.new_task(f'setup_{i}').cpu(0.1)
        t.command(f'echo "foo" > {t.ofile}')
        setup_tasks.append(t)

    tasks = []
    for i in range(500):
        t = p.new_task(f'create_file_{i}').cpu(0.1)
        t.command(f'echo {setup_tasks[i % len(setup_tasks)].ofile} > {t.ofile}')
        t.command(f'echo "bar" >> {t.ofile}')
        tasks.append(t)

    combine = p.new_task('combine_output').cpu(0.1)
    for task_group in grouped(arg_max(), tasks):
        combine.command(f'cat {" ".join(shq(t.ofile) for t in task_group)} >> {combine.ofile}')
    p.write_output(combine.ofile, f'{gcs_output_dir}/pipeline_benchmark_test.txt')

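# Both benchmark tests above cap how many output files go into one `cat`
# command via `arg_max()`. A plausible sketch only (an assumption: the real
# helper may use a fixed constant or compute this differently), derived from
# the POSIX ARG_MAX byte limit as a rough proxy for a safe argument count:
import os


def arg_max():
    # Kernel command-line byte limit, divided by a generous per-path estimate,
    # as an upper bound on arguments per command. Hypothetical implementation.
    return os.sysconf('SC_ARG_MAX') // 512
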
def test_grouped_size_3_groups_7_elements():
    actual = list(grouped(3, ['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr', 'stu']))
    expected = [['abc', 'def', 'ghi'], ['jkl', 'mno', 'pqr'], ['stu']]
    assert actual == expected

def test_grouped_size_2_groups_5_elements():
    actual = list(grouped(2, ['abc', 'def', 'ghi', 'jkl', 'mno']))
    expected = [['abc', 'def'], ['ghi', 'jkl'], ['mno']]
    assert actual == expected

def test_grouped_size_1_groups_1_elements():
    actual = list(grouped(1, [0]))
    expected = [[0]]
    assert actual == expected

def test_grouped_size_2_groups_1_elements():
    actual = list(grouped(2, [1]))
    expected = [[1]]
    assert actual == expected

def test_grouped_size_3_groups_0_elements():
    actual = list(grouped(3, []))
    expected = []
    assert actual == expected

def test_grouped_size_5_groups_9_elements():
    actual = list(grouped(5, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    expected = [[1, 2, 3, 4, 5], [6, 7, 8, 9]]
    assert actual == expected

def test_grouped_size_1_groups_9_elements():
    actual = list(grouped(1, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
    expected = [[1], [2], [3], [4], [5], [6], [7], [8], [9]]
    assert actual == expected

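# Every snippet above funnels work through `grouped`. A minimal sketch that
# satisfies all of the tests in this section (an assumption about the real
# helper, which may instead be implemented by slicing lists):
def grouped(n, xs):
    # Yield successive lists of at most n elements from xs; the final group
    # may be shorter. Group sizes below one are rejected, matching the
    # ValueError test above; raising inside the generator is sufficient there
    # because list() iterates it and triggers the check.
    if n < 1:
        raise ValueError(f'group size must be at least 1, got {n}')
    group = []
    for x in xs:
        group.append(x)
        if len(group) == n:
            yield group
            group = []
    if group:
        yield group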