コード例 #1
0
ファイル: array.py プロジェクト: lasersonlab/zappy
    def _repartition_chunks(self, chunks):
        """Return a new array holding the same data regrouped into ``chunks``.

        The work is done in two stages:

        1. The current DAG is extended with ``extract_partial_chunks`` and a
           step that writes every partial chunk it produces into a freshly
           created group under ``self.intermediate_group``. That DAG is computed
           immediately.
        2. A new DAG is built with one partition per new chunk; each partition
           reads back the partial chunks stored for it and copies them into a
           single array.

        Args:
            chunks: target chunk shape. ``chunks[0]`` is the number of rows per
                new chunk and ``chunks[1]`` the number of columns (the code
                indexes exactly these two entries).

        Returns:
            An ``ExecutorZappyArray`` built on the new DAG, carrying this
            array's ``shape`` and ``dtype`` and the requested ``chunks``.
        """
        c = chunks[0]
        # Copy the dtype into a local so ``tmp_load`` can allocate buffers of
        # the right type without its closure having to capture ``self``.
        dtype = self.dtype
        partition_row_ranges, total_rows, new_num_partitions = calculate_partition_boundaries(
            chunks, self.partition_row_counts)

        # make a new zarr group in the intermediate store with a unique name
        root = self.intermediate_group.create_group(str(uuid.uuid4()))
        # make a zarr group for each partition
        # NOTE: the iterator returned by ``executor.map`` is never consumed, so
        # an exception raised while creating a group is not re-raised here;
        # ``tmp_store`` below falls back on ``require_group`` for its lookups.
        with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor:
            executor.map(lambda index: root.create_group(str(index)),
                         range(new_num_partitions))

        def tmp_store(pairs):
            # Each pair is (new partition index, (row offsets, partial chunk)).
            for pair in pairs:
                index, offsets, partial_chunk = pair[0], pair[1][0], pair[1][1]
                g = root.require_group(str(index))
                # The array name encodes the "start-end" row offsets so that
                # ``tmp_load`` can put the data back in the right place.
                g.array("%s-%s" % (offsets[0], offsets[1]),
                        partial_chunk,
                        chunks=False)

        x1 = self.dag.add_input(partition_row_ranges)
        x2 = self.dag.transform(
            lambda x, y: extract_partial_chunks((x, y), chunks),
            [x1, self.input])

        x3 = self.dag.transform(tmp_store, [x2])

        # run computation to save partial chunks
        list(self.dag.compute(x3))

        # create a new computation to read and combine partial chunks
        def tmp_load(new_index):
            # last chunk has fewer than c rows
            if new_index == new_num_partitions - 1 and total_rows % c != 0:
                last_chunk_rows = total_rows % c
                buffer_shape = (last_chunk_rows, chunks[1])
            else:
                buffer_shape = chunks
            # Allocate with the array's dtype: the NumPy default (float64)
            # would silently change the element type of repartitioned data.
            arr = np.zeros(buffer_shape, dtype=dtype)
            g = root.require_group(str(new_index))
            for (name, partial_chunk) in g.arrays():
                new_start_offset, new_end_offset = [
                    int(n) for n in name.split("-")
                ]
                arr[new_start_offset:new_end_offset] = partial_chunk
            return arr

        dag = DAG(self.executor)
        partition_indices = dag.add_input(list(range(new_num_partitions)))
        loaded = dag.transform(tmp_load, [partition_indices])

        # TODO: delete intermediate store when dag is computed
        return ExecutorZappyArray(self.executor, dag, loaded, self.shape,
                                  chunks, self.dtype)
コード例 #2
0
ファイル: array.py プロジェクト: bebatut/zappy
 def ones(cls, executor, shape, chunks, dtype=float, intermediate_store=None):
     """Build an array of the given shape in which every element is one.

     A new DAG is created whose input holds one entry per value produced by
     ``get_chunk_sizes(shape, chunks)``; a transform then turns each entry
     into an ``np.ones`` array of the requested ``dtype``.

     Args:
         executor: executor handed to the ``DAG`` and to ``cls``.
         shape: overall shape of the array.
         chunks: chunk shape used to split ``shape``.
         dtype: element type of the generated chunks (defaults to ``float``).
         intermediate_store: forwarded unchanged to ``cls``.

     Returns:
         An instance of ``cls`` backed by the new DAG.
     """

     def make_ones_chunk(chunk_shape):
         # One chunk's worth of ones, in the requested dtype.
         return np.ones(chunk_shape, dtype=dtype)

     graph = DAG(executor)
     chunk_shapes = list(get_chunk_sizes(shape, chunks))
     ones_node = graph.transform(make_ones_chunk, [graph.add_input(chunk_shapes)])
     return cls(executor, graph, ones_node, shape, chunks, dtype,
                intermediate_store=intermediate_store)
コード例 #3
0
ファイル: array.py プロジェクト: lasersonlab/zappy
 def from_ndarray(cls, executor, arr, chunks, intermediate_store=None):
     """Build an array from ``arr``, split according to ``chunks``.

     ``ZappyArray._read_chunks(arr, chunks)`` supplies a reader function and
     the chunk indices. The indices become the input of a new DAG and the
     reader is applied to them as a transform.

     Args:
         executor: executor handed to the ``DAG`` and to ``cls``.
         arr: source array; its ``shape`` and ``dtype`` are passed to ``cls``.
         chunks: chunk shape used to split ``arr``.
         intermediate_store: forwarded unchanged to ``cls``.

     Returns:
         An instance of ``cls`` backed by the new DAG.
     """
     read_chunk, chunk_indices = ZappyArray._read_chunks(arr, chunks)
     graph = DAG(executor)
     # Seed the graph with the chunk indices, then map the reader over them.
     index_node = graph.add_input(chunk_indices)
     chunk_node = graph.transform(read_chunk, [index_node])
     return cls(executor, graph, chunk_node, arr.shape, chunks, arr.dtype,
                intermediate_store=intermediate_store)
コード例 #4
0
def test_dag_multiple_partitions():
    """A unary transform is applied to each partition of a three-partition input."""
    # Manage the pool with ``with`` so its worker threads are shut down when
    # the test ends instead of being left running.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        dag = DAG(executor)
        source = dag.add_input([2, 3, 5])
        output = dag.transform(add_one, [source])
        assert list(dag.compute(output)) == [3, 4, 6]
コード例 #5
0
def test_dag_single_partition_binary_function():
    """A two-argument transform combines two single-partition inputs."""
    # Manage the pool with ``with`` so its worker threads are shut down when
    # the test ends instead of being left running.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        dag = DAG(executor)
        input1 = dag.add_input([2])
        input2 = dag.add_input([3])
        output = dag.transform(add, [input1, input2])
        assert list(dag.compute(output)) == [5]
コード例 #6
0
def test_dag_single_partition_serial_functions():
    """Two transforms chained one after the other are both applied, in order."""
    # Manage the pool with ``with`` so its worker threads are shut down when
    # the test ends instead of being left running.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        dag = DAG(executor)
        source = dag.add_input([2])
        intermediate = dag.transform(add_one, [source])
        output = dag.transform(times_two, [intermediate])
        assert list(dag.compute(output)) == [6]
コード例 #7
0
ファイル: test_dag.py プロジェクト: lasersonlab/zappy
def test_dag_multiple_partitions_binary_function():
    """A two-argument transform pairs up the partitions of two inputs."""
    # Manage the pool with ``with`` so its worker threads are shut down when
    # the test ends instead of being left running.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        dag = DAG(executor)
        input1 = dag.add_input([2, 3, 5])
        input2 = dag.add_input([7, 11, 13])
        output = dag.transform(add, [input1, input2])
        assert list(dag.compute(output)) == [9, 14, 18]