def _repartition_chunks(self, chunks):
    c = chunks[0]
    partition_row_ranges, total_rows, new_num_partitions = calculate_partition_boundaries(
        chunks, self.partition_row_counts
    )
    # make a new zarr group in the intermediate store with a unique name
    root = self.intermediate_group.create_group(str(uuid.uuid4()))
    # make a zarr group for each partition
    with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor:
        executor.map(lambda index: root.create_group(str(index)), range(new_num_partitions))

    def tmp_store(pairs):
        for pair in pairs:
            index, offsets, partial_chunk = pair[0], pair[1][0], pair[1][1]
            g = root.require_group(str(index))
            g.array("%s-%s" % (offsets[0], offsets[1]), partial_chunk, chunks=False)

    x1 = self.dag.add_input(partition_row_ranges)
    x2 = self.dag.transform(
        lambda x, y: extract_partial_chunks((x, y), chunks), [x1, self.input]
    )
    x3 = self.dag.transform(tmp_store, [x2])

    # run computation to save partial chunks
    list(self.dag.compute(x3))

    # create a new computation to read and combine partial chunks
    def tmp_load(new_index):
        # last chunk has fewer than c rows
        if new_index == new_num_partitions - 1 and total_rows % c != 0:
            last_chunk_rows = total_rows % c
            arr = np.zeros((last_chunk_rows, chunks[1]))
        else:
            arr = np.zeros(chunks)
        g = root.require_group(str(new_index))
        for (name, partial_chunk) in g.arrays():
            new_start_offset, new_end_offset = [int(n) for n in name.split("-")]
            arr[new_start_offset:new_end_offset] = partial_chunk
        return arr

    dag = DAG(self.executor)
    input = dag.add_input(list(range(new_num_partitions)))
    input = dag.transform(tmp_load, [input])
    # TODO: delete intermediate store when dag is computed
    return ExecutorZappyArray(self.executor, dag, input, self.shape, chunks, self.dtype)
def ones(cls, executor, shape, chunks, dtype=float, intermediate_store=None):
    dag = DAG(executor)
    input = dag.add_input(list(get_chunk_sizes(shape, chunks)))
    input = dag.transform(lambda chunk: np.ones(chunk, dtype=dtype), [input])
    return cls(
        executor,
        dag,
        input,
        shape,
        chunks,
        dtype,
        intermediate_store=intermediate_store,
    )
def from_ndarray(cls, executor, arr, chunks, intermediate_store=None):
    func, chunk_indices = ZappyArray._read_chunks(arr, chunks)
    dag = DAG(executor)
    # the input is just the chunk indices
    input = dag.add_input(chunk_indices)
    # add a transform to read chunks
    input = dag.transform(func, [input])
    return cls(
        executor,
        dag,
        input,
        arr.shape,
        chunks,
        arr.dtype,
        intermediate_store=intermediate_store,
    )
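# Illustrative usage sketch (not part of the library): shows how the two
# constructors above might be called. It assumes these classmethods live on
# ExecutorZappyArray (as suggested by _repartition_chunks above) and that a
# plain ThreadPoolExecutor is an acceptable executor, as in the tests below.
#
#     import concurrent.futures
#     import numpy as np
#
#     executor = concurrent.futures.ThreadPoolExecutor()
#     # a 6x4 array of ones, split into three 2x4 chunks
#     za_ones = ExecutorZappyArray.ones(executor, shape=(6, 4), chunks=(2, 4))
#     # wrap an existing in-memory ndarray with the same chunking
#     za_arr = ExecutorZappyArray.from_ndarray(
#         executor, np.arange(24).reshape(6, 4), chunks=(2, 4)
#     )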
def test_dag_multiple_partitions():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input = dag.add_input([2, 3, 5])
    output = dag.transform(add_one, [input])
    assert list(dag.compute(output)) == [3, 4, 6]
def test_dag_single_partition_binary_function():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input1 = dag.add_input([2])
    input2 = dag.add_input([3])
    output = dag.transform(add, [input1, input2])
    assert list(dag.compute(output)) == [5]
def test_dag_single_partition_serial_functions():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input = dag.add_input([2])
    intermediate = dag.transform(add_one, [input])
    output = dag.transform(times_two, [intermediate])
    assert list(dag.compute(output)) == [6]
def test_dag_multiple_partitions_binary_function():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input1 = dag.add_input([2, 3, 5])
    input2 = dag.add_input([7, 11, 13])
    output = dag.transform(add, [input1, input2])
    assert list(dag.compute(output)) == [9, 14, 18]
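# Illustrative sketch (an assumption, not an existing test): combines the two
# patterns shown above -- multiple partitions and chained transforms -- using
# only the DAG calls demonstrated in these tests (add_input, transform, compute).
def test_dag_multiple_partitions_serial_functions():
    dag = DAG(concurrent.futures.ThreadPoolExecutor())
    input = dag.add_input([2, 3, 5])
    intermediate = dag.transform(add_one, [input])  # [3, 4, 6]
    output = dag.transform(times_two, [intermediate])  # [6, 8, 12]
    assert list(dag.compute(output)) == [6, 8, 12]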