def test_sanitize_index():
    pd = pytest.importorskip('pandas')
    with pytest.raises(TypeError):
        sanitize_index('Hello!')

    np.testing.assert_equal(sanitize_index(pd.Series([1, 2, 3])), [1, 2, 3])
    np.testing.assert_equal(sanitize_index((1, 2, 3)), [1, 2, 3])
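# Illustrative usage sketch (not part of the test suite; it assumes the import
# path ``dask.array.slicing.sanitize_index`` used elsewhere in the codebase).
# It mirrors what the test above asserts: index-like inputs are normalized to
# something comparable with a NumPy array, and non-index inputs raise.
#
#     >>> import numpy as np
#     >>> from dask.array.slicing import sanitize_index
#     >>> idx = sanitize_index((1, 2, 3))            # tuple is normalized
#     >>> np.testing.assert_equal(idx, [1, 2, 3])    # equal to [1, 2, 3]
#     >>> sanitize_index("Hello!")                   # raises TypeError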
def reshape(x, shape, merge_chunks=True, limit=None):
    """Reshape array to new shape

    Parameters
    ----------
    shape : int or tuple of ints
        The new shape should be compatible with the original shape. If
        an integer, then the result will be a 1-D array of that length.
        One shape dimension can be -1. In this case, the value is
        inferred from the length of the array and remaining dimensions.
    merge_chunks : bool, default True
        Whether to merge chunks using the logic in :meth:`dask.array.rechunk`
        when communication is necessary given the input array chunking and
        the output shape. With ``merge_chunks==False``, the input array will
        be rechunked to a chunksize of 1, which can create very many tasks.
    limit: int (optional)
        The maximum block size to target in bytes. If no limit is provided,
        it defaults to using the ``array.chunk-size`` Dask config value.

    Notes
    -----
    This is a parallelized version of the ``np.reshape`` function with the
    following limitations:

    1.  It assumes that the array is stored in `row-major order`_
    2.  It only allows for reshapings that collapse or merge dimensions like
        ``(1, 2, 3, 4) -> (1, 6, 4)`` or ``(64,) -> (4, 4, 4)``

    .. _`row-major order`: https://en.wikipedia.org/wiki/Row-_and_column-major_order

    When communication is necessary this algorithm depends on the logic within
    rechunk. It endeavors to keep chunk sizes roughly the same when possible.

    See :ref:`array-chunks.reshaping` for a discussion of the tradeoffs of
    ``merge_chunks``.

    See Also
    --------
    dask.array.rechunk
    numpy.reshape
    """
    # Sanitize inputs, look for -1 in shape
    from dask.array.core import PerformanceWarning
    from dask.array.slicing import sanitize_index

    shape = tuple(map(sanitize_index, shape))
    known_sizes = [s for s in shape if s != -1]
    if len(known_sizes) < len(shape):
        if len(shape) - len(known_sizes) > 1:
            raise ValueError("can only specify one unknown dimension")
        # Fastpath for x.reshape(-1) on 1D arrays, allows unknown shape in x
        # for this case only.
        if len(shape) == 1 and x.ndim == 1:
            return x
        missing_size = sanitize_index(x.size / reduce(mul, known_sizes, 1))
        shape = tuple(missing_size if s == -1 else s for s in shape)

    if np.isnan(sum(x.shape)):
        raise ValueError(
            "Array chunk size or shape is unknown. shape: %s\n\n"
            "Possible solution with x.compute_chunk_sizes()" % str(x.shape)
        )

    if reduce(mul, shape, 1) != x.size:
        raise ValueError("total size of new array must be unchanged")

    if x.shape == shape:
        return x

    meta = meta_from_array(x, len(shape))

    name = "reshape-" + tokenize(x, shape)

    if x.npartitions == 1:
        key = next(flatten(x.__dask_keys__()))
        dsk = {(name,) + (0,) * len(shape): (M.reshape, key, shape)}
        chunks = tuple((d,) for d in shape)
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])
        return Array(graph, name, chunks, meta=meta)

    # Logic for how to rechunk
    din = len(x.shape)
    dout = len(shape)
    if not merge_chunks and din > dout:
        x = x.rechunk({i: 1 for i in range(din - dout)})

    inchunks, outchunks = reshape_rechunk(x.shape, shape, x.chunks)

    # Check output chunks are not too large
    max_chunksize_in_bytes = reduce(mul, [max(i) for i in outchunks]) * x.dtype.itemsize

    if limit is None:
        limit = parse_bytes(config.get("array.chunk-size"))
        split = config.get("array.slicing.split-large-chunks", None)
    else:
        limit = parse_bytes(limit)
        split = True

    if max_chunksize_in_bytes > limit:
        if split is None:
            msg = (
                "Reshaping is producing a large chunk. To accept the large\n"
                "chunk and silence this warning, set the option\n"
                "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n"
                "    ...     array.reshape(shape)\n\n"
                "To avoid creating the large chunks, set the option\n"
                "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n"
                "    ...     array.reshape(shape)\n\n"
                "Explicitly passing ``limit`` to ``reshape`` will also silence this warning\n"
                "    >>> array.reshape(shape, limit='128 MiB')"
            )
            warnings.warn(msg, PerformanceWarning, stacklevel=6)
        elif split:
            # Leave chunk sizes unaltered where possible
            matching_chunks = Counter(inchunks) & Counter(outchunks)
            chunk_plan = []
            for out in outchunks:
                if matching_chunks[out] > 0:
                    chunk_plan.append(out)
                    matching_chunks[out] -= 1
                else:
                    chunk_plan.append("auto")
            outchunks = normalize_chunks(
                chunk_plan,
                shape=shape,
                limit=limit,
                dtype=x.dtype,
                previous_chunks=inchunks,
            )

    x2 = x.rechunk(inchunks)

    # Construct graph
    in_keys = list(product([x2.name], *[range(len(c)) for c in inchunks]))
    out_keys = list(product([name], *[range(len(c)) for c in outchunks]))
    shapes = list(product(*outchunks))
    dsk = {a: (M.reshape, b, shape) for a, b, shape in zip(out_keys, in_keys, shapes)}

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x2])
    return Array(graph, name, outchunks, meta=meta)
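# Illustrative usage sketch (not part of the module; the array shape, chunking,
# and ``limit`` value below are made up for demonstration). It shows the two
# knobs documented above: ``merge_chunks=False`` rechunks the collapsed leading
# dimensions to size 1 instead of merging chunks (fewer transfers, many more
# tasks), and ``limit`` caps the byte size of the output chunks.
#
#     >>> import dask.array as da
#     >>> x = da.ones((4, 16, 4), chunks=(2, 4, 2))
#     >>> da.reshape(x, (64, 4)).chunks                        # default: merge chunks via rechunk
#     >>> da.reshape(x, (64, 4), merge_chunks=False).chunks    # rechunk leading dims to size 1
#     >>> da.reshape(x, (64, 4), limit="16 MiB").chunks        # cap output chunk size in bytes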