Example #1
    def add_data(self, gdf):

        # Populate the column index mapping
        if not self.col_idx:
            for i, x in enumerate(gdf.columns.values):
                self.col_idx[str(x)] = i

        # Generate `ind` array to map each row to an output file.
        # This approach is better optimized for shuffling than for
        # non-shuffling, but using a single code path is probably
        # worth the (possible) minor overhead.
        nrows = gdf.shape[0]
        typ = np.min_scalar_type(nrows * 2)
        if self.shuffle:
            ind = cp.random.choice(cp.arange(self.num_out_files, dtype=typ),
                                   nrows)
        else:
            ind = cp.arange(nrows, dtype=typ)
            cp.floor_divide(ind,
                            math.ceil(nrows / self.num_out_files),
                            out=ind)
        for x, group in enumerate(
                gdf.scatter_by_map(ind,
                                   map_size=self.num_out_files,
                                   keep_index=False)):
            self.num_samples[x] += len(group)
            if self.num_threads > 1:
                self.queue.put((x, group))
            else:
                self._write_table(x, group)

        # Wait for all queued writes to finish before returning
        # (so that we aren't holding the data in memory)
        if self.num_threads > 1:
            self.queue.join()
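A minimal sketch of how the `ind` mapping above behaves, with numpy standing in for cupy so it runs without a GPU (the calls used here are signature-compatible); the values of nrows and num_out_files are illustrative:

import math

import numpy as np  # numpy stands in for cupy here

nrows, num_out_files = 10, 3
typ = np.min_scalar_type(nrows * 2)  # smallest dtype that can hold the file ids

# Non-shuffling path: contiguous blocks of ceil(nrows / num_out_files)
# rows map to the same output file.
ind = np.arange(nrows, dtype=typ)
np.floor_divide(ind, math.ceil(nrows / num_out_files), out=ind)
print(ind)  # [0 0 0 0 1 1 1 1 2 2]

# Shuffling path: each row draws a file id uniformly at random.
shuffled = np.random.choice(np.arange(num_out_files, dtype=typ), nrows)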
Example #2
    def _add_data_scatter(self, gdf):
        """Write scattered pieces.

        This method is for cudf-backed data only.
        """
        assert not self.cpu

        # Generate `ind` array to map each row to an output file.
        # This approach is better optimized for shuffling than for
        # non-shuffling, but using a single code path is probably
        # worth the (possible) minor overhead.
        nrows = gdf.shape[0]
        typ = np.min_scalar_type(nrows * 2)
        if self.shuffle:
            ind = cp.random.choice(cp.arange(self.num_out_files, dtype=typ),
                                   nrows)
        else:
            ind = cp.arange(nrows, dtype=typ)
            cp.floor_divide(ind,
                            math.ceil(nrows / self.num_out_files),
                            out=ind)
        for x, group in enumerate(
                gdf.scatter_by_map(ind,
                                   map_size=self.num_out_files,
                                   keep_index=False)):
            self.num_samples[x] += len(group)
            if self.num_threads > 1:
                self.queue.put((x, group))
            else:
                self._write_table(x, group)
Example #3
def _masked_column_median(arr, masked_value):
    """Compute the median of each column in the 2D array arr, ignoring any
    instances of masked_value"""
    mask = _get_mask(arr, masked_value)
    if arr.size == 0:
        return cp.full(arr.shape[1], cp.nan)
    arr_sorted = arr.copy()
    if not cp.isnan(masked_value):
        # If nan is not the missing value, any column with nans should
        # have a median of nan
        nan_cols = cp.any(cp.isnan(arr), axis=0)
        arr_sorted[mask] = cp.nan
    else:
        nan_cols = cp.full(arr.shape[1], False)
    # nans are always sorted to end of array
    arr_sorted = cp.sort(arr_sorted, axis=0)

    count_missing_values = mask.sum(axis=0)
    # Ignore missing values in determining "halfway" index of sorted
    # array
    n_elems = arr.shape[0] - count_missing_values

    # If no elements remain after removing missing values, the median
    # for that column is nan
    nan_cols = cp.logical_or(nan_cols, n_elems <= 0)

    col_index = cp.arange(arr_sorted.shape[1])
    median = (arr_sorted[cp.floor_divide(n_elems - 1, 2), col_index] +
              arr_sorted[cp.floor_divide(n_elems, 2), col_index]) / 2

    median[nan_cols] = cp.nan
    return median
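A small illustration of the halfway-index trick used above, with numpy standing in for cupy; the masked count for the second column is an assumption for the demo, and the trick relies on masked entries having been sorted to the end of each column (as nans are):

import numpy as np  # same indexing arithmetic as the cupy code above

arr_sorted = np.sort(np.array([[1., 6.], [3., 2.], [5., 4.]]), axis=0)
n_elems = np.array([3, 2])  # pretend the second column had one masked value
col_index = np.arange(arr_sorted.shape[1])

# One expression covers both parities: for odd counts the two halfway
# indices coincide, for even counts they straddle the midpoint.
median = (arr_sorted[(n_elems - 1) // 2, col_index] +
          arr_sorted[n_elems // 2, col_index]) / 2
print(median)  # [3. 3.]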
Example #4
    def add_data(self, gdf):
        # Populate the column index mapping
        if not self.col_idx:
            for i, x in enumerate(gdf.columns.values):
                self.col_idx[str(x)] = i

        # List columns in cudf don't currently support chunked parquet
        # writing. Work around this by writing a single file for this
        # partition; this restriction can be removed once cudf supports
        # chunked parquet writing.
        if any(is_list_dtype(gdf[col].dtype) for col in gdf.columns):
            self._write_table(0, gdf, True)
            return

        # Generate `ind` array to map each row to an output file.
        # This approach is better optimized for shuffling than for
        # non-shuffling, but using a single code path is probably
        # worth the (possible) minor overhead.
        nrows = gdf.shape[0]
        typ = np.min_scalar_type(nrows * 2)
        if self.shuffle:
            ind = cp.random.choice(cp.arange(self.num_out_files, dtype=typ),
                                   nrows)
        else:
            ind = cp.arange(nrows, dtype=typ)
            cp.floor_divide(ind,
                            math.ceil(nrows / self.num_out_files),
                            out=ind)
        for x, group in enumerate(
                gdf.scatter_by_map(ind,
                                   map_size=self.num_out_files,
                                   keep_index=False)):
            self.num_samples[x] += len(group)
            if self.num_threads > 1:
                self.queue.put((x, group))
            else:
                self._write_table(x, group)

        # Wait for all queued writes to finish before returning
        # (so that we aren't holding the data in memory)
        if self.num_threads > 1:
            self.queue.join()
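A hedged sketch of the list-column check that triggers the single-file fallback above; it assumes a machine with cudf installed, and the import location of is_list_dtype (cudf.api.types in recent cudf) is an assumption, since the snippet above gets it from elsewhere in the package:

import cudf
from cudf.api.types import is_list_dtype  # import location assumed

gdf = cudf.DataFrame({"a": [1, 2], "b": [[1, 2], [3]]})
# Any list-typed column forces the single-file, unchunked write path.
print(any(is_list_dtype(gdf[col].dtype) for col in gdf.columns))  # True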
Example #5
def floor_divide(x1: Array, x2: Array, /) -> Array:
    """
    Array API compatible wrapper for :py:func:`np.floor_divide <numpy.floor_divide>`.

    See its docstring for more information.
    """
    if x1.dtype not in _numeric_dtypes or x2.dtype not in _numeric_dtypes:
        raise TypeError("Only numeric dtypes are allowed in floor_divide")
    # Call result type here just to raise on disallowed type combinations
    _result_type(x1.dtype, x2.dtype)
    x1, x2 = Array._normalize_two_args(x1, x2)
    return Array._new(np.floor_divide(x1._array, x2._array))
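Example usage of the wrapper through the Array API reference namespace; this assumes the experimental numpy.array_api module (present in NumPy 1.22 through 1.26, removed in 2.0):

import numpy.array_api as xp  # warns that the module is experimental

a = xp.asarray([7, 8, 9])
b = xp.asarray([2, 3, 4])
print(xp.floor_divide(a, b))  # elementwise floor division -> [3, 2, 2]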
Example #6
    def __rfloordiv__(self, other):
        return cupy.floor_divide(other, self)
Example #7
    def __floordiv__(self, other):
        return cupy.floor_divide(self, other)
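These two methods are what Python's // operator dispatches to, depending on which operand is the cupy array; a quick sketch (requires a GPU with cupy installed):

import cupy

a = cupy.asarray([7, 8, 9])
print(a // 2)   # calls a.__floordiv__(2)   -> cupy.floor_divide(a, 2)
print(20 // a)  # calls a.__rfloordiv__(20) -> cupy.floor_divide(20, a)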
Example #8
import random
import time

import cupy as cp

if __name__ == "__main__":
    random.seed(2222)
    tempwt = []
    tempval = []

    for i in range(3000000):
        tempwt.append(random.randint(1, 100))
        tempval.append(i + 1)

    wt = cp.array(tempwt)
    val = cp.array(tempval)

    capacity = 5000000

    start = time.time()
    cost = cp.floor_divide(val, wt)

    iVal = cp.column_stack((wt, val, cost))

    # sort items by cost (value-to-weight ratio), descending
    sorted_iVal = iVal[cp.argsort(-iVal[:, -1])]

    iVal_cpu = cp.asnumpy(sorted_iVal)

    #print(iVal)
    #print(sorted_iVal)
    #print("Finding Max")

    maxValue = getMaxValue(iVal_cpu, capacity)

    t = time.time() - start
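getMaxValue is defined elsewhere and not shown in this snippet. A purely illustrative greedy sketch of what it might compute, given rows of (weight, value, cost) sorted by cost descending; this is an assumption, not the author's implementation:

def getMaxValue(items, capacity):
    # Greedy heuristic: walk items in descending value-to-weight order,
    # taking each whole item that still fits in the remaining capacity.
    total = 0
    for wt, val, _cost in items:
        if wt <= capacity:
            capacity -= wt
            total += val
    return total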
Example #9
def _scale(a, n, m, copy=True):
    """Scale an array of unsigned/positive integers from `n` to `m` bits.

    Numbers can be represented exactly only if `m` is a multiple of `n`.

    Parameters
    ----------
    a : ndarray
        Input image array.
    n : int
        Number of bits currently used to encode the values in `a`.
    m : int
        Desired number of bits to encode the values in `out`.
    copy : bool, optional
        If True, allocates and returns a new array. Otherwise,
        modifies `a` in place.

    Returns
    -------
    out : array
        Output image array. Has the same kind as `a`.
    """
    kind = a.dtype.kind
    if n > m and a.max() < 2**m:
        mnew = math.ceil(m / 2) * 2
        if mnew > m:
            dtype = "int{}".format(mnew)
        else:
            dtype = "uint{}".format(mnew)
        n = math.ceil(n / 2) * 2
        warn("Downcasting {} to {} without scaling because max "
             "value {} fits in {}".format(a.dtype, dtype, a.max(), dtype),
             stacklevel=3)
        return a.astype(_dtype_bits(kind, m))
    elif n == m:
        return a.copy() if copy else a
    elif n > m:
        # downscale with precision loss
        if copy:
            b = cp.empty(a.shape, _dtype_bits(kind, m))
            cp.floor_divide(a,
                            2**(n - m),
                            out=b,
                            dtype=a.dtype,
                            casting='unsafe')
            return b
        else:
            a //= 2**(n - m)
            return a
    elif m % n == 0:
        # exact upscale to a multiple of `n` bits
        if copy:
            b = cp.empty(a.shape, _dtype_bits(kind, m))
            cp.multiply(a, (2**m - 1) // (2**n - 1), out=b, dtype=b.dtype)
            return b
        else:
            a = a.astype(_dtype_bits(kind, m, a.dtype.itemsize), copy=False)
            a *= (2**m - 1) // (2**n - 1)
            return a
    else:
        # upscale to a multiple of `n` bits,
        # then downscale with precision loss
        o = (m // n + 1) * n
        if copy:
            b = cp.empty(a.shape, _dtype_bits(kind, o))
            cp.multiply(a, (2**o - 1) // (2**n - 1), out=b, dtype=b.dtype)
            b //= 2**(o - m)
            return b
        else:
            a = a.astype(_dtype_bits(kind, o, a.dtype.itemsize), copy=False)
            a *= (2**o - 1) // (2**n - 1)
            a //= 2**(o - m)
            return a
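A worked instance of the exact-upscale branch (m % n == 0): going from 8 to 16 bits uses the multiplier (2**16 - 1) // (2**8 - 1) == 257, so 0 stays 0 and 255 maps exactly to 65535; numpy stands in for cupy here so the sketch runs without a GPU:

import numpy as np

a = np.array([0, 128, 255], dtype=np.uint8)
b = a.astype(np.uint16) * ((2**16 - 1) // (2**8 - 1))
print(b)  # [    0 32896 65535]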