def right_shift(
    x1: BlockArray, x2: BlockArray, out: BlockArray = None, where=True, **kwargs
) -> BlockArray:
    return _instance().map_bop(
        op_name="right_shift",
        arr_1=x1,
        arr_2=x2,
        out=out,
        where=where,
        kwargs=numpy_utils.ufunc_kwargs(kwargs),
    )
def true_divide(
    x1: BlockArray, x2: BlockArray, out: BlockArray = None, where=True, **kwargs
) -> BlockArray:
    return _instance().map_bop(
        op_name="true_divide",
        arr_1=x1,
        arr_2=x2,
        out=out,
        where=where,
        kwargs=numpy_utils.ufunc_kwargs(kwargs),
    )
def exp(x: BlockArray, out: BlockArray = None, where=True, **kwargs) -> BlockArray:
    return _instance().map_uop(
        op_name="exp",
        arr=x,
        out=out,
        where=where,
        kwargs=numpy_utils.ufunc_kwargs(kwargs),
    )
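# Usage sketch for the ufunc wrappers above (illustrative, not from the source). Assumes the
# NumS numpy API is importable as `nps` and that these wrappers are exposed there.
import numpy as np
import nums.numpy as nps

a = nps.array(np.arange(1.0, 5.0))      # BlockArray: [1., 2., 3., 4.]
b = nps.array(np.full(4, 2.0))
print(nps.exp(a).get())                 # elementwise exp, materialized as a local ndarray
print(nps.true_divide(a, b).get())      # elementwise a / b -> [0.5, 1., 1.5, 2.]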
def arange(start=None, stop=None, step=1, dtype=np.int64) -> BlockArray:
    if stop is None:
        stop = start
        start = 0
    if step != 1:
        raise NotImplementedError("Only step size of 1 is currently supported.")
    shape = (stop - start,)
    app = _instance()
    block_shape = app.get_block_shape(shape, dtype)
    return app.arange(shape, block_shape, step, dtype)
def greater_equal(
    x1: BlockArray, x2: BlockArray, out: BlockArray = None, where=True, **kwargs
) -> BlockArray:
    return _instance().map_bop(
        op_name="greater_equal",
        arr_1=x1,
        arr_2=x2,
        out=out,
        where=where,
        kwargs=numpy_utils.ufunc_kwargs(kwargs),
    )
def _get_shapes(self, size=None, dtype=None):
    if dtype is None:
        dtype = _np.float64
    if size is None:
        size = ()
    if not isinstance(size, tuple):
        assert _array_utils.is_int(size)
        shape = (size,)
    else:
        shape = size
    block_shape = _instance().get_block_shape(shape, dtype)
    return shape, block_shape
def permutation(self, x):
    app = _instance()
    if _array_utils.is_int(x):
        shape = (x,)
        block_shape = app.compute_block_shape(shape=shape, dtype=_np.int64)
        return self.rs().permutation(shape[0], block_shape[0])
    else:
        assert isinstance(x, BlockArray)
        shape = x.shape
        block_shape = x.block_shape
        arr_perm = self.rs().permutation(shape[0], block_shape[0]).get()
        return x[arr_perm]
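# Usage sketch (illustrative, not from the source). The entry point is an assumption: the
# method above is presumed reachable via a module-level wrapper such as nps.random.permutation.
import numpy as np
import nums.numpy as nps

perm = nps.random.permutation(10)             # int input: a permutation of 0..9
x = nps.array(np.arange(20.0).reshape(10, 2))
x_shuffled = nps.random.permutation(x)        # BlockArray input: rows in permuted order
print(perm.get(), x_shuffled.shape)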
def update(self):
    for i in range(self.n_users):
        q = self.R[i, :] > 0
        V_j = self.V[:, q]
        Q = nps.matmul(V_j, V_j.T) + self.lambda_U * nps.identity(self.n_dims)
        QQ = _instance().inv(Q)
        Y = self.R[:, q][i, :]
        YY = nps.matmul(Y, V_j.T)
        self.U[:, i] = nps.matmul(QQ, YY)
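# What each loop iteration above computes, written as a plain-NumPy sketch for clarity
# (illustrative only; the sizes and lambda_U mirror the constructor defaults used elsewhere
# in this code, and the data is made up):
import numpy as np

n_dims, n_movies, lambda_U = 5, 10, 0.3
V = np.random.randn(n_dims, n_movies)
r_i = np.random.rand(n_movies) * (np.random.rand(n_movies) > 0.5)  # sparse ratings row for user i
q = r_i > 0                                                        # observed entries only
V_j = V[:, q]
# Ridge-regularized normal equations: u_i = (V_j V_j^T + lambda_U * I)^{-1} V_j r_i[q]
u_i = np.linalg.solve(V_j @ V_j.T + lambda_U * np.eye(n_dims), V_j @ r_i[q])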
def loadtxt(
    fname,
    dtype=float,
    comments="# ",
    delimiter=" ",
    converters=None,
    skiprows=0,
    usecols=None,
    unpack=False,
    ndmin=0,
    encoding="bytes",
    max_rows=None,
) -> BlockArray:
    app = _instance()
    num_rows = app.cm.num_cores_total()
    try:
        ba: BlockArray = app.loadtxt(
            fname,
            dtype=dtype,
            comments=comments,
            delimiter=delimiter,
            converters=converters,
            skiprows=skiprows,
            usecols=usecols,
            unpack=unpack,
            ndmin=ndmin,
            encoding=encoding,
            max_rows=max_rows,
            num_workers=num_rows,
        )
        shape = ba.shape
        block_shape = app.compute_block_shape(shape, dtype)
        return ba.reshape(block_shape=block_shape)
    except Exception as _:
        warnings.warn("Failed to load text data in parallel; using np.loadtxt locally.")
        np_arr = np.loadtxt(
            fname,
            dtype=dtype,
            comments=comments,
            delimiter=delimiter,
            converters=converters,
            skiprows=skiprows,
            usecols=usecols,
            unpack=unpack,
            ndmin=ndmin,
            encoding=encoding,
            max_rows=max_rows,
        )
        shape = np_arr.shape
        block_shape = app.compute_block_shape(shape, dtype)
        return app.array(np_arr, block_shape=block_shape)
def max(
    a: BlockArray, axis=None, out=None, keepdims=False, initial=None, where=None
) -> BlockArray:
    if initial is not None:
        raise NotImplementedError("'initial' is currently not supported.")
    if where is not None:
        raise NotImplementedError("'where' is currently not supported.")
    if out is not None:
        raise NotImplementedError("'out' is currently not supported.")
    return _instance().max(a, axis=axis, keepdims=keepdims)
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0):
    shape = (num,)
    dtype = np.float64 if dtype is None else dtype
    app = _instance()
    block_shape = app.get_block_shape(shape, dtype)
    return app.linspace(start, stop, shape, block_shape, endpoint, retstep, dtype, axis)
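# Usage sketch (illustrative, not from the source). Assumes `nps` is the NumS numpy API.
import nums.numpy as nps

grid = nps.linspace(0.0, 1.0, num=5)   # BlockArray: [0.0, 0.25, 0.5, 0.75, 1.0]
print(grid.get())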
def read_csv(filename, dtype=float, delimiter=",", has_header=False) -> BlockArray:
    """Read a csv text file.

    Args:
        filename: The filename of the csv.
        dtype: The data type of the csv file's entries.
        delimiter: The value delimiter for each row; usually a comma.
        has_header: Whether the csv file has a header. The header is discarded.

    Returns:
        A BlockArray instance.
    """
    return _instance().read_csv(filename, dtype, delimiter, has_header)
def read_csv(filename, dtype=float, delimiter=",", has_header=False) -> BlockArray:
    """Read a csv text file.

    :param filename: The filename of the csv.
    :param dtype: The data type of the csv file's entries.
    :param delimiter: The value delimiter for each row; usually a comma.
    :param has_header: Whether the csv file has a header. The header is discarded.
    :return: A BlockArray instance.
    """
    return _instance().read_csv(filename, dtype, delimiter, has_header)
def block_sgd(
    model: GLM,
    beta,
    X: BlockArray,
    y: BlockArray,
    tol: BlockArray,
    max_iter: int,
    lr: BlockArray,
):
    # SGD with batches equal to block shape along first axis.
    app = _instance()
    for _ in range(max_iter):
        for (start, stop) in X.grid.grid_slices[0]:
            X_batch, y_batch = X[start:stop], y[start:stop]
            mu = model.forward(X_batch, beta)
            g = model.gradient(X_batch, y_batch, mu, beta=beta)
            beta += -lr * g
            if app.max(app.abs(g)) <= tol:
                break
    return beta
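# The inner loop above treats each block along axis 0 as a mini-batch. A plain-NumPy analogue
# of that batching pattern (illustrative only; the batch size and data are assumptions):
import numpy as np

X_np, y_np = np.random.rand(100, 3), np.random.rand(100)
batch = 25  # plays the role of X.block_shape[0]
slices = [(s, min(s + batch, X_np.shape[0])) for s in range(0, X_np.shape[0], batch)]
for start, stop in slices:
    X_batch, y_batch = X_np[start:stop], y_np[start:stop]
    # ... compute the gradient on this batch and update beta, as block_sgd does ...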
def std(a: BlockArray, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    if out is not None:
        raise NotImplementedError("'out' is currently not supported.")
    return _instance().std(a, axis=axis, ddof=ddof, keepdims=keepdims, dtype=dtype)
def train(params: Dict, data: NumsDMatrix, *args, evals=(), **kwargs):
    X: BlockArray = data.X
    y: BlockArray = data.y
    assert len(X.shape) == 2
    assert X.shape[0] == y.shape[0] and X.block_shape[0] == y.block_shape[0]
    assert len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)

    app: ArrayApplication = _instance()
    cm: ComputeManager = app.cm
    cm.register("xgb_train", xgb_train_remote, {})

    # Start tracker
    num_workers = X.grid.grid_shape[0]
    env = _start_rabit_tracker(num_workers)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    evals_flat = []
    for eval_X, eval_y, eval_method in evals:
        if eval_X.shape != eval_X.block_shape:
            eval_X = eval_X.reshape(shape=eval_X.shape, block_shape=eval_X.shape)
        if eval_y.shape != eval_y.block_shape:
            eval_y = eval_y.reshape(shape=eval_y.shape, block_shape=eval_y.shape)
        eval_X_oid = eval_X.blocks.item().oid
        eval_y_oid = eval_y.blocks.item().oid
        evals_flat += [eval_X_oid, eval_y_oid, eval_method]

    X: BlockArray = X.reshape(block_shape=(X.block_shape[0], X.shape[1]))
    result: BlockArray = BlockArray(
        ArrayGrid(shape=(X.grid.grid_shape[0],), block_shape=(1,), dtype="dict"),
        cm,
    )
    for grid_entry in X.grid.get_entry_iterator():
        X_block: Block = X.blocks[grid_entry]
        i = grid_entry[0]
        if len(y.shape) == 1:
            y_block: Block = y.blocks[i]
        else:
            y_block: Block = y.blocks[i, 0]
        syskwargs = {"grid_entry": grid_entry, "grid_shape": X.grid.grid_shape}
        result.blocks[i].oid = cm.call(
            "xgb_train",
            X_block.oid,
            y_block.oid,
            rabit_args,
            params,
            args,
            kwargs,
            *evals_flat,
            syskwargs=syskwargs,
        )
    return result
def arange(start=None, stop=None, step=1, dtype=None) -> BlockArray:
    if start is None:
        raise TypeError("Missing required argument start")
    if stop is None:
        stop = start
        start = 0
    if step != 1:
        raise NotImplementedError("Only step size of 1 is currently supported.")
    if dtype is None:
        dtype = np.__getattribute__(str(np.result_type(start, stop)))
    shape = (int(np.ceil(stop - start)),)
    app = _instance()
    block_shape = app.get_block_shape(shape, dtype)
    return app.arange(start, shape, block_shape, step, dtype)
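# Usage sketch (illustrative, not from the source). Assumes `nps` is the NumS numpy API.
import nums.numpy as nps

idx = nps.arange(5)          # BlockArray: [0, 1, 2, 3, 4]
offset = nps.arange(2, 7)    # BlockArray: [2, 3, 4, 5, 6]; only step=1 is supported
print(idx.get(), offset.get())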
def __init__( self, penalty="none", alpha=1.0, l1_ratio=0.5, tol=0.0001, max_iter=100, solver="newton", lr=0.01, random_state=None, fit_intercept=True, normalize=False, ): if fit_intercept is False: raise NotImplementedError( "fit_incercept=False currently not supported.") if normalize is True: raise NotImplementedError( "normalize=True currently not supported.") self._app = _instance() if random_state is None: self.rs: NumsRandomState = self._app.random elif array_utils.is_int(random_state): self.rs: NumsRandomState = NumsRandomState(cm=self._app.cm, seed=random_state) elif isinstance(random_state, NumsRandomState): self.rs: NumsRandomState = random_state else: raise Exception("Unexpected type for random_state %s" % str(type(random_state))) self._penalty = None if penalty == "none" else penalty if self._penalty not in (None, "l1", "l2", "elasticnet"): raise NotImplementedError("%s penalty not supported" % self._penalty) # All sources use lambda as regularization term, and alpha l1/l2 ratio. self._lambda = alpha self._l1penalty = None self._l1penalty_vec = None self._l2penalty = None self._l2penalty_vec = None self._l2penalty_diag = None self.alpha = l1_ratio self._tol = tol self._max_iter = max_iter self._opt = solver self._lr = lr self._beta = None self._beta0 = None
def from_modin(df):
    # pylint: disable = import-outside-toplevel, protected-access, unidiomatic-typecheck
    try:
        from modin.pandas.dataframe import DataFrame
        from modin.engines.ray.pandas_on_ray.frame.data import PandasOnRayFrame
        from modin.engines.ray.pandas_on_ray.frame.partition import (
            PandasOnRayFramePartition,
        )
    except Exception as e:
        raise Exception(
            "Unable to import modin. Install modin with command 'pip install modin'"
        ) from e

    assert isinstance(df, DataFrame), "Unexpected dataframe type %s" % str(type(df))
    assert isinstance(df._query_compiler._modin_frame, PandasOnRayFrame), (
        "Unexpected dataframe type %s" % str(type(df._query_compiler._modin_frame))
    )
    frame: PandasOnRayFrame = df._query_compiler._modin_frame

    app: ArrayApplication = _instance()
    system = app.cm

    # Make sure the partitions are numeric.
    dtype = frame.dtypes[0]
    assert dtype in (
        float, np.float, np.float32, np.float64,
        int, np.int, np.int32, np.int64,
    )
    # Make sure dtypes are equal.
    for dt in frame.dtypes:
        if type(frame.dtypes.dtype) == np.dtype:
            continue
        assert dt == frame.dtypes
    dtype = np.__getattribute__(str(dtype))

    # Convert from Pandas to NumPy.
    pd_parts = frame._frame_mgr_cls.map_partitions(
        frame._partitions, lambda df: np.array(df)
    )
    grid_shape = len(frame._row_lengths), len(frame._column_widths)
    shape = (np.sum(frame._row_lengths), np.sum(frame._column_widths))
    block_shape = app.get_block_shape(shape, dtype)
    rows = []
    for i in range(grid_shape[0]):
        cols = []
        for j in range(grid_shape[1]):
            curr_block_shape = (frame._row_lengths[i], frame._column_widths[j])
            part: PandasOnRayFramePartition = pd_parts[(i, j)]
            part.drain_call_queue()
            ba: BlockArray = BlockArray.from_oid(
                part.oid, curr_block_shape, dtype, system
            )
            cols.append(ba)
        if grid_shape[1] == 1:
            row_ba: BlockArray = cols[0]
        else:
            row_ba: BlockArray = app.concatenate(
                cols, axis=1, axis_block_size=block_shape[1]
            )
        rows.append(row_ba)
    result = app.concatenate(rows, axis=0, axis_block_size=block_shape[0])
    return result
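# Usage sketch (illustrative, not from the source). Assumes modin is installed with the Ray
# engine and the NumS/Ray runtime is initialized; the frame contents here are made up.
import numpy as np
import modin.pandas as mpd

mdf = mpd.DataFrame(np.random.rand(100, 4))  # numeric-only frame, as required above
ba = from_modin(mdf)                         # BlockArray built from the existing partitions
print(ba.shape, ba.dtype)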
def sgd(
    model: GLM,
    beta,
    X: BlockArray,
    y: BlockArray,
    tol: BlockArray,
    max_iter: int,
    lr: BlockArray,
):
    # Classic SGD.
    app = _instance()
    for _ in range(max_iter):
        # Sample an entry uniformly at random.
        idx = model.rs.numpy().integers(X.shape[0])
        X_sample, y_sample = X[idx : idx + 1], y[idx : idx + 1]
        mu = model.forward(X_sample, beta)
        g = model.gradient(X_sample, y_sample, mu, beta=beta)
        beta += -lr * g
        if app.max(app.abs(g)) <= tol:
            # sklearn uses max instead of l2 norm.
            break
    return beta
def __init__(self, model: GLM, m=3, max_iter=100, thresh=1e-5, dtype=np.float64):
    self.app: ArrayApplication = _instance()
    self.model: GLM = model
    self.m = m
    self.max_iter = max_iter
    self.thresh = thresh
    self.dtype = dtype
    self.k = 0
    self.identity = None
    self.memory: Union[List[LBFGSMemory], List[None]] = [None] * m
    self.ls = BackTrackingLineSearch(model)
def median(a: BlockArray, axis=None, out=None, keepdims=False) -> BlockArray:
    """Compute the median of a BlockArray.

    Args:
        a: A BlockArray.

    Returns:
        The median value.
    """
    if axis is not None:
        raise NotImplementedError("'axis' is currently not supported.")
    if out is not None:
        raise NotImplementedError("'out' is currently not supported.")
    if keepdims:
        raise NotImplementedError("'keepdims' is currently not supported.")
    return _instance().median(a)
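# Usage sketch (illustrative, not from the source). Assumes `nps` is the NumS numpy API.
import nums.numpy as nps

v = nps.array([3.0, 1.0, 4.0, 1.0, 5.0])
print(nps.median(v).get())   # 3.0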
def gd(
    model: GLM,
    beta,
    X: BlockArray,
    y: BlockArray,
    tol: BlockArray,
    max_iter: int,
    lr: BlockArray,
):
    app = _instance()
    for _ in range(max_iter):
        mu = model.forward(X, beta)
        g = model.gradient(X, y, mu, beta=beta)
        beta += -lr * g
        if app.max(app.abs(g)) <= tol:
            break
    return beta
def __init__(self, train_size=0.75, lambda_U=0.3, lambda_V=0.3):
    self._app = _instance()
    self.n_dims = 5
    self.parameters = {}
    self.lambda_U = lambda_U
    self.lambda_V = lambda_V
    self.n_users = 10
    self.n_movies = 10
    self.train_set = nps.random.randn_sparse(10, 10)
    self.test_set = nps.random.randn_sparse(3, 3)
    self.R = self.train_set
    self.U = nps.zeros((self.n_dims, self.n_users), dtype=np.float64)
    self.V = nps.random.randn(self.n_dims, self.n_movies)
def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0):
    app = _instance()
    ba: BlockArray = linspace(start, stop, num, endpoint, dtype=None, axis=axis)
    ba = power(app.scalar(base), ba)
    if dtype is not None and dtype != ba.dtype:
        ba = ba.astype(dtype)
    return ba
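# Usage sketch (illustrative, not from the source). Assumes `nps` is the NumS numpy API.
import nums.numpy as nps

decades = nps.logspace(0, 3, num=4)   # base ** linspace(0, 3, 4) -> [1., 10., 100., 1000.]
print(decades.get())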
def __init__( self, penalty="none", C=1.0, tol=0.0001, max_iter=100, solver="newton-cg", lr=0.01, random_state=None, fit_intercept=True, normalize=False, ): if fit_intercept is False: raise NotImplementedError("fit_incercept=False currently not supported.") if normalize is True: raise NotImplementedError("normalize=True currently not supported.") self._app = _instance() if random_state is None: self.rs: NumsRandomState = self._app.random elif array_utils.is_int(random_state): self.rs: NumsRandomState = NumsRandomState( cm=self._app.cm, seed=random_state ) elif isinstance(random_state, NumsRandomState): self.rs: NumsRandomState = random_state else: raise Exception( "Unexpected type for random_state %s" % str(type(random_state)) ) self._penalty = None if penalty == "none" else penalty if not (self._penalty is None or self._penalty == "l2"): raise NotImplementedError("%s penalty not supported" % self._penalty) self._lambda = 1.0 / C self._lambda_vec = None self._tol = tol self._max_iter = max_iter self._opt = solver self._lr = lr self._beta = None self._beta0 = None
def top_k(
    a: BlockArray, k: int, largest=True, sorted=False
) -> Tuple[BlockArray, BlockArray]:
    """Find the `k` largest or smallest elements of a BlockArray.

    If there are multiple kth elements that are equal in value, then no guarantees are
    made as to which ones are included in the top k.

    Args:
        a: A BlockArray.
        k: Number of top elements to return.
        largest: Whether to return largest or smallest elements.

    Returns:
        A tuple containing two BlockArrays, (`values`, `indices`).
        values: Values of the top k elements, unsorted.
        indices: Indices of the top k elements, ordered by their corresponding values.
    """
    if sorted:
        # The result can be sorted when sorting is implemented.
        raise NotImplementedError("'sorted' is currently not supported.")
    return _instance().top_k(a, k, largest=largest)
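# Usage sketch (illustrative, not from the source). Assumes `nps` is the NumS numpy API.
import nums.numpy as nps

a = nps.array([5.0, 1.0, 9.0, 3.0, 7.0])
values, indices = nps.top_k(a, 3)        # three largest values and their indices, unsorted
print(values.get(), indices.get())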
def predict(self, X: BlockArray):
    app: ArrayApplication = _instance()
    sys: System = app.system
    sys.register("xgb_predict", xgb_predict_remote, {})
    model_block: Block = self.model.blocks[0]
    result: BlockArray = BlockArray(
        ArrayGrid(
            shape=(X.shape[0],),
            block_shape=(X.block_shape[0],),
            dtype=nps.int.__name__,
        ),
        sys,
    )
    for grid_entry in X.grid.get_entry_iterator():
        i = grid_entry[0]
        X_block: Block = X.blocks[grid_entry]
        r_block: Block = result.blocks[i]
        syskwargs = {"grid_entry": grid_entry, "grid_shape": X.grid.grid_shape}
        r_block.oid = sys.call(
            "xgb_predict", model_block.oid, X_block.oid, syskwargs=syskwargs
        )
    return result
def array(object, dtype=None, copy=True, order="K", ndmin=0, subok=False) -> BlockArray:
    if order is not None and order != "K":
        raise NotImplementedError("Only order='K' is supported.")
    if ndmin != 0:
        raise NotImplementedError("Only ndmin=0 is currently supported.")
    if subok:
        raise ValueError("subok must be False.")
    if isinstance(object, BlockArray):
        if copy:
            object = object.copy()
        if dtype is not None:
            if dtype is not object.dtype:
                object = object.astype(dtype)
        return object
    result = np.array(
        object, dtype=dtype, copy=copy, order=order, ndmin=ndmin, subok=subok
    )
    dtype = np.__getattribute__(str(result.dtype))
    shape = result.shape
    app = _instance()
    block_shape = app.compute_block_shape(shape, dtype)
    return app.array(result, block_shape)
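# Usage sketch (illustrative, not from the source). Assumes `nps` is the NumS numpy API.
import numpy as np
import nums.numpy as nps

ba = nps.array(np.ones((4, 2)), dtype=np.float32)  # local ndarray -> blocked BlockArray
same = nps.array(ba, copy=False)                   # BlockArray passthrough, no copy
print(ba.shape, ba.dtype)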
def qr(a, mode="reduced"):
    if mode != "reduced":
        raise NotImplementedError("Only reduced QR decomposition is supported.")
    return _instance().qr(a)
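# Usage sketch (illustrative, not from the source). The public namespace is an assumption;
# here the module-level function above is called directly, and the (Q, R) return convention
# is assumed to mirror np.linalg.qr with mode="reduced".
import numpy as np
import nums.numpy as nps

A = nps.array(np.random.rand(16, 4))
Q, R = qr(A)                           # reduced QR: Q is (16, 4), R is (4, 4)
print(Q.shape, R.shape)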