def __init__(self, arg1, shape=None, filename="sparse.spy",
             tablename="dok_matrix", dtype=None, copy=False,
             commit_freq=1.0):
    """Build a DOK-style sparse matrix whose entries live in a ``ddict``.

    Parameters
    ----------
    arg1 : tuple, sparse matrix or array-like
        One of: a shape tuple ``(M, N)`` accepted by ``isshape`` (creates
        an empty matrix); an existing sparse matrix (its entries are
        copied in); or anything ``np.asarray`` converts to a 2-D array.
    shape : optional
        Accepted for signature compatibility; this method does not read it.
    filename, tablename, commit_freq
        Forwarded unchanged to ``ddict.__init__``.
    dtype : optional
        Requested element type; resolved through ``getdtype`` with a
        ``float`` default.
    copy : bool
        When ``arg1`` is already a DOK matrix and ``copy`` is true it is
        duplicated with ``.copy()``; otherwise ``.todok()`` is used.

    Raises
    ------
    TypeError
        If ``arg1`` cannot be converted to an array, or the resulting
        array is not exactly 2-dimensional.
    """
    spmatrix.__init__(self)
    self.dtype = getdtype(dtype, default=float)

    # Entries to load once the backing store exists; stays None for the
    # empty "(M, N)" constructor.
    initial = None

    if isinstance(arg1, tuple) and isshape(arg1):
        # Shape ctor: unpack in two steps so a wrong-length tuple fails here.
        M, N = arg1
        self.shape = (M, N)
    elif isspmatrix(arg1):
        # Sparse ctor
        if isspmatrix_dok(arg1) and copy:
            arg1 = arg1.copy()
        else:
            arg1 = arg1.todok()
        if dtype is not None:
            arg1 = arg1.astype(dtype)
        self.shape = arg1.shape
        self.dtype = arg1.dtype
        initial = arg1
    else:
        # Dense ctor
        try:
            arg1 = np.asarray(arg1)
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit are no longer turned into TypeError.
            raise TypeError('invalid input format')
        # NOTE: the check is ``!= 2``, so 1-D input is rejected as well
        # even though the message says "<=2".
        if len(arg1.shape) != 2:
            raise TypeError('expected rank <=2 dense array or matrix')
        from scipy.sparse.coo import coo_matrix
        initial = coo_matrix(arg1, dtype=dtype).todok()
        self.shape = arg1.shape
        self.dtype = initial.dtype

    # Keys are (row, col) pairs serialised with helpers parameterised by
    # the matrix shape; values are always stored as REAL / float.
    ddict.__init__(self, filename, tablename=tablename,
                   commit_freq=commit_freq,
                   key_types=("UNSIGNED INTEGER",
                              int_tuple_ser(*self.shape),
                              int_tuple_unser(*self.shape)),
                   val_types=("REAL", float, float))

    if initial is not None:
        ddict.update(self, initial)
def test_getdtype(self):
    """Check getdtype's result for a ``default`` and for an array ``a``."""
    sample = np.array([1], dtype='int8')

    # No explicit dtype, only a default: expect the default back.
    from_default = sputils.getdtype(None, default=float)
    assert_equal(from_default, float)

    # No explicit dtype, only an array: expect that array's dtype.
    from_array = sputils.getdtype(None, a=sample)
    assert_equal(from_array, np.int8)
def test_getdtype(self):
    """Check getdtype's result for a ``default`` and for an array ``a``."""
    A = np.array([1], dtype='int8')

    # ``np.float`` was only an alias of the builtin ``float``; the alias
    # has been removed from NumPy, so compare against the builtin itself.
    assert_equal(sputils.getdtype(None, default=float), float)
    assert_equal(sputils.getdtype(None, a=A), np.int8)
def __init__(self, arg1, shape=None, dtype=None, copy=False):
    """Construct a compressed sparse matrix from one of several inputs.

    ``arg1`` may be another sparse matrix, a shape tuple ``(M, N)``, a
    2-tuple handed to ``coo_matrix``, a ``(data, indices, indptr)``
    3-tuple, or anything ``np.asarray`` accepts (treated as dense).

    Raises ``ValueError`` for an unrecognised tuple length, for input
    ``np.asarray`` rejects, or when no shape is given and none can be
    inferred from ``indptr`` / ``indices``.
    """
    _data_matrix.__init__(self)

    if isspmatrix(arg1):
        # Reuse the input's arrays unless an explicit copy of a
        # same-format matrix was requested.
        if arg1.format == self.format and copy:
            source = arg1.copy()
        else:
            source = arg1.asformat(self.format)
        self._set_self(source)

    elif isinstance(arg1, tuple):
        if isshape(arg1):
            # Shape tuple: build an empty matrix of that size.
            self._shape = check_shape(arg1)
            n_rows, n_cols = self.shape
            # Index dtype must be wide enough for both dimensions.
            index_type = get_index_dtype(maxval=max(n_rows, n_cols))
            self.data = np.zeros(0, getdtype(dtype, default=float))
            self.indices = np.zeros(0, index_type)
            major_len = self._swap((n_rows, n_cols))[0]
            self.indptr = np.zeros(major_len + 1, dtype=index_type)
        elif len(arg1) == 2:
            # (data, ij) format: go through COO, then this class.
            from scipy.sparse.coo import coo_matrix
            converted = self.__class__(coo_matrix(arg1, shape=shape))
            self._set_self(converted)
        elif len(arg1) == 3:
            # (data, indices, indptr) format.
            data, indices, indptr = arg1
            largest = max(shape) if shape is not None else None
            index_type = get_index_dtype((indices, indptr),
                                         maxval=largest,
                                         check_contents=True)
            self.indices = np.array(indices, copy=copy, dtype=index_type)
            self.indptr = np.array(indptr, copy=copy, dtype=index_type)
            self.data = np.array(data, copy=copy, dtype=dtype)
        else:
            raise ValueError(
                "unrecognized {}_matrix constructor usage".format(
                    self.format))

    else:
        # Anything else is treated as dense input.
        try:
            arg1 = np.asarray(arg1)
        except Exception:
            raise ValueError(
                "unrecognized {}_matrix constructor usage".format(
                    self.format))
        from scipy.sparse.coo import coo_matrix
        self._set_self(self.__class__(coo_matrix(arg1, dtype=dtype)))

    # An explicit shape always wins; otherwise infer one if still unset.
    if shape is not None:
        self._shape = check_shape(shape)
    elif self.shape is None:
        try:
            major_dim = len(self.indptr) - 1
            minor_dim = self.indices.max() + 1
        except Exception:
            raise ValueError('unable to infer matrix dimensions')
        else:
            self._shape = check_shape(self._swap((major_dim, minor_dim)))

    if dtype is not None:
        self.data = self.data.astype(dtype, copy=False)

    self.check_format(full_check=False)
def __init__(self, arg1, shape=None, dtype=None, copy=False):
    """Construct a compressed sparse matrix whose arrays are dask arrays.

    ``arg1`` may be another sparse matrix, a shape tuple ``(M, N)``, a
    2-tuple handed to ``coo_matrix``, a ``(data, indices, indptr)``
    3-tuple (each wrapped with ``da.from_array``), or anything
    ``np.asarray`` accepts (treated as dense).

    Raises ``ValueError`` for an unrecognised tuple length, for input
    ``np.asarray`` rejects, or when no shape is given and none can be
    inferred from ``indptr`` / ``indices``.
    """
    _data_matrix.__init__(self)

    if isspmatrix(arg1):
        if arg1.format == self.format and copy:
            arg1 = arg1.copy()
        else:
            arg1 = arg1.asformat(self.format)
        self._set_self(arg1)

    elif isinstance(arg1, tuple):
        if isshape(arg1):
            # It's a tuple of matrix dimensions (M, N):
            # create an empty matrix.
            self.shape = arg1   # spmatrix checks for errors here
            M, N = self.shape
            idx_dtype = get_index_dtype(maxval=self._swap((M, N))[1])
            self.data = da.zeros(0, getdtype(dtype, default=float))
            self.indices = da.zeros(0, idx_dtype)
            self.indptr = da.zeros(self._swap((M, N))[0] + 1,
                                   dtype=idx_dtype)
        elif len(arg1) == 2:
            # (data, ij) format
            from .coo import coo_matrix
            other = self.__class__(coo_matrix(arg1, shape=shape))
            self._set_self(other)
        elif len(arg1) == 3:
            # (data, indices, indptr) format
            (data, indices, indptr) = arg1
            # NOTE(review): this result is computed but never applied to
            # the arrays below -- confirm whether a cast was intended.
            idx_dtype = get_index_dtype((indices, indptr),
                                        check_contents=True)
            chunks = (10,)
            self.indices = da.from_array(indices, chunks=chunks)
            self.indptr = da.from_array(indptr, chunks=chunks)
            self.data = da.from_array(data, chunks=chunks)
        else:
            raise ValueError("unrecognized %s_matrix constructor usage" %
                             self.format)

    else:
        # must be dense
        try:
            arg1 = np.asarray(arg1)
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit propagate instead of becoming ValueError.
            raise ValueError("unrecognized %s_matrix constructor usage" %
                             self.format)
        from scipy.sparse.coo import coo_matrix
        self._set_self(self.__class__(coo_matrix(arg1, dtype=dtype)))

    # Read matrix dimensions given, if any
    if shape is not None:
        self.shape = shape   # spmatrix will check for errors
    elif self.shape is None:
        # shape not already set, try to infer dimensions
        try:
            major_dim = len(self.indptr) - 1
            minor_dim = self.indices.max() + 1
        except Exception:
            raise ValueError('unable to infer matrix dimensions')
        else:
            self.shape = self._swap((major_dim, minor_dim))

    if dtype is not None:
        # NOTE(review): np.asarray yields a NumPy array here, even when
        # self.data was built with ``da`` above -- confirm this is wanted.
        self.data = np.asarray(self.data, dtype=dtype)

    self.check_format(full_check=False)
def __init__(self, arg1, shape=None, dtype=None, copy=False):
    """Construct a COO matrix; ``(obj, (row, col))`` input is kept as dask.

    ``arg1`` may be a shape tuple ``(M, N)``, an ``(obj, (row, col))``
    tuple, another sparse matrix, or dense array-like input.

    Raises ``TypeError`` when a non-shape tuple cannot be unpacked as
    ``(obj, (row, col))`` or when dense input is not 2-D after
    ``np.atleast_2d``; raises ``ValueError`` when no ``shape`` is given
    and ``row`` or ``col`` is empty.
    """
    _data_matrix.__init__(self)
    # NOTE(review): a 2-tuple of chunk sizes is applied to what look like
    # 1-D index/data arrays below -- confirm against dask's chunk rules.
    self.chunks = (10, 1)

    def _as_dask(values):
        # Leave existing dask arrays alone; wrap everything else.
        if isinstance(values, da.core.Array):
            return values
        return da.from_array(values, chunks=self.chunks)

    if isinstance(arg1, tuple):
        if isshape(arg1):
            # Empty matrix of the requested size.
            n_rows, n_cols = arg1
            self.shape = (n_rows, n_cols)
            index_type = get_index_dtype(maxval=max(n_rows, n_cols))
            self.row = np.array([], dtype=index_type)
            self.col = np.array([], dtype=index_type)
            self.data = np.array([], getdtype(dtype, default=float))
            self.has_canonical_format = True
        else:
            try:
                obj, (row, col) = arg1
            except (TypeError, ValueError):
                raise TypeError('invalid input format')

            if shape is not None:
                # Unpack first so a shape of the wrong length fails here.
                n_rows, n_cols = shape
            else:
                if len(row) == 0 or len(col) == 0:
                    raise ValueError(
                        'cannot infer dimensions from zero sized index '
                        'arrays')
                n_rows = np.max(row) + 1
                n_cols = np.max(col) + 1
            self.shape = (n_rows, n_cols)

            # Computed as in the original; not used further in this branch.
            index_type = get_index_dtype(maxval=max(self.shape))

            self.row = _as_dask(row)
            self.col = _as_dask(col)
            self.data = _as_dask(obj)
            self.has_canonical_format = False

    elif isspmatrix(arg1):
        if isspmatrix_coo(arg1) and copy:
            self.row = arg1.row.copy()
            self.col = arg1.col.copy()
            self.data = arg1.data.copy()
            self.shape = arg1.shape
        else:
            as_coo = arg1.tocoo()
            self.row = as_coo.row
            self.col = as_coo.col
            self.data = as_coo.data
            self.shape = as_coo.shape
        self.has_canonical_format = False

    else:
        # Dense argument.
        dense = np.atleast_2d(np.asarray(arg1))
        if dense.ndim != 2:
            raise TypeError('expected dimension <= 2 array or matrix')
        self.shape = dense.shape
        self.row, self.col = dense.nonzero()
        self.data = dense[self.row, self.col]
        self.has_canonical_format = True

    if dtype is not None:
        self.data = self.data.astype(dtype)

    self._check()
def __init__(self, arg1, block_size=None, n_samples=None, n_history=None,
             shape=None, dtype=None, copy=False):
    """Construct a delta-encoded CSR matrix.

    Parameters
    ----------
    arg1
        One of: a sparse matrix (same format, or ``"csr"``); a shape
        tuple ``(M, N)`` for an empty matrix; a tuple
        ``(data, indices, indptr[, deltas])`` of raw arrays; a generator
        yielding rows; or dense input accepted by ``np.asarray``.
    block_size, n_samples
        Forwarded to ``_csr_to_delta_csr`` / ``_construct_from_iterable``.
    n_history
        Forwarded to ``_csr_to_delta_csr`` only.
    shape : tuple, optional
        Explicit shape for the raw-array and generator forms.
    dtype : optional
        Element type for the empty-matrix and generator forms.
    copy : bool
        Accepted for signature compatibility; this method does not read it.

    Raises
    ------
    NotImplementedError
        For sparse input in another format, or a 2-tuple (COO) input.
    ValueError
        For a tuple of unsupported length, or input ``np.asarray`` rejects.
    """
    _data_matrix.__init__(self)

    # case 1: instantiate from another sparse matrix
    if isspmatrix(arg1):
        if arg1.format == self.format:
            self._set_self(arg1)
        elif arg1.format == "csr":
            self._csr_to_delta_csr(arg1, block_size, n_samples, n_history)
        else:
            raise NotImplementedError(
                "Instantiation from sparse matrix not yet ready")

    # case 2: instantiate from some kind of raw data
    elif isinstance(arg1, tuple):
        if isshape(arg1):
            # Size specification (M, N) for an empty matrix; same layout
            # as an empty CSR matrix plus an empty deltas array.
            self.shape = arg1
            M, N = self.shape
            idx_dtype = get_index_dtype(maxval=max(M, N))
            self.data = np.zeros(0, getdtype(dtype, default='float'))
            self.indices = np.zeros(0, idx_dtype)
            self.indptr = np.zeros(self._swap((M, N))[0] + 1,
                                   dtype=idx_dtype)
            self.deltas = np.zeros(0, dtype=idx_dtype)
        elif len(arg1) == 2:
            # COO data format
            raise NotImplementedError(
                "Instantiation from COO format not yet ready")
        elif len(arg1) in (3, 4):
            # contents of the tuple are the raw data structures
            (self.data, self.indices, self.indptr) = arg1[:3]

            # use given shape or automatically infer one
            if shape is not None:
                self.shape = shape
            else:
                # Read from the attributes just assigned (the bare names
                # ``indptr`` / ``indices`` were never bound here).
                n_rows = len(self.indptr) - 1
                # Column count is one more than the largest column index;
                # with no stored entries there is nothing to infer from.
                if len(self.indices):
                    n_cols = np.max(self.indices) + 1
                else:
                    n_cols = 0
                self.shape = (n_rows, n_cols)

            # A fourth array (the deltas pointer) is expected in general
            # use. When omitted, let deltas[i] = i, i.e. every row refers
            # to itself and the matrix behaves as a plain CSR matrix.
            self.deltas = arg1[3] if len(arg1) > 3 else np.arange(
                self.shape[0])
        else:
            # Previously fell through with no attributes set.
            raise ValueError(
                "unrecognized delta_csr_matrix constructor usage")

    # case 3: instantiate from generator object
    elif isinstance(arg1, types.GeneratorType):
        self._construct_from_iterable(arg1,
                                      getdtype(dtype, default='float'),
                                      np.int32, block_size, n_samples,
                                      shape)

    # case 4: instantiate from dense matrix / array
    else:
        try:
            arg1 = np.asarray(arg1)
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit propagate instead of becoming ValueError.
            raise ValueError(
                "unrecognized delta_csr_matrix constructor usage")

        # generator expression iterating over the rows of arg1
        row_gen = (arg1[i, :] for i in range(arg1.shape[0]))
        self._construct_from_iterable(
            row_gen, arg1.dtype,
            get_index_dtype(maxval=max(arg1.shape)),
            block_size, n_samples, shape=arg1.shape)

    self.check_format(full_check=False)
def _construct_from_iterable(self, rows, dtype, idx_dtype, block_size=None,
                             n_samples=None, shape=None, data_size=10000):
    """Build a delta-encoded sparse matrix row by row from an iterable.

    Each incoming row is either stored directly or, when the similarity
    detector finds an earlier directly-stored row that matches, stored as
    the difference from that reference row.

    Parameters
    ----------
    rows : iterable
        Rows to add; each must expose ``.shape`` (dense arrays or sparse
        rows).
    dtype
        Element type of the ``data`` array (resolved via ``getdtype``).
    idx_dtype
        Integer type of ``indices``, ``indptr`` and ``deltas``.
    block_size : int, optional
        First argument of ``HashSimilarityDetector``; defaults to
        ``N // 10`` where ``N`` is the number of columns.
    n_samples : optional
        Second argument of ``HashSimilarityDetector``.
    shape : tuple, optional
        Expected ``(M, N)``; when omitted, ``N`` comes from the first row
        and ``M`` from the number of rows consumed.
    data_size : int
        Initial allocation length of ``data`` and ``indices``.

    Raises
    ------
    ValueError
        If rows differ in length, or the number of rows does not match
        ``shape``.
    """
    # Buffers are pre-allocated and trimmed to their used size at the end.
    self.data = np.zeros(data_size, getdtype(dtype, default='float'))
    self.indices = np.zeros(data_size, idx_dtype)
    self.indptr = np.zeros(10, dtype=idx_dtype)
    self.deltas = np.zeros(10, dtype=idx_dtype)

    M, N = shape if shape is not None else (None, None)

    # how many rows have been added so far
    num_rows_added = 0
    # how many entries of the data array are in use
    data_added = 0

    # A HashSimilarityDetector locates reference rows. Its default block
    # size depends on N, which may only be known once the first row has
    # been seen, so creation is deferred in that case.
    sd = None
    if block_size is not None or N is not None:
        if block_size is None:
            block_size = N // 10
        sd = HashSimilarityDetector(block_size, n_samples)

    for row in rows:
        # from the first row we can infer the number of columns
        if N is None:
            N = row.shape[-1]

        # all rows should be the same length
        if row.shape[-1] != N:
            raise ValueError(
                "Inconsistent row sizes passed (expected %d, got %d)" %
                (N, row.shape[-1]))

        if sd is None:
            # Deferred creation: N was unknown before the loop.
            block_size = N // 10
            sd = HashSimilarityDetector(block_size, n_samples)

        # self.deltas is written below and may be too small; grow it
        # first if necessary.
        if self.deltas.shape[0] < num_rows_added + 1:
            self.deltas.resize(self.deltas.shape[0] * 2)

        # Represent the row as a string and look up the best match.
        row_str = vec_to_str(row)
        ref = sd.get_best_match(row_str)

        if ref == -1:
            # No match: store the row directly and point it at itself.
            data_added += self._append_row_data(row, num_rows_added,
                                                data_added)
            self.deltas[num_rows_added] = num_rows_added
            # Directly stored rows become candidates for future reference.
            sd.add(row_str, num_rows_added)
        else:
            # Reconstruct the reference row as a dense vector.
            ref_row = np.zeros(N)
            start_idx = self.indptr[ref]
            end_idx = self.indptr[ref + 1]
            ref_row[self.indices[start_idx:end_idx]] = self.data[
                start_idx:end_idx]

            # Store only the difference from the reference row.
            row_as_array = (row.toarray().flatten()
                            if isspmatrix(row) else row)
            delta = row_as_array - ref_row
            data_added += self._append_row_data(delta, num_rows_added,
                                                data_added)

            # Point this row at its reference row.
            self.deltas[num_rows_added] = ref
            del ref_row

        num_rows_added += 1

    # Once all rows have been added the height can be inferred.
    if M is None:
        M = num_rows_added

    # sanity check that the number of rows added equals what we were told
    if num_rows_added != M:
        raise ValueError(
            "Number of rows provided not consistent with specified shape")

    self.shape = (M, N)

    # Trim the over-allocated buffers to the amount actually used.
    self.data = np.resize(self.data, data_added)
    self.indices = np.resize(self.indices, data_added)
    self.indptr.resize(num_rows_added + 1)
    self.deltas.resize(num_rows_added)