def _fit_eig(self, x):
    scatter_matrix = _scatter_matrix(x, self.arity)
    cov_matrix = _estimate_covariance(scatter_matrix, x.shape[0])

    if self.n_components:
        shape1 = self.n_components
    else:
        shape1 = x.shape[1]

    n_blocks = int(ceil(shape1 / x._reg_shape[1]))
    val_blocks = Array._get_out_blocks((1, n_blocks))
    vec_blocks = Array._get_out_blocks((n_blocks, x._n_blocks[1]))

    _decompose(cov_matrix, self.n_components, x._reg_shape[1], val_blocks,
               vec_blocks)

    bshape = (x._reg_shape[1], x._reg_shape[1])
    self.components_ = Array(vec_blocks, bshape, bshape,
                             (shape1, x.shape[1]), False)
    self.explained_variance_ = Array(val_blocks, bshape, bshape,
                                     (1, shape1), False)
    return self
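# A minimal single-node NumPy sketch of the eigendecomposition path driven by
# _fit_eig above (an illustration of the intent, not the distributed
# implementation; the covariance normalization is assumed to be the usual
# sample estimate):
import numpy as np

def _eig_pca_sketch(x, n_components=None):
    # Center the data and estimate the covariance from the scatter matrix.
    centered = x - x.mean(axis=0)
    cov = centered.T @ centered / (x.shape[0] - 1)
    # eigh returns eigenvalues in ascending order; reverse them for PCA.
    values, vectors = np.linalg.eigh(cov)
    order = np.argsort(values)[::-1][:n_components]
    return vectors[:, order].T, values[order]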
def _sort_v(v, sorting):
    v_blocks = [[] for _ in range(v._n_blocks[1])]
    hbsize = v._reg_shape[1]

    for i, vblock in enumerate(v._iterator("columns")):
        out_blocks = [[] for _ in range(v._n_blocks[1])]
        _sort_v_block(vblock._blocks, i, hbsize, sorting, out_blocks)

        for j in range(len(out_blocks)):
            v_blocks[j].append(out_blocks[j])

    vbsize = v._reg_shape[0]
    final_blocks = Array._get_out_blocks(v._n_blocks)

    for i, v_block in enumerate(v_blocks):
        new_block = [object() for _ in range(v._n_blocks[0])]
        _merge_svd_block(v_block, i, hbsize, vbsize, sorting, new_block)

        for j in range(len(new_block)):
            final_blocks[j][i] = new_block[j]

        for elem in v_block:
            compss_delete_object(elem)

    return Array(final_blocks, v._top_left_shape, v._reg_shape, v.shape,
                 v._sparse)
def _compute_u_sorted(a, sorting):
    u_blocks = [[] for _ in range(a._n_blocks[1])]
    hbsize = a._reg_shape[1]

    for i, vblock in enumerate(a._iterator("columns")):
        u_block = [object() for _ in range(a._n_blocks[1])]
        _compute_u_block_sorted(vblock._blocks, i, hbsize, sorting, u_block)

        for j in range(len(u_block)):
            u_blocks[j].append(u_block[j])

    vbsize = a._reg_shape[0]
    final_blocks = Array._get_out_blocks(a._n_blocks)

    for i, u_block in enumerate(u_blocks):
        new_block = [object() for _ in range(a._n_blocks[0])]
        _merge_svd_block(u_block, i, hbsize, vbsize, sorting, new_block)

        for j in range(len(new_block)):
            final_blocks[j][i] = new_block[j]

        for elem in u_block:
            compss_delete_object(elem)

    return Array(final_blocks, a._top_left_shape, a._reg_shape, a.shape,
                 a._sparse)
def _update_u(z, u_blocks, w_blocks, out_blocks):
    u_np = np.squeeze(Array._merge_blocks(u_blocks))
    w_np = np.squeeze(Array._merge_blocks(w_blocks))

    # scaled dual update of ADMM: u = u + w - z
    u_new = u_np + w_np - z
    n_cols = u_blocks[0][0].shape[1]

    for i in range(len(out_blocks)):
        out_blocks[i] = u_new[i * n_cols:(i + 1) * n_cols].reshape(1, -1)
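# _update_u is the dual step of the scaled-form ADMM loop. One full iteration
# has the following generic shape (a sketch; prox_f and prox_g stand for the
# proximal operators of the two objective terms and are placeholders, not
# names from this codebase):
def _admm_iteration_sketch(w, z, u, prox_f, prox_g):
    w = prox_f(z - u)  # w-step: local minimization given z and u
    z = prox_g(w + u)  # z-step: e.g. soft thresholding for an L1 term
    u = u + w - z      # u-step: scaled dual update, as in _update_u above
    return w, z, u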
def _partial_variability_params(x, y, mean_x, mean_y):
    x, y = Array._merge_blocks(x), Array._merge_blocks(y)

    # the 0 index is used because only 1D LR is supported
    normalized_x = x[:, 0] - mean_x
    normalized_y = y - mean_y
    normalized_xy_dot = np.dot(normalized_x, normalized_y)
    normalized_xx_dot = np.dot(normalized_x, normalized_x)
    return normalized_xy_dot, normalized_xx_dot
def _score(x_list, y_list, clf):
    x = Array._merge_blocks(x_list)
    y = Array._merge_blocks(y_list)

    y_pred = clf.predict(x)
    equal = np.equal(y_pred, y.ravel())

    return np.sum(equal), x.shape[0]
def kneighbors(self, x, n_neighbors=None, return_distance=True):
    """ Finds the K nearest neighbors of the input samples. Returns indices
    and distances to the neighbors of each sample.

    Parameters
    ----------
    x : ds-array, shape=(n_samples, n_features)
        The query samples.
    n_neighbors: int, optional (default=None)
        Number of neighbors to get. If None, the value passed in the
        constructor is employed.
    return_distance : boolean, optional (default=True)
        Whether to return distances.

    Returns
    -------
    dist : ds-array, shape=(n_samples, n_neighbors)
        Array representing the lengths to points, only present if
        return_distance=True.
    ind : ds-array, shape=(n_samples, n_neighbors)
        Indices of the nearest samples in the fitted data.
    """
    validation.check_is_fitted(self, '_fit_data')

    if n_neighbors is None:
        n_neighbors = self.n_neighbors

    distances = []
    indices = []

    for q_row in x._iterator(axis=0):
        queries = []

        for row in self._fit_data._iterator(axis=0):
            queries.append(_get_neighbors(row._blocks, q_row._blocks,
                                          n_neighbors))

        dist, ind = _merge_queries(*queries)
        distances.append([dist])
        indices.append([ind])

    ind_arr = Array(blocks=indices,
                    top_left_shape=(x._top_left_shape[0], n_neighbors),
                    reg_shape=(x._reg_shape[0], n_neighbors),
                    shape=(x.shape[0], n_neighbors), sparse=False)

    if return_distance:
        dst_arr = Array(blocks=distances,
                        top_left_shape=(x._top_left_shape[0], n_neighbors),
                        reg_shape=(x._reg_shape[0], n_neighbors),
                        shape=(x.shape[0], n_neighbors), sparse=False)
        return dst_arr, ind_arr

    return ind_arr
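# A hedged usage sketch for kneighbors, assuming this method belongs to
# dislib's NearestNeighbors-style estimator and that a PyCOMPSs runtime is
# available (the module path is an assumption based on the surrounding code):
import numpy as np
import dislib as ds
from dislib.neighbors import NearestNeighbors

def _kneighbors_example():
    data = ds.array(np.random.rand(100, 4), block_size=(25, 4))
    nn = NearestNeighbors(n_neighbors=3)
    nn.fit(data)
    dist, ind = nn.kneighbors(data)  # ds-arrays of shape (100, 3)
    return dist.collect(), ind.collect()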
def _partial_estimate_parameters(x, resp):
    x = Array._merge_blocks(x)
    resp = Array._merge_blocks(resp)

    partial_nk = resp.sum(axis=0)
    if issparse(x):
        partial_means = x.T.dot(resp).T
    else:
        partial_means = np.matmul(resp.T, x)

    return x.shape[0], partial_nk, partial_means
def _partial_covar_diag(resp, x, means):
    x = Array._merge_blocks(x)
    resp = Array._merge_blocks(resp)

    if issparse(x):
        avg_resp_sample_2 = x.multiply(x).T.dot(resp).T
        avg_sample_means = means * x.T.dot(resp).T
    else:
        avg_resp_sample_2 = np.dot(resp.T, x * x)
        avg_sample_means = means * np.dot(resp.T, x)

    return avg_resp_sample_2 - 2 * avg_sample_means
def _choose_and_assign_rows_xy(x, y, subsamples_sizes, subsamples, seed):
    np.random.seed(seed)
    x = Array._merge_blocks(x)
    y = Array._merge_blocks(y)
    indices = np.random.permutation(x.shape[0])

    start = 0
    for i, size in enumerate(subsamples_sizes):
        end = start + size
        subsamples[i] = (x[indices[start:end]], y[indices[start:end]])
        start = end
def _generate_bins(mn, mx, dimensions, n_regions):
    bins = []
    mn_arr = Array._merge_blocks(mn)[0]
    mx_arr = Array._merge_blocks(mx)[0]

    # create bins for the different regions in the grid in every dimension
    for dim in dimensions:
        bin_ = np.linspace(mn_arr[dim], mx_arr[dim], n_regions + 1)
        bins.append(bin_)

    return bins
def _get_neighbors(blocks, q_blocks, n_neighbors):
    samples = Array._merge_blocks(blocks)
    q_samples = Array._merge_blocks(q_blocks)

    n_samples = samples.shape[0]

    knn = SKNeighbors(n_neighbors=n_neighbors)
    knn.fit(X=samples)
    dist, ind = knn.kneighbors(X=q_samples)

    return dist, ind, n_samples
def _u_step(self):
    u_blocks = []

    for u_hblock, w_hblock in zip(self._u._iterator(),
                                  self._w._iterator()):
        out_blocks = [object() for _ in range(self._u._n_blocks[1])]
        _update_u(self._z, u_hblock._blocks, w_hblock._blocks, out_blocks)
        u_blocks.append(out_blocks)

    r_shape = self._u._reg_shape
    shape = self._u.shape
    self._u = Array(u_blocks, r_shape, r_shape, shape, self._u._sparse)
def _transform(blocks, m_blocks, v_blocks, out_blocks):
    x = Array._merge_blocks(blocks)
    mean = Array._merge_blocks(m_blocks)
    var = Array._merge_blocks(v_blocks)

    scaled_x = (x - mean) / np.sqrt(var)

    constructor_func = np.array if not issparse(x) else csr_matrix
    start, end = 0, 0

    for i, block in enumerate(blocks[0]):
        end += block.shape[1]
        out_blocks[i] = constructor_func(scaled_x[:, start:end])
        # advance the column window to the next block
        start = end
def _merge(x_list, y_list, id_list):
    samples = Array._merge_blocks(x_list)
    labels = Array._merge_blocks(y_list)
    sample_ids = Array._merge_blocks(id_list)

    # remove duplicate samples (by id) while preserving their original order
    _, uniques = np.unique(sample_ids, return_index=True)
    indices = np.argsort(uniques)
    uniques = uniques[indices]

    sample_ids = sample_ids[uniques]
    samples = samples[uniques]
    labels = labels[uniques]

    return samples, labels, sample_ids
def _compute_primal_res(self, z_old):
    blocks = []

    for w_hblock in self._w._iterator():
        out_blocks = [object() for _ in range(self._w._n_blocks[1])]
        _substract(w_hblock._blocks, z_old, out_blocks)
        blocks.append(out_blocks)

    prires = Array(blocks, self._w._reg_shape, self._w._reg_shape,
                   self._w.shape, self._w._sparse)

    # this is a ds-array with a single element; we return only the block
    return (prires.norm(axis=1) ** 2).sum().sqrt()
def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset:
    """Creates a RfDataset object from samples x and labels y.

    This function creates a dislib.classification.rf._data.RfDataset by
    saving x and y in files.

    Parameters
    ----------
    x : ds-array, shape = (n_samples, n_features)
        The training input samples.
    y : ds-array, shape = (n_samples,) or (n_samples, n_outputs)
        The target values.

    Returns
    -------
    rf_dataset : dislib.classification.rf._data.RfDataset
    """
    n_samples = x.shape[0]
    n_features = x.shape[1]

    samples_file = tempfile.NamedTemporaryFile(mode='wb',
                                               prefix='tmp_rf_samples_',
                                               delete=False)
    samples_path = samples_file.name
    samples_file.close()
    _allocate_samples_file(samples_path, n_samples, n_features)

    start_idx = 0
    row_blocks_iterator = x._iterator(axis=0)
    top_row = next(row_blocks_iterator)
    _fill_samples_file(samples_path, top_row._blocks, start_idx)
    start_idx += x._top_left_shape[0]
    for x_row in row_blocks_iterator:
        _fill_samples_file(samples_path, x_row._blocks, start_idx)
        start_idx += x._reg_shape[0]

    labels_file = tempfile.NamedTemporaryFile(mode='w',
                                              prefix='tmp_rf_labels_',
                                              delete=False)
    labels_path = labels_file.name
    labels_file.close()
    for y_row in y._iterator(axis=0):
        _fill_labels_file(labels_path, y_row._blocks)

    rf_dataset = RfDataset(samples_path, labels_path)
    rf_dataset.n_samples = n_samples
    rf_dataset.n_features = n_features
    return rf_dataset
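# Hedged usage sketch for transform_to_rf_dataset (data and block sizes are
# illustrative; executing the underlying tasks requires a PyCOMPSs runtime):
import numpy as np
import dislib as ds

def _rf_dataset_example():
    x = ds.array(np.random.rand(8, 3), block_size=(4, 3))
    y = ds.array(np.random.randint(0, 2, size=(8, 1)).astype(float),
                 block_size=(4, 1))
    dataset = transform_to_rf_dataset(x, y)
    return dataset.n_samples, dataset.n_features  # (8, 3)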
def _compute_var(blocks, m_blocks):
    x = Array._merge_blocks(blocks)
    mean = Array._merge_blocks(m_blocks)

    sparse = issparse(x)

    if sparse:
        x = x.toarray()
        mean = mean.toarray()

    var = np.mean(np.array(x - mean) ** 2, axis=0)

    if sparse:
        return csr_matrix(var)
    else:
        return var
def _soft_thresholding(w_blocks, u_blocks, k):
    w_mean = np.squeeze(Array._merge_blocks(w_blocks))
    u_mean = np.squeeze(Array._merge_blocks(u_blocks))

    v = w_mean + u_mean
    z = np.zeros(v.shape)

    for i in range(z.shape[0]):
        if np.abs(v[i]) <= k:
            z[i] = 0
        elif v[i] > k:
            z[i] = v[i] - k
        else:
            z[i] = v[i] + k

    return z
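# The loop above is the proximal operator of the L1 norm (soft thresholding):
# z_i = sign(v_i) * max(|v_i| - k, 0). An equivalent vectorized sketch:
import numpy as np

def _soft_threshold_sketch(v, k):
    # Shrink every coefficient toward zero by k and zero out the small ones.
    return np.sign(v) * np.maximum(np.abs(v) - k, 0)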
def _partial_covar_full(resp, x, means):
    x = Array._merge_blocks(x)
    resp = Array._merge_blocks(resp)

    n_components, n_features = means.shape
    covariances = np.empty((n_components, n_features, n_features))

    for k in range(n_components):
        if issparse(x):
            diff = (row - means[k] for row in x)
            partial_covs = (np.dot(r * d.T, d)
                            for d, r in zip(diff, resp[:, k]))
            covariances[k] = sum(partial_covs)
        else:
            diff = x - means[k]
            covariances[k] = np.dot(resp[:, k] * diff.T, diff)

    return covariances
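# The dense branch computes the responsibility-weighted scatter
# sum_i resp[i, k] * (x_i - means[k]) (x_i - means[k])^T with a single matmul.
# A small self-check of that equivalence on random data (a sketch):
import numpy as np

def _covar_full_check():
    rng = np.random.default_rng(0)
    x, resp, mean = rng.random((50, 3)), rng.random(50), rng.random(3)
    diff = x - mean
    fast = np.dot(resp * diff.T, diff)
    slow = sum(r * np.outer(d, d) for r, d in zip(resp, diff))
    return np.allclose(fast, slow)  # True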
def transform(self, x):
    """ Standardize data.

    Parameters
    ----------
    x : ds-array, shape=(n_samples, n_features)

    Returns
    -------
    x_new : ds-array, shape=(n_samples, n_features)
        Scaled data.
    """
    if self.mean_ is None or self.var_ is None:
        raise Exception("Model has not been initialized.")

    n_blocks = x._n_blocks[1]
    blocks = []
    m_blocks = self.mean_._blocks
    v_blocks = self.var_._blocks

    for row in x._iterator(axis=0):
        out_blocks = [object() for _ in range(n_blocks)]
        _transform(row._blocks, m_blocks, v_blocks, out_blocks)
        blocks.append(out_blocks)

    return Array(blocks, top_left_shape=x._top_left_shape,
                 reg_shape=x._reg_shape, shape=x.shape, sparse=x._sparse)
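# Hedged usage sketch for transform, assuming this method belongs to dislib's
# StandardScaler-style estimator (the module path is an assumption):
import numpy as np
import dislib as ds
from dislib.preprocessing import StandardScaler

def _scaler_example():
    x = ds.array(np.random.rand(20, 4), block_size=(5, 2))
    scaler = StandardScaler()
    scaler.fit(x)
    x_scaled = scaler.transform(x)  # zero mean, unit variance per feature
    return x_scaled.collect()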
def fit(self, x, y):
    """ Fits the model with training data.

    Parameters
    ----------
    x : ds-array, shape=(n_samples, n_features)
        Training samples.
    y : ds-array, shape=(n_samples, 1)
        Class labels of x.

    Returns
    -------
    self : ADMM
    """
    if not x._is_regular():
        x_reg = x.rechunk(x._reg_shape)
    else:
        x_reg = x

    self._init_model(x_reg)

    while not self.converged_ and self.n_iter_ < self.max_iter:
        self._step(x_reg, y)
        self.n_iter_ += 1

        if self.verbose:
            print("Iteration ", self.n_iter_)

    z_blocks = [object() for _ in range(x_reg._n_blocks[1])]
    _split_z(self._z, x._reg_shape[1], z_blocks)
    self.z_ = Array([z_blocks], (1, x._reg_shape[1]), (1, x._reg_shape[1]),
                    (1, x.shape[1]), False)

    return self
def _partial_covar_tied(x):
    x = Array._merge_blocks(x)

    if issparse(x):
        avg_sample_2 = x.T.dot(x)
    else:
        avg_sample_2 = np.dot(x.T, x)

    return avg_sample_2
def predict(self, x):
    """ Predict using the linear model.

    Parameters
    ----------
    x : ds-array, shape=(n_samples, 1)
        Samples to be predicted.

    Returns
    -------
    y : ds-array, shape=(n_samples, 1)
        Predicted values.
    """
    blocks = [list()]

    for r_block in x._iterator(axis='rows'):
        blocks[0].append(_predict(r_block._blocks, self.coef_,
                                  self.intercept_))

    return Array(blocks=blocks, top_left_shape=(x._top_left_shape[0], 1),
                 reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1),
                 sparse=x._sparse)
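# Hedged usage sketch for predict, assuming a 1-D LinearRegression-style
# estimator (only one feature is supported; see _partial_variability_params
# above). The module path is an assumption:
import numpy as np
import dislib as ds
from dislib.regression import LinearRegression

def _lr_example():
    x = ds.array(np.arange(10, dtype=float).reshape(-1, 1), block_size=(5, 1))
    y = ds.array((2 * np.arange(10, dtype=float) + 1).reshape(-1, 1),
                 block_size=(5, 1))
    reg = LinearRegression()
    reg.fit(x, y)
    return reg.predict(x).collect()  # approximately 2 * x + 1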
def _rotate(coli_blocks, colj_blocks, j):
    if j is None:
        return

    coli = Array._merge_blocks(coli_blocks)
    colj = Array._merge_blocks(colj_blocks)

    n = coli.shape[1]
    coli_k = coli @ j[:n, :n] + colj @ j[n:, :n]
    colj_k = coli @ j[:n, n:] + colj @ j[n:, n:]

    block_size = coli_blocks[0][0].shape[0]

    for i in range(len(coli_blocks)):
        coli_blocks[i][0][:] = coli_k[i * block_size:(i + 1) * block_size][:]
        colj_blocks[i][0][:] = colj_k[i * block_size:(i + 1) * block_size][:]
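# _rotate applies a 2n x 2n rotation j to the column-block pair [coli colj]
# and writes the result back block by block. An equivalent single-shot NumPy
# form (a sketch):
import numpy as np

def _rotate_sketch(coli, colj, j):
    rotated = np.hstack((coli, colj)) @ j
    n = coli.shape[1]
    return rotated[:, :n], rotated[:, n:]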
def _subset_transform(blocks, u_blocks, c_blocks, reg_shape, out_blocks):
    data = Array._merge_blocks(blocks)
    mean = Array._merge_blocks(u_blocks)
    components = Array._merge_blocks(c_blocks)

    sparse = issparse(data)

    if sparse:
        data = data.toarray()
        mean = mean.toarray()

    # project the mean-centered data onto the principal components
    res = np.matmul(data - mean, components.T)

    if sparse:
        res = csr_matrix(res)

    for j in range(len(blocks[0])):
        out_blocks[j] = res[:, j * reg_shape:(j + 1) * reg_shape]
def _subset_scatter_matrix(blocks):
    data = Array._merge_blocks(blocks)

    if issparse(data):
        data = data.toarray()

    return np.dot(data.T, data)
def predict(self, x):
    """ Perform classification on samples.

    Parameters
    ----------
    x : ds-array, shape=(n_samples, n_features)
        Input samples.

    Returns
    -------
    y : ds-array, shape=(n_samples, 1)
        Class labels of x.
    """
    assert (self._clf is not None or self._svs is not None), \
        "Model has not been initialized. Call fit() first."

    y_list = []

    for row in x._iterator(axis=0):
        y_list.append([_predict(row._blocks, self._clf)])

    return Array(blocks=y_list, top_left_shape=(x._top_left_shape[0], 1),
                 reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1),
                 sparse=False)
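# Hedged usage sketch for predict on a CascadeSVM-style classifier (class and
# module names are assumptions based on the cascade helpers above; running it
# requires a PyCOMPSs runtime):
import numpy as np
import dislib as ds
from dislib.classification import CascadeSVM

def _csvm_example():
    x = ds.array(np.random.rand(16, 2), block_size=(4, 2))
    y = ds.array(np.random.randint(0, 2, size=(16, 1)), block_size=(4, 1))
    csvm = CascadeSVM()
    csvm.fit(x, y)
    return csvm.predict(x).collect()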
def _load_mdcrd(path, block_size, n_cols, n_blocks, bytes_per_snap,
                bytes_per_block):
    blocks = []

    file_size = os.stat(path).st_size - _CRD_LINE_SIZE

    with open(path, "rb") as fid:
        fid.read(_CRD_LINE_SIZE)  # skip header

        for _ in range(0, file_size, bytes_per_block):
            data = fid.read(bytes_per_block)
            out_blocks = [object() for _ in range(n_blocks)]
            _read_crd_bytes(data, block_size[1], n_cols, out_blocks)
            compss_delete_object(data)
            blocks.append(out_blocks)

    n_samples = int(file_size / bytes_per_snap)

    return Array(blocks, top_left_shape=block_size, reg_shape=block_size,
                 shape=(n_samples, n_cols), sparse=False)
def decision_function(self, x):
    """ Evaluates the decision function for the samples in x.

    Parameters
    ----------
    x : ds-array, shape=(n_samples, n_features)
        Input samples.

    Returns
    -------
    df : ds-array, shape=(n_samples, 1)
        The decision function of the samples.
    """
    assert (self._clf is not None or self._svs is not None), \
        "Model has not been initialized. Call fit() first."

    df = []

    for row in x._iterator(axis=0):
        df.append([_decision_function(row._blocks, self._clf)])

    return Array(blocks=df, top_left_shape=(x._top_left_shape[0], 1),
                 reg_shape=(x._reg_shape[0], 1), shape=(x.shape[0], 1),
                 sparse=False)