def compute_distances(self, x1, x2=None):
    """
    Compute Euclidean distances between the rows of `x1` and the rows of
    `x2`, or between all pairs of rows of `x1` when `x2` is None.

    The method
    - extracts normalized continuous attributes and then uses `row_norms`
      and `_safe_sparse_dot` to compute the squared distance as
      x^2 - 2xy + y^2 (the trick from sklearn);
    - calls a function in Cython that recomputes the distances between
      pairs of rows that yielded nan;
    - calls a function in Cython that adds the contributions of discrete
      columns.

    Progress is reported through `self.callback`, split into five weighted
    steps by `StepwiseCallbacks`.

    Args:
        x1: first data matrix; only `x1.shape[0]` and the columns selected
            by `continuous_columns`/`discrete_columns` are used here
        x2: optional second data matrix; when None, distances are computed
            within `x1` and the lower triangle is mirrored to make the
            result symmetric

    Returns:
        the accumulated squared-distance matrix passed through
        `_interruptible_sqrt`
    """
    callbacks = StepwiseCallbacks(self.callback, [20, 10, 50, 5, 15])
    two_tables = x2 is not None

    if self.continuous.any():
        rows1, rows2 = self.continuous_columns(
            x1, x2, self.means, np.sqrt(2 * self.vars))

        # adapted from sklearn.metric.euclidean_distances
        sq_norms1 = row_norms(rows1, squared=True)[:, np.newaxis]
        if two_tables:
            sq_norms2 = row_norms(rows2, squared=True)[np.newaxis, :]
        else:
            sq_norms2 = sq_norms1.T
        distances = _safe_sparse_dot(
            rows1, rows2.T, dense_output=True, callback=callbacks.next())
        # In-place: distances = -2 * x.y + |x|^2 + |y|^2
        distances *= -2
        distances += sq_norms1
        distances += sq_norms2
        # Clip negative values (rounding) to zero; comparisons with nan
        # would warn, so silence them -- nans are fixed below
        with np.errstate(invalid="ignore"):
            np.maximum(distances, 0, out=distances)
        if not two_tables:
            # Distance of a row to itself is exactly zero
            diagonal_step = distances.shape[0] + 1
            distances.flat[::diagonal_step] = 0.0

        if self.normalize:
            fix_rows = _distance.fix_euclidean_rows_normalized
        else:
            fix_rows = _distance.fix_euclidean_rows
        fix_rows(distances, rows1, rows2, self.means, self.vars,
                 self.dist_missing2_cont, two_tables, callbacks.next())
    else:
        # No continuous columns: start from an all-zero matrix
        n_rows = x1.shape[0]
        n_cols = x2.shape[0] if two_tables else n_rows
        distances = np.zeros((n_rows, n_cols))

    if np.any(self.discrete):
        disc1, disc2 = self.discrete_columns(x1, x2)
        _distance.euclidean_rows_discrete(
            distances, disc1, disc2,
            self.dist_missing_disc, self.dist_missing2_disc,
            two_tables, callbacks.next())

    if not two_tables:
        _distance.lower_to_symmetric(distances, callbacks.next())
    return _interruptible_sqrt(distances, callback=callbacks.next())
def compute_distances(self, x1, x2=None):
    """
    Compute Euclidean distances between the rows of `x1` and the rows of
    `x2`, or between all pairs of rows of `x1` when `x2` is None.

    The method
    - extracts normalized continuous attributes and then uses `row_norms`
      and `safe_sparse_dot` to compute the squared distance as
      x^2 - 2xy + y^2 (the trick from sklearn);
    - calls a function in Cython that recomputes the distances between
      pairs of rows that yielded nan;
    - calls a function in Cython that adds the contributions of discrete
      columns.

    Args:
        x1: first data matrix; only `x1.shape[0]` and the columns selected
            by `continuous_columns`/`discrete_columns` are used here
        x2: optional second data matrix; when None, distances are computed
            within `x1` and the lower triangle is mirrored to make the
            result symmetric

    Returns:
        element-wise square root (`np.sqrt`) of the accumulated
        squared-distance matrix
    """
    two_tables = x2 is not None

    if self.continuous.any():
        rows1, rows2 = self.continuous_columns(
            x1, x2, self.means, np.sqrt(2 * self.vars))

        # adapted from sklearn.metric.euclidean_distances
        sq_norms1 = row_norms(rows1, squared=True)[:, np.newaxis]
        if two_tables:
            sq_norms2 = row_norms(rows2, squared=True)[np.newaxis, :]
        else:
            sq_norms2 = sq_norms1.T
        distances = safe_sparse_dot(rows1, rows2.T, dense_output=True)
        # In-place: distances = -2 * x.y + |x|^2 + |y|^2
        distances *= -2
        distances += sq_norms1
        distances += sq_norms2
        # Clip negative values (rounding) to zero; comparisons with nan
        # would warn, so silence them -- nans are fixed below
        with np.errstate(invalid="ignore"):
            np.maximum(distances, 0, out=distances)
        if not two_tables:
            # Distance of a row to itself is exactly zero
            diagonal_step = distances.shape[0] + 1
            distances.flat[::diagonal_step] = 0.0

        if self.normalize:
            fix_rows = _distance.fix_euclidean_rows_normalized
        else:
            fix_rows = _distance.fix_euclidean_rows
        fix_rows(distances, rows1, rows2, self.means, self.vars,
                 self.dist_missing2_cont, two_tables)
    else:
        # No continuous columns: start from an all-zero matrix
        n_rows = x1.shape[0]
        n_cols = x2.shape[0] if two_tables else n_rows
        distances = np.zeros((n_rows, n_cols))

    if np.any(self.discrete):
        disc1, disc2 = self.discrete_columns(x1, x2)
        _distance.euclidean_rows_discrete(
            distances, disc1, disc2,
            self.dist_missing_disc, self.dist_missing2_disc,
            two_tables)

    if not two_tables:
        _distance.lower_to_symmetric(distances)
    return np.sqrt(distances)
def compute_distances(self, x1, x2=None):
    """
    Compute Manhattan distances between the rows of `x1` and the rows of
    `x2`, or between all pairs of rows of `x1` when `x2` is None.

    The method
    - extracts normalized continuous attributes and computes distances
      ignoring the possibility of nans
    - recomputes the distances between pairs of rows that yielded nans
    - adds the contributions of discrete columns using the same function as
      the Euclidean distance

    Progress is reported through `self.callback`, split into four weighted
    steps by `StepwiseCallbacks`.

    Args:
        x1: first data matrix; only `x1.shape[0]` and the columns selected
            by `continuous_columns`/`discrete_columns` are used here
        x2: optional second data matrix (defaults to None, matching the
            Euclidean `compute_distances` signature); when None, distances
            are computed within `x1` and the lower triangle is mirrored
            to make the result symmetric

    Returns:
        the matrix of accumulated distances
    """
    callbacks = StepwiseCallbacks(self.callback, [5, 5, 60, 30])
    two_tables = x2 is not None

    if self.continuous.any():
        data1, data2 = self.continuous_columns(
            x1, x2, self.medians, 2 * self.mads)
        distances = _distance.manhattan_rows_cont(
            data1, data2, two_tables, callbacks.next())
        # Repair the entries that came out as nan
        if self.normalize:
            _distance.fix_manhattan_rows_normalized(
                distances, data1, data2, two_tables, callbacks.next())
        else:
            _distance.fix_manhattan_rows(
                distances, data1, data2, self.medians, self.mads,
                self.dist_missing2_cont, two_tables, callbacks.next())
    else:
        # No continuous columns: start from an all-zero matrix
        n_rows = x1.shape[0]
        n_cols = x2.shape[0] if two_tables else n_rows
        distances = np.zeros((n_rows, n_cols))

    if np.any(self.discrete):
        data1, data2 = self.discrete_columns(x1, x2)
        # For discrete attributes, Euclidean is same as Manhattan
        _distance.euclidean_rows_discrete(
            distances, data1, data2,
            self.dist_missing_disc, self.dist_missing2_disc,
            two_tables, callbacks.next())

    if not two_tables:
        _distance.lower_to_symmetric(distances, callbacks.next())
    return distances
def compute_distances(self, x1, x2=None):
    """
    Compute Manhattan distances between the rows of `x1` and the rows of
    `x2`, or between all pairs of rows of `x1` when `x2` is None.

    The method
    - extracts normalized continuous attributes and computes distances
      ignoring the possibility of nans
    - recomputes the distances between pairs of rows that yielded nans
    - adds the contributions of discrete columns using the same function as
      the Euclidean distance

    Args:
        x1: first data matrix; only `x1.shape[0]` and the columns selected
            by `continuous_columns`/`discrete_columns` are used here
        x2: optional second data matrix (defaults to None, matching the
            Euclidean `compute_distances` signature); when None, distances
            are computed within `x1` and the lower triangle is mirrored
            to make the result symmetric

    Returns:
        the matrix of accumulated distances
    """
    two_tables = x2 is not None

    if self.continuous.any():
        data1, data2 = self.continuous_columns(
            x1, x2, self.medians, 2 * self.mads)
        distances = _distance.manhattan_rows_cont(data1, data2, two_tables)
        # Repair the entries that came out as nan
        if self.normalize:
            _distance.fix_manhattan_rows_normalized(
                distances, data1, data2, two_tables)
        else:
            _distance.fix_manhattan_rows(
                distances, data1, data2, self.medians, self.mads,
                self.dist_missing2_cont, two_tables)
    else:
        # No continuous columns: start from an all-zero matrix
        n_rows = x1.shape[0]
        n_cols = x2.shape[0] if two_tables else n_rows
        distances = np.zeros((n_rows, n_cols))

    if np.any(self.discrete):
        data1, data2 = self.discrete_columns(x1, x2)
        # For discrete attributes, Euclidean is same as Manhattan
        _distance.euclidean_rows_discrete(
            distances, data1, data2,
            self.dist_missing_disc, self.dist_missing2_disc,
            two_tables)

    if not two_tables:
        _distance.lower_to_symmetric(distances)
    return distances