def test_nan_euclidean_distances_infinite_values(X, Y):
    """nan_euclidean_distances must reject inputs containing infinity."""
    expected_message = (
        "Input contains infinity or a value too large for "
        "dtype('float64')."
    )
    with pytest.raises(ValueError) as exc_info:
        nan_euclidean_distances(X, Y=Y)
    assert str(exc_info.value) == expected_message
def test_nan_euclidean_distances_complete_nan(missing_value):
    """A fully-missing row has nan distance to everything except itself."""
    X = np.array([[missing_value, missing_value], [0, 1]])
    expected = np.array([[np.nan, np.nan], [np.nan, 0]])

    # Implicit Y (defaults to X) and an explicit copy of X must agree.
    assert_allclose(expected,
                    nan_euclidean_distances(X, missing_values=missing_value))
    assert_allclose(expected,
                    nan_euclidean_distances(X, X.copy(),
                                            missing_values=missing_value))
def test_nan_euclidean_distances_one_feature_match_positive(missing_value):
    """Distances stay non-negative with a single shared non-nan feature.

    The first feature is the only one present in both samples, so the
    squared distances must be >= 0 and the non-squared ones close to 0.
    """
    X = np.array([[-122.27, 648., missing_value, 37.85],
                  [-122.27, missing_value, 2.34701493, missing_value]])

    squared_dist = nan_euclidean_distances(
        X, missing_values=missing_value, squared=True)
    assert np.all(squared_dist >= 0)

    unsquared_dist = nan_euclidean_distances(
        X, missing_values=missing_value, squared=False)
    assert_allclose(unsquared_dist, 0.0)
def test_nan_euclidean_distances_2x2(X, X_diag, missing_value):
    """Check a 2x2 distance matrix under every calling convention."""
    expected = np.array([[0., X_diag], [X_diag, 0]])

    # Implicit Y = X, squared and unsquared.
    assert_allclose(
        expected, nan_euclidean_distances(X, missing_values=missing_value))
    assert_allclose(
        expected ** 2,
        nan_euclidean_distances(X, squared=True,
                                missing_values=missing_value))

    # Explicitly passing X itself and a copy of X must match as well.
    assert_allclose(
        expected,
        nan_euclidean_distances(X, X, missing_values=missing_value))
    assert_allclose(
        expected,
        nan_euclidean_distances(X, X.copy(), missing_values=missing_value))
def test_nan_euclidean_distances_equal_to_euclidean_distance(squared):
    """Without any nan, nan_euclidean_distances matches euclidean_distances."""
    rng = np.random.RandomState(1337)
    left = rng.randn(3, 4)
    right = rng.randn(4, 4)
    assert_allclose(
        euclidean_distances(left, Y=right, squared=squared),
        nan_euclidean_distances(left, Y=right, squared=squared),
    )
def test_nan_euclidean_distances_not_trival(missing_value):
    """Non-trivial checks: symmetry, explicit formulas, implicit Y=X,
    and copy=True vs copy=False."""
    X = np.array([[1., missing_value, 3., 4., 2.],
                  [missing_value, 4., 6., 1., missing_value],
                  [3., missing_value, missing_value, missing_value, 1.]])
    Y = np.array([[missing_value, 7., 7., missing_value, 2.],
                  [missing_value, missing_value, 5., 4., 7.],
                  [missing_value, missing_value, missing_value, 4., 5.]])

    # Check for symmetry
    D1 = nan_euclidean_distances(X, Y, missing_values=missing_value)
    D2 = nan_euclidean_distances(Y, X, missing_values=missing_value)
    assert_almost_equal(D1, D2.T)

    # Check with explicit formula and squared=True.  Only features 2 and 4
    # are present in both X[0] and Y[0], hence the 5/2 rescaling factor.
    assert_allclose(
        nan_euclidean_distances(X[:1], Y[:1], squared=True,
                                missing_values=missing_value),
        [[5.0 / 2.0 * ((7 - 3)**2 + (2 - 2)**2)]])

    # Check with explicit formula and squared=False
    assert_allclose(
        nan_euclidean_distances(X[1:2], Y[1:2], squared=False,
                                missing_values=missing_value),
        [[np.sqrt(5.0 / 2.0 * ((6 - 5)**2 + (1 - 4)**2))]])

    # Check when Y = X is explicitly passed
    D3 = nan_euclidean_distances(X, missing_values=missing_value)
    D4 = nan_euclidean_distances(X, X, missing_values=missing_value)
    D5 = nan_euclidean_distances(X, X.copy(), missing_values=missing_value)
    assert_allclose(D3, D4)
    assert_allclose(D4, D5)

    # Check copy = True against copy = False.  Pass missing_values
    # explicitly so the masking (and therefore the copy) path is exercised
    # for every parametrized missing value, not only for np.nan.
    D6 = nan_euclidean_distances(X, Y, copy=True,
                                 missing_values=missing_value)
    D7 = nan_euclidean_distances(X, Y, copy=False,
                                 missing_values=missing_value)
    assert_allclose(D6, D7)
def findKNeighbors(self):
    """Return, for each column of ``self.X_train``, its k nearest columns.

    The distance between two columns is the nan-aware euclidean distance
    of their value vectors, each column treated as a single sample.

    Returns:
        dict: column label -> list of the ``self.k`` closest column labels,
        ordered by increasing distance.  Note a column has distance 0 to
        itself, so it normally heads its own neighbour list.
    """
    corr_features = dict()
    columns = self.X_train.columns
    for col1 in columns:
        distances = []
        for col2 in columns:
            dist = nan_euclidean_distances(
                self.X_train[col1].values.reshape(1, -1),
                self.X_train[col2].values.reshape(1, -1))
            # `dist` is a 1x1 matrix; store a plain float so the sort
            # below compares scalars rather than ndarrays.
            distances.append((float(dist[0, 0]), col2))
        # Sort by distance and keep the k nearest columns.
        distances.sort(key=lambda pair: pair[0])
        corr_features[col1] = [label for _, label in distances[:self.k]]
    return corr_features
def discr_stat(X, Y, dissimilarity="euclidean", remove_isolates=True, return_rdfs=True):
    """
    Computes the discriminability statistic.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        Input data. If dissimilarity=='precomputed', the input should be the
        dissimilarity matrix.
    Y : 1d-array, shape (n_samples)
        Input labels.
    dissimilarity : str, {"euclidean" (default), "cosine", "haversine", \
            "manhattan", "precomputed"}
        Dissimilarity measure can be 'euclidean' (pairwise Euclidean
        distances between points in the dataset) or 'precomputed'
        (pre-computed dissimilarities).
    remove_isolates : bool, optional, default=True
        Whether to remove data that have single label.
    return_rdfs : bool, optional, default=True
        Whether to return rdf for all data points.

    Returns
    -------
    stat : float
        Discriminability statistic.
    rdfs : array, shape (n_samples, max{len(id)})
        Rdfs for each sample. Only returned if ``return_rdfs==True``.

    Raises
    ------
    ValueError
        If fewer than two labels occur more than once in ``Y``.
    """
    # Validation only; the converted output is deliberately discarded.
    check_X_y(X, Y, accept_sparse=True)

    uniques, counts = np.unique(Y, return_counts=True)
    # Need at least two labels that occur more than once, otherwise the
    # within/between comparison underlying the rdf is undefined.
    if (counts != 1).sum() <= 1:
        msg = "You have passed a vector containing only a single unique sample id."
        raise ValueError(msg)
    if remove_isolates:
        # Keep only samples whose label occurs more than once.
        idx = np.isin(Y, uniques[counts != 1])
        labels = Y[idx]

        if dissimilarity == "euclidean" or dissimilarity == "cosine" or dissimilarity == "haversine" or \
                dissimilarity == "manhattan" or dissimilarity == "mahalanobis":
            # Feature matrix: drop isolate rows only.
            X = X[idx]
        else:
            # Precomputed matrix: drop both rows and columns of isolates.
            X = X[np.ix_(idx, idx)]
    else:
        labels = Y

    if dissimilarity == "euclidean":
        # nan-aware so rows with missing entries still receive distances.
        dissimilarities = nan_euclidean_distances(X)
    elif dissimilarity == "cosine":
        dissimilarities = cosine_distances(X)
    elif dissimilarity == "haversine":
        dissimilarities = haversine_distances(X)
    elif dissimilarity == "manhattan":
        dissimilarities = manhattan_distances(X)
    else:
        # Anything else (including "precomputed") is taken as-is.
        dissimilarities = X

    rdfs = _discr_rdf(dissimilarities, labels)
    # Values below 0.5 are excluded from the mean via nan masking.
    rdfs[rdfs < 0.5] = np.nan
    stat = np.nanmean(rdfs)

    if return_rdfs:
        return stat, rdfs
    else:
        return stat
def test_knn_imputer_weight_distance(na):
    """KNNImputer with weights="distance" must agree with both a manual
    inverse-distance weighted average and KNeighborsRegressor."""
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])

    # Test with "distance" weight
    nn = KNeighborsRegressor(metric="euclidean", weights="distance")
    X_rows_idx = [0, 2, 3, 4, 5, 6]
    nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0])
    knn_imputed_value = nn.predict(X[1:2, 1:])[0]

    # Manual calculation
    X_neighbors_idx = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    weights = 1 / dist[:, X_neighbors_idx].ravel()
    manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights)

    X_imputed_distance1 = np.array(
        [[0, 0], [manual_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8],
         [11, 10]])

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array(
        [[0, 0], [knn_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8],
         [11, 10]])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance1)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance2)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [na, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    # neighbors are rows 1, 2, the nan_euclidean_distances are:
    dist_0_1 = np.sqrt((3 / 2) * ((1 - 0)**2 + (2 - 0)**2))
    dist_0_2 = np.sqrt((3 / 2) * ((2 - 0)**2 + (3 - 0)**2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array([
        [imputed_value, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test with varying missingness patterns
    X = np.array([
        [1, 0, 0, 1],
        [0, na, 1, na],
        [1, 1, 1, na],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    # Get weights of donor neighbors
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = 1 / r1c1_nbor_dists
    r1c3_nbor_wt = 1 / r1c3_nbor_dists

    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1 / r2c3_nbor_dists

    # Collect donor values; masked arrays let np.ma.average skip nan donors.
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    X = np.array([
        [0, 0, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7],
    ])

    # Same metric through the generic pairwise_distances dispatcher.
    dist = pairwise_distances(X, metric="nan_euclidean", squared=False,
                              missing_values=na)

    # Calculate weights
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0, 0, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7],
    ])

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    """Exercise pairwise_distances across metrics, input types (dense,
    sparse, tuples, nan-containing) and string-vs-callable metric forms."""
    rng = np.random.RandomState(0)

    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Check to ensure NaNs work with pairwise_distances.
    X_masked = rng.random_sample((5, 4))
    Y_masked = rng.random_sample((2, 4))
    X_masked[0, 0] = np.nan
    Y_masked[0, 0] = np.nan
    S_masked = pairwise_distances(X_masked, Y_masked, metric="nan_euclidean")
    S2_masked = nan_euclidean_distances(X_masked, Y_masked)
    assert_array_almost_equal(S_masked, S2_masked)

    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)

    # Test haversine distance
    # The data should be valid latitude and longitude
    X = rng.random_sample((5, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2
    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, metric="haversine")
    S2 = haversine_distances(X)
    assert_array_almost_equal(S, S2)

    # Test haversine distance, with Y != X
    Y = rng.random_sample((2, 2))
    Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2
    Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, Y, metric="haversine")
    S2 = haversine_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert S.shape[0] == S.shape[1]
    assert S.shape[0] == X.shape[0]
    assert_array_almost_equal(S, S2)

    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert S.shape[0] == X.shape[0]
    assert S.shape[1] == Y.shape[0]
    assert_array_almost_equal(S, S2)

    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert S.shape[0] == X.shape[0]
    assert S.shape[1] == Y.shape[0]
    assert_array_almost_equal(S, S2)

    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    # Mixed sparse formats (csc/bsr/coo) should be accepted too.
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # Test that scipy distance metrics throw an error if sparse matrix given
    with pytest.raises(TypeError):
        pairwise_distances(X_sparse, metric="minkowski")
    with pytest.raises(TypeError):
        pairwise_distances(X, Y_sparse, metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    with pytest.raises(ValueError):
        pairwise_distances(X, Y, metric="blah")
def retrieve_closest_indices(
    df,
    num_indices,
    forecast_length,
    window_size: int = 10,
    distance_metric: str = "braycurtis",
    stride_size: int = 1,
    start_index: int = None,
    include_differenced: bool = False,
    include_last: bool = True,
    verbose: int = 0,
):
    """Find next indices closest to the final segment of forecast_length

    Args:
        df (pd.DataFrame): source data in wide format
        num_indices (int): number of indices to return
        forecast_length (int): length of forecast
        window_size (int): length of comparison
        distance_metric (str): distance measure from scipy and nan_euclidean
        stride_size (int): length of spacing between windows
        start_index (int): index to begin creation of windows from
        include_differenced (bool): if True, also compare on first
            differences and average the two distance scores
        include_last (bool): if False, windows overlapping the final
            forecast_length rows are excluded from the candidates
        verbose (int): if > 1, print distances of the selected windows

    Returns:
        np.ndarray of the selected index values, one row per chosen window
        (each row spans window_size + forecast_length positions).
    """
    array = df.to_numpy()
    index = df.index
    tlt_len = array.shape[0]
    # Each candidate window covers a comparison part plus a "future" part.
    combined_window_size = window_size + forecast_length
    # remove extra so last segment not included at all
    # have the last window end evenly
    max_steps = array.shape[0] - combined_window_size
    if not include_last:
        max_steps = max_steps - forecast_length
    if start_index is None:
        # handle massive stride size relative to data
        start_index = 0
        if stride_size * 6 < array.shape[0]:
            # Align so the last window ends exactly at max_steps.
            start_index = max_steps % stride_size
    if num_indices > (max_steps / stride_size):
        raise ValueError(
            "num_validations/num_indices too high for this dataset")
    window_idxs = window_id_maker(
        window_size=combined_window_size,
        start_index=start_index,
        max_steps=max_steps,
        stride_size=stride_size,
        skip_size=1,
    )

    # calculate distance between all points and last window of history
    if distance_metric == "nan_euclidean":
        from sklearn.metrics.pairwise import nan_euclidean_distances

        # One (n_windows, 1) distance column per series; only the first
        # window_size positions of each window are compared.
        res = np.array([
            nan_euclidean_distances(
                array[:, a][window_idxs[:, :window_size]],
                array[(tlt_len - window_size):tlt_len, a].reshape(1, -1),
            ) for a in range(array.shape[1])
        ])
        if include_differenced:
            array_diff = np.diff(array, n=1, axis=0)
            # Re-pad so the differenced array keeps the original length.
            array_diff = np.concatenate([array_diff[0:1], array_diff])
            res_diff = np.array([
                nan_euclidean_distances(
                    array_diff[:, a][window_idxs[:, :window_size]],
                    array_diff[(tlt_len - window_size):tlt_len,
                               a].reshape(1, -1),
                ) for a in range(array_diff.shape[1])
            ])
            res = np.mean([res, res_diff], axis=0)
    else:
        from scipy.spatial.distance import cdist

        res = np.array([
            cdist(
                array[:, a][window_idxs[:, :window_size]],
                array[(tlt_len - window_size):tlt_len, a].reshape(1, -1),
                metric=distance_metric,
            ) for a in range(array.shape[1])
        ])
        if include_differenced:
            array_diff = np.diff(array, n=1, axis=0)
            array_diff = np.concatenate([array_diff[0:1], array_diff])
            res_diff = np.array([
                cdist(
                    array_diff[:, a][window_idxs[:, :window_size]],
                    array_diff[(tlt_len - window_size):tlt_len,
                               a].reshape(1, -1),
                    metric=distance_metric,
                ) for a in range(array_diff.shape[1])
            ])
            res = np.mean([res, res_diff], axis=0)

    # find the lowest distance historical windows
    # Sum over series (nansum so missing-series distances don't poison it).
    res_sum = np.nansum(res, axis=0)
    num_top = num_indices
    # partial then full sort
    res_idx = np.argpartition(res_sum, num_top, axis=0)[0:num_top]
    res_idx = res_idx[np.argsort(res_sum[res_idx].flatten())]
    if verbose > 1:
        print(
            f"similarity validation distance metrics: {res_sum[res_idx].flatten()} with last window: {res_sum[-1].item()}"
        )
    select_index = index.to_numpy()[window_idxs[res_idx]]
    if select_index.ndim == 3:
        # Squeeze the singleton middle axis left over from the (n, 1)
        # distance columns.
        res_shape = select_index.shape
        select_index = select_index.reshape((res_shape[0], res_shape[2]))
    return select_index
from sklearn.impute import KNNImputer from sklearn.metrics.pairwise import nan_euclidean_distances # 交叉验证 from sklearn.model_selection import cross_val_score # KFlod的函数 from sklearn.model_selection import RepeatedStratifiedKFold from sklearn.pipeline import Pipeline import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]] imputer = KNNImputer(n_neighbors=2, metric='nan_euclidean') imputer.fit_transform(X) nan_euclidean_distances([[np.nan, 6, 5], [3, 4, 3]], [[3, 4, 3], [1, 2, np.nan], [8, 8, 7]]) nan_euclidean_distances([[np.nan, 6, 5], [3, 4, 3]], [[3, 4, 3], [1, 2, input_file = './horse-colic.csv' df_data = pd.read_csv(input_file, header=None, na_values='?') data = df_data.values ix = [i for i in range(data.shape[1]) if i != 235] X, y = data[:, ix], data[:, 236] for i in range(df_data.shape[1]): n_miss = df_data[[i]].isnull().sum() perc = n_miss / df_data.shape[0] * 100 if n_miss.values[0] > 0:
def __init__(self, path, missing_ratio, knn_impute=False, n_neighbor=5):
    """Load a dataset, build a kNN graph over samples, then mask/impute/scale.

    Args:
        path: .npy file holding a pickled dict-like item with keys "x", "y".
        missing_ratio: fraction of entries to randomly mark missing
            (0.0 means the data is used complete).
        knn_impute: appears unused in this constructor — TODO confirm it is
            consumed elsewhere; imputation here uses SimpleImputer (mean).
        n_neighbor: number of neighbors (excluding self) per kNN-graph row.
    """
    scaler = MinMaxScaler()
    data = np.load(path, allow_pickle=True)
    data = data.item()
    self.missing_ratio = missing_ratio
    self.x = data["x"]
    self.y = data["y"]
    """the argsort of numpy seems to have bugs"""
    # Pairwise distances between samples: cosine when complete,
    # nan-aware euclidean when entries may be missing.
    if missing_ratio == 0.0:
        pdist_x = cosine_distances(self.x, self.x)
    else:
        pdist_x = nan_euclidean_distances(self.x, self.x)
    graph_knn = np.zeros_like(pdist_x)
    for i in range(self.x.shape[0]):
        sorted_row = np.sort(pdist_x[i, :])
        sort_index = np.argsort(pdist_x[i, :])
        '''deal with when there are multiple point is the same to guarantee the diag is 0'''
        # If ties pushed sample i off the first slot, swap it back so each
        # row's nearest neighbor is the sample itself.
        if sort_index[0] != i:
            j = np.where(sort_index == i)
            sort_index[j] = sort_index[0]
            sort_index[0] = i
        # if i==266:
        #     print(sorted_row)
        # Mark self + n_neighbor nearest samples in the adjacency row.
        selected_index = sort_index[:1 + n_neighbor]
        graph_knn[i, selected_index] = 1
        # print(sort_index[:20])
        thresh = sorted_row[n_neighbor + 1]
        # if thresh == sorted_row[n_neighbor + 2]:
        #     print(sorted_row)
        # Binarize the distance row by the (n_neighbor+1)-th distance.
        pdist_x[i, :] = (pdist_x[i, :] < thresh).astype(float)
        # pdist_x[i,:] = 0
        # pdist_x[i, sort_index[1:1+n_neighbor]]=1
    # graph_knn = pdist_x - np.eye(self.x.shape[0])
    # Remove the self-loop; every row must now sum to exactly n_neighbor.
    graph_knn = graph_knn - np.eye(self.x.shape[0])
    np.testing.assert_equal(
        np.sum(graph_knn, axis=1),
        n_neighbor * np.ones(self.x.shape[0]))
    n, d = self.x.shape
    # Random observation mask: 1 = observed, 0 = missing.
    mask = np.random.rand(n, d)
    mask = (mask > self.missing_ratio).astype(float)
    self.m = mask
    if missing_ratio > 0.0:
        self.x[mask == 0] = np.nan
        # imputer = KNNImputer(n_neighbors=2)
        # Mean-impute, then treat everything as observed downstream.
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        self.x = imputer.fit_transform(self.x)
        self.m = np.ones_like(self.x)
    # Min-max scale features after imputation.
    scaler.fit(self.x)
    self.x = scaler.transform(self.x)
    '''get nn'''
    # Stack each sample's neighbor feature rows (self excluded, since the
    # self-loop was removed from graph_knn above).
    x_nn = []
    for i in range(self.x.shape[0]):
        index = np.where(graph_knn[i, :] == 1)
        # if index[0].shape[0] != 5:
        #     print(i)
        x_i = self.x[index, :]
        x_nn.append(x_i.squeeze())
    self.x_nn = np.stack(x_nn)
def calculateSimilarity(self, x1, x2):
    """Return the nan-aware euclidean distance matrix between x1 and x2.

    Note: despite the name, this is a distance — larger values mean the
    inputs are LESS similar.
    """
    return pairwise.nan_euclidean_distances(x1, x2)