import numpy as np
from scipy.spatial import KDTree
from sklearn.linear_model import LinearRegression

# The module paths below are assumed from usage in this file: `matrix.nan_indices`
# returns the (row, col) indices of every nan value, and `idw.shepards` is the
# inverse-distance-weighting function referenced in fast_knn's docstring.
from impyute.ops import matrix
from impyute.util import inverse_distance_weighting as idw


def mode(data):
    """ Substitute missing values with the mode (most frequent value) of that
    column. In the case of a tie (multiple most frequent values) for a column,
    randomly pick one of them.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = matrix.nan_indices(data)
    modes = []
    for y_i in range(np.shape(data)[1]):
        unique_counts = np.unique(data[:, [y_i]], return_counts=True)
        max_count = np.max(unique_counts[1])
        mode_y = [unique for unique, count in np.transpose(unique_counts)
                  if count == max_count and not np.isnan(unique)]
        modes.append(mode_y)  # Collect the mode(s) of each column; ties are all kept
    for x_i, y_i in nan_xy:
        data[x_i][y_i] = np.random.choice(modes[y_i])
    return data
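# Usage sketch (illustrative, not from the original source; assumes a float
# ndarray with np.nan marking missing entries):
#
#     arr = np.array([[1.0, np.nan],
#                     [1.0, 2.0],
#                     [3.0, 2.0]])
#     mode(arr)  # the nan at (0, 1) becomes 2.0, column 1's most frequent value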
def count_missing(data):
    """ Calculate the total percentage of missing values, as well as the
    percentage missing in each column.

    Parameters
    ----------
    data: np.ndarray
        Data to examine.

    Returns
    -------
    dict
        Percentage of missing values in total and in each column.
    """
    size = len(data.flatten())
    nan_xy = matrix.nan_indices(data)
    counter = {y: 0. for y in np.unique(nan_xy.T[1])}
    change_in_percentage = 1. / size
    for _, y in nan_xy:
        counter[y] += change_in_percentage
    total_missing = len(nan_xy) / size
    counter["total"] = total_missing
    return counter
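# Usage sketch (illustrative): for a 2x2 array with one nan,
#
#     arr = np.array([[1.0, np.nan],
#                     [3.0, 4.0]])
#     count_missing(arr)  # -> {1: 0.25, "total": 0.25}
#
# Per-column keys are the column indices as returned by matrix.nan_indices,
# so their exact type (e.g. a numpy integer) follows that function's output.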
def describe(data):  # verbose=True):
    """ Return a dict describing the missingness in the data.

    Eventually intended to be used instead of matrix.nan_indices everywhere.

    Parameters
    ----------
    data: numpy.ndarray
        The data you want to get a description of.
    verbose: boolean (optional)
        Decides whether the description is short or long form.

    Returns
    -------
    dict
        missingness: list
            Confidence interval of data being MCAR, MAR or MNAR - in that order.
        nan_xy: list of tuples
            Indices of all null points.
        nan_n: list
            Total number of null values for each column.
        pmissing_n: float
            Percentage of missing values in dataset.
        nan_rows: list
            Indices of all rows that are completely null.
        nan_cols: list
            Indices of all columns that are completely null.
        mean_rows: list
            Mean value of each row.
        mean_cols: list
            Mean value of each column.
        std_dev: list
            Standard deviation for each row/column.
        min_max: list
            Minimum and maximum for each row.
    """
    # missingness = [0.33, 0.33, 0.33]  # find_missingness(data)
    nan_xy = matrix.nan_indices(data)
    nan_n = len(nan_xy)
    pmissing_n = float(nan_n) / data.size
    # pmissing_rows = ""
    # pmissing_cols = ""
    # nan_rows = ""
    # nan_cols = ""
    # mean_rows = ""
    # mean_cols = ""
    # std_dev = ""
    description = {"nan_xy": nan_xy,
                   "nan_n": nan_n,
                   "pmissing_n": pmissing_n}
    # Keys still to be filled in: "missingness", "pmissing_rows",
    # "pmissing_cols", "nan_rows", "nan_cols", "mean_rows", "mean_cols",
    # "std_dev"
    return description
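# Usage sketch (illustrative; only the three keys built above are populated
# so far):
#
#     arr = np.array([[1.0, np.nan],
#                     [3.0, 4.0]])
#     desc = describe(arr)
#     desc["nan_n"]       # -> 1
#     desc["pmissing_n"]  # -> 0.25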
def em(data, loops=50):
    """ Imputes given data using expectation maximization.

    E-step: Calculates the expected complete data log likelihood ratio.
    M-step: Finds the parameters that maximize the log likelihood of
    the complete data.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    loops: int
        Number of em iterations to run before breaking.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = matrix.nan_indices(data)
    for x_i, y_i in nan_xy:
        col = data[:, int(y_i)]
        mu = col[~np.isnan(col)].mean()
        std = col[~np.isnan(col)].std()
        col[x_i] = np.random.normal(loc=mu, scale=std)
        previous = 1
        for i in range(loops):
            # Expectation
            mu = col[~np.isnan(col)].mean()
            std = col[~np.isnan(col)].std()
            # Maximization
            col[x_i] = np.random.normal(loc=mu, scale=std)
            # Break out of the loop if the value changed by less than 10%
            # and the loop has run at least 5 times
            delta = np.abs(col[x_i] - previous) / np.abs(previous)
            if i > 5 and delta < 0.1:
                data[x_i][y_i] = col[x_i]
                break
            data[x_i][y_i] = col[x_i]
            previous = col[x_i]
    return data
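# Usage sketch (illustrative; the imputed value is a random draw from a
# normal fit to the column, so the exact output varies per run):
#
#     arr = np.array([[1.0, 2.0],
#                     [np.nan, 4.0],
#                     [3.0, 6.0]])
#     em(arr, loops=50)  # arr[1][0] becomes a draw near mean 2.0, std 1.0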
def mean(data):
    """ Substitute missing values with the mean of that column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = matrix.nan_indices(data)
    for x_i, y_i in nan_xy:
        # Column `y_i` with its nan values dropped
        col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col_wo_nan)
        data[x_i][y_i] = new_value
    return data
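# Usage sketch (illustrative):
#
#     arr = np.array([[1.0, 2.0],
#                     [np.nan, 4.0],
#                     [3.0, 6.0]])
#     mean(arr)  # arr[1][0] becomes 2.0, the mean of column 0's observed values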
def random(data):
    """ Fill missing values with a randomly selected value from the same
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = matrix.nan_indices(data)
    for x, y in nan_xy:
        uniques = np.unique(data[:, y])
        uniques = uniques[~np.isnan(uniques)]
        data[x][y] = np.random.choice(uniques)
    return data
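# Usage sketch (illustrative; the value is drawn at random from the column's
# observed unique values):
#
#     arr = np.array([[1.0, 2.0],
#                     [np.nan, 4.0],
#                     [3.0, 6.0]])
#     random(arr)  # arr[1][0] becomes either 1.0 or 3.0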
def median(data):
    """ Substitute missing values with the median (middle value) of that
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = matrix.nan_indices(data)
    cols_missing = set(nan_xy.T[1])
    medians = {}
    for y_i in cols_missing:
        cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        median_y = np.median(cols_wo_nan)
        medians[str(y_i)] = median_y
    for x_i, y_i in nan_xy:
        data[x_i][y_i] = medians[str(y_i)]
    return data
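# Usage sketch (illustrative):
#
#     arr = np.array([[1.0, 2.0],
#                     [np.nan, 4.0],
#                     [3.0, 6.0],
#                     [5.0, 8.0]])
#     median(arr)  # arr[1][0] becomes 3.0, the median of [1.0, 3.0, 5.0]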
def test_missing_values_present():
    """ Check that the dataset is corrupted (missing values present). """
    assert matrix.nan_indices(data).size != 0
def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
                  inplace=False):
    """ Interpolate the missing values based on nearby values.

    For example, with an array like this:

        array([[-1.24940, -1.38673, -0.03214945,  0.08255145, -0.007415],
               [ 2.14662,  0.32758 , -0.82601414,  1.78124027,  0.873998],
               [-0.41400, -0.977629,         nan, -1.39255344,  1.680435],
               [ 0.40975,  1.067599,  0.29152388, -1.70160145, -0.565226],
               [-0.54592, -1.126187,  2.04004377,  0.16664863, -0.010677]])

    using a window size (`wsize`) of 3, the one missing value would be set
    to -1.18509122. The window operates on the horizontal axis.

    Usage
    -----
    The parameters default the function to a moving mean. You may want to
    change the default window size:

        moving_window(data, wsize=10)

    To only look at past data (the null value is at the rightmost index in
    the window):

        moving_window(data, nindex=-1)

    To use a custom function:

        moving_window(data, func=np.median)

    You can also do something like take 1.5x the max of previous values in
    the window:

        moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    nindex: int
        Null index. Index of the null value inside the moving average window.
        Use case: say you want to skew the imputed value toward the left or
        right side. 0 would only take the average of values from the right,
        and -1 would only take the average of values from the left.
    wsize: int
        Window size. Size of the moving average window/area of values being
        used for each local imputation. This number includes the missing
        value.
    errors: {"raise", "coerce", "ignore"}
        Errors will occur with the indexing of the windows - for example if
        there is a nan at data[x][0] and `nindex` is set to -1, or there is a
        nan at data[x][-1] and `nindex` is set to 0. "raise" will raise an
        error, "coerce" will try again with `nindex` set to the middle, and
        "ignore" will just leave it as a nan.
    inplace: {True, False}
        Whether to return a copy or run on the passed-in array.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    if errors == "ignore":
        raise Exception("`errors` value `ignore` not implemented yet. Sorry!")

    if not inplace:
        data = data.copy()

    if nindex is None:  # If using equal window side lengths
        assert wsize % 2 == 1, "The parameter `wsize` should not be even "\
            "if the value `nindex` is not set since it defaults to the "\
            "midpoint and an even `wsize` makes the midpoint ambiguous"
        wside_left = wsize // 2
        wside_right = wsize // 2
    else:  # If using custom window side lengths
        assert nindex < wsize, "The null index must be smaller than the window size"
        if nindex == -1:
            wside_left = wsize - 1
            wside_right = 0
        else:
            wside_left = nindex
            wside_right = wsize - nindex - 1

    while True:
        nan_xy = matrix.nan_indices(data)
        n_nan_prev = len(nan_xy)
        for x_i, y_i in nan_xy:
            left_i = max(0, y_i - wside_left)
            # Bound by the column count; the window runs horizontally
            right_i = min(data.shape[1], y_i + wside_right + 1)
            window = data[x_i, left_i:right_i]
            window_not_null = window[~np.isnan(window)]

            if len(window_not_null) > 0:
                try:
                    data[x_i][y_i] = func(window_not_null)
                    continue
                except Exception as e:
                    if errors == "raise":
                        raise e

            if errors == "coerce":
                # If either the window has a length of 0 or the aggregate
                # function fails somehow, fall back to trying the best we can
                # by using the missing value as the window midpoint and
                # recalculating. Use temporary wside_left/wside_right for
                # only the calculation of this specific problematic value.
                wside_left_tmp = wsize // 2
                wside_right_tmp = wside_left_tmp
                left_i_tmp = max(0, y_i - wside_left_tmp)
                right_i_tmp = min(data.shape[1], y_i + wside_right_tmp + 1)
                window = data[x_i, left_i_tmp:right_i_tmp]
                window_not_null = window[~np.isnan(window)]
                try:
                    data[x_i][y_i] = func(window_not_null)
                except Exception as e:
                    print("Exception:", e)
        if n_nan_prev == len(matrix.nan_indices(data)):
            break
    return data
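# Usage sketch (illustrative; a centred window of size 3, as in the
# docstring's example):
#
#     arr = np.array([[1.0, np.nan, 3.0],
#                     [4.0, 5.0, 6.0],
#                     [7.0, 8.0, 9.0]])
#     moving_window(arr, wsize=3)  # the nan becomes 2.0, the mean of 1.0 and 3.0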
def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
             idw_fn=idw.shepards, init_impute_fn=mean):
    """ Impute using a variant of the nearest neighbours approach.

    Basic idea: Impute the array with a passed-in initial impute function
    (mean impute) and then use the resulting complete array to construct a
    KDTree. Use this KDTree to compute nearest neighbours. After finding the
    `k` nearest neighbours, take the weighted average of them. Basically,
    find the nearest row in terms of distance.

    This approach is much, much faster than the other implementation
    (fit+transform for each subset), which is almost prohibitively expensive.

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    k: int, optional
        Parameter used for method querying the KDTree class object. Number of
        neighbours used in the KNN query. Refer to the docs for
        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    eps: nonnegative float, optional
        Parameter used for method querying the KDTree class object. From the
        SciPy docs: "Return approximate nearest neighbors; the kth returned
        value is guaranteed to be no further than (1+eps) times the distance
        to the real kth nearest neighbor". Refer to the docs for
        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    p: float, 1<=p<=infinity, optional
        Parameter used for method querying the KDTree class object. Straight
        from the SciPy docs: "Which Minkowski p-norm to use. 1 is the
        sum-of-absolute-values Manhattan distance, 2 is the usual Euclidean
        distance, infinity is the maximum-coordinate-difference distance".
        Refer to the docs for
        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    distance_upper_bound: nonnegative float, optional
        Parameter used for method querying the KDTree class object. Straight
        from the SciPy docs: "Return only neighbors within this distance.
        This is used to prune tree searches, so if you are doing a series of
        nearest-neighbor queries, it may help to supply the distance to the
        nearest neighbor of the most recent point." Refer to the docs for
        [`scipy.spatial.KDTree.query`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    leafsize: int, optional
        Parameter used for construction of the `KDTree` class object.
        Straight from the SciPy docs: "The number of points at which the
        algorithm switches over to brute-force. Has to be positive". Refer to
        the docs for
        [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
        for more information.
    idw_fn: fn, optional
        Function that takes one argument, a list of distances, and returns
        weighted percentages. You can define a custom one or bootstrap from
        functions defined in `impy.util.inverse_distance_weighting`, which
        can be done using functools.partial, for example:
        `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`
    init_impute_fn: fn, optional
        Function used for the initial simple imputation that makes the data
        complete enough to build the KDTree (defaults to the mean impute
        defined above).

    Returns
    -------
    numpy.ndarray
        Imputed data.
    Examples
    --------

        >>> data = np.arange(25).reshape((5, 5)).astype(float)
        >>> data[0][2] = np.nan
        >>> data
        array([[ 0.,  1., nan,  3.,  4.],
               [ 5.,  6.,  7.,  8.,  9.],
               [10., 11., 12., 13., 14.],
               [15., 16., 17., 18., 19.],
               [20., 21., 22., 23., 24.]])

        >> fast_knn(data, k=1)  # Weighted average (by distance) of nearest 1 neighbour
        array([[ 0.,  1.,  7.,  3.,  4.],
               [ 5.,  6.,  7.,  8.,  9.],
               [10., 11., 12., 13., 14.],
               [15., 16., 17., 18., 19.],
               [20., 21., 22., 23., 24.]])

        >> fast_knn(data, k=2)  # Weighted average of nearest 2 neighbours
        array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
               [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
               [10.        , 11.        , 12.        , 13.        , 14.        ],
               [15.        , 16.        , 17.        , 18.        , 19.        ],
               [20.        , 21.        , 22.        , 23.        , 24.        ]])

        >> fast_knn(data, k=3)
        array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
               [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
               [10.        , 11.        , 12.        , 13.        , 14.        ],
               [15.        , 16.        , 17.        , 18.        , 19.        ],
               [20.        , 21.        , 22.        , 23.        , 24.        ]])

        >> fast_knn(data, k=5)  # There are at most only 4 neighbours. Raises error
        ...
        IndexError: index 5 is out of bounds for axis 0 with size 5
    """
    nan_xy = matrix.nan_indices(data)
    data_c = init_impute_fn(data)
    kdtree = KDTree(data_c, leafsize=leafsize)

    for x_i, y_i in nan_xy:
        distances, indices = kdtree.query(
            data_c[x_i], k=k + 1, eps=eps, p=p,
            distance_upper_bound=distance_upper_bound)
        # The query will always return the point itself in the first index. Delete it.
        distances, indices = distances[1:], indices[1:]
        # Add a small constant to the distances to avoid division by 0
        distances += 1e-3
        weights = idw_fn(distances)
        # Assign the missing value the weighted average of the `k` nearest neighbours
        data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
    return data
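# Usage sketch (illustrative, beyond the docstring's examples): swapping in a
# custom distance weighting via functools.partial, as the docstring suggests:
#
#     import functools
#     data = np.arange(25).reshape((5, 5)).astype(float)
#     data[0][2] = np.nan
#     fast_knn(data, k=2, idw_fn=functools.partial(idw.shepards, power=1))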
def buck_iterative(data):
    """ Iterative variant of Buck's method.

    - The variable to regress on is chosen at random.
    - The EM-type infinite regression loop stops after the change in
      prediction from the previous prediction is < 10% for all columns with
      missing values.

    A Method of Estimation of Missing Values in Multivariate Data Suitable
    for use with an Electronic Computer
    S. F. Buck
    Journal of the Royal Statistical Society. Series B (Methodological)
    Vol. 22, No. 2 (1960), pp. 302-306

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.
    """
    nan_xy = matrix.nan_indices(data)

    # Add a column of zeros to the index values
    nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)
    nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]

    temp = []
    cols_missing = {y for _, y, _ in nan_xyz}

    # Step 1: Simple imputation; these are just placeholders
    for x_i, y_i, value in nan_xyz:
        # Column containing the nan value, without the nan value
        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col)
        data[x_i][y_i] = new_value
        temp.append([x_i, y_i, new_value])
    nan_xyz = temp

    # Step 5: Repeat steps 2-4 until convergence
    converged = [False] * len(nan_xyz)
    while not all(converged):
        # Step 2: Placeholders are set back to missing for one variable/column
        dependent_col = int(np.random.choice(list(cols_missing)))
        missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]

        # Step 3: Perform linear regression using the other variables
        x_train, y_train = [], []
        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Step 4: Missing values for the missing variable/column are replaced
        # with predictions from our new linear regression model.
        # Applies to null indices whose column is the randomly chosen
        # dependent column.
        for i, z in enumerate(nan_xyz):
            x_i, y_i = z[0], z[1]
            value = data[x_i, y_i]
            if y_i == dependent_col:
                # Predict from row `x_i` without the nan value
                new_value = model.predict([np.delete(data[x_i], dependent_col)])[0]
                data[x_i][y_i] = new_value
                if value == 0.0:
                    # Avoid division by zero when the previous value was 0
                    delta = (new_value - value) / 0.01
                else:
                    delta = (new_value - value) / value
                converged[i] = abs(delta) < 0.1
    return data
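# Usage sketch (illustrative; needs at least two columns and enough complete
# rows to fit the regression):
#
#     arr = np.array([[1.0, 2.0],
#                     [2.0, 4.0],
#                     [3.0, np.nan],
#                     [4.0, 8.0]])
#     buck_iterative(arr)  # arr[2][1] converges to 6.0, since col 1 = 2 * col 0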