def mode_imputation(data):
    """ Substitute missing values with the mode of that column (most frequent).

    In the case that there is a tie (there are multiple, most frequent
    values) for a column, one of them is picked at random for every missing
    entry.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The array is modified in place.

    Returns
    -------
    numpy.ndarray
        Imputed data (the same array object that was passed in).

    Raises
    ------
    ValueError
        If a column that needs imputing has no observed (non-NaN) value.

    """
    null_xy = find_null(data)
    # Modes are computed lazily, once per column that actually has a missing
    # entry. The first time a column is met none of its entries has been
    # filled yet, so the counts reflect the observed data only.
    modes = {}
    for x_i, y_i in null_xy:
        col_idx = int(y_i)
        if col_idx not in modes:
            column = data[:, col_idx]
            # Drop NaNs *before* counting: otherwise NaN itself can be the
            # most frequent entry, which leaves no candidate to choose from.
            values, counts = np.unique(column[~np.isnan(column)],
                                       return_counts=True)
            if values.size == 0:
                raise ValueError(
                    "Column %d has no observed values to take a mode from"
                    % col_idx)
            modes[col_idx] = values[counts == counts.max()]
        data[x_i][y_i] = np.random.choice(modes[col_idx])
    return data
def from_before_observation(data, axis=0):
    """ Fill each missing value with the value one row before it.

    A missing value in the first row has nothing before it, so the first
    non-NaN value further down the same column is used instead and copied
    into every row above it.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: int (optional)
        When 0 the data is transposed before imputing and the transposed
        array is what gets returned. Any other value uses the data as given.

    Returns
    -------
    numpy.ndarray
        Imputed data (transposed relative to the input when `axis` is 0).

    Raises
    ------
    Exception
        If `checks(data)` fails, or if a column whose first row is missing
        contains no observed value at all.

    """
    if not checks(data):
        raise Exception("Checks failed")

    if axis == 0:
        data = np.transpose(data)

    null_xy = find_null(data)
    n_rows = np.shape(data)[0]
    # NOTE(review): assumes find_null yields indices with earlier rows first,
    # so that the row above has already been filled -- confirm.
    for x_i, y_i in null_xy:
        # Simplest scenario, look one row back
        if x_i > 0:
            data[x_i][y_i] = data[x_i - 1][y_i]
            continue

        # First row: look forward through *every* remaining row, the last
        # one included (the previous range stopped one row short).
        donor = None
        for x_next in range(x_i + 1, n_rows):
            if not np.isnan(data[x_next][y_i]):
                donor = x_next
                break
        if donor is None:
            raise Exception("Error: Entire Column is NaN")

        # Back-fill every row above the first observed value
        data[x_i:donor, y_i] = data[donor][y_i]
    return data
def describe(data):
    """ Summarise the missing values of a dataset.

    Parameters
    ----------
    data: numpy.ndarray
        The data you want to get a description from.

    Returns
    -------
    dict
        null_xy:
            Indices of all null points, as returned by `find_null`.
        null_n: int
            Total number of null values in the dataset.
        pmissing_n: float
            Fraction (between 0 and 1) of the entries that are missing;
            0.0 for an empty dataset.

    """
    # TODO: a `verbose` flag and further statistics (missingness, per
    # row/column missing percentages, fully-null rows/columns, means,
    # std dev, min/max) were planned but are not implemented yet.
    null_xy = find_null(data)
    null_n = len(null_xy)

    # `data.size` is the total number of entries. The previous code took
    # len() of the bound method `data.flatten`, which raises TypeError.
    total = data.size
    pmissing_n = float(null_n) / total if total else 0.0

    return {
        "null_xy": null_xy,
        "null_n": null_n,
        "pmissing_n": pmissing_n,
    }
def em_algorithm(data, loops=50, dtype="cont"):
    """ Impute missing values by repeatedly redrawing them from a Gaussian.

    For every missing entry, the mean and standard deviation of its column
    are estimated from the non-NaN values, a value is drawn from the
    matching normal distribution, and the estimate/draw cycle is repeated
    until the draw changes by less than 10% or `loops` iterations have run.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The array is modified in place.
    loops: int
        Maximum number of iterations per missing value.
    dtype: ("cont", "disc")
        Only "cont" (continuous values) is supported.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Raises
    ------
    Exception
        If `checks(data)` fails or `dtype` is not "cont".

    """
    if not checks(data):
        raise Exception("Checks failed")
    if dtype != "cont":
        raise Exception("Other dtypes not supported yet.")

    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        col = data[:, int(y_i)]
        observed = col[~np.isnan(col)]
        col[x_i] = random.gauss(observed.mean(), observed.std())
        previous = 1
        for i in range(loops):
            # Re-estimate the column parameters, current draw included
            observed = col[~np.isnan(col)]
            mu = observed.mean()
            std = observed.std()
            # Redraw the missing value
            col[x_i] = random.gauss(mu, std)
            data[x_i][y_i] = col[x_i]
            # Relative change in magnitude: a signed delta would treat any
            # decrease, however large, as convergence. A zero previous
            # value cannot give a ratio, so it never counts as converged.
            if previous != 0:
                delta = abs(col[x_i] - previous) / abs(previous)
            else:
                delta = float("inf")
            if i > 5 and delta < 0.1:
                break
            previous = col[x_i]
    return data
def count_missing(data):
    """ Calculate how much of the dataset is missing, in total and per column.

    All values are fractions of the *whole* dataset (number of missing
    entries divided by the total number of entries), so the per-column
    values add up to the total.

    Parameters
    ----------
    data: np.array
        Data to inspect.

    Returns
    -------
    dict
        Maps each column index that has at least one missing value to that
        column's share of missing entries, plus the key "total" for the
        overall fraction of missing values.

    """
    size = len(data.flatten())
    null_xy = find_null(data)

    # Only columns with at least one missing value get an entry
    counter = {y: 0. for y in np.unique(null_xy.T[1])}
    share_per_entry = 1. / size
    for _, y in null_xy:
        counter[y] += share_per_entry

    # float() keeps this a true division regardless of the Python version
    counter["total"] = float(len(null_xy)) / size
    return counter
def arima(data, p, d, q):
    """Autoregressive Integrated Moving Average Imputation

    NOTE: the imputation itself is not implemented yet. The function
    validates its inputs, prints the index of every missing value and
    returns `data` unchanged.

    PARAMETERS
    ----------
    data: numpy.ndarray
        The matrix with missing values that you want to impute
    p: int
        Number of autoregressive terms
    d: int
        Number of nonseasonal differences needed for stationarity
    q: int
        Number of lagged forecast errors in the prediction equation

    RETURNS
    -------
    numpy.ndarray

    RAISES
    ------
    Exception
        If `checks(data)` fails, if p, d or q cannot be converted to int,
        or if `data` is not a numpy.ndarray.

    """
    # Verify inputs
    if not checks(data):
        raise Exception("Checks failed")
    try:
        p = int(p)
        d = int(d)
        q = int(q)
    except (TypeError, ValueError):
        raise Exception("p, d and q must be convertible to int")
    # Validate the type without rebinding `data`: the previous code assigned
    # the isinstance() result to `data`, replacing the matrix with a bool.
    if not isinstance(data, np.ndarray):
        raise Exception("data must be a numpy.ndarray")

    # Arima (placeholder: only reports where the missing values are)
    null_xy = find_null(data)
    for x, y in null_xy:
        print(x, y)
    return data
def arima(data, p, d, q):
    """Autoregressive Integrated Moving Average Imputation

    Stationary model

    NOTE: the imputation itself is not implemented yet. The function
    validates its inputs, prints the index of every missing value and
    returns `data` unchanged.

    PARAMETERS
    ----------
    data: numpy.ndarray
        The matrix with missing values that you want to impute
    p: int
        Number of autoregressive terms. Ex (p,d,q)=(1,0,0).
    d: int
        Number of nonseasonal differences needed for stationarity
    q: int
        Number of lagged forecast errors in the prediction equation

    RETURNS
    -------
    numpy.ndarray

    RAISES
    ------
    Exception
        If p, d or q cannot be converted to int, or if `data` is not a
        numpy.ndarray.

    """
    def _compute_nan_endpoints(x, y):
        # Stub kept from the original; not implemented and not called yet.
        pass

    try:
        p = int(p)
        d = int(d)
        q = int(q)
    except (TypeError, ValueError):
        raise Exception("p, d and q must be convertible to int")
    # Validate the type without rebinding `data`: the previous code assigned
    # the isinstance() result to `data`, replacing the matrix with a bool.
    if not isinstance(data, np.ndarray):
        raise Exception("data must be a numpy.ndarray")

    # ARIMA (placeholder: only reports where the missing values are)
    null_xy = find_null(data)
    for x, y in null_xy:
        print(x, y)
    return data
def random_imputation(data):
    """ Replace every missing entry with a random observed value of its column.

    Each distinct non-NaN value of the column is equally likely to be
    picked, regardless of how often it occurs.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The array is modified in place.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Raises
    ------
    Exception
        If `checks(data)` fails.

    """
    if not checks(data):
        raise Exception("Checks failed")

    for row_idx, col_idx in find_null(data):
        column = data[:, col_idx]
        # Distinct observed values of this column, NaNs left out
        candidates = np.unique(column[~np.isnan(column)])
        data[row_idx, col_idx] = np.random.choice(candidates)
    return data
def locf(data, axis=0):
    """ Last Observation Carried Forward

    For each set of missing indices, use the value of one row before (same
    column). In the case that the missing value is the first row, look one
    row ahead instead. If this next row is also NaN, look to the next row.
    Repeat until you find a row in this column that's not NaN. All the rows
    before will be filled with this value.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: int (optional)
        0 if time series is in row format (Ex. data[0][:] is 1st data point).
        1 if time series is in col format (Ex. data[:][0] is 1st data point).
        With 0 the data is transposed before imputing and the transposed
        array is what gets returned.

    Returns
    -------
    numpy.ndarray
        Imputed data (transposed relative to the input when `axis` is 0).

    Raises
    ------
    Exception
        If `checks(data)` fails, or if a column whose first row is missing
        contains no observed value at all.

    """
    if not checks(data):
        raise Exception("Checks failed")

    if axis == 0:
        data = np.transpose(data)

    null_xy = find_null(data)
    n_rows = np.shape(data)[0]
    # NOTE(review): assumes find_null yields indices with earlier rows first,
    # so that the row above has already been filled -- confirm.
    for x_i, y_i in null_xy:
        # Simplest scenario, look one row back
        if x_i > 0:
            data[x_i][y_i] = data[x_i - 1][y_i]
            continue

        # First row: look forward through *every* remaining row, the last
        # one included (the previous range stopped one row short).
        donor = None
        for x_next in range(x_i + 1, n_rows):
            if not np.isnan(data[x_next][y_i]):
                donor = x_next
                break
        if donor is None:
            raise Exception("Error: Entire Column is NaN")

        # Back-fill every row above the first observed value
        data[x_i:donor, y_i] = data[donor][y_i]
    return data
def em(data, loops=50, dtype="cont"):
    """ Imputes given data using expectation maximization.

    E-step: Estimates the mean and standard deviation of the column from
    its non-NaN values (the current imputed value included).
    M-step: Redraws the missing value from the normal distribution with
    those parameters.

    The two steps repeat until the drawn value changes by less than 10%
    relative to the previous draw, or until `loops` iterations have run.

    Parameters
    ----------
    data: numpy.nd.array
        Data to impute. The array is modified in place.
    loops: int
        Number of em iterations to run before breaking.
    dtype: ("cont","disc")
        Indicates whether the possible values will come from a continuous
        range or categorical range. Only "cont" is supported.

    Returns
    -------
    numpy.nd.array
        Imputed data.

    Raises
    ------
    Exception
        If `checks(data)` fails or `dtype` is not "cont".

    """
    if not checks(data):
        raise Exception("Checks failed")
    if dtype != "cont":
        raise Exception("Other dtypes not supported yet.")

    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        col = data[:, int(y_i)]
        observed = col[~np.isnan(col)]
        col[x_i] = random.gauss(observed.mean(), observed.std())
        previous = 1
        for i in range(loops):
            # Expectation
            observed = col[~np.isnan(col)]
            mu = observed.mean()
            std = observed.std()
            # Maximization
            col[x_i] = random.gauss(mu, std)
            data[x_i][y_i] = col[x_i]
            # Relative change in magnitude: a signed delta would treat any
            # decrease, however large, as convergence. A zero previous
            # value cannot give a ratio, so it never counts as converged.
            if previous != 0:
                delta = abs(col[x_i] - previous) / abs(previous)
            else:
                delta = float("inf")
            if i > 5 and delta < 0.1:
                break
            previous = col[x_i]
    return data
def mean_imputation(data):
    """ Fill every missing entry with the mean of its column.

    The mean is taken over the non-NaN values present in the column at the
    moment the entry is filled.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The array is modified in place.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    for row_idx, col_idx in find_null(data):
        column = data[:, col_idx]
        observed = column[~np.isnan(column)]
        data[row_idx, col_idx] = observed.mean()
    return data
def random_imputation(data):
    """ Replace every missing entry with a random observed value of its column.

    Each distinct non-NaN value of the column is equally likely to be
    picked, regardless of how often it occurs.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The array is modified in place.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    for row_idx, col_idx in find_null(data):
        column = data[:, col_idx]
        # Distinct observed values of this column, NaNs left out
        candidates = np.unique(column[~np.isnan(column)])
        data[row_idx, col_idx] = np.random.choice(candidates)
    return data
def median_imputation(data):
    """ Fill every missing entry with the median (middle value) of its column.

    Medians are computed once per affected column, from its non-NaN values,
    before any entry is filled.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The array is modified in place.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    null_xy = find_null(data)

    # One median per column that has at least one missing entry
    medians = {}
    for col_idx in set(null_xy.T[1]):
        column = data[:, col_idx]
        medians[int(col_idx)] = np.median(column[~np.isnan(column)])

    for row_idx, col_idx in null_xy:
        data[row_idx, col_idx] = medians[int(col_idx)]
    return data
def mice(data):
    """Multivariate Imputation by Chained Equations

    Reference:
        Buuren, S. V., & Groothuis-Oudshoorn, K. (2011). Mice: Multivariate
        Imputation by Chained Equations in R. Journal of Statistical
        Software, 45(3). doi:10.18637/jss.v045.i03

    Implementation follows the main idea from the paper above. Differs in
    decision of which variable to regress on (here, I choose it at random).
    Also differs in stopping criterion (here the model stops after change in
    prediction from previous prediction is less than 10% for every imputed
    value, or after a fixed number of rounds).

    PARAMETERS
    ----------
    data: numpy.ndarray
        Data to impute. The array is modified in place.

    RETURNS
    -------
    numpy.ndarray
        Imputed data.

    RAISES
    ------
    Exception
        If `checks(data)` fails.

    """
    if not checks(data):
        raise Exception("Checks failed")

    null_xy = find_null(data)
    if len(null_xy) == 0:
        # Nothing to impute
        return data

    # Step 1: Simple Imputation, these are just placeholders.
    # Each entry of null_xyv is [row, column, current imputed value].
    null_xyv = []
    for x_i, y_i in null_xy:
        x_i, y_i = int(x_i), int(y_i)
        col = data[:, y_i]
        placeholder = np.mean(col[~np.isnan(col)])
        data[x_i][y_i] = placeholder
        null_xyv.append([x_i, y_i, placeholder])
    cols_missing = sorted({y_i for _, y_i, _ in null_xyv})

    # Step 5: Repeat step 2 - 4 until every imputed value has converged.
    # The cap on the number of rounds is arbitrary; it only guarantees
    # termination when some value never settles.
    max_rounds = 100
    converged = [False] * len(null_xyv)
    rounds = 0
    while not all(converged) and rounds < max_rounds:
        rounds += 1

        # Step 2: Placeholders are set back to missing for one
        # variable/column, i.e. its imputed rows are left out of training
        dependent_col = int(np.random.choice(cols_missing))
        missing_xs = {x_i for x_i, y_i, _ in null_xyv if y_i == dependent_col}

        # Step 3: Perform linear regression using the other variables
        train_rows = [x_i for x_i in range(len(data))
                      if x_i not in missing_xs]
        x_train = [np.delete(data[x_i], dependent_col) for x_i in train_rows]
        y_train = [data[x_i][dependent_col] for x_i in train_rows]
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Step 4: Missing values for the missing variable/column are replaced
        # with predictions from our new linear regression model. Entries of
        # the other columns are left untouched so that `converged` stays
        # aligned with `null_xyv`.
        for i, (x_i, y_i, value) in enumerate(null_xyv):
            if y_i != dependent_col:
                continue
            # predict() expects a 2-D array: one row, n_features columns
            features = np.delete(data[x_i], dependent_col).reshape(1, -1)
            new_value = float(model.predict(features)[0])
            data[x_i][y_i] = new_value
            null_xyv[i][2] = new_value

            # Relative change in magnitude; fall back to the absolute change
            # when the previous value is 0 and no ratio can be formed
            change = abs(new_value - value)
            if value != 0:
                change /= abs(value)
            if change < 0.1:
                converged[i] = True
    return data
def _nan_exists(data):
    """ Tell whether `find_null` reports at least one entry for `data`."""
    return len(find_null(data)) != 0
def test_missing_values_present(self):
    """ The fixture dataset must contain at least one missing value."""
    missing = find_null(self.data)
    self.assertTrue(missing.size != 0)