Example #1
0
def mode(data):
    """ Substitute missing values with the mode of that column (most frequent).

    In the case that there is a tie (there are multiple, most frequent values)
    for a column randomly pick one of them.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The missing cells are overwritten in this array.

    Returns
    -------
    numpy.ndarray
        Imputed data (the array that was passed in).

    Raises
    ------
    ValueError
        If a column that needs imputing contains no observed values.

    """
    nan_xy = matrix.nan_indices(data)
    modes = {}  # column index -> array holding that column's (tied) modes
    for x_i, y_i in nan_xy:
        if y_i not in modes:
            # Computed on first encounter, i.e. before any cell of this column
            # has been imputed, so the modes reflect observed values only.
            col = data[:, y_i]
            # Drop NaNs *before* counting: otherwise the NaN count can become
            # the maximum and no real value would qualify as the mode.
            values, counts = np.unique(col[~np.isnan(col)], return_counts=True)
            if values.size == 0:
                raise ValueError(
                    "Cannot impute column %s: it has no observed values" % y_i)
            modes[y_i] = values[counts == counts.max()]
        data[x_i][y_i] = np.random.choice(modes[y_i])
    return data
Example #2
0
def count_missing(data):
    """ Calculate the fraction of missing values in total and per column.

    Parameters
    ----------
    data: np.array
        Data to inspect.

    Returns
    -------
    dict
        Maps every column index that contains at least one missing value to
        the number of missing values in that column divided by the total
        number of cells in `data`. The extra key ``"total"`` holds the overall
        fraction of missing cells. All values are fractions in [0, 1], not
        percentages in [0, 100].

    """
    size = data.size
    if size == 0:
        # Nothing to count; also avoids dividing by zero below.
        return {"total": 0.}
    nan_xy = matrix.nan_indices(data)
    # Second coordinate of every index pair is the column of a missing cell
    cols, counts = np.unique(nan_xy.T[1], return_counts=True)
    counter = {y: float(n) / size for y, n in zip(cols, counts)}
    counter["total"] = float(len(nan_xy)) / size

    return counter
Example #3
0
def describe(data):
    """ Summarise the missing values of a dataset.

    Eventually will be used instead of matrix.nan_indices everywhere

    Parameters
    ----------
    data: numpy.ndarray
        The data you want to get a description from

    Returns
    -------
    dict
        nan_xy:
            Indices of all null points, as returned by `matrix.nan_indices`
        nan_n: int
            Total number of null values in the dataset
        pmissing_n: float
            Fraction of the cells in the dataset that are null

    Notes
    -----
    TODO: not implemented yet - a `verbose` switch (short/long form) and the
    keys `missingness`, `pmissing_rows`, `pmissing_cols`, `nan_rows`,
    `nan_cols`, `mean_rows`, `mean_cols` and `std_dev`.

    """
    nan_xy = matrix.nan_indices(data)
    nan_n = len(nan_xy)
    # `data.size` is the number of cells; guard the empty dataset so the
    # division cannot fail.
    pmissing_n = float(nan_n) / data.size if data.size else 0.
    description = {"nan_xy": nan_xy, "nan_n": nan_n, "pmissing_n": pmissing_n}
    return description
Example #4
0
def em(data, loops=50):
    """ Imputes given data using expectation maximization.

    E-step: Estimates the mean and standard deviation of the column from its
    current non-null values.
    M-step: Redraws the missing value from a normal distribution with those
    parameters.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The missing cells are overwritten in this array.
    loops: int
        Maximum number of em iterations to run for each missing value.

    Returns
    -------
    numpy.ndarray
        Imputed data (the array that was passed in).

    """
    nan_xy = matrix.nan_indices(data)
    for x_i, y_i in nan_xy:
        # For a numpy.ndarray this basic slice is a view, so writing to
        # `col[x_i]` updates `data[x_i, y_i]` directly.
        col = data[:, int(y_i)]
        observed = col[~np.isnan(col)]
        col[x_i] = np.random.normal(loc=observed.mean(), scale=observed.std())
        previous = 1
        for i in range(loops):
            # Expectation
            current = col[~np.isnan(col)]
            mu = current.mean()
            std = current.std()
            # Maximization
            col[x_i] = np.random.normal(loc=mu, scale=std)
            # Break out of loop if the value doesn't change at least 10%
            # (in either direction, hence `abs`) and the loop index is past 5
            delta = (col[x_i] - previous) / previous
            if i > 5 and abs(delta) < 0.1:
                break
            previous = col[x_i]
    return data
Example #5
0
def mean(data):
    """ Substitute missing values with the mean of that column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The missing cells are overwritten in this array.

    Returns
    -------
    numpy.ndarray
        Imputed data (the array that was passed in).

    """
    nan_xy = matrix.nan_indices(data)
    col_means = {}  # column index -> mean of that column's observed values
    for x_i, y_i in nan_xy:
        if y_i not in col_means:
            # Computed once per column, on first encounter, so the mean is
            # taken over observed values only and never over earlier fills.
            col = data[:, y_i]
            col_means[y_i] = np.mean(col[~np.isnan(col)])
        data[x_i][y_i] = col_means[y_i]
    return data
Example #6
0
def random(data):
    """ Replace every missing value with one of the distinct non-null values
    found in the same column, chosen at random.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The missing cells are overwritten in this array.

    Returns
    -------
    numpy.ndarray
        Imputed data (the array that was passed in).

    """
    for row, col in matrix.nan_indices(data):
        column = data[:, col]
        # Distinct observed values; each one is equally likely to be drawn
        candidates = np.unique(column[~np.isnan(column)])
        data[row][col] = np.random.choice(candidates)
    return data
Example #7
0
def median(data):
    """ Replace every missing value with the median (middle value) of the
    non-null entries of its column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The missing cells are overwritten in this array.

    Returns
    -------
    numpy.ndarray
        Imputed data (the array that was passed in).

    """
    missing = matrix.nan_indices(data)

    def observed(col_idx):
        # Non-null entries of one column
        column = data[:, col_idx]
        return column[~np.isnan(column)]

    # One median per affected column, worked out before anything is filled in
    col_medians = {str(col_idx): np.median(observed(col_idx))
                   for col_idx in set(missing.T[1])}
    for row_idx, col_idx in missing:
        data[row_idx][col_idx] = col_medians[str(col_idx)]
    return data
Example #8
0
def test_missing_values_present():
    """ The module-level dataset must contain at least one missing value."""
    missing = matrix.nan_indices(data)
    assert missing.size != 0
Example #9
0
def moving_window(data,
                  nindex=None,
                  wsize=5,
                  errors="coerce",
                  func=np.mean,
                  inplace=False):
    """ Interpolate the missing values based on nearby values.

    For example, with an array like this:

        array([[-1.24940, -1.38673, -0.03214945,  0.08255145, -0.007415],
               [ 2.14662,  0.32758 , -0.82601414,  1.78124027,  0.873998],
               [-0.41400, -0.977629,         nan, -1.39255344,  1.680435],
               [ 0.40975,  1.067599,  0.29152388, -1.70160145, -0.565226],
               [-0.54592, -1.126187,  2.04004377,  0.16664863, -0.010677]])

    Using a `k` or window size of 3. The one missing value would be set
    to -1.18509122. The window operates on the horizontal axis.

    Usage
    -----

    The parameters default the function to a moving mean. You may want to change
    the default window size:

        moving_window(data, wsize=10)

    To only look at past data (null value is at the rightmost index in the window):

        moving_window(data, nindex=-1)

    To use a custom function:

        moving_window(data, func=np.median)

    You can also do something like take 1.5x the max of previous values in the window:

        moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute.
    nindex: int
        Null index. Index of the null value inside the moving average window.
        Use cases: Say you wanted to make value skewed toward the left or right
        side. 0 would only take the average of values from the right and -1
        would only take the average of values from the left. Negative values
        count from the right end of the window, like Python indices.
    wsize: int
        Window size. Size of the moving average window/area of values being used
        for each local imputation. This number includes the missing value.
    errors: {"raise", "coerce", "ignore"}
        Errors will occur with the indexing of the windows - for example if there
        is a nan at data[x][0] and `nindex` is set to -1 or there is a nan at
        data[x][-1] and `nindex` is set to 0. `"raise"` will re-raise an error
        thrown by `func`, `"coerce"` will try again using an nindex set to the
        middle and `"ignore"` (not implemented yet) will just leave it as a nan.
    func: callable
        Aggregate applied to the non-null values of the window; its result
        replaces the missing value.
    inplace: {True, False}
        Whether to return a copy or run on the passed-in array

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Raises
    ------
    NotImplementedError
        If `errors` is `"ignore"`.
    AssertionError
        If `wsize` is even while `nindex` is unset, or `nindex` does not fit
        inside the window.

    """
    if errors == "ignore":
        raise NotImplementedError(
            "`errors` value `ignore` not implemented yet. Sorry!")

    if not inplace:
        data = data.copy()

    if nindex is None:  # If using equal window side lengths
        assert wsize % 2 == 1, "The parameter `wsize` should not be even "\
        "if the value `nindex` is not set since it defaults to the midpoint "\
        "and an even `wsize` makes the midpoint ambiguous"
        wside_left = wside_right = wsize // 2
    else:  # If using custom window side lengths
        assert -wsize <= nindex < wsize, \
            "The null index must be smaller than the window size"
        if nindex < 0:
            # Python-style negative index: -1 is the rightmost window slot
            nindex += wsize
        wside_left = nindex
        wside_right = wsize - nindex - 1

    # The window slides along a row, so it is bounded by the number of
    # columns (not the number of rows).
    n_cols = data.shape[1]

    def observed_window(x_i, y_i, left, right):
        # Non-null values of row `x_i` within `left`/`right` cells of `y_i`
        lo = max(0, y_i - left)
        hi = min(n_cols, y_i + right + 1)
        window = data[x_i, lo:hi]
        return window[~np.isnan(window)]

    # Repeat full passes until a pass fills nothing: values imputed in one
    # pass can make previously empty windows usable in the next.
    while True:
        nan_xy = matrix.nan_indices(data)
        n_nan_prev = len(nan_xy)
        for x_i, y_i in nan_xy:
            window_not_null = observed_window(x_i, y_i,
                                              wside_left, wside_right)

            if len(window_not_null) > 0:
                try:
                    data[x_i][y_i] = func(window_not_null)
                    continue
                except Exception:
                    if errors == "raise":
                        raise

            if errors == "coerce":
                # If either the window has a length of 0 or the aggregate function fails somehow,
                # do a fallback of just trying the best we can by using it as the middle and trying
                # to recalculate. The symmetric window is used only for this specific
                # problematic value.
                half = wsize // 2
                window_not_null = observed_window(x_i, y_i, half, half)
                try:
                    data[x_i][y_i] = func(window_not_null)
                except Exception as e:
                    print("Exception:", e)
        if n_nan_prev == len(matrix.nan_indices(data)):
            break

    return data
Example #10
0
def fast_knn(data,
             k=3,
             eps=0,
             p=2,
             distance_upper_bound=np.inf,
             leafsize=10,
             idw_fn=idw.shepards,
             init_impute_fn=mean):
    """ Impute using a variant of the nearest neighbours approach

    Basic idea: Impute array with a passed in initial impute fn (mean impute)
    and then use the resulting complete array to construct a KDTree. Use this
    KDTree to compute nearest neighbours.  After finding `k` nearest
    neighbours, take the weighted average of them. Basically, find the nearest
    row in terms of distance

    This approach is much, much faster than the other implementation (fit+transform
    for each subset) which is almost prohibitively expensive.

    Parameters
    ----------
    data: numpy.ndarray
        2D matrix to impute. The missing cells are overwritten in this array.

    k: int, optional
        Parameter used for method querying the KDTree class object. Number of
        neighbours used in the KNN query. Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).

    eps: nonnegative float, optional
        Parameter used for method querying the KDTree class object. From the
        SciPy docs: "Return approximate nearest neighbors; the kth returned
        value is guaranteed to be no further than (1+eps) times the distance to
        the real kth nearest neighbor". Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).

    p : float, 1<=p<=infinity, optional
        Parameter used for method querying the KDTree class object. Straight from the
        SciPy docs: "Which Minkowski p-norm to use. 1 is the
        sum-of-absolute-values Manhattan distance 2 is the usual Euclidean
        distance infinity is the maximum-coordinate-difference distance". Refer to
        the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).

    distance_upper_bound : nonnegative float, optional
        Parameter used for method querying the KDTree class object. Straight
        from the SciPy docs: "Return only neighbors within this distance. This
        is used to prune tree searches, so if you are doing a series of
        nearest-neighbor queries, it may help to supply the distance to the
        nearest neighbor of the most recent point." Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).

    leafsize: int, optional
        Parameter used for construction of the `KDTree` class object. Straight from
        the SciPy docs: "The number of points at which the algorithm switches
        over to brute-force. Has to be positive". Refer to the docs for
        [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
        for more information.

    idw_fn: fn, optional
        Function that takes one argument, a list of distances, and returns weighted percentages. You can define a custom
        one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` which can be using
        functools.partial, for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`

    init_impute_fn: fn, optional
        Function that takes the data matrix and returns a complete (no nan)
        matrix of the same shape. Its result is used to build the KDTree and
        as the source of neighbour values. It is called on a copy of `data`.

    Returns
    -------
    numpy.ndarray
        Imputed data (the array that was passed in).

    Examples
    --------

        >>> data = np.arange(25).reshape((5, 5)).astype(float)
        >>> data[0][2] =  np.nan
        >>> data
        array([[ 0.,  1., nan,  3.,  4.],
               [ 5.,  6.,  7.,  8.,  9.],
               [10., 11., 12., 13., 14.],
               [15., 16., 17., 18., 19.],
               [20., 21., 22., 23., 24.]])
        >> fast_knn(data, k=1) # Weighted average (by distance) of nearest 1 neighbour
        array([[ 0.,  1.,  7.,  3.,  4.],
               [ 5.,  6.,  7.,  8.,  9.],
               [10., 11., 12., 13., 14.],
               [15., 16., 17., 18., 19.],
               [20., 21., 22., 23., 24.]])
        >> fast_knn(data, k=2) # Weighted average of nearest 2 neighbours
        array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
               [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
               [10.        , 11.        , 12.        , 13.        , 14.        ],
               [15.        , 16.        , 17.        , 18.        , 19.        ],
               [20.        , 21.        , 22.        , 23.        , 24.        ]])
        >> fast_knn(data, k=3)
        array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
               [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
               [10.        , 11.        , 12.        , 13.        , 14.        ],
               [15.        , 16.        , 17.        , 18.        , 19.        ],
               [20.        , 21.        , 22.        , 23.        , 24.        ]])
        >> fast_knn(data, k=5) # There are at most only 4 neighbours. Raises error
        ...
        IndexError: index 5 is out of bounds for axis 0 with size 5

    """
    nan_xy = matrix.nan_indices(data)
    # Impute a copy: the initial impute function may fill its argument in
    # place, which would make `data_c` an alias of `data`. The writes to
    # `data` below would then alter the reference matrix the KDTree was built
    # from and that neighbour values are read from.
    data_c = init_impute_fn(data.copy())
    kdtree = KDTree(data_c, leafsize=leafsize)

    for x_i, y_i in nan_xy:
        distances, indices = kdtree.query(
            data_c[x_i],
            k=k + 1,
            eps=eps,
            p=p,
            distance_upper_bound=distance_upper_bound)
        # The query row is part of the tree, so it is expected to come back
        # as its own nearest neighbour in the first index. Delete it.
        distances, indices = distances[1:], indices[1:]
        # Add small constant to distances to avoid division by 0
        distances += 1e-3
        weights = idw_fn(distances)
        # Assign missing value the weighted average of `k` nearest neighbours
        data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
    return data
Example #11
0
def buck_iterative(data):
    """ Iterative variant of buck's method

    - Variable to regress on is chosen at random.
    - EM type regression loop stops after change in prediction from
      previous prediction < 10% for all missing values. There is no iteration
      cap, so the loop runs until every missing value meets that criterion.

    A Method of Estimation of Missing Values in Multivariate Data Suitable for
    use with an Electronic Computer S. F. Buck Journal of the Royal Statistical
    Society. Series B (Methodological) Vol. 22, No. 2 (1960), pp. 302-306

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute. The missing cells are overwritten in this array.

    Returns
    -------
    numpy.ndarray
        Imputed data (the array that was passed in).

    """
    # (row, column) positions of the missing cells, as plain ints
    nan_xy = [(int(x_i), int(y_i)) for x_i, y_i in matrix.nan_indices(data)]
    cols_missing = list({y_i for _, y_i in nan_xy})

    # Step 1: Simple Imputation, these are just placeholders
    for x_i, y_i in nan_xy:
        col = data[:, y_i]
        data[x_i][y_i] = np.mean(col[~np.isnan(col)])

    # Step 5: Repeat step 2 - 4 until every missing value has converged
    converged = [False] * len(nan_xy)
    while not all(converged):
        # Step 2: Placeholders are set back to missing for one variable/column
        dependent_col = int(np.random.choice(cols_missing))
        missing_xs = {x_i for x_i, y_i in nan_xy if y_i == dependent_col}

        # Step 3: Perform linear regression using the other variables,
        # training only on rows where the dependent column was observed
        x_train, y_train = [], []
        for x_i in range(len(data)):
            if x_i in missing_xs:
                continue
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Step 4: Missing values for the missing variable/column are replaced
        # with predictions from our new linear regression model
        for i, (x_i, y_i) in enumerate(nan_xy):
            if y_i != dependent_col:
                continue
            previous = data[x_i, y_i]
            # Row 'x' without the dependent column
            features = np.delete(data[x_i], dependent_col)
            # `predict` returns an array; take the single prediction out as a
            # scalar so a plain number is stored in the cell and in `converged`
            new_value = float(np.ravel(model.predict([features]))[0])
            data[x_i][y_i] = new_value
            # Relative change; 0.01 stands in for a zero previous value so the
            # division is defined
            denominator = 0.01 if previous == 0.0 else previous
            delta = (new_value - previous) / denominator
            converged[i] = bool(abs(delta) < 0.1)
    return data