Code Example #1
def mode_imputation(data):
    """ Substitute missing values with the mode of that column(most frequent).

    In the case that there is a tie (there are multiple, most frequent values)
    for a column randomly pick one of them.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    null_xy = find_null(data)
    modes = []
    for y_i in range(np.shape(data)[1]):
        unique_counts = np.unique(data[:, [y_i]], return_counts=True)
        max_count = np.max(unique_counts[1])
        mode_y = [
            unique for unique, count in np.transpose(unique_counts)
            if count == max_count and not np.isnan(unique)
        ]
        modes.append(mode_y)  # Modes of this column (ties keep several candidates)
    for x_i, y_i in null_xy:
        data[x_i][y_i] = np.random.choice(modes[y_i])
    return data
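
Every snippet on this page leans on a find_null helper from the impyute
project's utilities. A minimal sketch of what it presumably does (an
assumption based on how the snippets index its result), plus a quick usage
of mode_imputation:

import numpy as np

def find_null(data):
    # Assumed helper: (row, col) indices of every NaN entry
    return np.argwhere(np.isnan(data))

data = np.array([[1.0, 2.0],
                 [1.0, np.nan],
                 [3.0, 2.0]])
print(mode_imputation(data))  # the NaN becomes 2.0, that column's mode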
Code Example #2
File: imputations.py  Project: realchief/TRCMimpute
def from_before_observation(data, axis=0):

    if not checks(data):
        raise Exception("Checks failed")

    transposed = False
    if axis == 0:
        data = np.transpose(data)
        transposed = True
    elif axis != 1:
        raise Exception("Axis must be 0 or 1")

    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        # Simplest scenario: carry the value from one row back
        if x_i > 0:
            data[x_i][y_i] = data[x_i - 1][y_i]
        # First row is NaN: look forward for the first non-NaN value
        else:
            x_residuals = np.shape(data)[0] - x_i - 1  # n data points left
            val_found = False
            for i in range(1, x_residuals + 1):
                if not np.isnan(data[x_i + i][y_i]):
                    val_found = True
                    break
            if val_found:
                # Back-fill the leading NaNs with the first observed value
                for x_nan in range(i):
                    data[x_i + x_nan][y_i] = data[x_i + i][y_i]
            else:
                raise Exception("Entire column is NaN")
    return np.transpose(data) if transposed else data
Code Example #3
def describe(data):
    """ Return summary statistics about the missing values in the data.

    Parameters
    ----------
    data: numpy.ndarray
        The data you want a description of.
    verbose: boolean (optional, not yet implemented)
        Decides whether the description is short or long form.

    Returns
    -------
    dict
        null_xy: list of tuples
            Indices of all null points
        null_n: int
            Total number of null values
        pmissing_n: float
            Percentage of missing values in the dataset

    """
    # TODO: planned but not yet computed: missingness (confidence interval of
    # the data being MCAR, MAR or MNAR), pmissing_rows, pmissing_cols,
    # null_rows, null_cols, mean_rows, mean_cols, std_dev, min_max
    null_xy = find_null(data)
    null_n = len(null_xy)
    pmissing_n = float(null_n) / len(data.flatten())
    description = {
        "null_xy": null_xy,
        "null_n": null_n,
        "pmissing_n": pmissing_n
    }
    return description
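
A quick usage sketch, assuming the find_null stub from the first example:

data = np.array([[1.0, np.nan],
                 [3.0, 4.0]])
print(describe(data))
# e.g. {'null_xy': array([[0, 1]]), 'null_n': 1, 'pmissing_n': 0.25}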
Code Example #4
File: imputations.py  Project: realchief/TRCMimpute
def em_algorithm(data, loops=50, dtype="cont"):

    if not checks(data):
        raise Exception("Checks failed")
    if dtype == "cont":
        null_xy = find_null(data)
        for x_i, y_i in null_xy:
            col = data[:, int(y_i)]
            mu = col[~np.isnan(col)].mean()
            std = col[~np.isnan(col)].std()
            col[x_i] = random.gauss(mu, std)
            previous = 1
            for i in range(loops):
                # Expectation: re-estimate the column's mean and std
                mu = col[~np.isnan(col)].mean()
                std = col[~np.isnan(col)].std()
                # Maximization: redraw the missing value from the new estimate
                col[x_i] = random.gauss(mu, std)
                # Stop once the value changes by less than 10% after at
                # least 5 iterations
                delta = abs((col[x_i] - previous) / previous)
                if i > 5 and delta < 0.1:
                    data[x_i][y_i] = col[x_i]
                    break
                data[x_i][y_i] = col[x_i]
                previous = col[x_i]
        return data
    else:
        raise Exception("Other dtypes not supported yet.")
Code Example #5
def count_missing(data):
    """ Calculate the total percentage of missing values and also the
    percentage in each column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to compute missing-value percentages for.

    Returns
    -------
    dict
        Percentage of missing values in total and in each column.

    """
    size = len(data.flatten())
    null_xy = find_null(data)
    counter = {y: 0. for y in np.unique(null_xy.T[1])}
    change_in_percentage = 1. / size
    for _, y in null_xy:
        counter[y] += change_in_percentage
    total_missing = len(null_xy) / size
    counter["total"] = total_missing

    return counter
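
Usage, again with the assumed find_null stub:

data = np.array([[1.0, np.nan],
                 [np.nan, 4.0]])
print(count_missing(data))
# e.g. {0: 0.25, 1: 0.25, 'total': 0.5} -- keys are the affected columns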
Code Example #6
def arima(data, p, d, q):
    """Autoregressive Integrated Moving Average Imputation

    Parameters
    ----------
    data: numpy.ndarray
        The matrix with missing values that you want to impute
    p: int
        Number of autoregressive terms
    d: int
        Number of nonseasonal differences needed for stationarity
    q: int
        Number of lagged forecast errors in the prediction equation
    Returns
    -------
    numpy.ndarray
    """
    # Verify inputs
    if not checks(data):
        raise Exception("Checks failed")
    if not isinstance(data, np.ndarray):
        raise Exception("Data must be a numpy.ndarray")
    try:
        p = int(p)
        d = int(d)
        q = int(q)
    except ValueError:
        raise Exception("p, d and q must be castable to int")
    # The ARIMA fit itself is not implemented yet; this only reports the
    # indices of the missing values
    null_xy = find_null(data)
    for x, y in null_xy:
        print(x, y)
    return data
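
Since the snippet above only locates the nulls, here is a minimal sketch of
one way a single column could actually be filled, assuming statsmodels is
available (arima_fill_column is a hypothetical helper, not part of the
project):

import numpy as np
from statsmodels.tsa.arima.model import ARIMA

def arima_fill_column(series, p, d, q):
    """Fill NaNs in a 1-D series with in-sample ARIMA predictions."""
    series = np.asarray(series, dtype=float)
    # The state-space ARIMA implementation treats NaNs as missing
    # observations, so it can be fit on the gappy series directly
    fitted = ARIMA(series, order=(p, d, q)).fit()
    predictions = fitted.predict(start=0, end=len(series) - 1)
    null_idx = np.argwhere(np.isnan(series)).flatten()
    series[null_idx] = predictions[null_idx]
    return series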
Code Example #7
def arima(data, p, d, q):
    """Autoregressive Integrated Moving Average Imputation

    Stationary model

    Parameters
    ----------
    data: numpy.ndarray
        The matrix with missing values that you want to impute
    p: int
        Number of autoregressive terms. Ex (p,d,q)=(1,0,0). 
    d: int
        Number of nonseasonal differences needed for stationarity
    q: int
        Number of lagged forecast errors in the prediction equation
    Returns
    -------
    numpy.ndarray
    """
    def _compute_nan_endpoints(x, y):
        # Stub: not yet implemented
        pass

    try:
        p = int(p)
        d = int(d)
        q = int(q)
    except ValueError:
        raise Exception("p, d and q must be castable to int")
    if not isinstance(data, np.ndarray):
        raise Exception("Data must be a numpy.ndarray")
    # The ARIMA fit itself is not implemented yet; this only reports the
    # indices of the missing values
    null_xy = find_null(data)
    for x, y in null_xy:
        print(x, y)
    return data
Code Example #8
File: imputations.py  Project: realchief/TRCMimpute
def random_imputation(data):

    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)
    for x, y in null_xy:
        uniques = np.unique(data[:, y])
        uniques = uniques[~np.isnan(uniques)]
        data[x][y] = np.random.choice(uniques)
    return data
Code Example #9
File: locf.py  Project: AutoDataPlatform/impyute
def locf(data, axis=0):
    """ Last Observation Carried Forward

    For each set of missing indices, use the value of one row before (same
    column). In the case that the missing value is the first row, look one
    row ahead instead. If this next row is also NaN, look to the next row.
    Repeat until you find a row in this column that's not NaN. All the rows
    before will be filled with this value.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: int (optional)
        0 if time series is in row format (Ex. data[0][:] is 1st data point).
        1 if time series is in col format (Ex. data[:][0] is 1st data point).

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if not checks(data):
        raise Exception("Checks failed")

    transposed = False
    if axis == 0:
        data = np.transpose(data)
        transposed = True
    elif axis != 1:
        raise Exception("Axis must be 0 or 1")

    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        # Simplest scenario, look one row back
        if x_i > 0:
            data[x_i][y_i] = data[x_i - 1][y_i]
        # First row is NaN: look forward for the first non-NaN value
        else:
            x_residuals = np.shape(data)[0] - x_i - 1  # n datapoints left
            val_found = False
            for i in range(1, x_residuals + 1):
                if not np.isnan(data[x_i + i][y_i]):
                    val_found = True
                    break
            if val_found:
                # pylint: disable=undefined-loop-variable
                for x_nan in range(i):
                    data[x_i + x_nan][y_i] = data[x_i + i][y_i]
            else:
                raise Exception("Entire column is NaN")
    return np.transpose(data) if transposed else data
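
A quick usage sketch (find_null as before, plus a pass-through stub for the
project's checks helper):

def checks(data):
    # Assumed stub: the project's input-validation helper
    return True

series = np.array([[np.nan, 2.0, np.nan, 4.0]])  # one series in row format
print(locf(series, axis=0))  # [[2. 2. 2. 4.]]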
Code Example #10
def em(data, loops=50, dtype="cont"):
    """ Imputes given data using expectation maximization.

    E-step: Calculates the expected complete data log likelihood ratio.
    M-step: Finds the parameters that maximize the log likelihood of the
    complete data.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    loops: int
        Number of em iterations to run before breaking.
    dtype: ("cont","disc")
        Indicates whether the possible values will come from a continuous
        range or categorical range.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if not checks(data):
        raise Exception("Checks failed")
    if dtype == "cont":
        null_xy = find_null(data)
        for x_i, y_i in null_xy:
            col = data[:, int(y_i)]
            mu = col[~np.isnan(col)].mean()
            std = col[~np.isnan(col)].std()
            col[x_i] = random.gauss(mu, std)
            previous = 1
            for i in range(loops):
                # Expectation
                mu = col[~np.isnan(col)].mean()
                std = col[~np.isnan(col)].std()
                # Maximization
                col[x_i] = random.gauss(mu, std)
                # Break out of the loop if the imputed value changes by less
                # than 10% and the loop has run at least 5 times
                delta = abs((col[x_i] - previous) / previous)
                if i > 5 and delta < 0.1:
                    data[x_i][y_i] = col[x_i]
                    break
                data[x_i][y_i] = col[x_i]
                previous = col[x_i]
        return data
    else:
        raise Exception("Other dtypes not supported yet.")
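
Usage sketch (stubs as before; random is Python's standard-library module):

import random

data = np.array([[1.0, 2.0],
                 [1.2, np.nan],
                 [0.9, 2.1],
                 [1.1, 1.9]])
print(em(data))  # the NaN becomes a draw that settled near the column mean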
Code Example #11
def mean_imputation(data):
    """ Substitute missing values with the mean of that column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col_wo_nan)
        data[x_i][y_i] = new_value
    return data
Code Example #12
File: random_imputation.py  Project: akshi8/impyute
def random_imputation(data):
    """ Fill missing values in with a randomly selected value from the same
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    null_xy = find_null(data)
    for x, y in null_xy:
        uniques = np.unique(data[:, y])
        uniques = uniques[~np.isnan(uniques)]
        data[x][y] = np.random.choice(uniques)
    return data
Code Example #13
def median_imputation(data):
    """ Substitute missing values with the median of that column(middle).

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    null_xy = find_null(data)
    cols_missing = set(null_xy.T[1])
    medians = {}
    for y_i in cols_missing:
        col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        median_y = np.median(col_wo_nan)
        medians[str(y_i)] = median_y
    for x_i, y_i in null_xy:
        data[x_i][y_i] = medians[str(y_i)]
    return data
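
Usage follows the same pattern as the mean version above:

data = np.array([[1.0, 10.0],
                 [2.0, np.nan],
                 [3.0, 30.0]])
print(median_imputation(data))  # the NaN becomes 20.0, the column median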
Code Example #14
File: mice.py  Project: AutoDataPlatform/impyute
def mice(data):
    """Multivariate Imputation by Chained Equations

    Reference:
        Buuren, S. V., & Groothuis-Oudshoorn, K. (2011). Mice: Multivariate
        Imputation by Chained Equations in R. Journal of Statistical Software,
        45(3). doi:10.18637/jss.v045.i03

    Implementation follows the main idea from the paper above. Differs in
    decision of which variable to regress on (here, I choose it at random).
    Also differs in stopping criterion (here the model stops after change in
    prediction from previous prediction is less than 10%).

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)

    # Add a column of zeros to the index values
    null_xyv = np.append(null_xy, np.zeros((np.shape(null_xy)[0], 1)), axis=1)

    null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
    temp = []
    cols_missing = set([y for _, y, _ in null_xyv])

    # Step 1: Simple Imputation, these are just placeholders
    for x_i, y_i, value in null_xyv:
        # Column containing nan value without the nan value
        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]

        new_value = np.mean(col)
        data[x_i][y_i] = new_value
        temp.append([x_i, y_i, new_value])
    null_xyv = temp

    # Step 5: Repeat steps 2 - 4 until convergence
    converged = [False] * len(null_xyv)
    while not all(converged):
        # Step 2: Pick one variable/column whose placeholders will be
        # re-estimated this round
        dependent_col = int(np.random.choice(list(cols_missing)))
        missing_xs = [int(x) for x, y, value in null_xyv if y == dependent_col]

        # Step 3: Fit a linear regression on the rows that are not missing
        # in the chosen column, using the other variables as features
        x_train, y_train = [], []
        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Step 4: Missing values in the chosen column are replaced with
        # predictions from the fitted linear regression model
        temp = []
        for i, (x_i, y_i, value) in enumerate(null_xyv):
            if y_i == dependent_col:
                # Predict from the rest of the row (sklearn expects 2-D input)
                row_wo_dep = np.delete(data[x_i], dependent_col).reshape(1, -1)
                new_value = model.predict(row_wo_dep)[0]
                data[x_i][y_i] = new_value
                temp.append([x_i, y_i, new_value])
                # Converged once the prediction changes by less than 10%
                if value != 0 and abs((new_value - value) / value) < 0.1:
                    converged[i] = True
            else:
                # Keep entries for the other columns untouched
                temp.append([x_i, y_i, value])
        null_xyv = temp
    return data
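
A quick usage sketch. The model is presumably scikit-learn's
LinearRegression (an assumption based on the fit/predict calls); find_null
and checks are stubbed as before:

from sklearn.linear_model import LinearRegression

data = np.array([[1.0, 2.0],
                 [2.0, np.nan],
                 [3.0, 6.0],
                 [4.0, 8.0]])
print(mice(data))  # the NaN is regressed from column 0 and lands near 4.0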
Code Example #15
File: checks.py  Project: akshi8/impyute
def _nan_exists(data):
    """ True if there is at least one np.nan in the array"""
    null_xy = find_null(data)
    return len(null_xy) > 0
Code Example #16
def test_missing_values_present(self):
    """ Check that the dataset is corrupted (missing values present)"""
    self.assertTrue(find_null(self.data).size != 0)