Beispiel #1
0
    def fit(self, X, y):
        """Build the bayesian linear regression model used by the imputer.

        Only the model definition is constructed and stored here; this
        method does not itself draw any posterior samples.

        Args:
            X (pd.Dataframe): predictors used to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        _not_num_series(self.strategy, y)
        n_features = len(X.columns)

        # Bayesian linear regression: one normal prior for the intercept, one
        # normal prior per predictor (treated as independent), and a
        # half-Cauchy prior for the noise scale. The default prior values
        # assume scaled and centered data; convergence can struggle or fail
        # otherwise unless suitable priors are supplied separately.
        with pm.Model() as bayes_model:
            intercept = pm.Normal("alpha", self.am, sd=self.asd)
            coefs = pm.Normal(
                "beta", self.bm, sd=self.bsd, shape=n_features
            )
            noise = pm.HalfCauchy("σ", self.sig)
            expected = intercept + coefs.dot(X.T)
            # the observed node registers itself with the model context
            pm.Normal("score", expected, sd=noise, observed=y)

        self.statistics_ = dict(param=bayes_model, strategy=self.strategy)
        return self
Beispiel #2
0
    def impute(self, X):
        """Perform imputations using the statistics generated from fit.

        Missing values are replaced with random draws, not with the fitted
        value itself. A center is first drawn from a normal distribution
        located at the fitted parameter with scale sqrt(1 / n_observed).
        All missing entries are then drawn jointly from a multivariate
        normal around that center, with covariance
        ones((m, m)) / n_observed + eye(m), so the imputed values are
        correlated with one another.

        Args:
            X (pd.Series): Dataset to impute missing data from fit.

        Returns:
            pd.Series -- copy of X with the missing entries filled in. When
            X has no missing values the copy is returned unchanged.

        Raises:
            ZeroDivisionError: if X has missing values but no observed ones.
        """
        check_is_fitted(self, "statistics_")
        _not_num_series(self.strategy, X)
        omu = self.statistics_["param"]  # value stored by fit (observed mean)
        idx = X.isnull()  # mask of missing entries
        out = X.copy()

        # vectorised count instead of iterating the Series with builtin sum;
        # cast to int so the arithmetic below keeps plain-Python semantics
        m = int(idx.sum())  # number to impute
        if m == 0:
            # nothing to impute: avoid building a zero-dimensional
            # multivariate normal and hand back the untouched copy
            return out
        nO = len(idx) - m  # number of observed

        muhatk = stats.norm(omu, np.sqrt(1 / nO))
        # imputation cross-terms are *NOT* uncorrelated: every draw shares
        # the uncertainty of the sampled center (the ones / nO term)
        Ymi = stats.multivariate_normal(
            np.ones(m) * muhatk.rvs(),
            np.ones((m, m)) / nO + np.eye(m)).rvs()
        out[idx] = Ymi
        return out
    def fit(self, X, y):
        """Train the underlying linear model on the supplied data.

        The fitted model lives on ``self.lm``; ``statistics_`` only records
        the strategy name.

        Args:
            X (pd.Dataframe): predictors used to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        _not_num_series(self.strategy, y)
        self.lm.fit(X, y)
        self.statistics_ = dict(strategy=self.strategy)
        return self
Beispiel #4
0
    def fit(self, X, y=None):
        """Compute the median of X and store it for the impute step.

        Args:
            X (pd.Series): Dataset to fit the imputer.
            y (None): ignored, None to meet requirements of base class

        Returns:
            self. Instance of the class.
        """
        _not_num_series(self.strategy, X)
        self.statistics_ = dict(param=X.median(), strategy=self.strategy)
        return self
Beispiel #5
0
    def fit(self, X, y=None):
        """Compute the mean and standard deviation of X for later draws.

        The two moments are stored as a ``(mean, std)`` tuple under the
        ``"param"`` key of ``statistics_``.

        Args:
            X (pd.Series): Dataset to fit the imputer.
            y (None): ignored, None to meet requirements of base class

        Returns:
            self. Instance of the class.
        """
        _not_num_series(self.strategy, X)

        # location and scale describing feature X
        center, spread = X.mean(), X.std()
        self.statistics_ = dict(
            param=(center, spread), strategy=self.strategy
        )
        return self
Beispiel #6
0
    def impute(self, X):
        """Return the value computed during fit as the imputation.

        The value stored under ``statistics_["param"]`` is returned as is;
        X is only validated, not modified.

        Args:
            X (pd.Series): Dataset to impute missing data from fit.

        Returns:
            float -- the stored statistic used to fill missing values.
        """
        # the imputer must be fitted and X must pass the series check
        check_is_fitted(self, "statistics_")
        _not_num_series(self.strategy, X)
        return self.statistics_["param"]
    def fit(self, X, y):
        """Fit the linear model and record its in-sample error.

        After fitting, the model predicts on the same observed data and the
        mean squared error of those predictions is stored under
        ``statistics_["param"]``. The impute phase uses that MSE to create
        the normal error distribution from which the imputer draws.

        Args:
            X (pd.Dataframe): predictors used to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        _not_num_series(self.strategy, y)
        self.lm.fit(X, y)
        fitted_values = self.lm.predict(X)
        self.statistics_ = dict(
            param=mean_squared_error(y, fitted_values),
            strategy=self.strategy,
        )
        return self
Beispiel #8
0
    def fit(self, X, y):
        """Fit the Imputer to the dataset by fitting bayesian and LS model.

        The least-squares model is fit first. Its predictions on the observed
        data are stored next to the observed response, and its point
        estimates serve as default prior means for the bayesian model when
        ``self.am`` / ``self.bm`` are None.

        The defaults are resolved into local variables rather than written
        back to ``self.am`` / ``self.bm``. Overwriting the hyperparameters
        would make a later fit on different data silently reuse the estimates
        from the first fit as prior means.

        Args:
            X (pd.Dataframe): dataset to fit the imputer.
            y (pd.Series): response, which is eventually imputed.

        Returns:
            self. Instance of the class.
        """
        _not_num_series(self.strategy, y)
        nc = len(X.columns)

        # get predictions for the data, which will be used for "closest" vals
        y_pred = self.lm.fit(X, y).predict(X)
        y_df = DataFrame({"y": y, "y_pred": y_pred})

        # use the point estimates from the linear regression as the means for
        # the priors unless the user supplied their own. This speeds up
        # posterior sampling and helps convergence. Kept local so the
        # user-facing hyperparameters are never mutated by fit.
        alpha_mean = self.lm.intercept_ if self.am is None else self.am
        beta_mean = self.lm.coef_ if self.bm is None else self.bm

        # initialize model for bayesian linear reg. Default vals for priors
        # assume data is scaled and centered. Convergence can struggle or fail
        # if not the case and proper values for the priors are not specified
        # separately, also assumes each beta is normal and "independent"
        # while betas likely not independent, this is technically a rule of OLS
        with pm.Model() as fit_model:
            alpha = pm.Normal("alpha", alpha_mean, sd=self.asd)
            beta = pm.Normal("beta", beta_mean, sd=self.bsd, shape=nc)
            sigma = pm.HalfCauchy("σ", self.sig)
            mu = alpha + beta.dot(X.T)
            # the observed node registers itself with the model context
            pm.Normal("score", mu, sd=sigma, observed=y)
        params = {"model": fit_model, "y_obs": y_df}
        self.statistics_ = {"param": params, "strategy": self.strategy}
        return self
Beispiel #9
0
    def impute(self, X):
        """Draw imputations from the normal distribution defined by fit.

        The impute method builds a normal distribution from the
        ``(mean, std)`` pair stored during fit and takes one random draw per
        missing entry of X. Only the draws are returned; X is not modified.

        Args:
            X (pd.Series): Dataset to impute missing data from fit.

        Returns:
            np.array -- one random draw for each missing value in X.
        """
        # the imputer must be fitted and X must pass the series check
        check_is_fitted(self, "statistics_")
        _not_num_series(self.strategy, X)

        # index labels of the missing entries; only their count is used
        missing_index = X.index[X.isnull()]

        # unpack the stored moments and sample once per missing entry
        loc, scale = self.statistics_["param"]
        sampler = norm(loc, scale)
        return sampler.rvs(size=len(missing_index))