Beispiel #1
0
class ImputerWrapper:
  """Wrap sklearn's ``Imputer`` and add a ``'zero'`` fill strategy.

  With ``strategy='zero'`` every NaN is replaced by ``0.0`` and no underlying
  ``Imputer`` is created. With any other strategy the work is delegated to
  ``Imputer``; before fitting, columns that are entirely NaN are filled with
  ``0.0`` to avoid ``Imputer`` removing the column.

  The zero fill and the all-NaN column fill modify the passed array in place.
  """

  def __init__(self, missing_values='NaN', strategy='zero', axis=0, verbose=0, copy=False):
    # Keep the constructor arguments so get_params() can report them even
    # when there is no underlying Imputer (strategy == 'zero').
    self.missing_values = missing_values
    self.strategy = strategy
    self.axis = axis
    self.verbose = verbose
    self.copy = copy
    self.imputer = None
    if strategy != 'zero':
      self.imputer = Imputer(missing_values, strategy, axis, verbose, copy)

  @staticmethod
  def _fill_nan_with_zero(X):
    """Replace every NaN in ``X`` with ``0.0`` in place and return ``X``."""
    X[numpy.isnan(X)] = 0.0
    return X

  def prepare(self, X):
    """Fill columns of ``X`` that are entirely NaN with ``0.0`` (in place)."""
    all_nan_columns = numpy.isnan(X).all(axis=0)
    for j in numpy.flatnonzero(all_nan_columns):
      logging.info('column %d all nan, filling with 0', j)
      X[:, j] = 0.0

  def fit(self, X, y=None):
    """Fit the underlying imputer; a no-op for the 'zero' strategy.

    Returns:
      This wrapper, to allow call chaining.
    """
    if self.strategy == 'zero':
      return self
    self.prepare(X)
    self.imputer.fit(X, y)
    return self

  def fit_transform(self, X, y=None, **fit_params):
    """Fit on ``X`` and return the imputed data.

    For the 'zero' strategy ``X`` itself is modified and returned.
    """
    if self.strategy == 'zero':
      return self._fill_nan_with_zero(X)
    self.prepare(X)
    return self.imputer.fit_transform(X, y, **fit_params)

  def get_params(self, deep=True):
    """Return the estimator parameters as a dict.

    For the 'zero' strategy the stored constructor arguments are returned
    (instead of ``None``) so that code expecting a parameter mapping keeps
    working; otherwise the underlying imputer's parameters are returned.
    """
    if self.strategy == 'zero':
      return {
          'missing_values': self.missing_values,
          'strategy': self.strategy,
          'axis': self.axis,
          'verbose': self.verbose,
          'copy': self.copy,
      }
    return self.imputer.get_params(deep)

  def set_params(self, **params):
    """Forward ``params`` to the underlying imputer.

    For the 'zero' strategy there is no imputer and the call is ignored.

    Returns:
      This wrapper, to allow call chaining.
    """
    if self.strategy == 'zero':
      return self
    self.imputer.set_params(**params)
    return self

  def transform(self, X):
    """Return ``X`` with missing values imputed.

    For the 'zero' strategy ``X`` itself is modified and returned.
    """
    if self.strategy == 'zero':
      return self._fill_nan_with_zero(X)
    return self.imputer.transform(X)
Beispiel #2
0
def treat_nulls(data_train, data_test, strategy=None):
    """Impute or drop missing values, one feature at a time.

    Both data objects are modified in place through their ``features``
    attribute and are also returned for convenience.

    Args:
        data_train: Object whose ``features`` attribute supports ``dropna``
            and column indexing (presumably a pandas DataFrame -- confirm
            against callers). Imputers are fitted on this data.
        data_test: Same kind of object as ``data_train``. Its columns are
            transformed with the imputer fitted on the training data.
        strategy: Mapping of feature name to ``'average'``, ``'median'``,
            ``'mode'`` or ``'remove'``. ``'remove'`` drops the rows in which
            the feature is missing. Any other value leaves the imputer at
            its default strategy. ``None`` means no feature is treated.

    Returns:
        The tuple ``(data_train, data_test)``.
    """
    if strategy is None:
        strategy = {}

    # Names accepted by Imputer for each strategy name used by callers.
    imputer_strategies = {
        'average': 'mean',
        'median': 'median',
        'mode': 'most_frequent',
    }

    for feature, method in strategy.items():
        if method == 'remove':
            data_train.features.dropna(subset=[feature], inplace=True)
            data_test.features.dropna(subset=[feature], inplace=True)
            continue

        imp = Imputer(missing_values='NaN', axis=0)
        if method in imputer_strategies:
            imp.set_params(strategy=imputer_strategies[method])

        # Select with a list so the imputer receives a 2-D, single-column
        # table; ravel() turns the 2-D result back into a plain column.
        imp.fit(data_train.features[[feature]])
        data_train.features[feature] = imp.transform(
            data_train.features[[feature]]).ravel()
        data_test.features[feature] = imp.transform(
            data_test.features[[feature]]).ravel()

    return data_train, data_test
Beispiel #3
0
class ImputerWrapper:
    """A thin wrapper around ``Imputer`` that also supports zero filling.

    ``strategy='zero'`` replaces every NaN with ``0.0`` without creating an
    ``Imputer``. Any other strategy is delegated to ``Imputer``; before
    fitting, a column that is entirely NaN is filled with ``0.0`` to avoid
    ``Imputer`` removing the column.

    The zero fill and the all-NaN column fill both write into the array that
    is passed in.
    """

    def __init__(self,
                 missing_values='NaN',
                 strategy='zero',
                 axis=0,
                 verbose=0,
                 copy=False):
        # Remember the constructor arguments: with the 'zero' strategy there
        # is no Imputer to ask, yet get_params() still has to report them.
        self.missing_values = missing_values
        self.strategy = strategy
        self.axis = axis
        self.verbose = verbose
        self.copy = copy
        self.imputer = None
        if strategy != 'zero':
            self.imputer = Imputer(missing_values, strategy, axis, verbose,
                                   copy)

    @staticmethod
    def _zero_fill(X):
        """Overwrite each NaN entry of ``X`` with ``0.0`` and return ``X``."""
        nan_mask = numpy.isnan(X)
        X[nan_mask] = 0.0
        return X

    def prepare(self, X):
        """Zero out, in place, every column of ``X`` that is all NaN."""
        column_is_all_nan = numpy.isnan(X).all(axis=0)
        for j in numpy.flatnonzero(column_is_all_nan):
            logging.info('column %d all nan, filling with 0', j)
            X[:, j] = 0.0

    def fit(self, X, y=None):
        """Fit the wrapped imputer; nothing to do for the 'zero' strategy.

        Returns:
            ``self``.
        """
        if self.strategy == 'zero':
            return self
        self.prepare(X)
        self.imputer.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return it with missing values filled.

        With the 'zero' strategy ``X`` is modified in place and returned.
        """
        if self.strategy == 'zero':
            return self._zero_fill(X)
        self.prepare(X)
        return self.imputer.fit_transform(X, y, **fit_params)

    def get_params(self, deep=True):
        """Return the parameters of this estimator as a dict.

        With the 'zero' strategy the stored constructor arguments are
        reported (rather than ``None``), so callers always get a mapping.
        Otherwise the wrapped imputer's parameters are returned.
        """
        if self.strategy == 'zero':
            return {
                'missing_values': self.missing_values,
                'strategy': self.strategy,
                'axis': self.axis,
                'verbose': self.verbose,
                'copy': self.copy,
            }
        return self.imputer.get_params(deep)

    def set_params(self, **params):
        """Pass ``params`` on to the wrapped imputer.

        With the 'zero' strategy there is no imputer, so the call is ignored.

        Returns:
            ``self``.
        """
        if self.strategy == 'zero':
            return self
        self.imputer.set_params(**params)
        return self

    def transform(self, X):
        """Return ``X`` with missing values filled.

        With the 'zero' strategy ``X`` is modified in place and returned.
        """
        if self.strategy == 'zero':
            return self._zero_fill(X)
        return self.imputer.transform(X)
Beispiel #4
0
# In[ ]:

# Print what isnull().sum() reports for data1 and data2 (the null counts).
print(data1.isnull().sum(), data2.isnull().sum())

# We can now use Imputer for Imputing Missing Data

# In[ ]:

from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder
le = LabelEncoder()
# Replace missing 'Embarked' entries with the placeholder '$', then encode
# the column with the label encoder.
x_train['Embarked'] = x_train['Embarked'].fillna('$')
x_train['Embarked'] = le.fit_transform(x_train['Embarked'])
# The same encoder object is refitted on each column in turn, so after this
# cell `le` reflects only the column it was fitted on last.
x_train['Cabin'] = le.fit_transform(x_train['Cabin'])
# NOTE(review): missing_values=8 is presumably the integer code the encoder
# assigned to missing 'Cabin' entries -- confirm against the fitted classes.
imr = Imputer(missing_values=8, strategy='median', axis=0, copy=False)
x_train[['Cabin']] = imr.fit_transform(x_train[['Cabin']])
# Reconfigure the same imputer: NaN entries of 'Age' get the column mean.
imr.set_params(missing_values=np.nan, strategy='mean')
x_train[['Age']] = imr.fit_transform(x_train[['Age']])
# NOTE(review): 3 is presumably the code given to the '$' placeholder in
# 'Embarked' -- confirm. Those entries get the most frequent value.
imr.set_params(missing_values=3, strategy='most_frequent')
x_train[['Embarked']] = imr.fit_transform(x_train[['Embarked']])
# NOTE(review): `ohe` is created here but not used in the visible cells.
ohe = OneHotEncoder(categorical_features=[1])
x_train['Sex'] = le.fit_transform(x_train['Sex'])

print(x_train.head())

# In[ ]:

# Heatmap of the correlation matrix of x_train, annotated with one decimal.
fig, ax1 = plt.subplots(figsize=(10, 10))
sns.heatmap(data=x_train.corr(), annot=True, fmt='.1f', linewidths=.1)

# In[ ]:
Beispiel #5
0

# Fill in missing values with Imputer.
from sklearn.preprocessing import Imputer
a = np.array([[0,  2,  4],
              [3,  2,  5],
              [10,  2,  4]])
# Append a row of missing values. np.vstack returns a new array, so the
# result must be assigned back; np.nan (rather than None) keeps the array
# numeric instead of turning it into an object array.
a = np.vstack((a, [np.nan] * 3))

im = Imputer()
im.fit_transform(a)

# Signature: class sklearn.preprocessing.Imputer(missing_values='NaN',
# strategy='mean', axis=0, verbose=0, copy=True)

im.set_params(strategy="mean")  # mean
im.set_params(strategy="median")  # median
im.set_params(strategy="most_frequent")  # most frequent value
im.fit_transform(a)

# missing_values : integer or "NaN", optional (default="NaN")
# The placeholder for the missing values. All occurrences of
# missing_values will be imputed. For missing values encoded as np.nan,
# use the string value "NaN".

# Whether to impute along rows or along columns.
im.set_params(axis=1)
a = a.T
im.fit_transform(a)

# verbose: controls the level of verbosity.