Example #1
0
class Data(object):
    """Preprocess a feature table and its target for model fitting.

    On construction the features are copied, missing values are imputed,
    object-dtype (categorical) columns are one-hot encoded, the rows are
    optionally shuffled together with the target, and finally the features
    are standardised with ``StandardScaler``.

    Attributes set by the constructor:
        X: the standardised features, as returned by
            ``StandardScaler().fit_transform`` (no longer a DataFrame).
        y: the target, shuffled alongside ``X`` when ``shuffle`` is true.
        shape_before: shape of the feature table before any preprocessing.
        col_was_null: labels of the columns that contained missing values.
        del_columns: labels of the object-dtype columns that were replaced
            by their one-hot encoding.
    """

    def __init__(self, X, y, shuffle=True, random_state=None):
        """Build the preprocessed data set.

        Args:
            X: features, either a ``numpy.ndarray`` or a ``pandas.DataFrame``.
                The caller's object is never modified.
            y: target values. pandas objects are deep-copied; anything else
                is wrapped in a ``pandas.Series``.
            shuffle: when true, shuffle the rows of ``X`` and ``y`` together.
            random_state: forwarded to the shuffle helper.

        Raises:
            ValueError: if ``X`` is neither an ndarray nor a DataFrame.
        """
        # Validate *before* touching X: calling ``X.copy(deep=True)`` on an
        # unsupported container (e.g. a list) would otherwise fail with an
        # unrelated TypeError/AttributeError instead of this message.
        if isinstance(X, np.ndarray):
            self.X = pd.DataFrame(X)
        elif isinstance(X, pd.DataFrame):
            self.X = X.copy(deep=True)
        else:
            raise ValueError('%s is not supported' % type(X))

        # The target is handled independently of X so that a DataFrame X can
        # be combined with an ndarray/list y (and the other way round).
        if isinstance(y, (pd.Series, pd.DataFrame)):
            self.y = y.copy(deep=True)
        else:
            self.y = pd.Series(y)

        self.shape_before = self.X.shape

        self.__imputer()

        self.__encoder()

        if shuffle:
            self.X, self.y = sk_shuffle(self.X,
                                        self.y,
                                        random_state=random_state)

        self.X = StandardScaler().fit_transform(self.X)

    def __imputer(self):
        """Fill missing values column by column.

        Object columns are filled with their most frequent value, integer
        columns with their median and every other column with its mean.
        The labels of the columns that had missing values are recorded in
        ``self.col_was_null`` before filling.
        """
        fill_values = []
        for name in self.X:
            column = self.X[name]
            if column.dtype == np.dtype('O'):
                # value_counts() sorts by frequency, so the first index
                # entry is the most common observed value.
                fill_values.append(column.value_counts().index[0])
            elif pd.api.types.is_integer_dtype(column.dtype):
                # Width-independent check: np.dtype('int') is int32 or
                # int64 depending on the platform.
                fill_values.append(column.median())
            else:
                fill_values.append(column.mean())
        fill = pd.Series(fill_values, index=self.X.columns)

        self.col_was_null = [
            c for c in self.X if pd.isnull(self.X[c]).sum() > 0
        ]
        self.X = self.X.fillna(fill)

    def __encoder(self):
        """Replace every object-dtype column with one-hot indicator columns.

        Each new column is named ``<original label>_<class>``. The indicator
        columns are appended after the remaining columns, in the order of
        the columns they replace. Labels of the replaced columns are stored
        in ``self.del_columns``.
        """
        self.del_columns = []
        kept_positions = []
        encoded_frames = []

        # Work positionally (iloc) so integer column labels, as produced by
        # pd.DataFrame(ndarray), are never confused with positions.
        for position, name in enumerate(self.X.columns):
            column = self.X.iloc[:, position]
            if column.dtype != np.dtype('O'):
                kept_positions.append(position)
                continue

            enc = LabelEncoder()
            codes = enc.fit_transform(column)
            onehot = OneHotEncoder().fit_transform(
                codes.reshape(-1, 1)).toarray()
            # format() instead of ``+`` so non-string class labels do not
            # raise TypeError when building the column names.
            col_names = [
                '{}_{}'.format(name, cls) for cls in enc.classes_
            ]
            encoded_frames.append(
                pd.DataFrame(onehot, columns=col_names, index=self.X.index))
            self.del_columns.append(name)

        if encoded_frames:
            # Single concat at the end instead of rebuilding the frame once
            # per categorical column.
            self.X = pd.concat(
                [self.X.iloc[:, kept_positions]] + encoded_frames, axis=1)