def inverse_transform(self, y, **transform_params):
    """Map log-transformed sale prices back to the original dollar scale.

    Parameters
    ----------
    y : pd.DataFrame with a "Sale_Price" column holding log prices.

    Returns
    -------
    pd.DataFrame with "Sale_Price" on the original (exponentiated) scale.
    """
    notify.entering(__class__.__name__, "inverse_transform")
    prices = np.exp(y["Sale_Price"].values)
    result = pd.DataFrame(data={"Sale_Price": prices})
    notify.leaving(__class__.__name__, "inverse_transform")
    return result
def fit(self, X, y=None, **fit_params):
    """Learn one-hot encoding categories from X.

    Stores the incoming column names (``original_features_``) so that a
    later transform can map encoded feature names back to their source
    columns, and fits the encoder (``ohe_``).
    """
    notify.entering(__class__.__name__, "fit")
    self.original_features_ = X.columns.tolist()
    # Unknown categories at transform time are ignored rather than raising.
    encoder = OneHotEncoder(handle_unknown="ignore")
    encoder.fit(X)
    self.ohe_ = encoder
    notify.leaving(__class__.__name__, "fit")
    return self
def _transform(self, X, y=None):
    """Clean the feature matrix: fix a data-entry error, drop geo columns,
    and impute remaining missing numeric values with column medians.

    Parameters
    ----------
    X : pd.DataFrame  Raw feature matrix.
    y : ignored.

    Returns
    -------
    pd.DataFrame  Cleaned copy of X (caller's frame is not mutated).
    """
    notify.entering(__class__.__name__, "transform")
    # Fix: operate on a copy — the old chained `inplace=True` replace
    # mutated the caller's DataFrame (and is deprecated in pandas 2.x).
    X = X.copy()
    # 2207 is presumably a recording error for 2007 in the garage
    # year-built column — TODO confirm against the raw data dictionary.
    X["Garage_Yr_Blt"] = X["Garage_Yr_Blt"].replace(to_replace=2207, value=2007)
    X = X.drop(columns=["Latitude", "Longitude"])
    # numeric_only avoids a TypeError on object columns in modern pandas;
    # only numeric columns can be median-imputed anyway.
    X = X.fillna(X.median(numeric_only=True))
    notify.leaving(__class__.__name__, "transform")
    return X
def _transform(self, X, y=None, **transform_params):
    """Encode the nominal features, then standardize the encoded columns.

    Parameters
    ----------
    X : pd.DataFrame  Feature matrix containing ``self._nominal`` columns.
    y : target, passed through to the encoder's fit/transform.

    Returns
    -------
    Encoded and scaled feature matrix.
    """
    notify.entering(__class__.__name__, "transform")
    self._encoder.fit(X, y)
    X = self._encoder.transform(X, y)
    # Scale the encoded nominal features to zero mean / unit variance.
    scaler = StandardScaler()
    X[self._nominal] = scaler.fit_transform(X[self._nominal])
    # Fix: leaving() was previously logged *before* any work was done,
    # making the entry/exit log useless; dead commented-out code removed.
    notify.leaving(__class__.__name__, "transform")
    return X
def transform(self, X, **transform_params):
    """Impute missing categorical values with each column's most frequent level.

    Categorical columns are detected by dtype ("object").
    """
    notify.entering(__class__.__name__, "transform")
    object_cols = list(X.select_dtypes(include=["object"]).columns)
    most_frequent = SimpleImputer(strategy="most_frequent")
    X[object_cols] = most_frequent.fit_transform(X[object_cols])
    notify.leaving(__class__.__name__, "transform")
    return X
def run(self, X, y=None):
    """Derive house-age and garage-age features from the year columns.

    Adds "Age" (years between sale and construction; NaNs imputed with
    the median age) and "Garage_Age" (NaNs imputed with 0).

    Returns
    -------
    (X, y) with X augmented in place and y untouched.
    """
    notify.entering(__class__.__name__, "run")
    X["Age"] = X["Year_Sold"] - X["Year_Built"]
    # Fix: the fillna() result was previously discarded (no inplace, no
    # assignment), so Age NaNs were never actually imputed.
    X["Age"] = X["Age"].fillna(X["Age"].median())
    # Houses with no garage get a garage age of 0.
    X["Garage_Age"] = X["Year_Sold"] - X["Garage_Yr_Blt"]
    X["Garage_Age"] = X["Garage_Age"].fillna(value=0)
    notify.leaving(__class__.__name__, "run")
    return X, y
def transform(self, X, **transform_params):
    """Impute and power-transform the continuous features.

    Missing values are modeled as a linear function of the other
    continuous features; a Yeo-Johnson transform then nudges each
    distribution toward Gaussian.
    """
    notify.entering(__class__.__name__, "transform")
    cols = self._continuous
    X[cols] = IterativeImputer().fit_transform(X[cols])
    yeo = PowerTransformer(method="yeo-johnson", standardize=False)
    X[cols] = yeo.fit_transform(X[cols])
    notify.leaving(__class__.__name__, "transform")
    return X
def transform(self, X, **transform_params):
    """Map ordinal level labels to numeric codes and standardize them.

    Fix: the scaler previously indexed X with the bare name ``ordinal``,
    which raises NameError (or silently picks up an unrelated global);
    ``self._ordinal`` is used here, consistent with the sibling ordinal
    transformer in this module.
    """
    notify.entering(__class__.__name__, "transform")
    # Replace each ordinal level with its configured numeric code.
    for variable, mappings in self._ordinal_map.items():
        for k, v in mappings.items():
            X[variable].replace({k: v}, inplace=True)
    # Scale the now-numeric ordinal columns as continuous features.
    scaler = StandardScaler()
    X[self._ordinal] = scaler.fit_transform(X[self._ordinal])
    notify.leaving(__class__.__name__, "transform")
    return X
def transform(self, X, **transform_params):
    """Impute, then standardize, the discrete features."""
    notify.entering(__class__.__name__, "transform")
    cols = self._discrete
    # Missing discrete values are filled per the configured strategy
    # (SimpleImputer's own default is the mean when none is supplied).
    X[cols] = SimpleImputer(strategy=self._strategy).fit_transform(X[cols])
    # Standardize to zero mean / unit variance.
    X[cols] = StandardScaler().fit_transform(X[cols])
    notify.leaving(__class__.__name__, "transform")
    return X
def run(self, X, y=None, **fit_params):
    """Engineer the Age feature, drop geo columns, and remove outliers.

    Outliers are rows with living area above 4000 sq ft or a garage
    year-built after 2010.

    Returns
    -------
    (X, y) filtered to the same surviving rows.
    """
    notify.entering(__class__.__name__, "run")
    X["Age"] = X["Year_Sold"] - X["Year_Built"]
    # Fix: the fillna() result was previously discarded, leaving NaNs.
    X["Age"] = X["Age"].fillna(X["Age"].median())
    # Remove longitude and latitude.
    X = X.drop(columns=["Latitude", "Longitude"])
    # Fix: the old code passed index *labels* to .iloc (positional
    # indexing), silently selecting the wrong rows whenever the index is
    # not a clean RangeIndex. A boolean mask keeps X and y aligned.
    mask = (X["Gr_Liv_Area"] <= 4000) & (X["Garage_Yr_Blt"] <= 2010)
    X = X[mask]
    y = y[mask]
    notify.leaving(__class__.__name__, "run")
    return X, y
def transform(self, X, **transform_params):
    """Converting nominal variables to one-hot representation.

    Stores the dense encoded array (``X_``), a DataFrame view
    (``X_df_``), and a mapping from each encoded feature name back to
    its source column (``to_original_``). Returns the dense array.
    """
    notify.entering(__class__.__name__, "transform")
    self.X_ = self.ohe_.transform(X).toarray()
    self.transformed_features_ = self.ohe_.get_feature_names(self.original_features_).tolist()
    self.X_df_ = pd.DataFrame(self.X_, columns=self.transformed_features_)
    # Encoded names are "<column>_<level>", so match on that prefix.
    # Fix: the old substring test (`j in i`) could pair a feature with a
    # column whose name merely appears inside another column's name;
    # checking the longest column names first with a prefix test removes
    # that ambiguity.
    by_length = sorted(self.original_features_, key=len, reverse=True)
    self.to_original_ = {}
    for feature in self.transformed_features_:
        for column in by_length:
            if feature.startswith(column + "_") or feature == column:
                self.to_original_[feature] = column
                break
    notify.leaving(__class__.__name__, "transform")
    return self.X_
def _transform(self, X, y=None):
    """Drop one feature from each highly-correlated numeric feature pair.

    For every pair of numeric columns whose Pearson correlation exceeds
    ``self._threshold`` with p-value below ``self._alpha``, the member of
    the pair that is *less* correlated with the target y is removed.

    Side effects: sets ``features_removed_``, ``suspects_`` and ``X_``.
    NOTE(review): when no suspect pairs are found this returns None (bare
    ``return``) while the other path returns ``self.X_`` — callers should
    read ``self.X_`` rather than the return value; verify against callers.
    NOTE(review): the final ``self._fit(self.X_, y)`` looks like a
    recursive re-check on the reduced matrix, but a sibling ``_fit`` in
    this file is a no-op — confirm which class this method belongs to.
    """
    # Log labels say "_fit"/"fit_" although the method is _transform —
    # presumably left over from a rename.
    notify.entering(__class__.__name__, "_fit")
    self.features_removed_ = []
    correlations = pd.DataFrame()
    all_columns = X.columns.tolist()
    # Only consider columns declared numeric for this transformer.
    columns = list(set(X.columns.tolist()).intersection(self._numeric))
    # Perform pairwise correlation coefficient calculations
    for col_a, col_b in itertools.combinations(columns,2):
        r, p = pearsonr(X[col_a], X[col_b])
        cols = col_a + "__" + col_b
        d = {"Columns": cols, "A": col_a, "B": col_b,"Correlation": r, "p-value": p}
        df = pd.DataFrame(data=d, index=[0])
        correlations = pd.concat((correlations, df), axis=0)
    # Now compute correlation between features and target.
    relevance = pd.DataFrame()
    for column in columns:
        r, p = pearsonr(X.loc[:,column], y)
        d = {"Feature": column, "Correlation": r, "p-value": p}
        df = pd.DataFrame(data=d, index=[0])
        relevance = pd.concat((relevance,df), axis=0)
    # Obtain observations above correlation threshold and below alpha
    self.suspects_ = correlations[(correlations["Correlation"] >= self._threshold) & (correlations["p-value"] <= self._alpha)]
    if self.suspects_.shape[0] == 0:
        self.X_ = X
        return
    # Iterate over suspects and determine column to remove based upon
    # correlation with target
    to_remove = []
    for index, row in self.suspects_.iterrows():
        # a and b are 1-element arrays; `a > b` relies on numpy's
        # elementwise comparison yielding a single-element truth value.
        a = np.abs(relevance[relevance["Feature"] == row["A"]]["Correlation"].values)
        b = np.abs(relevance[relevance["Feature"] == row["B"]]["Correlation"].values)
        if a > b:
            to_remove.append(row["B"])
        else:
            to_remove.append(row["A"])
    self.X_ = X.drop(columns=to_remove)
    self.features_removed_ += to_remove
    self._fit(self.X_,y)
    notify.leaving(__class__.__name__, "fit_")
    return self.X_
def _transform(self, X, y=None, **transform_params):
    """Impute, encode, and standardize the ordinal features.

    Missing ordinal values are filled per ``self._strategy``; level
    labels are then mapped to numeric codes via ``self._ordinal_map``
    and the resulting columns scaled to zero mean / unit variance.

    Fix: removed a dead local (a ``select_dtypes`` result that was
    computed but never used).
    """
    notify.entering(__class__.__name__, "transform")
    ordinal_cols = self._ordinal
    # Fill missing ordinal values using the configured strategy.
    imputer = SimpleImputer(strategy=self._strategy)
    X[ordinal_cols] = imputer.fit_transform(X[ordinal_cols])
    # Replace each level label with its numeric ordinal code.
    for variable, mappings in self._ordinal_map.items():
        for k, v in mappings.items():
            X[variable].replace({k: v}, inplace=True)
    # Standardize to zero mean / unit variance.
    scaler = StandardScaler()
    X[ordinal_cols] = scaler.fit_transform(X[ordinal_cols])
    notify.leaving(__class__.__name__, "transform")
    return X
def _transform(self, X, y=None):
    """Drop numeric features that are weakly correlated with the target.

    A feature is removed when |r| <= self._threshold AND its p-value is
    <= self._alpha; everything else is recorded in ``remaining_``.
    Sets ``features_removed_``, ``remaining_`` and ``X_``.
    """
    notify.entering(__class__.__name__, "_fit")
    numeric = self._continuous + self._discrete
    candidates = list(set(X.columns.tolist()).intersection(numeric))
    self.remaining_ = pd.DataFrame()
    self.features_removed_ = []
    for feature in candidates:
        r, p = pearsonr(X[feature], y)
        weak_and_significant = (np.abs(r) <= self._threshold) & (p <= self._alpha)
        if weak_and_significant:
            self.features_removed_.append(feature)
        else:
            record = pd.DataFrame(
                data={"Feature": feature, "Correlation": r, "p-value": p},
                index=[0],
            )
            self.remaining_ = pd.concat((self.remaining_, record), axis=0)
    # Drop the weak features and keep the reduced matrix.
    self.X_ = X.drop(columns=self.features_removed_)
    notify.leaving(__class__.__name__, "_fit")
    return self.X_
def _transform(self, X, y=None):
    """Drop categorical features whose levels show no significant effect on y.

    For each ordinal/nominal column, runs a one-way ANOVA of the
    response grouped by the column's levels; the column is removed when
    the test is not significant at ``self._alpha``. Sets
    ``features_removed_``, ``remaining_`` and ``X_``.

    Fix: the old code called ``f_oneway(X[column], y)``, which treats
    the predictor column and the target as two samples of the same
    variable — not an ANOVA of y across the predictor's levels (and it
    fails outright on string-valued columns). The correct test passes
    one group of y values per level.
    """
    notify.entering(__class__.__name__, "_fit")
    categorical = self._ordinal + self._nominal
    columns = list(set(X.columns.tolist()).intersection(categorical))
    self.remaining_ = pd.DataFrame()
    self.features_removed_ = []
    for column in columns:
        # assumes y is a Series positionally aligned with X — TODO confirm
        groups = [y[X[column] == level] for level in X[column].unique()]
        f, p = f_oneway(*groups)
        if p > self._alpha:
            self.features_removed_.append(column)
        else:
            d = {"Feature": column, "F-statistic": f, "p-value": p}
            df = pd.DataFrame(data=d, index=[0])
            self.remaining_ = pd.concat((self.remaining_, df), axis=0)
    # Drop features
    self.X_ = X.drop(columns=self.features_removed_)
    notify.leaving(__class__.__name__, "_fit")
    return self.X_
def transform(self, X, **transform_params):
    """Apply the fitted encoder to X and return the encoded result.

    Fix: leaving() was previously logged *before* the encoder ran,
    making the entry/exit trace misleading (same defect as the nominal
    transformer elsewhere in this file).
    """
    notify.entering(__class__.__name__, "transform")
    encoded = self._enc.transform(X)
    notify.leaving(__class__.__name__, "transform")
    return encoded
def _fit(self, X, y=None, **fit_params):
    """No-op fit hook: this transformer learns nothing from the data.

    Present so the class satisfies the fit/transform interface used by
    the rest of the pipeline; it only logs entry/exit and returns self.
    """
    notify.entering(__class__.__name__, "fit")
    notify.leaving(__class__.__name__, "fit")
    return self