def encode(fitted_encoder: BaseEstimator, features: List[str], df: pd.DataFrame) -> pd.DataFrame:
    """
    Pipeline function to encode data with fitted sklearn OneHotEncoder.

    Parameters
    ----------
    fitted_encoder : sklearn OneHotEncoder
        Already-fitted encoder. It must provide ``transform`` and either
        ``get_feature_names_out`` or the legacy ``get_feature_names``.
    features : list
        Names of the categorical columns to encode.
    df : pd.DataFrame
        Data we want to encode. It is left untouched; a copy is modified.

    Returns
    -------
    output : pd.DataFrame
        Copy of ``df`` in which the ``features`` columns are replaced by the
        encoded columns.
    """
    output = df.copy()

    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back for older encoders.
    name_getter = getattr(fitted_encoder, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = fitted_encoder.get_feature_names
    encoded_features = list(name_getter(features))

    encoded = fitted_encoder.transform(output[features])
    # OneHotEncoder returns a SciPy sparse matrix by default, which cannot be
    # assigned to DataFrame columns directly; densify it first.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    output[encoded_features] = encoded
    return output.drop(columns=features)
def hook(self, model: BaseEstimator, history: History):
    """
    Save an image comparing the stored datapoints with their reconstruction.

    The datapoints in ``self.datapoints`` are passed through
    ``model.transform`` and back through ``model.inverse_transform``; the
    result is rendered by ``visualize_reconstruction`` and written into
    ``self.outdir``.

    Parameters
    ----------
    model : BaseEstimator
        Model providing ``transform`` and ``inverse_transform``.
    history : History
        Object whose ``last()`` result is indexed at positions 1 and 2 to
        build the file name (presumably epoch and batch, judging by the
        ``e``/``b`` suffixes in the name -- confirm against ``History``).
    """
    noimages = self.datapoints.shape[0]
    latvar = model.transform(self.datapoints)
    # ``latvar`` is the output of ``transform`` (data), not a callable: pass
    # it as-is. The previous ``latvar()`` raised TypeError.
    konstrukt = model.inverse_transform(latvar)
    last = history.last()
    filename = 'rekonstrukt-%02de-%03db.png' % (last[1], last[2])
    img = visualize_reconstruction(self.datapoints, konstrukt, noimages)
    img.save(join(self.outdir, filename))
# _fit_step: fit one pipeline step, or reuse a previously stored copy of it,
# and return ``(transformer, ids, X)``.
#
# What the statements below do, in the order they appear:
#   * set ``transformer.train_`` to ``tuple(X.index)`` and
#     ``transformer.features_`` to ``tuple(X.columns)`` (per the inline
#     comment, to make the transformer unique for each CV split);
#   * call ``self._load(transformer, ids)``; when the first returned value is
#     not None, it and the returned ids replace ``transformer`` and ``ids``;
#   * call ``transformer.fit(X, y, **fit_params)`` only when nothing was loaded;
#   * for a non-final step, additionally call ``transformer.transform(X)``.
#     A tuple result is unpacked into ``X, y``; otherwise its shape is compared
#     with ``X.shape`` and X is replaced using
#     ``X.iloc[:, transformer.get_support()]`` or ``pd.DataFrame(Xnp)``;
#   * call ``self._save(transformer, ids)`` when the transformer was not
#     loaded, storing the result in ``ids``.
#
# NOTE(review): this definition is collapsed onto one physical line, so the
# nesting cannot be read from indentation (and everything after the first
# ``#`` is currently parsed as a comment). In particular it is ambiguous
# whether the last ``else: X = pd.DataFrame(Xnp)`` pairs with
# ``if Xnp.shape != X.shape`` or with ``if isinstance(X, pd.DataFrame)``;
# the two readings behave differently. Restore the original formatting
# before changing any logic here.
# NOTE(review): ``isinstance(transformed_data, Tuple)`` presumably uses
# ``typing.Tuple``; the builtin ``tuple`` is the conventional spelling --
# confirm the import before changing it.
def _fit_step(self, transformer: BaseEstimator, ids: Tuple, is_final: bool, X: pd.DataFrame, y: Iterable = None, **fit_params): # make transformer unique for each CV split transformer.train_ = tuple(X.index) transformer.features_ = tuple(X.columns) # load transformer from database transformer_loaded, ids_loaded = self._load(transformer, ids) is_loaded = False if transformer_loaded is None else True if is_loaded: transformer = transformer_loaded ids = ids_loaded # fit final step if is_final: if not is_loaded: transformer.fit(X, y, **fit_params) # fit intermediate steps else: if not is_loaded: transformer.fit(X, y, **fit_params) transformed_data = transformer.transform(X) if isinstance(transformed_data, Tuple): X, y = transformed_data else: Xnp = transformed_data # reshape input data if Xnp.shape != X.shape: if isinstance(X, pd.DataFrame): X = X.iloc[:, transformer.get_support()] else: X = pd.DataFrame(Xnp) # save transformer if not is_loaded: ids = self._save(transformer, ids) return transformer, ids, X
def pca_transform(
    ds: Dataset,
    est: BaseEstimator,
    *,
    variable: str = "call_alternate_allele_count",
    check_missing: bool = True,
    merge: bool = True,
) -> Dataset:
    """Project the data in ``ds`` with an already-fitted PCA estimator.

    The values obtained from ``_allele_counts(ds, variable, ...)`` are
    converted with ``da.asarray``, transposed and passed to ``est.transform``.
    The projection is stored under ``variables.sample_pca_projection`` with
    dimensions ``("samples", "components")``.

    Parameters
    ----------
    ds
        Input dataset.
    est
        Estimator exposing ``transform``.
    variable
        Name forwarded to ``_allele_counts`` to select the input variable.
    check_missing
        Forwarded to ``_allele_counts``.
    merge
        Forwarded to ``conditional_merge_datasets`` together with ``ds`` and
        the validated result.

    Returns
    -------
    Dataset
        Whatever ``conditional_merge_datasets`` returns for the inputs above.
    """
    allele_counts = _allele_counts(ds, variable, check_missing=check_missing)
    transposed = da.asarray(allele_counts).T
    projected = est.transform(transposed)

    data_vars = {
        variables.sample_pca_projection: (("samples", "components"), projected),
    }
    validated = variables.validate(Dataset(data_vars))
    return conditional_merge_datasets(ds, validated, merge)
def _determine_offset(model: BaseEstimator, X: Union[np.ndarray, pd.DataFrame]) -> int:
    """
    Measure how much shorter the model's output is than its input.

    Parameters
    ----------
    model: sklearn.base.BaseEstimator
        Trained model exposing ``predict`` and/or ``transform``. ``predict``
        is used whenever the model has it; ``transform`` is the fallback.
    X: Union[np.ndarray, pd.DataFrame]
        Data handed to the chosen method.

    Returns
    -------
    int
        ``len(X)`` minus the length of the model's output.
    """
    if hasattr(model, "predict"):
        produced = model.predict(X)
    else:
        produced = model.transform(X)
    return len(X) - len(produced)
def scale(fitted_scaler: BaseEstimator, features: List[str], df: pd.DataFrame) -> pd.DataFrame:
    """
    Pipeline function that normalizes selected columns with a fitted scaler.

    Parameters
    ----------
    fitted_scaler : sklearn Scaler
        Already-fitted scaler; only its ``transform`` method is used.
    features : list
        Names of the continuous columns to normalize.
    df : pd.DataFrame
        Data we want to normalize. It is left untouched; a copy is returned.

    Returns
    -------
    output : pd.DataFrame
        Copy of ``df`` whose ``features`` columns hold the scaler's output;
        all other columns are unchanged.
    """
    result = df.copy()
    selected = result[features]
    normalized = fitted_scaler.transform(selected)
    result[features] = normalized
    return result
def scaled_linspace(x: np.ndarray, y: np.ndarray, num: int, scaler: BaseEstimator) -> np.ndarray:
    """Generate a linspace that is evenly spaced in the scaler's normalized space.

    Both points are normalized with ``scaler.transform``, interpolated
    linearly, and mapped back with ``scaler.inverse_transform``.

    Args:
        x (np.ndarray): First point
        y (np.ndarray): Second point
        num (int): Number of intervals; ``num + 1`` points are produced,
            with both end points included
        scaler (BaseEstimator): Scaler providing ``transform`` and
            ``inverse_transform``

    Returns:
        np.ndarray: Sequence of points, evenly spaced in normalized space
    """
    # Each point is wrapped in a list so the scaler sees a single-row batch.
    start, stop = (scaler.transform([point])[0] for point in (x, y))

    normalized_steps = np.linspace(start, stop, num=num + 1, endpoint=True)

    # Map the interpolated points back to the original space.
    return scaler.inverse_transform(normalized_steps)