def to_monoid(self, v):
    X, y = v
    score_func = self._hyperparams["score_func"]
    n_samples_seen_ = count(X)
    feature_names_in_ = get_columns(X)
    lifted_score_ = score_func.to_monoid((X, y))
    return _SelectKBestMonoid(
        n_samples_seen_=n_samples_seen_,
        feature_names_in_=feature_names_in_,
        lifted_score_=lifted_score_,
    )
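# --- Hedged illustration (not part of the library) ---------------------------
# Why the score is lifted to a monoid: per-batch summaries can be merged
# associatively, so feature-selection statistics can be accumulated over data
# that arrives in batches. The names below (_SumMonoid, combine, total, n) are
# hypothetical stand-ins for that pattern, not the actual lifted-score API.
class _SumMonoid:
    def __init__(self, total, n):
        self.total = total  # running sum of a per-feature statistic
        self.n = n          # number of samples that contributed to it

    def combine(self, other):
        # Associative merge of two batch summaries.
        return _SumMonoid(self.total + other.total, self.n + other.n)

_merged = _SumMonoid(total=10.0, n=4).combine(_SumMonoid(total=6.0, n=2))
print(_merged.total, _merged.n)  # 16.0 6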
def to_monoid(self, v):
    X, y = v
    if self._hyperparams["cols"] is None:
        # Default to all object (string) columns when no columns were specified.
        self._hyperparams["cols"] = get_obj_cols(X)
    cols = self._hyperparams["cols"]
    N = self._hyperparams["n_components"]
    # Hashed columns are named col_0..col_{N-1}; non-hashed columns keep their names.
    feature_names_cat = [f"col_{i}" for i in range(N)]
    feature_names_num = [col for col in get_columns(X) if col not in cols]
    feature_names = feature_names_cat + feature_names_num  # type: ignore
    n_samples_seen_ = count(X)
    return _HashingEncoderMonoid(
        n_samples_seen_=n_samples_seen_, feature_names=feature_names
    )
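# --- Hedged illustration (not part of the library) ---------------------------
# What the feature-name construction above produces for a toy batch:
# n_components hashed columns named col_0..col_{N-1}, followed by the columns
# that were not hashed. The column names and values below are made up.
_cols_hashed = ["city"]                    # stand-in for hyperparams["cols"]
_all_columns = ["city", "age", "income"]   # stand-in for get_columns(X)
_N = 4                                     # stand-in for n_components
_feature_names = [f"col_{i}" for i in range(_N)] + [
    c for c in _all_columns if c not in _cols_hashed
]
print(_feature_names)  # ['col_0', 'col_1', 'col_2', 'col_3', 'age', 'income']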
def to_monoid(self, v):
    X, _ = v
    n_samples_seen_ = count(X)
    feature_names_in_ = get_columns(X)
    # Collect the distinct values of every column in a single aggregation pass.
    agg_op = Aggregate(columns={c: collect_set(it[c]) for c in feature_names_in_})
    agg_data = agg_op.transform(X)
    agg_data = _ensure_pandas(agg_data)
    # One sorted array of categories per input column.
    categories_ = [np.sort(agg_data.loc[0, c]) for c in feature_names_in_]
    return _OneHotEncoderMonoid(
        n_samples_seen_=n_samples_seen_,
        feature_names_in_=feature_names_in_,
        categories_=categories_,
    )
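# --- Hedged illustration (not part of the library) ---------------------------
# For a plain pandas batch, the categories_ computed above (sorted distinct
# values per column) reduces to the snippet below; the Aggregate/collect_set
# path additionally handles Spark DataFrames. Toy column names and data only.
import numpy as np
import pandas as pd

_X_toy = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "S"]})
_categories_toy = [np.sort(_X_toy[c].unique()) for c in _X_toy.columns]
print(_categories_toy)
# [array(['blue', 'red'], dtype=object), array(['M', 'S'], dtype=object)]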
def to_monoid(self, v):
    hyperparams = self._hyperparams
    X, _ = v
    n_samples_seen_ = count(X)
    feature_names_in_ = get_columns(X)
    if hyperparams["categories"] == "auto":
        # Infer categories from the data: collect the distinct values of every
        # column in a single aggregation, then sort them per column.
        agg_op = Aggregate(
            columns={c: collect_set(it[c]) for c in feature_names_in_}
        )
        agg_data = agg_op.transform(X)
        if lale.helpers._is_spark_df(agg_data):
            agg_data = agg_data.toPandas()
        categories_ = [np.sort(agg_data.loc[0, c]) for c in feature_names_in_]
    else:
        # User-specified categories are taken as-is.
        categories_ = hyperparams["categories"]
    return _OrdinalEncoderMonoid(
        n_samples_seen_=n_samples_seen_,
        feature_names_in_=feature_names_in_,
        categories_=categories_,
    )
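# --- Hedged illustration (not part of the library) ---------------------------
# With categories="auto", the result matches what scikit-learn's OrdinalEncoder
# reports after fitting on the same (pandas) batch: one sorted array of
# distinct values per column. Toy data only.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

_X_toy = pd.DataFrame({"size": ["M", "S", "M", "L"]})
print(OrdinalEncoder().fit(_X_toy).categories_)
# [array(['L', 'M', 'S'], dtype=object)]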
def to_monoid(self, v) -> _MinMaxScalerMonoid:
    X, _ = v
    # Compute the per-column min and max in a single aggregation pass.
    agg = {f"{c}_min": agg_min(it[c]) for c in get_columns(X)}
    agg.update({f"{c}_max": agg_max(it[c]) for c in get_columns(X)})
    aggregate = Aggregate(columns=agg)
    data_min_max = aggregate.transform(X)
    if _is_spark_df(X):
        data_min_max = data_min_max.toPandas()
    # Unpack the aggregated row into the data_min_ and data_max_ vectors.
    n = len(get_columns(X))
    data_min_ = np.zeros(shape=(n))
    data_max_ = np.zeros(shape=(n))
    for i, c in enumerate(get_columns(X)):
        data_min_[i] = data_min_max[f"{c}_min"]
        data_max_[i] = data_min_max[f"{c}_max"]
    n_samples_seen_ = count(X)
    feature_names_in_ = get_columns(X)
    return _MinMaxScalerMonoid(
        data_min_=data_min_,
        data_max_=data_max_,
        n_samples_seen_=n_samples_seen_,
        feature_names_in_=feature_names_in_,
    )
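# --- Hedged illustration (not part of the library) ---------------------------
# How the lifted statistics are typically applied at transform time: the
# standard min-max formula, shown here on toy numbers. The actual transform
# (including feature_range handling) lives elsewhere in the operator.
import numpy as np

_data_min_toy = np.array([0.0, 10.0])
_data_max_toy = np.array([4.0, 30.0])
_X_toy = np.array([[2.0, 20.0], [4.0, 10.0]])
_X_scaled = (_X_toy - _data_min_toy) / (_data_max_toy - _data_min_toy)
print(_X_scaled)  # [[0.5 0.5]
                  #  [1.  0. ]]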