コード例 #1
0
ファイル: select_k_best.py プロジェクト: hirzel/lale
 def to_monoid(self, v):
     """Fold one batch ``(X, y)`` into a ``_SelectKBestMonoid``.

     Records the row count and input column names, and delegates the
     actual score statistics to the configured ``score_func``'s own
     ``to_monoid``.
     """
     X, y = v
     scorer = self._hyperparams["score_func"]
     return _SelectKBestMonoid(
         n_samples_seen_=count(X),
         feature_names_in_=get_columns(X),
         lifted_score_=scorer.to_monoid((X, y)),
     )
コード例 #2
0
 def to_monoid(self, v):
     """Fold one batch ``(X, y)`` into a ``_HashingEncoderMonoid``.

     NOTE(review): as a side effect, when the ``cols`` hyperparameter is
     unset it is filled in with the object-typed columns detected on this
     batch — presumably so later batches reuse the same column choice;
     confirm callers rely on this before removing it.
     """
     X, y = v
     hp = self._hyperparams
     if hp["cols"] is None:
         hp["cols"] = get_obj_cols(X)
     categorical = hp["cols"]
     n_components = hp["n_components"]
     # Hashed output columns first, then the untouched numeric columns.
     hashed_names = [f"col_{i}" for i in range(n_components)]
     passthrough_names = [c for c in get_columns(X) if c not in categorical]
     all_names = hashed_names + passthrough_names  # type: ignore
     return _HashingEncoderMonoid(n_samples_seen_=count(X),
                                  feature_names=all_names)
コード例 #3
0
 def to_monoid(self, v):
     """Fold one batch ``(X, _)`` into a ``_OneHotEncoderMonoid``.

     Aggregates the distinct values of every column (via ``collect_set``)
     and stores them sorted, as the per-column category lists.
     """
     X, _ = v
     columns = get_columns(X)
     distinct = Aggregate(
         columns={c: collect_set(it[c]) for c in columns})
     sets_df = _ensure_pandas(distinct.transform(X))
     # The aggregate result has one row; cell (0, c) holds column c's value set.
     cats = [np.sort(sets_df.loc[0, c]) for c in columns]
     return _OneHotEncoderMonoid(
         n_samples_seen_=count(X),
         feature_names_in_=columns,
         categories_=cats,
     )
コード例 #4
0
ファイル: ordinal_encoder.py プロジェクト: hirzel/lale
 def to_monoid(self, v):
     """Fold one batch ``(X, _)`` into an ``_OrdinalEncoderMonoid``.

     With ``categories="auto"`` the distinct values of every column are
     collected and stored sorted; otherwise the user-supplied category
     lists are passed through unchanged.
     """
     X, _ = v
     columns = get_columns(X)
     if self._hyperparams["categories"] == "auto":
         distinct = Aggregate(
             columns={c: collect_set(it[c]) for c in columns})
         sets_df = distinct.transform(X)
         if lale.helpers._is_spark_df(sets_df):
             sets_df = sets_df.toPandas()
         # One-row result; cell (0, c) holds column c's distinct values.
         cats = [np.sort(sets_df.loc[0, c]) for c in columns]
     else:
         cats = self._hyperparams["categories"]
     return _OrdinalEncoderMonoid(
         n_samples_seen_=count(X),
         feature_names_in_=columns,
         categories_=cats,
     )
コード例 #5
0
ファイル: min_max_scaler.py プロジェクト: hirzel/lale
 def to_monoid(self, v) -> _MinMaxScalerMonoid:
     """Fold one batch ``(X, _)`` into a ``_MinMaxScalerMonoid``.

     Computes the per-column min and max in a single relational aggregate
     and unpacks the one-row result into two float arrays.

     Fixes over the previous version:
     - ``get_columns(X)`` is computed once instead of four times;
     - dropped the redundant ``np.array(data_min_)`` / ``np.array(data_max_)``
       re-copies — both are already float64 ndarrays from ``np.zeros``.
     """
     X, _ = v
     feature_names_in_ = get_columns(X)
     # One aggregate pass produces both statistics for every column.
     agg = {f"{c}_min": agg_min(it[c]) for c in feature_names_in_}
     agg.update({f"{c}_max": agg_max(it[c]) for c in feature_names_in_})
     data_min_max = Aggregate(columns=agg).transform(X)
     if _is_spark_df(X):
         data_min_max = data_min_max.toPandas()
     n = len(feature_names_in_)
     data_min_ = np.zeros(shape=(n,))
     data_max_ = np.zeros(shape=(n,))
     for i, c in enumerate(feature_names_in_):
         data_min_[i] = data_min_max[f"{c}_min"]
         data_max_[i] = data_min_max[f"{c}_max"]
     return _MinMaxScalerMonoid(
         data_min_=data_min_,
         data_max_=data_max_,
         n_samples_seen_=count(X),
         feature_names_in_=feature_names_in_,
     )