def to_monoid(self, batch) -> _DIorSPDData:
    if len(batch) == 2:
        X, y_pred = batch
        y_true = None
    else:
        y_true, y_pred, X = batch
    assert y_pred is not None and X is not None, batch
    y_pred = self._y_pred_series(y_true, y_pred, X)
    encoded_X, y_pred = self.prot_attr_enc.transform(X, y_pred)
    df = pd.concat([encoded_X, y_pred], axis=1)
    pa_names = self.privileged_groups[0].keys()
    pipeline = GroupBy(
        by=[it[pa] for pa in pa_names] + [it[y_pred.name]]
    ) >> Aggregate(columns={"count": count(it[y_pred.name])})
    agg_df = pipeline.transform(df)

    def count2(priv, fav):
        row = (priv,) * len(pa_names) + (fav,)
        return agg_df.at[row, "count"] if row in agg_df.index else 0

    return _DIorSPDData(
        priv0_fav0=count2(priv=0, fav=0),
        priv0_fav1=count2(priv=0, fav=1),
        priv1_fav0=count2(priv=1, fav=0),
        priv1_fav1=count2(priv=1, fav=1),
    )
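# A minimal sketch (not part of the original code) of how the four counts in
# _DIorSPDData are typically turned into the disparate-impact ratio: the rate
# of favorable outcomes in the unprivileged group divided by the rate in the
# privileged group. The helper name below is hypothetical.
def _disparate_impact_sketch(d) -> float:
    unpriv_rate = d.priv0_fav1 / (d.priv0_fav0 + d.priv0_fav1)  # P(fav | unprivileged)
    priv_rate = d.priv1_fav1 / (d.priv1_fav0 + d.priv1_fav1)    # P(fav | privileged)
    return unpriv_rate / priv_rate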
def _lift(X, hyperparams):
    feature_names_in = X.columns
    count_op = Aggregate(columns={"count": count(it[feature_names_in[0]])})
    count_data = lale.helpers._ensure_pandas(count_op.transform(X))
    n_samples_seen = count_data.loc[0, "count"]
    if hyperparams["with_mean"] or hyperparams["with_std"]:
        sum1_op = Aggregate(columns={c: agg_sum(it[c]) for c in feature_names_in})
        sum1_data = lale.helpers._ensure_pandas(sum1_op.transform(X))
        sum1 = [sum1_data.loc[0, c] for c in feature_names_in]
    else:
        sum1 = None
    if hyperparams["with_std"]:
        sum2_op = Map(
            columns={c: it[c] * it[c] for c in feature_names_in}
        ) >> Aggregate(columns={c: agg_sum(it[c]) for c in feature_names_in})
        sum2_data = lale.helpers._ensure_pandas(sum2_op.transform(X))
        sum2 = [sum2_data.loc[0, c] for c in feature_names_in]
    else:
        sum2 = None
    return feature_names_in, n_samples_seen, sum1, sum2
def to_monoid(self, v):
    X, _ = v
    hyperparams = self._hyperparams
    feature_names_in = get_columns(X)
    count_op = Aggregate(columns={"count": count(it[feature_names_in[0]])})
    count_data = lale.helpers._ensure_pandas(count_op.transform(X))
    n_samples_seen = count_data.loc[0, "count"]
    if hyperparams["with_mean"] or hyperparams["with_std"]:
        sum1_op = Aggregate(columns={c: agg_sum(it[c]) for c in feature_names_in})
        sum1_data = lale.helpers._ensure_pandas(sum1_op.transform(X))
        sum1 = [sum1_data.loc[0, c] for c in feature_names_in]
    else:
        sum1 = None
    if hyperparams["with_std"]:
        sum2_op = Map(
            columns={c: it[c] * it[c] for c in feature_names_in}
        ) >> Aggregate(columns={c: agg_sum(it[c]) for c in feature_names_in})
        sum2_data = lale.helpers._ensure_pandas(sum2_op.transform(X))
        sum2 = [sum2_data.loc[0, c] for c in feature_names_in]
    else:
        sum2 = None
    return _StandardScalerMonoid(
        feature_names_in_=feature_names_in,
        n_samples_seen_=n_samples_seen,
        _sum1=sum1,
        _sum2=sum2,
    )
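# A minimal sketch, assuming the monoid fields above: given n_samples_seen_,
# per-column sums (_sum1), and sums of squares (_sum2), the mean and variance
# of each column can be recovered without revisiting the data, which is what
# makes batched/incremental fitting possible. The helper name is hypothetical.
def _mean_and_var_sketch(n, sum1, sum2):
    means = [s / n for s in sum1]
    variances = [s2 / n - m * m for s2, m in zip(sum2, means)]  # E[x^2] - E[x]^2
    return means, variances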
def test_fit_error(self):
    relational = Relational(
        operator=(Scan(table=it.main) & Scan(table=it.delay))
        >> Join(pred=[
            it.main.TrainId == it.delay.TrainId,
            it.main["Arrival time"] >= it.delay.TimeStamp,
        ])
        >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId)
    )
    with self.assertRaises(ValueError):
        _ = relational.fit([self.X_train], self.y_train)
def test_fit_transform(self):
    relational = Relational(
        operator=(Scan(table=it.main) & Scan(table=it.delay))
        >> Join(pred=[
            it.main.TrainId == it.delay.TrainId,
            it.main["Arrival time"] >= it.delay.TimeStamp,
        ])
        >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId)
    )
    trained_relational = relational.fit(self.X_train, self.y_train)
    _ = trained_relational.transform(self.X_test)
def test_fit_transform_in_pipeline(self):
    relational = Relational(
        operator=(Scan(table=it.main) & Scan(table=it.delay))
        >> Join(pred=[
            it.main.TrainId == it.delay.TrainId,
            it.main["Arrival time"] >= it.delay.TimeStamp,
        ])
        >> Aggregate(columns=[count(it.Delay)], group_by=it.MessageId)
    )
    pipeline = relational >> LogisticRegression()
    trained_pipeline = pipeline.fit(self.X_train, self.y_train)
    _ = trained_pipeline.predict(self.X_test)
def __init__(self):
    from lale.lib.rasl.concat_features import ConcatFeatures

    self._pipeline_suffix = (
        ConcatFeatures
        >> Map(columns={"match": astype("int", it.y_true == it.y_pred)})  # type: ignore
        >> Aggregate(columns={"match": sum(it.match), "total": count(it.match)})
    )
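# A hedged sketch of the lifted accuracy computation this pipeline suffix sets
# up: after Aggregate produces one row with "match" and "total", accuracy is
# their ratio, and two partial results combine by adding the two fields. The
# dataclass and helper below are illustrative, not the library's API.
from dataclasses import dataclass

@dataclass
class _AccuracyCounts:
    match: int
    total: int

    def combine(self, other):
        # monoid operation: pointwise addition of the counts
        return _AccuracyCounts(self.match + other.match, self.total + other.total)

def _accuracy_sketch(c: _AccuracyCounts) -> float:
    return c.match / c.total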
def __init__(self):
    from lale.lib.rasl.concat_features import ConcatFeatures

    self._pipeline_suffix = (
        ConcatFeatures
        >> Map(columns={
            "y": it.y_true,  # observed values
            "f": it.y_pred,  # predicted values
            "y2": it.y_true * it.y_true,  # squares
            "e2": (it.y_true - it.y_pred) * (it.y_true - it.y_pred),  # type: ignore
        })
        >> Aggregate(columns={
            "n": count(it.y),
            "sum": sum(it.y),
            "sum_sq": sum(it.y2),
            "res_sum_sq": sum(it.e2),  # residual sum of squares
        })
    )
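# A minimal sketch of how the aggregates above yield R^2: with n, sum, sum_sq,
# and res_sum_sq, the total sum of squares is sum_sq - sum^2 / n, and
# R^2 = 1 - RSS / TSS. The helper name is hypothetical.
def _r2_sketch(n, sum_, sum_sq, res_sum_sq) -> float:
    tss = sum_sq - (sum_ * sum_) / n  # total sum of squares around the mean
    return 1.0 - res_sum_sq / tss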
def to_monoid(self, batch) -> _AODorEODData:
    if len(batch) == 2:
        X, y_pred = batch
        y_true = None
    else:
        y_true, y_pred, X = batch
    assert y_pred is not None and X is not None, batch
    y_pred = self._y_pred_series(y_true, y_pred, X)
    encoded_X, y_pred = self.prot_attr_enc.transform(X, y_pred)

    def is_fresh(col_name):
        assert y_true is not None and isinstance(y_true, pd.Series), batch
        return col_name not in encoded_X.columns and col_name != y_true.name

    if is_fresh("y_pred"):
        y_pred_name = "y_pred"
    else:
        y_pred_name = next(
            f"y_pred_{i}" for i in itertools.count(0) if is_fresh(f"y_pred_{i}")
        )
    y_pred = pd.Series(y_pred, y_pred.index, name=y_pred_name)
    _, y_true = self.prot_attr_enc.transform(X, y_true)
    df = pd.concat([y_true, y_pred, encoded_X], axis=1)
    pa_names = self.privileged_groups[0].keys()
    pipeline = GroupBy(
        by=[it[y_true.name], it[y_pred_name]] + [it[pa] for pa in pa_names]
    ) >> Aggregate(columns={"count": count(it[y_pred.name])})
    agg_df = pipeline.transform(df)

    def count3(tru, pred, priv):
        row = (tru, pred) + (priv,) * len(pa_names)
        return agg_df.at[row, "count"] if row in agg_df.index else 0

    return _AODorEODData(
        tru0_pred0_priv0=count3(tru=0, pred=0, priv=0),
        tru0_pred0_priv1=count3(tru=0, pred=0, priv=1),
        tru0_pred1_priv0=count3(tru=0, pred=1, priv=0),
        tru0_pred1_priv1=count3(tru=0, pred=1, priv=1),
        tru1_pred0_priv0=count3(tru=1, pred=0, priv=0),
        tru1_pred0_priv1=count3(tru=1, pred=0, priv=1),
        tru1_pred1_priv0=count3(tru=1, pred=1, priv=0),
        tru1_pred1_priv1=count3(tru=1, pred=1, priv=1),
    )
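# A hedged sketch of how the eight counts are typically consumed: per-group
# true-positive and false-positive rates, whose across-group differences
# underlie average odds difference (AOD) and equal opportunity difference
# (EOD). The helper is illustrative only; field names match _AODorEODData.
def _rates_sketch(d, priv):
    if priv == 1:
        tp, fn = d.tru1_pred1_priv1, d.tru1_pred0_priv1
        fp, tn = d.tru0_pred1_priv1, d.tru0_pred0_priv1
    else:
        tp, fn = d.tru1_pred1_priv0, d.tru1_pred0_priv0
        fp, tn = d.tru0_pred1_priv0, d.tru0_pred0_priv0
    tpr = tp / (tp + fn)  # true-positive rate for the group
    fpr = fp / (fp + tn)  # false-positive rate for the group
    return tpr, fpr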
def _lift(X, hyperparams):
    feature_names_in_ = get_columns(X)
    strategy = hyperparams["strategy"]
    if strategy == "constant":
        fill_value = _SimpleImputerImpl._get_fill_value(X, hyperparams)
        agg_data = [[fill_value for col in get_columns(X)]]
        lifted_statistics = pd.DataFrame(agg_data, columns=get_columns(X))
    elif strategy == "mean":
        agg_op_sum = Aggregate(
            columns={c: sum(it[c]) for c in get_columns(X)},
            exclude_value=hyperparams["missing_values"],
        )
        agg_op_count = Aggregate(
            columns={c: count(it[c]) for c in get_columns(X)},
            exclude_value=hyperparams["missing_values"],
        )
        lifted_statistics = {}
        agg_sum = agg_op_sum.transform(X)
        if agg_sum is not None and _is_spark_df(agg_sum):
            agg_sum = agg_sum.toPandas()
        agg_count = agg_op_count.transform(X)
        if agg_count is not None and _is_spark_df(agg_count):
            agg_count = agg_count.toPandas()
        lifted_statistics["sum"] = agg_sum
        lifted_statistics["count"] = agg_count
    else:
        raise ValueError(
            "_lift is only supported for imputation strategies `mean` and `constant`."
        )
    # strategy is returned so that _combine can use it
    return (feature_names_in_, lifted_statistics, strategy)
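# A minimal sketch of the combine step this return value is shaped for,
# assuming the "mean" strategy: two lifted results add their per-column sums
# and counts elementwise, and the final imputation statistics are sum / count
# per column (for "constant" the statistics are already final). The function
# names and dict layout below are assumptions, not the library's _combine.
def _combine_sketch(lifted_a, lifted_b):
    return {
        "sum": lifted_a["sum"] + lifted_b["sum"],      # elementwise per column
        "count": lifted_a["count"] + lifted_b["count"],
    }

def _finish_mean_sketch(lifted):
    return lifted["sum"] / lifted["count"]  # per-column means used to fill NaNs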
def test_with_hyperopt2(self):
    from lale.expressions import (
        count,
        it,
        max,
        mean,
        min,
        string_indexer,
        sum,
        variance,
    )

    wrap_imported_operators()
    scan = Scan(table=it["main"])
    scan_0 = Scan(table=it["customers"])
    join = Join(pred=[
        it["main"]["group_customer_id"] == it["customers"]["group_customer_id"]
    ])
    map = Map(
        columns={
            "[main](group_customer_id)[customers]|number_children|identity": it["number_children"],
            "[main](group_customer_id)[customers]|name|identity": it["name"],
            "[main](group_customer_id)[customers]|income|identity": it["income"],
            "[main](group_customer_id)[customers]|address|identity": it["address"],
            "[main](group_customer_id)[customers]|age|identity": it["age"],
        },
        remainder="drop",
    )
    pipeline_4 = join >> map
    scan_1 = Scan(table=it["purchase"])
    join_0 = Join(
        pred=[it["main"]["group_id"] == it["purchase"]["group_id"]],
        join_limit=50.0,
    )
    aggregate = Aggregate(
        columns={
            "[main](group_id)[purchase]|price|variance": variance(it["price"]),
            "[main](group_id)[purchase]|time|sum": sum(it["time"]),
            "[main](group_id)[purchase]|time|mean": mean(it["time"]),
            "[main](group_id)[purchase]|time|min": min(it["time"]),
            "[main](group_id)[purchase]|price|sum": sum(it["price"]),
            "[main](group_id)[purchase]|price|count": count(it["price"]),
            "[main](group_id)[purchase]|price|mean": mean(it["price"]),
            "[main](group_id)[purchase]|price|min": min(it["price"]),
            "[main](group_id)[purchase]|price|max": max(it["price"]),
            "[main](group_id)[purchase]|time|max": max(it["time"]),
            "[main](group_id)[purchase]|time|variance": variance(it["time"]),
        },
        group_by=it["row_id"],
    )
    pipeline_5 = join_0 >> aggregate
    map_0 = Map(
        columns={
            "[main]|group_customer_id|identity": it["group_customer_id"],
            "[main]|transaction_id|identity": it["transaction_id"],
            "[main]|group_id|identity": it["group_id"],
            "[main]|comments|identity": it["comments"],
            "[main]|id|identity": it["id"],
            "prefix_0_id": it["prefix_0_id"],
            "next_purchase": it["next_purchase"],
            "[main]|time|identity": it["time"],
        },
        remainder="drop",
    )
    scan_2 = Scan(table=it["transactions"])
    scan_3 = Scan(table=it["products"])
    join_1 = Join(pred=[
        it["main"]["transaction_id"] == it["transactions"]["transaction_id"],
        it["transactions"]["product_id"] == it["products"]["product_id"],
    ])
    map_1 = Map(
        columns={
            "[main](transaction_id)[transactions](product_id)[products]|price|identity": it["price"],
            "[main](transaction_id)[transactions](product_id)[products]|type|identity": it["type"],
        },
        remainder="drop",
    )
    pipeline_6 = join_1 >> map_1
    join_2 = Join(pred=[
        it["main"]["transaction_id"] == it["transactions"]["transaction_id"]
    ])
    map_2 = Map(
        columns={
            "[main](transaction_id)[transactions]|description|identity": it["description"],
            "[main](transaction_id)[transactions]|product_id|identity": it["product_id"],
        },
        remainder="drop",
    )
    pipeline_7 = join_2 >> map_2
    map_3 = Map(columns=[
        string_indexer(it["[main]|comments|identity"]),
        string_indexer(it["[main](transaction_id)[transactions]|description|identity"]),
        string_indexer(it["[main](transaction_id)[transactions](product_id)[products]|type|identity"]),
        string_indexer(it["[main](group_customer_id)[customers]|name|identity"]),
        string_indexer(it["[main](group_customer_id)[customers]|address|identity"]),
    ])
    pipeline_8 = ConcatFeatures() >> map_3
    relational = Relational(
        operator=make_pipeline_graph(
            steps=[
                scan,
                scan_0,
                pipeline_4,
                scan_1,
                pipeline_5,
                map_0,
                scan_2,
                scan_3,
                pipeline_6,
                pipeline_7,
                pipeline_8,
            ],
            edges=[
                (scan, pipeline_4),
                (scan, pipeline_5),
                (scan, map_0),
                (scan, pipeline_6),
                (scan, pipeline_7),
                (scan_0, pipeline_4),
                (pipeline_4, pipeline_8),
                (scan_1, pipeline_5),
                (pipeline_5, pipeline_8),
                (map_0, pipeline_8),
                (scan_2, pipeline_6),
                (scan_2, pipeline_7),
                (scan_3, pipeline_6),
                (pipeline_6, pipeline_8),
                (pipeline_7, pipeline_8),
            ],
        )
    )
    pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    from lale.lib.lale import Hyperopt

    opt = Hyperopt(estimator=pipeline, max_evals=2)
    opt.fit(X, y)