def _unfold(self, op: Operator, n: int) -> Optional[Operator]:
    """Unroll all possible operators from the grammar `g` starting from
    non-terminal `op` after `n` derivations.

    Parameters
    ----------
    op : Operator
        starting rule (e.g., `g.start`)
    n : int
        number of derivations

    Returns
    -------
    Optional[Operator]
        The unfolded operator, or None if `op` (or one of its required
        sub-operators) could not be resolved within `n` derivations.
    """
    if isinstance(op, BasePipeline):
        steps = op.steps()
        new_maybe_steps: List[Optional[Operator]] = [
            self._unfold(sop, n) for sop in steps
        ]
        # A pipeline needs every step; if any step failed to unfold,
        # the whole pipeline fails.
        if None in new_maybe_steps:
            return None
        new_steps: List[Operator] = cast(List[Operator], new_maybe_steps)
        step_map = {steps[i]: new_steps[i] for i in range(len(steps))}
        new_edges = [(step_map[s], step_map[d]) for s, d in op.edges()]
        return make_pipeline_graph(new_steps, new_edges, True)
    if isinstance(op, OperatorChoice):
        # Keep only the branches that unfolded successfully. Compare
        # against None explicitly: the previous truthiness test (`if s`)
        # would also drop a valid operator that happens to be falsy
        # (e.g., one defining __len__/__bool__).
        steps = [
            s
            for s in (self._unfold(sop, n) for sop in op.steps())
            if s is not None
        ]
        return make_choice(*steps) if steps else None
    if isinstance(op, NonTerminal):
        # Substitute the non-terminal's rule and spend one derivation.
        return self._unfold(self._variables[op.name()], n - 1) if n > 0 else None
    if isinstance(op, IndividualOp):
        # Terminal symbol: already fully derived.
        return op
    assert False, f"Unknown operator {op}"
def test_irreducible_1(self):
    # Round-trip test: a pipeline DAG that cannot be expressed with the
    # `>>`/`&` combinators alone (MinMaxScaler fans out to two sinks while
    # LogisticRegression has two sources), so pretty-printing must fall
    # back to an explicit `make_pipeline_graph(steps, edges)` call.
    from lale.lib.sklearn import (
        PCA,
        KNeighborsClassifier,
        LogisticRegression,
        MinMaxScaler,
        Nystroem,
    )
    from lale.operators import make_pipeline_graph

    choice = PCA | Nystroem
    pipeline = make_pipeline_graph(
        steps=[choice, MinMaxScaler, LogisticRegression, KNeighborsClassifier],
        edges=[
            (choice, LogisticRegression),
            (MinMaxScaler, LogisticRegression),
            (MinMaxScaler, KNeighborsClassifier),
        ],
    )
    # Expected output of lale.pretty_print.to_string for the pipeline above;
    # must match the printer's emitted code verbatim.
    expected = """from sklearn.decomposition import PCA
from sklearn.kernel_approximation import Nystroem
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lale.operators import make_pipeline_graph
import lale

lale.wrap_imported_operators()
choice = PCA | Nystroem
pipeline = make_pipeline_graph(
    steps=[choice, MinMaxScaler, LogisticRegression, KNeighborsClassifier],
    edges=[
        (choice, LogisticRegression),
        (MinMaxScaler, LogisticRegression),
        (MinMaxScaler, KNeighborsClassifier),
    ],
)"""
    self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
def _sample(self, op: Operator, n: int) -> Optional[Operator]:
    """Sample the grammar `g` starting from `g.start`, that is, choose
    one element at random for each possible choices.

    Parameters
    ----------
    op : Operator
        starting rule (e.g., `g.start`)
    n : int
        number of derivations

    Returns
    -------
    Optional[Operator]
    """
    if isinstance(op, BasePipeline):
        old_steps = op.steps()
        sampled: List[Optional[Operator]] = [
            self._sample(child, n) for child in op.steps()
        ]
        if None in sampled:
            # some step could not be derived within the budget
            return None
        resolved: List[Operator] = cast(List[Operator], sampled)
        replacement = dict(zip(old_steps, resolved))
        rewired = [
            (replacement[src], replacement[dst]) for src, dst in op.edges()
        ]
        return make_pipeline_graph(resolved, rewired, True)
    if isinstance(op, OperatorChoice):
        # pick exactly one branch at random and keep sampling from it
        return self._sample(random.choice(op.steps()), n)
    if isinstance(op, NonTerminal):
        if n <= 0:
            return None
        # NOTE(review): `_unfold` resolves non-terminals via
        # `self._variables[op.name()]` while this uses `getattr(self, ...)`;
        # presumably equivalent — confirm the two agree.
        return self._sample(getattr(self, op.name()), n - 1)
    if isinstance(op, IndividualOp):
        return op
    assert False, f"Unknown operator {op}"
def set_operator_params(op: Ops.Operator, **impl_params) -> Ops.TrainableOperator:
    """Recursively assign (possibly nested) hyperparameters to `op`.

    May return a new operator, in which case the old one should be overwritten.

    Parameters
    ----------
    op : Ops.Operator
        An individual op, a pipeline, or an operator choice.
    impl_params :
        Flat keyword parameters; nested-step parameters are presumably
        encoded in sklearn's ``step__param`` style (see
        `partition_sklearn_params`) — confirm against the callers.

    Returns
    -------
    Ops.TrainableOperator
    """
    if isinstance(op, Ops.PlannedIndividualOp):
        main_params, partitioned_sub_params = partition_sklearn_params(impl_params)
        hyper = op._hyperparams
        if hyper is None:
            hyper = {}
        # we set the sub params first
        for sub_key, sub_params in partitioned_sub_params.items():
            set_structured_params(sub_key, sub_params, hyper)

        # we have now updated any nested operators
        # (if this is a higher order operator)
        # and can work on the main operator
        all_params = {**main_params, **hyper}
        return op.set_params(**all_params)
    elif isinstance(op, Ops.BasePipeline):
        steps = op.steps()
        main_params, partitioned_sub_params = partition_sklearn_params(impl_params)
        # A pipeline itself has no hyperparameters of its own; everything
        # must be addressed to one of its steps.
        assert not main_params, f"Unexpected non-nested arguments {main_params}"
        # found_names tracks, per step name, the highest duplicate index
        # seen so far; step_map collects steps that were replaced.
        found_names: Dict[str, int] = {}
        step_map: Dict[Ops.Operator, Ops.TrainableOperator] = {}
        for s in steps:
            name = s.name()
            name_index = 0
            params: Dict[str, Any] = {}
            if name in found_names:
                # Duplicate step name: look it up under its indexed alias
                # (see make_indexed_name).
                name_index = found_names[name] + 1
                found_names[name] = name_index
                uname = make_indexed_name(name, name_index)
                if uname in partitioned_sub_params:
                    params = partitioned_sub_params[uname]
            else:
                found_names[name] = 0
                # First occurrence: the caller may have used either the
                # degenerate indexed alias or the plain name, but not both.
                uname = make_degen_indexed_name(name, 0)
                if uname in partitioned_sub_params:
                    params = partitioned_sub_params[uname]
                    assert name not in partitioned_sub_params
                elif name in partitioned_sub_params:
                    params = partitioned_sub_params[name]
            new_s = set_operator_params(s, **params)
            if s != new_s:
                step_map[s] = new_s
        # make sure that no parameters were passed in for operations
        # that are not actually part of this pipeline
        for k in partitioned_sub_params.keys():
            n, i = get_name_and_index(k)
            assert n in found_names and i <= found_names[n]
        if step_map:
            op._subst_steps(step_map)
            if not isinstance(op, Ops.TrainablePipeline):
                # As a result of choices made, we may now be a TrainableIndividualOp
                ret = Ops.make_pipeline_graph(op.steps(), op.edges(), ordered=True)
                if not isinstance(ret, Ops.TrainableOperator):
                    assert False
                return ret
            else:
                return op
        else:
            assert isinstance(op, Ops.TrainableOperator)
            return op
    elif isinstance(op, Ops.OperatorChoice):
        choices = op.steps()
        choice_index: int
        # NOTE(review): `choice_params` is declared but never assigned;
        # `chosen_params` below is what is actually used — likely a
        # leftover name.
        choice_params: Dict[str, Any]
        if len(choices) == 1:
            # Only one branch: no discriminator encoded in the params.
            choice_index = 0
            chosen_params = impl_params
        else:
            (choice_index, chosen_params) = partition_sklearn_choice_params(impl_params)

        assert 0 <= choice_index and choice_index < len(choices)
        choice: Ops.Operator = choices[choice_index]
        new_step = set_operator_params(choice, **chosen_params)
        # we remove the OperatorChoice, replacing it with the branch that was taken
        return new_step
    else:
        assert False, f"Not yet supported operation of type: {op.__class__.__name__}"
def test_with_hyperopt2(self):
    # Regression fixture: a large relational preprocessing DAG (scans,
    # joins, maps, aggregates over several tables) wrapped in Relational,
    # followed by a classifier choice, then tuned with Hyperopt for two
    # evaluations. Primarily checks that the whole thing runs end to end.
    from lale.expressions import (
        count,
        it,
        max,
        mean,
        min,
        string_indexer,
        sum,
        variance,
    )

    wrap_imported_operators()
    scan = Scan(table=it["main"])
    scan_0 = Scan(table=it["customers"])
    join = Join(
        pred=[
            (
                it["main"]["group_customer_id"]
                == it["customers"]["group_customer_id"]
            )
        ]
    )
    # NOTE: `map`, `sum`, `min`, `max` deliberately shadow builtins here;
    # they are lale expression constructors / local operator names.
    map = Map(
        columns={
            "[main](group_customer_id)[customers]|number_children|identity": it[
                "number_children"
            ],
            "[main](group_customer_id)[customers]|name|identity": it["name"],
            "[main](group_customer_id)[customers]|income|identity": it["income"],
            "[main](group_customer_id)[customers]|address|identity": it["address"],
            "[main](group_customer_id)[customers]|age|identity": it["age"],
        },
        remainder="drop",
    )
    pipeline_4 = join >> map
    scan_1 = Scan(table=it["purchase"])
    join_0 = Join(
        pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
        join_limit=50.0,
    )
    # Aggregate purchase statistics per row_id.
    aggregate = Aggregate(
        columns={
            "[main](group_id)[purchase]|price|variance": variance(it["price"]),
            "[main](group_id)[purchase]|time|sum": sum(it["time"]),
            "[main](group_id)[purchase]|time|mean": mean(it["time"]),
            "[main](group_id)[purchase]|time|min": min(it["time"]),
            "[main](group_id)[purchase]|price|sum": sum(it["price"]),
            "[main](group_id)[purchase]|price|count": count(it["price"]),
            "[main](group_id)[purchase]|price|mean": mean(it["price"]),
            "[main](group_id)[purchase]|price|min": min(it["price"]),
            "[main](group_id)[purchase]|price|max": max(it["price"]),
            "[main](group_id)[purchase]|time|max": max(it["time"]),
            "[main](group_id)[purchase]|time|variance": variance(it["time"]),
        },
        group_by=it["row_id"],
    )
    pipeline_5 = join_0 >> aggregate
    # Pass-through projection of the main table's own columns.
    map_0 = Map(
        columns={
            "[main]|group_customer_id|identity": it["group_customer_id"],
            "[main]|transaction_id|identity": it["transaction_id"],
            "[main]|group_id|identity": it["group_id"],
            "[main]|comments|identity": it["comments"],
            "[main]|id|identity": it["id"],
            "prefix_0_id": it["prefix_0_id"],
            "next_purchase": it["next_purchase"],
            "[main]|time|identity": it["time"],
        },
        remainder="drop",
    )
    scan_2 = Scan(table=it["transactions"])
    scan_3 = Scan(table=it["products"])
    join_1 = Join(
        pred=[
            (
                it["main"]["transaction_id"]
                == it["transactions"]["transaction_id"]
            ),
            (it["transactions"]["product_id"] == it["products"]["product_id"]),
        ]
    )
    map_1 = Map(
        columns={
            "[main](transaction_id)[transactions](product_id)[products]|price|identity": it[
                "price"
            ],
            "[main](transaction_id)[transactions](product_id)[products]|type|identity": it[
                "type"
            ],
        },
        remainder="drop",
    )
    pipeline_6 = join_1 >> map_1
    join_2 = Join(
        pred=[
            (
                it["main"]["transaction_id"]
                == it["transactions"]["transaction_id"]
            )
        ]
    )
    map_2 = Map(
        columns={
            "[main](transaction_id)[transactions]|description|identity": it[
                "description"
            ],
            "[main](transaction_id)[transactions]|product_id|identity": it[
                "product_id"
            ],
        },
        remainder="drop",
    )
    pipeline_7 = join_2 >> map_2
    # String-index the categorical columns of the concatenated features.
    map_3 = Map(
        columns=[
            string_indexer(it["[main]|comments|identity"]),
            string_indexer(
                it["[main](transaction_id)[transactions]|description|identity"]
            ),
            string_indexer(
                it[
                    "[main](transaction_id)[transactions](product_id)[products]|type|identity"
                ]
            ),
            string_indexer(
                it["[main](group_customer_id)[customers]|name|identity"]
            ),
            string_indexer(
                it["[main](group_customer_id)[customers]|address|identity"]
            ),
        ]
    )
    pipeline_8 = ConcatFeatures() >> map_3
    # Assemble the full relational DAG: every branch feeds pipeline_8.
    relational = Relational(
        operator=make_pipeline_graph(
            steps=[
                scan,
                scan_0,
                pipeline_4,
                scan_1,
                pipeline_5,
                map_0,
                scan_2,
                scan_3,
                pipeline_6,
                pipeline_7,
                pipeline_8,
            ],
            edges=[
                (scan, pipeline_4),
                (scan, pipeline_5),
                (scan, map_0),
                (scan, pipeline_6),
                (scan, pipeline_7),
                (scan_0, pipeline_4),
                (pipeline_4, pipeline_8),
                (scan_1, pipeline_5),
                (pipeline_5, pipeline_8),
                (map_0, pipeline_8),
                (scan_2, pipeline_6),
                (scan_2, pipeline_7),
                (scan_3, pipeline_6),
                (pipeline_6, pipeline_8),
                (pipeline_7, pipeline_8),
            ],
        )
    )
    pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    from lale.lib.lale import Hyperopt

    # Two evaluations keep the test fast; success criterion is simply
    # that fit completes without raising.
    opt = Hyperopt(estimator=pipeline, max_evals=2)
    opt.fit(X, y)