Example #1
0
    def _unfold(self, op: Operator, n: int) -> Optional[Operator]:
        """Unroll every operator derivable from `op` within `n` derivations.

        Parameters
        ----------
        op : Operator
            starting rule (e.g., `g.start`)
        n : int
            number of derivations still allowed; expanding a non-terminal
            consumes one

        Returns
        -------
        Optional[Operator]
            The unrolled operator, or ``None`` when `op` cannot be fully
            expanded within the remaining number of derivations.

        Raises
        ------
        AssertionError
            If `op` is none of the operator kinds handled below.
        """
        if isinstance(op, BasePipeline):
            steps = op.steps()
            new_maybe_steps: List[Optional[Operator]] = [
                self._unfold(sop, n) for sop in steps
            ]
            # A pipeline needs every step: a single step that could not be
            # unrolled makes the whole pipeline unavailable.
            if None in new_maybe_steps:
                return None
            new_steps: List[Operator] = cast(List[Operator], new_maybe_steps)
            # Re-wire the original edges onto the unrolled steps.
            step_map = dict(zip(steps, new_steps))
            new_edges = [(step_map[s], step_map[d]) for s, d in op.edges()]
            return make_pipeline_graph(new_steps, new_edges, True)
        if isinstance(op, OperatorChoice):
            # A choice keeps only the alternatives that could be unrolled.
            candidates = (self._unfold(sop, n) for sop in op.steps())
            kept = [s for s in candidates if s]
            return make_choice(*kept) if kept else None
        if isinstance(op, NonTerminal):
            if n <= 0:
                # Derivation budget exhausted before reaching a terminal.
                return None
            return self._unfold(self._variables[op.name()], n - 1)
        if isinstance(op, IndividualOp):
            return op
        # Raised explicitly (rather than `assert False`) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError(f"Unknown operator {op}")
Example #2
0
    def test_irreducible_1(self):
        # Pretty-printing check for a pipeline graph whose expected printed
        # form is an explicit make_pipeline_graph(steps=..., edges=...) call.
        # NOTE(review): "irreducible" presumably means the graph cannot be
        # written with combinators alone -- confirm against the printer.
        from lale.lib.sklearn import (
            PCA,
            KNeighborsClassifier,
            LogisticRegression,
            MinMaxScaler,
            Nystroem,
        )
        from lale.operators import make_pipeline_graph

        # Graph under test: the choice and MinMaxScaler both feed
        # LogisticRegression, and MinMaxScaler also feeds
        # KNeighborsClassifier.
        choice = PCA | Nystroem
        pipeline = make_pipeline_graph(
            steps=[choice, MinMaxScaler, LogisticRegression, KNeighborsClassifier],
            edges=[
                (choice, LogisticRegression),
                (MinMaxScaler, LogisticRegression),
                (MinMaxScaler, KNeighborsClassifier),
            ],
        )
        # Exact source text the printer is expected to emit. The identifiers
        # in it (`choice`, `pipeline`) match the local names used above.
        expected = """from sklearn.decomposition import PCA
from sklearn.kernel_approximation import Nystroem
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lale.operators import make_pipeline_graph
import lale

lale.wrap_imported_operators()
choice = PCA | Nystroem
pipeline = make_pipeline_graph(
    steps=[choice, MinMaxScaler, LogisticRegression, KNeighborsClassifier],
    edges=[
        (choice, LogisticRegression),
        (MinMaxScaler, LogisticRegression),
        (MinMaxScaler, KNeighborsClassifier),
    ],
)"""
        # _roundtrip is defined elsewhere in the test class; presumably it
        # compares the two strings and re-evaluates the printed code -- TODO
        # confirm.
        self._roundtrip(expected, lale.pretty_print.to_string(pipeline))
Example #3
0
    def _sample(self, op: Operator, n: int) -> Optional[Operator]:
        """Sample one operator from the grammar, starting from `op`.

        Wherever the grammar offers a choice, one alternative is picked at
        random.

        Parameters
        ----------
        op : Operator
            starting rule (e.g., `g.start`)
        n : int
            number of derivations still allowed; expanding a non-terminal
            consumes one

        Returns
        -------
        Optional[Operator]
            The sampled operator, or ``None`` when the path that was drawn
            runs out of derivations before it is fully expanded.

        Raises
        ------
        AssertionError
            If `op` is none of the operator kinds handled below.
        """
        if isinstance(op, BasePipeline):
            steps = op.steps()
            new_maybe_steps: List[Optional[Operator]] = [
                self._sample(sop, n) for sop in steps
            ]
            # A pipeline needs every step: a single step that could not be
            # sampled makes the whole pipeline unavailable.
            if None in new_maybe_steps:
                return None
            new_steps: List[Operator] = cast(List[Operator], new_maybe_steps)
            # Re-wire the original edges onto the sampled steps.
            step_map = dict(zip(steps, new_steps))
            new_edges = [(step_map[s], step_map[d]) for s, d in op.edges()]
            return make_pipeline_graph(new_steps, new_edges, True)
        if isinstance(op, OperatorChoice):
            # Draw a single alternative; the others are not explored.
            return self._sample(random.choice(op.steps()), n)
        if isinstance(op, NonTerminal):
            if n <= 0:
                # Derivation budget exhausted before reaching a terminal.
                return None
            return self._sample(getattr(self, op.name()), n - 1)
        if isinstance(op, IndividualOp):
            return op
        # Raised explicitly (rather than `assert False`) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError(f"Unknown operator {op}")
Example #4
0
def set_operator_params(op: Ops.Operator,
                        **impl_params) -> Ops.TrainableOperator:
    """Apply (possibly nested) keyword parameters to an operator.

    May return a new operator, in which case the old one should be
    overwritten.

    Behaviour depends on the kind of `op`:

    * ``PlannedIndividualOp``: nested parameters are first merged into the
      operator's hyperparameters via ``set_structured_params``; the result is
      then passed to ``op.set_params``.
    * ``BasePipeline``: only nested parameters are accepted. Each step gets
      the parameters filed under its name; repeated step names are addressed
      through indexed names. Steps that change are substituted into the
      pipeline in place.
    * ``OperatorChoice``: one branch is selected and configured, and that
      branch is returned in place of the choice.

    Parameters
    ----------
    op : Ops.Operator
        operator to configure
    **impl_params
        parameters to apply, split by ``partition_sklearn_params`` (or by
        ``partition_sklearn_choice_params`` for a choice with several
        branches)

    Returns
    -------
    Ops.TrainableOperator
        The configured operator.

    Raises
    ------
    AssertionError
        If `op` is of an unsupported kind, if a pipeline receives non-nested
        parameters or parameters for a step it does not contain, if the
        selected choice index is out of range, or if the rebuilt pipeline is
        not trainable.
    """
    if isinstance(op, Ops.PlannedIndividualOp):
        main_params, partitioned_sub_params = partition_sklearn_params(
            impl_params)
        hyper = op._hyperparams
        if hyper is None:
            hyper = {}
        # we set the sub params first
        for sub_key, sub_params in partitioned_sub_params.items():
            set_structured_params(sub_key, sub_params, hyper)

        # we have now updated any nested operators
        # (if this is a higher order operator)
        # and can work on the main operator
        # NOTE(review): on a key collision the value already in `hyper` wins
        # over the one supplied in `main_params` -- confirm this is intended.
        all_params = {**main_params, **hyper}
        return op.set_params(**all_params)

    if isinstance(op, Ops.BasePipeline):
        steps = op.steps()
        main_params, partitioned_sub_params = partition_sklearn_params(
            impl_params)
        assert not main_params, f"Unexpected non-nested arguments {main_params}"
        # Highest index seen so far for each step name.
        found_names: Dict[str, int] = {}
        step_map: Dict[Ops.Operator, Ops.TrainableOperator] = {}
        for s in steps:
            name = s.name()
            params: Dict[str, Any] = {}
            if name in found_names:
                # Repeated name: only the indexed form can address this step.
                name_index = found_names[name] + 1
                found_names[name] = name_index
                uname = make_indexed_name(name, name_index)
                if uname in partitioned_sub_params:
                    params = partitioned_sub_params[uname]
            else:
                # First occurrence: accept either the indexed form or the
                # bare name, but not both at once.
                found_names[name] = 0
                uname = make_degen_indexed_name(name, 0)
                if uname in partitioned_sub_params:
                    params = partitioned_sub_params[uname]
                    assert name not in partitioned_sub_params
                elif name in partitioned_sub_params:
                    params = partitioned_sub_params[name]
            new_s = set_operator_params(s, **params)
            if s != new_s:
                step_map[s] = new_s
        # make sure that no parameters were passed in for operations
        # that are not actually part of this pipeline
        for key in partitioned_sub_params:
            step_name, step_index = get_name_and_index(key)
            assert (step_name in found_names
                    and step_index <= found_names[step_name]), (
                        f"Parameters given for {key}, "
                        "which is not a step of this pipeline")
        if not step_map:
            assert isinstance(op, Ops.TrainableOperator)
            return op
        op._subst_steps(step_map)
        if isinstance(op, Ops.TrainablePipeline):
            return op
        # As a result of choices made, we may now be a TrainableIndividualOp
        ret = Ops.make_pipeline_graph(op.steps(), op.edges(), ordered=True)
        if not isinstance(ret, Ops.TrainableOperator):
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError(
                "Rebuilt pipeline is not trainable: "
                f"{ret.__class__.__name__}")
        return ret

    if isinstance(op, Ops.OperatorChoice):
        choices = op.steps()
        choice_index: int
        chosen_params: Dict[str, Any]
        if len(choices) == 1:
            # A single branch needs no selector in the parameters.
            choice_index = 0
            chosen_params = impl_params
        else:
            (choice_index,
             chosen_params) = partition_sklearn_choice_params(impl_params)

        assert 0 <= choice_index < len(choices)
        choice: Ops.Operator = choices[choice_index]

        # we remove the OperatorChoice, replacing it with the branch that was taken
        return set_operator_params(choice, **chosen_params)

    # Raised explicitly (rather than `assert False`) so an unsupported
    # operator cannot fall through and return None under `python -O`.
    raise AssertionError(
        f"Not yet supported operation of type: {op.__class__.__name__}")
Example #5
0
    def test_with_hyperopt2(self):
        # Smoke test: wraps a multi-table pipeline graph in a Relational
        # operator, pipes it into a classifier choice and runs Hyperopt on it
        # for two evaluations. There are no assertions; the test passes as
        # long as nothing raises.
        #
        # The imports below shadow the builtins `max`, `min` and `sum` inside
        # this method (and the local `map` further down shadows the builtin
        # `map`).
        from lale.expressions import (
            count,
            it,
            max,
            mean,
            min,
            string_indexer,
            sum,
            variance,
        )

        # Defined elsewhere; presumably makes the imported operator classes
        # (Scan, Join, Map, ...) usable as lale operators -- TODO confirm.
        wrap_imported_operators()
        # Branch 1: main joined with customers, keeping five customer columns.
        scan = Scan(table=it["main"])
        scan_0 = Scan(table=it["customers"])
        join = Join(pred=[(it["main"]["group_customer_id"] == it["customers"]
                           ["group_customer_id"])])
        map = Map(
            columns={
                "[main](group_customer_id)[customers]|number_children|identity":
                it["number_children"],
                "[main](group_customer_id)[customers]|name|identity":
                it["name"],
                "[main](group_customer_id)[customers]|income|identity":
                it["income"],
                "[main](group_customer_id)[customers]|address|identity":
                it["address"],
                "[main](group_customer_id)[customers]|age|identity":
                it["age"],
            },
            remainder="drop",
        )
        pipeline_4 = join >> map
        # Branch 2: main joined with purchase, aggregated per row_id.
        scan_1 = Scan(table=it["purchase"])
        join_0 = Join(
            pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
            join_limit=50.0,
        )
        aggregate = Aggregate(
            columns={
                "[main](group_id)[purchase]|price|variance":
                variance(it["price"]),
                "[main](group_id)[purchase]|time|sum": sum(it["time"]),
                "[main](group_id)[purchase]|time|mean": mean(it["time"]),
                "[main](group_id)[purchase]|time|min": min(it["time"]),
                "[main](group_id)[purchase]|price|sum": sum(it["price"]),
                "[main](group_id)[purchase]|price|count": count(it["price"]),
                "[main](group_id)[purchase]|price|mean": mean(it["price"]),
                "[main](group_id)[purchase]|price|min": min(it["price"]),
                "[main](group_id)[purchase]|price|max": max(it["price"]),
                "[main](group_id)[purchase]|time|max": max(it["time"]),
                "[main](group_id)[purchase]|time|variance":
                variance(it["time"]),
            },
            group_by=it["row_id"],
        )
        pipeline_5 = join_0 >> aggregate
        # Branch 3: columns taken from main alone.
        map_0 = Map(
            columns={
                "[main]|group_customer_id|identity": it["group_customer_id"],
                "[main]|transaction_id|identity": it["transaction_id"],
                "[main]|group_id|identity": it["group_id"],
                "[main]|comments|identity": it["comments"],
                "[main]|id|identity": it["id"],
                "prefix_0_id": it["prefix_0_id"],
                "next_purchase": it["next_purchase"],
                "[main]|time|identity": it["time"],
            },
            remainder="drop",
        )
        # Branch 4: main -> transactions -> products (two join predicates).
        scan_2 = Scan(table=it["transactions"])
        scan_3 = Scan(table=it["products"])
        join_1 = Join(pred=[
            (it["main"]["transaction_id"] == it["transactions"]
             ["transaction_id"]),
            (it["transactions"]["product_id"] == it["products"]["product_id"]),
        ])
        map_1 = Map(
            columns={
                "[main](transaction_id)[transactions](product_id)[products]|price|identity":
                it["price"],
                "[main](transaction_id)[transactions](product_id)[products]|type|identity":
                it["type"],
            },
            remainder="drop",
        )
        pipeline_6 = join_1 >> map_1
        # Branch 5: main joined with transactions only.
        join_2 = Join(pred=[(it["main"]["transaction_id"] == it["transactions"]
                             ["transaction_id"])])
        map_2 = Map(
            columns={
                "[main](transaction_id)[transactions]|description|identity":
                it["description"],
                "[main](transaction_id)[transactions]|product_id|identity":
                it["product_id"],
            },
            remainder="drop",
        )
        pipeline_7 = join_2 >> map_2
        # Sink: concatenate the five branches, then apply string_indexer to
        # five of the columns produced above.
        map_3 = Map(columns=[
            string_indexer(it["[main]|comments|identity"]),
            string_indexer(
                it["[main](transaction_id)[transactions]|description|identity"]
            ),
            string_indexer(it[
                "[main](transaction_id)[transactions](product_id)[products]|type|identity"]
                           ),
            string_indexer(
                it["[main](group_customer_id)[customers]|name|identity"]),
            string_indexer(
                it["[main](group_customer_id)[customers]|address|identity"]),
        ])
        pipeline_8 = ConcatFeatures() >> map_3
        # Assemble the graph: the main scan feeds every branch, the other
        # scans feed the branches that join against them, and all five
        # branches feed pipeline_8.
        relational = Relational(operator=make_pipeline_graph(
            steps=[
                scan,
                scan_0,
                pipeline_4,
                scan_1,
                pipeline_5,
                map_0,
                scan_2,
                scan_3,
                pipeline_6,
                pipeline_7,
                pipeline_8,
            ],
            edges=[
                (scan, pipeline_4),
                (scan, pipeline_5),
                (scan, map_0),
                (scan, pipeline_6),
                (scan, pipeline_7),
                (scan_0, pipeline_4),
                (pipeline_4, pipeline_8),
                (scan_1, pipeline_5),
                (pipeline_5, pipeline_8),
                (map_0, pipeline_8),
                (scan_2, pipeline_6),
                (scan_2, pipeline_7),
                (scan_3, pipeline_6),
                (pipeline_6, pipeline_8),
                (pipeline_7, pipeline_8),
            ],
        ))
        # The classifier is left as a choice for the optimizer to resolve.
        pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
        from sklearn.datasets import load_iris

        # NOTE(review): the iris arrays contain none of the tables referenced
        # above; whether the trials are expected to succeed cannot be told
        # from this test -- confirm.
        X, y = load_iris(return_X_y=True)
        from lale.lib.lale import Hyperopt

        opt = Hyperopt(estimator=pipeline, max_evals=2)
        opt.fit(X, y)