def enumerate(self, pipeline, k):
        """Lazily yield variant trees derived from ``pipeline``.

        The pipeline is converted to a tree, annotated, and used to seed a
        FIFO work list of variant trees (each paired with its lineage) via
        ``self.derive_variant_trees``. Every tree popped from the work list
        is skipped when it is ``None`` or when its hashable JSON form has
        been seen before; otherwise it is test-compiled with
        ``pt.to_pipeline``. Trees that compile are recorded in
        ``self.statistics`` and yielded, and then expanded again with their
        own lineage passed as ``past_rules``.

        Args:
            pipeline: object accepted by ``pt.to_tree``.
            k: forwarded unchanged to ``self.derive_variant_trees``.

        Yields:
            Trees that compiled successfully, each at most once.
        """
        root = pt.to_tree(pipeline)
        root.annotate()
        pending_trees, pending_lineage = self.derive_variant_trees(
            root,
            k,
            past_rules=set(),
        )

        seen_hashes = set()

        while len(pending_trees) > 0:
            # pop the front of both parallel work lists
            current, pending_trees = pending_trees[0], pending_trees[1:]
            current_lineage, pending_lineage = (
                pending_lineage[0],
                pending_lineage[1:],
            )

            if current is None:
                # degenerate pipeline: its only component was deleted,
                # leaving nothing behind
                continue

            current_hash = pt.to_hashable_json(current)
            if current_hash in seen_hashes:
                continue
            seen_hashes.add(current_hash)

            try:
                pt.to_pipeline(current)
            except Exception as err:
                # a tree that cannot compile is not expanded further,
                # since descendants are unlikely to fix the problem
                self.statistics.record_failure(err)
                continue

            self.statistics.record(current, current_lineage)
            yield current

            derived_trees, derived_lineage = self.derive_variant_trees(
                current,
                k,
                past_rules=set(current_lineage),
            )

            if len(derived_trees) == 0:
                continue

            pending_trees.extend(derived_trees)
            pending_lineage.extend(derived_lineage)
            # the tree was productive, so it is re-queued for later use.
            # NOTE(review): its hash is already in seen_hashes, so it looks
            # like it will be skipped when popped again -- confirm intent.
            pending_trees.append(current)
            pending_lineage.append(current_lineage)
Example #2
0
    def can_build_rule(edit):
        """Return True when ``edit`` can be turned into a rule.

        The edit must be an update edit (per ``is_update_edit``) whose
        ``arg1`` and ``arg2`` are both component nodes, and ``arg1`` must
        have a parent. In addition, the post node (``edit.arg2``) must
        compile on its own via ``pt.to_pipeline``.

        Args:
            edit: edit object exposing ``arg1`` and ``arg2`` nodes.

        Returns:
            bool: whether a rule can be built from the edit.
        """
        base_cond = (
            is_update_edit(edit)
            and pt.is_component_node(edit.arg1)
            and pt.is_component_node(edit.arg2)
            and edit.arg1.parent is not None
        )
        if not base_cond:
            return False

        # also require that the post compiles in isolation; any ordinary
        # failure means no rule, but KeyboardInterrupt/SystemExit must
        # still propagate (a bare ``except`` would swallow them)
        try:
            pt.to_pipeline(edit.arg2)
        except Exception:
            return False
        return True
Example #3
0
    def apply(self, node, seed=None):
        """Insert this rule's post component among ``node``'s children.

        A shallow copy of ``node`` is modified and returned; the insertion
        index is chosen at random next to a child whose label was seen
        among the pre node's children.

        Args:
            node: node whose children receive the new component.
            seed: when not None, used to seed numpy's global RNG first.

        Returns:
            The shallow copy of ``node`` with the component inserted.
        """
        if seed is not None:
            np.random.seed(seed)

        # positions of children that also appeared under the pre node
        anchor_candidates = [
            position
            for position, child in enumerate(node.children)
            if child.label in self.pre_children_label_set
        ]

        if len(anchor_candidates) == 0:
            # no match means .can_apply was bypassed and the caller is
            # forcing application, so every position is allowed
            anchor_candidates = np.arange(0, len(node.children))

        # first draw: the anchor child; second draw: 0 inserts before the
        # anchor, 1 inserts after it
        anchor_ix = np.random.choice(anchor_candidates, 1)[0]
        side_offset = np.random.choice([0, 1], 1)[0]
        target_ix = anchor_ix + side_offset

        node = pt.shallow_copy(node)
        n_children = len(node.children)
        post = pt.shallow_copy(self.post)

        if target_ix < n_children:
            # the new component lands before the end of the pipeline, so
            # a classifier must go in as its stacking-estimator wrapper
            if self.post_is_classifier:
                post = pt.shallow_copy(self.wrapped_post)
        elif not self.post_is_classifier:
            # a non-classifier cannot be the last step of the pipeline,
            # so move the insertion point back by one
            target_ix -= 1
        else:
            # appending a classifier: the classifier currently at the end
            # has to be wrapped in a stacking estimator, otherwise the
            # resulting pipeline is invalid
            last_child = node.children[-1]
            compiled_last = pt.to_pipeline(last_child)
            has_classifier = sklearn.base.is_classifier(compiled_last)
            # expected to hold given the .can_apply condition
            assert has_classifier
            stacked_tree = pt.to_tree(StackingEstimator(compiled_last))
            # drop the root node, keeping only the wrapped classifier
            stacked_clf = stacked_tree.children[0]
            node.replace_child(n_children - 1, stacked_clf)

        node.insert_child(target_ix, post)
        return node
Example #4
0
    def can_apply(self, node):
        """Return True when this insertion rule may be applied to ``node``.

        The node must be a composition node with at least one child, its
        last child must compile (via ``pt.to_pipeline``) to a classifier,
        and at least one child label must appear in
        ``self.pre_children_label_set``.

        Args:
            node: candidate node to insert into.

        Returns:
            bool: whether the rule is applicable.
        """
        # a component insertion can only be applied to
        # a "combinator" object, so must be Pipeline type
        # TODO: extend with other combinator types here if necessary
        if not pt.is_composition_node(node):
            return False

        # node must be a well-formed pipeline, i.e. there must be
        # a classifier at the end
        if len(node.children) == 0:
            return False

        try:
            compiled_possible_clf = pt.to_pipeline(node.children[-1])
            if not sklearn.base.is_classifier(compiled_possible_clf):
                return False
        except Exception:
            # can't compile it, can't really insert appropriately.
            # Only ordinary errors are treated as "not applicable";
            # KeyboardInterrupt/SystemExit still propagate.
            return False

        # we apply it by inserting into its children
        # so check that at least one child matches
        # what we observed in the pre node's children
        return any(c.label in self.pre_children_label_set
                   for c in node.children)
Example #5
0
 def apply(self, node):
     """Return a shallow copy of the replacement for ``node``.

     ``node`` is compiled with ``pt.to_pipeline``. When the post is a
     classifier but the compiled node is not, the wrapped post is used;
     otherwise the plain post is used.
     """
     compiled_node = pt.to_pipeline(node)
     needs_wrapping = (
         self.post_is_classifier
         and not sklearn.base.is_classifier(compiled_node)
     )
     replacement = self.wrapped_post if needs_wrapping else self.post
     return pt.shallow_copy(replacement)
def test_tree_to_pipeline():
    """Check that pipeline -> tree -> pipeline -> tree is a fixed point.

    Every pipeline in ``data.pipelines`` is converted to a tree, compiled
    back to a pipeline, and converted to a tree again; the hashable JSON
    of the first and second tree must be equal.
    """
    first_trees = [pt.to_tree(p) for p in tqdm.tqdm(data.pipelines)]
    rebuilt_pipelines = [
        pt.to_pipeline(t) for t in tqdm.tqdm(first_trees)
    ]
    second_trees = [pt.to_tree(p) for p in rebuilt_pipelines]

    for before, after in zip(first_trees, second_trees):
        before_json = pt.to_hashable_json(before)
        after_json = pt.to_hashable_json(after)
        assert before_json == after_json, "Pipelines should match"
Example #7
0
    def __init__(self, edit):
        """Build the rule from an edit's pre (``arg1``) and post (``arg2``).

        The post is compiled once to find out whether it is a classifier.
        Only in that case is ``self.wrapped_post`` set, holding the first
        child of the tree built from the post wrapped in a
        ``StackingEstimator``.
        """
        self.pre = edit.arg1
        self.post = edit.arg2

        post_estimator = pt.to_pipeline(self.post)
        is_clf = sklearn.base.is_classifier(post_estimator)
        self.post_is_classifier = is_clf

        if is_clf:
            stacked_tree = pt.to_tree(StackingEstimator(post_estimator))
            # keep only the node under the root
            self.wrapped_post = stacked_tree.children[0]

        self._info = ComponentRule.info_from_node(self.pre)
Example #8
0
    def __init__(self, edit):
        """Build the insertion rule from an edit.

        The pre side is the parent node (``edit.pre_parent``) and the post
        side is the inserted node (``edit.arg2``). Child count, child
        labels and the label set of the pre node are cached. The post is
        compiled once to find out whether it is a classifier; only in that
        case is ``self.wrapped_post`` set, holding the first child of the
        tree built from the post wrapped in a ``StackingEstimator``.
        """
        # the "pre" of an insertion is the parent being inserted into
        self.pre = edit.pre_parent
        self.post = edit.arg2

        # cached facts about the pre node's children
        pre_children = self.pre.children
        self.pre_children_n = len(pre_children)
        self.pre_children_labels = [child.label for child in pre_children]
        self.pre_children_label_set = set(self.pre_children_labels)

        post_estimator = pt.to_pipeline(self.post)
        is_clf = sklearn.base.is_classifier(post_estimator)
        self.post_is_classifier = is_clf

        if is_clf:
            stacked_tree = pt.to_tree(StackingEstimator(post_estimator))
            # drop the 'root' node; only the classifier's node is wanted
            self.wrapped_post = stacked_tree.children[0]

        self._info = ComponentRule.info_from_node(self.pre)
 def enumerate(self, pipeline, rules_per_node, X=None, y=None):
     """Yield compiled pipelines for the variants of ``pipeline``.

     The pipeline is converted to a tree, annotated, and expanded with
     ``self.recursive_enumerate``. Candidates that are ``None`` or that
     fail to compile are skipped. When ``X`` is given, each compiled
     candidate is also fitted on ``(X, y)`` and skipped if fitting fails.

     Args:
         pipeline: object accepted by ``pt.to_tree``.
         rules_per_node: forwarded to ``self.recursive_enumerate``.
         X: optional data; when not None, candidates are fitted on it.
         y: optional targets passed to ``fit`` together with ``X``.

     Yields:
         Compiled pipelines (fitted when ``X`` is not None).
     """
     tree = pt.to_tree(pipeline)
     tree.annotate()
     for candidate in self.recursive_enumerate(tree, rules_per_node):
         if candidate is None:
             # managed to delete all nodes (or critical nodes)
             continue
         try:
             compiled = pt.to_pipeline(candidate)
             if X is not None:
                 compiled.fit(X, y)
         except Exception:
             # candidate does not compile or cannot be fitted
             continue
         # The yield is deliberately outside the try block: a handler
         # wrapped around it would swallow GeneratorExit (raised by
         # generator.close()) and exceptions thrown in by the consumer,
         # and the generator would wrongly keep producing values.
         yield compiled
def test_tree_deletion():
    """Deleting every hyperparameter node must yield the default estimator.

    Each hyperparameter child of the component node is deleted in turn;
    after each deletion the child must be gone and the child count must
    have dropped by one. The stripped tree, recompiled and converted back
    to a tree, must equal the tree of a default ``LogisticRegression``.
    """
    l1_clf = sklearn.linear_model.LogisticRegression(penalty="l1")
    tr = pt.to_tree(l1_clf)

    # removing all hyperparameters is equivalent to falling back to the
    # defaults once the tree is compiled into a pipeline again
    component = tr.children[0]
    hypers = list(component.children)
    remaining = len(hypers)
    for hyper in hypers:
        component.delete_child(hyper)
        remaining -= 1
        with pytest.raises(ValueError):
            component.children.index(hyper)
        assert remaining == len(component.children), "deletion failed"
    assert remaining == 0, "deleting all children failed"

    check = pt.to_hashable_json(pt.to_tree(pt.to_pipeline(tr)))
    default_clf = sklearn.linear_model.LogisticRegression()
    answer = pt.to_hashable_json(pt.to_tree(default_clf))
    assert check == answer, "deleting hyperparams didn't yield defaults"
def run_single_tree(
    X_search,
    y_search,
    X_test,
    y_test,
    test_pipeline_tree,
    enumerator,
    bound_num_repaired_pipelines,
    dev_cv=3,
    bound_k=3,
    cv=5,
    scoring="f1_macro",
    random_state=42,
):
    """Repair one pipeline tree and evaluate original vs. repaired.

    The tree is compiled, a ``PipelineRepairer`` searches for a repair on
    the first 10% of the search data, and both the original and (if one
    was found) the repaired pipeline are cross-validated on the test data
    under a timeout.

    Args:
        X_search, y_search: search data; DataFrames/Series are converted
            to their underlying arrays before slicing.
        X_test, y_test: data used for the final cross-validation.
        test_pipeline_tree: tree to compile with ``pt.to_pipeline``.
        enumerator: passed to ``PipelineRepairer``.
        bound_num_repaired_pipelines: passed to ``repair`` as
            ``bound_num_repairs``.
        dev_cv: number of folds used during the repair search.
        bound_k: passed to ``repair`` as ``bound_k``.
        cv: number of folds for the final evaluation.
        scoring: scoring name used for search and evaluation.
        random_state: seed for ``utils.set_seed`` and the CV splitters.

    Returns:
        pandas.DataFrame with one row of type "orig" and, when a repair
        was found, a second row of type "repair".
    """
    repairer = PipelineRepairer(enumerator)

    results_summary = []
    orig_info = {
        "type": "orig",
        "graph": test_pipeline_tree,
    }

    orig_compiled = pt.to_pipeline(test_pipeline_tree)

    # TODO: this should be a param
    # should be about 5% of dataset, since search is 50%
    num_obs_search = int(X_search.shape[0] * 0.1)
    assert num_obs_search >= 1
    if isinstance(X_search, pd.DataFrame):
        X_search = X_search.values
    if isinstance(y_search, (pd.DataFrame, pd.Series)):
        y_search = y_search.values

    X_search = X_search[:num_obs_search]
    y_search = y_search[:num_obs_search]

    utils.set_seed(random_state)
    repaired = repairer.repair(
        orig_compiled,
        X_search,
        y_search,
        bound_k=bound_k,
        bound_num_repairs=bound_num_repaired_pipelines,
        scoring=scoring,
        cv=dev_cv,
        random_state=random_state,
        verbosity=1,
    )

    try:
        print("Evaluate original")
        utils.set_seed(random_state)
        orig_results = mp_utils.run(
            DEFAULT_TIMEOUT_EVAL,
            cross_validate,
            orig_compiled,
            X_test,
            y_test,
            cv=StratifiedKFold(
                cv,
                random_state=random_state,
                shuffle=True,
            ),
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )
        orig_info["test_scores"] = orig_results["test_score"]
        orig_info["mean_test_score"] = np.mean(orig_results["test_score"])
        orig_info["failed"] = False
        orig_info["timedout"] = False
    except mp_utils.TimeoutError:
        print("Timedout on original pipeline")
        orig_info["failed"] = True
        orig_info["timedout"] = True
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan
    except Exception as err:
        print("Failed to run original pipeline")
        print(err)
        orig_info["failed"] = True
        orig_info["timedout"] = False
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan

    if repaired is None:
        print("No repair found")
        orig_info["no_repaired_candidates"] = True
        results_summary.append(orig_info)
        return pd.DataFrame(results_summary)
    else:
        orig_info["no_repaired_candidates"] = False

    results_summary.append(orig_info)

    repair_info = {
        "type": "repair",
        "graph": pt.to_tree(repaired),
        "no_repaired_candidates": False,
    }
    try:
        print("Evaluate repaired")
        utils.set_seed(random_state)
        repaired_results = mp_utils.run(
            DEFAULT_TIMEOUT_EVAL,
            cross_validate,
            repaired,
            X_test,
            y_test,
            cv=StratifiedKFold(
                cv,
                random_state=random_state,
                shuffle=True,
            ),
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )
        repair_info["test_scores"] = repaired_results["test_score"]
        repair_info["mean_test_score"] = np.mean(
            repaired_results["test_score"])
        repair_info["failed"] = False
        repair_info["timedout"] = False
    except mp_utils.TimeoutError:
        print("Timedout on repair pipeline")
        # record the timeout on the repair row; writing it into orig_info
        # would overwrite the original pipeline's own results and leave
        # the repair row without these fields
        repair_info["failed"] = True
        repair_info["timedout"] = True
        repair_info["test_scores"] = []
        repair_info["mean_test_score"] = np.nan
    except Exception as err:
        print("Failed to run repaired pipeline")
        print(err)
        repair_info["test_scores"] = []
        repair_info["mean_test_score"] = np.nan
        repair_info["failed"] = True
        repair_info["timedout"] = False

    # orig_info is already in results_summary; the dict is shared, so the
    # key added here still shows up in the resulting frame
    repair_info["repairer_statistics"] = repairer.statistics
    orig_info["repairer_statistics"] = None
    results_summary.append(repair_info)
    return pd.DataFrame(results_summary)
Example #12
0
    def repair(
        self,
        pipeline,
        X,
        y,
        bound_k=3,
        bound_num_repairs=10,
        scoring="f1_macro",
        cv=3,
        random_state=42,
        verbosity=0,
    ):
        """Search for the best-scoring variant of ``pipeline``.

        Candidate trees are pulled from ``self.enumerator.enumerate`` and
        each new, compilable candidate is cross-validated on ``(X, y)``
        under a per-candidate timeout. The search stops once
        ``bound_num_repairs`` candidates have been evaluated successfully,
        the enumerator is exhausted, or the time budget
        (``self.total_time``, decremented by wall-clock seconds per
        iteration) runs out.

        Args:
            pipeline: object accepted by ``pt.to_tree``; its tree is never
                evaluated as a candidate.
            X, y: data passed to ``cross_validate``.
            bound_k: forwarded to the enumerator.
            bound_num_repairs: maximum number of successfully evaluated
                candidates.
            scoring: scoring argument for ``cross_validate``.
            cv: number of stratified, shuffled folds.
            random_state: seed for the fold splitter.
            verbosity: when > 0, the remaining time is written to stdout.

        Returns:
            The compiled candidate with the highest mean test score, or
            ``None`` if no candidate was evaluated successfully.

        Side effects:
            Resets ``self.debug_data`` and appends every successfully
            evaluated candidate to it; records successes and failures in
            ``self.statistics``.
        """
        self.debug_data = []
        cv_splitter = StratifiedKFold(
            cv,
            random_state=random_state,
            shuffle=True,
        )

        orig_tree = pt.to_tree(pipeline)
        # hashes of trees that must not be evaluated (again); seeded with
        # the original so it is never returned as its own repair
        tried = set([])
        tried.add(pt.to_hashable_json(orig_tree))

        repair_ct = 0
        best_candidate = None
        best_score = None
        pbar = tqdm.tqdm(total=bound_num_repairs)

        time_budget_left = self.total_time
        tree_gen = self.enumerator.enumerate(orig_tree, bound_k)

        # attempted pipelines before one executes without failure
        # NOTE(review): ct_tries is never reset after a success, so it is
        # a running total of failures -- confirm record_success expects it
        ct_tries = 0
        time_last_pipeline = time.time()

        while repair_ct < bound_num_repairs and time_budget_left > 0:
            if verbosity > 0:
                sys.stdout.write(
                    "Time left:{:.2f}(s)\r".format(time_budget_left))
                sys.stdout.flush()
            iter_start_time = time.time()

            # generate candidate trees in separate thread
            # so we can time out
            with stopit.ThreadingTimeout(time_budget_left) as ctx:
                try:
                    candidate_tree = next(tree_gen)
                except StopIteration:
                    # enumerator exhausted: nothing left to try
                    time_budget_left = 0
                    break

            # candidate_tree may be unset/stale after a timeout, so this
            # check has to come before any use of it
            if ctx.state == ctx.TIMED_OUT:
                time_budget_left = 0
                break

            if candidate_tree is None:
                time_budget_left -= (time.time() - iter_start_time)
                continue

            candidate_hash = pt.to_hashable_json(candidate_tree)

            if candidate_hash in tried:
                time_budget_left -= (time.time() - iter_start_time)
                continue

            try:
                compiled_candidate = pt.to_pipeline(candidate_tree)
            except Exception as err:
                # a candidate that does not compile counts as a failed try
                # but is not added to ``tried``
                ct_tries += 1
                self.statistics.record_failure(err)
                time_budget_left -= (time.time() - iter_start_time)
                continue

            tried.add(candidate_hash)
            try:
                repair_results = mp_utils.run(
                    DEFAULT_TIMEOUT_PER_REPAIR,
                    cross_validate,
                    compiled_candidate,
                    X,
                    y,
                    cv=cv_splitter,
                    scoring=scoring,
                    return_estimator=True,
                    return_train_score=True,
                )
                self.debug_data.append(compiled_candidate)
                # time elapsed since the previous successful candidate
                # (or since the search started)
                now = time.time()
                time_taken = now - time_last_pipeline
                time_last_pipeline = now
                self.statistics.record_success(ct_tries, time_taken)
                time_budget_left -= (time.time() - iter_start_time)
            # based on
            # https://github.com/josepablocam/ams/blob/master/core/search.py#L274
            except (
                    TimeoutError,
                    ValueError,
                    TypeError,
                    ZeroDivisionError,
                    IndexError,
                    AttributeError,
                    MemoryError,
                    ImportError,
                    mp_utils.TimeoutError,
                    mp_utils.mp.pool.MaybeEncodingError,
            ) as err:
                # only these evaluation failures are tolerated; any other
                # exception type propagates to the caller
                ct_tries += 1
                self.statistics.record_failure(err)
                time_budget_left -= (time.time() - iter_start_time)
                continue

            repair_ct += 1
            candidate_score = np.mean(repair_results["test_score"])
            # strict ">" keeps the earliest candidate on ties
            if best_candidate is None or candidate_score > best_score:
                best_candidate = compiled_candidate
                best_score = candidate_score
            pbar.update(1)
        pbar.close()
        return best_candidate