Example 1
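Round-trip test: each pipeline in the corpus is converted to a tree, recompiled into a pipeline, and converted back; both trees must serialize to the same hashable JSON. Throughout these examples, pt refers to the pipeline-tree module providing to_tree, to_pipeline, and to_hashable_json, and data.pipelines is a corpus fixture.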
def test_tree_to_pipeline():
    orig_pipelines = data.pipelines
    trees = [pt.to_tree(p) for p in tqdm.tqdm(orig_pipelines)]
    regen_pipelines = [pt.to_pipeline(t) for t in tqdm.tqdm(trees)]
    regen_trees = [pt.to_tree(p) for p in regen_pipelines]

    for ix, (t1, t2) in enumerate(zip(trees, regen_trees)):
        j1 = pt.to_hashable_json(t1)
        j2 = pt.to_hashable_json(t2)
        assert j1 == j2, "Pipelines should match"
Example 2
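Tests subtree replacement: the estimator child of a LogisticRegression tree is swapped for the child of a LinearRegression tree, after which the two trees must serialize identically.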
def test_tree_replacement():
    clf = sklearn.linear_model.LogisticRegression(penalty="l1")
    tr = pt.to_tree(clf)

    clf2 = sklearn.linear_model.LinearRegression()
    tr2 = pt.to_tree(clf2)

    tr.replace_child(0, tr2.children[0])

    assert tr.children[0] == tr2.children[0], "replacement failed"
    assert pt.to_hashable_json(tr) == pt.to_hashable_json(
        tr2), "subtrees incorrect"
Example 3
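Tests child deletion: removing every hyperparameter node from a component is equivalent to keeping the defaults, so the recompiled pipeline must match a freshly constructed LogisticRegression.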
def test_tree_deletion():
    clf = sklearn.linear_model.LogisticRegression(penalty="l1")
    tr = pt.to_tree(clf)

    # delete all the hyperparameters --> equivalent to setting defaults
    # when we recompile into a pipeline
    hypers = list(tr.children[0].children)
    target_node = tr.children[0]
    n = len(hypers)
    for h in hypers:
        target_node.delete_child(h)
        with pytest.raises(ValueError):
            target_node.children.index(h)
        assert (n - 1) == len(target_node.children), "deletion failed"
        n -= 1
    assert n == 0, "deleting all children failed"

    check = pt.to_hashable_json(pt.to_tree(pt.to_pipeline(tr)))
    clf2 = sklearn.linear_model.LogisticRegression()
    answer = pt.to_hashable_json(pt.to_tree(clf2))
    assert check == answer, "deleting hyperparams didn't yield defaults"
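Example 4
The enumerator's core loop: a worklist of candidate trees produced by derive_variant_trees is processed in FIFO order, deduplicated by hashable JSON, and validated by compiling back to a pipeline; trees that yield further variants are re-queued for later expansion.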
    def enumerate(self, pipeline, k):
        tree = pt.to_tree(pipeline)
        tree.annotate()
        new_trees, lineage = self.derive_variant_trees(
            tree,
            k,
            past_rules=set(),
        )

        already_handled = set()

        while len(new_trees) > 0:
            # pop the head of the worklist together with its lineage entry
            head_tree = new_trees.pop(0)
            head_lineage = lineage.pop(0)
            if head_tree is None:
                # bogus pipeline: had only one component
                # and we deleted it so it becomes None
                continue

            h = pt.to_hashable_json(head_tree)
            if h in already_handled:
                continue
            already_handled.add(h)

            try:
                pt.to_pipeline(head_tree)
            except Exception as err:
                # don't derive more variants from this tree: it fails
                # to compile, and that is unlikely to be fixed by
                # downstream edits
                self.statistics.record_failure(err)
                continue

            self.statistics.record(head_tree, head_lineage)
            yield head_tree

            rec_new_trees, rec_lineage = self.derive_variant_trees(
                head_tree,
                k,
                past_rules=set(head_lineage),
            )

            if len(rec_new_trees) > 0:
                new_trees.extend(rec_new_trees)
                lineage.extend(rec_lineage)
                # this tree is productive, so put it back
                # into the queue for later use (if possible)
                new_trees.append(head_tree)
                lineage.append(head_lineage)
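Example 5
A command-line driver: it loads a rule corpus, enumerates variants of a seed pipeline, and prints the tree edit distance and the non-trivial edits between each new tree and the original.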
def main():
    args = get_args()
    with open(args.input, "rb") as fin:
        corpus = pickle.load(fin)

    pipeline = sklearn.pipeline.Pipeline([
        ("clf", sklearn.linear_model.LogisticRegression())
    ])

    rule_sampler = get_rule_sampler(args.rule_strategy, corpus,
                                    args.random_state)
    enumerator = get_tree_enumerator(args.enumeration_strategy, rule_sampler)
    orig_tree = pt.to_tree(pipeline)
    # store hashable JSON digests, since the membership test in the loop
    # below compares against pt.to_hashable_json output
    explored = set([pt.to_hashable_json(orig_tree)])

    ix = 0
    for p in enumerator.enumerate(pipeline, args.bound_k):
        if ix >= args.bound_num_pipelines:
            break
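        # enumerate() yields trees (see Example 4); pt.to_tree is
        # assumed to pass an already-converted tree through unchanged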
        new_tree = pt.to_tree(p)
        h = pt.to_hashable_json(new_tree)
        if h in explored:
            continue
        explored.add(h)

        print("New pipeline", ix)
        dist, edits = pt.tree_edit_distance(
            orig_tree,
            new_tree,
            return_operations=True,
        )
        print("Distance", dist)
        ct_edits = 0
        for edit in edits:
            if is_match_edit(edit):
                continue
            msg = "Edit: {} -> {}".format(get_safe_label(edit.arg1),
                                          get_safe_label(edit.arg2))
            print(msg)
            ct_edits += 1
        print(pt.to_json(new_tree))
        ix += 1
Example 6
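The repair loop: candidate trees are enumerated under a total time budget; duplicates and uncompilable candidates are skipped, survivors are cross-validated with a per-candidate timeout, and the highest-scoring compiled pipeline is returned.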
    def repair(
        self,
        pipeline,
        X,
        y,
        bound_k=3,
        bound_num_repairs=10,
        scoring="f1_macro",
        cv=3,
        random_state=42,
        verbosity=0,
    ):
        self.debug_data = []
        cv_splitter = StratifiedKFold(
            cv,
            random_state=random_state,
            shuffle=True,
        )

        orig_tree = pt.to_tree(pipeline)
        tried = {pt.to_hashable_json(orig_tree)}

        repair_ct = 0
        best_candidate = None
        best_score = None
        pbar = tqdm.tqdm(total=bound_num_repairs)

        time_budget_left = self.total_time
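        # note: enumerate() applies pt.to_tree to its argument (Example 4),
        # which is assumed to accept an existing tree unchanged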
        tree_gen = self.enumerator.enumerate(orig_tree, bound_k)

        # counts attempted pipelines before one executes without failure
        ct_tries = 0
        time_last_pipeline = time.time()

        while repair_ct < bound_num_repairs and time_budget_left > 0:
            if verbosity > 0:
                sys.stdout.write(
                    "Time left:{:.2f}(s)\r".format(time_budget_left))
                sys.stdout.flush()
            iter_start_time = time.time()

            # generate candidate trees in separate thread
            # so we can time out
            with stopit.ThreadingTimeout(time_budget_left) as ctx:
                try:
                    candidate_tree = next(tree_gen)
                except StopIteration:
                    time_budget_left = 0
                    break

            if ctx.state == ctx.TIMED_OUT:
                time_budget_left = 0
                break

            if candidate_tree is None:
                time_budget_left -= (time.time() - iter_start_time)
                continue

            candidate_hash = pt.to_hashable_json(candidate_tree)

            if candidate_hash in tried:
                time_budget_left -= (time.time() - iter_start_time)
                continue

            try:
                compiled_candidate = pt.to_pipeline(candidate_tree)
            except Exception as err:
                ct_tries += 1
                self.statistics.record_failure(err)
                time_budget_left -= (time.time() - iter_start_time)
                continue

            tried.add(candidate_hash)
            try:
                repair_results = mp_utils.run(
                    DEFAULT_TIMEOUT_PER_REPAIR,
                    cross_validate,
                    compiled_candidate,
                    X,
                    y,
                    cv=cv_splitter,
                    scoring=scoring,
                    return_estimator=True,
                    return_train_score=True,
                )
                self.debug_data.append(compiled_candidate)
                now = time.time()
                time_taken = now - time_last_pipeline
                time_last_pipeline = now
                self.statistics.record_success(ct_tries, time_taken)
                time_budget_left -= (time.time() - iter_start_time)
            # based on
            # https://github.com/josepablocam/ams/blob/master/core/search.py#L274
            except (
                    TimeoutError,
                    ValueError,
                    TypeError,
                    ZeroDivisionError,
                    IndexError,
                    AttributeError,
                    MemoryError,
                    ImportError,
                    mp_utils.TimeoutError,
                    mp_utils.mp.pool.MaybeEncodingError,
            ) as err:
                ct_tries += 1
                self.statistics.record_failure(err)
                time_budget_left -= (time.time() - iter_start_time)
                continue

            repair_ct += 1
            candidate_score = np.mean(repair_results["test_score"])
            if best_candidate is None or candidate_score > best_score:
                best_candidate = compiled_candidate
                best_score = candidate_score
            pbar.update(1)
        pbar.close()
        return best_candidate
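A minimal usage sketch for repair(), assuming a hypothetical PipelineRepairer host class that supplies the self.enumerator, self.statistics, and self.total_time attributes used above:

import sklearn.datasets
import sklearn.linear_model
import sklearn.pipeline

X, y = sklearn.datasets.load_iris(return_X_y=True)
seed = sklearn.pipeline.Pipeline([
    ("clf", sklearn.linear_model.LogisticRegression()),
])

# PipelineRepairer and its constructor arguments are hypothetical; any
# object providing an enumerator, a statistics recorder, and a total
# time budget (in seconds) would do.
repairer = PipelineRepairer(enumerator=enumerator, total_time=60)
best = repairer.repair(seed, X, y, bound_k=3, bound_num_repairs=5)
print(best)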