def enumerate(self, pipeline, k):
    tree = pt.to_tree(pipeline)
    tree.annotate()
    new_trees, lineage = self.derive_variant_trees(
        tree,
        k,
        past_rules=set(),
    )
    already_handled = set()
    while len(new_trees) > 0:
        head_tree = new_trees[0]
        new_trees = new_trees[1:]
        head_lineage = lineage[0]
        lineage = lineage[1:]

        if head_tree is None:
            # bogus pipeline: it had only one component
            # and we deleted it, so it became None
            continue

        h = pt.to_hashable_json(head_tree)
        if h in already_handled:
            continue
        already_handled.add(h)

        try:
            pt.to_pipeline(head_tree)
        except Exception as err:
            # don't add it back to new_trees: the tree fails to
            # compile, and that is unlikely to be fixed by
            # further downstream edits
            self.statistics.record_failure(err)
            continue

        self.statistics.record(head_tree, head_lineage)
        yield head_tree

        rec_new_trees, rec_lineage = self.derive_variant_trees(
            head_tree,
            k,
            past_rules=set(head_lineage),
        )
        if len(rec_new_trees) > 0:
            new_trees.extend(rec_new_trees)
            lineage.extend(rec_lineage)
            # this tree is productive, so put it back
            # into the queue for later use (if possible)
            new_trees.append(head_tree)
            lineage.append(head_lineage)
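# Usage sketch (not part of the original module): how a caller might
# drive the enumerator above. The enumerator instance is assumed to be
# provided by the caller, and the sklearn pipeline construction is
# illustrative; only `.enumerate` and `pt.to_pipeline` come from the
# code in this section.
def example_enumerate_variants(enumerator, k=3):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    pipeline = Pipeline([
        ("scale", StandardScaler()),
        ("clf", LogisticRegression()),
    ])
    # trees yielded by .enumerate are already known to compile
    for variant_tree in enumerator.enumerate(pipeline, k):
        print(pt.to_pipeline(variant_tree))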
def can_build_rule(edit):
    base_cond = (
        is_update_edit(edit)
        and pt.is_component_node(edit.arg1)
        and pt.is_component_node(edit.arg2)
        and edit.arg1.parent is not None
    )
    if not base_cond:
        return False
    # the post component should also compile on its own
    try:
        pt.to_pipeline(edit.arg2)
        return True
    except Exception:
        return False
def apply(self, node, seed=None):
    if seed is not None:
        np.random.seed(seed)
    # find children of node whose label also appeared as a
    # child of the pre node
    candidate_locations = []
    for ix, c in enumerate(node.children):
        if c.label in self.pre_children_label_set:
            candidate_locations.append(ix)
    if len(candidate_locations) == 0:
        # only happens if we apply without first calling
        # .can_apply, i.e. the caller is forcing application,
        # so fall back to allowing any location
        candidate_locations = np.arange(0, len(node.children))
    # pick a location at random
    target_ix = np.random.choice(candidate_locations, 1)[0]
    # randomly decide whether to insert before or after that
    # location: add 0 to insert before, or 1 to insert after
    target_ix = target_ix + np.random.choice([0, 1], 1)[0]

    node = pt.shallow_copy(node)
    n_children = len(node.children)
    post = pt.shallow_copy(self.post)

    if target_ix < n_children:
        # the new component will *not* be at the end of the
        # pipeline, so if it's a classifier we need to wrap it
        # in a StackingEstimator before inserting
        if self.post_is_classifier:
            post = pt.shallow_copy(self.wrapped_post)
    else:
        # at the end of the pipeline
        if not self.post_is_classifier:
            # can't have a non-classifier at the end of the
            # pipeline, so shift the insertion point back by one
            target_ix -= 1
        else:
            # the post is a classifier, so the existing
            # classifier needs to be wrapped in a StackingEstimator,
            # otherwise the pipeline is invalid
            existing_clf_node = node.children[-1]
            compiled_clf = pt.to_pipeline(existing_clf_node)
            has_classifier = sklearn.base.is_classifier(compiled_clf)
            # should always be true given the .can_apply condition
            assert has_classifier
            # we want to insert a new classifier at the end,
            # so we wrap the existing classifier
            wrapped_clf = pt.to_tree(StackingEstimator(compiled_clf))
            wrapped_clf = wrapped_clf.children[0]
            # replace the raw clf with the new wrapped clf
            node.replace_child(n_children - 1, wrapped_clf)
    node.insert_child(target_ix, post)
    return node
def can_apply(self, node):
    # a component insertion can only be applied to a
    # "combinator" node, so it must be a Pipeline-style type
    # TODO: extend with other combinator types here if necessary
    if not pt.is_composition_node(node):
        return False
    # node must be a well-formed pipeline, i.e. there must be
    # a classifier at the end
    if len(node.children) == 0:
        return False
    try:
        compiled_possible_clf = pt.to_pipeline(node.children[-1])
        if not sklearn.base.is_classifier(compiled_possible_clf):
            return False
    except Exception:
        # if we can't compile it, we can't insert appropriately
        return False
    # we apply the rule by inserting into the node's children,
    # so check that at least one child matches what we observed
    # in the pre node's children
    return any(
        c.label in self.pre_children_label_set for c in node.children
    )
def apply(self, node):
    compiled_node = pt.to_pipeline(node)
    # if the replacement is a classifier but the node being
    # replaced is not, use the StackingEstimator-wrapped version
    # so the resulting pipeline stays valid
    if self.post_is_classifier and not sklearn.base.is_classifier(
            compiled_node):
        post = self.wrapped_post
    else:
        post = self.post
    return pt.shallow_copy(post)
def test_tree_to_pipeline():
    orig_pipelines = data.pipelines
    trees = [pt.to_tree(p) for p in tqdm.tqdm(orig_pipelines)]
    regen_pipelines = [pt.to_pipeline(t) for t in tqdm.tqdm(trees)]
    regen_trees = [pt.to_tree(p) for p in regen_pipelines]
    for t1, t2 in zip(trees, regen_trees):
        j1 = pt.to_hashable_json(t1)
        j2 = pt.to_hashable_json(t2)
        assert j1 == j2, "Pipelines should match"
def __init__(self, edit):
    self.pre, self.post = edit.arg1, edit.arg2
    compiled_post = pt.to_pipeline(self.post)
    self.post_is_classifier = sklearn.base.is_classifier(compiled_post)
    if self.post_is_classifier:
        wrapped_post = pt.to_tree(StackingEstimator(compiled_post))
        self.wrapped_post = wrapped_post.children[0]
    self._info = ComponentRule.info_from_node(self.pre)
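# Usage sketch (assumptions labeled, not from the original code): how
# an update edit becomes a replacement rule. `ComponentUpdate` is a
# hypothetical name for the class whose __init__ appears above;
# `can_build_rule` is the guard defined earlier in this section.
def example_build_update_rule(edit, old_component_node):
    if not can_build_rule(edit):
        return None
    rule = ComponentUpdate(edit)  # hypothetical class name
    # apply returns a copy of the post component, wrapped in a
    # StackingEstimator when needed (see the apply above)
    return rule.apply(old_component_node)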
def __init__(self, edit):
    # the "pre" is really the parent node
    self.pre = edit.pre_parent
    self.post = edit.arg2
    # some re-used info about the pre node's children
    self.pre_children_n = len(self.pre.children)
    self.pre_children_labels = [c.label for c in self.pre.children]
    self.pre_children_label_set = set(self.pre_children_labels)
    compiled_post = pt.to_pipeline(self.post)
    self.post_is_classifier = sklearn.base.is_classifier(compiled_post)
    if self.post_is_classifier:
        wrapped_post = pt.to_tree(StackingEstimator(compiled_post))
        # remove the 'root' node, we just want the one for the clf
        self.wrapped_post = wrapped_post.children[0]
    self._info = ComponentRule.info_from_node(self.pre)
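# Usage sketch (assumptions labeled): building and applying an insert
# rule. `ComponentInsert` is a hypothetical name for the class whose
# __init__ appears above; `edit.pre_parent` and `edit.arg2` follow the
# fields that __init__ reads.
def example_apply_insert_rule(edit, pipeline_node, seed=42):
    rule = ComponentInsert(edit)  # hypothetical class name
    if not rule.can_apply(pipeline_node):
        return None
    # returns a shallow-copied variant with the post component inserted
    return rule.apply(pipeline_node, seed=seed)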
def enumerate(self, pipeline, rules_per_node, X=None, y=None):
    tree = pt.to_tree(pipeline)
    tree.annotate()
    for candidate in self.recursive_enumerate(tree, rules_per_node):
        if candidate is None:
            # managed to delete all nodes (or critical nodes)
            continue
        try:
            compiled = pt.to_pipeline(candidate)
            if X is None:
                yield compiled
            else:
                try:
                    # optionally filter to candidates that can
                    # actually fit on the provided data
                    compiled.fit(X, y)
                    yield compiled
                except Exception:
                    continue
        except Exception:
            continue
def test_tree_deletion():
    clf = sklearn.linear_model.LogisticRegression(penalty="l1")
    tr = pt.to_tree(clf)
    # delete all the hyperparameters --> equivalent to setting
    # defaults when we recompile into a pipeline
    hypers = list(tr.children[0].children)
    target_node = tr.children[0]
    n = len(hypers)
    for h in hypers:
        target_node.delete_child(h)
        with pytest.raises(ValueError):
            target_node.children.index(h)
        assert (n - 1) == len(target_node.children), "deletion failed"
        n -= 1
    assert n == 0, "deleting all children failed"

    check = pt.to_hashable_json(pt.to_tree(pt.to_pipeline(tr)))
    clf2 = sklearn.linear_model.LogisticRegression()
    answer = pt.to_hashable_json(pt.to_tree(clf2))
    assert check == answer, "deleting hyperparams didn't yield defaults"
def run_single_tree(
        X_search,
        y_search,
        X_test,
        y_test,
        test_pipeline_tree,
        enumerator,
        bound_num_repaired_pipelines,
        dev_cv=3,
        bound_k=3,
        cv=5,
        scoring="f1_macro",
        random_state=42,
):
    repairer = PipelineRepairer(enumerator)

    results_summary = []
    orig_info = {
        "type": "orig",
        "graph": test_pipeline_tree,
    }
    orig_compiled = pt.to_pipeline(test_pipeline_tree)

    # TODO: this should be a param
    # about 5% of the dataset, since the search split is 50%
    num_obs_search = int(X_search.shape[0] * 0.1)
    assert num_obs_search >= 1

    if isinstance(X_search, pd.DataFrame):
        X_search = X_search.values
    if isinstance(y_search, (pd.DataFrame, pd.Series)):
        y_search = y_search.values

    X_search = X_search[:num_obs_search]
    y_search = y_search[:num_obs_search]

    utils.set_seed(random_state)
    repaired = repairer.repair(
        orig_compiled,
        X_search,
        y_search,
        bound_k=bound_k,
        bound_num_repairs=bound_num_repaired_pipelines,
        scoring=scoring,
        cv=dev_cv,
        random_state=random_state,
        verbosity=1,
    )

    try:
        print("Evaluate original")
        utils.set_seed(random_state)
        orig_results = mp_utils.run(
            DEFAULT_TIMEOUT_EVAL,
            cross_validate,
            orig_compiled,
            X_test,
            y_test,
            cv=StratifiedKFold(
                cv,
                random_state=random_state,
                shuffle=True,
            ),
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )
        orig_info["test_scores"] = orig_results["test_score"]
        orig_info["mean_test_score"] = np.mean(orig_results["test_score"])
        orig_info["failed"] = False
        orig_info["timedout"] = False
    except mp_utils.TimeoutError:
        print("Timed out on original pipeline")
        orig_info["failed"] = True
        orig_info["timedout"] = True
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan
    except Exception as err:
        print("Failed to run original pipeline")
        print(err)
        orig_info["failed"] = True
        orig_info["timedout"] = False
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan

    if repaired is None:
        print("No repair found")
        orig_info["no_repaired_candidates"] = True
        results_summary.append(orig_info)
        return pd.DataFrame(results_summary)
    else:
        orig_info["no_repaired_candidates"] = False
        results_summary.append(orig_info)

    repair_info = {
        "type": "repair",
        "graph": pt.to_tree(repaired),
        "no_repaired_candidates": False,
    }
    try:
        print("Evaluate repaired")
        utils.set_seed(random_state)
        repaired_results = mp_utils.run(
            DEFAULT_TIMEOUT_EVAL,
            cross_validate,
            repaired,
            X_test,
            y_test,
            cv=StratifiedKFold(
                cv,
                random_state=random_state,
                shuffle=True,
            ),
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )
        repair_info["test_scores"] = repaired_results["test_score"]
        repair_info["mean_test_score"] = np.mean(
            repaired_results["test_score"])
        repair_info["failed"] = False
        repair_info["timedout"] = False
    except mp_utils.TimeoutError:
        print("Timed out on repaired pipeline")
        # record the timeout on the repaired pipeline's entry,
        # not the original's
        repair_info["failed"] = True
        repair_info["timedout"] = True
        repair_info["test_scores"] = []
        repair_info["mean_test_score"] = np.nan
    except Exception as err:
        print("Failed to run repaired pipeline")
        print(err)
        repair_info["test_scores"] = []
        repair_info["mean_test_score"] = np.nan
        repair_info["failed"] = True
        repair_info["timedout"] = False

    repair_info["repairer_statistics"] = repairer.statistics
    orig_info["repairer_statistics"] = None
    results_summary.append(repair_info)
    return pd.DataFrame(results_summary)
def repair(
        self,
        pipeline,
        X,
        y,
        bound_k=3,
        bound_num_repairs=10,
        scoring="f1_macro",
        cv=3,
        random_state=42,
        verbosity=0,
):
    self.debug_data = []
    cv_splitter = StratifiedKFold(
        cv,
        random_state=random_state,
        shuffle=True,
    )

    orig_tree = pt.to_tree(pipeline)
    tried = set()
    tried.add(pt.to_hashable_json(orig_tree))

    repair_ct = 0
    best_candidate = None
    best_score = None

    pbar = tqdm.tqdm(total=bound_num_repairs)
    time_budget_left = self.total_time
    tree_gen = self.enumerator.enumerate(orig_tree, bound_k)

    # attempted pipelines before one executes without failure
    ct_tries = 0
    time_last_pipeline = time.time()

    while repair_ct < bound_num_repairs and time_budget_left > 0:
        if verbosity > 0:
            sys.stdout.write(
                "Time left:{:.2f}(s)\r".format(time_budget_left))
            sys.stdout.flush()
        iter_start_time = time.time()

        # generate candidate trees in a separate thread
        # so we can time out
        with stopit.ThreadingTimeout(time_budget_left) as ctx:
            try:
                candidate_tree = next(tree_gen)
            except StopIteration:
                time_budget_left = 0
                break
        if ctx.state == ctx.TIMED_OUT:
            time_budget_left = 0
            break

        if candidate_tree is None:
            time_budget_left -= (time.time() - iter_start_time)
            continue

        candidate_hash = pt.to_hashable_json(candidate_tree)
        if candidate_hash in tried:
            time_budget_left -= (time.time() - iter_start_time)
            continue

        try:
            compiled_candidate = pt.to_pipeline(candidate_tree)
        except Exception as err:
            ct_tries += 1
            self.statistics.record_failure(err)
            time_budget_left -= (time.time() - iter_start_time)
            continue

        tried.add(candidate_hash)
        try:
            repair_results = mp_utils.run(
                DEFAULT_TIMEOUT_PER_REPAIR,
                cross_validate,
                compiled_candidate,
                X,
                y,
                cv=cv_splitter,
                scoring=scoring,
                return_estimator=True,
                return_train_score=True,
            )
            self.debug_data.append(compiled_candidate)
            now = time.time()
            time_taken = now - time_last_pipeline
            time_last_pipeline = now
            self.statistics.record_success(ct_tries, time_taken)
            time_budget_left -= (time.time() - iter_start_time)
        # based on
        # https://github.com/josepablocam/ams/blob/master/core/search.py#L274
        except (
                TimeoutError,
                ValueError,
                TypeError,
                ZeroDivisionError,
                IndexError,
                AttributeError,
                MemoryError,
                ImportError,
                mp_utils.TimeoutError,
                mp_utils.mp.pool.MaybeEncodingError,
        ) as err:
            ct_tries += 1
            self.statistics.record_failure(err)
            time_budget_left -= (time.time() - iter_start_time)
            continue

        repair_ct += 1
        candidate_score = np.mean(repair_results["test_score"])
        if best_candidate is None or candidate_score > best_score:
            best_candidate = compiled_candidate
            best_score = candidate_score
        pbar.update(1)

    pbar.close()
    return best_candidate
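# End-to-end usage sketch: repairing a failing pipeline and refitting
# the best candidate. `PipelineRepairer` and the keyword arguments
# mirror the code above; the data (X, y) and the enumerator are assumed
# to be provided by the caller.
def example_repair(broken_pipeline, enumerator, X, y):
    repairer = PipelineRepairer(enumerator)
    best = repairer.repair(
        broken_pipeline,
        X,
        y,
        bound_k=3,
        bound_num_repairs=10,
        scoring="f1_macro",
        cv=3,
        random_state=42,
        verbosity=1,
    )
    # repair returns None when no candidate executed successfully
    if best is not None:
        best.fit(X, y)
    return best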