def run(self, trial: Trial) -> TrialReport:  # pragma: no cover
    judge = _NonIterativeJudgeWrapper(self._budget)
    base_fs = FileSystem()
    fs = base_fs.makedirs(self._checkpoint_path, recreate=True)
    self._func = self._func.copy()
    self._func.run(trial, judge=judge, checkpoint_basedir_fs=fs)
    return judge.report
def _compute(
    self,
    df: Iterable[Dict[str, Any]],
    entrypoint: Callable[[str, Dict[str, Any]], Any],
) -> Iterable[Dict[str, Any]]:
    fs = FileSystem()
    ck_fs = fs.makedirs(self._checkpoint_path, recreate=True)
    for row in df:
        for trial in _get_trials_from_row(row):
            rjudge = RemoteTrialJudge(entrypoint)
            self._objective.copy().run(trial, rjudge, ck_fs)
            if rjudge.report is not None:
                yield rjudge.report.fill_dict(dict(row))
def optimize_by_continuous_asha(
    objective: Any,
    dataset: TuneDataset,
    plan: List[Tuple[float, int]],
    checkpoint_path: str = "",
    always_checkpoint: bool = False,
    study_early_stop: Optional[Callable[[List[Any], List[RungHeap]], bool]] = None,
    trial_early_stop: Optional[
        Callable[[TrialReport, List[TrialReport], List[RungHeap]], bool]
    ] = None,
    monitor: Any = None,
) -> StudyResult:
    _objective = TUNE_OBJECT_FACTORY.make_iterative_objective(objective)
    _monitor = TUNE_OBJECT_FACTORY.make_monitor(monitor)
    checkpoint_path = TUNE_OBJECT_FACTORY.get_path_or_temp(checkpoint_path)
    judge = ASHAJudge(
        schedule=plan,
        always_checkpoint=always_checkpoint,
        study_early_stop=study_early_stop,
        trial_early_stop=trial_early_stop,
        monitor=_monitor,
    )
    path = os.path.join(checkpoint_path, str(uuid4()))
    FileSystem().makedirs(path, recreate=True)
    study = IterativeStudy(_objective, checkpoint_path=path)
    return study.optimize(dataset, judge=judge)
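# A hedged usage sketch (not part of the original module): the objective and dataset stay
# as parameters because constructing them is outside this snippet. The plan lists ASHA
# rungs as (budget, capacity) pairs: run every trial for 1 budget unit keeping the best 4,
# then 2 more units keeping the best 2, then 4 more units keeping only the best one. The
# checkpoint path below is a hypothetical placeholder.
def _example_continuous_asha(objective: Any, dataset: TuneDataset) -> StudyResult:
    return optimize_by_continuous_asha(
        objective,
        dataset,
        plan=[(1.0, 4), (2.0, 2), (4.0, 1)],
        checkpoint_path="/tmp/tune_checkpoints",
        always_checkpoint=False,
    )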
def _sk_cv(
    _sk__model: str,
    _sk__train_df: pd.DataFrame,
    _sk__scoring: Any,
    _sk__cv: int = 5,
    _sk__feature_prefix: str = "",
    _sk__label_col: str = "label",
    _sk__save_path: str = "",
    **kwargs: Any,
) -> Dict[str, Any]:
    model = _to_model(_sk__model)(**kwargs)
    train_df = _sk__train_df.sample(frac=1, random_state=0).reset_index(drop=True)
    train_x = train_df.drop([_sk__label_col], axis=1)
    cols = [x for x in train_x.columns if x.startswith(_sk__feature_prefix)]
    train_x = train_x[cols]
    train_y = train_df[_sk__label_col]
    s = cross_val_score(model, train_x, train_y, cv=_sk__cv, scoring=_sk__scoring)
    metadata = dict(sk_model=_sk__model, cv_scores=[float(x) for x in s])
    if _sk__save_path != "":
        model.fit(train_x, train_y)
        fp = os.path.join(_sk__save_path, str(uuid4()) + ".pkl")
        with FileSystem().openbin(fp, mode="wb") as f:
            pickle.dump(model, f)
        metadata["model_path"] = fp
    return dict(
        error=-np.mean(s),
        hp=dict(_sk__model=_sk__model, **kwargs),
        metadata=metadata,
    )
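# A hedged usage sketch (not from the original module): shows the call shape of _sk_cv on a
# tiny in-memory frame. The model string is assumed to resolve through _to_model; extra
# kwargs (here C) are forwarded to the model constructor, and the returned dict carries the
# negated mean CV score under "error".
def _example_sk_cv_usage() -> Dict[str, Any]:
    demo = pd.DataFrame(
        {
            "f_a": [0, 1, 0, 1, 0, 1, 0, 1],
            "f_b": [1, 2, 3, 4, 5, 6, 7, 8],
            "label": [0, 1, 0, 1, 0, 1, 0, 1],
        }
    )
    return _sk_cv(
        "sklearn.linear_model.LogisticRegression",
        demo,
        _sk__scoring="accuracy",
        _sk__cv=2,
        _sk__feature_prefix="f_",
        C=1.0,  # forwarded to the model constructor via **kwargs
    )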
def run(self, trial: Trial) -> TrialReport:
    params = dict(trial.params)
    if trial.trial_id != self._last_id:
        # a new trial id means new data: rebuild the model type and reload the
        # train/validation frames
        self._model_type = to_sk_model(params.pop(SPACE_MODEL_NAME))
        self._model_expr = to_sk_model_expr(self._model_type)
        self._train_x, self._train_y = self._reset_xy(
            trial.dfs[TUNE_DATASET_DF_DEFAULT_NAME]
        )
        self._test_x, self._test_y = self._reset_xy(
            trial.dfs[TUNE_DATASET_VALIDATION_DF_DEFAULT_NAME]
        )
        self._last_id = trial.trial_id
    else:
        # same trial id: keep the cached data, just drop the model name from params
        params.pop(SPACE_MODEL_NAME)
    model = self._model_type(**params).fit(self._train_x, self._train_y)
    metric = get_scorer(self._scoring)(model, self._test_x, self._test_y)
    metadata = dict(model=self._model_expr)
    if self._checkpoint_path is not None:
        fp = os.path.join(self._checkpoint_path, str(uuid4()) + ".pkl")
        with FileSystem().openbin(fp, mode="wb") as f:
            pickle.dump(model, f)
        metadata["checkpoint_path"] = fp
    return TrialReport(
        trial,
        metric=metric,
        metadata=metadata,
        sort_metric=self.generate_sort_metric(metric),
    )
def run(self, trial: Trial) -> TrialReport:
    params = dict(trial.params)
    if trial.trial_id != self._last_id:
        # rebuild the model type and training data only when the trial id changes
        self._model_type = to_sk_model(params.pop(SPACE_MODEL_NAME))
        self._model_expr = to_sk_model_expr(self._model_type)
        self._train_x, self._train_y = self._reset_xy(
            trial.dfs[TUNE_DATASET_DF_DEFAULT_NAME]
        )
        self._last_id = trial.trial_id
    else:
        params.pop(SPACE_MODEL_NAME)
    model = self._model_type(**params)
    s = cross_val_score(
        model, self._train_x, self._train_y, cv=self._cv, scoring=self._scoring
    )
    metadata = dict(model=self._model_expr, cv_scores=[float(x) for x in s])
    if self._checkpoint_path is not None:
        model.fit(self._train_x, self._train_y)
        fp = os.path.join(self._checkpoint_path, str(uuid4()) + ".pkl")
        with FileSystem().openbin(fp, mode="wb") as f:
            pickle.dump(model, f)
        metadata["checkpoint_path"] = fp
    metric = float(np.mean(s))
    return TrialReport(
        trial,
        metric=metric,
        metadata=metadata,
        sort_metric=self.generate_sort_metric(metric),
    )
def test_checkpoint(tmpdir):
    fs = FileSystem().opendir(str(tmpdir))
    cp = Checkpoint(fs)
    assert 0 == len(cp)
    with raises(AssertionError):
        cp.latest
    try:
        for i in range(4):
            with cp.create() as sfs:
                sfs.writetext("a.txt", str(i))
                if i == 3:
                    raise Exception
    except Exception:
        pass
    assert 3 == len(cp)
    assert "2" == cp.latest.readtext("a.txt")
    files = fs.listdir(".")
    # the exception leaves a fourth, uncommitted checkpoint directory on disk
    assert 4 == len(files)

    cp2 = Checkpoint(fs)
    assert 3 == len(cp2)
    assert "2" == cp2.latest.readtext("a.txt")
def test_objective_func(tmpdir):
    fs = FileSystem().opendir(str(tmpdir))
    j = J([3, 3, 2])
    f = F().copy()
    t = Trial("abc", {"a": 1})
    f.run(t, judge=j, checkpoint_basedir_fs=fs)
    assert -10 == f.v
    f.run(t, judge=j, checkpoint_basedir_fs=fs)
    assert -10 == f.v
    assert 6.0 == j.report.metric
    assert -6.0 == j.report.sort_metric
    f.run(t, judge=j, checkpoint_basedir_fs=fs)
    assert -10 == f.v
    assert 8.0 == j.report.metric
    assert -8.0 == j.report.sort_metric
def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = None):
    if spark_session is None:
        spark_session = SparkSession.builder.getOrCreate()
    self._spark_session = spark_session
    cf = dict(FUGUE_SPARK_DEFAULT_CONF)
    cf.update({x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()})
    cf.update(ParamDict(conf))
    super().__init__(cf)
    self._fs = FileSystem()
    self._log = logging.getLogger()
    self._broadcast_func = RunOnce(
        self._broadcast, lambda *args, **kwargs: id(args[0])
    )
    self._persist_func = RunOnce(self._persist, lambda *args, **kwargs: id(args[0]))
    self._register_func = RunOnce(
        self._register, lambda *args, **kwargs: id(args[0])
    )
    self._io = SparkIO(self.spark_session, self.fs)
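# A hedged construction sketch (not from the original module), assuming this __init__
# belongs to fugue's SparkExecutionEngine: the engine wraps an existing SparkSession, and
# the merge order above means the conf dict passed here overrides both the fugue defaults
# and the session-level Spark conf. The conf key below is a hypothetical placeholder.
def _example_build_spark_engine() -> "SparkExecutionEngine":
    spark = SparkSession.builder.master("local[*]").getOrCreate()
    return SparkExecutionEngine(spark, conf={"fugue.dummy.conf": "1"})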
def test_serialize_df(tmpdir):
    def assert_eq(df, df_expected=None, raw=False):
        if df_expected is None:
            df_expected = df
        df_actual = deserialize_df(serialize_df(df))
        if raw:
            assert df_expected.native == df_actual.native
        else:
            df_eq(df_expected, df_actual, throw=True)

    fs = FileSystem()
    assert deserialize_df(serialize_df(None)) is None
    assert_eq(ArrayDataFrame([], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(ArrayDataFrame([[None, "abc"]], "a:int,b:str"))
    assert_eq(
        ArrayDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        raw=True,
    )
    assert_eq(
        IterableDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        ArrayDataFrame([[None, [1, 2], dict(x=1)]], "a:int,b:[int],c:{x:int}"),
        raw=True,
    )
    assert_eq(PandasDataFrame([[None, None]], "a:int,b:int"))
    assert_eq(PandasDataFrame([[None, "abc"]], "a:int,b:str"))

    raises(
        InvalidOperationError,
        lambda: serialize_df(ArrayDataFrame([], "a:int,b:int"), 0),
    )

    path = os.path.join(tmpdir, "1.pkl")
    df = ArrayDataFrame([[None, None]], "a:int,b:int")
    s = serialize_df(df, 0, path, fs)
    df_eq(df, deserialize_df(s, fs), throw=True)
    df_eq(df, deserialize_df(s), throw=True)

    s = serialize_df(df, 0, path)
    df_eq(df, deserialize_df(s), throw=True)

    raises(ValueError, lambda: deserialize_df('{"x":1}'))
def load_checkpoint(self, fs: FSBase, model: keras.models.Model) -> None:
    with tempfile.NamedTemporaryFile(suffix=".h5") as tf:
        local_fs = FileSystem()
        with fs.open("model.h5", "rb") as fin:
            local_fs.writefile(tf.name, fin)
        model.load_weights(tf.name)
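# A hypothetical mirror of load_checkpoint above (a sketch, not taken from the source):
# it writes the model weights to a temporary .h5 file and copies it into the given
# filesystem under the same "model.h5" name that load_checkpoint reads back.
def _example_save_checkpoint(fs: FSBase, model: keras.models.Model) -> None:
    with tempfile.NamedTemporaryFile(suffix=".h5") as tf:
        model.save_weights(tf.name)
        with open(tf.name, "rb") as fin:
            fs.writefile("model.h5", fin)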
def _sk_stack_cv(
    _sk__model: str,
    _sk__estimators: str,
    _sk__train_df: pd.DataFrame,
    _sk__scoring: Any,
    _sk__stack_cv: int = 2,
    _sk__method: str = "auto",
    _sk__passthrough: bool = False,
    _sk__cv: int = 5,
    _sk__feature_prefix: str = "",
    _sk__label_col: str = "label",
    _sk__save_path: str = "",
    **kwargs: Any,
) -> Dict[str, Any]:
    final_estimator = _to_model(_sk__model)(**kwargs)
    estimators: List[Tuple[str, Any]] = []
    for i, d in enumerate(json.loads(_sk__estimators)):
        key = f"_{i}"
        m = _to_model(d.pop("_sk__model"))
        estimators.append((key, m(**d)))
    if is_classifier(final_estimator):
        model = StackingClassifier(
            estimators,
            final_estimator,
            cv=_sk__stack_cv,
            stack_method=_sk__method,
            passthrough=_sk__passthrough,
            n_jobs=kwargs.get("n_jobs", 1),
        )
    else:
        model = StackingRegressor(
            estimators,
            final_estimator,
            cv=_sk__stack_cv,
            passthrough=_sk__passthrough,
            n_jobs=kwargs.get("n_jobs", 1),
        )
    train_df = _sk__train_df.sample(frac=1, random_state=0).reset_index(drop=True)
    train_x = train_df.drop([_sk__label_col], axis=1)
    cols = [x for x in train_x.columns if x.startswith(_sk__feature_prefix)]
    train_x = train_x[cols]
    train_y = train_df[_sk__label_col]
    s = cross_val_score(model, train_x, train_y, cv=_sk__cv, scoring=_sk__scoring)
    metadata = dict(
        sk_model=get_full_type_path(model), cv_scores=[float(x) for x in s]
    )
    if _sk__save_path != "":
        model.fit(train_x, train_y)
        fp = os.path.join(_sk__save_path, str(uuid4()) + ".pkl")
        with FileSystem().openbin(fp, mode="wb") as f:
            pickle.dump(model, f)
        metadata["model_path"] = fp
    return dict(
        error=-np.mean(s),
        hp=dict(
            _sk__model=get_full_type_path(model),
            _sk__estimators=dict(
                **{f"_{i}": d for i, d in enumerate(json.loads(_sk__estimators))},
                stacking=dict(_sk__model=_sk__model, **kwargs),
            ),
            _sk__stack_cv=_sk__stack_cv,
            _sk__method=_sk__method,
            _sk__passthrough=_sk__passthrough,
        ),
        metadata=metadata,
    )
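# A hedged usage sketch (not from the original module) showing the JSON layout _sk_stack_cv
# expects in _sk__estimators: a JSON list of dicts, each holding a base estimator's type
# path under "_sk__model" plus its constructor kwargs. Remaining keyword arguments go to
# the final estimator; all model paths are assumed to resolve through _to_model.
def _example_sk_stack_cv_usage() -> Dict[str, Any]:
    estimators_json = json.dumps(
        [
            {"_sk__model": "sklearn.tree.DecisionTreeClassifier", "max_depth": 2},
            {"_sk__model": "sklearn.linear_model.LogisticRegression", "C": 1.0},
        ]
    )
    demo = pd.DataFrame(
        {
            "f_a": [0, 1, 0, 1, 0, 1, 0, 1],
            "f_b": [1, 2, 3, 4, 5, 6, 7, 8],
            "label": [0, 1, 0, 1, 0, 1, 0, 1],
        }
    )
    return _sk_stack_cv(
        "sklearn.ensemble.RandomForestClassifier",  # final estimator
        estimators_json,
        demo,
        _sk__scoring="accuracy",
        _sk__stack_cv=2,
        _sk__cv=2,
        _sk__feature_prefix="f_",
        n_estimators=10,  # forwarded to the final estimator
    )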