Example 1
        def copy_to_search_trace(job, trace_entry):
            trace_entry = copy.deepcopy(trace_entry)
            for key in trace_keys:
                # Process deprecated options to some extent. Support key renames, but
                # not value renames.
                actual_key = {key: None}
                _process_deprecated_options(actual_key)
                if len(actual_key) > 1:
                    raise KeyError(
                        f"{key} is deprecated but cannot be handled automatically"
                    )
                actual_key = next(iter(actual_key.keys()))
                value = train_job_config.get(actual_key)
                trace_entry[key] = value

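            # augment the copied entry with search-level metadata: the trial's
            # folder name, the optimized metric, and the parent search job id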
            trace_entry["folder"] = os.path.split(train_job_config.folder)[1]
            metric_value = Trace.get_metric(trace_entry, metric_name)
            trace_entry["metric_name"] = metric_name
            trace_entry["metric_value"] = metric_value
            trace_entry["parent_job_id"] = search_job.job_id
            search_job.config.trace(**trace_entry)
            valid_trace.append(trace_entry)
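
The loop above assumes a specific contract for _process_deprecated_options: it rewrites the dict in place, so a deprecated key that maps to a single new key is simply renamed, while anything more complex (e.g., one old option expanding into several new ones) grows the dict beyond one entry and triggers the KeyError. A minimal sketch of that contract, with a hypothetical rename table (not LibKGE's actual deprecation list):

def _process_deprecated_options(options):
    # hypothetical rename table; for illustration only
    RENAMED_KEYS = {"train.lr": "train.optimizer_args.lr"}
    for old_key in list(options.keys()):
        if old_key in RENAMED_KEYS:
            options[RENAMED_KEYS[old_key]] = options.pop(old_key)
    return options

# a pure key rename leaves exactly one entry, so the lookup above succeeds
options = {"train.lr": None}
_process_deprecated_options(options)
assert next(iter(options.keys())) == "train.optimizer_args.lr"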
Example 2
    def run(self):
        self.init_search()

        # let's go
        trial_no = 0
        while trial_no < self.num_trials:
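            # force a garbage collection pass between trials to reclaim memory
            # held over from completed trial jobs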
            gc.collect()
            self.config.log(
                "Registering trial {}/{}...".format(trial_no, self.num_trials - 1)
            )

            # determine next trial
            if trial_no >= len(self.parameters):
                # create a new trial
                parameters, trial_id = self.register_trial()
                if trial_id is None:
                    self.config.log(
                        "Cannot generate trial parameters. Will try again after a "
                        + "running trial has completed."
                    )
                else:
                    # remember the trial
                    self.trial_ids.append(trial_id)
                    self.parameters.append(parameters)
                    self.results.append(None)
                    self.config.log(
                        "Created trial {:05d} with parameters: {}".format(
                            trial_no, parameters
                        )
                    )
            else:
                # use the trial of a resumed run of this job
                parameters, trial_id = self.register_trial(self.parameters[trial_no])
                self.trial_ids.append(trial_id)
                self.config.log(
                    "Resumed trial {:05d} with parameters: {}".format(
                        trial_no, parameters
                    )
                )

            if trial_id is None:
                # couldn't generate a new trial since data is lacking; so wait for data
                self.wait_task()
            elif self.results[trial_no] is not None:
                # trial result is in checkpoint, use it (from prior run of this job)
                self.config.log(
                    "Registering trial {:05d} result: {}".format(
                        trial_no, self.results[trial_no]
                    )
                )
                self.register_trial_result(
                    self.trial_ids[trial_no],
                    self.parameters[trial_no],
                    self.results[trial_no],
                )
            else:  # trial_id is valid, but no result yet
                # create/resume job for trial
                folder = "{:05d}".format(trial_no)
                config = self.config.clone(folder)
                config.set("job.type", "train")
                config.set_all(_process_deprecated_options(copy.deepcopy(parameters)))
                config.init_folder()

                # save checkpoint here so that trial is not lost
                # TODO make atomic (may corrupt good checkpoint when canceled!)
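                # (see the atomic-save sketch after this listing)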
                self.save(self.config.checkpoint_file(1))

                # run or schedule the trial
                self.submit_task(
                    kge.job.search._run_train_job,
                    (self, trial_no, config, self.num_trials, list(parameters.keys())),
                )

            # on last iteration, wait for all running trials to complete
            if trial_id is not None and trial_no == self.num_trials - 1:
                self.wait_task(return_when=concurrent.futures.ALL_COMPLETED)

            # for each ready trial, store its results
            for ready_trial_no, ready_trial_best, _ in self.ready_task_results:
                if ready_trial_best is not None:
                    self.config.log(
                        "Registering trial {:05d} result: {}".format(
                            ready_trial_no, ready_trial_best["metric_value"]
                        )
                    )
                else:
                    # TODO: currently cannot distinguish failed trials from trials that
                    # haven't been run to completion. Both will have their entry in
                    # self.results set to None
                    self.config.log(
                        "Registering failed trial {:05d}".format(ready_trial_no)
                    )
                self.results[ready_trial_no] = ready_trial_best
                self.register_trial_result(
                    self.trial_ids[ready_trial_no],
                    self.parameters[ready_trial_no],
                    ready_trial_best,
                )

                # save checkpoint
                # TODO make atomic (may corrupt good checkpoint when canceled!)
                self.save(self.config.checkpoint_file(1))

            # clean up
            self.ready_task_results.clear()
            if trial_id is not None:
                # advance to next trial (unless we did not run this one)
                trial_no += 1

        # all done, output failed trials result
        failed_trials = [i for i in range(len(self.results)) if self.results[i] is None]
        self.config.log(
            "{} trials were successful, {} trials failed".format(
                len(self.results) - len(failed_trials), len(failed_trials)
            )
        )
        if len(failed_trials) > 0:
            self.config.log(
                "Failed trials: {}".format(
                    " ".join(["{:05d}".format(x) for x in failed_trials])
                )
            )

        # and best trial
        if len(failed_trials) != len(self.results):
            trial_metric_values = [
                float("-Inf") if result is None else result["metric_value"]
                for result in self.results
            ]
            best_trial_index = trial_metric_values.index(max(trial_metric_values))
            metric_name = self.results[best_trial_index]["metric_name"]
            self.config.log(
                "Best trial ({:05d}): {}={}".format(
                    best_trial_index, metric_name, trial_metric_values[best_trial_index]
                )
            )

            self.trace(
                event="search_completed",
                echo=True,
                echo_prefix="  ",
                log=True,
                scope="search",
                **self.results[best_trial_index]
            )
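
run() relies on a submit/wait task interface (submit_task, wait_task, ready_task_results) provided by the surrounding search job. A minimal sketch of the semantics assumed above, built on concurrent.futures (an illustration, not LibKGE's actual implementation):

import concurrent.futures

class TaskPoolSketch:
    # sketch of the task interface used by run(); assumed semantics only
    def __init__(self, max_workers=2):
        self._executor = concurrent.futures.ThreadPoolExecutor(max_workers)
        self._running = set()
        self.ready_task_results = []

    def submit_task(self, task, args):
        # schedule a task; run() passes a function plus its argument tuple
        self._running.add(self._executor.submit(task, *args))

    def wait_task(self, return_when=concurrent.futures.FIRST_COMPLETED):
        # block until at least one task finishes (or all, for ALL_COMPLETED),
        # then move the finished tasks' results into ready_task_results
        done, self._running = concurrent.futures.wait(
            self._running, return_when=return_when
        )
        self.ready_task_results.extend(f.result() for f in done)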
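
Both checkpoint saves above carry a TODO about atomicity. A standard fix is to write to a temporary file in the same directory and then os.replace() it over the target, since os.replace is atomic on both POSIX and Windows. A minimal sketch (save_atomic and the pickled payload are hypothetical, not LibKGE code):

import os
import pickle
import tempfile

def save_atomic(obj, path):
    # write to a temp file in the target directory, then atomically swap it
    # over the destination; a cancelled run leaves the old checkpoint intact
    directory = os.path.dirname(os.path.abspath(path))
    fd, tmp_path = tempfile.mkstemp(dir=directory)
    try:
        with os.fdopen(fd, "wb") as f:
            pickle.dump(obj, f)
        os.replace(tmp_path, path)
    except BaseException:
        os.remove(tmp_path)
        raise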