Example #1
 def load_most_recent(self, name, specification):
     if self.diff_namer is not None:
         specification_id = self.diff_namer.get_name(specification)
     else:
         specification_id = specification_hash(specification)
     location = get_partial_save_directory(name, specification, self.diff_namer)
     checkpoints = self._get_time_sorted_checkpoints(name, specification)
     if not checkpoints:
         logging.getLogger("smallab.{id}.checkpoint".format(id=specification_id)).info("No checkpoints available")
         return
     able_to_load_checkpoint = False
     checkpoints = reversed(checkpoints)
     used_checkpoint = None
     for checkpoint in checkpoints:
         try:
             with open(os.path.join(location, str(checkpoint) + ".pkl"), "rb") as f:
                 partial_experiment = dill.load(f)
                 able_to_load_checkpoint = True
                 used_checkpoint = checkpoint
                 break
         except Exception:
             logging.getLogger("smallab.{id}.checkpoint".format(id=specification_id)).warning(
                 "Unable to load checkpoint {chp}".format(chp=checkpoint), exc_info=True)
     if not able_to_load_checkpoint:
         logging.getLogger("smallab.{id}.checkpoint".format(id=specification_id)).warning(
             "All checkpoints corrupt".format(id=specification_id))
         return
     else:
         logging.getLogger("smallab.{id}.checkpoint".format(id=specification_id)).info(
             "Successfully loaded checkpoint {chp}".format(chp=used_checkpoint))
     return partial_experiment
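
Examples #1, #2, and #9 all call a _get_time_sorted_checkpoints helper that this listing never shows. Below is a minimal sketch of what it plausibly does, assuming (per Examples #2 and #9) that checkpoint files are named str(datetime.datetime.now()) + ".pkl"; it is a hypothetical reconstruction, not the library's actual code.

import datetime
import os

def _get_time_sorted_checkpoints(self, name, specification):
    # Hypothetical sketch: parse each checkpoint filename back into a
    # datetime and sort chronologically.
    location = get_partial_save_directory(name, specification, self.diff_namer)
    checkpoints = []
    for filename in os.listdir(location):
        stem, ext = os.path.splitext(filename)
        if ext == ".pkl":
            # str(datetime.datetime.now()) round-trips through
            # fromisoformat (Python 3.7+).
            checkpoints.append(datetime.datetime.fromisoformat(stem))
    # Oldest first: Example #2 deletes checkpoints[0] to roll backups,
    # and Example #1 reverses the list to try the newest first.
    return sorted(checkpoints)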
Example #2
    def _save_checkpoint(self, save_data, name, specification):
        if isinstance(save_data, tuple):
            experiment = save_data[0]
        else:
            experiment = save_data
        assert isinstance(experiment, HasCheckpoint)

        # Note: get_steps_since_checkpiont (sic) preserves the spelling the library uses.
        experiment.set_steps_since_checkpoint(experiment.get_steps_since_checkpiont() + 1)
        if experiment.get_steps_since_checkpiont() >= experiment.steps_before_checkpoint():
            experiment.set_steps_since_checkpoint(0)
            if self.diff_namer is not None:
                experiment_hash = self.diff_namer.get_name(specification)
            else:
                experiment_hash = specification_hash(specification)
            checkpoint_name = str(datetime.datetime.now())
            try:
                location = get_partial_save_directory(name, specification, self.diff_namer)
                os.makedirs(location, exist_ok=True)
                # TODO make sure a checkpoint with this name doesn't already exist
                with open(os.path.join(location, checkpoint_name + ".pkl"), "wb") as f:
                    dill.dump(save_data, f)
                logging.getLogger("smallab.{id}.checkpoint".format(id=experiment_hash)).info(
                    "Succesfully checkpointed {chp}".format(chp=checkpoint_name))
                checkpoints = os.listdir(location)
                if len(checkpoints) > self.rolled_backups:
                    checkpoints = self._get_time_sorted_checkpoints(name, specification)
                    os.remove(os.path.join(location, str(checkpoints[0]) + ".pkl"))
            except Exception:
                logging.getLogger("smallab.{id}.checkpoint".format(id=experiment_hash)).warning(
                    "Unsuccessful checkpoint {chp}".format(chp=checkpoint_name),
                    exc_info=True)
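
Example #2 asserts isinstance(experiment, HasCheckpoint) and calls three of its methods. A hypothetical sketch of the interface those calls imply (the real smallab class may differ; the getter's spelling follows its call sites above):

class HasCheckpoint:
    # Hypothetical reconstruction from the calls in Example #2.
    def __init__(self):
        self._steps_since_checkpoint = 0

    def set_steps_since_checkpoint(self, steps):
        self._steps_since_checkpoint = steps

    def get_steps_since_checkpiont(self):  # (sic) spelled as called above
        return self._steps_since_checkpoint

    def steps_before_checkpoint(self):
        # Number of _save_checkpoint calls to accumulate before writing to disk.
        return 1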
Example #3
 def publish_progress(self, specification, result):
     if isinstance(result, tuple):
         if self.diff_namer is not None:
             name = self.diff_namer.get_name(specification)
         else:
             name = specification_hash(specification)
         put_in_event_queue(self.eventQueue, ProgressEvent(name, result[0], result[1]))
Example #4
def get_partial_save_directory(name, specification, diff_namer):
    if diff_namer is not None:
        specification_name = diff_namer.get_name(specification)
    else:
        specification_name = specification_hash(specification)

    return os.path.join(get_save_directory(name), "checkpoints",
                        specification_name)
Example #5
def get_save_file_directory(name: typing.AnyStr,
                            specification: Specification) -> typing.AnyStr:
    """
    Get the folder to save the .pkl file and specification.json file under
    :param name: The name of the current batch
    :param specification: The specification of the current run
    :return: The location where specification.json should be saved
    """
    return os.path.join(get_experiment_save_directory(name),
                        specification_hash(specification))
Example #6
 def __gen_name(self, keys, specification):
     name = []
     for key in keys:
         name.append(str(key) + ":" + str(specification[key]))
     name = "_".join(name)
     if len(name) >= 250:
         # ~250 characters is the per-component filename limit on Windows and
         # most Unix filesystems; this check ignores the full path length.
         return specification_hash(specification)
     else:
         return name
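
A quick illustration of the rule __gen_name implements, using hypothetical keys and values:

keys = ["lr", "batch_size"]
specification = {"lr": 0.1, "batch_size": 32}
name = "_".join(str(key) + ":" + str(specification[key]) for key in keys)
assert name == "lr:0.1_batch_size:32"
# Names of 250+ characters instead fall back to specification_hash(specification).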
Example #7
def get_specification_local_storage(name,
                                    specification,
                                    diff_namer,
                                    extended_keys=False):
    if diff_namer is not None:
        if extended_keys:
            expr_name = diff_namer.get_extended_name(specification)
        else:
            expr_name = diff_namer.get_name(specification)
    else:
        expr_name = specification_hash(specification)
    return os.path.join(get_experiment_local_storage(name), expr_name)
Example #8
def get_save_file_directory(name: typing.AnyStr,
                            specification: Specification,
                            diff_namer,
                            extended_keys=False) -> typing.AnyStr:
    """
    Get the folder to save the .pkl file and specification.json file under
    :param name: The name of the current batch
    :param specification: The specification of the current run
    :return: The location where specification.json should be saved
    """

    if diff_namer is not None:
        if extended_keys:
            expr_name = diff_namer.get_extended_name(specification)
        else:
            expr_name = diff_namer.get_name(specification)
    else:
        expr_name = specification_hash(specification)
    return os.path.join(get_experiment_save_directory(name), expr_name)
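
Several helpers above branch on a diff_namer. A hypothetical sketch of the interface implied by the call sites in Examples #3, #7, #8, and run_and_save in Example #12 (the real smallab DiffNamer may differ; __gen_name in Example #6 looks like part of its implementation):

class DiffNamer:
    # Hypothetical reconstruction from the call sites in this listing.
    def get_name(self, specification):
        # A short readable name built from the keys that vary across the
        # batch (cf. __gen_name in Example #6), falling back to a hash.
        raise NotImplementedError

    def get_extended_name(self, specification):
        # Like get_name, but including keys registered via extend_name().
        raise NotImplementedError

    def extend_name(self, specification):
        # Register extra keys, used for generator results in Example #12,
        # which then saves with extended_keys=True.
        raise NotImplementedError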
Example #9
 def __save_checkpoint(self, experiment, name, specification):
     experiment_hash = specification_hash(specification)
     checkpoint_name = str(datetime.datetime.now())
     try:
         location = get_partial_save_directory(name, specification)
         os.makedirs(location, exist_ok=True)
         #TODO make sure a checkpoint with this name doesn't already exist
         with open(os.path.join(location, checkpoint_name + ".pkl"), "wb") as f:
             dill.dump(experiment, f)
         logging.getLogger("smallab.{id}.checkpoint".format(id=experiment_hash)).info(
             "Succesfully checkpointed {chp}".format(chp=checkpoint_name))
         checkpoints = os.listdir(location)
         if len(checkpoints) > self.rolled_backups:
             checkpoints = self._get_time_sorted_checkpoints(name, specification)
             os.remove(os.path.join(location, str(checkpoints[0]) + ".pkl"))
     except Exception:
         logging.getLogger("smallab.{id}.checkpoint".format(id=experiment_hash)).warning(
             "Unsuccessful checkpoint {chp}".format(chp=checkpoint_name),
             exc_info=True)
Example #10
    def __run_and_save(self, name, experiment, specification,
                       propagate_exceptions):
        experiment = deepcopy(experiment)
        specification_id = specification_hash(specification)
        logger_name = "smallab.{specification_id}".format(
            specification_id=specification_id)
        logger = logging.getLogger(logger_name)
        logger.setLevel(logging.DEBUG)
        file_handler = logging.FileHandler(
            get_log_file(experiment, specification_id))
        formatter = logging.Formatter(
            "%(asctime)s [%(levelname)-5.5s]  %(message)s")
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

        experiment.set_logger_name(logger_name)

        def _interior_fn():
            if isinstance(experiment, CheckpointedExperiment):
                result = CheckpointedExperimentHandler().run(
                    experiment, name, specification)
            else:
                result = experiment.main(specification)
            self._save_run(name, experiment, specification, result)
            for callback in self.callbacks:
                callback.on_specification_complete(specification, result)
            return None

        if not propagate_exceptions:
            try:
                _interior_fn()
            except Exception as e:
                logger.error("Specification Failure", exc_info=True)
                for callback in self.callbacks:
                    callback.on_specification_failure(e, specification)
                return e
        else:
            _interior_fn()
            return None
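
A detail worth noting in the setup above: the per-specification logger is named "smallab.<id>", making it a child of the root "smallab" logger that Example #11 configures, so its records also propagate to the shared handlers. A standalone illustration:

import logging

root = logging.getLogger("smallab")
child = logging.getLogger("smallab.abc123")  # "abc123" is a made-up specification id
assert child.parent is root  # child records propagate to root's handlers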
Example #11
    def run(self, name: typing.AnyStr, specifications: typing.List[Specification], experiment: ExperimentBase,
            continue_from_last_run=True, propagate_exceptions=False,
            force_pickle=False, specification_runner: SimpleAbstractRunner = MultiprocessingRunner(),
            use_dashboard=True, context_type="fork", multiprocessing_lib=None, save_every_k=None) -> typing.NoReturn:

        """
        The method called to run an experiment
        :param propagate_exceptions: If True, exceptions won't be caught and logged as failed experiments but will cause the program to crash (like normal), useful for debugging exeperiments
        :param name: The name of this experiment batch
        :param specifications: The list of specifications to run. Should be a list of dictionaries. Each dictionary is passed to the experiment run method
        :param experiment: The experiment object to run
        :param continue_from_last_run: If true, will not redo already completed experiments. Defaults to true
        :param show_progress: Whether or not to show a progress bar for experiment completion
        :param force_pickle: If true, don't attempt to json serialze results and default to pickling
        :param specification_runner: An instance of ```AbstractRunner``` that will be used to run the specification
        :param use_dashboard: If true, use the terminal monitoring dashboard. If false, just stream logs to stdout.
        :return: No return
        """
        if multiprocessing_lib is None:
            import multiprocessing as mp
        else:
            mp = multiprocessing_lib
        if specification_runner is None:
            specification_runner = JoblibRunner(None)
        ctx = mp.get_context(context_type)
        specification_runner.set_multiprocessing_context(ctx)
        dashboard_process = None
        try:
            manager = ctx.Manager()
            eventQueue = manager.Queue(maxsize=2000)
            put_in_event_queue(eventQueue, StartExperimentEvent(name))
            # Set up root smallab logger
            folder_loc = os.path.join("experiment_runs", name, "logs", str(datetime.datetime.now()))
            file_loc = os.path.join(folder_loc, "main.log")
            if not os.path.exists(folder_loc):
                os.makedirs(folder_loc)
            logger = logging.getLogger("smallab")
            logger.setLevel(logging.DEBUG)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            # Can't do this with non-fork multiprocessing
            if context_type == "fork":
                fh = logging.FileHandler(file_loc)
                fh.setFormatter(formatter)
                logger.addHandler(fh)
            if not use_dashboard:
                sh = logging.StreamHandler()
                sh.setFormatter(formatter)
                logger.addHandler(sh)
            else:
                dashboard_process = ctx.Process(target=start_dashboard, args=(eventQueue,))
                dashboard_process.start()
            experiment.set_logging_folder(folder_loc)

            self.force_pickle = force_pickle
            if not os.path.exists(get_save_directory(name)):
                os.makedirs(get_save_directory(name))

            if continue_from_last_run:
                need_to_run_specifications = self._find_uncompleted_specifications(name, specifications)
            else:
                need_to_run_specifications = specifications
            for callback in self.callbacks:
                callback.set_experiment_name(name)

            for specification in need_to_run_specifications:
                put_in_event_queue(eventQueue, RegisterEvent(specification_hash(specification), specification))
            if isinstance(specification_runner, SimpleAbstractRunner):
                specification_runner.run(need_to_run_specifications,
                                         lambda specification: run_and_save(name, experiment, specification,
                                                                            propagate_exceptions, self.callbacks,
                                                                            self.force_pickle, eventQueue))
            elif isinstance(specification_runner, ComplexAbstractRunner):
                specification_runner.run(need_to_run_specifications, name, experiment, propagate_exceptions,
                                         self.callbacks, self.force_pickle, eventQueue)

            self._write_to_completed_json(name, specification_runner.get_completed(),
                                          specification_runner.get_failed_specifications())

            # Call batch complete functions
            if specification_runner.get_exceptions():
                for callback in self.callbacks:
                    callback.on_batch_failure(specification_runner.get_exceptions(),
                                              specification_runner.get_failed_specifications())

            if specification_runner.get_completed():
                for callback in self.callbacks:
                    callback.on_batch_complete(specification_runner.get_completed())
        finally:
            if dashboard_process is not None:
                dashboard_process.terminate()
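
As a usage sketch for the run method above (hypothetical: MyExperiment stands in for an ExperimentBase subclass, and ExperimentRunner for the class this method belongs to):

specifications = [{"lr": lr, "seed": seed}
                  for lr in (0.1, 0.01)
                  for seed in (0, 1, 2)]

runner = ExperimentRunner()  # assumed owner of run(); adjust to your setup
runner.run("lr_sweep",
           specifications,
           MyExperiment(),
           continue_from_last_run=True,   # skip already-completed specs
           propagate_exceptions=False,    # log failures instead of crashing
           use_dashboard=False)           # plain stdout logging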
Example #12
def run_and_save(name, experiment, specification, propagate_exceptions,
                 callbacks, force_pickle, eventQueue, diff_namer):
    experiment = deepcopy(experiment)
    if diff_namer is None:
        specification_id = specification_hash(specification)
    else:
        specification_id = diff_namer.get_name(specification)
    logger_name = "smallab.{specification_id}".format(
        specification_id=specification_id)
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(
        get_log_file(experiment, specification_id))
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    fq = LogToEventQueue(eventQueue)
    sh = logging.StreamHandler(fq)
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    #TODO need to attach eventqueue logger handler here and not at base logger

    experiment.set_logger_name(logger_name)
    experiment.set_experiment_local_storage(get_experiment_local_storage(name))
    experiment.set_specification_local_storage(
        get_specification_local_storage(name, specification, diff_namer))
    put_in_event_queue(eventQueue, BeginEvent(specification_id))

    def _interior_fn():
        result = run_with_correct_handler(experiment,
                                          name,
                                          specification,
                                          eventQueue,
                                          diff_namer=diff_namer)
        if isinstance(result, types.GeneratorType):
            for cur_result in result:
                if diff_namer is not None:
                    diff_namer.extend_name(cur_result["specification"])
                save_run(name,
                         experiment,
                         cur_result["specification"],
                         cur_result["result"],
                         force_pickle,
                         diff_namer=diff_namer,
                         extended_keys=True)
        else:
            save_run(name,
                     experiment,
                     specification,
                     result,
                     force_pickle,
                     diff_namer=diff_namer)
        for callback in callbacks:
            callback.on_specification_complete(specification, result)
        return None

    if not propagate_exceptions:
        try:
            _interior_fn()

            put_in_event_queue(eventQueue, CompleteEvent(specification_id))
        except Exception as e:
            logger.error("Specification Failure", exc_info=True)
            put_in_event_queue(eventQueue, FailedEvent(specification_id))
            for callback in callbacks:
                callback.on_specification_failure(e, specification)
            return e
    else:
        _interior_fn()
        put_in_event_queue(eventQueue, CompleteEvent(specification_id))
        return None
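
run_and_save handles either a plain result or a generator of dicts with "specification" and "result" keys. A hypothetical experiment main showing the generator shape it consumes:

def generator_main(specification):
    # Each yielded item carries its own (possibly extended) specification,
    # which Example #12 then saves with extended_keys=True.
    for fold in range(3):
        extended = dict(specification, fold=fold)
        yield {"specification": extended, "result": {"fold": fold, "accuracy": 0.9}}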
Example #13
def get_partial_save_directory(name, specification):
    return os.path.join(get_save_directory(name), "checkpoints",
                        specification_hash(specification))
Example #14
 def publish_progress(self, specification, result):
     if isinstance(result, tuple):
         put_in_event_queue(
             self.eventQueue,
             ProgressEvent(specification_hash(specification), result[0],
                           result[1]))