Example #1
0
def _pull_shared_params(proposal: Proposal, param_cache: ParamCache):
    """Look up the shared parameters that a proposal asks for.

    Returns ``None`` straight away, without querying ``param_cache``, when
    ``proposal.params_type`` is ``ParamsType.NONE``. Otherwise returns the
    result of ``param_cache.retrieve_params`` for that params type.
    """
    wants_no_params = proposal.params_type == ParamsType.NONE
    if wants_no_params:
        return None

    print('Retrieving shared params from cache...')
    return param_cache.retrieve_params(proposal.params_type)
Example #2
0
    def start(self):
        """Connect to the job's caches, announce the worker, then serve trials.

        Pulls job info through the monitor and builds the train and param
        caches for the sub train job. It then loops indefinitely: fetch a
        proposal, run it as a trial and submit the result if a proposal was
        available, sleep ``LOOP_SLEEP_SECS``, repeat. The loop has no exit
        condition, so this method only ends through an exception.
        """
        self._monitor.pull_job_info()

        # Both caches are built from the same (job ID, host, port) triple
        cache_args = (self._monitor.sub_train_job_id, self._redis_host,
                      self._redis_port)
        self._train_cache = TrainCache(*cache_args)
        self._param_cache = ParamCache(*cache_args)

        logger.info(
            f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        while True:
            next_proposal = self._fetch_proposal()
            if next_proposal is not None:
                self._submit_result(self._perform_trial(next_proposal))
            time.sleep(LOOP_SLEEP_SECS)
Example #3
0
    def start(self):
        """Connect to the job's caches, build the advisor and drive tuning.

        Pulls job info through the monitor, builds the train and param caches
        and the advisor, then alternates between fetching results and making
        proposals, sleeping ``LOOP_SLEEP_SECS`` between rounds. As soon as
        ``_make_proposals`` returns a falsy value, the budget-reached
        notification is sent and the method returns.
        """
        self._monitor.pull_job_info()

        # Both caches are built from the same (job ID, host, port) triple
        cache_args = (self._monitor.sub_train_job_id, self._redis_host,
                      self._redis_port)
        self._train_cache = TrainCache(*cache_args)
        self._param_cache = ParamCache(*cache_args)
        self._advisor = self._make_advisor()

        logger.info(
            f'Starting advisor for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        # Each round is "fetch results, then make proposals"; the sleep only
        # happens between rounds, never after the final one
        self._fetch_results()
        while self._make_proposals():
            time.sleep(LOOP_SLEEP_SECS)
            self._fetch_results()

        self._notify_budget_reached()
Example #4
0
def _save_model(model_inst: BaseModel, proposal: Proposal, result: TrialResult,
                param_cache: ParamCache, param_store: ParamStore):
    """Dump a trained model's parameters, then cache and/or persist them.

    Does nothing and returns ``None`` unless the proposal sets
    ``to_cache_params`` or ``to_save_params``. The parameters are dumped once;
    if requested they are first stored in the cache (together with the trial's
    score and the current time) and then saved to the param store.

    Returns the value of ``param_store.save`` when the parameters were saved,
    otherwise ``None``.
    """
    if not (proposal.to_cache_params or proposal.to_save_params):
        return None

    print('Dumping model parameters...')
    dumped = model_inst.dump_parameters()

    if proposal.to_cache_params:
        print('Storing shared params in cache...')
        param_cache.store_params(dumped,
                                 score=result.score,
                                 time=datetime.now())

    if not proposal.to_save_params:
        return None

    print('Saving shared params...')
    return param_store.save(dumped)
Example #5
0
    def start(self):
        """Set up the worker's caches, then either train once or serve trials.

        After pulling job info, creating the caches and announcing the worker,
        behaviour depends on the ``DIST_TRAIN_MODEL`` environment variable:

        * ``"DIST"``: one fixed proposal (trial number 1, no knobs) is loaded,
          trained without shared params and evaluated, skipping the advisor.
          The model is saved only when ``MASTER_ADDR`` equals ``"localhost"``;
          a missing ``MASTER_ADDR`` raises ``KeyError``.
        * anything else, or unset: loop indefinitely fetching proposals,
          performing trials and submitting results, sleeping
          ``LOOP_SLEEP_SECS`` between iterations.
        """
        self._monitor.pull_job_info()

        # Both caches are built from the same (job ID, host, port) triple
        cache_args = (self._monitor.sub_train_job_id, self._redis_host,
                      self._redis_port)
        self._train_cache = TrainCache(*cache_args)
        self._param_cache = ParamCache(*cache_args)

        logger.info(
            f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        # if it's distributed training, skip the advisor
        if os.environ.get("DIST_TRAIN_MODEL") == "DIST":
            print("Start dist training")
            logger.info("Start dist training")

            fixed_proposal = Proposal(trial_no=1, knobs={})
            model = self._load_model(fixed_proposal)
            self._train_model(model, fixed_proposal, None)
            eval_result = self._evaluate_model(model, fixed_proposal)

            # In the master process's container MASTER_ADDR is "localhost";
            # only the master saves the model
            if os.environ["MASTER_ADDR"] == "localhost":
                self._save_model(model, fixed_proposal, eval_result)
            return

        # training as usual
        while True:
            logger.info('Fetching proposal....')
            next_proposal = self._fetch_proposal()
            if next_proposal is not None:
                self._submit_result(self._perform_trial(next_proposal))
            time.sleep(LOOP_SLEEP_SECS)
Example #6
0
class TrainWorker:
    """Worker that runs trials for a sub train job.

    The worker polls the train cache for a proposal stored under its worker
    ID, runs the proposal as a trial (load, train, evaluate, save), records
    the trial's status through the sub train job monitor and posts the result
    back to the train cache.
    """

    def __init__(self, service_id, worker_id):
        """Create a worker; the caches are only connected once `start` runs.

        Reads ``REDIS_HOST`` and ``REDIS_PORT`` from the environment, so a
        ``KeyError`` is raised if either is missing.
        """
        self._worker_id = worker_id
        self._monitor: _SubTrainJobMonitor = _SubTrainJobMonitor(service_id)
        self._redis_host = os.environ['REDIS_HOST']
        self._redis_port = os.environ['REDIS_PORT']
        self._param_store: ParamStore = FileParamStore()
        self._trial_id = None  # ID of currently running trial
        self._train_cache: TrainCache = None
        self._param_cache: ParamCache = None
        self._trial_errors = 0  # Consecutive trial errors

    def start(self):
        """Connect to the caches, announce the worker and serve trials.

        Loops indefinitely: fetch a proposal, perform it as a trial and
        submit the result if one was available, then sleep
        ``LOOP_SLEEP_SECS``. Only ends through an exception, e.g. the one
        re-raised by `_perform_trial` after too many consecutive errors.
        """
        self._monitor.pull_job_info()
        self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)

        logger.info(
            f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        while True:
            proposal = self._fetch_proposal()
            if proposal is not None:
                result = self._perform_trial(proposal)
                self._submit_result(result)
            time.sleep(LOOP_SLEEP_SECS)

    def stop(self):
        """Deregister the worker and clean up on a best-effort basis.

        Errors while marking the running trial as errored or while tearing
        down the model class are logged and not propagated. Only `Exception`
        subclasses are suppressed, so `KeyboardInterrupt` and `SystemExit`
        still interrupt the shutdown.
        """
        self._notify_stop()

        # If worker is currently running a trial, mark it as errored
        try:
            if self._trial_id is not None:
                self._monitor.mark_trial_as_errored(self._trial_id)
        except Exception:
            logger.error('Error marking trial as errored:')
            logger.error(traceback.format_exc())

        # Run model class teardown
        try:
            self._monitor.model_class.teardown()
        except Exception:
            logger.error('Error tearing down model class:')
            logger.error(traceback.format_exc())

    def _notify_start(self):
        """Send the worker-started event and register the worker in the train cache."""
        superadmin_client().send_event(
            'train_job_worker_started',
            sub_train_job_id=self._monitor.sub_train_job_id)
        self._train_cache.add_worker(self._worker_id)

    def _fetch_proposal(self):
        """Return the proposal stored for this worker in the train cache.

        `start` treats a ``None`` return value as "nothing to do yet".
        """
        proposal = self._train_cache.get_proposal(self._worker_id)
        return proposal

    def _perform_trial(self, proposal: Proposal) -> TrialResult:
        """Run one trial for `proposal` and return its result.

        On success the trial is marked as completed and the consecutive-error
        counter is reset. On failure the trial is marked as errored and a
        score-less ``TrialResult(proposal)`` is returned, unless the number
        of consecutive failures now exceeds ``MAX_CONSEC_TRIAL_ERRORS``, in
        which case the exception is re-raised. Trial logging is detached and
        ``self._trial_id`` is cleared in every case.
        """
        self._trial_id = proposal.trial_id

        logger.info(
            f'Starting trial {self._trial_id} with proposal {proposal}...')

        # Bound before the `try` so the `finally` clause can tell whether
        # logging was actually set up; otherwise a failure during setup would
        # surface as an UnboundLocalError that hides the real error
        logger_info = None
        try:
            # Setup logging
            logger_info = self._start_logging_to_trial(
                lambda log_line, log_lvl: self._monitor.log_to_trial(
                    self._trial_id, log_line, log_lvl))

            self._monitor.mark_trial_as_running(self._trial_id, proposal)

            shared_params = self._pull_shared_params(proposal)
            model_inst = self._load_model(proposal)
            self._train_model(model_inst, proposal, shared_params)
            result = self._evaluate_model(model_inst, proposal)
            store_params_id = self._save_model(model_inst, proposal, result)
            model_inst.destroy()

            self._monitor.mark_trial_as_completed(self._trial_id, result.score,
                                                  store_params_id)
            self._trial_errors = 0
            return result
        except Exception:
            logger.error('Error while running trial:')
            logger.error(traceback.format_exc())
            self._monitor.mark_trial_as_errored(self._trial_id)

            # Ensure that trial doesn't error too many times consecutively
            self._trial_errors += 1
            if self._trial_errors > MAX_CONSEC_TRIAL_ERRORS:
                logger.error(
                    f'Reached {MAX_CONSEC_TRIAL_ERRORS} consecutive errors - raising exception'
                )
                # Bare `raise` re-raises the original exception unchanged
                raise

            return TrialResult(proposal)
        finally:
            if logger_info is not None:
                self._stop_logging_to_trial(logger_info)

            # Untie from done trial
            self._trial_id = None

    def _notify_stop(self):
        """Remove the worker from the train cache and send the worker-stopped event."""
        self._train_cache.delete_worker(self._worker_id)
        superadmin_client().send_event(
            'train_job_worker_stopped',
            sub_train_job_id=self._monitor.sub_train_job_id)

    def _start_logging_to_trial(self, handle_log):
        """Route model and root logger output to `handle_log` for a trial.

        Returns a ``(root_logger, py_model_logger, log_handler)`` tuple that
        must be handed to `_stop_logging_to_trial` to undo the setup.
        """
        # Add log handlers for trial, including adding handler to root logger
        # to capture any logs emitted with level above INFO during model training & evaluation
        log_handler = LoggerUtilsHandler(handle_log)
        py_model_logger = logging.getLogger('{}.trial'.format(__name__))
        py_model_logger.setLevel(logging.INFO)
        py_model_logger.propagate = False  # Avoid duplicate logs in root logger
        py_model_logger.addHandler(log_handler)
        model_logger.set_logger(py_model_logger)

        root_logger = logging.getLogger()
        root_logger.addHandler(log_handler)

        return (root_logger, py_model_logger, log_handler)

    def _load_model(self, proposal: Proposal):
        """Instantiate the job's model class with the proposal's knobs as keyword arguments."""
        logger.info('Creating model instance...')
        py_model_class = self._monitor.model_class
        model_inst = py_model_class(**proposal.knobs)
        return model_inst

    def _pull_shared_params(self, proposal: Proposal):
        """Return shared params from the param cache, or ``None`` if the proposal requests none."""
        if proposal.params_type == ParamsType.NONE:
            return None

        logger.info('Retrieving shared params from cache...')
        shared_params = self._param_cache.retrieve_params(proposal.params_type)
        return shared_params

    def _train_model(self, model_inst: BaseModel, proposal: Proposal,
                     shared_params: Union[dict, None]):
        """Train `model_inst` on the job's train dataset.

        `shared_params` and the job's train args (if any) are passed to
        ``model_inst.train`` as keyword arguments. `proposal` is not used.
        """
        train_dataset_path = self._monitor.train_dataset_path
        train_args = self._monitor.train_args

        logger.info('Training model...')
        model_inst.train(train_dataset_path,
                         shared_params=shared_params,
                         **(train_args or {}))

    def _evaluate_model(self, model_inst: BaseModel,
                        proposal: Proposal) -> TrialResult:
        """Evaluate `model_inst` on the job's validation dataset.

        Returns a score-less ``TrialResult`` without evaluating when
        ``proposal.to_eval`` is falsy.
        """
        val_dataset_path = self._monitor.val_dataset_path
        if not proposal.to_eval:
            return TrialResult(proposal)

        logger.info('Evaluating model...')
        score = model_inst.evaluate(val_dataset_path)
        logger.info(f'Score on validation dataset: {score}')
        return TrialResult(proposal, score=score)

    def _save_model(self, model_inst: BaseModel, proposal: Proposal,
                    result: TrialResult):
        """Dump the model's parameters, then cache and/or persist them.

        Returns the ID given by the param store when the proposal asks for
        the params to be saved, otherwise ``None``.
        """
        if not proposal.to_cache_params and not proposal.to_save_params:
            return None

        logger.info('Dumping model parameters...')
        params = model_inst.dump_parameters()
        if proposal.to_cache_params:
            logger.info('Storing shared params in cache...')
            self._param_cache.store_params(params,
                                           score=result.score,
                                           time=datetime.now())

        store_params_id = None
        if proposal.to_save_params:
            logger.info('Saving shared params...')
            store_params_id = self._param_store.save(params)

        return store_params_id

    def _submit_result(self, result: TrialResult):
        """Post the result to the train cache, then delete this worker's proposal."""
        self._train_cache.create_result(self._worker_id, result)
        self._train_cache.delete_proposal(self._worker_id)

    def _stop_logging_to_trial(self, logger_info):
        """Undo `_start_logging_to_trial` using the tuple it returned."""
        (root_logger, py_model_logger, log_handler) = logger_info

        # Remove log handlers from loggers for this trial
        root_logger.removeHandler(log_handler)
        py_model_logger.removeHandler(log_handler)
Example #7
0
class AdvisorWorker:
    """Advisor process for a sub train job.

    Repeatedly collects trial results that train workers posted to the train
    cache, feeds them to the advisor, and pushes new proposals to workers
    that have none, until the advisor stops producing proposals.
    """

    def __init__(self, service_id):
        """Create an advisor worker; caches and advisor are only set up by `start`.

        Reads ``REDIS_HOST`` and ``REDIS_PORT`` from the environment, so a
        ``KeyError`` is raised if either is missing.
        """
        self._monitor: _SubTrainJobMonitor = _SubTrainJobMonitor(service_id)
        self._redis_host = os.environ['REDIS_HOST']
        self._redis_port = os.environ['REDIS_PORT']
        self._train_cache: TrainCache = None
        self._param_cache: ParamCache = None
        self._advisor: BaseAdvisor = None
        # { <worker_id> : <info about worker> }
        self._worker_infos: Dict[str, _WorkerInfo] = {}

    def start(self):
        """Connect to the caches, build the advisor and drive tuning.

        Each round fetches results and then makes proposals, sleeping
        ``LOOP_SLEEP_SECS`` between rounds. Returns after sending the
        budget-reached event once `_make_proposals` reports that tuning
        is to be stopped.
        """
        self._monitor.pull_job_info()
        self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._advisor = self._make_advisor()

        logger.info(
            f'Starting advisor for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        while True:
            self._fetch_results()
            if not self._make_proposals():
                self._notify_budget_reached()
                break
            time.sleep(LOOP_SLEEP_SECS)

    def stop(self):
        """Send the stopped event and clear both caches on a best-effort basis.

        Errors while clearing either cache are logged and not propagated.
        Only `Exception` subclasses are suppressed, so `KeyboardInterrupt`
        and `SystemExit` still interrupt the shutdown.
        """
        self._notify_stop()

        # Clear caches for sub train job
        try:
            self._train_cache.clear_all()
        except Exception:
            logger.error('Error clearing train cache:')
            logger.error(traceback.format_exc())
        try:
            self._param_cache.clear_all_params()
        except Exception:
            logger.error('Error clearing params cache:')
            logger.error(traceback.format_exc())

    def _notify_start(self):
        """Send the advisor-started event for this sub train job."""
        superadmin_client().send_event(
            'sub_train_job_advisor_started',
            sub_train_job_id=self._monitor.sub_train_job_id)

    def _make_advisor(self):
        """Build an advisor from the model class's knob config and the job's budget."""
        clazz = self._monitor.model_class
        budget = self._monitor.budget

        # Retrieve knob config
        knob_config = clazz.get_knob_config()
        advisor = make_advisor(knob_config, budget)
        logger.info(f'Using advisor "{type(advisor).__name__}"...')

        return advisor

    def _fetch_results(self):
        """Take available results of workers with a pending trial and feed them to the advisor."""
        for (worker_id, info) in self._worker_infos.items():
            # If no pending trial, skip
            if info.trial_id is None:
                continue

            # Fetch result for worker
            # If no result yet, skip
            result = self._train_cache.take_result(worker_id)
            if result is None:
                continue

            # Pass result to advisor
            self._advisor.feedback(worker_id, result)

            # Mark worker as not pending
            info.trial_id = None

    def _make_proposals(self):
        """Push a new proposal to every registered worker that has none.

        Returns ``False`` as soon as the advisor returns no proposal, which
        tells the caller to stop tuning; returns ``True`` otherwise.
        """
        # For each free worker
        worker_ids = self._train_cache.get_workers()
        for worker_id in worker_ids:
            # If new worker, add info
            if worker_id not in self._worker_infos:
                self._worker_infos[worker_id] = _WorkerInfo()

            # Get info for worker
            worker_info = self._worker_infos[worker_id]

            # Check that worker doesn't already have a proposal
            proposal = self._train_cache.get_proposal(worker_id)
            if proposal is not None:
                continue

            # Create trial
            # NOTE(review): the trial is created before the advisor is asked,
            # so a trial still gets created when `propose` returns None below
            # - confirm that this is intended
            (trial_no, trial_id) = self._monitor.create_next_trial(worker_id)

            # Make proposal to free worker
            proposal = self._advisor.propose(worker_id, trial_no)

            # If advisor has no more proposals, to stop tuning
            if proposal is None:
                return False

            # Attach trial ID to proposal
            proposal.trial_id = trial_id

            # Push proposal to worker
            self._train_cache.create_proposal(worker_id, proposal)

            # Associate trial ID to worker
            worker_info.trial_id = trial_id

        return True

    def _notify_budget_reached(self):
        """Send the budget-reached event for this sub train job."""
        superadmin_client().send_event(
            'sub_train_job_budget_reached',
            sub_train_job_id=self._monitor.sub_train_job_id)

    def _notify_stop(self):
        """Send the advisor-stopped event for this sub train job."""
        superadmin_client().send_event(
            'sub_train_job_advisor_stopped',
            sub_train_job_id=self._monitor.sub_train_job_id)
Example #8
0
def tune_model(
        py_model_class: Type[BaseModel],
        train_dataset_path: str,
        val_dataset_path: str,
        annotation_dataset_path: str = None,
        task: str = None,
        test_dataset_path: str = None,
        budget: Budget = None,
        train_args: Dict[str, Any] = None) -> (Dict[str, Any], float, Params):
    """Tune a model class locally by simulating the advisor/worker loop in-process.

    A single worker with ID ``'local'`` exchanges proposals and results with
    an advisor through a train cache until the advisor returns no further
    proposal. Each trial builds a model from the proposal's knobs, trains it,
    evaluates it and caches/saves its parameters as the proposal directs.

    Args:
        py_model_class: Model class to tune; instantiated once per trial with
            the proposal's knobs as keyword arguments.
        train_dataset_path: Passed to ``model.train`` in every trial.
        val_dataset_path: Passed to ``_evaluate_model`` in every trial.
        annotation_dataset_path: If truthy, also forwarded to the train and
            evaluate calls.
        task: When equal to ``'question_answering_covid19'``, tuning stops
            after the first trial.
        test_dataset_path: If given, each new best model is also evaluated
            on this dataset.
        budget: Budget options; options read from CLI args take precedence.
        train_args: Extra keyword arguments for ``model.train``.

    Returns:
        A tuple ``(best_proposal, best_model_test_score, best_params)``.
        ``best_proposal`` is the proposal of the best-scoring trial whose
        parameters were saved, and ``best_params`` are those parameters
        loaded from the param store; both are ``None`` when no trial
        qualified. ``best_model_test_score`` is ``None`` unless a best model
        was evaluated on a test dataset.
    """
    worker_id = 'local'

    # Note start time
    start_time = time.time()

    # Retrieve config of model
    _print_header('Checking model configuration...')
    knob_config = py_model_class.get_knob_config()
    _check_knob_config(knob_config)

    # Read knob values from CLI args
    _print_header('Starting trials...')
    knobs_from_args = _maybe_read_knobs_from_args(knob_config)

    # Read budget options from CLI args
    budget_from_args = _maybe_read_budget_from_args()
    budget = {**(budget or {}), **budget_from_args}
    inform_user(f'Using budget {budget}...')

    # Make advisor
    advisor = make_advisor(knob_config, budget)
    inform_user(f'Using advisor "{type(advisor).__name__}"...')

    # Create caches & stores
    param_store: ParamStore = FileParamStore()
    param_cache: ParamCache = ParamCache()
    train_cache: TrainCache = TrainCache()

    # Variables to track over trials
    # Start from -inf so that any real score, including scores <= -1,
    # can become the first best score
    best_model_score = float('-inf')
    best_trial_no = 0
    best_model_test_score = None
    best_proposal = None
    best_store_params_id = None

    # The annotation dataset is only forwarded to the model when provided
    if annotation_dataset_path:
        annotation_kwargs = {'annotation_dataset_path': annotation_dataset_path}
        annotation_args = (annotation_dataset_path,)
    else:
        annotation_kwargs = {}
        annotation_args = ()

    # Train worker tells advisor that it is free
    train_cache.add_worker(worker_id)

    # Until there's no more proposals, keep conducting trials
    trial_no = 0
    while True:
        trial_no += 1

        # Advisor checks free workers
        worker_ids = train_cache.get_workers()
        assert worker_id in worker_ids

        # Advisor checks worker doesn't already have a proposal
        proposal = train_cache.get_proposal(worker_id)
        assert proposal is None

        # Advisor sends a proposal to worker
        # Overriding knobs from args
        proposal = advisor.propose(worker_id, trial_no)
        if proposal is None:
            print('No more proposals from advisor - to stop training')
            break
        proposal.knobs = {**proposal.knobs, **knobs_from_args}
        train_cache.create_proposal(worker_id, proposal)

        # Worker receives proposal
        proposal = train_cache.get_proposal(worker_id)
        assert proposal is not None

        # Worker starts trial
        _print_header(f'Trial #{trial_no}')
        print('Proposal from advisor:', proposal)

        # Worker loads model
        model_inst = py_model_class(**proposal.knobs)

        # Worker pulls shared params
        shared_params = _pull_shared_params(proposal, param_cache)

        # Worker trains model
        print('Training model...')
        model_inst.train(train_dataset_path,
                         **annotation_kwargs,
                         shared_params=shared_params,
                         **(train_args or {}))

        # Worker evaluates model
        result = _evaluate_model(model_inst, proposal, val_dataset_path,
                                 *annotation_args)

        # Worker caches/saves model parameters
        store_params_id = _save_model(model_inst, proposal, result,
                                      param_cache, param_store)

        # Update best saved model
        if result.score is not None and store_params_id is not None and result.score > best_model_score:
            inform_user(
                'Best saved model so far! Beats previous best of score {}!'.
                format(best_model_score))
            best_store_params_id = store_params_id
            best_proposal = proposal
            best_model_score = result.score
            best_trial_no = trial_no

            # Test best model, if test dataset provided
            if test_dataset_path is not None:
                print('Evaluating new best model on test dataset...')
                best_model_test_score = model_inst.evaluate(
                    test_dataset_path, **annotation_kwargs)
                inform_user(
                    'Score on test dataset: {}'.format(best_model_test_score))

        # Worker sends result to advisor
        print('Giving feedback to advisor...')
        train_cache.create_result(worker_id, result)
        train_cache.delete_proposal(worker_id)

        # Advisor receives result
        # Advisor ingests feedback
        result = train_cache.take_result(worker_id)
        assert result is not None
        advisor.feedback(worker_id, result)

        # Destroy model
        model_inst.destroy()

        # This task only ever runs a single trial
        if task == 'question_answering_covid19':
            break

    # Train worker tells advisor that it is no longer free
    train_cache.delete_worker(worker_id)

    # Declare best model
    if best_proposal is not None:
        inform_user('Best trial #{} has knobs {} with score of {}'.format(
            best_trial_no, best_proposal.knobs, best_model_score))
        if best_model_test_score is not None:
            inform_user(
                '...with test score of {}'.format(best_model_test_score))

    # Load params for best model
    best_params = None
    if best_store_params_id is not None:
        best_params = param_store.load(best_store_params_id)

    # Teardown model class
    print('Running model class teardown...')
    py_model_class.teardown()

    # Print duration
    duration = time.time() - start_time
    print('Tuning took a total of {}s'.format(duration))

    return (best_proposal, best_model_test_score, best_params)