Example #1
    def start(self):
        self._monitor.pull_job_info()
        self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)

        logger.info(
            f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        while True:
            proposal = self._fetch_proposal()
            if proposal is not None:
                result = self._perform_trial(proposal)
                self._submit_result(result)
            time.sleep(LOOP_SLEEP_SECS)
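The `_fetch_proposal` and `_submit_result` helpers are not shown here. As a minimal sketch, they could sit directly on the TrainCache API that Example #4 below uses inline (`get_proposal`, `create_result`, `delete_proposal`); the `self._worker_id` attribute is an assumption:

    def _fetch_proposal(self):
        # Return the proposal currently assigned to this worker, or None if none is pending
        return self._train_cache.get_proposal(self._worker_id)

    def _submit_result(self, result):
        # Publish the trial's result for the advisor, then clear the finished proposal
        self._train_cache.create_result(self._worker_id, result)
        self._train_cache.delete_proposal(self._worker_id)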
Example #2
    def start(self):
        self._monitor.pull_job_info()
        self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._advisor = self._make_advisor()

        logger.info(
            f'Starting advisor for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        while True:
            self._fetch_results()
            if not self._make_proposals():
                self._notify_budget_reached()
                break
            time.sleep(LOOP_SLEEP_SECS)
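Here `_fetch_results` and `_make_proposals` mirror the worker loop of Examples #1 and #3 from the advisor's side. A rough sketch of `_make_proposals` against the same TrainCache API, assuming a `self._trial_no` counter and treating `propose` returning None as the budget-reached signal (as Example #4 does):

    def _make_proposals(self):
        # Offer one proposal to every free worker that has none pending
        for worker_id in self._train_cache.get_workers():
            if self._train_cache.get_proposal(worker_id) is not None:
                continue
            self._trial_no += 1  # assumed per-advisor trial counter
            proposal = self._advisor.propose(worker_id, self._trial_no)
            if proposal is None:
                return False  # budget reached - start() breaks out of its loop
            self._train_cache.create_proposal(worker_id, proposal)
        return True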
Example #3
    def start(self):
        self._monitor.pull_job_info()
        self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)

        logger.info(
            f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        # If it's distributed training, skip the advisor
        if os.environ.get("DIST_TRAIN_MODEL") == "DIST":
            print("Start dist training")
            logger.info("Start dist training")

            pro = Proposal(trial_no=1, knobs={})
            model_inst = self._load_model(pro)
            self._train_model(model_inst, pro, None)
            result = self._evaluate_model(model_inst, pro)

            # In the master process's container, MASTER_ADDR is "localhost";
            # only the master saves the model
            if os.environ.get("MASTER_ADDR") == "localhost":
                self._save_model(model_inst, pro, result)
        else:
            # training as usual
            while True:
                logger.info('Fetching proposal...')
                proposal = self._fetch_proposal()
                if proposal is not None:
                    result = self._perform_trial(proposal)
                    self._submit_result(result)
                time.sleep(LOOP_SLEEP_SECS)
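The distributed branch is selected purely through environment variables. A hypothetical launcher might trigger it like this (the script name and port are placeholders; only DIST_TRAIN_MODEL and MASTER_ADDR appear in the snippet above):

import os
import subprocess

# Launch the worker with the env vars the snippet above inspects;
# 'train_worker.py' and MASTER_PORT are illustrative, not from the source.
env = dict(os.environ,
           DIST_TRAIN_MODEL='DIST',   # selects the distributed branch
           MASTER_ADDR='localhost',   # the master container also saves the model
           MASTER_PORT='29500')
subprocess.run(['python', 'train_worker.py'], env=env, check=True)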
Example #4
def tune_model(
        py_model_class: Type[BaseModel],
        train_dataset_path: str,
        val_dataset_path: str,
        annotation_dataset_path: str = None,
        task: str = None,
        test_dataset_path: str = None,
        budget: Budget = None,
        train_args: Dict[str, Any] = None) -> Tuple[Dict[str, Any], float, Params]:

    worker_id = 'local'

    # Note start time
    start_time = time.time()

    # Retrieve config of model
    _print_header('Checking model configuration...')
    knob_config = py_model_class.get_knob_config()
    _check_knob_config(knob_config)

    # Read knob values from CLI args
    _print_header('Starting trials...')
    knobs_from_args = _maybe_read_knobs_from_args(knob_config)

    # Read budget options from CLI args
    budget_from_args = _maybe_read_budget_from_args()
    budget = {**(budget or {}), **budget_from_args}
    inform_user(f'Using budget {budget}...')

    # Make advisor
    advisor = make_advisor(knob_config, budget)
    inform_user(f'Using advisor "{type(advisor).__name__}"...')

    # Create caches & stores
    param_store: ParamStore = FileParamStore()
    param_cache: ParamCache = ParamCache()
    train_cache: TrainCache = TrainCache()

    # Variables to track over trials
    best_model_score = -1
    best_trial_no = 0
    best_model_test_score = None
    best_proposal = None
    best_store_params_id = None

    # Train worker tells advisor that it is free
    train_cache.add_worker(worker_id)

    # Until there are no more proposals, keep conducting trials
    trial_no = 0
    while True:
        trial_no += 1

        # Advisor checks free workers
        worker_ids = train_cache.get_workers()
        assert worker_id in worker_ids

        # Advisor checks worker doesn't already have a proposal
        proposal = train_cache.get_proposal(worker_id)
        assert proposal is None

        # Advisor sends a proposal to the worker,
        # with knobs from CLI args overriding the proposed knobs
        proposal = advisor.propose(worker_id, trial_no)
        if proposal is None:
            print('No more proposals from advisor - stopping training')
            break
        proposal.knobs = {**proposal.knobs, **knobs_from_args}
        train_cache.create_proposal(worker_id, proposal)

        # Worker receives proposal
        proposal = train_cache.get_proposal(worker_id)
        assert proposal is not None

        # Worker starts trial
        _print_header(f'Trial #{trial_no}')
        print('Proposal from advisor:', proposal)

        # Worker loads model
        model_inst = py_model_class(**proposal.knobs)

        # Worker pulls shared params
        shared_params = _pull_shared_params(proposal, param_cache)

        # Worker trains model
        print('Training model...')

        if annotation_dataset_path:
            model_inst.train(train_dataset_path,
                             annotation_dataset_path=annotation_dataset_path,
                             shared_params=shared_params,
                             **(train_args or {}))
        else:
            model_inst.train(train_dataset_path,
                             shared_params=shared_params,
                             **(train_args or {}))

        # Worker evaluates model
        if annotation_dataset_path:
            result = _evaluate_model(model_inst, proposal, val_dataset_path,
                                     annotation_dataset_path)
        else:
            result = _evaluate_model(model_inst, proposal, val_dataset_path)

        # Worker caches/saves model parameters
        store_params_id = _save_model(model_inst, proposal, result,
                                      param_cache, param_store)

        # Update best saved model
        if result.score is not None and store_params_id is not None and result.score > best_model_score:
            inform_user(
                'Best saved model so far! Beats previous best of score {}!'.
                format(best_model_score))
            best_store_params_id = store_params_id
            best_proposal = proposal
            best_model_score = result.score
            best_trial_no = trial_no

            # Test best model, if test dataset provided
            if test_dataset_path is not None:
                print('Evaluating new best model on test dataset...')
                if annotation_dataset_path:
                    best_model_test_score = model_inst.evaluate(
                        test_dataset_path,
                        annotation_dataset_path=annotation_dataset_path)
                else:
                    best_model_test_score = model_inst.evaluate(
                        test_dataset_path)
                inform_user(
                    'Score on test dataset: {}'.format(best_model_test_score))

        # Worker sends result to advisor
        print('Giving feedback to advisor...')
        train_cache.create_result(worker_id, result)
        train_cache.delete_proposal(worker_id)

        # Advisor receives result
        # Advisor ingests feedback
        result = train_cache.take_result(worker_id)
        assert result is not None
        advisor.feedback(worker_id, result)

        # Destroy model
        model_inst.destroy()

        if task == 'question_answering_covid19':
            break

    # Train worker tells advisor that it is no longer free
    train_cache.delete_worker(worker_id)

    # Declare best model
    if best_proposal is not None:
        inform_user('Best trial #{} has knobs {} with score of {}'.format(
            best_trial_no, best_proposal.knobs, best_model_score))
        if best_model_test_score is not None:
            inform_user(
                '...with test score of {}'.format(best_model_test_score))

    # Load params for best model
    best_params = None
    if best_store_params_id is not None:
        best_params = param_store.load(best_store_params_id)

    # Teardown model class
    print('Running model class teardown...')
    py_model_class.teardown()

    # Print duration
    duration = time.time() - start_time
    print('Tuning took a total of {}s'.format(duration))

    return (best_proposal, best_model_test_score, best_params)
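A hypothetical local invocation of tune_model could look as follows; MyModel, the dataset paths and the budget key are placeholders rather than part of the code above:

# Assumed: MyModel is a BaseModel subclass with a valid knob config
from my_models import MyModel

best_proposal, test_score, best_params = tune_model(
    MyModel,
    train_dataset_path='data/train.zip',
    val_dataset_path='data/val.zip',
    test_dataset_path='data/test.zip',
    budget={'MODEL_TRIAL_COUNT': 5},  # assumed budget key
)
print('Best proposal:', best_proposal)
print('Test score:', test_score)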