Code Example #1
    def test_model_parallel_exempt(self):
        # Test that we ignore module lists explicitly marked as exempt.
        def _get_model():
            model = torch.nn.Module()
            model.layers = torch.nn.ModuleList(
                [IdentityLayer() for _ in range(8)])
            return model

        def _exempt_mp(submodule):
            submodule.model_parallel_exempt = True

        pipeline = PipelineHelper()
        pipeline.num_devices = 8
        pipeline.devices = [f'cuda:{i}' for i in range(8)]
        pipeline._PipelineHelper__device_allocations = {
            d: 0
            for d in pipeline.devices
        }

        model1 = _get_model()
        model1 = pipeline.make_parallel(model1)
        assert getattr(model1.layers, 'is_model_parallel', False)

        model2 = _get_model()
        model2.apply(_exempt_mp)
        model2 = pipeline.make_parallel(model2)
        assert not getattr(model2.layers, 'is_model_parallel', False)
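The test above references an IdentityLayer module that is not shown here. A minimal sketch of what such a layer might look like (an illustrative assumption, not necessarily the library's own definition):

import torch


class IdentityLayer(torch.nn.Module):
    # Returns its input unchanged, so the test exercises only device
    # placement and the model_parallel_exempt flag, not computation.
    def forward(self, x):
        return x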
Code Example #2
    def build_regret_model(self) -> RagModel:
        """
        Build and return the regret RagModel.

        Assumes the dictionary is the same as the main model's.
        """
        model_file = self.opt['regret_model_file']
        if model_file:
            assert os.path.exists(
                model_file), 'specify correct path for --regret-model-file'
            regret_opt = Opt.load(f'{model_file}.opt')
            regret_opt['n_docs'] = self.opt[
                'n_docs']  # must match the main model's n_docs
            # add keys that were not in this model when originally trained
            regret_opt.update(
                {k: v
                 for k, v in self.opt.items() if k not in regret_opt})
            retriever_shared = None
            if all([
                    regret_opt[k] == self.opt[k] for k in [
                        'rag_retriever_type',
                        'path_to_index',
                        'path_to_dpr_passages',
                    ]
            ]):
                logging.warning(
                    'Sharing retrievers between model and regret model!')
                retriever_shared = self.model.encoder.retriever.share()

            model = RagModel(regret_opt,
                             self.dict,
                             retriever_shared=retriever_shared)
            with PathManager.open(self.opt['regret_model_file'], 'rb') as f:
                states = torch.load(
                    f,
                    map_location=lambda cpu, _: cpu,
                    pickle_module=parlai.utils.pickle,
                )
            assert 'model' in states
            model.load_state_dict(states['model'])
            if self.model_parallel:
                ph = PipelineHelper()
                ph.check_compatibility(self.opt)
                model = ph.make_parallel(model)
            else:
                model.cuda()
            if self.fp16:
                model = model.half()

            sync_parameters(model)
            train_params = trainable_parameters(model)
            total_params = total_parameters(model)
            logging.info(
                f"Total regret parameters: {total_params:,d} ({train_params:,d} trainable)"
            )
        else:
            model = self.model

        return model
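The retriever-sharing condition above compares three options between the regret model and the main model. The same check as a standalone sketch (the helper name is hypothetical, introduced only for illustration):

def can_share_retriever(opt_a, opt_b):
    # The retriever is reused only when both models were built against the
    # same retriever type, index, and passage file.
    keys = ['rag_retriever_type', 'path_to_index', 'path_to_dpr_passages']
    return all(opt_a[k] == opt_b[k] for k in keys)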
Code Example #3
    def __init__(self, opt: Opt, shared=None):
        # Must call _get_init_model() first so that paths are updated if necessary
        # (e.g., a .dict file)
        init_model, is_finetune = self._get_init_model(opt, shared)
        opt['rank_candidates'] = True
        self._set_candidate_variables(opt)
        super().__init__(opt, shared)

        states: Dict[str, Any]
        if shared:
            states = {}
        else:
            # Note: we cannot change the type of metrics ahead of time, so you
            # should correctly initialize to floats or ints here
            self.criterion = self.build_criterion()
            self.model = self.build_model()

            if self.model is None or self.criterion is None:
                raise AttributeError(
                    'build_model() and build_criterion() must return '
                    'the model and criterion, respectively')
            train_params = trainable_parameters(self.model)
            total_params = total_parameters(self.model)
            logging.info(
                f"Total parameters: {total_params:,d} ({train_params:,d} trainable)"
            )

            if self.fp16:
                self.model = self.model.half()
            if init_model:
                logging.info(
                    f'Loading existing model parameters from {init_model}')
                states = self.load(init_model)
            else:
                states = {}

            if self.use_cuda:
                if self.model_parallel:
                    ph = PipelineHelper()
                    ph.check_compatibility(self.opt)
                    self.model = ph.make_parallel(self.model)
                else:
                    self.model.cuda()
                if self.data_parallel:
                    self.model = torch.nn.DataParallel(self.model)
                self.criterion.cuda()

        self.rank_top_k = opt.get('rank_top_k', -1)

        # Set fixed and vocab candidates if applicable
        self.set_fixed_candidates(shared)
        self.set_vocab_candidates(shared)

        if shared:
            # We don't use get here because hasattr is used on optimizer later.
            if 'optimizer' in shared:
                self.optimizer = shared['optimizer']
        elif self._should_initialize_optimizer():
            # only build an optimizer if we're training
            optim_params = [
                p for p in self.model.parameters() if p.requires_grad
            ]
            self.init_optim(optim_params, states.get('optimizer'),
                            states.get('optimizer_type'))
            self.build_lr_scheduler(states, hard_reset=is_finetune)

        if shared is None and is_distributed():
            device_ids = None if self.model_parallel else [self.opt['gpu']]
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model, device_ids=device_ids, broadcast_buffers=False)
Code Example #4
    def __init__(self, opt: Opt, shared=None):
        init_model, self.is_finetune = self._get_init_model(opt, shared)
        super().__init__(opt, shared)

        # set up classes
        if opt.get('classes') is None and opt.get('classes_from_file') is None:
            raise RuntimeError(
                'Must specify --classes or --classes-from-file argument.')
        if not shared:
            if opt['classes_from_file'] is not None:
                with PathManager.open(opt['classes_from_file']) as f:
                    self.class_list = f.read().splitlines()
            else:
                self.class_list = opt['classes']
            self.class_dict = {val: i for i, val in enumerate(self.class_list)}
            if opt.get('class_weights', None) is not None:
                self.class_weights = opt['class_weights']
            else:
                self.class_weights = [1.0 for c in self.class_list]
            self.reset_metrics()
        else:
            self.class_list = shared['class_list']
            self.class_dict = shared['class_dict']
            self.class_weights = shared['class_weights']

        # in binary classification, opt['threshold'] applies to ref class
        if opt['ref_class'] is None or opt['ref_class'] not in self.class_dict:
            self.ref_class = self.class_list[0]
        else:
            self.ref_class = opt['ref_class']
            ref_class_id = self.class_list.index(self.ref_class)
            if ref_class_id != 0:
                # move to the front of the class list
                self.class_list.insert(0, self.class_list.pop(ref_class_id))

        # set up threshold, only used in binary classification
        if len(self.class_list) == 2 and opt.get('threshold', 0.5) != 0.5:
            self.threshold = opt['threshold']
        else:
            self.threshold = None

        # set up model and optimizers
        states = {}
        if shared:
            self.model = shared['model']
        else:
            self.model = self.build_model()
            # freeze the encoder and update the classifier only
            if opt.get("update_classifier_head_only", False):
                for _param_name, _param_value in self.model.named_parameters():
                    if not _param_name.startswith('additional_linear_layer'):
                        _param_value.requires_grad = False

            self.criterion = self.build_criterion()
            if self.model is None or self.criterion is None:
                raise AttributeError(
                    'build_model() and build_criterion() must return the model and criterion, respectively'
                )
            if init_model:
                logging.info(
                    f'Loading existing model parameters from {init_model}')
                states = self.load(init_model)
            if self.use_cuda:
                if self.model_parallel:
                    ph = PipelineHelper()
                    ph.check_compatibility(self.opt)
                    self.model = ph.make_parallel(self.model)
                else:
                    self.model.cuda()
                if self.data_parallel:
                    self.model = torch.nn.DataParallel(self.model)
                self.criterion.cuda()

            train_params = trainable_parameters(self.model)
            total_params = total_parameters(self.model)
            logging.info(
                f"Total parameters: {total_params:,d} ({train_params:,d} trainable)"
            )

        if shared:
            # We don't use get here because hasattr is used on optimizer later.
            if 'optimizer' in shared:
                self.optimizer = shared['optimizer']
        elif self._should_initialize_optimizer():
            optim_params = [
                p for p in self.model.parameters() if p.requires_grad
            ]
            self.init_optim(optim_params)
            self.build_lr_scheduler(states, hard_reset=self.is_finetune)
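The update_classifier_head_only branch above freezes every parameter outside the classifier head before the optimizer is built. A self-contained sketch of the same pattern, with made-up module names used purely for illustration:

import torch

model = torch.nn.Sequential()
model.add_module('encoder', torch.nn.Linear(16, 16))
model.add_module('additional_linear_layer', torch.nn.Linear(16, 2))

for name, param in model.named_parameters():
    # only parameters under the classifier head stay trainable
    param.requires_grad = name.startswith('additional_linear_layer')

# the optimizer then only sees the head's parameters
optim_params = [p for p in model.parameters() if p.requires_grad]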