def test_model_parallel_exempt(self):
    # Test that we ignore module lists explicitly marked as exempt.
    def _get_model():
        model = torch.nn.Module()
        model.layers = torch.nn.ModuleList(
            [IdentityLayer() for _ in range(8)])
        return model

    def _exempt_mp(submodule):
        submodule.model_parallel_exempt = True

    pipeline = PipelineHelper()
    pipeline.num_devices = 8
    pipeline.devices = [f'cuda:{i}' for i in range(8)]
    pipeline._PipelineHelper__device_allocations = {
        d: 0 for d in pipeline.devices
    }

    model1 = _get_model()
    model1 = pipeline.make_parallel(model1)
    assert getattr(model1.layers, 'is_model_parallel', False)

    model2 = _get_model()
    model2.apply(_exempt_mp)
    model2 = pipeline.make_parallel(model2)
    assert not getattr(model2.layers, 'is_model_parallel', False)
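# A minimal usage sketch of the exemption flag exercised by the test above.
# Assumptions (not from the source): a toy module with a single ModuleList, and
# that PipelineHelper checks `model_parallel_exempt` on the ModuleList itself.
# Setting the flag directly on the ModuleList mirrors a subset of what the
# test's `_exempt_mp` helper does via `model.apply(...)`.
import torch


class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleList(
            [torch.nn.Linear(8, 8) for _ in range(4)])
        # Opt this ModuleList out of pipeline model parallelism, so
        # PipelineHelper.make_parallel() should leave it where it is.
        self.layers.model_parallel_exempt = True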
def build_regret_model(self) -> RagModel:
    """
    Build and return regret RagModel.

    Assume dictionary is the same.
    """
    model_file = self.opt['regret_model_file']
    if model_file:
        assert os.path.exists(
            model_file), 'specify correct path for --regret-model-file'
        regret_opt = Opt.load(f'{model_file}.opt')
        regret_opt['n_docs'] = self.opt['n_docs']  # Urgent that this is the same
        # add keys that were not in this model when originally trained
        regret_opt.update(
            {k: v for k, v in self.opt.items() if k not in regret_opt})
        retriever_shared = None
        if all([
            regret_opt[k] == self.opt[k] for k in [
                'rag_retriever_type',
                'path_to_index',
                'path_to_dpr_passages',
            ]
        ]):
            logging.warning(
                'Sharing retrievers between model and regret model!')
            retriever_shared = self.model.encoder.retriever.share()

        model = RagModel(regret_opt, self.dict, retriever_shared=retriever_shared)
        with PathManager.open(self.opt['regret_model_file'], 'rb') as f:
            states = torch.load(
                f,
                map_location=lambda cpu, _: cpu,
                pickle_module=parlai.utils.pickle,
            )
        assert 'model' in states
        model.load_state_dict(states['model'])
        # operate on the local model here; the caller assigns the returned
        # model (e.g. to self.regret_model)
        if self.model_parallel:
            ph = PipelineHelper()
            ph.check_compatibility(self.opt)
            model = ph.make_parallel(model)
        else:
            model.cuda()
        if self.fp16:
            model = model.half()

        sync_parameters(model)
        train_params = trainable_parameters(model)
        total_params = total_parameters(model)
        logging.info(
            f"Total regret parameters: {total_params:,d} ({train_params:,d} trainable)"
        )
    else:
        model = self.model

    return model
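# A minimal, self-contained sketch (plain dicts with illustrative keys, not
# ParlAI Opt objects) of the opt-merging step in build_regret_model() above:
# only keys missing from the regret opt are copied in, so the regret model's
# own settings take precedence over the current agent's opt.
regret_opt = {'beam_size': 3, 'rag_retriever_type': 'dpr'}
current_opt = {'beam_size': 10, 'rag_retriever_type': 'dpr', 'fp16': True, 'n_docs': 5}
regret_opt.update({k: v for k, v in current_opt.items() if k not in regret_opt})
# 'beam_size' keeps the regret model's value; missing keys are filled in.
assert regret_opt == {'beam_size': 3, 'rag_retriever_type': 'dpr', 'fp16': True, 'n_docs': 5}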
def __init__(self, opt: Opt, shared=None):
    # Must call _get_init_model() first so that paths are updated if necessary
    # (e.g., a .dict file)
    init_model, is_finetune = self._get_init_model(opt, shared)
    opt['rank_candidates'] = True
    self._set_candidate_variables(opt)
    super().__init__(opt, shared)

    states: Dict[str, Any]
    if shared:
        states = {}
    else:
        # Note: we cannot change the type of metrics ahead of time, so you
        # should correctly initialize to floats or ints here
        self.criterion = self.build_criterion()
        self.model = self.build_model()

        if self.model is None or self.criterion is None:
            raise AttributeError(
                'build_model() and build_criterion() need to return the model '
                'or criterion')
        train_params = trainable_parameters(self.model)
        total_params = total_parameters(self.model)
        logging.info(
            f"Total parameters: {total_params:,d} ({train_params:,d} trainable)"
        )

        if self.fp16:
            self.model = self.model.half()
        if init_model:
            logging.info(
                f'Loading existing model parameters from {init_model}')
            states = self.load(init_model)
        else:
            states = {}
        if self.use_cuda:
            if self.model_parallel:
                ph = PipelineHelper()
                ph.check_compatibility(self.opt)
                self.model = ph.make_parallel(self.model)
            else:
                self.model.cuda()
            if self.data_parallel:
                self.model = torch.nn.DataParallel(self.model)
            self.criterion.cuda()

    self.rank_top_k = opt.get('rank_top_k', -1)

    # Set fixed and vocab candidates if applicable
    self.set_fixed_candidates(shared)
    self.set_vocab_candidates(shared)

    if shared:
        # We don't use get here because hasattr is used on optimizer later.
        if 'optimizer' in shared:
            self.optimizer = shared['optimizer']
    elif self._should_initialize_optimizer():
        # only build an optimizer if we're training
        optim_params = [
            p for p in self.model.parameters() if p.requires_grad
        ]
        self.init_optim(optim_params, states.get('optimizer'),
                        states.get('optimizer_type'))
        self.build_lr_scheduler(states, hard_reset=is_finetune)

    if shared is None and is_distributed():
        device_ids = None if self.model_parallel else [self.opt['gpu']]
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=device_ids, broadcast_buffers=False)
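# A minimal sketch (toy two-layer model, not a ParlAI agent) of the
# requires_grad filter used above when building the optimizer: frozen
# parameters are simply left out of the optimizer's parameter list.
import torch

toy = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 2))
for p in toy[0].parameters():
    p.requires_grad = False  # freeze the first layer

optim_params = [p for p in toy.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(optim_params, lr=0.1)
assert len(optim_params) == 2  # only the second layer's weight and bias remain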
def __init__(self, opt: Opt, shared=None):
    init_model, self.is_finetune = self._get_init_model(opt, shared)
    super().__init__(opt, shared)

    # set up classes
    if opt.get('classes') is None and opt.get('classes_from_file') is None:
        raise RuntimeError(
            'Must specify --classes or --classes-from-file argument.')
    if not shared:
        if opt['classes_from_file'] is not None:
            with PathManager.open(opt['classes_from_file']) as f:
                self.class_list = f.read().splitlines()
        else:
            self.class_list = opt['classes']
        self.class_dict = {val: i for i, val in enumerate(self.class_list)}
        if opt.get('class_weights', None) is not None:
            self.class_weights = opt['class_weights']
        else:
            self.class_weights = [1.0 for c in self.class_list]
        self.reset_metrics()
    else:
        self.class_list = shared['class_list']
        self.class_dict = shared['class_dict']
        self.class_weights = shared['class_weights']

    # in binary classification, opt['threshold'] applies to the ref class
    if opt['ref_class'] is None or opt['ref_class'] not in self.class_dict:
        self.ref_class = self.class_list[0]
    else:
        self.ref_class = opt['ref_class']
        ref_class_id = self.class_list.index(self.ref_class)
        if ref_class_id != 0:
            # move to the front of the class list
            self.class_list.insert(0, self.class_list.pop(ref_class_id))

    # set up threshold, only used in binary classification
    if len(self.class_list) == 2 and opt.get('threshold', 0.5) != 0.5:
        self.threshold = opt['threshold']
    else:
        self.threshold = None

    # set up model and optimizers
    states = {}
    if shared:
        self.model = shared['model']
    else:
        self.model = self.build_model()
        # freeze the encoder and update only the classifier head
        if opt.get("update_classifier_head_only", False):
            for _param_name, _param_value in self.model.named_parameters():
                if not _param_name.startswith('additional_linear_layer'):
                    _param_value.requires_grad = False

        self.criterion = self.build_criterion()
        if self.model is None or self.criterion is None:
            raise AttributeError(
                'build_model() and build_criterion() need to return the model or criterion'
            )
        if init_model:
            logging.info(
                f'Loading existing model parameters from {init_model}')
            states = self.load(init_model)
        if self.use_cuda:
            if self.model_parallel:
                ph = PipelineHelper()
                ph.check_compatibility(self.opt)
                self.model = ph.make_parallel(self.model)
            else:
                self.model.cuda()
            if self.data_parallel:
                self.model = torch.nn.DataParallel(self.model)
            self.criterion.cuda()

        train_params = trainable_parameters(self.model)
        total_params = total_parameters(self.model)
        logging.info(
            f"Total parameters: {total_params:,d} ({train_params:,d} trainable)"
        )

    if shared:
        # We don't use get here because hasattr is used on optimizer later.
        if 'optimizer' in shared:
            self.optimizer = shared['optimizer']
    elif self._should_initialize_optimizer():
        optim_params = [
            p for p in self.model.parameters() if p.requires_grad
        ]
        self.init_optim(optim_params)
        self.build_lr_scheduler(states, hard_reset=self.is_finetune)
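# A minimal sketch (hypothetical two-class list) of the ref-class reordering in
# the classifier __init__ above: the reference class is moved to index 0 so
# that the binary threshold is applied to it consistently.
class_list = ['negative', 'positive']
ref_class = 'positive'
ref_class_id = class_list.index(ref_class)
if ref_class_id != 0:
    # move to the front of the class list
    class_list.insert(0, class_list.pop(ref_class_id))
assert class_list == ['positive', 'negative']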