def __init__(self, cfg: DictConfig, trainer: Trainer):
    # All of the attributes below need to be set before the parent class calls self._build_tokenizer()
    self.encoder_tokenizer_library = cfg.encoder_tokenizer.get('library', 'yttm')
    self.decoder_tokenizer_library = cfg.decoder_tokenizer.get('library', 'yttm')
    self.special_tokens = {}
    self.src_language = cfg.get("src_language", None)
    self.tgt_language = cfg.get("tgt_language", None)
    self.multilingual = cfg.get("multilingual", False)
    self.multilingual_ids = []
    self.validate_input_ids = cfg.get("validate_input_ids", True)

    if self.multilingual:
        if isinstance(self.src_language, ListConfig) and isinstance(self.tgt_language, ListConfig):
            raise ValueError(
                "cfg.src_language and cfg.tgt_language cannot both be lists. "
                "We only support many-to-one or one-to-many multilingual models."
            )
        elif isinstance(self.src_language, ListConfig):
            # many-to-one: no special target-language tokens are needed
            pass
        elif isinstance(self.tgt_language, ListConfig):
            # one-to-many: register one special token per target language
            for lng in self.tgt_language:
                self.special_tokens["<" + lng + ">"] = "<" + lng + ">"
        else:
            raise ValueError(
                "Expect either cfg.src_language or cfg.tgt_language to be a list when multilingual=True."
            )

    super().__init__(cfg, trainer=trainer)

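# Illustrative one-to-many multilingual config accepted by the constructor
# above (the language codes are assumptions, not taken from the original
# source):
#
#   cfg.src_language = "en"
#   cfg.tgt_language = ["de", "fr"]  # registers special tokens "<de>" and "<fr>"
#   cfg.multilingual = True
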
def __init__(self, cfg: DictConfig, trainer: Trainer):
    super().__init__(cfg, trainer)
    self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

    # TODO: Fix this once apex patches FusedScaledMaskedSoftmax.
    # This is a workaround for the fact that `masked_softmax_fusion` has issues
    # with certain input sizes that may be present while finetuning.
    t5_cfg = MegatronT5Model.restore_from(
        self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
        trainer=trainer,
        return_config=True,
    )
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.megatron_amp_O2 = self.megatron_amp_o2

    self.model = MegatronT5Model.restore_from(
        self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
        trainer=trainer,
        override_config_path=t5_cfg,
    )

    self.tokenizer = self.model.tokenizer
    self.float_type = self.model.enc_dec_model.enc_dec_model.encoder.model.layers[0].dtype

    if not cfg.use_lm_finetune:
        self.model.freeze()

    hidden_size = self.model.cfg.hidden_size
    self.word_embeddings = self.model.enc_dec_model.encoder_embedding.word_embeddings
    self.position_embeddings = self.model.enc_dec_model.encoder_embedding.position_embeddings

    self.template = cfg.prompt_encoder.template
    self.prompt_encoder = PromptEncoder(
        template=cfg.prompt_encoder.template,
        hidden_size=hidden_size,
        lstm_dropout=cfg.prompt_encoder.dropout,
        num_layers=cfg.prompt_encoder.num_layers,
    )

    self.hidden_size = hidden_size
    self.tokenizer.add_special_tokens([cfg.pseudo_token])
    self.pseudo_token_id = self.tokenizer.special_token_to_id[cfg.pseudo_token]
    self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
    self.spell_length = sum(self.template)
    self._reduced_loss_buffer = []
    self.decoder_seq_length = cfg.get('decoder_seq_length', 10)

def test_Instantiate(self):
    conf = DictConfig({"transform": "GridSampling", "params": {"size": 0.1}})
    t = instantiate_transform(conf)
    self.assertIsInstance(t, GridSampling)

    conf = DictConfig({"transform": "None", "params": {"size": 0.1}})
    with self.assertRaises(ValueError):
        t = instantiate_transform(conf)

def __init__(self, name: str = "Unnamed", *args, **kwargs) -> None:
    self.__ports__ = {}
    self.aliases = {}
    self.uid = str(uuid.uuid4())[:8]
    if "with_uuid" in kwargs or name == "Unnamed":
        name += "_" + self.uid
    super(Component, self).__init__(name=name, exclude_from_current=True)
    self.info = DictConfig(self.info)
    self.name = name  # overwrite PHIDL's incremental naming convention
    self.name_long = None

def test_dataloaders(self):
    dataset = ForwardShapenetDataset(self.config)
    dataset.create_dataloaders(MockModel(DictConfig({"conv_type": "DENSE"})), 2, False, 1, False)
    forward_set = dataset.test_dataloaders()[0]
    for b in forward_set:
        self.assertEqual(b.origin_id.shape, (2, 2))

    sparseconfig = DictConfig({"dataroot": self.datadir, "category": "Airplane", "forward_category": "Airplane"})
    dataset = ForwardShapenetDataset(sparseconfig)
    dataset.create_dataloaders(MockModel(DictConfig({"conv_type": "PARTIAL_DENSE"})), 2, False, 1, False)
    forward_set = dataset.test_dataloaders()[0]
    for b in forward_set:
        torch.testing.assert_allclose(b.origin_id, torch.tensor([0, 1, 2, 0, 1, 2, 3]))
        torch.testing.assert_allclose(b.sampleid, torch.tensor([0, 1]))

def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
    # FIXME: switch to self._cfg
    if not HAVE_APEX:
        raise ImportError(
            "Apex was not found. Please see the NeMo README for installation instructions: "
            "https://github.com/NVIDIA/NeMo#megatron-gpt."
        )
    # this prevents the base constructor from initializing the tokenizer
    self.tokenizer = None
    super().__init__(cfg, trainer=trainer, no_lm_init=no_lm_init)

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self._cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    # buffer used during train_step for logging average loss over gradient accumulation steps
    self._reduced_loss_buffer = []

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1),
        pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0),
        micro_batch_size=cfg.get('micro_batch_size'),
        global_batch_size=cfg.get('global_batch_size'),
        seed=self.cfg.get('seed', 1234),
        apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30),
    )

    # use the PyTorch Lightning default for gradient clipping (defaults to False)
    self.grad_clip_pl_default = False

    if hasattr(self._cfg, "tokenizer") or (
        hasattr(self._cfg, "encoder_tokenizer") and hasattr(self._cfg, "decoder_tokenizer")
    ):
        # build tokenizer (defaults to nemo supported tokenizers)
        self._build_tokenizer()
        # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
        self._build_vocab()

def parse_dataset_definition(config: DictConfig):
    """Parse and instantiate a dataset class using the configuration provided.

    Parameters
    ----------
    config : omegaconf.dictconfig.DictConfig
        The dataset config dictionary. It must contain the `type` key with the
        fully qualified class name.

    Raises
    ------
    DataSetError
        If the function fails to parse the configuration provided.

    Returns
    -------
    2-tuple
        (Dataset class object, configuration dictionary)
    """
    if "type" not in config:
        raise DataSetError("`type` is missing from DataSet catalog configuration")

    class_obj = config.pop("type")
    if isinstance(class_obj, str):
        if len(class_obj.strip(".")) != len(class_obj):  # check if the path starts or ends with a dot
            raise DataSetError(
                "`type` class path does not support relative "
                "paths or paths ending with a dot."
            )
        class_obj = load_obj(class_obj)

    if not issubclass(class_obj, AbstractDataSet):
        raise DataSetError(
            f"DataSet type `{class_obj.__module__}.{class_obj.__qualname__}` "
            f"is invalid: all data set types must extend `AbstractDataSet`."
        )

    return class_obj, config

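# Minimal usage sketch for parse_dataset_definition. The class path and the
# `filepath` key below are illustrative assumptions, not taken from the
# original source; the referenced class must subclass AbstractDataSet.
#
#   from omegaconf import DictConfig
#
#   cfg = DictConfig({"type": "my_package.io.CSVDataSet", "filepath": "data/raw.csv"})
#   dataset_class, dataset_kwargs = parse_dataset_definition(cfg)
#   dataset = dataset_class(**dataset_kwargs)  # "type" has been popped from the config
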
def __init__(self, cfg: DictConfig, trainer: Trainer):
    app_state = AppState()
    if not app_state._is_megatron_initialized:
        logging.info(
            "Initializing megatron since it hasn't been initialized by the model. "
            "This is normal if you are using a NeMo model with Megatron dataloaders."
        )
        app_state.global_rank = trainer.global_rank
        app_state.world_size = trainer.world_size
        app_state.model_parallel_size = 1
        app_state.model_parallel_rank = trainer.global_rank
        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
            seed=self.cfg.get('seed', 1234),
        )

    try:
        from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

        compile_helper()
        logging.info('Megatron dataset helper compiled successfully.')
        from nemo.collections.nlp.data.language_modeling.megatron import helpers
    except ImportError:
        raise ImportError(
            'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
        )

def __init__(self, cfg: DictConfig, trainer: Trainer):
    # FIXME: switch to self._cfg
    if not HAVE_APEX:
        raise ImportError(
            "Apex was not found. Please see the NeMo README for installation instructions: "
            "https://github.com/NVIDIA/NeMo#megatron-gpt."
        )
    super().__init__(cfg, trainer=trainer)

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self._cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    # buffer used during train_step for logging average loss over gradient accumulation steps
    self._reduced_loss_buffer = []

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        seed=self._cfg.get('seed', 1234),
    )

def _instanciate_transform_with_random_params(self):
    dico = dict()
    for p, rang in self.transform_params.items():
        if "max" in rang and "min" in rang:
            assert rang["max"] - rang["min"] > 0
            v = np.random.random() * (rang["max"] - rang["min"]) + rang["min"]
            if rang["type"] == "float":
                v = float(v)
            elif rang["type"] == "int":
                v = int(v)
            else:
                raise NotImplementedError
            dico[p] = v
        elif "value" in rang:
            v = rang["value"]
            dico[p] = v
        else:
            raise NotImplementedError
    trans_opt = DictConfig(dict(params=dico, transform=self.transform_name))
    random_transform = instantiate_transform(trans_opt, attr="transform")
    return random_transform

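# Illustrative shape of self.transform_params consumed above (an assumption
# inferred from the parsing logic, not taken from the original source):
#
#   transform_params = {
#       "size": {"min": 0.05, "max": 0.2, "type": "float"},  # sampled uniformly in [min, max]
#       "num_points": {"value": 1024},                       # fixed value
#   }
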
def merge_json(
    doe_directory: Path = CONFIG["doe_directory"],
    gds_directory: Path = CONFIG["gds_directory"],
    extra_directories: Optional[Iterable[Path]] = None,
    jsonpath: Path = CONFIG["mask_directory"] / "metadata.json",
    json_version: int = 6,
    config: DictConfig = TECH,
) -> Dict[str, Any]:
    """Combine several JSON files into a single metadata JSON file.

    Args:
        doe_directory: directory with the DOE JSON files (defaults to the configured DOE directory)
        gds_directory: directory with the GDS files
        extra_directories: list of extra directories to scan for JSON files
        jsonpath: output path for the merged metadata JSON
        json_version: version number stored in the merged metadata
        config: technology configuration to embed in the metadata
    """
    logger.debug("Merging JSON files:")
    cells = {}
    extra_directories = extra_directories or []
    config = dataclasses.asdict(config)
    config.pop("library", "")

    for directory in extra_directories + [doe_directory]:
        for filename in directory.glob("*/*.json"):
            logger.debug(filename)
            with open(filename, "r") as f:
                data = json.load(f)
            cells.update(data.get("cells", {}))

    does = {d.stem: json.loads(d.read_text()) for d in doe_directory.glob("*.json")}
    metadata = dict(
        json_version=json_version,
        cells=cells,
        does=does,
        config=config,
    )
    write_config(metadata, jsonpath)
    logger.info(f"Wrote metadata in {jsonpath}")
    return metadata

def recurse_structures(structure: Component) -> DictConfig:
    """Recurse over the structure hierarchy and collect each cell's info."""
    if hasattr(structure, "function_name") and structure.function_name in IGNORE_FUNCTION_NAMES:
        return DictConfig({})
    if hasattr(structure, "name") and any(
        structure.name.startswith(i) for i in IGNORE_STRUCTURE_NAME_PREFIXES
    ):
        return DictConfig({})

    output = {structure.name: structure.info}
    for element in structure.references:
        if isinstance(element, ComponentReference) and element.ref_cell.name not in output:
            output.update(recurse_structures(element.ref_cell))
    return output

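# Illustrative return value of recurse_structures (the cell names are
# assumptions, not taken from the original source): a mapping from each
# non-ignored cell name to its info, including referenced sub-cells:
#
#   {"mzi_1": {...}, "straight_1": {...}, "bend_euler_1": {...}}
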
def setUp(self):
    self.datadir = os.path.join(DIR, "test_dataset")
    self.config = DictConfig(
        {
            "dataroot": self.datadir,
            "test_transforms": [{"transform": "FixedPoints", "lparams": [2]}],
            "category": ["Airplane", "Cap"],
            "forward_category": "Airplane",
        }
    )

def __init__(self, cfg: DictConfig, trainer: Trainer):
    super().__init__(cfg, trainer=trainer)

    if cfg.get('pipeline_model_parallel_size', 1) > 1:
        if cfg.get('pipeline_model_parallel_split_rank', 0) <= 0:
            raise ValueError(
                "pipeline_model_parallel_split_rank must be > 0 when using pipeline_model_parallel_size > 1"
            )

    # Make sure trainer.accumulate_grad_batches is 1.
    self._validate_trainer()

    # TODO: Not sure how to use lists of modules with PTL.
    # This means we can only use pipeline parallelism without the interleaved schedule.
    self.enc_dec_model = build_model(
        model_provider_func=self.model_provider_func,
        wrap_with_ddp=False,
        model_type=ModelType.encoder_and_decoder,
    )[0]

    # We don't need to call it explicitly? Since it is a pytorch lightning hook function
    # self.setup_optimizer_param_groups()

    self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)
    if self.megatron_amp_o2:
        # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
        self.enc_dec_model.cuda(torch.cuda.current_device())
        # Model wrapper to convert both model and inputs to half precision
        self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision)

    if self.cfg.precision == 32:
        self.autocast_dtype = torch.float
    elif self.cfg.precision == 16:
        self.autocast_dtype = torch.half
    elif self.cfg.precision == 'bf16':
        self.autocast_dtype = torch.bfloat16
    else:
        raise ValueError('precision must be in [32, 16, "bf16"]')

    self.enc_dec_model.model_type = ModelType.encoder_and_decoder

def _setup_eval_dataloader_from_config(self, cfg: DictConfig, dataset):
    rank = parallel_state.get_data_parallel_rank()
    world_size = parallel_state.get_data_parallel_world_size()
    dataloaders = []
    for _dataset in dataset:
        sampler = torch.utils.data.distributed.DistributedSampler(
            _dataset, num_replicas=world_size, rank=rank, shuffle=False
        )
        dataloaders.append(
            torch.utils.data.DataLoader(
                dataset=_dataset,
                batch_size=1,
                sampler=sampler,
                num_workers=cfg.get("num_workers", 0),
                pin_memory=cfg.get("pin_memory", False),
                drop_last=cfg.get("drop_last", False),
                shuffle=False,
            )
        )
    return dataloaders

def _flatten_compact_options(self, opt):
    """Converts a dict of lists into a list of dicts."""
    flattenedOpts = []
    for index in range(int(1e6)):
        try:
            flattenedOpts.append(DictConfig(self._fetch_arguments_from_list(opt, index)))
        except IndexError:
            break
    return flattenedOpts

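# Illustrative input/output for _flatten_compact_options, assuming
# _fetch_arguments_from_list picks the index-th entry of each list (the option
# names below are assumptions, not taken from the original source):
#
#   opt = {"lr": [0.1, 0.01], "weight_decay": [0.0, 1e-4]}
#   # -> [DictConfig({"lr": 0.1, "weight_decay": 0.0}),
#   #     DictConfig({"lr": 0.01, "weight_decay": 1e-4})]
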
def to_dict_polygons(self) -> DictConfig:
    """Returns a dict representation of the flattened component."""
    d = DictConfig({})
    polygons = {}
    layer_to_polygons = self.get_polygons(by_spec=True)
    for layer, polygons_layer in layer_to_polygons.items():
        for polygon in polygons_layer:
            layer_name = f"{layer[0]}_{layer[1]}"
            polygons[layer_name] = [tuple(snap_to_grid(v)) for v in polygon]

    ports = {port.name: port.settings for port in self.get_ports_list()}
    clean_dict(ports)
    clean_dict(polygons)
    d.info = self.info
    d.polygons = polygons
    d.ports = ports
    return OmegaConf.create(d)

def test_predictupsamplepartialdense(self):
    dataset = ForwardShapenetDataset(self.config)
    dataset.create_dataloaders(MockModel(DictConfig({"conv_type": "PARTIAL_DENSE"})), 2, False, 1, False)
    forward_set = dataset.test_dataloaders()[0]
    for b in forward_set:
        output = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]])
        predicted = dataset.predict_original_samples(b, "PARTIAL_DENSE", output)
        self.assertEqual(len(predicted), 2)
        self.assertEqual(predicted["example1.txt"].shape, (3, 4))
        self.assertEqual(predicted["example2.txt"].shape, (4, 4))
        npt.assert_allclose(predicted["example1.txt"][:, -1], np.asarray([0, 0, 0]))
        npt.assert_allclose(predicted["example2.txt"][:, -1], np.asarray([1, 1, 1, 1]))

def objective(trial):
    umap_params = {
        "n_components": trial.suggest_categorical('n_components', [2]),
        "random_state": trial.suggest_categorical('random_state', [42]),
        "n_neighbors": trial.suggest_int('n_neighbors', 2, 6),
        "min_dist": trial.suggest_uniform("min_dist", 0.1, 1),  # values are scaled, so about 1 at most
    }
    scale = trial.suggest_uniform("scale", 0.1, 0.4)
    config_a = DictConfig({
        "umap": umap_params,
        "scale": scale,
        "u_duration": hyparam.u_duration_a,
        "w_duration": hyparam.w_duration_a,
    })
    config_b = DictConfig({
        "umap": umap_params,
        "scale": scale,
        "u_duration": hyparam.u_duration_b,
        "w_duration": hyparam.w_duration_b,
    })
    a = Agent(config_a)
    b = Agent(config_b)

    n_target = 0
    n_correct = 0
    perceptions = []
    for n in range(hyparam.n_iter):
        src = hyparam.source
        phoneme, obs, states = a.production(src)
        obs = np.array(obs).astype('double')
        phoneme_hat, obs, states_hat = b.perception(obs)
        perceptions.append(phoneme_hat)
        n_target += phoneme_hat == hyparam.target
    print(Counter(perceptions))
    return n_target * (1 - scale)

def to_dict_config(self) -> DictConfig:
    """Returns a DictConfig representation of the component."""
    d = DictConfig({})
    ports = {port.name: port.settings for port in self.get_ports_list()}
    clean_dict(ports)
    d.ports = ports
    d.info = self.info
    d.version = 1
    d.cells = recurse_structures(self)
    return OmegaConf.create(d)

def __init__(self, cfg: DictConfig, trainer: Trainer):
    super().__init__(cfg, trainer=trainer)

    # Make sure trainer.accumulate_grad_batches is 1.
    self._validate_trainer()

    # build tokenizer (defaults to nemo supported tokenizers)
    self._build_tokenizer()

    # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
    self._build_vocab()

    # TODO: Not sure how to use lists of modules with PTL.
    # This means we can only use pipeline parallelism without the interleaved schedule.
    self.enc_dec_model = build_model(
        model_provider_func=self.model_provider_func,
        wrap_with_ddp=False,
        model_type=ModelType.encoder_and_decoder,
    )[0]

    self.setup_optimizer_param_groups()

    self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)
    if self.megatron_amp_o2:
        # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
        self.enc_dec_model.cuda(torch.cuda.current_device())
        # Model wrapper to convert both model and inputs to half precision
        self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision)

    if self.cfg.precision == 32:
        self.autocast_dtype = torch.float
    elif self.cfg.precision == 16:
        self.autocast_dtype = torch.half
    elif self.cfg.precision == 'bf16':
        self.autocast_dtype = torch.bfloat16
    else:
        raise ValueError('precision must be in [32, 16, "bf16"]')

    self.enc_dec_model.model_type = ModelType.encoder_and_decoder

def __init__(self, cfg: DictConfig, trainer: Trainer):
    app_state = AppState()
    if not app_state._is_megatron_initialized:
        logging.info(
            "Initializing megatron since it hasn't been initialized by the model. "
            "This is normal if you are using a NeMo model with Megatron dataloaders."
        )
        app_state.global_rank = trainer.global_rank
        app_state.world_size = trainer.world_size
        app_state.model_parallel_size = 1
        app_state.model_parallel_rank = trainer.global_rank
        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
            seed=self.cfg.get('seed', 1234),
        )

def __init__(self, cfg: DictConfig, trainer: Trainer):
    if not HAVE_APEX:
        raise ImportError(
            "Apex was not found. Please see the NeMo README for installation instructions: "
            "https://github.com/NVIDIA/NeMo#megatron-gpt."
        )
    # this prevents the base constructor from initializing the tokenizer
    self.tokenizer = None
    super().__init__(cfg, trainer=trainer, no_lm_init=True)

    self._validate_trainer()

    # TODO: Not sure how to use lists of modules with PTL.
    # This means we can only use pipeline parallelism without the interleaved schedule.
    self.model = build_model(model_provider_func=self.model_provider_func, wrap_with_ddp=False)[0]

    # We don't need to call it explicitly? Since it is a pytorch lightning hook function
    # self.setup_optimizer_param_groups()

    self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)
    if self.megatron_amp_o2:
        # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
        self.model.cuda(torch.cuda.current_device())
        # Model wrapper to convert both model and inputs to half precision
        self.model = Float16Module(module=self.model, precision=cfg.precision)

    if self.trainer.precision == 32:
        self.autocast_dtype = torch.float
    elif self.trainer.precision == 16:
        self.autocast_dtype = torch.half
    elif self.trainer.precision == 'bf16':
        self.autocast_dtype = torch.bfloat16
    else:
        raise ValueError('precision must be in [32, 16, "bf16"]')

    # configuration used for inference
    self._inference_config = None

def flatten_cfg(cfg: Union[DictConfig, ListConfig]) -> dict:
    """
    Recursively flattens a config into a flat dictionary compatible with
    tensorboard's `add_hparams` function.
    """
    out_dict = {}
    if type(cfg) == ListConfig:
        cfg = DictConfig({f"[{i}]": v for i, v in enumerate(cfg)})
    for key in cfg:
        if type(getattr(cfg, key)) in (int, str, bool, float):
            out_dict[key] = getattr(cfg, key)
        elif type(getattr(cfg, key)) in [DictConfig, ListConfig]:
            out_dict = out_dict | {
                f"{key}{'.' if type(getattr(cfg, key)) == DictConfig else ''}{k}": v
                for k, v in flatten_cfg(getattr(cfg, key)).items()
            }
        else:
            raise AssertionError
    return out_dict

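# Illustrative call to flatten_cfg (the config below is an assumption, not
# taken from the original source). Nested DictConfig keys are joined with "."
# and ListConfig items are indexed with "[i]":
#
#   cfg = DictConfig({"optim": {"lr": 0.001, "betas": [0.9, 0.999]}, "name": "run1"})
#   flatten_cfg(cfg)
#   # -> {"optim.lr": 0.001, "optim.betas[0]": 0.9, "optim.betas[1]": 0.999, "name": "run1"}
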
def __init__(self, cfg: DictConfig, trainer: Trainer):
    # FIXME: switch to self._cfg
    if not HAVE_APEX:
        raise ImportError(
            "Apex was not found. Please see the NeMo README for installation instructions: "
            "https://github.com/NVIDIA/NeMo#megatron-gpt."
        )
    # this prevents the base constructor from initializing the tokenizer
    self.tokenizer = None
    super().__init__(cfg, trainer=trainer, no_lm_init=True)

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self._cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    # buffer used during train_step for logging average loss over gradient accumulation steps
    self._reduced_loss_buffer = []

    if cfg.get('pipeline_model_parallel_size', 1) > 1:
        if cfg.get('pipeline_model_parallel_split_rank', 0) <= 0:
            raise ValueError(
                "pipeline_model_parallel_split_rank must be > 0 when using pipeline_model_parallel_size > 1"
            )

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1),
        pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0),
        micro_batch_size=cfg.get('micro_batch_size'),
        global_batch_size=cfg.get('global_batch_size'),
        seed=self.cfg.get('seed', 1234),
        apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30),
    )

def parse_catalog_configuration(catalog: DictConfig, data_dir: str):
    """Parse the catalog configuration.

    Parameters
    ----------
    catalog : omegaconf.dictconfig.DictConfig
        The catalog configuration.
    data_dir : str
        The directory where the data is stored.

    Returns
    -------
    omegaconf.dictconfig.DictConfig
        {Dataset name : configuration dictionary}
    """
    output = {}
    local_dir = os.path.join(data_dir, catalog.site_name)
    datasets = catalog.datasets
    levels = datasets.keys()
    for level in levels:
        subset = datasets[level]
        for name, contents in subset.items():
            output[name] = {
                'local_dir': local_dir,
                'file_name': contents.file_name,
                'type': contents.type,
                'data_stage': DataStage[level.upper()].value,
                'ml_stages': contents.stages,
            }
            if 'load_args' in contents:
                output[name].update(dict(load_args=contents.load_args))
            if 'save_args' in contents:
                output[name].update(dict(save_args=contents.save_args))
    return DictConfig(output)

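# Illustrative catalog structure consumed above (an assumption inferred from
# the attribute accesses, not taken from the original source); the level keys
# must match members of the DataStage enum:
#
#   catalog = DictConfig({
#       "site_name": "site_a",
#       "datasets": {
#           "raw": {
#               "sensor_readings": {
#                   "file_name": "readings.csv",
#                   "type": "my_package.io.CSVDataSet",
#                   "stages": ["train"],
#               },
#           },
#       },
#   })
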
def __init__(self, cfg: DictConfig, trainer: Trainer):
    if not HAVE_APEX:
        raise ImportError(
            "Apex was not found. Please see the NeMo README for installation instructions: "
            "https://github.com/NVIDIA/NeMo#megatron-gpt."
        )
    super().__init__(cfg, trainer)
    self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

    # TODO: Fix this once apex patches FusedScaledMaskedSoftmax.
    # This is a workaround for the fact that `masked_softmax_fusion` has issues
    # with certain input sizes that may be present while finetuning.
    t5_cfg = MegatronT5Model.restore_from(
        self.register_artifact('t5_base_model', cfg.restore_from_path),
        trainer=trainer,
        return_config=True,
    )
    OmegaConf.set_struct(t5_cfg, True)
    with open_dict(t5_cfg):
        t5_cfg.masked_softmax_fusion = False
        t5_cfg.megatron_amp_O2 = self.megatron_amp_o2

    self.model = MegatronT5Model.restore_from(
        self.register_artifact('t5_base_model', cfg.restore_from_path),
        trainer=trainer,
        override_config_path=t5_cfg,
    )
    self.setup_optimizer_param_groups()

def __init__(self, cfg: DictConfig, trainer: Trainer):
    super().__init__(cfg, trainer=trainer)
    self.cfg = cfg

    # used in NVIDIA NGC PyTorch containers
    self._enable_nvidia_optimizations()

    if self.cfg.get('use_cpu_initialization', False) is False:
        torch.cuda.set_device(trainer.local_rank)

    # buffers used during train_step for logging average losses over gradient accumulation steps
    self._reduced_loss_buffer = []
    self._reduced_lm_loss_buffer = []
    self._reduced_sop_loss_buffer = []

    initialize_model_parallel_for_nemo(
        world_size=trainer.world_size,
        global_rank=trainer.global_rank,
        local_rank=trainer.local_rank,
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
        seed=self.cfg.get('seed', 1234),
    )

    self.tokenizer = get_nmt_tokenizer(
        library=self.cfg.tokenizer.library,
        model_name=self.cfg.tokenizer.type,
        tokenizer_model=self.register_artifact("tokenizer_model", self.cfg.tokenizer.model),
        vocab_file=self.register_artifact("vocab_file", self.cfg.tokenizer.vocab_file),
        merges_file=self.register_artifact("merges_file", self.cfg.tokenizer.merge_file),
    )

    vocab_size = self.tokenizer.vocab_size
    padded_vocab_size = self._vocab_size_with_padding(
        orig_vocab_size=vocab_size,
        make_vocab_size_divisible_by=cfg.get('make_vocab_size_divisible_by', 128),
        tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
    )

    num_tokentypes = 2 if cfg.bert_binary_head else 0

    self.model = BertModel(
        vocab_size=padded_vocab_size,
        hidden_size=cfg.hidden_size,
        max_position_embeddings=cfg.max_position_embeddings,
        num_layers=cfg.num_layers,
        num_attention_heads=cfg.num_attention_heads,
        apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True),
        kv_channels=cfg.get('kv_channels', None),
        ffn_hidden_size=cfg.ffn_hidden_size,
        num_tokentypes=num_tokentypes,
        parallel_output=True,
        pre_process=cfg.get('pre_process', True),
        post_process=cfg.get('post_process', True),
        init_method_std=cfg.get('init_method_std', 0.02),
        fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
        use_cpu_initialization=cfg.get('use_cpu_initialization', False),
        hidden_dropout=cfg.get('hidden_dropout', 0.1),
        precision=cfg.get('precision', 16),
        fp32_residual_connection=cfg.get('fp32_residual_connection', False),
        activations_checkpoint_method=cfg.get('activations_checkpoint_method', None),
        activations_checkpoint_num_layers=cfg.get('activations_checkpoint_num_layers', 1),
        layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
        onnx_safe=cfg.get('onnx_safe', False),
        add_binary_head=cfg.bert_binary_head,
    )

def build_basic_params():
    return DictConfig({"base_lr": 0.001})

def train(net, losses_fn, metrics_fn, train_loader, eval_loader, optimizer, scheduler, cfg):
    logfile = open(os.path.join(cfg.checkpoint_dir, 'log_run.txt'), 'w+')
    logwrite(logfile, str(cfg), to_print=False)
    logwrite(
        logfile,
        "Total number of parameters : "
        + str(sum(p.numel() for p in net.parameters() if p.requires_grad) / 1e6)
        + "M",
    )

    cfg.run = DictConfig({
        'no_improvements': 0,
        'current_epoch': 0,
        'early_stop': 0,
        'best_early_stop_metric': 0.0 if cfg.early_stop.higher_is_better else float('inf'),
    })

    scaler = torch.cuda.amp.GradScaler()
    summary = ''
    batch_size = cfg.hyperparameter.batch_size
    metric_comparison_func = operator.gt if cfg.early_stop.higher_is_better else operator.lt

    for epoch in range(0, 9999):
        cfg.run.current_epoch = epoch
        net.train()
        time_start = time.time()
        for step, sample in enumerate(train_loader):
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                pred = net(sample)
                losses = 0.0
                for loss_fn in losses_fn:
                    loss = loss_fn.weight * loss_fn(pred, sample)
                    losses += loss
            scaler.scale(losses).backward()

            # Gradient norm clipping
            if cfg.hyperparameter.grad_norm_clip > 0:
                nn.utils.clip_grad_norm_(net.parameters(), cfg.hyperparameter.grad_norm_clip)

            scaler.step(optimizer)
            scaler.update()

            summary = "\r[Epoch {}][Step {}/{}] Loss: {}, Lr: {}, ES: {}/{} ({}: {:.2f}) - {:.2f} m remaining".format(
                cfg.run.current_epoch + 1,
                step,
                int(len(train_loader.dataset) / batch_size),
                ["{}: {:.2f}".format(type(loss_fn).__name__, loss_fn.mean_running_loss) for loss_fn in losses_fn],
                *[group['lr'] for group in optimizer.param_groups],
                cfg.run.no_improvements,
                cfg.early_stop.no_improvements,
                cfg.early_stop.early_stop_metric,
                cfg.run.best_early_stop_metric,
                ((time.time() - time_start) / (step + 1)) * ((len(train_loader.dataset) / batch_size) - step) / 60,
            )
            print(summary, end=' ')

        time_end = time.time()
        elapse_time = time_end - time_start
        print('Finished in {}s'.format(int(elapse_time)))
        logwrite(logfile, summary)

        if epoch + 1 >= cfg.hyperparameter.eval_start:
            metrics = evaluate(net, losses_fn, metrics_fn, eval_loader, cfg)
            logwrite(logfile, metrics)
            metric_value = metrics[cfg.early_stop.early_stop_metric]
            cfg.run.no_improvements += 1

            # Best model beaten
            if metric_comparison_func(metric_value, cfg.run.best_early_stop_metric):
                torch.save(
                    {
                        'state_dict': net.state_dict(),
                        # 'optimizer': optimizer.state_dict(),
                        # 'scheduler': scheduler.state_dict(),
                        'cfg': cfg,
                        'metrics': metrics,
                    },
                    os.path.join(cfg.checkpoint_dir, 'best.pkl'),
                )
                cfg.run.no_improvements = 0
                cfg.run.best_early_stop_metric = float(metric_value)

            # Scheduler
            if cfg.scheduler.use_scheduler:
                scheduler.step(metrics[cfg.early_stop.early_stop_metric])

            # Early stop?
            if cfg.run.no_improvements == cfg.early_stop.no_improvements:
                import sys

                os.rename(
                    os.path.join(cfg.checkpoint_dir, 'best.pkl'),
                    os.path.join(cfg.checkpoint_dir, 'best' + str(cfg.run.best_early_stop_metric) + '.pkl'),
                )
                print('Early stop reached')
                sys.exit()