Example #1
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        # All of the lines below need to be set when the parent class calls self._build_tokenizer()
        self.encoder_tokenizer_library = cfg.encoder_tokenizer.get(
            'library', 'yttm')
        self.decoder_tokenizer_library = cfg.decoder_tokenizer.get(
            'library', 'yttm')
        self.special_tokens = {}
        self.src_language = cfg.get("src_language", None)
        self.tgt_language = cfg.get("tgt_language", None)

        self.multilingual = cfg.get("multilingual", False)
        self.multilingual_ids = []

        self.validate_input_ids = cfg.get("validate_input_ids", True)
        if self.multilingual:
            if isinstance(self.src_language, ListConfig) and isinstance(
                    self.tgt_language, ListConfig):
                raise ValueError(
                    "cfg.src_language and cfg.tgt_language cannot both be lists. We only support many-to-one or one-to-many multilingual models."
                )
            elif isinstance(self.src_language, ListConfig):
                pass
            elif isinstance(self.tgt_language, ListConfig):
                for lng in self.tgt_language:
                    self.special_tokens["<" + lng + ">"] = "<" + lng + ">"
            else:
                raise ValueError(
                    "Expect either cfg.src_language or cfg.tgt_language to be a list when multilingual=True."
                )

        super().__init__(cfg, trainer=trainer)
Example #2
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer)

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)
        # TODO: Fix this once apex patches FusedScaledMaskedSoftmax.
        # This is a workaround for the fact that `masked_softmax_fusion` has issues with certain input sizes that may be present while finetuning.
        t5_cfg = MegatronT5Model.restore_from(
            self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
            trainer=trainer,
            return_config=True,
        )
        OmegaConf.set_struct(t5_cfg, True)
        with open_dict(t5_cfg):
            t5_cfg.masked_softmax_fusion = False
            t5_cfg.megatron_amp_O2 = self.megatron_amp_o2

        self.model = MegatronT5Model.restore_from(
            self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
            trainer=trainer,
            override_config_path=t5_cfg,
        )

        # self.model = MegatronT5Model.restore_from(
        #     self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
        #     trainer=trainer)

        self.tokenizer = self.model.tokenizer

        self.float_type = self.model.enc_dec_model.enc_dec_model.encoder.model.layers[0].dtype

        if not cfg.use_lm_finetune:
            self.model.freeze()

        hidden_size = self.model.cfg.hidden_size

        # grab the encoder embeddings from the pretrained T5 model
        self.word_embeddings = self.model.enc_dec_model.encoder_embedding.word_embeddings
        self.position_embeddings = self.model.enc_dec_model.encoder_embedding.position_embeddings

        # self.vocab = self.tokenizer.tokenizer.get_vocab()

        self.template = cfg.prompt_encoder.template

        self.prompt_encoder = PromptEncoder(
            template=cfg.prompt_encoder.template,
            hidden_size=hidden_size,
            lstm_dropout=cfg.prompt_encoder.dropout,
            num_layers=cfg.prompt_encoder.num_layers,
        )

        # load prompt encoder
        self.hidden_size = hidden_size
        self.tokenizer.add_special_tokens([cfg.pseudo_token])

        self.pseudo_token_id = self.tokenizer.special_token_to_id[cfg.pseudo_token]
        self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
        self.spell_length = sum(self.template)
        self._reduced_loss_buffer = []
        self.decoder_seq_length = cfg.get('decoder_seq_length', 10)
Example #3
    def test_Instantiate(self):
        conf = DictConfig({"transform": "GridSampling", "params": {"size": 0.1}})
        t = instantiate_transform(conf)
        self.assertIsInstance(t, GridSampling)

        conf = DictConfig({"transform": "None", "params": {"size": 0.1}})
        with self.assertRaises(ValueError):
            t = instantiate_transform(conf)
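A minimal sketch of the registry-lookup pattern this test exercises (hypothetical; not the actual torch-points3d implementation, and `registry` is an assumed argument):

from omegaconf import DictConfig

def instantiate_transform_sketch(conf: DictConfig, registry: dict):
    """Look up the transform class by name and build it from `params`."""
    name = conf.transform
    if name not in registry:
        raise ValueError(f"Transform {name} is not registered")
    return registry[name](**conf.get("params", {}))

# e.g. instantiate_transform_sketch(conf, {"GridSampling": GridSampling})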
Example #4
    def __init__(self, name: str = "Unnamed", *args, **kwargs) -> None:

        self.__ports__ = {}
        self.aliases = {}
        self.uid = str(uuid.uuid4())[:8]
        if "with_uuid" in kwargs or name == "Unnamed":
            name += "_" + self.uid

        super(Component, self).__init__(name=name, exclude_from_current=True)
        self.info = DictConfig(self.info)
        self.name = name  # overwrite PHIDL's incremental naming convention
        self.name_long = None
Example #5
    def test_dataloaders(self):
        dataset = ForwardShapenetDataset(self.config)
        dataset.create_dataloaders(MockModel(DictConfig({"conv_type": "DENSE"})), 2, False, 1, False)
        forward_set = dataset.test_dataloaders()[0]
        for b in forward_set:
            self.assertEqual(b.origin_id.shape, (2, 2))

        sparseconfig = DictConfig({"dataroot": self.datadir, "category": "Airplane", "forward_category": "Airplane"})
        dataset = ForwardShapenetDataset(sparseconfig)
        dataset.create_dataloaders(MockModel(DictConfig({"conv_type": "PARTIAL_DENSE"})), 2, False, 1, False)
        forward_set = dataset.test_dataloaders()[0]
        for b in forward_set:
            torch.testing.assert_allclose(b.origin_id, torch.tensor([0, 1, 2, 0, 1, 2, 3]))
            torch.testing.assert_allclose(b.sampleid, torch.tensor([0, 1]))
Example #6
    def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
        # FIXME: switch to self._cfg
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        # this prevents base constructor from initializing tokenizer
        self.tokenizer = None

        super().__init__(cfg, trainer=trainer, no_lm_init=no_lm_init)

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self._cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            pipeline_model_parallel_size=cfg.get(
                'pipeline_model_parallel_size', 1),
            pipeline_model_parallel_split_rank=cfg.get(
                'pipeline_model_parallel_split_rank', 0),
            micro_batch_size=cfg.get('micro_batch_size'),
            global_batch_size=cfg.get('global_batch_size'),
            seed=self.cfg.get('seed', 1234),
            apex_transformer_log_level=self.cfg.get(
                'apex_transformer_log_level', 30),
        )

        self.grad_clip_pl_default = False  # whether to use PyTorch Lightning's default gradient clipping

        if hasattr(
                self._cfg,
                "tokenizer") or (hasattr(self._cfg, "encoder_tokenizer")
                                 and hasattr(self._cfg, "decoder_tokenizer")):
            # build tokenizer (defaults to nemo supported tokenizers)
            self._build_tokenizer()

            # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
            self._build_vocab()
Example #7
def parse_dataset_definition(config: DictConfig):
    """Parse and instantiate a dataset class using the configuration provided.
    
    Parameters
    __________
    config: An omegaconf.dictconfig.DictConfig object
        This is the dataset config dictionary. It must contain the `type` key
        with fully qualified class name.
        
    Raise
    _____
    DataSetError: If the function fails to parse the configuration provided.
    
    Return
    ______
    2-tuple: (Dataset class object, configuration dictionary)
    """
    if "type" not in config:
        raise DataSetError(
            "`type` is missing from DataSet catalog configuration")

    class_obj = config.pop("type")
    if isinstance(class_obj, str):
        if len(class_obj.strip(".")) != len(
                class_obj):  #check if starts or ends with a dot
            raise DataSetError("`type` class path does not support relative "
                               "paths or paths ending with a dot.")
        class_obj = load_obj(class_obj)

    if not issubclass(class_obj, AbstractDataSet):
        raise DataSetError(
            f"DataSet type `{class_obj.__module__}.{class_obj.__qualname__}` "
            f"is invalid: all data set types must extend `AbstractDataSet`.")

    return class_obj, config
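A small usage sketch (the class path `my_pkg.io.CSVDataSet` and the `filepath` key are hypothetical; whatever class the path resolves to must extend `AbstractDataSet`):

from omegaconf import DictConfig

conf = DictConfig({
    "type": "my_pkg.io.CSVDataSet",  # hypothetical fully qualified class name
    "filepath": "data/raw/iris.csv",
})
dataset_class, dataset_config = parse_dataset_definition(conf)
dataset = dataset_class(**dataset_config)  # remaining keys become constructor kwargs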
Example #8
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.model_parallel_size = 1
            app_state.model_parallel_rank = trainer.global_rank

            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get(
                    'tensor_model_parallel_size', 1),
                seed=self.cfg.get('seed', 1234),
            )

        try:
            from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

            compile_helper()
            logging.info('Megatron dataset helper compiled successfully.')
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except ImportError:
            raise ImportError(
                f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
            )
Example #9
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        # FIXME: switch to self._cfg
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(cfg, trainer=trainer)

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self._cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            seed=self._cfg.get('seed', 1234),
        )
Example #10
    def _instanciate_transform_with_random_params(self):
        dico = dict()
        for p, rang in self.transform_params.items():
            if "max" in rang and "min" in rang:
                assert rang["max"] - rang["min"] > 0
                v = np.random.random() * (rang["max"] -
                                          rang["min"]) + rang["min"]

                if rang["type"] == "float":
                    v = float(v)
                elif rang["type"] == "int":
                    v = int(v)
                else:
                    raise NotImplementedError
                dico[p] = v
            elif "value" in rang:
                v = rang["value"]
                dico[p] = v
            else:
                raise NotImplementedError

        trans_opt = DictConfig(dict(params=dico,
                                    transform=self.transform_name))
        random_transform = instantiate_transform(trans_opt, attr="transform")
        return random_transform
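For reference, a sketch of the `transform_params` layout this method consumes (parameter names are illustrative): each entry either gives a `min`/`max` range together with a `type` of "float" or "int", or a fixed `value`.

# Illustrative shape of self.transform_params (names are hypothetical):
transform_params = {
    "radius": {"type": "float", "min": 0.05, "max": 0.2},    # sampled uniformly in [min, max)
    "num_points": {"type": "int", "min": 512, "max": 2048},  # sampled, then cast to int
    "axis": {"value": 2},                                    # used as-is
}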
Example #11
def merge_json(
    doe_directory: Path = CONFIG["doe_directory"],
    gds_directory: Path = CONFIG["gds_directory"],
    extra_directories: Optional[Iterable[Path]] = None,
    jsonpath: Path = CONFIG["mask_directory"] / "metadata.json",
    json_version: int = 6,
    config: DictConfig = TECH,
) -> Dict[str, Any]:
    """Combine several JSON files from config.yml
    in the root of the mask directory, gets mask_name from there

    Args:
        doe_directory: defaults to current working directory
        extra_directories: list of extra_directories
        jsonpath
        json_version:
        config

    """
    logger.debug("Merging JSON files:")
    cells = {}
    extra_directories = extra_directories or []
    config = dataclasses.asdict(config)
    config.pop("library", "")

    for directory in extra_directories + [doe_directory]:
        for filename in directory.glob("*/*.json"):
            logger.debug(filename)
            with open(filename, "r") as f:
                data = json.load(f)
                cells.update(data.get("cells"))

    does = {
        d.stem: json.loads(open(d).read())
        for d in doe_directory.glob("*.json")
    }
    metadata = dict(
        json_version=json_version,
        cells=cells,
        does=does,
        config=config,
    )

    write_config(metadata, jsonpath)
    logger.info(f"Wrote  metadata in {jsonpath}")
    return metadata
Example #12
def recurse_structures(structure: Component) -> DictConfig:
    """Recurse over structures"""
    if (hasattr(structure, "function_name")
            and structure.function_name in IGNORE_FUNCTION_NAMES):
        return DictConfig({})

    if hasattr(structure, "name") and any(
        [structure.name.startswith(i)
         for i in IGNORE_STRUCTURE_NAME_PREFIXES]):
        return DictConfig({})

    output = {structure.name: structure.info}
    for element in structure.references:
        if (isinstance(element, ComponentReference)
                and element.ref_cell.name not in output):
            output.update(recurse_structures(element.ref_cell))

    return output
Example #13
    def setUp(self):
        self.datadir = os.path.join(DIR, "test_dataset")
        self.config = DictConfig(
            {
                "dataroot": self.datadir,
                "test_transforms": [{"transform": "FixedPoints", "lparams": [2]}],
                "category": ["Airplane", "Cap"],
                "forward_category": "Airplane",
            }
        )
Example #14
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)
        if cfg.get('pipeline_model_parallel_size', 1) > 1:
            if cfg.get('pipeline_model_parallel_split_rank', 0) <= 0:
                raise ValueError(
                    f"pipeline_model_parallel_split_rank must be > 0 when using pipeline_model_parallel_size > 1"
                )

        # Make sure trainer.accumulate_grad_batches is 1.
        self._validate_trainer()

        # TODO: Not sure how to use lists of modules with PTL.
        # This means we can only use pipeline parallelism without the interleaved schedule.
        self.enc_dec_model = build_model(
            model_provider_func=self.model_provider_func,
            wrap_with_ddp=False,
            model_type=ModelType.encoder_and_decoder,
        )[0]

        # We don't need to call this explicitly since it is a PyTorch Lightning hook.
        # self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.enc_dec_model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision)

        if self.cfg.precision == 32:
            self.autocast_dtype = torch.float
        elif self.cfg.precision == 16:
            self.autocast_dtype = torch.half
        elif self.cfg.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')

        self.enc_dec_model.model_type = ModelType.encoder_and_decoder
Example #15
    def _setup_eval_dataloader_from_config(self, cfg: DictConfig, dataset):

        rank = parallel_state.get_data_parallel_rank()
        world_size = parallel_state.get_data_parallel_world_size()
        dataloaders = []
        for _dataset in dataset:
            sampler = torch.utils.data.distributed.DistributedSampler(
                _dataset, num_replicas=world_size, rank=rank, shuffle=False)
            dataloaders.append(
                torch.utils.data.DataLoader(
                    dataset=_dataset,
                    batch_size=1,
                    sampler=sampler,
                    num_workers=cfg.get("num_workers", 0),
                    pin_memory=cfg.get("pin_memory", False),
                    drop_last=cfg.get("drop_last", False),
                    shuffle=False,
                ))

        return dataloaders
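The `cfg` passed in only needs the loader options read via `cfg.get(...)` above; a minimal sketch:

from omegaconf import DictConfig

# Minimal eval-dataloader config (keys taken from the .get() calls above):
eval_cfg = DictConfig({"num_workers": 2, "pin_memory": True, "drop_last": False})
# dataloaders = self._setup_eval_dataloader_from_config(eval_cfg, [my_dataset])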
Example #16
    def _flatten_compact_options(self, opt):
        """Converts from a dict of lists, to a list of dicts"""
        flattenedOpts = []

        for index in range(int(1e6)):
            try:
                flattenedOpts.append(DictConfig(self._fetch_arguments_from_list(opt, index)))
            except IndexError:
                break

        return flattenedOpts
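A sketch of the intended behavior, assuming `_fetch_arguments_from_list(opt, i)` picks the i-th element of every list in `opt` (that helper is not shown here):

# opt = DictConfig({"radius": [0.1, 0.2], "nsample": [16, 32]})
# self._flatten_compact_options(opt) would then return:
# [DictConfig({"radius": 0.1, "nsample": 16}),
#  DictConfig({"radius": 0.2, "nsample": 32})]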
Example #17
    def to_dict_polygons(self) -> DictConfig:
        """Returns a dict representation of the flattened compoment."""
        d = DictConfig({})
        polygons = {}
        layer_to_polygons = self.get_polygons(by_spec=True)

        for layer, polygons_layer in layer_to_polygons.items():
            for polygon in polygons_layer:
                layer_name = f"{layer[0]}_{layer[1]}"
                polygons[layer_name] = [
                    tuple(snap_to_grid(v)) for v in polygon
                ]

        ports = {port.name: port.settings for port in self.get_ports_list()}
        clean_dict(ports)
        clean_dict(polygons)
        d.info = self.info
        d.polygons = polygons
        d.ports = ports
        return OmegaConf.create(d)
Example #18
    def test_predictupsamplepartialdense(self):
        dataset = ForwardShapenetDataset(self.config)
        dataset.create_dataloaders(MockModel(DictConfig({"conv_type": "PARTIAL_DENSE"})), 2, False, 1, False)
        forward_set = dataset.test_dataloaders()[0]
        for b in forward_set:
            output = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]])
            predicted = dataset.predict_original_samples(b, "PARTIAL_DENSE", output)
            self.assertEqual(len(predicted), 2)
            self.assertEqual(predicted["example1.txt"].shape, (3, 4))
            self.assertEqual(predicted["example2.txt"].shape, (4, 4))
            npt.assert_allclose(predicted["example1.txt"][:, -1], np.asarray([0, 0, 0]))
            npt.assert_allclose(predicted["example2.txt"][:, -1], np.asarray([1, 1, 1, 1]))
Example #19
def objective(trial):
    umap_params = {
        "n_components": trial.suggest_categorical('n_components', [2]),
        "random_state": trial.suggest_categorical('random_state', [42]),
        "n_neighbors": trial.suggest_int('n_neighbors', 2, 6),
        "min_dist": trial.suggest_uniform("min_dist", 0.1, 1),  # スケールしてるから1程度
    }
    scale = trial.suggest_uniform("scale", 0.1, 0.4)
    config_a = DictConfig({
        "umap": umap_params,
        "scale": scale,
        "u_duration": hyparam.u_duration_a,
        "w_duration": hyparam.w_duration_a
    })
    config_b = DictConfig({
        "umap": umap_params,
        "scale": scale,
        "u_duration": hyparam.u_duration_b,
        "w_duration": hyparam.w_duration_b
    })

    a = Agent(config_a)
    b = Agent(config_b)

    n_target = 0
    n_correct = 0
    perceptions = []
    for n in range(hyparam.n_iter):
        src = hyparam.source
        phoneme, obs, states = a.production(src)
        obs = np.array(obs).astype('double')
        phoneme_hat, obs, states_hat = b.perception(obs)
        perceptions.append(phoneme_hat)
        n_target += phoneme_hat == hyparam.target

    print(Counter(perceptions))
    return n_target * (1 - scale)
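A sketch of how this objective could be driven with Optuna (assumes `hyparam`, `Agent`, and the other module-level names used above are importable from the surrounding script):

import optuna

# Maximize the scale-penalized count of correct perceptions returned by objective().
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print(study.best_params, study.best_value)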
Example #20
    def to_dict_config(self) -> DictConfig:
        """Returns a DictConfig representation of the compoment."""
        d = DictConfig({})
        ports = {port.name: port.settings for port in self.get_ports_list()}
        clean_dict(ports)

        d.ports = ports
        d.info = self.info
        d.version = 1
        d.cells = recurse_structures(self)
        return OmegaConf.create(d)
Example #21
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)

        # Make sure trainer.accumulate_grad_batches is 1.
        self._validate_trainer()

        # build tokenizer (defaults to nemo supported tokenizers)
        self._build_tokenizer()

        # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
        self._build_vocab()

        # TODO: Not sure how to use lists of modules with PTL.
        # This means we can only use pipeline parallelism without the interleaved schedule.
        self.enc_dec_model = build_model(
            model_provider_func=self.model_provider_func,
            wrap_with_ddp=False,
            model_type=ModelType.encoder_and_decoder,
        )[0]

        self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.enc_dec_model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision)

        if self.cfg.precision == 32:
            self.autocast_dtype = torch.float
        elif self.cfg.precision == 16:
            self.autocast_dtype = torch.half
        elif self.cfg.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')

        self.enc_dec_model.model_type = ModelType.encoder_and_decoder
Example #22
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.model_parallel_size = 1
            app_state.model_parallel_rank = trainer.global_rank

            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get(
                    'tensor_model_parallel_size', 1),
                seed=self.cfg.get('seed', 1234),
            )
Example #23
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        # this prevents base constructor from initializing tokenizer
        self.tokenizer = None
        super().__init__(cfg, trainer=trainer, no_lm_init=True)

        self._validate_trainer()

        # TODO: Not sure how to use lists of modules with PTL.
        # This means we can only use pipeline parallelism without the interleaved schedule.
        self.model = build_model(model_provider_func=self.model_provider_func,
                                 wrap_with_ddp=False)[0]

        # We don't need to call this explicitly since it is a PyTorch Lightning hook.
        # self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.model = Float16Module(module=self.model,
                                       precision=cfg.precision)

        if self.trainer.precision == 32:
            self.autocast_dtype = torch.float
        elif self.trainer.precision == 16:
            self.autocast_dtype = torch.half
        elif self.trainer.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')

        # configuration used for inference
        self._inference_config = None
Example #24
def flatten_cfg(cfg: Union[DictConfig, ListConfig]) -> dict:
    """ 
    Recursively flattens a config into a flat dictionary compatible with 
    tensorboard's `add_hparams` function.
    """
    out_dict = {}
    if type(cfg) == ListConfig:
        cfg = DictConfig({f"[{i}]": v for i, v in enumerate(cfg)})

    for key in cfg:
        if type(getattr(cfg, key)) in (int, str, bool, float):
            out_dict[key] = getattr(cfg, key)
        elif type(getattr(cfg, key)) in [DictConfig, ListConfig]:
            out_dict = out_dict | {
                f"{key}{'.' if type(getattr(cfg, key)) == DictConfig else ''}{k}":
                v
                for k, v in flatten_cfg(getattr(cfg, key)).items()
            }
        else:
            raise AssertionError
    return out_dict
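A quick illustration of the flattening behavior (expected output shown as a comment):

from omegaconf import DictConfig

cfg = DictConfig({"optimizer": {"lr": 1e-3, "betas": [0.9, 0.999]}, "seed": 42})
flat = flatten_cfg(cfg)
# flat == {"optimizer.lr": 0.001,
#          "optimizer.betas[0]": 0.9,
#          "optimizer.betas[1]": 0.999,
#          "seed": 42}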
Example #25
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        # FIXME: switch to self._cfg
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        # this prevents base constructor from initializing tokenizer
        self.tokenizer = None

        super().__init__(cfg, trainer=trainer, no_lm_init=True)

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self._cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []

        if cfg.get('pipeline_model_parallel_size', 1) > 1:
            if cfg.get('pipeline_model_parallel_split_rank', 0) <= 0:
                raise ValueError(
                    f"pipeline_model_parallel_split_rank must be > 0 when using pipeline_model_parallel_size > 1"
                )

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            pipeline_model_parallel_size=cfg.get(
                'pipeline_model_parallel_size', 1),
            pipeline_model_parallel_split_rank=cfg.get(
                'pipeline_model_parallel_split_rank', 0),
            micro_batch_size=cfg.get('micro_batch_size'),
            global_batch_size=cfg.get('global_batch_size'),
            seed=self.cfg.get('seed', 1234),
            apex_transformer_log_level=self.cfg.get(
                'apex_transformer_log_level', 30),
        )
Example #26
def parse_catalog_configuration(catalog: DictConfig, data_dir: str):
    """Parse the catalog configuration
    
    Parameters
    __________
    catalog : An omegaconf.dictconfig.DictConfig object
        This is the catalog configuration
    data_dir : str
        The directory where the data is stored.
    
    Return
    ______
    omegaconf.dictconfig.DictConfig: {Dataset name : configuration dictionary}
    """
    output = {}
    local_dir = os.path.join(data_dir, catalog.site_name)
    datasets = catalog.datasets
    levels = datasets.keys()

    for level in levels:
        subset = datasets[level]
        for name, contents in subset.items():
            output[name] = {
                'local_dir': local_dir,
                'file_name': contents.file_name,
                'type': contents.type,
                'data_stage': DataStage[level.upper()].value,
                'ml_stages': contents.stages
            }

            if 'load_args' in contents:
                output[name].update(dict(load_args=contents.load_args))
            if 'save_args' in contents:
                output[name].update(dict(save_args=contents.save_args))

    return DictConfig(output)
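A rough sketch of the catalog shape this expects (dataset and file names are illustrative; the keys under `datasets` must match members of the `DataStage` enum):

from omegaconf import DictConfig

catalog = DictConfig({
    "site_name": "site_a",
    "datasets": {
        "raw": {                              # must be a DataStage member name
            "sensor_readings": {
                "file_name": "readings.csv",
                "type": "pandas.CSVDataSet",  # hypothetical dataset type
                "stages": ["train"],
            },
        },
    },
})
# parsed = parse_catalog_configuration(catalog, data_dir="/data")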
Example #27
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(cfg, trainer)
        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)
        # TODO: Fix this once apex patches FusedScaledMaskedSoftmax.
        # This is a workaround for the fact that `masked_softmax_fusion` has issues with certain input sizes that may be present while finetuning.
        t5_cfg = MegatronT5Model.restore_from(self.register_artifact(
            't5_base_model', cfg.restore_from_path),
                                              trainer=trainer,
                                              return_config=True)
        OmegaConf.set_struct(t5_cfg, True)
        with open_dict(t5_cfg):
            t5_cfg.masked_softmax_fusion = False
            t5_cfg.megatron_amp_O2 = self.megatron_amp_o2

        self.model = MegatronT5Model.restore_from(
            self.register_artifact('t5_base_model', cfg.restore_from_path),
            trainer=trainer,
            override_config_path=t5_cfg,
        )
        self.setup_optimizer_param_groups()
Example #28
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)
        self.cfg = cfg

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self.cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []
        self._reduced_lm_loss_buffer = []
        self._reduced_sop_loss_buffer = []

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            seed=self.cfg.get('seed', 1234),
        )

        self.tokenizer = get_nmt_tokenizer(
            library=self.cfg.tokenizer.library,
            model_name=self.cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer_model",
                                                   self.cfg.tokenizer.model),
            vocab_file=self.register_artifact("vocab_file",
                                              self.cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("merges_file",
                                               self.cfg.tokenizer.merge_file),
        )

        vocab_size = self.tokenizer.vocab_size

        padded_vocab_size = self._vocab_size_with_padding(
            orig_vocab_size=vocab_size,
            make_vocab_size_divisible_by=cfg.get(
                'make_vocab_size_divisible_by', 128),
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
        )

        num_tokentypes = 2 if cfg.bert_binary_head else 0

        self.model = BertModel(
            vocab_size=padded_vocab_size,
            hidden_size=cfg.hidden_size,
            max_position_embeddings=cfg.max_position_embeddings,
            num_layers=cfg.num_layers,
            num_attention_heads=cfg.num_attention_heads,
            apply_query_key_layer_scaling=cfg.get(
                'apply_query_key_layer_scaling', True),
            kv_channels=cfg.get('kv_channels', None),
            ffn_hidden_size=cfg.ffn_hidden_size,
            num_tokentypes=num_tokentypes,
            parallel_output=True,
            pre_process=cfg.get('pre_process', True),
            post_process=cfg.get('post_process', True),
            init_method_std=cfg.get('init_method_std', 0.02),
            fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
            use_cpu_initialization=cfg.get('use_cpu_initialization', False),
            hidden_dropout=cfg.get('hidden_dropout', 0.1),
            precision=cfg.get('precision', 16),
            fp32_residual_connection=cfg.get('fp32_residual_connection',
                                             False),
            activations_checkpoint_method=cfg.get(
                'activations_checkpoint_method', None),
            activations_checkpoint_num_layers=cfg.get(
                'activations_checkpoint_num_layers', 1),
            layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
            onnx_safe=cfg.get('onnx_safe', False),
            add_binary_head=cfg.bert_binary_head,
        )
Example #29
def build_basic_params():
    return DictConfig({"base_lr": 0.001})
Example #30
def train(net, losses_fn, metrics_fn, train_loader, eval_loader, optimizer,
          scheduler, cfg):
    logfile = open(os.path.join(cfg.checkpoint_dir, 'log_run.txt'), 'w+')

    logwrite(logfile, str(cfg), to_print=False)
    logwrite(
        logfile, "Total number of parameters : " + str(
            sum([p.numel()
                 for p in net.parameters() if p.requires_grad]) / 1e6) + "M")

    cfg.run = DictConfig({
        'no_improvements': 0,
        'current_epoch': 0,
        'early_stop': 0,
        'best_early_stop_metric': 0.0 if cfg.early_stop.higher_is_better else float('inf'),
    })

    scaler = torch.cuda.amp.GradScaler()

    summary = ''
    batch_size = cfg.hyperparameter.batch_size
    metric_comparison_func = operator.gt if cfg.early_stop.higher_is_better else operator.lt

    for epoch in range(0, 9999):
        cfg.run.current_epoch = epoch
        net.train()
        time_start = time.time()
        for step, sample in enumerate(train_loader):
            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                pred = net(sample)

            losses = 0.0
            for loss_fn in losses_fn:
                loss = loss_fn.weight * loss_fn(pred, sample)
                losses += loss
            scaler.scale(losses).backward()

            # Gradient norm clipping
            if cfg.hyperparameter.grad_norm_clip > 0:
                nn.utils.clip_grad_norm_(net.parameters(),
                                         cfg.hyperparameter.grad_norm_clip)

            scaler.step(optimizer)
            scaler.update()

            summary = "\r[Epoch {}][Step {}/{}] Loss: {}, Lr: {}, ES: {}/{} ({}: {:.2f}) - {:.2f} m remaining".format(
                cfg.run.current_epoch + 1,
                step,
                int(len(train_loader.dataset) / batch_size),
                [
                    "{}: {:.2f}".format(
                        type(loss_fn).__name__, loss_fn.mean_running_loss)
                    for loss_fn in losses_fn
                ],
                *[group['lr'] for group in optimizer.param_groups],
                cfg.run.no_improvements,
                cfg.early_stop.no_improvements,
                cfg.early_stop.early_stop_metric,
                cfg.run.best_early_stop_metric,
                ((time.time() - time_start) / (step + 1)) *
                ((len(train_loader.dataset) / batch_size) - step) / 60,
            )
            print(summary, end='          ')

        time_end = time.time()
        elapse_time = time_end - time_start
        print('Finished in {}s'.format(int(elapse_time)))
        logwrite(logfile, summary)

        if epoch + 1 >= cfg.hyperparameter.eval_start:
            metrics = evaluate(net, losses_fn, metrics_fn, eval_loader, cfg)
            logwrite(logfile, metrics)
            metric_value = metrics[cfg.early_stop.early_stop_metric]
            cfg.run.no_improvements += 1

            # Best model beaten
            if metric_comparison_func(metric_value,
                                      cfg.run.best_early_stop_metric):
                torch.save(
                    {
                        'state_dict': net.state_dict(),
                        # 'optimizer': optimizer.state_dict(),
                        # 'scheduler': scheduler.state_dict(),
                        'cfg': cfg,
                        'metrics': metrics,
                    },
                    os.path.join(cfg.checkpoint_dir, 'best.pkl'))
                cfg.run.no_improvements = 0
                cfg.run.best_early_stop_metric = float(metric_value)

            # Scheduler
            if cfg.scheduler.use_scheduler:
                scheduler.step(metrics[cfg.early_stop.early_stop_metric])

        # Early stop ?
        if cfg.run.no_improvements == cfg.early_stop.no_improvements:
            import sys
            os.rename(
                os.path.join(cfg.checkpoint_dir, 'best.pkl'),
                os.path.join(
                    cfg.checkpoint_dir,
                    'best' + str(cfg.run.best_early_stop_metric) + '.pkl'))
            print('Early stop reached')
            sys.exit()