Example #1
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        # All of the lines below need to be set when the parent class calls self._build_tokenizer()
        self.encoder_tokenizer_library = cfg.encoder_tokenizer.get(
            'library', 'yttm')
        self.decoder_tokenizer_library = cfg.decoder_tokenizer.get(
            'library', 'yttm')
        self.special_tokens = {}
        self.src_language = cfg.get("src_language", None)
        self.tgt_language = cfg.get("tgt_language", None)

        self.multilingual = cfg.get("multilingual", False)
        self.multilingual_ids = []

        self.validate_input_ids = cfg.get("validate_input_ids", True)
        if self.multilingual:
            if isinstance(self.src_language, ListConfig) and isinstance(
                    self.tgt_language, ListConfig):
                raise ValueError(
                    "cfg.src_language and cfg.tgt_language cannot both be lists. We only support many-to-one or one-to-many multilingual models."
                )
            elif isinstance(self.src_language, ListConfig):
                pass
            elif isinstance(self.tgt_language, ListConfig):
                for lng in self.tgt_language:
                    self.special_tokens["<" + lng + ">"] = "<" + lng + ">"
            else:
                raise ValueError(
                    "Expect either cfg.src_language or cfg.tgt_language to be a list when multilingual=True."
                )

        super().__init__(cfg, trainer=trainer)
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer)

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)
        # TODO: Fix this once apex patches FusedScaledMaskedSoftmax.
        # This is a workaround for the fact that `masked_softmax_fusion` has issues with certain input sizes that may be present while finetuning.
        t5_cfg = MegatronT5Model.restore_from(
            self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
            trainer=trainer,
            return_config=True,
        )
        OmegaConf.set_struct(t5_cfg, True)
        with open_dict(t5_cfg):
            t5_cfg.masked_softmax_fusion = False
            t5_cfg.megatron_amp_O2 = self.megatron_amp_o2

        self.model = MegatronT5Model.restore_from(
            self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
            trainer=trainer,
            override_config_path=t5_cfg,
        )

        # self.model = MegatronT5Model.restore_from(
        #     self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
        #     trainer=trainer)

        self.tokenizer = self.model.tokenizer

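        # Record the parameter dtype of the restored model's first encoder layer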
        self.float_type = self.model.enc_dec_model.enc_dec_model.encoder.model.layers[0].dtype

        if not cfg.use_lm_finetune:
            self.model.freeze()

        hidden_size = self.model.cfg.hidden_size

        # Reuse the frozen T5 model's encoder word and position embeddings
        self.word_embeddings = self.model.enc_dec_model.encoder_embedding.word_embeddings
        self.position_embeddings = self.model.enc_dec_model.encoder_embedding.position_embeddings

        # self.vocab = self.tokenizer.tokenizer.get_vocab()

        self.template = cfg.prompt_encoder.template

        self.prompt_encoder = PromptEncoder(
            template=cfg.prompt_encoder.template,
            hidden_size=hidden_size,
            lstm_dropout=cfg.prompt_encoder.dropout,
            num_layers=cfg.prompt_encoder.num_layers,
        )

        # Store the hidden size and register the pseudo prompt token with the tokenizer
        self.hidden_size = hidden_size
        self.tokenizer.add_special_tokens([cfg.pseudo_token])

        self.pseudo_token_id = self.tokenizer.special_token_to_id[cfg.pseudo_token]
        self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
        self.spell_length = sum(self.template)
        self._reduced_loss_buffer = []
        self.decoder_seq_length = cfg.get('decoder_seq_length', 10)
Example #3
    def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
        # FIXME: switch to self._cfg
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        # this prevents base constructor from initializing tokenizer
        self.tokenizer = None

        super().__init__(cfg, trainer=trainer, no_lm_init=no_lm_init)

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self._cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []

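        # Set up tensor/pipeline model-parallel state (and the seed) before building the model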
        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            pipeline_model_parallel_size=cfg.get(
                'pipeline_model_parallel_size', 1),
            pipeline_model_parallel_split_rank=cfg.get(
                'pipeline_model_parallel_split_rank', 0),
            micro_batch_size=cfg.get('micro_batch_size'),
            global_batch_size=cfg.get('global_batch_size'),
            seed=self.cfg.get('seed', 1234),
            apex_transformer_log_level=self.cfg.get(
                'apex_transformer_log_level', 30),
        )

        self.grad_clip_pl_default = False  # whether to use PyTorch Lightning's default gradient clipping (off by default)

        if hasattr(
                self._cfg,
                "tokenizer") or (hasattr(self._cfg, "encoder_tokenizer")
                                 and hasattr(self._cfg, "decoder_tokenizer")):
            # build tokenizer (defaults to nemo supported tokenizers)
            self._build_tokenizer()

            # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
            self._build_vocab()
Example #4
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        # FIXME: switch to self._cfg
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(cfg, trainer=trainer)

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self._cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            seed=self._cfg.get('seed', 1234),
        )
Example #5
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.model_parallel_size = 1
            app_state.model_parallel_rank = trainer.global_rank

            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get(
                    'tensor_model_parallel_size', 1),
                seed=self.cfg.get('seed', 1234),
            )

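        # Compile the Megatron dataset C++ helpers; importing them verifies the build succeeded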
        try:
            from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

            compile_helper()
            logging.info('Megatron dataset helper compiled successfully.')
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except ImportError:
            raise ImportError(
                f'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
            )
Example #6
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)
        if cfg.get('pipeline_model_parallel_size', 1) > 1:
            if cfg.get('pipeline_model_parallel_split_rank', 0) <= 0:
                raise ValueError(
                    f"pipeline_model_parallel_split_rank must be > 0 when using pipeline_model_parallel_size > 1"
                )

        # Make sure trainer.accumulate_grad_batches is 1.
        self._validate_trainer()

        # TODO: Not sure how to use lists of modules with PTL.
        # This means we can only use pipeline parallelism without the interleaved schedule.
        self.enc_dec_model = build_model(
            model_provider_func=self.model_provider_func,
            wrap_with_ddp=False,
            model_type=ModelType.encoder_and_decoder,
        )[0]

        # We don't need to call it explicitly? Since it is a pytorch lightning hook function
        # self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.enc_dec_model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision)

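        # Choose the autocast dtype from the configured precision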
        if self.cfg.precision == 32:
            self.autocast_dtype = torch.float
        elif self.cfg.precision == 16:
            self.autocast_dtype = torch.half
        elif self.cfg.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')

        self.enc_dec_model.model_type = ModelType.encoder_and_decoder
Example #7
    def _setup_eval_dataloader_from_config(self, cfg: DictConfig, dataset):
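        # Build one non-shuffling DataLoader per dataset, sharded across data-parallel ranks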

        rank = parallel_state.get_data_parallel_rank()
        world_size = parallel_state.get_data_parallel_world_size()
        dataloaders = []
        for _dataset in dataset:
            sampler = torch.utils.data.distributed.DistributedSampler(
                _dataset, num_replicas=world_size, rank=rank, shuffle=False)
            dataloaders.append(
                torch.utils.data.DataLoader(
                    dataset=_dataset,
                    batch_size=1,
                    sampler=sampler,
                    num_workers=cfg.get("num_workers", 0),
                    pin_memory=cfg.get("pin_memory", False),
                    drop_last=cfg.get("drop_last", False),
                    shuffle=False,
                ))

        return dataloaders
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)

        # Make sure trainer.accumulate_grad_batches is 1.
        self._validate_trainer()

        # build tokenizer (defaults to nemo supported tokenizers)
        self._build_tokenizer()

        # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
        self._build_vocab()

        # TODO: Not sure how to use lists of modules with PTL.
        # This means we can only use pipeline parallelism without the interleaved schedule.
        self.enc_dec_model = build_model(
            model_provider_func=self.model_provider_func,
            wrap_with_ddp=False,
            model_type=ModelType.encoder_and_decoder,
        )[0]

        self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.enc_dec_model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision)

        if self.cfg.precision == 32:
            self.autocast_dtype = torch.float
        elif self.cfg.precision == 16:
            self.autocast_dtype = torch.half
        elif self.cfg.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')

        self.enc_dec_model.model_type = ModelType.encoder_and_decoder
Example #9
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        app_state = AppState()

        if not app_state._is_megatron_initialized:
            logging.info(
                f"Initializing megatron since it hasn't been initialized by the model. This is normal if you are using a NeMo model with Megatron dataloaders."
            )
            app_state.global_rank = trainer.global_rank
            app_state.world_size = trainer.world_size
            app_state.model_parallel_size = 1
            app_state.model_parallel_rank = trainer.global_rank

            initialize_model_parallel_for_nemo(
                world_size=trainer.world_size,
                global_rank=trainer.global_rank,
                local_rank=trainer.local_rank,
                tensor_model_parallel_size=cfg.get(
                    'tensor_model_parallel_size', 1),
                seed=self.cfg.get('seed', 1234),
            )
Example #10
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        # this prevents base constructor from initializing tokenizer
        self.tokenizer = None
        super().__init__(cfg, trainer=trainer, no_lm_init=True)

        self._validate_trainer()

        # TODO: Not sure how to use lists of modules with PTL.
        # This means we can only use pipeline parallelism without the interleaved schedule.
        self.model = build_model(model_provider_func=self.model_provider_func,
                                 wrap_with_ddp=False)[0]

        # We don't need to call it explicitly? Since it is a pytorch lightning hook function
        # self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.model = Float16Module(module=self.model,
                                       precision=cfg.precision)

        if self.trainer.precision == 32:
            self.autocast_dtype = torch.float
        elif self.trainer.precision == 16:
            self.autocast_dtype = torch.half
        elif self.trainer.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')

        # configuration used for inference
        self._inference_config = None
Example #11
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        # FIXME: switch to self._cfg
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        # this prevents base constructor from initializing tokenizer
        self.tokenizer = None

        super().__init__(cfg, trainer=trainer, no_lm_init=True)

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self._cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []

        if cfg.get('pipeline_model_parallel_size', 1) > 1:
            if cfg.get('pipeline_model_parallel_split_rank', 0) <= 0:
                raise ValueError(
                    f"pipeline_model_parallel_split_rank must be > 0 when using pipeline_model_parallel_size > 1"
                )

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            pipeline_model_parallel_size=cfg.get(
                'pipeline_model_parallel_size', 1),
            pipeline_model_parallel_split_rank=cfg.get(
                'pipeline_model_parallel_split_rank', 0),
            micro_batch_size=cfg.get('micro_batch_size'),
            global_batch_size=cfg.get('global_batch_size'),
            seed=self.cfg.get('seed', 1234),
            apex_transformer_log_level=self.cfg.get(
                'apex_transformer_log_level', 30),
        )
Example #12
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(cfg, trainer)
        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)
        # TODO: Fix this once apex patches FusedScaledMaskedSoftmax.
        # This is a workaround for the fact that `masked_softmax_fusion` has issues with certain input sizes that may be present while finetuning.
        t5_cfg = MegatronT5Model.restore_from(self.register_artifact(
            't5_base_model', cfg.restore_from_path),
                                              trainer=trainer,
                                              return_config=True)
        OmegaConf.set_struct(t5_cfg, True)
        with open_dict(t5_cfg):
            t5_cfg.masked_softmax_fusion = False
            t5_cfg.megatron_amp_O2 = self.megatron_amp_o2

        self.model = MegatronT5Model.restore_from(
            self.register_artifact('t5_base_model', cfg.restore_from_path),
            trainer=trainer,
            override_config_path=t5_cfg,
        )
        self.setup_optimizer_param_groups()
Example #13
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)
        self.cfg = cfg

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self.cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []
        self._reduced_lm_loss_buffer = []
        self._reduced_sop_loss_buffer = []

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            seed=self.cfg.get('seed', 1234),
        )

        self.tokenizer = get_nmt_tokenizer(
            library=self.cfg.tokenizer.library,
            model_name=self.cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer_model",
                                                   self.cfg.tokenizer.model),
            vocab_file=self.register_artifact("vocab_file",
                                              self.cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("merges_file",
                                               self.cfg.tokenizer.merge_file),
        )

        vocab_size = self.tokenizer.vocab_size

        padded_vocab_size = self._vocab_size_with_padding(
            orig_vocab_size=vocab_size,
            make_vocab_size_divisible_by=cfg.get(
                'make_vocab_size_divisible_by', 128),
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
        )

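        # Two token-type (segment) embeddings are only needed when the binary head is enabled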
        num_tokentypes = 2 if cfg.bert_binary_head else 0

        self.model = BertModel(
            vocab_size=padded_vocab_size,
            hidden_size=cfg.hidden_size,
            max_position_embeddings=cfg.max_position_embeddings,
            num_layers=cfg.num_layers,
            num_attention_heads=cfg.num_attention_heads,
            apply_query_key_layer_scaling=cfg.get(
                'apply_query_key_layer_scaling', True),
            kv_channels=cfg.get('kv_channels', None),
            ffn_hidden_size=cfg.ffn_hidden_size,
            num_tokentypes=num_tokentypes,
            parallel_output=True,
            pre_process=cfg.get('pre_process', True),
            post_process=cfg.get('post_process', True),
            init_method_std=cfg.get('init_method_std', 0.02),
            fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
            use_cpu_initialization=cfg.get('use_cpu_initialization', False),
            hidden_dropout=cfg.get('hidden_dropout', 0.1),
            precision=cfg.get('precision', 16),
            fp32_residual_connection=cfg.get('fp32_residual_connection',
                                             False),
            activations_checkpoint_method=cfg.get(
                'activations_checkpoint_method', None),
            activations_checkpoint_num_layers=cfg.get(
                'activations_checkpoint_num_layers', 1),
            layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
            onnx_safe=cfg.get('onnx_safe', False),
            add_binary_head=cfg.bert_binary_head,
        )
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer)

        self.cfg = cfg

        # Load pretrained GPT model and tokenizer
        self.model = MegatronGPTModel.restore_from(
            self.register_artifact('language_model_path',
                                   cfg.get('language_model_path', None)),
            trainer=trainer,
            save_restore_connector=NLPSaveRestoreConnector(),
        )

        # Freeze all GPT model weights for prompt-tuning/p-tuning
        if not cfg.lm_finetune:
            self.model.freeze()

        self.tokenizer = self.model.tokenizer
        self.float_type = self.model.model.language_model.encoder.layers[
            0].dtype
        self.hidden_size = self.model.cfg.hidden_size
        self.word_embeddings = self.model.model.language_model.embedding.word_embeddings
        self.existing_tasks = list(self.cfg.get('existing_tasks', []))
        self.new_tasks = list(self.cfg.get('new_tasks', []))

        # Load templates for assigning virtual prompt token positions
        self.load_task_templates(self.cfg.task_templates)

        # Prompt table stores all task embeddings, p-tuning virtual prompts get added to the table after training
        self.prompt_table = PromptTable(
            existing_tasks=self.existing_tasks,
            task_templates=self.task_templates,
            task_id_num_to_name=self.task_id_num_to_name,
            hidden_size=self.hidden_size,
        )

        # Prepare pseudo token ids for virtual prompt tokens
        self.pseudo_token_base = cfg.pseudo_token_base
        self.pseudo_tokens = [
            self.pseudo_token_base + str(i)
            for i in range(self.max_virtual_tokens)
        ]
        self.tokenizer.add_special_tokens(
            {'additional_special_tokens': self.pseudo_tokens})
        self.pseudo_token_ids = self.tokenizer.tokens_to_ids(
            self.pseudo_tokens)
        self.pseudo_token_ids_start = self.pseudo_token_ids[0]
        self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
        self.virtual_prompt_style = cfg.virtual_prompt_style.lower()

        # Prompt tuning stores virtual prompts in the prompt table and tunes their weight directly
        if self.virtual_prompt_style in ['prompt-tuning', 'inference']:
            self.virtual_prompt_source = 'prompt-table'

        # P-Tuning uses an LSTM Encoder to produce virtual token embeddings
        elif self.virtual_prompt_style == 'p-tuning':
            self.virtual_prompt_source = 'prompt-encoder'
        else:
            raise ValueError(
                f"\nvirtual prompt style '{cfg.virtual_prompt_style}' not recognized, please use one of 'prompt-tuning' or 'p-tuning'"
            )

        self._reduced_loss_buffer = []
        self._inference_config = None

        if self.trainer.precision == 32:
            self.autocast_dtype = torch.float
        elif self.trainer.precision == 16:
            self.autocast_dtype = torch.half
        elif self.trainer.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')
Example #15
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        # this prevents base constructor from initializing tokenizer
        self.tokenizer = None
        super().__init__(cfg, trainer=trainer, no_lm_init=True)

        self._validate_trainer()

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self.cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            pipeline_model_parallel_size=cfg.get(
                'pipeline_model_parallel_size', 1),
            micro_batch_size=cfg.get('micro_batch_size'),
            global_batch_size=cfg.get('global_batch_size'),
            seed=self.cfg.get('seed', 1234),
            apex_transformer_log_level=self.cfg.get(
                'apex_transformer_log_level', 30),
        )

        self.tokenizer = get_nmt_tokenizer(
            library=self.cfg.tokenizer.library,
            model_name=self.cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer.model",
                                                   self.cfg.tokenizer.model),
            vocab_file=self.register_artifact("tokenizer.vocab_file",
                                              self.cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("tokenizer.merge_file",
                                               self.cfg.tokenizer.merge_file),
            delimiter=self.cfg.tokenizer.get('delimiter', None),
        )

        vocab_size = self.tokenizer.vocab_size

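        # Pad the vocabulary so the embedding table divides evenly across tensor-parallel ranks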
        self.padded_vocab_size = self._vocab_size_with_padding(
            orig_vocab_size=vocab_size,
            make_vocab_size_divisible_by=cfg.get(
                'make_vocab_size_divisible_by', 128),
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
        )

        # TODO: Not sure how to use lists of modules with PTL.
        # This means we can only use pipeline parallelism without the interleaved schedule.
        self.model = build_model(model_provider_func=self.model_provider_func,
                                 wrap_with_ddp=False)[0]

        self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.model = Float16Module(module=self.model,
                                       precision=cfg.precision)

        if self.trainer.precision == 32:
            self.autocast_dtype = torch.float
        elif self.trainer.precision == 16:
            self.autocast_dtype = torch.half
        elif self.trainer.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')

        # configuration used for inference
        self._inference_config = None
Example #16
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(cfg, trainer=trainer, no_lm_init=False)
        self.cfg = cfg

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_lm_loss_buffer = []
        self._reduced_sop_loss_buffer = []

        num_tokentypes = 2 if cfg.bert_binary_head else 0

        self.model = BertModel(
            vocab_size=self.padded_vocab_size,
            hidden_size=cfg.hidden_size,
            max_position_embeddings=cfg.max_position_embeddings,
            num_layers=cfg.num_layers,
            num_attention_heads=cfg.num_attention_heads,
            apply_query_key_layer_scaling=cfg.get(
                'apply_query_key_layer_scaling', True),
            kv_channels=cfg.get('kv_channels', None),
            ffn_hidden_size=cfg.ffn_hidden_size,
            num_tokentypes=num_tokentypes,
            parallel_output=True,
            pre_process=cfg.get('pre_process', True),
            post_process=cfg.get('post_process', True),
            init_method_std=cfg.get('init_method_std', 0.02),
            fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
            use_cpu_initialization=cfg.get('use_cpu_initialization', False),
            hidden_dropout=cfg.get('hidden_dropout', 0.1),
            precision=cfg.get('precision', 16),
            fp32_residual_connection=cfg.get('fp32_residual_connection',
                                             False),
            activations_checkpoint_method=cfg.get(
                'activations_checkpoint_method', None),
            activations_checkpoint_num_layers=cfg.get(
                'activations_checkpoint_num_layers', 1),
            layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
            masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
            bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
            onnx_safe=cfg.get('onnx_safe', False),
            add_binary_head=cfg.bert_binary_head,
            megatron_legacy=cfg.get('megatron_legacy', False),
        )
        # not using amp o2
        self.megatron_amp_o2 = False
Example #17
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(cfg, trainer=trainer)
        self.cfg = cfg

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self.cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []
        self._reduced_lm_loss_buffer = []
        self._reduced_sop_loss_buffer = []

        # not saved as part of nemo model graph but required during export to ONNX
        input_names = ['input_ids', 'attention_mask', 'token_type_ids']

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            seed=self.cfg.get('seed', 1234),
        )

        self.tokenizer = get_nmt_tokenizer(
            library=self.cfg.tokenizer.library,
            model_name=self.cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer.model",
                                                   self.cfg.tokenizer.model),
            vocab_file=self.register_artifact("tokenizer.vocab_file",
                                              self.cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("tokenizer.merge_file",
                                               self.cfg.tokenizer.merge_file),
        )

        vocab_size = self.tokenizer.vocab_size

        padded_vocab_size = self._vocab_size_with_padding(
            orig_vocab_size=vocab_size,
            make_vocab_size_divisible_by=cfg.get(
                'make_vocab_size_divisible_by', 128),
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
        )

        num_tokentypes = 2 if cfg.bert_binary_head else 0

        self.model = BertModel(
            vocab_size=padded_vocab_size,
            hidden_size=cfg.hidden_size,
            max_position_embeddings=cfg.max_position_embeddings,
            num_layers=cfg.num_layers,
            num_attention_heads=cfg.num_attention_heads,
            apply_query_key_layer_scaling=cfg.get(
                'apply_query_key_layer_scaling', True),
            kv_channels=cfg.get('kv_channels', None),
            ffn_hidden_size=cfg.ffn_hidden_size,
            num_tokentypes=num_tokentypes,
            parallel_output=True,
            pre_process=cfg.get('pre_process', True),
            post_process=cfg.get('post_process', True),
            init_method_std=cfg.get('init_method_std', 0.02),
            fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
            use_cpu_initialization=cfg.get('use_cpu_initialization', False),
            hidden_dropout=cfg.get('hidden_dropout', 0.1),
            precision=cfg.get('precision', 16),
            fp32_residual_connection=cfg.get('fp32_residual_connection',
                                             False),
            activations_checkpoint_method=cfg.get(
                'activations_checkpoint_method', None),
            activations_checkpoint_num_layers=cfg.get(
                'activations_checkpoint_num_layers', 1),
            layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
            masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
            bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
            onnx_safe=cfg.get('onnx_safe', False),
            add_binary_head=cfg.bert_binary_head,
            megatron_legacy=cfg.get('megatron_legacy', False),
        )
Example #18
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer)

        self.cfg = cfg

        # Load pretrained GPT model and tokenizer
        if cfg.get('language_model_path', None):
            self.frozen_model = MegatronGPTModel.restore_from(
                cfg.get('language_model_path'),
                trainer=trainer,
                save_restore_connector=NLPSaveRestoreConnector(),
            )

        # Freeze all GPT model weights for prompt-tuning/p-tuning
        self.frozen_model.freeze()
        self.tokenizer = self.frozen_model.tokenizer
        self.float_type = self.frozen_model.model.language_model.encoder.layers[
            0].dtype
        self.hidden_size = self.frozen_model.cfg.hidden_size
        self.word_embeddings = self.frozen_model.model.language_model.embedding.word_embeddings
        self.existing_tasks = list(self.cfg.get('existing_tasks', []))
        self.new_tasks = list(self.cfg.get('new_tasks', []))

        # Load templates for assigning virtual prompt token positions
        self.load_task_templates(self.cfg.task_templates)

        # Prompt table stores all task embeddings, p-tuning virtual prompts get added to the table after training
        self.prompt_table = PromptTable(
            existing_tasks=self.existing_tasks,
            task_templates=self.task_templates,
            task_id_num_to_name=self.task_id_num_to_name,
            hidden_size=self.hidden_size,
        )
        self._prompt_table_key = VirtualPromptSource.PROMPT_TABLE.value
        self._prompt_encoder_key = VirtualPromptSource.PROMPT_ENCODER.value

        # Prepare pseudo token ids for virtual prompt tokens
        self.pseudo_tokens = get_pseudo_tokens(self.max_virtual_tokens)
        self.tokenizer.add_special_tokens(
            {'additional_special_tokens': self.pseudo_tokens})
        self.pseudo_token_ids = self.tokenizer.tokens_to_ids(
            self.pseudo_tokens)
        self.pseudo_token_ids_start = self.pseudo_token_ids[0]
        self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
        self.virtual_prompt_style = VirtualPromptStyle(
            cfg.virtual_prompt_style)

        # Prompt tuning stores virtual prompts in the prompt table and tunes their weight directly
        if self.virtual_prompt_style in [
                VirtualPromptStyle.PROMPT_TUNING, VirtualPromptStyle.INFERENCE
        ]:
            self.virtual_prompt_source = VirtualPromptSource.PROMPT_TABLE

        # P-Tuning uses an LSTM Encoder to produce virtual token embeddings
        elif self.virtual_prompt_style == VirtualPromptStyle.P_TUNING:
            self.virtual_prompt_source = VirtualPromptSource.PROMPT_ENCODER
        else:
            raise ValueError(
                f"\nvirtual prompt style '{cfg.virtual_prompt_style}' not recognized, please use one of 'prompt-tuning' or 'p-tuning'"
            )

        self._reduced_loss_buffer = []
        self._inference_config = None

        if self.trainer.precision == 32:
            self.autocast_dtype = torch.float
        elif self.trainer.precision == 16:
            self.autocast_dtype = torch.half
        elif self.trainer.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')
        # use the default PyTorch Lightning gradient clipping from the base model
        self.grad_clip_pl_default = True
        # megatron amp O2 is not supported
        self.megatron_amp_o2 = False
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)

        # build tokenizer (defaults to nemo supported tokenizers)
        self._build_tokenizer()

        # manipulate vocabulary (e.g., pad vocabulary for better efficiency)
        self._build_vocab()

        # TODO: create get_encoder_decoder_model() here for different losses (e.g., nll, vae, mim)
        self.enc_dec_model = MegatronTokenLevelEncoderDecoderModule(
            encoder_arch=cfg.encoder_arch,
            decoder_arch=cfg.decoder_arch,
            vocab_size=self.padded_vocab_size,
            hidden_size=cfg.hidden_size,
            max_position_embeddings=cfg.max_position_embeddings,
            num_layers=cfg.num_layers,
            num_attention_heads=cfg.num_attention_heads,
            apply_query_key_layer_scaling=cfg.get('apply_query_key_layer_scaling', True),
            kv_channels=cfg.get('kv_channels', None),
            ffn_hidden_size=cfg.ffn_hidden_size,
            num_tokentypes=0,
            parallel_output=True,
            pre_process=cfg.get('pre_process', True),
            post_process=cfg.get('post_process', True),
            init_method_std=cfg.get('init_method_std', 0.02),
            fp16_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
            use_cpu_initialization=cfg.get('use_cpu_initialization', False),
            hidden_dropout=cfg.get('hidden_dropout', 0.1),
            attention_dropout=cfg.get('attention_dropout', 0.1),
            precision=cfg.get('precision', 16),
            fp32_residual_connection=cfg.get('fp32_residual_connection', False),
            activations_checkpoint_method=cfg.get('activations_checkpoint_method', None),
            activations_checkpoint_num_layers=cfg.get('activations_checkpoint_num_layers', 1),
            layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
            persist_layer_norm=cfg.get('persist_layer_norm', False),
            bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
            masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
            onnx_safe=cfg.get('onnx_safe', False),
            activation=cfg.get('activation', 'gelu'),
        )

        self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.enc_dec_model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.enc_dec_model = Float16Module(module=self.enc_dec_model, precision=cfg.precision)
Example #20
class Component(Device):
    """customize phidl.Device

    Allow name to be set like Component('arc') or Component(name = 'arc')

    - get/write JSON metadata
    - get ports by type (optical, electrical ...)
    - set data_analysis and test_protocols

    Args:
        name: component_name


    Properties:
        info: includes
            full: full list of settings that create the function
            changed: changed settings
            default: includes the default signature of the component
            - derived properties
            - external metadata (test_protocol, docs, ...)
            - simulation_settings
            - function_name
            - name: for the component
            - name_long: for the component


    """
    def __init__(self, name: str = "Unnamed", *args, **kwargs) -> None:

        self.__ports__ = {}
        self.aliases = {}
        self.uid = str(uuid.uuid4())[:8]
        if "with_uuid" in kwargs or name == "Unnamed":
            name += "_" + self.uid

        super(Component, self).__init__(name=name, exclude_from_current=True)
        self.info = DictConfig(self.info)
        self.name = name  # overwrite PHIDL's incremental naming convention
        self.name_long = None

    @classmethod
    def __get_validators__(cls):
        yield cls.validate

    @classmethod
    def validate(cls, v):
        """pydantic assumes component is valid if:
        - name characters < MAX_NAME_LENGTH
        - is not empty (has references or polygons)
        """
        assert isinstance(v, Component)
        assert (len(v.name) <= MAX_NAME_LENGTH
                ), f"name `{v.name}` {len(v.name)} > {MAX_NAME_LENGTH} "
        # assert v.references or v.polygons, f"No references or  polygons in {v.name}"
        return v

    @property
    def ports_layer(self) -> Dict[str, str]:
        """Returns a mapping from layer0_layer1_E0: portName"""
        return map_ports_layer_to_orientation(self.ports)

    def port_by_orientation_cw(self, key: str, **kwargs):
        """Returns port by indexing them clockwise"""
        m = map_ports_to_orientation_cw(self.ports, **kwargs)
        if key not in m:
            raise KeyError(f"{key} not in {list(m.keys())}")
        key2 = m[key]
        return self.ports[key2]

    def port_by_orientation_ccw(self, key: str, **kwargs):
        """Returns port by indexing them counter-clockwise"""
        m = map_ports_to_orientation_ccw(self.ports, **kwargs)
        if key not in m:
            raise KeyError(f"{key} not in {list(m.keys())}")
        key2 = m[key]
        return self.ports[key2]

    def get_ports_xsize(self, **kwargs) -> float:
        """Returns the x-distance from east to west ports

        Args:
            kwargs: orientation, port_type, layer
        """
        ports_cw = self.get_ports_list(clockwise=True, **kwargs)
        ports_ccw = self.get_ports_list(clockwise=False, **kwargs)
        return snap_to_grid(ports_ccw[0].x - ports_cw[0].x)

    def get_ports_ysize(self, **kwargs) -> float:
        """Returns the y-distance from east to west ports"""
        ports_cw = self.get_ports_list(clockwise=True, **kwargs)
        ports_ccw = self.get_ports_list(clockwise=False, **kwargs)
        return snap_to_grid(ports_ccw[0].y - ports_cw[0].y)

    def plot_netlist(self,
                     with_labels: bool = True,
                     font_weight: str = "normal") -> nx.Graph:
        """plots a netlist graph with networkx
        https://networkx.github.io/documentation/stable/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html

        Args:
            with_labels: label nodes
            font_weight: normal, bold
        """
        netlist = self.get_netlist()
        connections = netlist["connections"]
        placements = netlist["placements"]

        G = nx.Graph()

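        # Edges connect instance names; the trailing ",port" field is stripped from each connection key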
        G.add_edges_from([(",".join(k.split(",")[:-1]),
                           ",".join(v.split(",")[:-1]))
                          for k, v in connections.items()])

        pos = {k: (v["x"], v["y"]) for k, v in placements.items()}
        labels = {k: ",".join(k.split(",")[:1]) for k in placements.keys()}
        nx.draw(
            G,
            with_labels=with_labels,
            font_weight=font_weight,
            labels=labels,
            pos=pos,
        )
        return G

    def get_netlist_yaml(self) -> str:
        """Return YAML netlist."""
        return OmegaConf.to_yaml(self.get_netlist())

    def write_netlist(self,
                      filepath: str,
                      full_settings: bool = False) -> None:
        """Write netlist in YAML"""
        netlist = self.get_netlist(full_settings=full_settings)
        OmegaConf.save(netlist, filepath)

    def write_netlist_dot(self, filepath: Optional[str] = None) -> None:
        """Write netlist graph in DOT format."""
        from networkx.drawing.nx_agraph import write_dot

        filepath = filepath or f"{self.name}.dot"

        G = self.plot_netlist()
        write_dot(G, filepath)

    def get_netlist(self, full_settings: bool = False) -> Any:
        """Returns netlist dict(instances, placements, connections, ports)

        instances = {instances}
        placements = {instance_name,uid,x,y: dict(x=0, y=0, rotation=90), ...}
        connections = {instance_name_src_x_y,portName: instance_name_dst_x_y,portName}
        ports: {portName: instance_name,portName}

        Args:
            full_settings: exports all info, when false only settings_changed
        """
        from gdsfactory.get_netlist import get_netlist

        return get_netlist(component=self, full_settings=full_settings)

    def get_name_long(self) -> str:
        """returns the long name if it's been truncated to MAX_NAME_LENGTH"""
        if self.name_long:
            return self.name_long
        else:
            return self.name

    def get_parent_name(self) -> str:
        """Returns the original parent name for hierarchical components;
        for non-hierarchical components it returns the component's own name.
        """
        return self.info.get("parent_name", self.name)

    def assert_ports_on_grid(self, nm: int = 1) -> None:
        """Asserts that all ports are on grid."""
        for port in self.ports.values():
            port.assert_on_grid(nm=nm)

    def get_ports_dict(self, **kwargs) -> Dict[str, Port]:
        """Returns a dict of ports.

        Args:
            layer: port GDS layer
            prefix: for example "E" for east, "W" for west ...
        """
        return select_ports(self.ports, **kwargs)

    def get_ports_list(self, **kwargs) -> List[Port]:
        """Returns a list of ports.

        Args:
            layer: port GDS layer
            prefix: for example "E" for east, "W" for west ...
            orientation: angle in degrees for the port
        """
        return list(select_ports(self.ports, **kwargs).values())

    def get_ports_array(self) -> Dict[str, ndarray]:
        """returns ports as a dict of np arrays"""
        ports_array = {
            port_name: np.array([
                port.x,
                port.y,
                int(port.orientation),
                port.width,
                port.layer[0],
                port.layer[1],
            ])
            for port_name, port in self.ports.items()
        }
        return ports_array

    def ref(
        self,
        position: Coordinate = (0, 0),
        port_id: Optional[str] = None,
        rotation: int = 0,
        h_mirror: bool = False,
        v_mirror: bool = False,
    ) -> ComponentReference:
        """Returns Component reference."""
        _ref = ComponentReference(self)

        if port_id and port_id not in self.ports:
            raise ValueError(f"port {port_id} not in {self.ports.keys()}")

        if port_id:
            origin = self.ports[port_id].position
        else:
            origin = (0, 0)

        if h_mirror:
            _ref.reflect_h(port_id)

        if v_mirror:
            _ref.reflect_v(port_id)

        if rotation != 0:
            _ref.rotate(rotation, origin)
        _ref.move(origin, position)

        return _ref

    def ref_center(self, position=(0, 0)):
        """returns a reference of the component centered at (x=0, y=0)"""
        si = self.size_info
        yc = si.south + si.height / 2
        xc = si.west + si.width / 2
        center = (xc, yc)
        _ref = ComponentReference(self)
        _ref.move(center, position)
        return _ref

    def __repr__(self) -> str:
        return f"{self.name}: uid {self.uid}, ports {list(self.ports.keys())}, aliases {list(self.aliases.keys())}, {len(self.polygons)} polygons, {len(self.references)} references"

    @property
    def pprint(self) -> None:
        """Prints component info."""
        print(OmegaConf.to_yaml(self.info))

    @property
    def pprint_ports(self) -> None:
        """Prints component ports."""
        ports_list = self.get_ports_list()
        for port in ports_list:
            print(port)

    @property
    def info_child(self) -> DictConfig:
        """Returns info from child if any, otherwise returns its info"""
        info = self.info

        while info.get("child"):
            info = info.get("child")

        return info

    def add_port(
        self,
        name: Optional[Union[str, int, object]] = None,
        midpoint: Tuple[float, float] = (
            0.0,
            0.0,
        ),
        width: float = 1.0,
        orientation: int = 45,
        port: Optional[Port] = None,
        layer: Tuple[int, int] = (1, 0),
        port_type: str = "optical",
        cross_section: Optional[CrossSection] = None,
    ) -> Port:
        """Can be called to copy an existing port like add_port(port = existing_port) or
        to create a new port add_port(myname, mymidpoint, mywidth, myorientation).
        Can also be called to copy an existing port
        with a new name add_port(port = existing_port, name = new_name)"""

        if port:
            if not isinstance(port, Port):
                raise ValueError(f"add_port() needs a Port, got {type(port)}")
            p = port.copy(new_uid=True)
            if name is not None:
                p.name = name
            p.parent = self

        elif isinstance(name, Port):
            p = name.copy(new_uid=True)
            p.parent = self
            name = p.name
        else:
            half_width = width / 2
            half_width_correct = snap_to_grid(half_width, nm=1)
            if not np.isclose(half_width, half_width_correct):
                warnings.warn(
                    f"port width = {width} will create off-grid points.\n"
                    f"You can fix it by changing width to {2*half_width_correct}\n"
                    f"port {name}, {midpoint}  {orientation} deg",
                    stacklevel=3,
                )
            p = Port(
                name=name,
                midpoint=(snap_to_grid(midpoint[0]),
                          snap_to_grid(midpoint[1])),
                width=snap_to_grid(width),
                orientation=orientation,
                parent=self,
                layer=layer,
                port_type=port_type,
                cross_section=cross_section,
            )
        if name is not None:
            p.name = name
        if p.name in self.ports:
            raise ValueError(
                f"add_port() Port name {p.name} exists in {self.name}")

        self.ports[p.name] = p
        return p

    def add_ports(self,
                  ports: Union[List[Port], Dict[str, Port]],
                  prefix: str = ""):
        ports = ports if isinstance(ports, list) else ports.values()
        for port in ports:
            name = f"{prefix}{port.name}" if prefix else port.name
            self.add_port(name=name, port=port)

    def snap_ports_to_grid(self, nm: int = 1) -> None:
        for port in self.ports.values():
            port.snap_to_grid(nm=nm)

    def remove_layers(
        self,
        layers: Union[List[Tuple[int, int]], Tuple[int, int]] = (),
        include_labels: bool = True,
        invert_selection: bool = False,
        recursive: bool = True,
    ) -> Device:
        """Remove a list of layers."""
        layers = [_parse_layer(layer) for layer in layers]
        all_D = list(self.get_dependencies(recursive))
        all_D += [self]
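        # Filter polygons (and optionally labels) in this component and all of its dependencies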
        for D in all_D:
            for polygonset in D.polygons:
                polygon_layers = zip(polygonset.layers, polygonset.datatypes)
                polygons_to_keep = [(pl in layers) for pl in polygon_layers]
                if not invert_selection:
                    polygons_to_keep = [(not p) for p in polygons_to_keep]
                polygonset.polygons = [
                    p for p, keep in zip(polygonset.polygons, polygons_to_keep)
                    if keep
                ]
                polygonset.layers = [
                    p for p, keep in zip(polygonset.layers, polygons_to_keep)
                    if keep
                ]
                polygonset.datatypes = [
                    p
                    for p, keep in zip(polygonset.datatypes, polygons_to_keep)
                    if keep
                ]

            if include_labels:
                new_labels = []
                for label in D.labels:
                    original_layer = (label.layer, label.texttype)
                    original_layer = _parse_layer(original_layer)
                    if invert_selection:
                        keep_layer = original_layer in layers
                    else:
                        keep_layer = original_layer not in layers
                    if keep_layer:
                        new_labels += [label]
                D.labels = new_labels
        return self

    def extract(
        self,
        layers: Union[List[Tuple[int, int]], Tuple[int, int]] = (),
    ) -> Device:
        """Extract polygons from a Component.
        adapted from phidl.geometry.
        """
        from gdsfactory.name import clean_value

        component = Component(f"{self.name}_{clean_value(layers)}")
        if type(layers) not in (list, tuple):
            raise ValueError("layers needs to be a list or tuple")
        poly_dict = self.get_polygons(by_spec=True)
        parsed_layer_list = [_parse_layer(layer) for layer in layers]
        for layer, polys in poly_dict.items():
            if _parse_layer(layer) in parsed_layer_list:
                component.add_polygon(polys, layer=layer)
        return component
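
    # Usage sketch (illustrative, not part of the original module):
    #
    #   import gdsfactory as gf
    #   c = gf.components.straight()
    #   metal_only = c.extract(layers=[(1, 0)])   # new Component holding only layer (1, 0)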

    def copy(self) -> Device:
        return copy(self)

    def copy_child_info(self, component) -> None:
        """Copy info from another component.
        great for hiearchical components that propagate child cells info.
        """
        self.info.child = component.info

    @property
    def size_info(self) -> SizeInfo:
        """size info of the component"""
        # if self.__size_info__ == None:
        # self.__size_info__  = SizeInfo(self.bbox)
        return SizeInfo(self.bbox)  # self.__size_info__

    def add_ref(self,
                D: Device,
                alias: Optional[str] = None) -> "ComponentReference":
        """Takes a Component and adds it as a ComponentReference to the current
        Device."""
        if not isinstance(D, (Component, Device)):
            raise TypeError(
                f"Component.add_ref() type = {type(D)} needs to be a Component."
            )
        ref = ComponentReference(D)  # create a ComponentReference (CellReference)
        self.add(ref)  # add the reference to this Component (Cell)

        if alias is not None:
            self.aliases[alias] = ref
        return ref
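
    # Usage sketch (illustrative, not part of the original module):
    #
    #   import gdsfactory as gf
    #   c = gf.Component("top")
    #   ref = c.add_ref(gf.components.straight(), alias="wg1")
    #   ref.movex(10)   # move the reference; moving the Component itself raises MutabilityError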

    def get_layers(
            self) -> Union[Set[Tuple[int, int]], Set[Tuple[int64, int64]]]:
        """returns a set of (layer, datatype)

        .. code ::

            import gdsfactory as gf
            gf.components.straight().get_layers() == {(1, 0), (111, 0)}

        """
        layers = set()
        for element in itertools.chain(self.polygons, self.paths):
            for layer, datatype in zip(element.layers, element.datatypes):
                layers.add((layer, datatype))
        for reference in self.references:
            for layer, datatype in reference.ref_cell.get_layers():
                layers.add((layer, datatype))
        for label in self.labels:
            layers.add((label.layer, 0))
        return layers

    def _repr_html_(self):
        """Print component, show geometry in matplotlib and in klayout
        when using jupyter notebooks
        """
        self.show(show_ports=False)
        self.plot()
        return self.__str__()

    def plot(
        self,
        clear_cache: bool = False,
    ) -> None:
        """Plot component in matplotlib"""
        from phidl import quickplot as plot

        from gdsfactory.cell import clear_cache as clear_cache_function

        plot(self)
        if clear_cache:
            clear_cache_function()

    def show(
        self,
        show_ports: bool = True,
        show_subports: bool = False,
        clear_cache: bool = False,
    ) -> None:
        """Show component in klayout

        If show_subports is True, pins are added to a new component that
        contains a reference to the original one, so the original component
        is not modified.

        Args:
            show_ports: shows component with port markers and labels
            show_subports: add ports markers and labels to component references
            clear_cache: after showing component clears cache (useful for jupyter)
        """
        from gdsfactory.add_pins import add_pins_container, add_pins_to_references
        from gdsfactory.show import show

        if show_subports:
            component = add_pins_to_references(component=self)
            component.name = self.name + "_show_subports"

        elif show_ports:
            component = add_pins_container(component=self)
            component.name = self.name + "_show_ports"
        else:
            component = self

        show(component, clear_cache=clear_cache)
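
    # Usage sketch (illustrative, not part of the original module):
    #
    #   import gdsfactory as gf
    #   c = gf.components.straight()
    #   c.show()                      # stream to klayout with port markers
    #   c.show(show_subports=True)    # also add pins to component references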

    def plotqt(self):
        from phidl.quickplotter import quickplot2

        quickplot2(self)

    def write_gds(
        self,
        gdspath: Optional[PathType] = None,
        gdsdir: PathType = tmp,
        unit: float = 1e-6,
        precision: float = 1e-9,
        auto_rename: bool = False,
        timestamp: Optional[datetime.datetime] = _timestamp2019,
    ) -> Path:
        """Write component to GDS and returs gdspath

        Args:
            component: gf.Component.
            gdspath: GDS file path to write to.
            unit unit size for objects in library.
            precision: for the dimensions of the objects in the library (m).
            remove_previous_markers: clear previous ones to avoid duplicates.
            auto_rename: If True, fixes any duplicate cell names.
            timestamp: datetime object or boolean
                Sets the GDSII timestamp. Default = 2019-10-25 07:36:32.827300
                If None, defaults to Now.

        Returns:
            gdspath
        """
        gdsdir = pathlib.Path(gdsdir)
        gdspath = gdspath or gdsdir / (self.name + ".gds")
        gdspath = pathlib.Path(gdspath)
        gdsdir = gdspath.parent
        gdsdir.mkdir(exist_ok=True, parents=True)

        cells = self.get_dependencies()
        cell_names = [cell.name for cell in list(cells)]
        cell_names_unique = set(cell_names)

        if len(cell_names) != len(cell_names_unique):
            # remove one occurrence of each unique name so only duplicated names remain
            for cell_name in cell_names_unique:
                cell_names.remove(cell_name)

            cell_names_duplicated = "\n".join(set(cell_names))
            raise ValueError(
                f"Duplicated cell names in {self.name}:\n{cell_names_duplicated}"
            )

        referenced_cells = list(self.get_dependencies(recursive=True))
        all_cells = [self] + referenced_cells

        lib = gdspy.GdsLibrary(unit=unit, precision=precision)
        lib.write_gds(gdspath, cells=all_cells, timestamp=timestamp)
        self.path = gdspath
        return gdspath
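
    # Usage sketch (illustrative, not part of the original module):
    #
    #   import gdsfactory as gf
    #   c = gf.components.straight()
    #   gdspath = c.write_gds("straight.gds")        # explicit path
    #   gdspath = c.write_gds(gdsdir="build/gds")    # <name>.gds inside gdsdir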

    def write_gds_with_metadata(self, *args, **kwargs) -> Path:
        """Write component in GDS and metadata (component settings) in YAML"""
        gdspath = self.write_gds(*args, **kwargs)
        metadata = gdspath.with_suffix(".yml")
        metadata.write_text(self.to_yaml)
        return gdspath

    @property
    def to_dict_config(self) -> DictConfig:
        """Returns a DictConfig representation of the compoment."""
        d = DictConfig({})
        ports = {port.name: port.settings for port in self.get_ports_list()}
        clean_dict(ports)

        d.ports = ports
        d.info = self.info
        d.version = 1
        d.cells = recurse_structures(self)
        return OmegaConf.create(d)

    @property
    def to_dict(self) -> Dict:
        """Returns a dict representation of the component."""
        return OmegaConf.to_container(self.to_dict_config)

    @property
    def to_yaml(self) -> str:
        """Returns a YAML string representation of the component."""
        return OmegaConf.to_yaml(self.to_dict)
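
    # Usage sketch (illustrative, not part of the original module): the properties
    # above expose the same component settings as DictConfig, dict and YAML.
    #
    #   import gdsfactory as gf
    #   c = gf.components.straight()
    #   cfg = c.to_dict_config   # omegaconf.DictConfig
    #   d = c.to_dict            # plain python containers
    #   print(c.to_yaml)         # YAML string, also written by write_gds_with_metadata()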

    @property
    def to_dict_polygons(self) -> DictConfig:
        """Returns a dict representation of the flattened compoment."""
        d = DictConfig({})
        polygons = {}
        layer_to_polygons = self.get_polygons(by_spec=True)

        for layer, polygons_layer in layer_to_polygons.items():
            layer_name = f"{layer[0]}_{layer[1]}"
            for polygon in polygons_layer:
                # accumulate every polygon instead of overwriting the layer entry
                polygons.setdefault(layer_name, []).append(
                    [tuple(snap_to_grid(v)) for v in polygon])

        ports = {port.name: port.settings for port in self.get_ports_list()}
        clean_dict(ports)
        clean_dict(polygons)
        d.info = self.info
        d.polygons = polygons
        d.ports = ports
        return OmegaConf.create(d)

    def auto_rename_ports(self, **kwargs) -> None:
        auto_rename_ports(self, **kwargs)

    def auto_rename_ports_counter_clockwise(self, **kwargs) -> None:
        auto_rename_ports_counter_clockwise(self, **kwargs)

    def auto_rename_ports_layer_orientation(self, **kwargs) -> None:
        auto_rename_ports_layer_orientation(self, **kwargs)

    def auto_rename_ports_orientation(self, **kwargs) -> None:
        auto_rename_ports_orientation(self, **kwargs)

    def move(self, *args, **kwargs):
        raise MutabilityError(
            "Don't move Components. Create a reference and move the reference instead."
        )

    def rotate(self, angle: int = 90):
        """Returns a new component with a rotated reference to the original component

        Args:
            angle: in degrees
        """
        from gdsfactory.rotate import rotate

        return rotate(component=self, angle=angle)
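
    # Usage sketch (illustrative, not part of the original module):
    #
    #   import gdsfactory as gf
    #   c = gf.components.straight()
    #   c90 = c.rotate(angle=90)   # new component wrapping a rotated reference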