Example 1
    def get_enc_dec_tokenizers(
        encoder_tokenizer_name=None,
        encoder_tokenizer_model=None,
        encoder_bpe_dropout=0.0,
        encoder_model_name=None,
        decoder_tokenizer_name=None,
        decoder_tokenizer_model=None,
        decoder_bpe_dropout=0.0,
        decoder_model_name=None,
    ):

        # if encoder_tokenizer_name != 'yttm' or decoder_tokenizer_name != 'yttm':
        #     raise NotImplementedError(f"Currently we only support yttm tokenizer.")

        encoder_tokenizer = get_nmt_tokenizer(
            library=encoder_tokenizer_name,
            model_name=encoder_model_name,
            tokenizer_model=encoder_tokenizer_model,
            bpe_dropout=encoder_bpe_dropout,
        )
        decoder_tokenizer = get_nmt_tokenizer(
            library=decoder_tokenizer_name,
            model_name=decoder_model_name,
            tokenizer_model=decoder_tokenizer_model,
            bpe_dropout=decoder_bpe_dropout,
        )

        return encoder_tokenizer, decoder_tokenizer
Example 2
    def test_init_prompt_learning_dataset(self):
        tokenizer = get_nmt_tokenizer(library='megatron',
                                      model_name='GPT2BPETokenizer')
        task_templates = get_task_templates()
        dataset_path = create_temp_dataset()

        # Setup virtual token place holders
        total_virtual_tokens = 10
        pseudo_tokens = get_pseudo_tokens(total_virtual_tokens)
        tokenizer.add_special_tokens(
            {'additional_special_tokens': pseudo_tokens})

        dataset = get_prompt_tuning_dataset(
            dataset_path,
            tokenizer,
            VirtualPromptSource.PROMPT_TABLE,
            task_templates,
            pseudo_tokens,
        )

        dataset = get_prompt_tuning_dataset(
            dataset_path,
            tokenizer,
            VirtualPromptSource.PROMPT_ENCODER,
            task_templates,
            pseudo_tokens,
        )

        print(type(dataset))

        assert isinstance(dataset, Dataset)

        os.remove(dataset_path)
Example 3
    def initializer(self):
        # Use Encoder class as a container for global data
        Encoder.tokenizer = get_nmt_tokenizer(
            library=self.args.tokenizer_library,
            model_name=self.args.tokenizer_type,
            tokenizer_model=self.args.tokenizer_model,
            vocab_file=self.args.vocab_file,
            merges_file=self.args.merge_file,
            delimiter=self.args.delimiter,
        )
        if self.args.split_sentences:
            if not nltk_available:
                print("NLTK is not available to split sentences.")
                exit()
            splitter = nltk.load("tokenizers/punkt/english.pickle")
            if self.args.keep_newlines:
                # this prevents punkt from eating newlines after sentences
                Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
                    train_text=splitter._params,
                    lang_vars=CustomLanguageVars())
            else:
                Encoder.splitter = splitter

        else:
            Encoder.splitter = IdentitySplitter()
Example 4
def main():
    args = get_args()
    startup_start = time.time()

    print("Opening", args.input)
    fin = open(args.input, 'r', encoding='utf-8')

    if nltk_available and args.split_sentences:
        nltk.download("punkt", quiet=True)

    encoder = Encoder(args)

    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
    )
    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
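    # 25 is the imap chunksize: the number of input lines handed to each worker task at a time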
    encoded_docs = pool.imap(encoder.encode, fin, 25)
    # encoded_docs = map(encoder.encode, fin)

    level = "document"
    if args.split_sentences:
        level = "sentence"

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
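    # one indexed-dataset builder (a .bin/.idx pair) per JSON key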
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, key, level)
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, key, level)
        builders[key] = indexed_dataset.make_builder(
            output_bin_files[key], impl=args.dataset_impl, vocab_size=tokenizer.vocab_size
        )

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
        total_bytes_processed += bytes_processed
        for key, sentences in doc.items():
            if len(sentences) == 0:
                continue
            for sentence in sentences:
                builders[key].add_item(torch.IntTensor(sentence))
            builders[key].end_document()
        if i % args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed / elapsed / 1024 / 1024
            print(f"Processed {i} documents", f"({i/elapsed} docs/s, {mbs} MB/s).", file=sys.stderr)

    for key in args.json_keys:
        builders[key].finalize(output_idx_files[key])
Example 5
    def test_prompt_learning_dataset_collate_fn_prompt_encoder(self):
        tokenizer = get_nmt_tokenizer(library='megatron',
                                      model_name='GPT2BPETokenizer')
        task_templates = get_task_templates()
        dataset_path = create_temp_dataset()

        # Setup virtual token place holders
        total_virtual_tokens = 10
        pseudo_tokens = get_pseudo_tokens(total_virtual_tokens)
        tokenizer.add_special_tokens(
            {'additional_special_tokens': pseudo_tokens})

        dataset = get_prompt_tuning_dataset(
            dataset_path,
            tokenizer,
            VirtualPromptSource.PROMPT_ENCODER,
            task_templates,
            pseudo_tokens,
        )

        batch = [dataset[i] for i in range(8)]
        batch = dataset.collate_fn(batch)

        assert len(batch) == 6

        _, _, _, _, _, taskname_ids = batch

        assert list(
            taskname_ids[0].numpy()) == tokenizer.text_to_ids("task name A")

        os.remove(dataset_path)
Example 6
    def setup_enc_dec_tokenizers(
        self,
        encoder_tokenizer_library=None,
        encoder_tokenizer_model=None,
        encoder_bpe_dropout=0.0,
        encoder_model_name=None,
        encoder_r2l=False,
        encoder_tokenizer_vocab_file=None,
        decoder_tokenizer_library=None,
        decoder_tokenizer_model=None,
        decoder_bpe_dropout=0.0,
        decoder_model_name=None,
        decoder_r2l=False,
    ):

        supported_tokenizers = [
            'yttm', 'huggingface', 'sentencepiece', 'megatron', 'byte-level'
        ]
        if (encoder_tokenizer_library not in supported_tokenizers
                or decoder_tokenizer_library not in supported_tokenizers):
            raise NotImplementedError(
                f"Currently we only support tokenizers in {supported_tokenizers}."
            )

        self.encoder_tokenizer = get_nmt_tokenizer(
            library=encoder_tokenizer_library,
            tokenizer_model=self.register_artifact(
                "encoder_tokenizer.tokenizer_model", encoder_tokenizer_model),
            bpe_dropout=encoder_bpe_dropout,
            model_name=encoder_model_name,
            vocab_file=self.register_artifact("encoder_tokenizer.vocab_file",
                                              encoder_tokenizer_vocab_file),
            special_tokens=None,
            use_fast=False,
            r2l=encoder_r2l,
        )
        self.decoder_tokenizer = get_nmt_tokenizer(
            library=decoder_tokenizer_library,
            tokenizer_model=self.register_artifact(
                "decoder_tokenizer.tokenizer_model", decoder_tokenizer_model),
            bpe_dropout=decoder_bpe_dropout,
            model_name=decoder_model_name,
            vocab_file=None,
            special_tokens=None,
            use_fast=False,
            r2l=decoder_r2l,
        )
Example 7
    def test_init_prompt_tuning_dataset(self):
        tokenizer = get_nmt_tokenizer(library='huggingface', model_name='gpt2')
        dataset_path = create_temp_dataset()
        num_prompt_tokens = 10

        dataset = get_prompt_tuning_dataset(tokenizer, dataset_path, num_prompt_tokens)

        print(type(dataset))

        assert isinstance(dataset, Dataset)

        os.remove(dataset_path)
Example 8
def get_tokenizer(args):
    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
        delimiter=args.delimiter,
    )
    # add a pad token if the tokenizer does not define a usable one
    if not hasattr(tokenizer, "pad_id") or tokenizer.pad_id is None or tokenizer.pad_id < 0:
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
    return tokenizer
Example 9
    def _build_tokenizer(self):
        """
        Default tokenizer is based on available nemo tokenizers.
        Override this method to use an external tokenizer.
        All tokenizers are expected to provide compatible interface.
        Override default Encoder-decoder tokenizer to use legacy=True for sentencepiece.
        """
        self.tokenizer = get_nmt_tokenizer(
            library=self._cfg.tokenizer.library,
            model_name=self._cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer.model", self._cfg.tokenizer.model),
            vocab_file=self.register_artifact("tokenizer.vocab_file", self._cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("tokenizer.merge_file", self._cfg.tokenizer.merge_file),
            legacy=True if self._cfg.tokenizer.library == 'sentencepiece' else False,
        )
Example 10
    def test_prompt_tuning_dataset_collate_fn(self):
        tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer')
        dataset_path = create_temp_dataset()
        num_prompt_tokens = 10

        dataset = get_prompt_tuning_dataset(tokenizer, dataset_path, num_prompt_tokens)
        batch = [dataset[i] for i in range(8)]
        batch = dataset.collate_fn(batch)

        assert len(batch) == 6

        tokens, labels, prompt_tags, attention_mask, loss_mask, text_position_ids = batch

        assert len(tokens) == len(loss_mask) == len(attention_mask) == len(text_position_ids)
        assert len(tokens) == len(prompt_tags)
        assert len(tokens[0]) + num_prompt_tokens == len(loss_mask[0])
        assert len(tokens[0]) + num_prompt_tokens == attention_mask[0].size()[-1]

        os.remove(dataset_path)
Example 11
    def _build_tokenizer(self):
        self.tokenizer = get_nmt_tokenizer(
            library=self._cfg.tokenizer.library,
            model_name=self._cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer.model",
                                                   self._cfg.tokenizer.model),
            vocab_file=self.register_artifact("tokenizer.vocab_file",
                                              self._cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("tokenizer.merge_file",
                                               self._cfg.tokenizer.merge_file),
            delimiter=self.cfg.tokenizer.get('delimiter', None),
            legacy=False,
        )

        # add pad special token
        if (not hasattr(self.tokenizer, "pad_id")
                or self.tokenizer.pad_id is None
                or self.tokenizer.pad_id < 0):
            self.tokenizer.add_special_tokens({'pad_token': '<pad>'})
Example 12
    def test_prompt_learning_dataset_collate_fn_prompt_table(self):
        tokenizer = get_nmt_tokenizer(library='megatron',
                                      model_name='GPT2BPETokenizer')
        task_templates = get_task_templates()
        dataset_path = create_temp_dataset()

        # Setup virtual token place holders
        pseudo_token_base = 'PROMPT_'
        total_virtual_tokens = 10
        pseudo_tokens = [
            pseudo_token_base + str(i) for i in range(total_virtual_tokens)
        ]
        tokenizer.add_special_tokens(
            {'additional_special_tokens': pseudo_tokens})

        dataset = get_prompt_tuning_dataset(
            dataset_path,
            tokenizer,
            'prompt-table',
            task_templates,
            pseudo_tokens,
        )

        batch = [dataset[i] for i in range(8)]
        batch = dataset.collate_fn(batch)

        assert len(batch) == 6

        input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids = batch

        assert len(input_ids) == len(loss_mask) == len(attention_mask) == len(
            position_ids)
        assert len(input_ids) == len(taskname_ids)
        assert len(labels) == len(input_ids)
        assert len(labels[0]) == len(loss_mask[0])
        assert len(input_ids[0]) == attention_mask[0].size()[-1]
        assert len(taskname_ids.shape) == 1
        assert taskname_ids[0] == 0

        os.remove(dataset_path)
Example 13
    def test_init_prompt_learning_dataset(self):
        tokenizer = get_nmt_tokenizer(library='megatron',
                                      model_name='GPT2BPETokenizer')
        task_templates = get_task_templates()
        dataset_path = create_temp_dataset()

        # Setup virtual token place holders
        pseudo_token_base = 'PROMPT_'
        max_virtual_tokens = 10
        pseudo_tokens = [
            pseudo_token_base + str(i) for i in range(max_virtual_tokens)
        ]
        tokenizer.add_special_tokens(
            {'additional_special_tokens': pseudo_tokens})

        dataset = get_prompt_tuning_dataset(
            dataset_path,
            tokenizer,
            'prompt-table',
            task_templates,
            pseudo_tokens,
        )

        dataset = get_prompt_tuning_dataset(
            dataset_path,
            tokenizer,
            'prompt-encoder',
            task_templates,
            pseudo_tokens,
        )

        print(type(dataset))

        assert isinstance(dataset, Dataset)

        os.remove(dataset_path)
Example 14
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        super().__init__(cfg, trainer=trainer)
        self.cfg = cfg

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self.cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []
        self._reduced_lm_loss_buffer = []
        self._reduced_sop_loss_buffer = []

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            seed=self.cfg.get('seed', 1234),
        )

        self.tokenizer = get_nmt_tokenizer(
            library=self.cfg.tokenizer.library,
            model_name=self.cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer_model",
                                                   self.cfg.tokenizer.model),
            vocab_file=self.register_artifact("vocab_file",
                                              self.cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("merges_file",
                                               self.cfg.tokenizer.merge_file),
        )

        vocab_size = self.tokenizer.vocab_size

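        # pad the vocab size so the embedding table can be split evenly across tensor-parallel ranks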
        padded_vocab_size = self._vocab_size_with_padding(
            orig_vocab_size=vocab_size,
            make_vocab_size_divisible_by=cfg.get(
                'make_vocab_size_divisible_by', 128),
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
        )

        num_tokentypes = 2 if cfg.bert_binary_head else 0

        self.model = BertModel(
            vocab_size=padded_vocab_size,
            hidden_size=cfg.hidden_size,
            max_position_embeddings=cfg.max_position_embeddings,
            num_layers=cfg.num_layers,
            num_attention_heads=cfg.num_attention_heads,
            apply_query_key_layer_scaling=cfg.get(
                'apply_query_key_layer_scaling', True),
            kv_channels=cfg.get('kv_channels', None),
            ffn_hidden_size=cfg.ffn_hidden_size,
            num_tokentypes=num_tokentypes,
            parallel_output=True,
            pre_process=cfg.get('pre_process', True),
            post_process=cfg.get('post_process', True),
            init_method_std=cfg.get('init_method_std', 0.02),
            fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
            use_cpu_initialization=cfg.get('use_cpu_initialization', False),
            hidden_dropout=cfg.get('hidden_dropout', 0.1),
            precision=cfg.get('precision', 16),
            fp32_residual_connection=cfg.get('fp32_residual_connection',
                                             False),
            activations_checkpoint_method=cfg.get(
                'activations_checkpoint_method', None),
            activations_checkpoint_num_layers=cfg.get(
                'activations_checkpoint_num_layers', 1),
            layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
            onnx_safe=cfg.get('onnx_safe', False),
            add_binary_head=cfg.bert_binary_head,
        )
Example 15
def init_tokenizer(library, tokenizer_model):
    tokenizer = get_nmt_tokenizer(library=library,
                                  tokenizer_model=tokenizer_model)
    worker_data["tokenizer"] = tokenizer
Example 16
                       help='Path to the vocab file')
    group.add_argument('--merge-file',
                       type=str,
                       default=None,
                       help='Path to the BPE merge file (if necessary).')
    group.add_argument('--delimiter',
                       type=str,
                       default=None,
                       help='delimiter used for tabular tokenizer')

    args = parser.parse_args()

    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
        delimiter=args.delimiter,
    )

    data_ds = MMapRetrievalIndexedDataset(args.input_data_prefix)
    retrieval_ds = MMapRetrievalIndexedDataset(args.input_retrieval_prefix)
    knn_index = KNNIndex(args.knn_index)
    assert knn_index.len == data_ds.chunks
    logging.info(f'Data index has {data_ds.chunks} chunks')
    logging.info(f'Retrieval Data index has {retrieval_ds.chunks} chunks')
    logging.info(f'KNN index has {knn_index.K} neighbors')
    assert knn_index.knn_map.max() < retrieval_ds.chunks
    assert data_ds._index.chunk_size == retrieval_ds._index.chunk_size

    for chunk_id in args.chunk_ids:
Example 17
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        super().__init__(cfg, trainer=trainer)
        self.cfg = cfg

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self.cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        # buffer used during train_step for logging average loss over gradient accumulation steps
        self._reduced_loss_buffer = []
        self._reduced_lm_loss_buffer = []
        self._reduced_sop_loss_buffer = []

        # not saved as part of nemo model graph but required during export to ONNX
        input_names = ['input_ids', 'attention_mask', 'token_type_ids']

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            seed=self.cfg.get('seed', 1234),
        )

        self.tokenizer = get_nmt_tokenizer(
            library=self.cfg.tokenizer.library,
            model_name=self.cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer.model",
                                                   self.cfg.tokenizer.model),
            vocab_file=self.register_artifact("tokenizer.vocab_file",
                                              self.cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("tokenizer.merge_file",
                                               self.cfg.tokenizer.merge_file),
        )

        vocab_size = self.tokenizer.vocab_size

        padded_vocab_size = self._vocab_size_with_padding(
            orig_vocab_size=vocab_size,
            make_vocab_size_divisible_by=cfg.get(
                'make_vocab_size_divisible_by', 128),
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
        )

        num_tokentypes = 2 if cfg.bert_binary_head else 0

        self.model = BertModel(
            vocab_size=padded_vocab_size,
            hidden_size=cfg.hidden_size,
            max_position_embeddings=cfg.max_position_embeddings,
            num_layers=cfg.num_layers,
            num_attention_heads=cfg.num_attention_heads,
            apply_query_key_layer_scaling=cfg.get(
                'apply_query_key_layer_scaling', True),
            kv_channels=cfg.get('kv_channels', None),
            ffn_hidden_size=cfg.ffn_hidden_size,
            num_tokentypes=num_tokentypes,
            parallel_output=True,
            pre_process=cfg.get('pre_process', True),
            post_process=cfg.get('post_process', True),
            init_method_std=cfg.get('init_method_std', 0.02),
            fp16_lm_cross_entropy=cfg.get('fp16_lm_cross_entropy', False),
            use_cpu_initialization=cfg.get('use_cpu_initialization', False),
            hidden_dropout=cfg.get('hidden_dropout', 0.1),
            precision=cfg.get('precision', 16),
            fp32_residual_connection=cfg.get('fp32_residual_connection',
                                             False),
            activations_checkpoint_method=cfg.get(
                'activations_checkpoint_method', None),
            activations_checkpoint_num_layers=cfg.get(
                'activations_checkpoint_num_layers', 1),
            layernorm_epsilon=cfg.get('layernorm_epsilon', 1e-5),
            masked_softmax_fusion=cfg.get('masked_softmax_fusion', True),
            bias_gelu_fusion=cfg.get('bias_gelu_fusion', True),
            onnx_safe=cfg.get('onnx_safe', False),
            add_binary_head=cfg.bert_binary_head,
            megatron_legacy=cfg.get('megatron_legacy', False),
        )
Example 18
    def __init__(self, cfg: DictConfig, trainer: Trainer):
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        # this prevents base constructor from initializing tokenizer
        self.tokenizer = None
        super().__init__(cfg, trainer=trainer, no_lm_init=True)

        self._validate_trainer()

        # used in NVIDIA NGC PyTorch containers
        self._enable_nvidia_optimizations()

        if self.cfg.get('use_cpu_initialization', False) is False:
            torch.cuda.set_device(trainer.local_rank)

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
            pipeline_model_parallel_size=cfg.get(
                'pipeline_model_parallel_size', 1),
            micro_batch_size=cfg.get('micro_batch_size'),
            global_batch_size=cfg.get('global_batch_size'),
            seed=self.cfg.get('seed', 1234),
            apex_transformer_log_level=self.cfg.get(
                'apex_transformer_log_level', 30),
        )

        self.tokenizer = get_nmt_tokenizer(
            library=self.cfg.tokenizer.library,
            model_name=self.cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer.model",
                                                   self.cfg.tokenizer.model),
            vocab_file=self.register_artifact("tokenizer.vocab_file",
                                              self.cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("tokenizer.merge_file",
                                               self.cfg.tokenizer.merge_file),
            delimiter=self.cfg.tokenizer.get('delimiter', None),
        )

        vocab_size = self.tokenizer.vocab_size

        self.padded_vocab_size = self._vocab_size_with_padding(
            orig_vocab_size=vocab_size,
            make_vocab_size_divisible_by=cfg.get(
                'make_vocab_size_divisible_by', 128),
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size',
                                               1),
        )

        # TODO: Not sure how to use lists of modules with PTL.
        # This means we can only use pipeline parallelism without the interleaved schedule.
        self.model = build_model(model_provider_func=self.model_provider_func,
                                 wrap_with_ddp=False)[0]

        self.setup_optimizer_param_groups()

        self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False)

        if self.megatron_amp_o2:

            # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type
            self.model.cuda(torch.cuda.current_device())

            # Model wrapper to convert both model and inputs to half precision
            self.model = Float16Module(module=self.model,
                                       precision=cfg.precision)

        if self.trainer.precision == 32:
            self.autocast_dtype = torch.float
        elif self.trainer.precision == 16:
            self.autocast_dtype = torch.half
        elif self.trainer.precision == 'bf16':
            self.autocast_dtype = torch.bfloat16
        else:
            raise ValueError('precision must be in [32, 16, "bf16"]')

        # configuration used for inference
        self._inference_config = None
Example 19
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
        """Initializes the PTune TextClassifier model."""
        super().__init__(cfg=cfg, trainer=trainer)

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
            seed=cfg.get('seed', 1234),
        )

        # shared params for dataset and data loaders
        self.dataset_cfg = cfg.dataset
        # tokenizer needs to get initialized before the super.__init__()
        # as dataloaders and datasets need it to process the data
        self.tokenizer = get_nmt_tokenizer(
            library=cfg.tokenizer.library,
            model_name=cfg.tokenizer.type,
            tokenizer_model=self.register_artifact("tokenizer.model", cfg.tokenizer.model),
            vocab_file=self.register_artifact("tokenizer.vocab_file", cfg.tokenizer.vocab_file),
            merges_file=self.register_artifact("tokenizer.merges_file", cfg.tokenizer.merge_file),
        )

        self.class_weights = None

        self.model = MegatronGPTModel.restore_from(
            self.register_artifact('language_model.nemo_file', cfg.language_model.get('nemo_file', None)),
            trainer=trainer,
        )

        if not cfg.use_lm_finetune:
            self.model.freeze()

        hidden_size = self.model.cfg.hidden_size

        # register the file containing the labels into the artifacts to get stored in the '.nemo' file later
        self.classes = cfg.dataset.classes

        self.embeddings = self.model.model.language_model.embedding.word_embeddings

        # set allowed vocab set
        self.vocab = self.tokenizer.tokenizer.get_vocab()

        # make sure classes are part of the vocab
        for k in cfg.dataset.classes:
            if token_wrapper(k) not in self.vocab:
                logging.error(f'class {k} is not part of the vocabulary. Please add it to your vocab')
        self.allowed_vocab_ids = set(self.vocab[token_wrapper(k)] for k in cfg.dataset.classes)

        # map from id to label
        self.allowed_vocab = {}
        self.label_ids = {}
        self.id_to_label = {}
        for i, k in enumerate(cfg.dataset.classes):
            self.allowed_vocab[self.vocab[token_wrapper(k)]] = i
            self.label_ids[k] = i
            self.id_to_label[i] = k

        self.template = cfg.prompt_encoder.template

        self.prompt_encoder = PromptEncoder(
            template=cfg.prompt_encoder.template,
            hidden_size=hidden_size,
            lstm_dropout=cfg.prompt_encoder.dropout,
            num_layers=cfg.prompt_encoder.num_layers,
        )

        # load prompt encoder
        self.hidden_size = hidden_size
        self.tokenizer.add_special_tokens({'additional_special_tokens': [cfg.pseudo_token]})

        self.pseudo_token_id = self.tokenizer.tokenizer.get_vocab()[cfg.pseudo_token]
        self.pad_token_id = (
            self.tokenizer.tokenizer.pad_token_id
            if self.tokenizer.tokenizer.pad_token_id is not None
            else self.tokenizer.tokenizer.unk_token_id
        )
        self.spell_length = sum(self.template)
Example 20
def main():
    args = get_args()
    startup_start = time.time()
    if args.preproc_folder:
        print('Searching folder for .json or .json.gz files...')
        assert os.path.exists(
            args.input), f'Folder does not exist: {args.input}'
        files_in_folder = os.listdir(args.input)
        json_files = [
            os.path.join(args.input, f) for f in files_in_folder
            if f.endswith('.json') or f.endswith('.json.gz')
        ]
        if len(json_files) == 0:
            raise FileNotFoundError(
                'No .json or .json.gz files found in folder.')
        else:
            print(f'Found {len(json_files)} .json or .json.gz files.')
    else:
        assert os.path.exists(args.input), f'File does not exist: {args.input}'
        json_files = [args.input]

    if nltk_available and args.split_sentences:
        nltk.download("punkt", quiet=True)

    encoder = Encoder(args)

    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        model_name=args.tokenizer_type,
        tokenizer_model=args.tokenizer_model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
    )

    level = "document"
    if args.split_sentences:
        level = "sentence"

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix, key,
                                                      level)
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix, key,
                                                      level)
        builders[key] = indexed_dataset.make_builder(
            output_bin_files[key],
            impl=args.dataset_impl,
            vocab_size=tokenizer.vocab_size)

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)

    for idx, json_file in enumerate(json_files):
        print(f'Processing file {json_file} {idx + 1}/{len(json_files)}')
        if json_file.endswith('.gz'):
            fin = gzip.open(json_file, 'r')
        else:
            fin = open(json_file, 'r', encoding='utf-8')

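        # stream-encode the open file with the worker pool (imap chunksize of 25 lines per task)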
        encoded_docs = pool.imap(encoder.encode, fin, 25)

        for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
            total_bytes_processed += bytes_processed
            for key, sentences in doc.items():
                if len(sentences) == 0:
                    continue
                for sentence in sentences:
                    builders[key].add_item(torch.IntTensor(sentence))
                builders[key].end_document()
            if i % args.log_interval == 0:
                current = time.time()
                elapsed = current - proc_start
                mbs = total_bytes_processed / elapsed / 1024 / 1024
                print(f"Processed {i} documents",
                      f"({i/elapsed} docs/s, {mbs} MB/s).",
                      file=sys.stderr)

    for key in args.json_keys:
        builders[key].finalize(output_idx_files[key])
Example 21
                        type=int,
                        default=-1,
                        help='Max number of lines to parse')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10000000,
                        help='Batch size to parse in parallel')
    parser.add_argument('--out_dir',
                        type=str,
                        default="",
                        help='Path to store data and plots')

    args = parser.parse_args()

    tokenizer = get_nmt_tokenizer(
        library=args.tokenizer_library,
        tokenizer_model=args.tokenizer_model,
    )

    all_len = []

    for fn in args.input_files:
        print(f"Parsing fn = {fn}")
        # read file
        fh = open(fn)

        # read all batches
        while True:
            lines = read_batch(fh, args.batch_size)

            # move to next file when no lines are read
            if not lines: