    def test_add_special_tokens(self):
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
        special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
        tokenizer.add_special_tokens(special_tokens)
        self.assertTrue(tokenizer.vocab_size ==
                        tokenizer.original_vocab_size + len(special_tokens))
    def test_tokens_to_text(self):
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        tokens = tokenizer.text_to_tokens(text)
        result = tokenizer.tokens_to_text(tokens)
        self.assertTrue(text == result)
    def test_text_to_ids(self):
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
        special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
        tokenizer.add_special_tokens(special_tokens)
        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        ids = tokenizer.text_to_ids(text)
        self.assertTrue(len(ids) == len(text.split()))
        self.assertTrue(ids.count(tokenizer.special_tokens["[CLS]"]) == 1)
        self.assertTrue(ids.count(tokenizer.special_tokens["[MASK]"]) == 1)
        self.assertTrue(ids.count(tokenizer.special_tokens["[SEP]"]) == 2)
    def test_ids_to_text(self):
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
        special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
        tokenizer.add_special_tokens(special_tokens)
        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        ids = tokenizer.text_to_ids(text)
        result = tokenizer.ids_to_text(ids)
        self.assertTrue(text == result)
    def test_ids_to_tokens(self):
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")
        special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
        tokenizer.add_special_tokens(special_tokens)
        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        tokens = tokenizer.text_to_tokens(text)
        ids = tokenizer.tokens_to_ids(tokens)
        result = tokenizer.ids_to_tokens(ids)
        self.assertTrue(len(result) == len(tokens))
        for i in range(len(result)):
            self.assertTrue(result[i] == tokens[i])
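The methods above reference `self`, so in the source file they sit inside a test-case class that is not shown in this excerpt. Below is a minimal harness sketch, assuming a standard `unittest` layout; the import path for `SentencePieceTokenizer` is an assumption and has moved between NeMo releases, so adjust it to match the installed version.

```python
import unittest

# Assumption: adjust this import to wherever the installed NeMo release
# exposes SentencePieceTokenizer (the module path has changed over time).
from nemo_nlp import SentencePieceTokenizer


class TestSentencePieceTokenizer(unittest.TestCase):
    # The test_* methods above go here; they only require the tokenizer
    # model shipped at ./tests/data/m_common.model.
    pass


if __name__ == "__main__":
    unittest.main()
```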
                                       add_time_to_log_dir=True)

output_file = f'{nf.work_dir}/output.txt'

if args.bert_checkpoint is None:
    """ Use this if you're using a standard BERT model.
    To see the list of pretrained models, call:
    nemo_nlp.huggingface.BERT.list_pretrained_models()
    """
    tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
    pretrained_bert_model = nemo_nlp.huggingface.BERT(
        pretrained_model_name=args.pretrained_bert_model, factory=nf)
else:
    """ Use this if you're using a BERT model that you pre-trained yourself.
    Replace BERT-STEP-150000.pt with the path to your checkpoint.
    """
    tokenizer = SentencePieceTokenizer(model_path=tokenizer_model)
    tokenizer.add_special_tokens(["[MASK]", "[CLS]", "[SEP]"])

    # Note: the restored model is named pretrained_bert_model in both
    # branches so that restore_from() and the hidden_size lookup below
    # resolve regardless of which branch was taken.
    pretrained_bert_model = nemo_nlp.huggingface.BERT(
        config_filename=args.bert_config, factory=nf)
    pretrained_bert_model.restore_from(args.bert_checkpoint)

hidden_size = pretrained_bert_model.local_parameters["hidden_size"]

ner_classifier = nemo_nlp.TokenClassifier(hidden_size=hidden_size,
                                          num_classes=args.num_classes,
                                          dropout=args.fc_dropout)
ner_loss = nemo_nlp.TokenClassificationLoss(num_classes=args.num_classes)


def create_pipeline(input_file,
                    max_seq_length=args.max_seq_length,
# Instantiate Neural Factory with supported backend
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    local_rank=args.local_rank,
    optimization_level=optimization_level,
    placement=device)

if args.bert_checkpoint is None:
    tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
    bert_model = nemo_nlp.huggingface.BERT(
        pretrained_model_name=args.pretrained_bert_model,
        factory=neural_factory)
else:
    tokenizer = SentencePieceTokenizer(model_path="tokenizer.model")
    tokenizer.add_special_tokens(["[MASK]", "[CLS]", "[SEP]"])
    bert_model = nemo_nlp.huggingface.BERT(config_filename=args.bert_config,
                                           factory=neural_factory)
    bert_model.restore_from(args.bert_checkpoint)

# Pad the vocabulary size up to a multiple of 8, which helps Tensor Core
# utilization in mixed-precision training.
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)

# Training pipeline
print("Loading training data...")
train_data_layer = nemo_nlp.BertNERDataLayer(
    tokenizer=tokenizer,
    path_to_data=os.path.join(args.data_dir, "train.txt"),
    max_seq_length=args.max_seq_length,
    is_training=True,