Example #1
    def test_add_special_tokens(self):
        """Adding special tokens must grow the vocabulary by exactly their count."""
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")

        special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
        tokenizer.add_special_tokens(special_tokens)

        # assertEqual (not assertTrue on a comparison) so a failure reports
        # the actual vs. expected vocabulary sizes.
        self.assertEqual(tokenizer.vocab_size,
                         tokenizer.original_vocab_size + len(special_tokens))
Example #2
    def test_tokens_to_text(self):
        """Round-trip text -> tokens -> text must reproduce the input exactly."""
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")

        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        tokens = tokenizer.text_to_tokens(text)
        result = tokenizer.tokens_to_text(tokens)

        # assertEqual shows the diff between the two strings on failure,
        # unlike assertTrue(text == result).
        self.assertEqual(text, result)
Example #3
    def test_text_to_ids(self):
        """Each whitespace-separated token maps to one id; special tokens map to their reserved ids."""
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")

        special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
        tokenizer.add_special_tokens(special_tokens)

        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        ids = tokenizer.text_to_ids(text)

        # assertEqual instead of assertTrue(==) so failures report both values.
        self.assertEqual(len(ids), len(text.split()))
        self.assertEqual(ids.count(tokenizer.special_tokens["[CLS]"]), 1)
        self.assertEqual(ids.count(tokenizer.special_tokens["[MASK]"]), 1)
        self.assertEqual(ids.count(tokenizer.special_tokens["[SEP]"]), 2)
Example #4
    def test_ids_to_text(self):
        """Round-trip text -> ids -> text must reproduce the input exactly."""
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")

        special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
        tokenizer.add_special_tokens(special_tokens)

        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        ids = tokenizer.text_to_ids(text)
        result = tokenizer.ids_to_text(ids)

        # assertEqual shows the string diff on failure, unlike assertTrue(==).
        self.assertEqual(text, result)
Example #5
    def test_ids_to_tokens(self):
        """Round-trip tokens -> ids -> tokens must be the identity."""
        tokenizer = SentencePieceTokenizer("./tests/data/m_common.model")

        special_tokens = ["[CLS]", "[MASK]", "[SEP]"]
        tokenizer.add_special_tokens(special_tokens)

        text = "[CLS] a b c [MASK] e f [SEP] g h i [SEP]"
        tokens = tokenizer.text_to_tokens(text)
        ids = tokenizer.tokens_to_ids(tokens)
        result = tokenizer.ids_to_tokens(ids)

        # One assertEqual over the whole lists replaces the manual
        # len-check + index loop of opaque assertTrue calls: it checks
        # both length and every element, and pinpoints the first mismatch.
        self.assertEqual(result, tokens)
Example #6
                                   add_time_to_log_dir=True)
output_file = f'{nf.work_dir}/output.txt'

# Build the tokenizer and the BERT encoder: either a pretrained HuggingFace
# model (no checkpoint given) or a model rebuilt from a config file with
# weights restored from a user-supplied checkpoint.
if args.bert_checkpoint is None:
    # Standard pretrained BERT. To list available pretrained models, call:
    # nemo_nlp.huggingface.BERT.list_pretrained_models()
    tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
    pretrained_bert_model = nemo_nlp.huggingface.BERT(
        pretrained_model_name=args.pretrained_bert_model, factory=nf)
else:
    # A BERT model you pre-trained yourself; args.bert_checkpoint points at
    # your checkpoint file (e.g. BERT-STEP-150000.pt).
    tokenizer = SentencePieceTokenizer(model_path=tokenizer_model)
    tokenizer.add_special_tokens(["[MASK]", "[CLS]", "[SEP]"])

    # BUG FIX: bind the model to `pretrained_bert_model` — the name read by
    # restore_from() below and by the hidden_size lookup after the branch.
    # Previously this branch assigned `bert_model`, leaving
    # `pretrained_bert_model` undefined and raising NameError.
    pretrained_bert_model = nemo_nlp.huggingface.BERT(
        config_filename=args.bert_config, factory=nf)
    pretrained_bert_model.restore_from(args.bert_checkpoint)

hidden_size = pretrained_bert_model.local_parameters["hidden_size"]
ner_classifier = nemo_nlp.TokenClassifier(hidden_size=hidden_size,
                                          num_classes=args.num_classes,
                                          dropout=args.fc_dropout)
ner_loss = nemo_nlp.TokenClassificationLoss(num_classes=args.num_classes)


def create_pipeline(input_file,
                    max_seq_length=args.max_seq_length,
Example #7
# Instantiate Neural Factory with supported backend
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    local_rank=args.local_rank,
    optimization_level=optimization_level,
    placement=device)

# Either load a pretrained HuggingFace BERT (no checkpoint supplied) or
# rebuild the model from a config file and restore user-trained weights.
if args.bert_checkpoint is None:
    tokenizer = NemoBertTokenizer(args.pretrained_bert_model)

    bert_model = nemo_nlp.huggingface.BERT(
        pretrained_model_name=args.pretrained_bert_model,
        factory=neural_factory)
else:
    # NOTE(review): the tokenizer model path is hard-coded here while the
    # checkpoint/config come from args — confirm this is intentional.
    tokenizer = SentencePieceTokenizer(model_path="tokenizer.model")
    tokenizer.add_special_tokens(["[MASK]", "[CLS]", "[SEP]"])

    bert_model = nemo_nlp.huggingface.BERT(config_filename=args.bert_config,
                                           factory=neural_factory)
    bert_model.restore_from(args.bert_checkpoint)

# Round the vocabulary size up to the nearest multiple of 8
# (presumably for hardware-friendly embedding sizes — confirm).
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)

# Training pipeline
print("Loading training data...")
train_data_layer = nemo_nlp.BertNERDataLayer(
    tokenizer=tokenizer,
    path_to_data=os.path.join(args.data_dir, "train.txt"),
    max_seq_length=args.max_seq_length,
    is_training=True,