import torch
from transformers import BertConfig, BertLMHeadModel, BertTokenizer


def chat(folder_bert, voc, testing=False):
    torch.manual_seed(1)  # was tf.random.set_seed(1); BertLMHeadModel is a PyTorch model
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")  # Norwegian: "I think it is going to rain"
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))

    config = BertConfig.from_json_file(folder_bert + "/config.json")
    config.is_decoder = True  # needed for autoregressive generation with BertLMHeadModel
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while True:
        text = input(">>User: ")
        input_ids = tokenizer.encode(text, return_tensors="pt")
        # The original generation call was garbled in the source; the
        # sampling settings below are assumptions, not the author's values.
        sample_output = model.generate(
            input_ids, max_length=64, do_sample=True, top_k=50)
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
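A hypothetical invocation of the function above, assuming folder_bert holds the exported config.json plus weights and voc names the vocab file (both paths are placeholders, not from the source):

chat("./bert_chatbot", "/vocab.txt", testing=True)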
Example #2
    def __init__(
        self,
        SentenceEncoder,
        device,
        ContextEncoder,
        no_contextencoder_before_languagemodel=False,
    ):
        super().__init__()

        self.sentence_encoder = SentenceEncoder

        # Context Encoder
        if ContextEncoder == "GRUContextEncoder":
            self.context_encoder = GRUContextEncoder(input_size=768,
                                                     hidden_size=768)
        elif ContextEncoder == "PoolContextEncoder":
            self.context_encoder = PoolContextEncoder(input_size=768,
                                                      hidden_size=768)
        else:
            raise ValueError(f"Unknown ContextEncoder: {ContextEncoder}")

        self.decoder = BertLMHeadModel.from_pretrained(
            "bert-base-uncased",
            is_decoder=True,
            add_cross_attention=True,
            output_hidden_states=True,
        )

        self.mpp_classifier = nn.Linear(768, 5)

        self.device = device
        self.no_contextencoder_before_languagemodel = (
            no_contextencoder_before_languagemodel)
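The decoder above is loaded with is_decoder=True and add_cross_attention=True, which lets it attend over whatever the context encoder produces. A minimal self-contained sketch of that mechanism, with a random tensor standing in for the context encoder's output (the shapes are the only assumption):

import torch
from transformers import BertLMHeadModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
decoder = BertLMHeadModel.from_pretrained(
    "bert-base-uncased", is_decoder=True, add_cross_attention=True)

batch = tokenizer("hello there", return_tensors="pt")
context = torch.randn(1, 4, 768)  # toy stand-in: 4 context vectors of size 768
out = decoder(**batch, encoder_hidden_states=context)
print(out.logits.shape)  # (1, sequence_length, vocab_size)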
Example #3
    def __init__(self, latent_dim: int = 512):
        super(TextDecoder, self).__init__()

        # Tokenizer
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        # Decoder model
        config = BertConfig.from_pretrained("bert-base-uncased")
        config.is_decoder = True
        config.add_cross_attention = True

        self.decoder_model = BertLMHeadModel.from_pretrained(
            "bert-base-uncased", config=config)

        decoder_input_size = 768
        self.linear = nn.Linear(latent_dim, decoder_input_size)

        # Identifier to signal to the trainer to put the label in the decode call
        self.needs_labels = True
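The forward pass is not shown in this snippet, so the wiring below is an assumption: the latent code is projected by self.linear to BERT's hidden size and handed to the decoder as a one-step cross-attention memory, with labels producing the LM loss.

import torch

decoder = TextDecoder(latent_dim=512)
latent = torch.randn(2, 512)                  # batch of two latent codes
memory = decoder.linear(latent).unsqueeze(1)  # (2, 1, 768) cross-attention input
batch = decoder.tokenizer(["a cat", "a dog sits"],
                          return_tensors="pt", padding=True)
out = decoder.decoder_model(**batch,
                            encoder_hidden_states=memory,
                            labels=batch["input_ids"])
print(out.loss)  # causal LM loss over the target tokens

A real training loop would also mask padding positions in the labels; this sketch skips that for brevity.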
Example #4
    def from_pretrained(self, model_dir):
        self.encoder_config = BertConfig.from_pretrained(model_dir)
        # `args` is a module-level argparse namespace in the source project.
        self.tokenizer = BertTokenizer.from_pretrained(
            path.join(model_dir, 'tokenizer'),
            do_lower_case=args.do_lower_case)
        self.utt_encoder = BertForPreTraining.from_pretrained(
            path.join(model_dir, 'utt_encoder'))
        self.context_encoder = BertForSequenceClassification.from_pretrained(
            path.join(model_dir, 'context_encoder'))
        self.context_mlm_trans = BertPredictionHeadTransform(
            self.encoder_config)
        self.context_mlm_trans.load_state_dict(
            torch.load(path.join(model_dir, 'context_mlm_trans.pkl')))
        self.context_order_trans = SelfSorting(self.encoder_config.hidden_size)
        self.context_order_trans.load_state_dict(
            torch.load(path.join(model_dir, 'context_order_trans.pkl')))
        self.decoder_config = BertConfig.from_pretrained(model_dir)
        self.decoder = BertLMHeadModel.from_pretrained(
            path.join(model_dir, 'decoder'))
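For reference, the path.join calls above imply this on-disk layout for model_dir (directory names from the snippet; the .pkl files are plain torch.save state dicts):

model_dir/
    config.json               # BertConfig shared by encoder and decoder
    tokenizer/                # BertTokenizer files
    utt_encoder/              # BertForPreTraining checkpoint
    context_encoder/          # BertForSequenceClassification checkpoint
    context_mlm_trans.pkl     # BertPredictionHeadTransform state dict
    context_order_trans.pkl   # SelfSorting state dict
    decoder/                  # BertLMHeadModel checkpoint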
Example #5
    def __init__(self):
        # tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(BERT_TYPE)

        # special tokens
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens(
            {'additional_special_tokens': dataset_tokens})

        # chess tokens
        self.tokenizer.add_tokens(get_chess_tokens())

        # model
        self.configuration = BertConfig.from_pretrained(BERT_TYPE)
        self.configuration.is_decoder = True

        self.model = BertLMHeadModel.from_pretrained(
            BERT_TYPE, config=self.configuration).cuda()

        self.model.resize_token_embeddings(len(self.tokenizer))
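Because tokens are added in three separate calls, resize_token_embeddings must run after all of them; a quick sanity check (assuming m is an instance of the class above) is:

assert m.model.get_input_embeddings().num_embeddings == len(m.tokenizer)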
Example #6
    def get_encoder_decoder_models(self):
        encoder_model = BertModel.from_pretrained("bert-base-uncased")
        decoder_model = BertLMHeadModel.from_pretrained(
            "bert-base-uncased", config=self.get_decoder_config())
        return {"encoder": encoder_model, "decoder": decoder_model}
Example #7
top_10 = torch.topk(mask_word, 10, dim=1)[1][0]
for token in top_10:
    word = tokenizer.decode([token])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)

# get the top candidate word only
top_word = torch.argmax(mask_word, dim=1)
print(tokenizer.decode(top_word))

### Example 2: Language Modeling
print('### Example 2: Language Modeling')
# Language modeling: predict the best word to follow or continue a sentence,
# given all the words already in it.
model = BertLMHeadModel.from_pretrained(
    'bert-base-uncased',
    return_dict=True,
    # is_decoder=True lets the model run standalone and predict the next
    # best word in the sequence.
    is_decoder=True,
    cache_dir=os.getenv("cache_dir", "../../models"))

text = "A knife is very "
input = tokenizer.encode_plus(text, return_tensors="pt")
output = model(**input).logits[:, -1, :]
softmax = F.softmax(output, -1)
index = torch.argmax(softmax, dim=-1)
x = tokenizer.decode(index)
print(text + " " + x)

### Example 3: Next Sentence Prediction
print('### Example 3: Next Sentence Prediction')
# Next Sentence Prediction is the task of predicting whether one sentence follows another sentence.
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased',