Code Example #1
    def test_tokenizer_from_pretrained(self):
        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
            # Download the pretrained vocab into a temporary cache directory,
            # then check that a tokenizer instance was actually created.
            tokenizer = GPT2Tokenizer.from_pretrained(model_name,
                                                      cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(tokenizer)
Code Example #2
def gpt2Tokenizer(*args, **kwargs):
    """
    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
    Peculiarities:
        - Byte-level BPE

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * gpt2
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying GPT-2 model's
             sequence length.
             Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
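
As a quick illustration of the keyword arguments documented above, the following sketch loads the tokenizer through torch.hub and round-trips a sentence. The special_tokens and max_len values are arbitrary examples, not required settings.

import torch

# Load the GPT-2 tokenizer via the hub entry point above, forwarding the
# documented keyword arguments (the chosen values are only illustrative).
tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2',
                           special_tokens=['<SEP>'], max_len=512)

text = "Who was Jim Henson ?"
indexed_tokens = tokenizer.encode(text)      # byte-level BPE token ids
print(tokenizer.decode(indexed_tokens))      # decodes back to the original string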
Code Example #3
    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = [
            "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "low",
            "er", "low", "lowest", "newer", "wider"
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
            fp.write(json.dumps(vocab_tokens))
            vocab_file = fp.name
        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
            fp.write("\n".join(merges))
            merges_file = fp.name

        tokenizer = GPT2Tokenizer(vocab_file,
                                  merges_file,
                                  special_tokens=["<unk>", "<pad>"])
        print("encoder", tokenizer.byte_encoder)
        os.remove(vocab_file)
        os.remove(merges_file)

        text = "lower"
        bpe_tokens = ["low", "er"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [13, 12, 16]
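        # Expected ids: "low" -> 13 (the later duplicate in `vocab` wins in dict(zip(...))),
        # "er" -> 12, and "<unk>" -> 16 (special tokens are numbered after the vocab entries).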
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens),
                             input_bpe_tokens)

        vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(
            vocab_path="/tmp/")
        tokenizer_2 = GPT2Tokenizer.from_pretrained("/tmp/")
        os.remove(vocab_file)
        os.remove(merges_file)
        os.remove(special_tokens_file)

        self.assertListEqual([
            tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
            tokenizer.special_tokens, tokenizer.special_tokens_decoder
        ], [
            tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
            tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder
        ])
Code Example #4
    def __init__(self, cache_size: int = 100) -> None:
        """
        Each cache element is about 8MB, so size accordingly.
        """
        # Cache stores tuples, so default value is a tuple
        self._cache = LRUCache(cache_size, default_value=(None, None))
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')

        # The end of text marker.
        self.END_OF_TEXT = self.tokenizer.encoder["<|endoftext|>"]
Code Example #5
    def __init__(self,
                 model_name: str = MEDIUM_MODEL,
                 cache_size: int = 0) -> None:
        """
        Each cache element is about 8MB, so size accordingly.
        """
        # No cache is created in this variant; cache_size is accepted but unused.
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        # The end of text marker.
        self.END_OF_TEXT = self.tokenizer.encoder["<|endoftext|>"]
Code Example #6
    def __init__(self,
                 model_name: str = MEDIUM_MODEL,
                 cache_size: int = 0) -> None:
        """
        Each cache element is about 8MB, so size accordingly.
        """
        # No cache is created in this variant; cache_size and model_name are accepted but unused.
        # self.tokenizer = T5Tokenizer.from_pretrained(MEDIUM_MODEL)
        # self.model = AutoModelForCausalLM.from_pretrained(MEDIUM_MODEL)

        self.tokenizer = GPT2Tokenizer.from_pretrained(MEDIUM_MODEL)
        self.model = GPT2LMHeadModel.from_pretrained(MEDIUM_MODEL)
Code Example #7
    def __init__(self, model_name: str = '117M', cache_size: int = 0) -> None:
        """
        Each cache element is about 8MB, so size accordingly.
        """
        # Cache stores tuples, so default value is a tuple
        self._cache = LRUCache(cache_size, default_value=(None, None))
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        if model_name == '117M':
            self.model = GPT2LMHeadModel.from_pretrained('gpt2')
        elif model_name == '345M':
            self.model = GPT2LMHeadModel.from_pretrained(MEDIUM_MODEL)
        else:
            raise ValueError(f"model name not found: {model_name}")

        # The end of text marker.
        self.END_OF_TEXT = self.tokenizer.encoder["<|endoftext|>"]
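
The END_OF_TEXT id stored above is the tokenizer's <|endoftext|> entry. As a rough illustration of how it could be used, here is a minimal sketch of a greedy-decoding helper; the method name generate_greedy is hypothetical, torch is assumed to be imported, and the model call follows the pytorch-pretrained-bert convention of returning the logits as the first output (as in Code Example #8).

    def generate_greedy(self, text: str, max_tokens: int = 20) -> str:
        # Hypothetical helper, not part of the original class: greedily append
        # the most likely next token until <|endoftext|> or max_tokens is reached.
        indexed_tokens = self.tokenizer.encode(text)
        self.model.eval()
        with torch.no_grad():
            for _ in range(max_tokens):
                logits = self.model(torch.tensor([indexed_tokens]))[0]
                next_id = torch.argmax(logits[0, -1, :]).item()
                if next_id == self.END_OF_TEXT:
                    break
                indexed_tokens.append(next_id)
        return self.tokenizer.decode(indexed_tokens)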
Code Example #8
def predict():
    text_return = ''
    if request.method == 'POST':
        post_text = request.form.to_dict()
        for k in post_text:

            print(k)
            # Load pre-trained model tokenizer (vocabulary)
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

            # Encode text input
            text = k

            indexed_tokens = tokenizer.encode(text)

            # Convert indexed tokens into a PyTorch tensor
            tokens_tensor = torch.tensor([indexed_tokens])

            # Load pre-trained model (weights)
            model = GPT2LMHeadModel.from_pretrained(
                'https://storage.googleapis.com/allennlp/models/gpt2-345M-dump'
            )
            #print(1)
            # Set the model in evaluation mode to deactivate the DropOut modules
            model.eval()

            # Running on CPU here; change 'cpu' to 'cuda' to use a GPU
            tokens_tensor = tokens_tensor.to('cpu')
            model.to('cpu')

            # Predict all tokens
            with torch.no_grad():
                outputs = model(tokens_tensor)
                predictions = outputs[0]

            # Get the predicted next sub-word
            predicted_index = torch.argmax(predictions[0, -1, :]).item()
            predicted_text = tokenizer.decode(indexed_tokens +
                                              [predicted_index])

            # Print the predicted word
            #print(predicted_text) #debugging
            original_fragment = text
            completed_phrase = predicted_text
            output_list = [
                li for li in difflib.ndiff(original_fragment, completed_phrase)
                if li[0] != ' '
            ]
            #print(output_list) #debugging
            output_list = [s.replace('+', '') for s in output_list[1:]]
            #print(output_list) #debugging
            output_word = ""
            for x in output_list:
                output_word += x
            #print(output_word)
            #output_word = output_word.strip(" ")
            #print(output_word)
            output_word = output_word.replace(" ", "")
            print(output_word)
            text_return = output_word

    return text_return
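
Since predicted_index is already known, the difflib post-processing above can be skipped by decoding just that one id. The sketch below shows the same next-token prediction with the pytorch-pretrained-bert package used throughout these examples, loading the standard 'gpt2' weights rather than the 345M dump URL from the snippet.

import torch
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel

# Same next-token prediction as above, but decode only the predicted id
# instead of diffing the completed phrase against the original text.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

indexed_tokens = tokenizer.encode("Who was Jim Henson ?")
with torch.no_grad():
    predictions = model(torch.tensor([indexed_tokens]))[0]
predicted_index = torch.argmax(predictions[0, -1, :]).item()
print(tokenizer.decode([predicted_index]).strip())   # just the predicted next word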