Example #1
def load_tokenizer(tknzr_file,
                   flag_tknzr_fast,
                   pad_token=None,
                   mask_token=None):
    """
    Interestingly, HuggingFace does not allow the base tokenizer to be called
    directly. That is a somewhat odd choice, so we wrap the trained tokenizer in
    PreTrainedTokenizerFast instead. Backed by the Rust implementation, it is
    faster than the base tokenizer class and also lets you call the tokenizer as
    tknzr('text to be tokenized').

    Input
        tknzr_file (str) : .json file of the tokenizer trained previously
        *_tokens (str)  : tokens that are to be used in the corresponding context
                            Some of them are not implemented yet...
    Output
        tknzr     : tokenizer as PreTrainedTokenizerFast class to be passed on
    """
    if flag_tknzr_fast:
        tknzr = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tknzr = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tknzr.pad_token = pad_token
    tknzr.mask_token = mask_token

    return tknzr
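A minimal usage sketch for the helper above, assuming a tokenizer .json file produced by an earlier training run (the file name and special tokens below are placeholders):

tknzr = load_tokenizer('tokenizer.json',   # hypothetical path to the trained tokenizer
                       flag_tknzr_fast=True,
                       pad_token='<pad>',
                       mask_token='<mask>')
encoded = tknzr('text to be tokenized')    # callable because of the fast wrapper
print(encoded['input_ids'])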
Example #2
    def __init__(self,
                 pretrained_path,
                 n_labels,
                 hidden_size=768,
                 dropout_p=0.2,
                 label_ignore_idx=0,
                 head_init_range=0.04,
                 device='cuda'):
        super().__init__()

        self.n_labels = n_labels

        self.linear_1 = nn.Linear(hidden_size, hidden_size)
        self.classification_head = nn.Linear(hidden_size, n_labels)

        self.label_ignore_idx = label_ignore_idx
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(pretrained_path, "tokenizer.json"))
        self.model = AutoModel.from_pretrained(pretrained_path)

        self.dropout = nn.Dropout(dropout_p)

        self.device = device

        # initializing classification head
        self.classification_head.weight.data.normal_(mean=0.0,
                                                     std=head_init_range)
class ChatDataset(Dataset):
    def __init__(self, filepath, tok_vocab, max_seq_len=128) -> None:
        self.filepath = filepath
        self.data = pd.read_csv(self.filepath)
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        self.max_seq_len = max_seq_len
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tok_vocab,
                                                 bos_token=self.bos_token,
                                                 eos_token=self.eos_token,
                                                 unk_token='<unk>',
                                                 pad_token='<pad>',
                                                 mask_token='<mask>')

    def __len__(self):
        return len(self.data)

    def make_input_id_mask(self, tokens, index):
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        if len(input_id) < self.max_seq_len:
            while len(input_id) < self.max_seq_len:
                input_id += [self.tokenizer.pad_token_id]
                attention_mask += [0]
        else:
            # logging.warning(f'exceed max_seq_len for given article : {index}')
            input_id = input_id[:self.max_seq_len -
                                1] + [self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return input_id, attention_mask

    def __getitem__(self, index):
        record = self.data.iloc[index]
        q, a = record['Q'], record['A']
        q_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(q) + [self.eos_token]
        a_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(a) + [self.eos_token]
        encoder_input_id, encoder_attention_mask = self.make_input_id_mask(
            q_tokens, index)
        decoder_input_id, decoder_attention_mask = self.make_input_id_mask(
            a_tokens, index)
        labels = self.tokenizer.convert_tokens_to_ids(
            a_tokens[1:(self.max_seq_len + 1)])
        if len(labels) < self.max_seq_len:
            while len(labels) < self.max_seq_len:
                # for cross entropy loss masking
                labels += [-100]
        return {
            'input_ids':
            np.array(encoder_input_id, dtype=np.int_),
            'attention_mask':
            np.array(encoder_attention_mask, dtype=np.float_),
            'decoder_input_ids':
            np.array(decoder_input_id, dtype=np.int_),
            'decoder_attention_mask':
            np.array(decoder_attention_mask, dtype=np.float_),
            'labels':
            np.array(labels, dtype=np.int_)
        }
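A minimal sketch of feeding the dataset above to a PyTorch DataLoader, assuming a CSV file with 'Q' and 'A' columns and a trained tokenizer file (both paths are placeholders):

from torch.utils.data import DataLoader

# hypothetical file names; the CSV must provide 'Q' and 'A' columns
dataset = ChatDataset('chat_data.csv', tok_vocab='tokenizer.json', max_seq_len=128)
loader = DataLoader(dataset, batch_size=16, shuffle=True)
batch = next(iter(loader))
print(batch['input_ids'].shape, batch['labels'].shape)  # (16, 128) each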
 def __init__(self, hparams, **kwargs):
     super(KoBARTConditionalGeneration, self).__init__(hparams, **kwargs)
     self.model = BartForConditionalGeneration.from_pretrained(self.hparams.model_path)
     self.model.train()
     self.bos_token = '<s>'
     self.eos_token = '</s>'
     self.tokenizer = PreTrainedTokenizerFast(
         tokenizer_file=os.path.join(self.hparams.tokenizer_path, 'model.json'),
         bos_token=self.bos_token, eos_token=self.eos_token, unk_token='<unk>', pad_token='<pad>', mask_token='<mask>')
 def __init__(self, filepath, tok_vocab, max_seq_len=128) -> None:
     self.filepath = filepath
     self.data = pd.read_csv(self.filepath) #encoding='cp949'
     self.bos_token = '<s>'
     self.eos_token = '</s>'
     self.max_seq_len = max_seq_len
     self.tokenizer = PreTrainedTokenizerFast(
         tokenizer_file=tok_vocab,
         bos_token=self.bos_token, eos_token=self.eos_token, unk_token='<unk>', pad_token='<pad>', mask_token='<mask>')
Example #6
class KoBARTConditionalGeneration(Base):
    def __init__(self, hparams, **kwargs):
        super(KoBARTConditionalGeneration, self).__init__(hparams, **kwargs)
        self.model = BartForConditionalGeneration.from_pretrained(
            self.hparams.model_path)
        self.model.train()
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=os.path.join(
            self.hparams.tokenizer_path, 'model.json'),
                                                 bos_token=self.bos_token,
                                                 eos_token=self.eos_token,
                                                 unk_token='<unk>',
                                                 pad_token='<pad>',
                                                 mask_token='<mask>')

    def forward(self, inputs):
        return self.model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            decoder_input_ids=inputs['decoder_input_ids'],
            decoder_attention_mask=inputs['decoder_attention_mask'],
            labels=inputs['labels'],
            return_dict=True)

    def training_step(self, batch, batch_idx):
        outs = self(batch)
        loss = outs.loss
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outs = self(batch)
        loss = outs['loss']
        return loss

    def validation_epoch_end(self, outputs):
        self.log('val_loss', torch.stack(outputs).mean(), prog_bar=True)

    def chat(self, text):
        input_ids = [
            self.tokenizer.bos_token_id
        ] + self.tokenizer.encode(text) + [self.tokenizer.eos_token_id]

        res_ids = self.model.generate(
            torch.tensor([input_ids]),
            max_length=self.hparams.max_seq_len,
            num_beams=5,
            eos_token_id=self.tokenizer.eos_token_id,
            bad_words_ids=[[self.tokenizer.unk_token_id]])
        a = self.tokenizer.batch_decode(res_ids.tolist())[0]
        return a.replace('<s>', '').replace('</s>', '').replace('<usr>', '')
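A usage sketch for the chat interface above, assuming the Base parent accepts a plain namespace exposing the model_path, tokenizer_path and max_seq_len fields referenced in __init__ (the paths are placeholders):

from argparse import Namespace

# hypothetical checkpoint location and tokenizer directory containing model.json
hparams = Namespace(model_path='kobart_ckpt',
                    tokenizer_path='kobart_tokenizer',
                    max_seq_len=128)
model = KoBARTConditionalGeneration(hparams)
model.model.eval()
print(model.chat('안녕하세요'))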
Example #7
def load_tokenizer(folder="."):
    folder = Path(folder)
    return PreTrainedTokenizerFast(
        WhitespaceTokenizer(str(folder / vocab_file)),
        pad_token="<pad>",
        mask_token="<mask>",
    )
Example #8
def _convert_examples_to_generation_features(
    examples: List[GenerationExample],
    tokenizer: PreTrainedTokenizerFast,
    args: GenerationTrainArguments,
):

    logger.info("tokenize sentences, it could take a lot of time...")
    start = time.time()
    batch_encoding = tokenizer(
        [example.text for example in examples],
        max_length=args.max_seq_length,
        padding="max_length",
        truncation=True,
    )
    logger.info("tokenize sentences [took %.3f s]", time.time() - start)

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = GenerationFeatures(**inputs,
                                     labels=batch_encoding["input_ids"][i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("sentence: %s" % (example.text))
        logger.info(
            "tokens: %s" %
            (" ".join(tokenizer.convert_ids_to_tokens(features[i].input_ids))))
        logger.info("features: %s" % features[i])

    return features
def ingest():
    """
    Every model from Hugging Face is applicable
    TODO: put url here
    Corpus example: squad | MedQA or FindZebra
    """
    typer.secho("Welcome to the ingest command", fg=typer.colors.WHITE, bold=True)

    model = BertModel.from_pretrained(Config['model'].get())
    fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(Config['tokenizer'].get())
    # fast_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    corpus = load_dataset(Config['corpus'].get(),
                          split='train[:100]')  # cache_dir=Config['cache_dir'].get() -- Cache directory override

    torch.set_grad_enabled(False)

    typer.secho("Embedding corpus as dense context vector representation using FAISS.")
    corpus_embeddings = corpus.map(
        lambda example: {
            'embeddings': model(**fast_tokenizer(example['line'], return_tensors='pt'))['pooler_output'][0].numpy()})
    # corpus_embeddings.save_to_disk(os.path.join(Config['cache_dir'].get(), "corpus/"))

    typer.secho("Adding FAISS index for efficient similarity search and clustering of dense vectors.")
    corpus_embeddings.add_faiss_index(column='embeddings')

    typer.secho("Saving the index")
    corpus_embeddings.save_faiss_index("embeddings", "corpus.faiss")  # os.path.join(Config['cache_dir'].get())
    return 0
Example #10
def get_kobart_tokenizer(cachedir='~/kobart/'):
    """Get KoGPT2 Tokenizer file path after downloading
    """
    global tokenizer
    model_info = tokenizer
    file_path, is_cached = download(model_info['url'],
                                    model_info['fname'],
                                    model_info['chksum'],
                                    cachedir=cachedir)
    cachedir_full = os.path.expanduser(cachedir)
    if not os.path.exists(os.path.join(cachedir_full,
                                       'emji_tokenizer')) or not is_cached:
        if not is_cached:
            shutil.rmtree(os.path.join(cachedir_full, 'emji_tokenizer'),
                          ignore_errors=True)
        zipf = ZipFile(os.path.expanduser(file_path))
        zipf.extractall(path=cachedir_full)
    tok_path = os.path.join(cachedir_full, 'emji_tokenizer/model.json')
    tokenizer_obj = PreTrainedTokenizerFast(tokenizer_file=tok_path,
                                            bos_token='<s>',
                                            eos_token='</s>',
                                            unk_token='<unk>',
                                            pad_token='<pad>',
                                            mask_token='<mask>')
    return tokenizer_obj
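A short usage sketch for the download helper above; the default cache directory is used and the sentence is an arbitrary example:

tokenizer = get_kobart_tokenizer()
encoded = tokenizer('안녕하세요', return_tensors='pt')
print(encoded['input_ids'])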
Example #11
    def __init__(
        self,
        examples: List[SequenceClassificationExample],
        tokenizer: PreTrainedTokenizerFast,
        label_to_id: Dict[str, int],
        tokens_per_batch: int = 32,
    ):
        self.features: List[InputFeatures] = []
        self.examples: List[SequenceClassificationExample] = examples
        texts: StrList = [ex.text for ex in self.examples]
        labels: StrList = [ex.label for ex in self.examples]

        # tokenize text into subwords with padding and truncation
        self.encodings: List[BatchEncoding] = [
            tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=tokens_per_batch,
                return_token_type_ids=False,
                padding="max_length",
                return_attention_mask=True,
                return_tensors="np",
                truncation=True,
            ) for text in texts
        ]

        # register features
        self.features = [
            InputFeatures(
                input_ids=encoding.input_ids.flatten().tolist(),
                attention_mask=encoding.attention_mask.flatten().tolist(),
                label_ids=[label_to_id.get(label, 0)],
            ) for encoding, label in zip(self.encodings, labels)
        ]
        self._n_features = len(self.features)
Example #12
def get_adjusted_lengths(
    sentences: Sentences,
    tokenizer: PreTrainedTokenizerFast,
    max_sequence_length,
) -> Tuple[int, ...]:
    """Return adjusted lengths based on a tokenizer and model max length."""
    encodings = [tokenizer.encode_plus(" ".join(sentence), return_offsets_mapping=True) for sentence in sentences]
    # Create end-token masks: [CLS] Hauk ur er [SEP] -> [dropped, 0, 1, 1, dropped]
    # By getting initial token masks and shifting them:
    # [CLS] Hauk ur er [SEP] -> [0, 1, 0, 1, 0] ->
    # -> drop [mid shifted to left] + [1] drop
    # -> [_, 0, 1, 1, _]
    end_token_masks = [get_initial_token_mask(encoded["offset_mapping"])[2:-1] + [1] for encoded in encodings]
    # We need to account for two special tokens (SEP and CLS) or (<s> and </s>) when finding the cuts
    max_sequence_length -= 2
    # And some extra, because of errors
    max_sequence_length -= 6
    lengths = []
    for end_token_mask in end_token_masks:
        while len(end_token_mask) != 0:
            prefix, end_token_mask = (
                end_token_mask[:max_sequence_length],
                end_token_mask[max_sequence_length:],
            )
            length = sum(prefix)
            lengths.append(length)

    return tuple(int(length) for length in lengths)
Example #13
def main(args):
    data = np.load(args.data, allow_pickle=True)
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
Example #14
    def __init__(self,
                 equations=None,
                 images=None,
                 tokenizer=None,
                 shuffle=True,
                 batchsize=16,
                 max_dimensions=(1024, 512),
                 pad=False,
                 keep_smaller_batches=False,
                 test=False):
        """Generates a torch dataset from pairs of `equations` and `images`.

        Args:
            equations (str, optional): Path to equations. Defaults to None.
            images (str, optional): Directory where images are saved. Defaults to None.
            tokenizer (str, optional): Path to saved tokenizer. Defaults to None.
            shuffle (bool, optional): Defaults to True.
            batchsize (int, optional): Defaults to 16.
            max_dimensions (tuple(int, int), optional): Maximal dimensions the model can handle
            pad (bool): Pad the images to `max_dimensions`. Defaults to False.
            keep_smaller_batches (bool): Whether to also return batches with smaller size than `batchsize`. Defaults to False.
            test (bool): Whether to use the test transformation or not. Defaults to False.
        """

        if images is not None and equations is not None:
            assert tokenizer is not None
            self.images = [
                path.replace('\\', '/')
                for path in glob.glob(join(images, '*.png'))
            ]
            self.sample_size = len(self.images)
            eqs = open(equations, 'r').read().split('\n')
            self.indices = [
                int(os.path.basename(img).split('.')[0]) for img in self.images
            ]
            self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer)
            self.shuffle = shuffle
            self.batchsize = batchsize
            self.max_dimensions = max_dimensions
            self.pad = pad
            self.keep_smaller_batches = keep_smaller_batches
            self.test = test
            self.data = defaultdict(lambda: [])
            # check the image dimension for every image and group them together
            try:
                for i, im in tqdm(enumerate(self.images),
                                  total=len(self.images)):
                    width, height = imagesize.get(im)
                    if width <= max_dimensions[0] and height <= max_dimensions[
                            1]:
                        self.data[(width, height)].append(
                            (eqs[self.indices[i]], im))
            except KeyboardInterrupt:
                pass
            self.data = dict(self.data)
            self._get_size()

            iter(self)
    def setUp(self):
        self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
        super().setUp()
        self.test_rust_tokenizer = True

        self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})]

        tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")
        tokenizer.save_pretrained(self.tmpdirname)
Example #16
 def __init__(self, bot):
     self.bot = bot
     self.model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
     self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
         "skt/kogpt2-base-v2",
         bos_token='</s>',
         eos_token='</s>',
         unk_token='<unk>',
         pad_token='<pad>',
         mask_token='<mask>')
def get_kobart_tokenizer():
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")

    tokenizer.pad_token = "<pad>"
    tokenizer.bos_token = "<s>"
    tokenizer.eos_token = "</s>"
    tokenizer.unk_token = "<unk>"
    tokenizer.mask_token = "<mask>"

    return tokenizer
Example #18
 def __init__(self, model: str, device: str):
     config = BartConfig.from_pretrained("hyunwoongko/kobart")
     self.model = BartForConditionalGeneration(config).half().eval().to(
         device)
     self.model.model.load_state_dict(torch.load(
         model,
         map_location=device,
     ))
     self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
         "hyunwoongko/kobart")
     self.device = device
    def setUp(self):
        self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
        super().setUp()
        self.test_rust_tokenizer = True

        model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]

        # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
        self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]

        tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
        tokenizer.save_pretrained(self.tmpdirname)
Example #20
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")

    model = BartForConditionalGeneration.from_pretrained(
        args.finetuned_model_path)
    model.eval()
    model.to(device)

    examples = [
        "배고프다", "너무너무 사랑해요", "나는 너를 좋아해", "저의 취미는 축구입니다", "어제 무슨 영화 봤어?",
        "짜장면 짬뽕 탕수육 먹었어"
    ]

    for example in examples:
        chosung_example = convert_text_to_chosung(example)

        input_ids = (torch.tensor(
            tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(chosung_example))).unsqueeze(0).to(device))

        if args.decoding_method == "top_p":
            outputs = model.generate(
                input_ids=input_ids,
                max_length=48,
                temperature=1.0,
                do_sample=True,
                top_p=0.8,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                decoder_start_token_id=tokenizer.bos_token_id,
                num_return_sequences=5,
            )
        elif args.decoding_method == "beam_search":
            outputs = model.generate(
                input_ids=input_ids,
                max_length=48,
                num_beams=10,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                decoder_start_token_id=tokenizer.bos_token_id,
                num_return_sequences=5,
            )
        else:
            raise ValueError(
                "Enter the right decoding method (top_p or beam_search)")

        for output in outputs.tolist():
            answer = tokenizer.decode(output)
            print(f"초성: {chosung_example} \t 예측 문장: {answer}")
Example #21
 def __init__(self, datapath, max_seq_len=128):
     self.datapath = datapath
     self.data = pd.read_csv(self.datapath, sep='\t')
     self.bos_token = '</s>'
     self.eos_token = '</s>'
     self.max_seq_len = max_seq_len
     self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
         "skt/kogpt2-base-v2",
         bos_token=self.bos_token,
         eos_token=self.eos_token,
         unk_token='<unk>',
         pad_token='<pad>',
         mask_token='<mask>')
Example #22
    def test_async_share_tokenizer(self):
        # See https://github.com/huggingface/transformers/pull/12550
        # and https://github.com/huggingface/tokenizers/issues/537
        tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "robot-test/dummy-tokenizer-wordlevel")
        text = "The Matrix is a 1999 science fiction action film."

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.fetch, tokenizer, text) for i in range(10)
            ]
            return_value = [future.result() for future in futures]
            self.assertEqual(return_value, [[1, 10, 0, 8, 0, 18, 0, 0, 0, 2]
                                            for i in range(10)])
 def load_custom_tokenizer(self, path):
     tokenizer = ByteLevelBPETokenizer(path + "-vocab.json",
                                       path + "-merges.txt")
     # Add preprocessing tokens like Roberta
     tokenizer._tokenizer.post_processor = BertProcessing(
         ("</s>", tokenizer.token_to_id("</s>")),
         ("<s>", tokenizer.token_to_id("<s>")),
     )
     return PreTrainedTokenizerFast(tokenizer,
                                    pad_token="<pad>",
                                    mask_token="<mask>",
                                    unk_token="<unk>",
                                    bos_token="<s>",
                                    eos_token="</s>")
    def __init__(self, type="normal", device="cpu"):
        """
        Constructor of Summarizers

        Args:
            type (str): type of article. (e.g. normal, paper, patent)
            device (str): device for inference (e.g. cpu, cuda)
        """

        type = type.lower()
        model_name_prefix = "hyunwoongko/ctrlsum"

        assert type in ['normal', 'paper', 'patent'], \
            "param `article_type` must be one of ['normal', 'paper', 'patent']"

        if type == "normal":
            model_name = f"{model_name_prefix}-cnndm"
        elif type == "paper":
            model_name = f"{model_name_prefix}-paper"
        elif type == "patent":
            model_name = f"{model_name_prefix}-patent"
        else:
            raise Exception(f"Unknown type: {type}")

        self.device = device
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(
            device)
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
        self._5w1h = [
            "what ",
            "what's "
            "when ",
            "why ",
            "who ",
            "who's ",
            "where ",
            "how ",
            "What ",
            "What's ",
            "When ",
            "Why ",
            "Who ",
            "Who's ",
            "Where ",
            "How ",
        ]
Example #25
def convert_instances_to_feature_tensors(
        instances: List[Instance], tokenizer: PreTrainedTokenizerFast,
        label2idx: Dict[str, int]) -> List[Feature]:
    features = []
    ## tokenize the word into word_piece / BPE
    ## NOTE: adding a leading space is important for BART/GPT/Roberta tokenization.
    ## Related GitHub issues:
    ##      https://github.com/huggingface/transformers/issues/1196
    ##      https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py#L38-L56
    ##      https://github.com/ThilinaRajapakse/simpletransformers/issues/458
    assert tokenizer.add_prefix_space  ## has to be true, in order to tokenize pre-tokenized input
    print(
        "[Data Info] We are not limiting the max length in tokenizer. You should be aware of that"
    )
    for idx, inst in enumerate(instances):
        words = inst.ori_words
        orig_to_tok_index = []
        res = tokenizer.encode_plus(words, is_split_into_words=True)
        subword_idx2word_idx = res.word_ids(batch_index=0)
        prev_word_idx = -1
        for i, mapped_word_idx in enumerate(subword_idx2word_idx):
            """
            Note: by default, we use the first wordpiece/subword token to represent the word
            If you want to do something else (e.g., use last wordpiece to represent), modify them here.
            """
            if mapped_word_idx is None:  ## cls and sep token
                continue
            if mapped_word_idx != prev_word_idx:
                ## because we take the first subword to represent the whole word
                orig_to_tok_index.append(i)
                prev_word_idx = mapped_word_idx
        assert len(orig_to_tok_index) == len(words)
        labels = inst.labels
        label_ids = [label2idx[label]
                     for label in labels] if labels else [-100] * len(words)
        segment_ids = [0] * len(res["input_ids"])

        features.append(
            Feature(input_ids=res["input_ids"],
                    attention_mask=res["attention_mask"],
                    orig_to_tok_index=orig_to_tok_index,
                    token_type_ids=segment_ids,
                    word_seq_len=len(orig_to_tok_index),
                    label_ids=label_ids))
    return features
def preprocess(texts, tokenizer_path, max_len=32):

    input_ids, input_masks = [], []

    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.cls_token = "[CLS]"
    tokenizer.unk_token = "[UNK]"

    for text in tqdm(texts):
        encoded = tokenizer.encode_plus(text,
                                        max_length=max_len,
                                        pad_to_max_length=True,
                                        truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])

    return [np.array(input_ids), np.array(input_masks)]
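A minimal sketch of calling the preprocessing helper above, assuming a tokenizer .json file exists at the placeholder path:

# hypothetical tokenizer file; the texts are arbitrary examples
ids, masks = preprocess(['first sentence', 'second sentence'],
                        tokenizer_path='tokenizer.json',
                        max_len=32)
print(ids.shape, masks.shape)  # (2, 32) each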
Example #27
    def __init__(self, path, max_ids):
        self.model = load_model(path)
        self.max_ids = max_ids
        U_TKN = '<usr>'
        S_TKN = '<sys>'
        BOS = '</s>'
        EOS = '</s>'
        MASK = '<unused0>'
        SENT = '<unused1>'
        PAD = '<pad>'

        TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
            "skt/kogpt2-base-v2",
            bos_token=BOS,
            eos_token=EOS,
            unk_token='<unk>',
            pad_token=PAD,
            mask_token=MASK)

        self.tok = TOKENIZER
Example #28
def fine_tuning(MODEL_TYPE, DATA_PATH, BATCH_SIZE, LEARNING_RATE, WARMUP_STEPS,
                OUTPUT_MODEL_PATH, EPOCHS):
    print("=" * 15, "LOAD MODEL", "=" * 15)
    model = GPT2LMHeadModel.from_pretrained(MODEL_TYPE)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_TYPE)

    print("=" * 15, "GET DATASET", "=" * 15)
    data_loader = get_data_loader(DATA_PATH, tokenizer, BATCH_SIZE, True)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, WARMUP_STEPS,
        len(data_loader) - WARMUP_STEPS, -1)

    if not os.path.exists(OUTPUT_MODEL_PATH):
        os.mkdir(OUTPUT_MODEL_PATH)

    fine_tuning_runner(model, optimizer, data_loader, scheduler, EPOCHS,
                       OUTPUT_MODEL_PATH)
    model.save_pretrained(OUTPUT_MODEL_PATH)
Example #29
def summarizer(input: TextSummerizeInput) -> TextSummerizeOutput:
    """ Summarize texts """
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    inputs = tokenizer([
        tokenizer.bos_token + input.text_input + tokenizer.eos_token
    ])['input_ids'][0]

    model_url = 'https://train-mxysk1opgrzauh8ifw55-gpt2-train-teachable-ainize.endpoint.dev.ainize.ai/predictions/bart-ko-small-finetune'

    headers = {'Content-Type': 'application/json; charset=utf-8'}
    response = requests.post(url=model_url,
                             headers=headers,
                             json={"text": inputs})

    if response.status_code == 200:
        result = tokenizer.decode(response.json()[0], skip_special_tokens=True)
        return TextSummerizeOutput(output=result)
    else:
        print(f'Failed {response.text}')
        return TextSummerizeOutput(output='Failed to summarize')
Example #30
def initialize(arguments=None):
    if arguments is None:
        arguments = Munch({
            'config': 'settings/config.yaml',
            'checkpoint': 'checkpoints/weights.pth',
            'no_cuda': True,
            'no_resize': False
        })
    logging.getLogger().setLevel(logging.FATAL)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    with open(arguments.config, 'r') as f:
        params = yaml.load(f, Loader=yaml.FullLoader)
    args = parse_args(Munch(params))
    args.update(**vars(arguments))
    args.wandb = False
    args.device = 'cuda' if torch.cuda.is_available(
    ) and not args.no_cuda else 'cpu'

    model = get_model(args)
    model.load_state_dict(torch.load(args.checkpoint,
                                     map_location=args.device))

    if 'image_resizer.pth' in os.listdir(os.path.dirname(
            args.checkpoint)) and not arguments.no_resize:
        image_resizer = ResNetV2(layers=[2, 3, 3],
                                 num_classes=max(args.max_dimensions) // 32,
                                 global_pool='avg',
                                 in_chans=1,
                                 drop_rate=.05,
                                 preact=True,
                                 stem_type='same',
                                 conv_layer=StdConv2dSame).to(args.device)
        image_resizer.load_state_dict(
            torch.load(os.path.join(os.path.dirname(args.checkpoint),
                                    'image_resizer.pth'),
                       map_location=args.device))
        image_resizer.eval()
    else:
        image_resizer = None
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=args.tokenizer)
    return args, model, image_resizer, tokenizer