def load_tokenizer(tknzr_file, flag_tknzr_fast, pad_token=None, mask_token=None):
    """
    Interestingly, HuggingFace does not allow the base tokenizer to be called directly.
    This is a bizarre choice, so we have to look for something else, which is why I use
    PreTrainedTokenizerFast to wrap the base tokenizer. Written in Rust, it is faster
    than the base tokenizer class and also lets you call the tokenizer as
    tknzr('text to be tokenized').

    Input
        tknzr_file (str)       : .json file of the tokenizer trained previously
        flag_tknzr_fast (bool) : if True, wrap the tokenizer with PreTrainedTokenizerFast
        *_tokens (str)         : tokens to be used in the corresponding context.
                                 Some of them are not implemented yet...
    Output
        tknzr : tokenizer as PreTrainedTokenizerFast class to be passed on
    """
    if flag_tknzr_fast:
        tknzr = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tknzr = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tknzr.pad_token = pad_token
    tknzr.mask_token = mask_token
    return tknzr
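# A minimal usage sketch for load_tokenizer above; the file name "tokenizer.json"
# and the special tokens are illustrative assumptions, not part of the original snippet.
tknzr = load_tokenizer("tokenizer.json", flag_tknzr_fast=True,
                       pad_token="<pad>", mask_token="<mask>")
ids = tknzr("text to be tokenized")["input_ids"]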
def __init__(self, pretrained_path, n_labels, hidden_size=768, dropout_p=0.2,
             label_ignore_idx=0, head_init_range=0.04, device='cuda'):
    super().__init__()

    self.n_labels = n_labels
    self.linear_1 = nn.Linear(hidden_size, hidden_size)
    self.classification_head = nn.Linear(hidden_size, n_labels)
    self.label_ignore_idx = label_ignore_idx
    self.tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=os.path.join(pretrained_path, "tokenizer.json"))
    self.model = AutoModel.from_pretrained(pretrained_path)
    self.dropout = nn.Dropout(dropout_p)
    self.device = device

    # initializing classification head
    self.classification_head.weight.data.normal_(mean=0.0, std=head_init_range)
class ChatDataset(Dataset):
    def __init__(self, filepath, tok_vocab, max_seq_len=128) -> None:
        self.filepath = filepath
        self.data = pd.read_csv(self.filepath)
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        self.max_seq_len = max_seq_len
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tok_vocab,
                                                 bos_token=self.bos_token,
                                                 eos_token=self.eos_token,
                                                 unk_token='<unk>',
                                                 pad_token='<pad>',
                                                 mask_token='<mask>')

    def __len__(self):
        return len(self.data)

    def make_input_id_mask(self, tokens, index):
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        if len(input_id) < self.max_seq_len:
            while len(input_id) < self.max_seq_len:
                input_id += [self.tokenizer.pad_token_id]
                attention_mask += [0]
        else:
            # logging.warning(f'exceed max_seq_len for given article : {index}')
            input_id = input_id[:self.max_seq_len - 1] + [self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return input_id, attention_mask

    def __getitem__(self, index):
        record = self.data.iloc[index]
        q, a = record['Q'], record['A']
        q_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(q) + [self.eos_token]
        a_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(a) + [self.eos_token]
        encoder_input_id, encoder_attention_mask = self.make_input_id_mask(
            q_tokens, index)
        decoder_input_id, decoder_attention_mask = self.make_input_id_mask(
            a_tokens, index)
        labels = self.tokenizer.convert_tokens_to_ids(
            a_tokens[1:(self.max_seq_len + 1)])
        if len(labels) < self.max_seq_len:
            while len(labels) < self.max_seq_len:
                # for cross entropy loss masking
                labels += [-100]
        return {
            'input_ids': np.array(encoder_input_id, dtype=np.int_),
            'attention_mask': np.array(encoder_attention_mask, dtype=np.float_),
            'decoder_input_ids': np.array(decoder_input_id, dtype=np.int_),
            'decoder_attention_mask': np.array(decoder_attention_mask, dtype=np.float_),
            'labels': np.array(labels, dtype=np.int_)
        }
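# A minimal usage sketch for ChatDataset above; "chat.csv" (a CSV with Q/A columns)
# and "tokenizer.json" are hypothetical paths, not part of the original snippet.
from torch.utils.data import DataLoader

dataset = ChatDataset("chat.csv", tok_vocab="tokenizer.json", max_seq_len=128)
loader = DataLoader(dataset, batch_size=8, shuffle=True)
batch = next(iter(loader))
print(batch['input_ids'].shape)   # (8, 128)
print(batch['labels'].shape)      # (8, 128)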
def __init__(self, hparams, **kwargs):
    super(KoBARTConditionalGeneration, self).__init__(hparams, **kwargs)
    self.model = BartForConditionalGeneration.from_pretrained(self.hparams.model_path)
    self.model.train()
    self.bos_token = '<s>'
    self.eos_token = '</s>'
    self.tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=os.path.join(self.hparams.tokenizer_path, 'model.json'),
        bos_token=self.bos_token,
        eos_token=self.eos_token,
        unk_token='<unk>',
        pad_token='<pad>',
        mask_token='<mask>')
def __init__(self, filepath, tok_vocab, max_seq_len=128) -> None:
    self.filepath = filepath
    self.data = pd.read_csv(self.filepath)  # encoding='cp949'
    self.bos_token = '<s>'
    self.eos_token = '</s>'
    self.max_seq_len = max_seq_len
    self.tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tok_vocab,
        bos_token=self.bos_token,
        eos_token=self.eos_token,
        unk_token='<unk>',
        pad_token='<pad>',
        mask_token='<mask>')
class KoBARTConditionalGeneration(Base):
    def __init__(self, hparams, **kwargs):
        super(KoBARTConditionalGeneration, self).__init__(hparams, **kwargs)
        self.model = BartForConditionalGeneration.from_pretrained(
            self.hparams.model_path)
        self.model.train()
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(self.hparams.tokenizer_path, 'model.json'),
            bos_token=self.bos_token,
            eos_token=self.eos_token,
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>')

    def forward(self, inputs):
        return self.model(input_ids=inputs['input_ids'],
                          attention_mask=inputs['attention_mask'],
                          decoder_input_ids=inputs['decoder_input_ids'],
                          decoder_attention_mask=inputs['decoder_attention_mask'],
                          labels=inputs['labels'],
                          return_dict=True)

    def training_step(self, batch, batch_idx):
        outs = self(batch)
        loss = outs.loss
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outs = self(batch)
        loss = outs['loss']
        return loss

    def validation_epoch_end(self, outputs):
        losses = []
        for loss in outputs:
            losses.append(loss)
        self.log('val_loss', torch.stack(losses).mean(), prog_bar=True)

    def chat(self, text):
        input_ids = [self.tokenizer.bos_token_id] + \
            self.tokenizer.encode(text) + [self.tokenizer.eos_token_id]
        res_ids = self.model.generate(torch.tensor([input_ids]),
                                      max_length=self.hparams.max_seq_len,
                                      num_beams=5,
                                      eos_token_id=self.tokenizer.eos_token_id,
                                      bad_words_ids=[[self.tokenizer.unk_token_id]])
        a = self.tokenizer.batch_decode(res_ids.tolist())[0]
        return a.replace('<s>', '').replace('</s>', '').replace('<usr>', '')
def load_tokenizer(folder="."):
    folder = Path(folder)
    return PreTrainedTokenizerFast(
        WhitespaceTokenizer(str(folder / vocab_file)),
        pad_token="<pad>",
        mask_token="<mask>",
    )
def _convert_examples_to_generation_features(
    examples: List[GenerationExample],
    tokenizer: PreTrainedTokenizerFast,
    args: GenerationTrainArguments,
):
    logger.info("tokenize sentences, it could take a lot of time...")
    start = time.time()
    batch_encoding = tokenizer(
        [example.text for example in examples],
        max_length=args.max_seq_length,
        padding="max_length",
        truncation=True,
    )
    logger.info("tokenize sentences [took %.3f s]", time.time() - start)

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = GenerationFeatures(**inputs, labels=batch_encoding["input_ids"][i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("sentence: %s" % (example.text))
        logger.info("tokens: %s" % (" ".join(
            tokenizer.convert_ids_to_tokens(features[i].input_ids))))
        logger.info("features: %s" % features[i])

    return features
def ingest():
    """
    Every model from HuggingFace is applicable
    TODO: put url here
    Corpus example: squad | MedQA or FindZebra
    """
    typer.secho("Welcome to the ingest command", fg=typer.colors.WHITE, bold=True)

    model = BertModel.from_pretrained(Config['model'].get())
    fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(Config['tokenizer'].get())
    # fast_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    corpus = load_dataset(Config['corpus'].get(), split='train[:100]')
    # cache_dir=Config['cache_dir'].get() -- Cache directory override

    torch.set_grad_enabled(False)

    typer.secho("Embedding corpus as dense context vector representation using FAISS.")
    corpus_embeddings = corpus.map(
        lambda example: {
            'embeddings': model(**fast_tokenizer(example['line'], return_tensors='pt'))
            ['pooler_output'][0].numpy()})
    # corpus_embeddings.save_to_disk(os.path.join(Config['cache_dir'].get(), "corpus/"))

    typer.secho("Adding FAISS index for efficient similarity search and clustering of dense vectors.")
    corpus_embeddings.add_faiss_index(column='embeddings')

    typer.secho("Saving the index")
    corpus_embeddings.save_faiss_index("embeddings", "corpus.faiss")
    # os.path.join(Config['cache_dir'].get())

    return 0
def get_kobart_tokenizer(cachedir='~/kobart/'):
    """Download the KoBART tokenizer files and return a PreTrainedTokenizerFast built from them."""
    global tokenizer
    model_info = tokenizer
    file_path, is_cached = download(model_info['url'],
                                    model_info['fname'],
                                    model_info['chksum'],
                                    cachedir=cachedir)
    cachedir_full = os.path.expanduser(cachedir)
    if not os.path.exists(os.path.join(cachedir_full, 'emji_tokenizer')) or not is_cached:
        if not is_cached:
            shutil.rmtree(os.path.join(cachedir_full, 'emji_tokenizer'), ignore_errors=True)
        zipf = ZipFile(os.path.expanduser(file_path))
        zipf.extractall(path=cachedir_full)
    tok_path = os.path.join(cachedir_full, 'emji_tokenizer/model.json')
    tokenizer_obj = PreTrainedTokenizerFast(tokenizer_file=tok_path,
                                            bos_token='<s>',
                                            eos_token='</s>',
                                            unk_token='<unk>',
                                            pad_token='<pad>',
                                            mask_token='<mask>')
    return tokenizer_obj
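# A minimal usage sketch for get_kobart_tokenizer above, assuming the module-level
# download/cache machinery it relies on is available; the sample sentence is an
# illustrative assumption.
kobart_tokenizer = get_kobart_tokenizer(cachedir='~/kobart/')
print(kobart_tokenizer.tokenize("안녕하세요."))
print(kobart_tokenizer("안녕하세요.")["input_ids"])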
def __init__(
    self,
    examples: List[SequenceClassificationExample],
    tokenizer: PreTrainedTokenizerFast,
    label_to_id: Dict[str, int],
    tokens_per_batch: int = 32,
):
    self.features: List[InputFeatures] = []
    self.examples: List[SequenceClassificationExample] = examples
    texts: StrList = [ex.text for ex in self.examples]
    labels: StrList = [ex.label for ex in self.examples]

    # tokenize text into subwords with padding and truncation
    self.encodings: List[BatchEncoding] = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=tokens_per_batch,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="np",
            truncation=True,
        )
        for text in texts
    ]

    # register features
    self.features = [
        InputFeatures(
            input_ids=encoding.input_ids.flatten().tolist(),
            attention_mask=encoding.attention_mask.flatten().tolist(),
            label_ids=[label_to_id.get(label, 0)],
        )
        for encoding, label in zip(self.encodings, labels)
    ]
    self._n_features = len(self.features)
def get_adjusted_lengths(
    sentences: Sentences,
    tokenizer: PreTrainedTokenizerFast,
    max_sequence_length,
) -> Tuple[int, ...]:
    """Return adjusted lengths based on a tokenizer and model max length."""
    encodings = [tokenizer.encode_plus(" ".join(sentence), return_offsets_mapping=True)
                 for sentence in sentences]
    # Create end-token masks: [CLS] Hauk ur er [SEP] -> [dropped, 0, 1, 1, dropped]
    # By getting initial token masks and shifting them:
    # [CLS] Hauk ur er [SEP] -> [0, 1, 0, 1, 0] ->
    # -> drop [mid shifted to left] + [1] drop
    # -> [_, 0, 1, 1, _]
    end_token_masks = [get_initial_token_mask(encoded["offset_mapping"])[2:-1] + [1]
                       for encoded in encodings]
    # We need to account for two special tokens (SEP and CLS) or (<s> and </s>) when finding the cuts
    max_sequence_length -= 2
    # And some extra, because of errors
    max_sequence_length -= 6
    lengths = []
    for end_token_mask in end_token_masks:
        while len(end_token_mask) != 0:
            prefix, end_token_mask = (
                end_token_mask[:max_sequence_length],
                end_token_mask[max_sequence_length:],
            )
            length = sum(prefix)
            lengths.append(length)

    return tuple(int(length) for length in lengths)
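# A small, self-contained illustration of the chunking loop in get_adjusted_lengths
# above: a toy end-token mask is cut into chunks of at most max_len subwords, and
# each chunk's sum is the number of whole words that fit in it. The numbers are made up.
end_token_mask = [0, 1, 1, 0, 1, 1, 1]
max_len = 4
lengths = []
while end_token_mask:
    prefix, end_token_mask = end_token_mask[:max_len], end_token_mask[max_len:]
    lengths.append(sum(prefix))
print(lengths)  # [2, 3]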
def main(args):
    data = np.load(args.data, allow_pickle=True)
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )
    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)
    model = RobertaForMaskedLM(config=config)
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()
    trainer.save_model(args.output_dir)
def __init__(self, equations=None, images=None, tokenizer=None, shuffle=True,
             batchsize=16, max_dimensions=(1024, 512), pad=False,
             keep_smaller_batches=False, test=False):
    """Generates a torch dataset from pairs of `equations` and `images`.

    Args:
        equations (str, optional): Path to equations. Defaults to None.
        images (str, optional): Directory where images are saved. Defaults to None.
        tokenizer (str, optional): Path to saved tokenizer. Defaults to None.
        shuffle (bool, optional): Defaults to True.
        batchsize (int, optional): Defaults to 16.
        max_dimensions (tuple(int, int), optional): Maximal dimensions the model can handle.
        pad (bool): Pad the images to `max_dimensions`. Defaults to False.
        keep_smaller_batches (bool): Whether to also return batches with smaller size than `batchsize`. Defaults to False.
        test (bool): Whether to use the test transformation or not. Defaults to False.
    """
    if images is not None and equations is not None:
        assert tokenizer is not None
        self.images = [path.replace('\\', '/')
                       for path in glob.glob(join(images, '*.png'))]
        self.sample_size = len(self.images)
        eqs = open(equations, 'r').read().split('\n')
        self.indices = [int(os.path.basename(img).split('.')[0]) for img in self.images]
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer)
        self.shuffle = shuffle
        self.batchsize = batchsize
        self.max_dimensions = max_dimensions
        self.pad = pad
        self.keep_smaller_batches = keep_smaller_batches
        self.test = test
        self.data = defaultdict(lambda: [])
        # check the image dimension for every image and group them together
        try:
            for i, im in tqdm(enumerate(self.images), total=len(self.images)):
                width, height = imagesize.get(im)
                if width <= max_dimensions[0] and height <= max_dimensions[1]:
                    self.data[(width, height)].append((eqs[self.indices[i]], im))
        except KeyboardInterrupt:
            pass
        self.data = dict(self.data)
        self._get_size()
        iter(self)
def setUp(self):
    self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
    super().setUp()
    self.test_rust_tokenizer = True

    self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})]

    tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")
    tokenizer.save_pretrained(self.tmpdirname)
def __init__(self, bot):
    self.bot = bot
    self.model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
        "skt/kogpt2-base-v2",
        bos_token='</s>',
        eos_token='</s>',
        unk_token='<unk>',
        pad_token='<pad>',
        mask_token='<mask>')
def get_kobart_tokenizer():
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    tokenizer.pad_token = "<pad>"
    tokenizer.bos_token = "<s>"
    tokenizer.eos_token = "</s>"
    tokenizer.unk_token = "<unk>"
    tokenizer.mask_token = "<mask>"
    return tokenizer
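# A minimal usage sketch for get_kobart_tokenizer above; the sample text is an
# illustrative assumption.
tokenizer = get_kobart_tokenizer()
encoded = tokenizer("요약할 문장입니다.", return_tensors="pt")
print(encoded["input_ids"].shape)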
def __init__(self, model: str, device: str):
    config = BartConfig.from_pretrained("hyunwoongko/kobart")
    self.model = BartForConditionalGeneration(config).half().eval().to(device)
    self.model.model.load_state_dict(torch.load(
        model,
        map_location=device,
    ))
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    self.device = device
def setUp(self):
    self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
    super().setUp()
    self.test_rust_tokenizer = True

    model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]

    # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
    self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]

    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
    tokenizer.save_pretrained(self.tmpdirname)
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    model = BartForConditionalGeneration.from_pretrained(args.finetuned_model_path)
    model.eval()
    model.to(device)

    examples = [
        "배고프다",
        "너무너무 사랑해요",
        "나는 너를 좋아해",
        "저의 취미는 축구입니다",
        "어제 무슨 영화 봤어?",
        "짜장면 짬뽕 탕수육 먹었어",
    ]

    for example in examples:
        chosung_example = convert_text_to_chosung(example)
        input_ids = (torch.tensor(
            tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(chosung_example))).unsqueeze(0).to(device))

        if args.decoding_method == "top_p":
            outputs = model.generate(
                input_ids=input_ids,
                max_length=48,
                temperature=1.0,
                do_sample=True,
                top_p=0.8,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                decoder_start_token_id=tokenizer.bos_token_id,
                num_return_sequences=5,
            )
        elif args.decoding_method == "beam_search":
            outputs = model.generate(
                input_ids=input_ids,
                max_length=48,
                num_beams=10,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                decoder_start_token_id=tokenizer.bos_token_id,
                num_return_sequences=5,
            )
        else:
            raise ValueError("Enter the right decoding method (top_p or beam_search)")

        for output in outputs.tolist():
            answer = tokenizer.decode(output)
            print(f"초성: {chosung_example} \t 예측 문장: {answer}")
def __init__(self, datapath, max_seq_len=128):
    self.datapath = datapath
    self.data = pd.read_csv(self.datapath, sep='\t')
    self.bos_token = '</s>'
    self.eos_token = '</s>'
    self.max_seq_len = max_seq_len
    self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
        "skt/kogpt2-base-v2",
        bos_token=self.bos_token,
        eos_token=self.eos_token,
        unk_token='<unk>',
        pad_token='<pad>',
        mask_token='<mask>')
def test_async_share_tokenizer(self):
    # See https://github.com/huggingface/transformers/pull/12550
    # and https://github.com/huggingface/tokenizers/issues/537
    tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-wordlevel")
    text = "The Matrix is a 1999 science fiction action film."

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(self.fetch, tokenizer, text) for i in range(10)]
        return_value = [future.result() for future in futures]
        self.assertEqual(return_value, [[1, 10, 0, 8, 0, 18, 0, 0, 0, 2] for i in range(10)])
def load_custom_tokenizer(self, path):
    tokenizer = ByteLevelBPETokenizer(path + "-vocab.json", path + "-merges.txt")
    # Add preprocessing tokens like Roberta tokenizer
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    return PreTrainedTokenizerFast(tokenizer,
                                   pad_token="<pad>",
                                   mask_token="<mask>",
                                   unk_token="<unk>",
                                   bos_token="<s>",
                                   eos_token="</s>")
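# An alternative sketch of the same wrapping step, assuming a recent transformers
# version: pass the underlying `tokenizers.Tokenizer` explicitly via the documented
# `tokenizer_object` keyword instead of positionally. The path prefix "my_bpe" is a
# hypothetical placeholder, and the post-processor setup from the snippet above
# would still apply before wrapping.
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast

bpe = ByteLevelBPETokenizer("my_bpe-vocab.json", "my_bpe-merges.txt")
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=bpe._tokenizer,  # the raw tokenizers.Tokenizer inside the wrapper
    pad_token="<pad>", mask_token="<mask>", unk_token="<unk>",
    bos_token="<s>", eos_token="</s>")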
def __init__(self, type="normal", device="cpu"): """ Constructor of Summarizers Args: type (str): type of article. (e.g. normal, paper, patent) device (str): device for inference (e.g. cpu, cuda) """ type = type.lower() model_name_prefix = "hyunwoongko/ctrlsum" assert type in ['normal', 'paper', 'patent'], \ "param `article_type` must be one of ['normal', 'paper', 'patent']" if type == "normal": model_name = f"{model_name_prefix}-cnndm" elif type == "paper": model_name = f"{model_name_prefix}-paper" elif type == "patent": model_name = f"{model_name_prefix}-patent" else: raise Exception(f"Unknown type: {type}") self.device = device self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to( device) self.tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name) self._5w1h = [ "what ", "what's " "when ", "why ", "who ", "who's ", "where ", "how ", "What ", "What's ", "When ", "Why ", "Who ", "Who's ", "Where ", "How ", ]
def convert_instances_to_feature_tensors(
        instances: List[Instance],
        tokenizer: PreTrainedTokenizerFast,
        label2idx: Dict[str, int]) -> List[Feature]:
    features = []
    ## tokenize the word into word_piece / BPE
    ## NOTE: adding a leading space is important for BART/GPT/Roberta tokenization.
    ## Related GitHub issues:
    ## https://github.com/huggingface/transformers/issues/1196
    ## https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py#L38-L56
    ## https://github.com/ThilinaRajapakse/simpletransformers/issues/458
    assert tokenizer.add_prefix_space  ## has to be true in order to tokenize pre-tokenized input
    print("[Data Info] We are not limiting the max length in the tokenizer. You should be aware of that.")
    for idx, inst in enumerate(instances):
        words = inst.ori_words
        orig_to_tok_index = []
        res = tokenizer.encode_plus(words, is_split_into_words=True)
        subword_idx2word_idx = res.word_ids(batch_index=0)
        prev_word_idx = -1
        for i, mapped_word_idx in enumerate(subword_idx2word_idx):
            """
            Note: by default, we use the first wordpiece/subword token to represent the word.
            If you want to do something else (e.g., use the last wordpiece to represent it), modify it here.
            """
            if mapped_word_idx is None:  ## cls and sep token
                continue
            if mapped_word_idx != prev_word_idx:
                ## because we take the first subword to represent the whole word
                orig_to_tok_index.append(i)
                prev_word_idx = mapped_word_idx
        assert len(orig_to_tok_index) == len(words)
        labels = inst.labels
        label_ids = [label2idx[label] for label in labels] if labels else [-100] * len(words)
        segment_ids = [0] * len(res["input_ids"])
        features.append(
            Feature(input_ids=res["input_ids"],
                    attention_mask=res["attention_mask"],
                    orig_to_tok_index=orig_to_tok_index,
                    token_type_ids=segment_ids,
                    word_seq_len=len(orig_to_tok_index),
                    label_ids=label_ids))
    return features
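# A small, self-contained illustration of the first-subword mapping used in
# convert_instances_to_feature_tensors above. The public "roberta-base" checkpoint
# and the example words are assumptions chosen purely for the sketch.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True, use_fast=True)
words = ["Huggingface", "tokenizers", "are", "fast"]
enc = tok(words, is_split_into_words=True)
word_ids = enc.word_ids(batch_index=0)

orig_to_tok_index = []
prev = -1
for i, w in enumerate(word_ids):
    if w is None:          # special tokens (<s>, </s>)
        continue
    if w != prev:          # first subword of each word represents the word
        orig_to_tok_index.append(i)
        prev = w
assert len(orig_to_tok_index) == len(words)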
def preprocess(texts, tokenizer_path, max_len=32):
    input_ids, input_masks = [], []
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.cls_token = "[CLS]"
    tokenizer.unk_token = "[UNK]"
    for text in tqdm(texts):
        encoded = tokenizer.encode_plus(text,
                                        max_length=max_len,
                                        pad_to_max_length=True,
                                        truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])
    return [np.array(input_ids), np.array(input_masks)]
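# A minimal usage sketch for preprocess above; "tokenizer.json" and the sample
# sentences are assumptions, not part of the original snippet.
ids, masks = preprocess(["first sentence", "second sentence"],
                        tokenizer_path="tokenizer.json", max_len=32)
print(ids.shape, masks.shape)  # (2, 32) (2, 32)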
def __init__(self, path, max_ids):
    self.model = load_model(path)
    self.max_ids = max_ids
    U_TKN = '<usr>'
    S_TKN = '<sys>'
    BOS = '</s>'
    EOS = '</s>'
    MASK = '<unused0>'
    SENT = '<unused1>'
    PAD = '<pad>'
    TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
        "skt/kogpt2-base-v2",
        bos_token=BOS,
        eos_token=EOS,
        unk_token='<unk>',
        pad_token=PAD,
        mask_token=MASK)
    self.tok = TOKENIZER
def fine_tuning(MODEL_TYPE, DATA_PATH, BATCH_SIZE, LEARNING_RATE, WARMUP_STEPS,
                OUTPUT_MODEL_PATH, EPOCHS):
    print("=" * 15, "LOAD MODEL", "=" * 15)
    model = GPT2LMHeadModel.from_pretrained(MODEL_TYPE)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_TYPE)

    print("=" * 15, "GET DATASET", "=" * 15)
    data_loader = get_data_loader(DATA_PATH, tokenizer, BATCH_SIZE, True)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, WARMUP_STEPS, len(data_loader) - WARMUP_STEPS, -1)

    if not os.path.exists(OUTPUT_MODEL_PATH):
        os.mkdir(OUTPUT_MODEL_PATH)

    fine_tuning_runner(model, optimizer, data_loader, scheduler, EPOCHS,
                       OUTPUT_MODEL_PATH)
    model.save_pretrained(OUTPUT_MODEL_PATH)
def summarizer(input: TextSummerizeInput) -> TextSummerizeOutput:
    """
    Summarize texts
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    inputs = tokenizer([
        tokenizer.bos_token + input.text_input + tokenizer.eos_token
    ])['input_ids'][0]

    model_url = 'https://train-mxysk1opgrzauh8ifw55-gpt2-train-teachable-ainize.endpoint.dev.ainize.ai/predictions/bart-ko-small-finetune'
    headers = {'Content-Type': 'application/json; charset=utf-8'}
    response = requests.post(url=model_url, headers=headers, json={"text": inputs})

    if response.status_code == 200:
        result = tokenizer.decode(response.json()[0], skip_special_tokens=True)
        return TextSummerizeOutput(output=result)
    else:
        print(f'Failed {response.text}')
        return TextSummerizeOutput(output='Failed to summarize')
def initialize(arguments=None):
    if arguments is None:
        arguments = Munch({
            'config': 'settings/config.yaml',
            'checkpoint': 'checkpoints/weights.pth',
            'no_cuda': True,
            'no_resize': False
        })
    logging.getLogger().setLevel(logging.FATAL)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    with open(arguments.config, 'r') as f:
        params = yaml.load(f, Loader=yaml.FullLoader)
    args = parse_args(Munch(params))
    args.update(**vars(arguments))
    args.wandb = False
    args.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

    model = get_model(args)
    model.load_state_dict(torch.load(args.checkpoint, map_location=args.device))

    if 'image_resizer.pth' in os.listdir(os.path.dirname(args.checkpoint)) and not arguments.no_resize:
        image_resizer = ResNetV2(layers=[2, 3, 3],
                                 num_classes=max(args.max_dimensions) // 32,
                                 global_pool='avg',
                                 in_chans=1,
                                 drop_rate=.05,
                                 preact=True,
                                 stem_type='same',
                                 conv_layer=StdConv2dSame).to(args.device)
        image_resizer.load_state_dict(
            torch.load(os.path.join(os.path.dirname(args.checkpoint), 'image_resizer.pth'),
                       map_location=args.device))
        image_resizer.eval()
    else:
        image_resizer = None
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=args.tokenizer)
    return args, model, image_resizer, tokenizer