def __init__(self, config, *args, **kwargs):
    tokenizer_config = config.tokenizer_config
    self._tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_config.type, **tokenizer_config.params
    )
    self._max_seq_length = config.max_seq_length
    self._probability = getattr(config, "mask_probability", 0.15)
def __init__(self):
    model = AutoModelForQuestionAnswering.from_pretrained("/model/model")
    tokenizer = AutoTokenizer.from_pretrained("/model/tokenizer")
    self.default_response = "Perhaps the answer is 42."
    self.predictor = pipeline("question-answering", model=model, tokenizer=tokenizer)
    with open("/mounts/bert_context/paragraph.txt") as f:
        self.context = f.read()
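# A minimal usage sketch (not part of the original handler), assuming the object built
# above is available as `handler`: the question-answering pipeline takes a question
# together with the preloaded context and returns a dict with "answer" and "score".
result = handler.predictor(question="What is the answer?", context=handler.context)
print(result["answer"], result["score"])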
def __init__(self, pretrained_model: str):
    self._pretrained_model = pretrained_model
    self._init_kwargs = {}
    self._kwargs = {}
    if pretrained_model.startswith('roberta-'):
        self._kwargs['add_prefix_space'] = True
    self._tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model, **self._init_kwargs)
def load(cls, model_name: str, cache_tokenizer: bool = True) -> AutoTokenizer:
    if model_name in cls._cache:
        return PretrainedAutoTokenizer._cache[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if cache_tokenizer:
        cls._cache[model_name] = tokenizer
    return tokenizer
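# A minimal usage sketch, assuming `PretrainedAutoTokenizer` exposes `load` as a
# classmethod and `_cache` as a class-level dict: repeated loads of the same model
# name return the cached instance instead of re-reading files from disk.
first = PretrainedAutoTokenizer.load("bert-base-uncased")
second = PretrainedAutoTokenizer.load("bert-base-uncased")
assert first is second  # served from the cache on the second call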
def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
        "AllenNLP is great!", "Really it is!")
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def __init__(self,
             model_name: str,
             namespace: str = "tags",
             token_min_padding_length: int = 0) -> None:
    super().__init__(token_min_padding_length)
    self._namespace = namespace
    self._tokenizer = AutoTokenizer.from_pretrained(model_name)
    self._padding_value = self._tokenizer.convert_tokens_to_ids(
        [self._tokenizer.pad_token])[0]
    logger.info(
        f"Using token indexer padding value of {self._padding_value}")
    self._added_to_vocabulary = False
def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
    tokens = tokenizer.tokenize(default_format)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
        "AllenNLP is great!", "Really it is!")
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def __init__(self, data_path, img_path, tsv_path, tr_name, max_seq_len,
             max_seq_len_title=None, train=True):
    super().__init__()
    # self.name = data_path
    # self.splits = splits.split(",")
    self.path = data_path
    self.img_path = img_path
    self.train = train
    self.tokenizer = AutoTokenizer.from_pretrained(tr_name)
    self.max_seq_len = max_seq_len
    self.max_seq_len_title = max_seq_len_title

    # Loading datasets to data
    self.raw_data = []
    # for split in self.splits:
    #     path = os.path.join("data/", f"{split}.jsonl")
    #     self.raw_data.extend(
    #         [json.loads(jline) for jline in open(path, "r").read().split('\n')]
    #     )
    # print("Load %d data from split(s) %s." % (len(self.raw_data), self.name))
    # self.raw_data = [json.loads(jline) for jline in open(self.path, "r").read().split('\n')]
    self.raw_data = [json.loads(jline) for jline in open(self.path, "r")]

    # List to dict (for evaluation and others)
    self.id2datum = {datum["id"]: datum for datum in self.raw_data}

    # Loading detection features to img_data
    img_data = []
    # path = "data/HM_img.tsv"
    img_data.extend(load_obj_tsv(tsv_path, self.id2datum.keys()))

    # Convert img list to dict
    self.imgid2img = {}
    for img_datum in img_data:
        # Adding int here to convert 0625 to 625
        self.imgid2img[int(img_datum['img_id'])] = img_datum

    # Only keep the data with loaded image features
    self.data = []
    for datum in self.raw_data:
        # In HM the Img Id field is simply "id"
        if datum['id'] in self.imgid2img:
            self.data.append(datum)
    print("Use %d data in torch dataset" % (len(self.data)))
    print()
def test_as_array_produces_token_sequence_roberta(self):
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    string_specials = "<s> AllenNLP is great </s>"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
    assert indexed["key"] == expected_ids
def __init__(
    self,
    model_name: str,
    add_special_tokens: bool = True,
    max_length: int = None,
    stride: int = 0,
    truncation_strategy: str = "longest_first",
) -> None:
    self._tokenizer = AutoTokenizer.from_pretrained(model_name)
    self._add_special_tokens = add_special_tokens
    self._max_length = max_length
    self._stride = stride
    self._truncation_strategy = truncation_strategy
def test_as_array_produces_token_sequence_bert_cased(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
def check_vocab_size(model_name: str):
    namespace = "tags"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size
def test_transformers_vocabs_added_correctly(self):
    namespace, model_name = "tags", "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
    vocab = Vocabulary()
    # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    del indexed
    assert vocab.get_token_to_index_vocabulary(
        namespace=namespace) == tokenizer.encoder
def __init__(self, model_name: str, max_length: int = None) -> None:
    super().__init__()
    self.transformer_model = AutoModel.from_pretrained(model_name)
    self._max_length = max_length
    # I'm not sure if this works for all models; open an issue on github if you find a case
    # where it doesn't work.
    self.output_dim = self.transformer_model.config.hidden_size

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    (
        self._num_added_start_tokens,
        self._num_added_end_tokens,
    ) = PretrainedTransformerIndexer.determine_num_special_tokens_added(tokenizer)
    self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens
def __init__(self, config, *args, **kwargs):
    # https://huggingface.co/transformers/model_doc/xlmroberta.html
    # RoBERTa uses a different tokenization from the default (BERT) tokenizer above.
    tokenizer_config = config.tokenizer_config
    self._tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_config.type, **tokenizer_config.params)
    self._CLS_TOKEN = self._tokenizer.bos_token      # <s>
    self._SEP_TOKEN = self._tokenizer.sep_token      # </s>
    self._MASK_TOKEN = self._tokenizer.mask_token    # <mask>
    self._PAD_TOKEN_ID = self._tokenizer.pad_token_id  # 1
    self._max_seq_length = config.max_seq_length
    self._probability = getattr(config, "mask_probability", 0.15)
def load_datasets(self):
    model_name_or_path = "monologg/koelectra-base-discriminator"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    utters, label_map, answers = self.utters, self.label_map, self.answers

    def gen_datasets(split=0.1):
        train_dataset, test_dataset = [], []
        train_label, test_label = [], []
        for l, u in utters.items():
            assert len(u) > 1, "# of utterances per label should be > 1"
            labels = [l] * len(u)
            ix = max(1, int(len(u) * split))
            test_dataset.extend(u[:ix])
            test_label.extend(labels[:ix])
            train_dataset.extend(u[ix:])
            train_label.extend(labels[ix:])
        return train_dataset, train_label, test_dataset, test_label

    train_dataset, train_label, test_dataset, test_label = gen_datasets()
    train_dataset = tokenizer(train_dataset, return_tensors='pt',
                              padding='max_length', truncation=True)
    test_dataset = tokenizer(test_dataset, return_tensors='pt',
                             padding='max_length', truncation=True)
    train_dataset['labels'] = train_label
    test_dataset['labels'] = test_label

    keys = list(train_dataset.keys())
    train_dataset = [
        dict(zip(keys, v)) for v in zip(*train_dataset.values())
    ]
    test_dataset = [
        dict(zip(keys, v)) for v in zip(*test_dataset.values())
    ]
    train_dataset = WellnessDataset(train_dataset)
    test_dataset = WellnessDataset(test_dataset)
    return train_dataset, test_dataset
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    tokenizer = AutoTokenizer.from_pretrained(args.tr)
    print(pred_ids)
    print(labels_ids)
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # Replace the -100 padding used by the loss before decoding the references.
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str,
                                 references=label_str,
                                 rouge_types=["rouge2"])["rouge2"].mid
    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }
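# A minimal sketch of how a metrics callback like this is typically wired in, assuming a
# Seq2SeqTrainer set up with generation enabled; `model`, `training_args`, and the two
# datasets are placeholders, not part of the original code.
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,  # the function defined above
)
trainer.evaluate()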
def __init__(
    self,
    model_name: str,
    add_special_tokens: bool = True,
    max_length: Optional[int] = None,
    stride: int = 0,
    truncation_strategy: str = "longest_first",
    calculate_character_offsets: bool = False,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    if tokenizer_kwargs is None:
        tokenizer_kwargs = {}
    else:
        tokenizer_kwargs = tokenizer_kwargs.copy()
    if "use_fast" in tokenizer_kwargs:
        if tokenizer_kwargs["use_fast"]:
            logger.warning(
                "Fast huggingface tokenizers are known to break in certain scenarios."
            )
    else:
        # As of transformers==2.8.0, fast tokenizers are broken.
        tokenizer_kwargs["use_fast"] = False
    self.tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)

    # Huggingface tokenizers have different ways of remembering whether they lowercase or not.
    # Detecting it this way seems like the least brittle way to do it.
    tokenized = self.tokenizer.tokenize(
        "A")  # Use a single character that won't be cut into word pieces.
    detokenized = " ".join(tokenized)
    self._tokenizer_lowercases = "a" in detokenized

    self._add_special_tokens = add_special_tokens
    self._max_length = max_length
    self._stride = stride
    self._truncation_strategy = truncation_strategy
    self._calculate_character_offsets = calculate_character_offsets

    (
        self.num_added_start_tokens,
        self.num_added_middle_tokens,
        self.num_added_end_tokens,
    ) = self._determine_num_special_tokens_added()
def run_inference(ckpt_path=None):
    ckpt_path = "lightning_logs/version_0/checkpoints/epoch=24.ckpt"
    import pprint
    import random

    parser = ArgumentParser()
    parser = Trainer.add_argparse_args(parser)
    parser = WellnessClassifier.add_model_specific_args(parser)
    args = parser.parse_args()

    model = WellnessClassifier(args, 359)
    ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(ckpt['state_dict'])

    model_name_or_path = "monologg/koelectra-base-discriminator"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    utters, label_map, answers = prep.load()
    label_map = {v: k for k, v in label_map.items()}

    samples = [
        "벽에 머리를 부딪히는 느낌이야",
        "허리가 아파서 움직임이 어렵네ㅠㅠ",
        "집중력도 떨어지고 기분이 좋지 않아",
        "나는 화가 통제가 안돼!",
        "히잉?",
        "나 자해 할거야",
        "팔다리가 너무 저려",
        "방에만 있고 싶어",
        "스트레스 너무 많이 받아서 잠이 안와",
        "난바부야 기억을 하나두 못하겠어",
        "다이어트 하고싶은데 맘처럼 안되네",
        "요즘은 이상한 생각이 많이 들어",
        "부정적인 생각이 많이 드네",
        "사고 휴유증이 있는걸까",
        "체력이 떨어져서 문제야",
        "으악! 꽥!",
        "요즘 비둘기 무서워",
        "감정이 왔다갔다해요.",
        "화가 많이 날때는 감정 조절이 안되어여",
        "요즘 잠이 안와요",
        "입맛도 통 없구",
        "기분이 우울해서 큰일이야",
        "나는 아무것도 잘한게 없는걸?",
        "모든걸 내 마음대로 하고 싶을 때 있잖아",
        "무엇이 불안한지 잘 모르겠어",
    ]

    model.eval()
    inputs_ = tokenizer(samples, return_tensors='pt',
                        padding='max_length', truncation=True)
    ixs = torch.argmax(model(**inputs_)[0], dim=-1).tolist()
    res = [random.choice(answers.get(ix, ['None'])) for ix in ixs]
    labels = [label_map[ix] for ix in ixs]
    for s, l, r in zip(samples, labels, res):
        print(f"{l}|{s}=>{r}")
def test_long_sequence_splitting(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)
    string_specials = "[CLS] AllenNLP is great [SEP]"
    string_no_specials = "AllenNLP is great"
    tokens = tokenizer.tokenize(string_specials)
    expected_ids = tokenizer.convert_tokens_to_ids(tokens)
    assert len(expected_ids) == 7  # just to make sure it's what we're expecting
    cls_id, sep_id = expected_ids[0], expected_ids[-1]
    expected_ids = (expected_ids[:3] + [sep_id, cls_id] +
                    expected_ids[3:5] + [sep_id, cls_id] +
                    expected_ids[5:])
    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
    assert indexed["token_ids"] == expected_ids
    assert indexed["segment_concat_mask"] == [1] * len(expected_ids)
    assert indexed["mask"] == [1] * 7  # original length
def __init__(self,
             model_name: str,
             namespace: str = "tags",
             max_length: int = None,
             **kwargs) -> None:
    super().__init__(**kwargs)
    self._namespace = namespace
    self._tokenizer = AutoTokenizer.from_pretrained(model_name)
    self._added_to_vocabulary = False
    (
        self._num_added_start_tokens,
        self._num_added_end_tokens,
    ) = self.__class__.determine_num_special_tokens_added(self._tokenizer)

    self._max_length = max_length
    if self._max_length is not None:
        self._effective_max_length = (  # we need to take into account special tokens
            self._max_length - self._tokenizer.num_added_tokens())
        if self._effective_max_length <= 0:
            raise ValueError(
                "max_length needs to be greater than the number of special tokens inserted."
            )
def __init__(
    self,
    model_name: str,
    add_special_tokens: bool = True,
    max_length: int = None,
    stride: int = 0,
    truncation_strategy: str = "longest_first",
    calculate_character_offsets: bool = False,
) -> None:
    self._tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Huggingface tokenizers have different ways of remembering whether they lowercase or not.
    # Detecting it this way seems like the least brittle way to do it.
    tokenized = self._tokenizer.tokenize(
        "FOO")  # Use a short word that's unlikely to be cut into word pieces.
    detokenized = " ".join(tokenized)
    self._tokenizer_lowercases = "foo" in detokenized

    self._add_special_tokens = add_special_tokens
    self._max_length = max_length
    self._stride = stride
    self._truncation_strategy = truncation_strategy
    self._calculate_character_offsets = calculate_character_offsets
def __init__(self, model_name):
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.model = BertModel.from_pretrained(model_name).eval()
    self.model.cuda()
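# A minimal usage sketch, not part of the original class: tokenize a sentence and run it
# through the model to get the last hidden states. The `embed` method name is an
# assumption, and the callable-tokenizer / tuple-output style assumes the transformers
# version the snippet above was written against.
def embed(self, text):
    inputs = self.tokenizer(text, return_tensors="pt")
    inputs = {k: v.cuda() for k, v in inputs.items()}
    with torch.no_grad():
        last_hidden_state = self.model(**inputs)[0]  # shape: (1, seq_len, hidden_size)
    return last_hidden_state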
def __init__(self,
             model_name: str,
             do_lowercase: bool,
             namespace: str = "tags",
             token_min_padding_length: int = 0,
             max_pieces: int = 512,
             use_starting_offsets: bool = True,
             truncate_long_sequences: bool = True,
             start_sub_words: List[str] = None,
             end_sub_words: List[str] = None,
             separator_sub_word: str = "[SEP]",
             never_lowercase: List[str] = None) -> None:
    super().__init__(token_min_padding_length)
    if model_name.endswith("-cased") and do_lowercase:
        logger.warning("Your pretrained model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif model_name.endswith("-uncased") and not do_lowercase:
        logger.warning("Your pretrained model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")
    self._model_name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(
        model_name, do_lower_case=do_lowercase)
    self._namespace = namespace
    self._added_to_vocabulary = False
    self._padding_value = self.tokenizer.convert_tokens_to_ids(
        [self.tokenizer.pad_token])[0]
    logger.info(
        f"Using token indexer padding value of {self._padding_value}")
    self._never_lowercase = never_lowercase or []
    self._use_starting_offsets = use_starting_offsets
    self._max_pieces = max_pieces
    self._truncate_long_sequences = truncate_long_sequences
    self._do_lowercase = do_lowercase

    if start_sub_words:
        self._start_sub_words = start_sub_words
    else:
        if 'roberta' in model_name:
            self._start_sub_words = ['<s>']
        elif 'bert' in model_name:
            self._start_sub_words = ['[CLS]']
        elif 'xlm' in model_name:
            self._start_sub_words = ['</s>']
        elif 'gpt' in model_name or 'transfo' in model_name:
            self._start_sub_words = []
        else:
            raise ValueError("strange input")

    if end_sub_words:
        self._end_sub_words = end_sub_words
    else:
        if 'roberta' in model_name or 'xlm' in model_name:
            self._end_sub_words = ['</s>']
        elif 'bert' in model_name:
            self._end_sub_words = ['[SEP]']
        elif 'gpt' in model_name or 'transfo' in model_name:
            self._end_sub_words = []
        else:
            raise ValueError("strange input")

    # Sub-words are padded with the padding_value above; everything else is padded with this value.
    self._other_padding_value = 0

    self._start_sub_word_ids = [
        self.tokenizer.convert_tokens_to_ids(sub_word)
        for token in (self._start_sub_words or [])
        for sub_word in self.tokenizer.tokenize(token)
    ]
    self._end_sub_word_ids = [
        self.tokenizer.convert_tokens_to_ids(sub_word)
        for token in (self._end_sub_words or [])
        for sub_word in self.tokenizer.tokenize(token)
    ]
    self._separator_ids = [
        self.tokenizer.convert_tokens_to_ids(sub_word)
        for sub_word in self.tokenizer.tokenize(separator_sub_word)
    ]
def single_format_to_bert(params):
    # length_limit = 510
    read_json_file, wt_pt_file, oracle_mode, oracle_sent_num, bert_model_name, \
        min_src_ntokens, max_src_ntokens, min_nsents, max_nsents, length_limit = params
    # print(read_json_file)
    # TODO keep file exist check
    from transformers.tokenization_auto import AutoTokenizer

    # tokenizer = plm_tokenizers[bert_model_name]
    tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    if os.path.exists(wt_pt_file):
        logger.info('Ignore %s' % wt_pt_file)
        return
    print("working on {}".format(wt_pt_file))
    bert_data = MSBertData(min_src_ntokens, max_src_ntokens, min_nsents,
                           max_nsents, tokenizer, bert_model_name)

    logger.info('Processing %s' % read_json_file)
    jobs = json.load(open(read_json_file))
    datasets = []
    for d in jobs:
        disco_dep = d['disco_dep']
        # disco_graph_links = d['disco_graph_links']
        disco_links = d['disco_link']
        tgt_list_str = d['tgt_list_str']
        tgt_tok_list_list_str = d['tgt_tok_list_list_str']
        span = d['disco_span']
        sent, doc_id, coref = d['sent'], d['doc_id'], d['coref']

        # First of all, assemble the data and truncate to the LENGTH budget
        budget = 0
        disco_bag = []
        sent_bag = []
        original_disco_txt_list_of_str = []
        for idx in range(len(sent)):
            this_sent = sent[idx]
            this_disco = span[idx]
            this_tokens = this_sent['tokens']
            this_tokens = [clean(x.lower()) for x in this_tokens]
            this_coref = this_sent['corefs']
            original_word_len = len(this_tokens)

            tmp_disco_bag = []
            for disc in this_disco:
                # tree_node = next(gen)
                start, end = disc
                disc_piece = DiscourseUnit(len(disco_bag) + len(tmp_disco_bag),
                                           idx, rel_start=start, rel_end=end)
                # disc_piece.add_dep_info(tree_node)
                disc_piece.add_dep(disco_dep)
                for jdx in range(start, end + 1):
                    _toks = this_tokens[jdx]
                    disc_piece.add_word(_toks, tokenizer)
                    # look at word jdx, see if any coref mentions apply
                    _cor = this_coref[jdx]
                    if _cor != []:
                        # add the original index of the word in the sentence
                        disc_piece.add_mention(jdx)
                        for _c in _cor:
                            disc_piece.add_coref(_c)
                # finish loading coref
                tmp_disco_bag.append(disc_piece)
                budget += disc_piece.get_bert_wp_length()
            budget += 2
            if budget > length_limit:
                break
            else:
                disco_bag += tmp_disco_bag
                original_disco_txt_list_of_str += [
                    x.get_readable_words_as_list() for x in tmp_disco_bag
                ]
                s = SentUnit(idx, this_tokens,
                             [x.bert_word_pieces for x in tmp_disco_bag],
                             tmp_disco_bag)
                sent_bag.append(s)

        effective_disco_number = len(disco_bag)
        # clean disco_graph_links
        disco_graph_links = [(tup[0] - 1, tup[1] - 1, tup[2])
                             for tup in disco_links
                             if (tup[0] <= effective_disco_number
                                 and tup[1] <= effective_disco_number)]

        disc_oracle_ids, disc_spans, disc_coref = bert_data.preprocess_disc(
            disco_bag, tgt_tok_list_list_str)
        src_tok_index, sent_oracle_labels, segments_ids, \
            cls_ids, original_sent_txt_list_of_str, tgt_txt = bert_data.preprocess_sent(
                sent_bag, bert_model_name, summary=tgt_tok_list_list_str)

        # To have: src_subtoken_idxs [for bert encoder], labels [sent level and discourse level],
        # segments_ids [for bert encoder],
        # cls_ids [for sent level],
        # span indexes [for discourse level],
        # entity coref linking edges [sent level and discourse level],
        # discourse linking edges [discourse level only],
        # src_txt, tgt_txt.
        # Provide two versions, one based on discourse, one without, w. multiple oracle.
        # Prepare discourse data: the oracle is computed based on discourse.
        # Prepare sent data.
        b_data_dict = {
            "src": src_tok_index,
            "labels": sent_oracle_labels,
            "segs": segments_ids,
            'clss': cls_ids,
            'sent_txt': original_sent_txt_list_of_str,
            'disco_txt': original_disco_txt_list_of_str,
            # "tgt_txt": tgt_txt,
            "tgt_list_str": tgt_list_str,  # unchanged reference summary for computing final score
            "tgt_tok_list_list_str": tgt_tok_list_list_str,  # for oracle, tokenized
            'd_labels': disc_oracle_ids,
            'd_span': disc_spans,
            'd_coref': disc_coref,
            'd_graph': disco_graph_links,
            'disco_dep': disco_dep,
            'doc_id': doc_id
        }
        if len(src_tok_index) < 15:
            continue
        datasets.append(b_data_dict)

    logger.info('Saving to %s' % wt_pt_file)
    torch.save(datasets, wt_pt_file)
    datasets = []
    gc.collect()
def _reverse_engineer_special_tokens(
    self,
    token_a: str,
    token_b: str,
    model_name: str,
    tokenizer_kwargs: Optional[Dict[str, Any]],
):
    # storing the special tokens
    self.sequence_pair_start_tokens = []
    self.sequence_pair_mid_tokens = []
    self.sequence_pair_end_tokens = []
    # storing token type ids for the sequences
    self.sequence_pair_first_token_type_id = None
    self.sequence_pair_second_token_type_id = None

    # storing the special tokens
    self.single_sequence_start_tokens = []
    self.single_sequence_end_tokens = []
    # storing token type id for the sequence
    self.single_sequence_token_type_id = None

    # Reverse-engineer the tokenizer for two sequences
    tokenizer_with_special_tokens = AutoTokenizer.from_pretrained(
        model_name, add_special_tokens=True, **tokenizer_kwargs)
    dummy_output = tokenizer_with_special_tokens.encode_plus(
        token_a,
        token_b,
        add_special_tokens=True,
        return_token_type_ids=True,
        return_attention_mask=False,
    )
    dummy_a = self.tokenizer.encode(token_a,
                                    add_special_tokens=False,
                                    add_prefix_space=True)[0]
    assert dummy_a in dummy_output["input_ids"]
    dummy_b = self.tokenizer.encode(token_b,
                                    add_special_tokens=False,
                                    add_prefix_space=True)[0]
    assert dummy_b in dummy_output["input_ids"]
    assert dummy_a != dummy_b

    seen_dummy_a = False
    seen_dummy_b = False
    for token_id, token_type_id in zip(dummy_output["input_ids"],
                                       dummy_output["token_type_ids"]):
        if token_id == dummy_a:
            if seen_dummy_a or seen_dummy_b:  # seeing a twice or b before a
                raise ValueError(
                    "Cannot auto-determine the number of special tokens added."
                )
            seen_dummy_a = True
            assert (
                self.sequence_pair_first_token_type_id is None
                or self.sequence_pair_first_token_type_id == token_type_id
            ), "multiple different token type ids found for the first sequence"
            self.sequence_pair_first_token_type_id = token_type_id
            continue

        if token_id == dummy_b:
            if seen_dummy_b:  # seeing b twice
                raise ValueError(
                    "Cannot auto-determine the number of special tokens added."
                )
            seen_dummy_b = True
            assert (
                self.sequence_pair_second_token_type_id is None
                or self.sequence_pair_second_token_type_id == token_type_id
            ), "multiple different token type ids found for the second sequence"
            self.sequence_pair_second_token_type_id = token_type_id
            continue

        token = Token(
            tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
            text_id=token_id,
            type_id=token_type_id,
        )
        if not seen_dummy_a:
            self.sequence_pair_start_tokens.append(token)
        elif not seen_dummy_b:
            self.sequence_pair_mid_tokens.append(token)
        else:
            self.sequence_pair_end_tokens.append(token)

    assert (len(self.sequence_pair_start_tokens) +
            len(self.sequence_pair_mid_tokens) +
            len(self.sequence_pair_end_tokens)
            ) == self.tokenizer.num_special_tokens_to_add(pair=True)

    # Reverse-engineer the tokenizer for one sequence
    dummy_output = tokenizer_with_special_tokens.encode_plus(
        token_a,
        add_special_tokens=True,
        return_token_type_ids=True,
        return_attention_mask=False,
        add_prefix_space=True,
    )

    seen_dummy_a = False
    for token_id, token_type_id in zip(dummy_output["input_ids"],
                                       dummy_output["token_type_ids"]):
        if token_id == dummy_a:
            if seen_dummy_a:
                raise ValueError(
                    "Cannot auto-determine the number of special tokens added."
                )
            seen_dummy_a = True
            assert (
                self.single_sequence_token_type_id is None
                or self.single_sequence_token_type_id == token_type_id
            ), "multiple different token type ids found for the sequence"
            self.single_sequence_token_type_id = token_type_id
            continue

        token = Token(
            tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
            text_id=token_id,
            type_id=token_type_id,
        )
        if not seen_dummy_a:
            self.single_sequence_start_tokens.append(token)
        else:
            self.single_sequence_end_tokens.append(token)

    assert (len(self.single_sequence_start_tokens) +
            len(self.single_sequence_end_tokens)
            ) == self.tokenizer.num_special_tokens_to_add(pair=False)
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    inputs = {
        "question": args.question,
        "context": args.context,
    }
    logger.info("Question : " + str(args.question))
    logger.info("Context : " + str(args.context))

    # Set default values
    handle_impossible_answer = False
    topk = 1
    max_answer_len = 15

    tokenizer = AutoTokenizer.from_pretrained('deepset/roberta-base-squad2')

    # Convert inputs to features
    examples = []
    if True:  # for i, item in enumerate(inputs):
        item = inputs
        logger.debug(item)
        if isinstance(item, dict):
            if any(k not in item for k in ["question", "context"]):
                raise KeyError("You need to provide a dictionary with keys "
                               "{question:..., context:...}")
            example = create_sample(**item)
            examples.append(example)

    features_list = [
        squad_convert_examples_to_features(
            examples=[example],
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            padding_strategy=PaddingStrategy.DO_NOT_PAD.value,
            is_training=False,
            tqdm_enabled=False,
        ) for example in examples
    ]

    all_answers = []
    for features, example in zip(features_list, examples):
        model_input_names = tokenizer.model_input_names + ["input_ids"]
        fw_args = {
            k: [feature.__dict__[k] for feature in features]
            for k in model_input_names
        }
        fw_args = {k: np.array(v) for (k, v) in fw_args.items()}
        logger.debug("Input" + str(fw_args))
        logger.debug("Shape" + str(fw_args["input_ids"].shape))

        net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)
        net.set_input_shape(fw_args["input_ids"].shape)

        if args.benchmark:
            logger.info('BENCHMARK mode')
            for i in range(5):
                start = int(round(time.time() * 1000))
                outputs = net.predict(fw_args)
                end = int(round(time.time() * 1000))
                logger.info("\tailia processing time {} ms".format(end - start))
        else:
            outputs = net.predict(fw_args)
        logger.debug("Output" + str(outputs))

        start, end = outputs[0:2]

        min_null_score = 1000000  # large and positive
        answers = []
        for (feature, start_, end_) in zip(features, start, end):
            # Ensure padded tokens & question tokens cannot belong
            # to the set of candidate answers.
            undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & \
                feature.attention_mask

            # Generate mask
            undesired_tokens_mask = undesired_tokens == 0.0

            # Make sure non-context indexes in the tensor cannot contribute
            # to the softmax
            start_ = np.where(undesired_tokens_mask, -10000.0, start_)
            end_ = np.where(undesired_tokens_mask, -10000.0, end_)

            # Normalize logits and spans to retrieve the answer
            start_ = np.exp(
                start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
            end_ = np.exp(
                end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))

            if handle_impossible_answer:
                min_null_score = min(min_null_score, (start_[0] * end_[0]).item())

            # Mask CLS
            start_[0] = end_[0] = 0.0

            starts, ends, scores = decode(start_, end_, topk, max_answer_len)

            char_to_word = np.array(example.char_to_word_offset)

            # Convert the answer (tokens) back to the original text
            t2org = feature.token_to_orig_map
            answers += [{
                "score": score.item(),
                "start": np.where(char_to_word == t2org[s])[0][0].item(),
                "end": np.where(char_to_word == t2org[e])[0][-1].item(),
                "answer": " ".join(example.doc_tokens[t2org[s]:t2org[e] + 1]),
            } for s, e, score in zip(starts, ends, scores)]

        if handle_impossible_answer:
            answers.append({
                "score": min_null_score,
                "start": 0,
                "end": 0,
                "answer": ""
            })
        answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:topk]
        all_answers += answers

    logger.info("Answer : " + str(all_answers))
    logger.info('Script finished successfully.')
def pipeline(task: str,
             model: Optional = None,
             config: Optional[Union[str, PretrainedConfig]] = None,
             tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
             modelcard: Optional[Union[str, ModelCard]] = None,
             device=torch.device("cpu"),
             **kwargs) -> Pipeline:
    """
    Utility factory method to build a pipeline.

    Pipelines are made of:
        - a tokenizer instance in charge of mapping raw textual input to tokens
        - a model instance
        - some (optional) post-processing for enhancing the model's output

    Examples:
        pipeline('sentiment-analysis')
    """
    # Register all the supported tasks here
    SUPPORTED_TASKS = {
        "sentiment-analysis": {
            "impl": TextClassificationPipelineMod,
            "pt": AutoModelForSequenceClassification,  # if is_torch_available() else None,
            "default": {
                "model": {
                    "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                },
                "config": "distilbert-base-uncased-finetuned-sst-2-english",
                "tokenizer": "distilbert-base-uncased",
            },
        },
    }

    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(
            task, list(SUPPORTED_TASKS.keys())))

    framework = "pt"  # get_framework(model)
    targeted_task = SUPPORTED_TASKS[task]
    task, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        models, config, tokenizer = tuple(targeted_task["default"].values())
        model = models[framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = model
        elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = config
        else:
            # Impossible to guess what the right tokenizer is here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
            )

    # Try to infer modelcard from model or config name (if provided as str)
    if modelcard is None:
        # Try to fall back on one of the provided strings for model or config (will replace the suffix)
        if isinstance(model, str):
            modelcard = model
        elif isinstance(config, str):
            modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch.")
        model = model_class.from_pretrained(model, config=config, **model_kwargs)
    model = model.to(device)
    model.device = device

    return task(model=model,
                tokenizer=tokenizer,
                modelcard=modelcard,
                framework=framework,
                **kwargs)
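# A minimal usage sketch of the factory above (the input sentence is just an illustrative
# placeholder): with no model given, the sentiment-analysis defaults are loaded and the
# returned TextClassificationPipelineMod is assumed to be callable like a standard pipeline.
classifier = pipeline("sentiment-analysis", device=torch.device("cpu"))
print(classifier("This library makes tokenization painless."))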
def __init__(self, model_name: str, namespace: str = "tags", **kwargs) -> None:
    super().__init__(**kwargs)
    self._namespace = namespace
    self._tokenizer = AutoTokenizer.from_pretrained(model_name)
    self._added_to_vocabulary = False
def test_determine_num_special_tokens_added(self):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    assert PretrainedTransformerIndexer.determine_num_special_tokens_added(
        tokenizer) == (1, 1)