def _ner_bert_tokenize(tokens: List[str],
                       tags: List[str],
                       tokenizer: AutoTokenizer,
                       max_subword_len: int = None,
                       mode: str = None,
                       subword_mask_mode: str = "first",
                       token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]:
    # Optional word-level masking is only applied during training.
    do_masking = (mode == 'train') and (token_masking_prob is not None)
    do_cutting = (max_subword_len is not None)
    tokens_subword = ['[CLS]']
    startofword_markers = [0]
    tags_subword = ['X']
    for token, tag in zip(tokens, tags):
        token_marker = int(tag != 'X')
        subwords = tokenizer.tokenize(token)
        if not subwords or (do_cutting and (len(subwords) > max_subword_len)):
            # Untokenizable or overly long words are replaced by a single [UNK].
            tokens_subword.append('[UNK]')
            startofword_markers.append(token_marker)
            tags_subword.append(tag)
        else:
            if do_masking and (random.random() < token_masking_prob):
                tokens_subword.extend(['[MASK]'] * len(subwords))
            else:
                tokens_subword.extend(subwords)
            # Mark either the first or the last subword of the original token.
            if subword_mask_mode == "last":
                startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker])
            else:
                startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1))
            tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

    tokens_subword.append('[SEP]')
    startofword_markers.append(0)
    tags_subword.append('X')
    return tokens_subword, startofword_markers, tags_subword
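# Hedged usage sketch (not part of the original source): how _ner_bert_tokenize might be
# called with a Hugging Face tokenizer. The model name "bert-base-cased" and the demo
# function name are illustrative assumptions, not taken from the original code.
def _demo_ner_bert_tokenize():
    from transformers import AutoTokenizer

    example_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    subtokens, markers, subtags = _ner_bert_tokenize(
        tokens=["John", "lives", "in", "Manchester"],
        tags=["B-PER", "O", "O", "B-LOC"],
        tokenizer=example_tokenizer,
    )
    # subtokens start with '[CLS]' and end with '[SEP]'; markers carry 1 only on the
    # first subword of each original word, so predictions can be re-aligned to words.
    return list(zip(subtokens, markers, subtags))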
def preprocess_text(x: str, tokenizer: AutoTokenizer, max_sequence_len: int):
    cur_x = x
    if isinstance(tokenizer, BertTokenizer):
        cur_x = "[CLS] " + cur_x
    cur_x = cur_x.replace("\n", "")
    cur_x = cur_x.replace(" cannot ", " can not ")
    cur_x = tokenizer.tokenize(cur_x)
    cur_x = tokenizer.convert_tokens_to_ids(cur_x)
    cur_x = cur_x[:max_sequence_len]
    cur_x = cur_x + [0] * (max_sequence_len - len(cur_x))
    return cur_x
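# Hedged usage sketch (not from the original source): preprocess_text maps one string to a
# fixed-length id sequence, truncated or zero-padded to max_sequence_len. The checkpoint name
# "bert-base-uncased" and the demo function name are illustrative assumptions.
def _demo_preprocess_text():
    from transformers import BertTokenizer

    tok = BertTokenizer.from_pretrained("bert-base-uncased")
    ids = preprocess_text("I cannot attend the meeting.\n", tok, max_sequence_len=16)
    assert len(ids) == 16  # always exactly max_sequence_len
    return ids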
def convert_example_to_feature(
        example,
        tokenizer: AutoTokenizer,
        chineseandpunctuationextractor: ChineseAndPunctuationExtractor,
        label_map,
        max_length: Optional[int] = 512,
        pad_to_max_length: Optional[bool] = None):
    spo_list = example['spo_list'] if "spo_list" in example.keys() else None
    text_raw = example['text']

    sub_text = []  # holds Chinese characters / punctuation, one element per character
    buff = ""  # accumulates runs of non-Chinese characters
    for char in text_raw:
        if chineseandpunctuationextractor.is_chinese_or_punct(char):
            if buff != "":
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
        else:
            buff += char
    if buff != "":
        sub_text.append(buff)

    tok_to_orig_start_index = []
    tok_to_orig_end_index = []
    orig_to_tok_index = []
    tokens = []
    text_tmp = ''
    for (i, token) in enumerate(sub_text):
        orig_to_tok_index.append(len(tokens))
        sub_tokens = tokenizer.tokenize(token)
        text_tmp += token
        for sub_token in sub_tokens:
            tok_to_orig_start_index.append(len(text_tmp) - len(token))
            tok_to_orig_end_index.append(len(text_tmp) - 1)
            tokens.append(sub_token)
            if len(tokens) >= max_length - 2:
                break
        else:
            continue
        break  # stop once the token budget is reached

    seq_len = len(tokens)
    # 2 tags for each predicate + I tag + O tag
    num_labels = 2 * (len(label_map.keys()) - 2) + 2
    # initialize tags: every token gets a label vector used for prediction
    labels = [[0] * num_labels for i in range(seq_len)]
    if spo_list is not None:
        labels = parse_label(spo_list, label_map, tokens, tokenizer)

    # add [CLS] and [SEP] tokens; they are tagged "O" for outside
    if seq_len > max_length - 2:
        tokens = tokens[0:(max_length - 2)]
        labels = labels[0:(max_length - 2)]
        tok_to_orig_start_index = tok_to_orig_start_index[0:(max_length - 2)]
        tok_to_orig_end_index = tok_to_orig_end_index[0:(max_length - 2)]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # "O" tag for [PAD], [CLS], [SEP] tokens
    outside_label = [[1] + [0] * (num_labels - 1)]
    labels = outside_label + labels + outside_label
    tok_to_orig_start_index = [-1] + tok_to_orig_start_index + [-1]
    tok_to_orig_end_index = [-1] + tok_to_orig_end_index + [-1]
    if seq_len < max_length:
        tokens = tokens + ["[PAD]"] * (max_length - seq_len - 2)
        labels = labels + outside_label * (max_length - len(labels))
        tok_to_orig_start_index = tok_to_orig_start_index + [-1] * (
            max_length - len(tok_to_orig_start_index))
        tok_to_orig_end_index = tok_to_orig_end_index + [-1] * (
            max_length - len(tok_to_orig_end_index))

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return InputFeature(
        input_ids=np.array(token_ids),
        seq_len=np.array(seq_len),
        tok_to_orig_start_index=np.array(tok_to_orig_start_index),
        tok_to_orig_end_index=np.array(tok_to_orig_end_index),
        labels=np.array(labels),
    )
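# Hedged illustration (not from the original source) of the pre-splitting step above: each
# Chinese character or punctuation mark becomes its own unit, while runs of other characters
# (Latin letters, digits) stay together. _is_chinese_or_punct below is a simplified stand-in
# for ChineseAndPunctuationExtractor.is_chinese_or_punct, not the real implementation.
def _split_mixed_text(text_raw: str) -> List[str]:
    def _is_chinese_or_punct(ch: str) -> bool:
        return '\u4e00' <= ch <= '\u9fff' or ch in ',。!?、;:()'

    sub_text, buff = [], ""
    for char in text_raw:
        if _is_chinese_or_punct(char):
            if buff:
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
        else:
            buff += char
    if buff:
        sub_text.append(buff)
    return sub_text

# _split_mixed_text("BERT模型2018年发布") -> ['BERT', '模', '型', '2018', '年', '发', '布']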
class TorchTransformersSquadInfer(Component):
    """This model wraps BertSQuADModel to make predictions on sequences longer than 512 tokens.

    It splits the context into chunks of `max_seq_length - 3 - len(question)` length, preserving
    sentence boundaries, and reassembles batches from chunks instead of full contexts to optimize
    performance. For example:

        batch_size = 5
        number_of_contexts == 2
        number of first context chunks == 8
        number of second context chunks == 2

        we will create two batches with 5 chunks

    For each context the best answer is selected via logits or scores from BertSQuADModel.

    Args:
        squad_model_config: path to DeepPavlov BertSQuADModel config file
        vocab_file: path to Bert vocab file
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        batch_size: size of batch to use during inference
        lang: either `en` or `ru`, used to select the sentence tokenizer
    """

    def __init__(self, squad_model_config: str,
                 vocab_file: str,
                 do_lower_case: bool,
                 max_seq_length: int = 512,
                 batch_size: int = 10,
                 lang: str = 'en', **kwargs) -> None:
        config = json.load(open(squad_model_config))
        config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
        self.model = build_model(config)
        self.max_seq_length = max_seq_length

        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            # Note: AutoTokenizer cannot be constructed directly from a vocab file; a concrete
            # tokenizer class (e.g. BertTokenizer(vocab_file=...)) is likely intended here.
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

        self.batch_size = batch_size

        if lang == 'en':
            from nltk import sent_tokenize
            self.sent_tokenizer = sent_tokenize
        elif lang == 'ru':
            from ru_sent_tokenize import ru_sent_tokenize
            self.sent_tokenizer = ru_sent_tokenize
        else:
            raise RuntimeError('only en and ru languages are supported')

    def __call__(self, contexts: List[str], questions: List[str],
                 **kwargs) -> Tuple[List[str], List[int], List[float]]:
        """Get predictions for given contexts and questions.

        Args:
            contexts: batch of contexts
            questions: batch of questions

        Returns:
            predictions: answer, answer start position, logits or scores
        """
        batch_indices = []
        contexts_to_predict = []
        questions_to_predict = []
        predictions = {}
        for i, (context, question) in enumerate(zip(contexts, questions)):
            context_subtokens = self.tokenizer.tokenize(context)
            question_subtokens = self.tokenizer.tokenize(question)
            max_chunk_len = self.max_seq_length - len(question_subtokens) - 3
            if 0 < max_chunk_len < len(context_subtokens):
                # Split the context into sentence-aligned chunks and remember which
                # original context each chunk came from.
                number_of_chunks = math.ceil(len(context_subtokens) / max_chunk_len)
                sentences = self.sent_tokenizer(context)
                for chunk in np.array_split(sentences, number_of_chunks):
                    contexts_to_predict += [' '.join(chunk)]
                    questions_to_predict += [question]
                    batch_indices += [i]
            else:
                contexts_to_predict += [context]
                questions_to_predict += [question]
                batch_indices += [i]

        for j in range(0, len(contexts_to_predict), self.batch_size):
            c_batch = contexts_to_predict[j:j + self.batch_size]
            q_batch = questions_to_predict[j:j + self.batch_size]
            ind_batch = batch_indices[j:j + self.batch_size]
            a_batch, a_st_batch, logits_batch = self.model(c_batch, q_batch)
            for a, a_st, logits, ind in zip(a_batch, a_st_batch, logits_batch, ind_batch):
                if ind in predictions:
                    predictions[ind] += [(a, a_st, logits)]
                else:
                    predictions[ind] = [(a, a_st, logits)]

        # For every original context keep the chunk answer with the highest score.
        answers, answer_starts, logits = [], [], []
        for ind in sorted(predictions.keys()):
            prediction = predictions[ind]
            best_answer_ind = np.argmax([p[2] for p in prediction])
            answers += [prediction[best_answer_ind][0]]
            answer_starts += [prediction[best_answer_ind][1]]
            logits += [prediction[best_answer_ind][2]]
        return answers, answer_starts, logits
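# Hedged sketch (not from the original source) of the chunking arithmetic used above: the
# context is split into ceil(len(context_subtokens) / max_chunk_len) roughly equal groups of
# whole sentences, so no sentence is ever broken across chunks. The helper name is illustrative.
def _split_context_into_chunks(sentences: List[str], context_subtoken_count: int,
                               max_chunk_len: int) -> List[str]:
    import math
    import numpy as np

    number_of_chunks = math.ceil(context_subtoken_count / max_chunk_len)
    return [' '.join(chunk) for chunk in np.array_split(sentences, number_of_chunks)]

# e.g. 8 sentences and 1200 context subtokens with max_chunk_len=500 -> 3 chunks of 3, 3, 2 sentences.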
def convert_examples_to_features(
        examples: List[InputExample],
        label_list: List[str],
        max_seq_length: int,
        tokenizer: AutoTokenizer,
        cls_token="[CLS]",
        cls_token_segment_id=0,
        sep_token="[SEP]",
        pad_token=0,
        pad_token_segment_id=0,
        pad_token_label_id=-100,
        sequence_a_segment_id=0,
        sequence_b_segment_id=1,
        mask_padding_with_zero=True,
        verbose=False
) -> List[InputFeatures]:
    """ Loads a data file into a list of `InputFeatures` """
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Real label id on the first subword, padding id on the rest.
                label_ids.extend([label_map[label]] +
                                 [pad_token_label_id] * (len(word_tokens) - 1))

        if len(tokens) > max_seq_length - 2:
            logger.warning("Sequence length exceeds {} (cut).".format(max_seq_length))
            tokens = tokens[:(max_seq_length - 2)]
            label_ids = label_ids[:(max_seq_length - 2)]

        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        seq_length = len(input_ids)
        padding_length = max_seq_length - len(input_ids)
        input_ids += [pad_token] * padding_length
        input_mask += [0 if mask_padding_with_zero else 1] * padding_length
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length
        decoder_mask = [(x != pad_token_label_id) for x in label_ids]

        if verbose and ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: {} (length: {})".format(example.guid, seq_length))
            logger.info("tokens: %s", " ".join([str(x) for x in tokens[:seq_length]]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids[:seq_length]]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids[:seq_length]]))
            logger.info("decoder_mask: %s", " ".join([str(x) for x in decoder_mask[:seq_length]]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids,
                label_ids=label_ids,
                decoder_mask=decoder_mask
            )
        )
    return features
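# Hedged illustration (not from the original source) of the subword/label alignment shared by
# the converters in this file: only the first subword of each word keeps the real label id,
# and the remaining subwords get pad_token_label_id (-100), which a loss configured with
# ignore_index=-100 (e.g. torch.nn.CrossEntropyLoss) skips. The helper name is illustrative.
def _align_labels_to_subwords(words: List[str], labels: List[str], label_map: dict,
                              tokenizer, pad_token_label_id: int = -100):
    token_ids, label_ids = [], []
    for word, label in zip(words, labels):
        subwords = tokenizer.tokenize(word) or [tokenizer.unk_token]
        token_ids.extend(tokenizer.convert_tokens_to_ids(subwords))
        # Real label only on the first subword; the rest are ignored by the loss.
        label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(subwords) - 1))
    return token_ids, label_ids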
def convert_examples_to_features(examples: List[InputExample],
                                 label_list: List[str],
                                 max_seq_length: int,
                                 tokenizer: AutoTokenizer,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=0,
                                 sep_token="[SEP]",
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-100,
                                 sequence_a_segment_id=0,
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True,
                                 verbose=False) -> List[InputFeatures]:
    """ Loads a data file into a list of `InputFeatures` """
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        prod_start_index = prod_end_index = -1
        for wid, (word, label) in enumerate(zip(example.words, example.labels)):
            # Insert marker tokens around the product ("arm_description") span.
            if label == "B-arm_description":
                prod_start_index = len(tokens)
                tokens.append(PROD_START_MARKER)
                label_ids.append(pad_token_label_id)
            elif prod_start_index >= 0 and prod_end_index < 0 and label != "I-arm_description":
                prod_end_index = len(tokens)
                tokens.append(PROD_END_MARKER)
                label_ids.append(pad_token_label_id)
            word_tokens = tokenizer.tokenize(word)
            word_tokens = word_tokens[:5]  # avoid long chemical names
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word and padding ids for
                # the remaining tokens. Unknown labels are skipped (used by semi-supervised
                # training with partial annotations).
                label_ids.extend([label_map.get(label, pad_token_label_id)] +
                                 [pad_token_label_id] * (len(word_tokens) - 1))

        # Product at the end of the sequence
        if prod_start_index >= 0 and prod_end_index < 0:
            prod_end_index = len(tokens)
            tokens.append(PROD_END_MARKER)
            label_ids.append(pad_token_label_id)
        assert prod_start_index >= 0
        assert prod_end_index >= 0

        # Account for [CLS] and [SEP] with "- 2" (and with "- 3" for RoBERTa).
        if len(tokens) > max_seq_length - 2:  # [CLS], [SEP]
            logger.info(
                "Sentence length exceeds max_seq_length: {} ({})".format(
                    " ".join(tokens), len(tokens)))
            # This will fail if the PROD span is cut off.
            tokens = tokens[:(max_seq_length - 2)]
            label_ids = label_ids[:(max_seq_length - 2)]

        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids
        prod_start_index += 1  # cls_token added to the beginning
        prod_end_index += 1

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        prod_start_mask = [0 for i in range(len(input_ids))]
        prod_start_mask[prod_start_index] = 1
        prod_end_mask = [0 for i in range(len(input_ids))]
        prod_end_mask[prod_end_index] = 1
        prod_mask = [0 for i in range(len(input_ids))]
        prod_mask[prod_start_index:prod_end_index + 1] = [1] * (prod_end_index + 1 - prod_start_index)
        # Optionally set segment ids for the product span:
        # segment_ids[prod_start_index:prod_end_index + 1] = [1] * (prod_end_index + 1 - prod_start_index)

        # Zero-pad up to the sequence length.
        seq_length = len(input_ids)
        padding_length = max_seq_length - seq_length
        input_ids += [pad_token] * padding_length
        input_mask += [0 if mask_padding_with_zero else 1] * padding_length
        prod_start_mask += [0 if mask_padding_with_zero else 1] * padding_length
        prod_end_mask += [0 if mask_padding_with_zero else 1] * padding_length
        prod_mask += [0 if mask_padding_with_zero else 1] * padding_length
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length
        decoder_mask = [(x != pad_token_label_id) for x in label_ids]

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(prod_start_mask) == max_seq_length
        assert len(prod_end_mask) == max_seq_length
        assert len(prod_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if verbose and ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: {} (length: {})".format(example.guid, seq_length))
            logger.info("tokens: " + " ".join([str(x) for x in tokens[:seq_length]]))
            logger.info("input_ids: " + " ".join([str(x) for x in input_ids[:seq_length]]))
            logger.info("label_ids: " + " ".join([str(x) for x in label_ids[:seq_length]]))
            logger.info("decoder_mask: " + " ".join([str(x) for x in decoder_mask[:seq_length]]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=input_mask,
                          prod_start_mask=prod_start_mask,
                          prod_end_mask=prod_end_mask,
                          prod_mask=prod_mask,
                          token_type_ids=segment_ids,
                          label_ids=label_ids,
                          decoder_mask=decoder_mask))
    return features
def convert_examples_to_features(
    examples: List[InputExample],
    max_seq_len: int,
    tokenizer: AutoTokenizer,
    pad_token_label_id: int = -100,
    cls_token_segment_id: int = 0,
    pad_token_segment_id: int = 0,
    sequence_a_segment_id: int = 0,
    mask_padding_with_zero: bool = True,
) -> List[InputFeatures]:
    # Settings based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logging.debug("Processing example %d of %d", ex_index, len(examples))

        # Tokenize word by word (for NER)
        tokens: List[str] = []
        slot_labels_ids = []
        pos_labels_ids = []
        np_labels_ids, vp_labels_ids, entity_labels_ids, acronym_labels_ids = [], [], [], []
        for (
            word,
            slot_label,
            pos_label,
            np_label,
            vp_label,
            entity_label,
            acronym_label,
        ) in zip(
            example.words,
            example.slot_labels,
            example.pos_labels,
            example.np_labels,
            example.vp_labels,
            example.entity_labels,
            example.acronym_labels,
        ):
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # Handle a badly encoded word
                word_tokens = [unk_token]
            tokens.extend(word_tokens)
            # Use the real label ID for the first token of the word, and padding IDs for the
            # remaining tokens.
            slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            pos_labels_ids.extend([int(pos_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            np_labels_ids.extend([int(np_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            vp_labels_ids.extend([int(vp_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            entity_labels_ids.extend([int(entity_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            acronym_labels_ids.extend([int(acronym_label)] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP].
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            slot_labels_ids = slot_labels_ids[:(max_seq_len - special_tokens_count)]
            pos_labels_ids = pos_labels_ids[:(max_seq_len - special_tokens_count)]
            np_labels_ids = np_labels_ids[:(max_seq_len - special_tokens_count)]
            vp_labels_ids = vp_labels_ids[:(max_seq_len - special_tokens_count)]
            entity_labels_ids = entity_labels_ids[:(max_seq_len - special_tokens_count)]
            acronym_labels_ids = acronym_labels_ids[:(max_seq_len - special_tokens_count)]

        # Add [SEP] token.
        tokens += [sep_token]
        slot_labels_ids += [pad_token_label_id]
        pos_labels_ids += [pad_token_label_id]
        np_labels_ids += [pad_token_label_id]
        vp_labels_ids += [pad_token_label_id]
        entity_labels_ids += [pad_token_label_id]
        acronym_labels_ids += [pad_token_label_id]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token.
        tokens = [cls_token] + tokens
        slot_labels_ids = [pad_token_label_id] + slot_labels_ids
        pos_labels_ids = [pad_token_label_id] + pos_labels_ids
        np_labels_ids = [pad_token_label_id] + np_labels_ids
        vp_labels_ids = [pad_token_label_id] + vp_labels_ids
        entity_labels_ids = [pad_token_label_id] + entity_labels_ids
        acronym_labels_ids = [pad_token_label_id] + acronym_labels_ids
        token_type_ids = [cls_token_segment_id] + token_type_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)
        pos_labels_ids = pos_labels_ids + ([pad_token_label_id] * padding_length)
        np_labels_ids = np_labels_ids + ([pad_token_label_id] * padding_length)
        vp_labels_ids = vp_labels_ids + ([pad_token_label_id] * padding_length)
        entity_labels_ids = entity_labels_ids + ([pad_token_label_id] * padding_length)
        acronym_labels_ids = acronym_labels_ids + ([pad_token_label_id] * padding_length)

        assert len(input_ids) == max_seq_len, \
            "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, \
            "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_ids) == max_seq_len, \
            "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
        assert len(slot_labels_ids) == max_seq_len, \
            "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)
        assert len(pos_labels_ids) == max_seq_len, \
            "Error with pos labels length {} vs {}".format(len(pos_labels_ids), max_seq_len)
        assert len(np_labels_ids) == max_seq_len, \
            "Error with np labels length {} vs {}".format(len(np_labels_ids), max_seq_len)
        assert len(vp_labels_ids) == max_seq_len, \
            "Error with vp labels length {} vs {}".format(len(vp_labels_ids), max_seq_len)
        assert len(entity_labels_ids) == max_seq_len, \
            "Error with entity labels length {} vs {}".format(len(entity_labels_ids), max_seq_len)
        assert len(acronym_labels_ids) == max_seq_len, \
            "Error with acronym labels length {} vs {}".format(len(acronym_labels_ids), max_seq_len)

        intent_label_id = int(example.intent_label)

        if ex_index < 3:
            logging.debug(  # pylint: disable=logging-not-lazy
                "Example created. guid: %s, tokens: %s, input_ids: %s, "
                "attention_mask: %s, token_type_ids: %s, intent_label: %s (id = %d), "
                "slot_labels: %s, POS_labels: %s, NP_labels: %s, "
                "VP_labels: %s, entity_labels: %s, acronym_labels: %s",
                example.guid,
                " ".join([str(x) for x in tokens]),
                " ".join([str(x) for x in input_ids]),
                " ".join([str(x) for x in attention_mask]),
                " ".join([str(x) for x in token_type_ids]),
                example.intent_label,
                intent_label_id,
                " ".join([str(x) for x in slot_labels_ids]),
                " ".join([str(x) for x in pos_labels_ids]),
                " ".join([str(x) for x in np_labels_ids]),
                " ".join([str(x) for x in vp_labels_ids]),
                " ".join([str(x) for x in entity_labels_ids]),
                " ".join([str(x) for x in acronym_labels_ids]),
            )

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                intent_label_id=intent_label_id,
                slot_labels_ids=slot_labels_ids,
                pos_labels_ids=pos_labels_ids,
                np_labels_ids=np_labels_ids,
                vp_labels_ids=vp_labels_ids,
                entity_labels_ids=entity_labels_ids,
                acronym_labels_ids=acronym_labels_ids,
            ))
    return features
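# Hedged follow-up sketch (not from the original source): features produced by converters like
# the one above are typically wrapped into a torch TensorDataset for batching. The helper name
# is illustrative; field names follow the InputFeatures variant directly above and would need
# adjusting for the other variants in this file.
def _features_to_dataset(features: List[InputFeatures]):
    import torch
    from torch.utils.data import TensorDataset

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_slot_labels_ids = torch.tensor([f.slot_labels_ids for f in features], dtype=torch.long)
    return TensorDataset(all_input_ids, all_attention_mask,
                         all_token_type_ids, all_slot_labels_ids)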