import json
import math
import random
from random import Random
from typing import List, Tuple

import numpy as np
# FullTokenizer comes from the bert_dp package used by DeepPavlov's TF BERT pipelines
from bert_dp.tokenization import FullTokenizer

from deeppavlov import build_model
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.models.component import Component


def _ner_bert_tokenize(tokens: List[str],
                       mask: List[int],
                       tags: List[str],
                       tokenizer: FullTokenizer,
                       max_subword_len: int = None) -> Tuple[List[str], List[int], List[str]]:
    """Tokenize words into subwords; the mask flag and tag of each word go to its
    first subword, remaining subwords get 0 / 'X'. Words with no subwords or more
    than ``max_subword_len`` subwords are replaced by a single [UNK] with 0 / 'X'."""
    tokens_subword = ['[CLS]']
    mask_subword = [0]
    tags_subword = ['X']
    for token, flag, tag in zip(tokens, mask, tags):
        subwords = tokenizer.tokenize(token)
        if not subwords or \
                ((max_subword_len is not None) and (len(subwords) > max_subword_len)):
            tokens_subword.append('[UNK]')
            mask_subword.append(0)
            tags_subword.append('X')
        else:
            tokens_subword.extend(subwords)
            mask_subword.extend([flag] + [0] * (len(subwords) - 1))
            tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

    tokens_subword.append('[SEP]')
    mask_subword.append(0)
    tags_subword.append('X')
    return tokens_subword, mask_subword, tags_subword


def _ner_bert_tokenize(tokens: List[str],
                       mask: List[int],
                       tags: List[str],
                       tokenizer: FullTokenizer,
                       max_subword_len: int = None,
                       mode: str = None,
                       token_masking_prob: float = 0.0) -> Tuple[List[str], List[int], List[str]]:
    """Variant that keeps the original mask flag and tag for words collapsed to [UNK]
    and, in train mode, randomly replaces all subwords of a word with [MASK]."""
    tokens_subword = ['[CLS]']
    mask_subword = [0]
    tags_subword = ['X']
    for token, flag, tag in zip(tokens, mask, tags):
        subwords = tokenizer.tokenize(token)
        if not subwords or \
                ((max_subword_len is not None) and (len(subwords) > max_subword_len)):
            tokens_subword.append('[UNK]')
            mask_subword.append(flag)
            tags_subword.append(tag)
        else:
            if mode == 'train' and token_masking_prob > 0.0 and np.random.rand() < token_masking_prob:
                tokens_subword.extend(['[MASK]'] * len(subwords))
            else:
                tokens_subword.extend(subwords)
            mask_subword.extend([flag] + [0] * (len(subwords) - 1))
            tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

    tokens_subword.append('[SEP]')
    mask_subword.append(0)
    tags_subword.append('X')
    return tokens_subword, mask_subword, tags_subword


def get_context_indices(samples: List[List[str]],
                        sample_id: int,
                        subtokenizer: FullTokenizer,
                        max_subtokens_length: int,
                        left_context_rate: float = 0.5,
                        random: Random = Random(31)) -> List[int]:
    """Return indices of the sentence ``sample_id`` plus as many neighbouring
    sentences as fit into ``max_subtokens_length`` subtokens; the left neighbour is
    tried with probability ``left_context_rate``, otherwise the right one."""
    rich_sample_indices = [sample_id]

    toks = samples[sample_id]
    l_ctx = samples[:sample_id]
    r_ctx = samples[sample_id + 1:]

    subtoks_len = len([st for t in toks for st in subtokenizer.tokenize(t)])
    l_i, r_i = 0, 0
    while (l_i < len(l_ctx)) or (r_i < len(r_ctx)):
        l_rate = left_context_rate if r_i < len(r_ctx) else 1.0
        if (l_i < len(l_ctx)) and (random.random() < l_rate):
            # add one sentence from the left context
            subtoks = [st for t in l_ctx[-l_i - 1] for st in subtokenizer.tokenize(t)]
            if subtoks_len + len(subtoks) > max_subtokens_length:
                break
            subtoks_len += len(subtoks)
            rich_sample_indices = [sample_id - l_i - 1] + rich_sample_indices
            l_i += 1
        else:
            # add one sentence from the right context
            subtoks = [st for t in r_ctx[r_i] for st in subtokenizer.tokenize(t)]
            if subtoks_len + len(subtoks) > max_subtokens_length:
                break
            subtoks_len += len(subtoks)
            rich_sample_indices.append(sample_id + r_i + 1)
            r_i += 1
    return rich_sample_indices


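# Usage sketch (not in the original source): get_context_indices grows a window of
# neighbouring sentences around sample_id until the subtoken budget is spent.
# _ChunkSubtokenizer is an illustrative stand-in for a WordPiece FullTokenizer:
# it splits each word into 4-character pieces so words yield several subtokens.
class _ChunkSubtokenizer:
    def tokenize(self, token: str) -> List[str]:
        pieces = [token[i:i + 4] for i in range(0, len(token), 4)]
        # mark continuation pieces WordPiece-style, purely for readability
        return ['##' + p if i else p for i, p in enumerate(pieces)]


def _demo_get_context_indices() -> List[int]:
    samples = [['The', 'first', 'sentence', '.'],
               ['A', 'second', 'sentence', '.'],
               ['The', 'target', 'sentence', 'here', '.'],
               ['One', 'more', 'sentence', '.']]
    # with a small budget only some neighbours of sample 2 fit into the window
    return get_context_indices(samples, sample_id=2,
                               subtokenizer=_ChunkSubtokenizer(),
                               max_subtokens_length=20)

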
def _ner_bert_tokenize(tokens: List[str],
                       tags: List[str],
                       tokenizer: FullTokenizer,
                       max_subword_len: int = None,
                       mode: str = None,
                       subword_mask_mode: str = "first",
                       token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]:
    """Variant that derives start-of-word markers from the tags ('X' marks a non-word
    token) and places the marker on either the first or the last subword of each word,
    depending on ``subword_mask_mode``."""
    do_masking = (mode == 'train') and (token_masking_prob is not None)
    do_cutting = (max_subword_len is not None)
    tokens_subword = ['[CLS]']
    startofword_markers = [0]
    tags_subword = ['X']
    for token, tag in zip(tokens, tags):
        token_marker = int(tag != 'X')
        subwords = tokenizer.tokenize(token)
        if not subwords or (do_cutting and (len(subwords) > max_subword_len)):
            tokens_subword.append('[UNK]')
            startofword_markers.append(token_marker)
            tags_subword.append(tag)
        else:
            if do_masking and (random.random() < token_masking_prob):
                tokens_subword.extend(['[MASK]'] * len(subwords))
            else:
                tokens_subword.extend(subwords)
            if subword_mask_mode == "last":
                startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker])
            else:
                startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1))
            tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

    tokens_subword.append('[SEP]')
    startofword_markers.append(0)
    tags_subword.append('X')
    return tokens_subword, startofword_markers, tags_subword


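# Usage sketch (not in the original source): the final _ner_bert_tokenize variant
# maps word-level tags onto subwords. It reuses the _ChunkSubtokenizer stand-in
# defined above in place of a real FullTokenizer, so it runs without a vocab file.
def _demo_ner_bert_tokenize() -> None:
    subtokens, markers, subtags = _ner_bert_tokenize(
        tokens=['John', 'lives', 'in', 'Manhattan'],
        tags=['B-PER', 'O', 'O', 'B-LOC'],
        tokenizer=_ChunkSubtokenizer(),
        subword_mask_mode='first')
    # each word contributes exactly one marker == 1 (its first subword here);
    # [CLS], [SEP] and continuation subwords are marked 0 and tagged 'X'
    print(list(zip(subtokens, markers, subtags)))

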
class BertSQuADInferModel(Component):
    """This model wraps BertSQuADModel to make predictions on sequences longer than 512 tokens.

    It splits the context into chunks of at most `max_seq_length - 3 - len(question)` subtokens,
    preserving sentence boundaries.

    It reassembles batches from chunks instead of full contexts to optimize performance, e.g.:
        batch_size = 5
        number_of_contexts == 2
        number of first context chunks == 8
        number of second context chunks == 2
        we will create two batches with 5 chunks each

    For each context, the best answer is selected via logits or scores from BertSQuADModel.

    Args:
        squad_model_config: path to DeepPavlov BertSQuADModel config file
        vocab_file: path to BERT vocab file
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        batch_size: size of batch to use during inference
        lang: either `en` or `ru`, used to select the sentence tokenizer
    """

    def __init__(self, squad_model_config: str,
                 vocab_file: str,
                 do_lower_case: bool,
                 max_seq_length: int = 512,
                 batch_size: int = 10,
                 lang: str = 'en',
                 **kwargs) -> None:
        config = json.load(open(squad_model_config))
        config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
        self.model = build_model(config)
        self.max_seq_length = max_seq_length
        vocab_file = str(expand_path(vocab_file))
        self.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
        self.batch_size = batch_size

        if lang == 'en':
            from nltk import sent_tokenize
            self.sent_tokenizer = sent_tokenize
        elif lang == 'ru':
            from ru_sent_tokenize import ru_sent_tokenize
            self.sent_tokenizer = ru_sent_tokenize
        else:
            raise RuntimeError('only en and ru languages are supported')

    def __call__(self, contexts: List[str], questions: List[str],
                 **kwargs) -> Tuple[List[str], List[int], List[float]]:
        """Get predictions for given contexts and questions.

        Args:
            contexts: batch of contexts
            questions: batch of questions

        Returns:
            predictions: answer, answer start position, logits or scores
        """
        batch_indices = []
        contexts_to_predict = []
        questions_to_predict = []
        predictions = {}
        for i, (context, question) in enumerate(zip(contexts, questions)):
            context_subtokens = self.tokenizer.tokenize(context)
            question_subtokens = self.tokenizer.tokenize(question)
            max_chunk_len = self.max_seq_length - len(question_subtokens) - 3
            if 0 < max_chunk_len < len(context_subtokens):
                # split a long context into chunks of whole sentences
                number_of_chunks = math.ceil(len(context_subtokens) / max_chunk_len)
                sentences = self.sent_tokenizer(context)
                for chunk in np.array_split(sentences, number_of_chunks):
                    contexts_to_predict += [' '.join(chunk)]
                    questions_to_predict += [question]
                    batch_indices += [i]
            else:
                contexts_to_predict += [context]
                questions_to_predict += [question]
                batch_indices += [i]

        # run the wrapped model on fixed-size batches of chunks
        for j in range(0, len(contexts_to_predict), self.batch_size):
            c_batch = contexts_to_predict[j: j + self.batch_size]
            q_batch = questions_to_predict[j: j + self.batch_size]
            ind_batch = batch_indices[j: j + self.batch_size]
            a_batch, a_st_batch, logits_batch = self.model(c_batch, q_batch)
            for a, a_st, logits, ind in zip(a_batch, a_st_batch, logits_batch, ind_batch):
                if ind in predictions:
                    predictions[ind] += [(a, a_st, logits)]
                else:
                    predictions[ind] = [(a, a_st, logits)]

        # for every original context keep the chunk answer with the highest score
        answers, answer_starts, logits = [], [], []
        for ind in sorted(predictions.keys()):
            prediction = predictions[ind]
            best_answer_ind = np.argmax([p[2] for p in prediction])
            answers += [prediction[best_answer_ind][0]]
            answer_starts += [prediction[best_answer_ind][1]]
            logits += [prediction[best_answer_ind][2]]
        return answers, answer_starts, logits
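

# Usage sketch (not in the original source): wiring BertSQuADInferModel for
# long-context QA. The config and vocab paths below are placeholders; in a real
# setup they would point to a DeepPavlov BertSQuADModel config and its BERT
# WordPiece vocab, with the corresponding pretrained weights downloaded.
def _demo_squad_infer() -> None:
    model = BertSQuADInferModel(
        squad_model_config='/path/to/squad_bert_config.json',  # placeholder path
        vocab_file='/path/to/vocab.txt',                        # placeholder path
        do_lower_case=False,
        max_seq_length=384,
        batch_size=10,
        lang='en')
    contexts = ['A context much longer than max_seq_length subtokens ...']
    questions = ['What is the text about?']
    answers, answer_starts, scores = model(contexts, questions)
    print(answers, answer_starts, scores)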