def create_annotation(utterance: str, intent_label: str, slots_label: str) -> str:
    """Render an utterance with its intent and slot labels as a bracketed annotation string.

    The intent opens the outermost bracket; each slot (processed in order of its
    start offset) is wrapped in its own bracket pair around the covered substring.
    """
    pieces = [OPEN, escape_brackets(intent_label), " "]
    last_end = 0
    for slot in sorted(parse_slot_string(slots_label), key=lambda s: s.start):
        # Text between the previous slot (or the start) and this slot.
        pieces.append(
            escape_brackets(get_substring_from_offsets(utterance, last_end, slot.start))
        )
        # The slot itself: [label covered-text ].
        pieces.extend(
            [
                OPEN,
                escape_brackets(slot.label),
                " ",
                escape_brackets(
                    get_substring_from_offsets(utterance, slot.start, slot.end)
                ),
                " ",
                CLOSE,
            ]
        )
        last_end = slot.end
    # Trailing text after the final slot, then close the intent bracket.
    pieces.append(escape_brackets(get_substring_from_offsets(utterance, last_end, None)))
    pieces.extend([" ", CLOSE])
    return "".join(pieces)
def create_frame(intent_label, slot_names_str, utterance):
    """Build an intent Node spanning the whole utterance, with one child Node per slot."""
    slot_nodes = {
        Node(label=s.label, span=Span(s.start, s.end))
        for s in parse_slot_string(slot_names_str)
    }
    return Node(
        label=intent_label,
        span=Span(0, len(utterance)),
        children=slot_nodes,
    )
def create_frame(text, intent_label, slot_names_str, byte_len):
    """Build an intent Node covering byte_len units of text, with one child Node per slot.

    The root span is (0, byte_len) rather than len(text) — presumably byte
    offsets rather than character offsets; confirm against callers.
    """
    children = {
        Node(label=s.label, span=Span(s.start, s.end))
        for s in parse_slot_string(slot_names_str)
    }
    return Node(
        label=intent_label,
        span=Span(0, byte_len),
        children=children,
        text=text,
    )
def _unnumberize(self, preds, tokens, doc_str): """ We re-tokenize and re-numberize the raw context (doc_str) here to get doc_tokens to get access to start_idx and end_idx mappings. At this point, ans_token_start is the start index of the answer within tokens and ans_token_end is the end index. We calculate the offset of doc_tokens within tokens. Then we find the start_idx and end_idx as well as the corresponding span in the raw text using the answer token indices. """ # start_idx and end_idx are lists of char start and end positions in doc_str. doc_tokens, start_idxs, end_idxs = self.tensorizer._lookup_tokens( doc_str) # find the offsets of doc_tokens in tokens try: offset_end = tokens.index( self.tensorizer.vocab.get_pad_index()) - 1 except ValueError: offset_end = len(tokens) - 1 offset_start = list( map( lambda x: tokens[x:offset_end] == doc_tokens[:offset_end - x], range(offset_end), )).index(True) # find each answer's char idxs and strings as well pred_labels = self._process_pred(preds[offset_start:offset_end]) token_range = list(zip(start_idxs, end_idxs)) pred_slots = parse_slot_string( merge_token_labels_to_slot( token_range, pred_labels, self.tensorizer.use_bio_labels, )) ans_strs = [] ans_start_char_idxs = [] ans_end_char_idxs = [] for slot in pred_slots: # if its not an answer span, skip if slot.label in map( str, [ self.tensorizer.labels_vocab.pad_token, Slot.NO_LABEL_SLOT, ], ): continue ans_strs.append(doc_str[slot.start:slot.end]) ans_start_char_idxs.append(slot.start) ans_end_char_idxs.append(slot.end) return ans_strs, ans_start_char_idxs, ans_end_char_idxs
def create_annotation(utterance: str, intent_label: str, slots_label: str) -> str:
    """Render an utterance with its intent and slot labels as a bracketed annotation.

    Slots are emitted in order of their start offset; each one wraps the
    substring it covers in its own bracket pair inside the intent bracket.
    """
    parts = [OPEN + escape_brackets(intent_label) + " "]
    prev_end = 0
    for slot in sorted(parse_slot_string(slots_label), key=lambda s: s.start):
        # Plain text between the previous slot (or the start) and this slot.
        parts.append(escape_brackets(utterance[prev_end:slot.start]))
        # The bracketed slot itself.
        parts.append(
            OPEN
            + escape_brackets(slot.label)
            + " "
            + escape_brackets(utterance[slot.start:slot.end])
            + " "
            + CLOSE
        )
        prev_end = slot.end
    # Trailing text after the last slot, then close the intent bracket.
    parts.append(escape_brackets(utterance[prev_end:]) + " " + CLOSE)
    return "".join(parts)
def load_slots(s):
    """Thin wrapper: parse the serialized slot string s via parse_slot_string."""
    return parse_slot_string(s)
def get_slots(word_names):
    """Parse word_names into slot Nodes and return the distinct ones as a Counter.

    Deduplication happens before counting, so every surviving Node has count 1.
    """
    distinct_nodes = set(
        Node(label=s.label, span=Span(s.start, s.end))
        for s in parse_slot_string(word_names)
    )
    return Counter(distinct_nodes)