def apply(self, doc):
    """Yield the span that follows a "full name" label in each sentence.

    Every sentence text is lower-cased and has underscores replaced by
    spaces, then searched for the labels ``'họ tên'`` and ``'họ và tên'``.
    For each label found:

    * if at most five characters follow the label, nothing is yielded for
      it (the value is taken to have been split into another sentence);
    * otherwise a span running to the end of the sentence is yielded. It
      starts right after the first ``': '`` that follows the label or,
      when there is none, one character after the label.

    A sentence that contains neither label is yielded as a whole.

    :param doc: Object exposing ``sentences``; each sentence exposes
        ``text``.
    :return: Generator of ``TemporarySpanMention`` objects.
    """
    prefixes = ['họ tên', 'họ và tên']
    for sentence in doc.sentences:
        text = sentence.text.lower().replace('_', ' ')
        last_char = len(text) - 1
        # Stays True only while no label has been seen in this sentence.
        check = True
        for prefix in prefixes:
            start = text.find(prefix)
            if start == -1:
                continue
            check = False
            label_end = start + len(prefix)
            # The sentence holds the label but the value was split off
            # into another sentence: too little text remains to be a name.
            if label_end + 5 >= len(text):
                continue
            # Case "họ tên : ABCxyz". Look for the separator only after
            # the label, so that a colon appearing earlier in the sentence
            # cannot pull the span start in front of the label.
            separator = text.find(': ', label_end)
            if separator != -1:
                start = separator + 2
            else:
                start = label_end + 1
            yield TemporarySpanMention(
                char_start=start, char_end=last_char, sentence=sentence
            )
        if check:
            yield TemporarySpanMention(
                char_start=0, char_end=last_char, sentence=sentence
            )
def bbox_from_span(span: TemporarySpanMention) -> Bbox:
    """Build the bounding box that encloses all tokens of a span.

    :param span: The span to measure.
    :return: A ``Bbox`` built from the page of the first token and the
        extreme top/bottom/left/right values over all tokens, or ``None``
        when ``span`` is not a ``TemporarySpanMention`` or its sentence
        reports that it is not visual.
    """
    if not isinstance(span, TemporarySpanMention) or not span.sentence.is_visual():
        return None

    page = span.get_attrib_tokens("page")[0]
    top_edge = min(span.get_attrib_tokens("top"))
    bottom_edge = max(span.get_attrib_tokens("bottom"))
    left_edge = min(span.get_attrib_tokens("left"))
    right_edge = max(span.get_attrib_tokens("right"))
    return Bbox(page, top_edge, bottom_edge, left_edge, right_edge)
def apply(self, context: Sentence) -> Iterator[TemporarySpanMention]:
    """Yield the n-gram spans of a sentence, longest n-grams first.

    For every ``n`` from ``self.n_max`` down to ``self.n_min`` each run of
    ``n`` consecutive tokens is yielded as a span. When ``self.split_rgx``
    is set, every single token longer than one character is additionally
    split at each regex match, and the pieces between the token start or a
    match end and a later match start or the token end are yielded too.
    A span is never yielded twice.

    :param context: The sentence to generate spans from. Its
        ``char_offsets``, ``words`` and ``text`` attributes are read.
    :return: Generator of ``TemporarySpanMention`` objects.
    """
    # Character offset of each token, relative to the sentence start.
    offsets = context.char_offsets
    num_tokens = len(offsets)
    seen: Set[TemporarySpanMention] = set()

    # Iterate n-gram sizes in reverse order to facilitate longest-match
    # semantics.
    for j in range(self.n_min, self.n_max + 1)[::-1]:
        for i in range(num_tokens - j + 1):
            last_word = context.words[i + j - 1]
            start = offsets[i]
            end = offsets[i + j - 1] + len(last_word) - 1
            ts = TemporarySpanMention(
                char_start=start, char_end=end, sentence=context
            )
            if ts not in seen:
                seen.add(ts)
                yield ts

            # Splitting applies to unigrams only. j == 1 being produced by
            # the range above already implies n_min <= 1 <= n_max.
            if j != 1 or self.split_rgx is None or end <= start:
                continue

            text = context.text[start - offsets[0] : end - offsets[0] + 1]
            start_idxs = [0]
            end_idxs = []
            for m in re.finditer(self.split_rgx, text):
                start_idxs.append(m.end())
                end_idxs.append(m.start())
            end_idxs.append(len(text))

            for start_idx in start_idxs:
                for end_idx in end_idxs:
                    if start_idx >= end_idx:
                        continue
                    # start_idx / end_idx index into the token text, so
                    # shift them by the token's own offset to express the
                    # piece in the same coordinates as the n-gram spans.
                    ts = TemporarySpanMention(
                        char_start=start + start_idx,
                        char_end=start + end_idx - 1,
                        sentence=context,
                    )
                    if ts not in seen and ts.get_span():
                        seen.add(ts)
                        yield ts
def apply(self, doc):
    """Yield candidate phone-number spans from every sentence of ``doc``.

    Each position in a sentence where a digit, ``+`` or ``(+`` occurs is
    taken as a possible start. It is paired with the right-most digit of
    the sentence for which the text from start to that digit contains no
    ASCII letter. The pair is yielded as a span when the end lies strictly
    after the start.

    :param doc: Object exposing ``sentences``; each sentence exposes
        ``text``.
    :return: Generator of ``TemporarySpanMention`` objects.
    """
    # Compiled once: both patterns are reused for every sentence/match.
    start_rgx = re.compile(r'(\d|\+|\(\+)')
    letter_rgx = re.compile('[a-zA-Z]')

    for sentence in doc.sentences:
        text = sentence.text
        # finditer always returns an iterator (possibly empty), so no
        # None check is needed.
        for match in start_rgx.finditer(text):
            start = match.start()
            end = -1
            # Walk backwards from the end of the sentence. Index 0 is not
            # visited: an end must be greater than the start to be used.
            for i in range(len(text) - 1, 0, -1):
                if text[i].isdigit() and letter_rgx.search(text[start:i + 1]) is None:
                    end = i
                    break
            if start < end:
                yield TemporarySpanMention(
                    char_start=start,
                    char_end=end,
                    sentence=sentence
                )
def apply(self, doc: Document) -> Iterator[TemporarySpanMention]:
    """Yield one span per sentence of a Document, covering the full text.

    :param doc: The ``Document`` whose sentences are turned into spans.
    :return: Generator of ``TemporarySpanMention`` objects, each running
        from character 0 to the last character of its sentence text.
    :raises TypeError: If the input doc is not of type ``Document``.
    """
    if not isinstance(doc, Document):
        raise TypeError(
            "Input Contexts to MentionSentences.apply() must be of type Document"
        )

    yield from (
        TemporarySpanMention(
            char_start=0,
            char_end=len(sent.text) - 1,
            sentence=sent,
        )
        for sent in doc.sentences
    )
def apply(self, doc):
    """Yield a span for every date found in the sentences of a Document.

    Each sentence text is passed to ``self.extract_dates``. For every item
    it returns, element 0 is used as the span start and element 1 minus
    one as the span end.

    :param doc: The ``Document`` to parse.
    :type doc: ``Document``
    :return: Generator of ``TemporarySpanMention`` objects.
    :raises TypeError: If the input doc is not of type ``Document``.
    """
    if not isinstance(doc, Document):
        raise TypeError(
            "Input Contexts to MentionSentences.apply() must be of type Document"
        )

    yield from (
        TemporarySpanMention(
            char_start=found[0],
            char_end=found[1] - 1,
            sentence=sent,
        )
        for sent in doc.sentences
        for found in self.extract_dates(sent.text)
    )
def apply(self, session, doc):
    """Yield one whole-sentence span per sentence of a re-queried Document.

    The document is looked up again through ``session`` by its id, and the
    sentences of the queried instance are the ones iterated.

    :param session: The database session
    :param doc: The ``Document`` to parse.
    :type doc: ``Document``
    :return: Generator of ``TemporarySpanMention`` objects, each running
        from character 0 to the last character of its sentence text.
    :raises TypeError: If the input doc is not of type ``Document``.
    """
    if not isinstance(doc, Document):
        raise TypeError(
            "Input Contexts to MentionSentences.apply() must be of type Document"
        )

    by_id = session.query(Document).filter(Document.id == doc.id)
    doc = by_id.one()

    for sentence in doc.sentences:
        last_char = len(sentence.text) - 1
        yield TemporarySpanMention(
            char_start=0, char_end=last_char, sentence=sentence
        )