Ejemplo n.º 1
0
    def apply(self, doc):
        """
        Generate full-name mention spans from every sentence of ``doc``.

        Each sentence's text is lower-cased and has ``_`` replaced by a space,
        then searched for the Vietnamese "full name" labels ``họ tên`` and
        ``họ và tên``. For every label that is found:

        * if 5 or fewer characters follow the label, the label is taken to be
          split from its value and nothing is yielded for it;
        * otherwise, if ``": "`` occurs after the label, the span starts right
          after that colon-space;
        * otherwise the span starts one character past the label.

        A sentence in which neither label is found yields a single span
        covering the whole sentence.

        :param doc: Object exposing ``sentences``; each sentence exposes ``text``.
        :return: Generator of ``TemporarySpanMention`` objects. Every span ends
            at the last character of its sentence.
        """
        prefixes = ('họ tên', 'họ và tên')

        for sentence in doc.sentences:
            # NOTE(review): offsets are computed on the lower-cased text and
            # applied to the sentence as-is; this assumes lower-casing does not
            # change the text length -- confirm for the expected input.
            text = sentence.text.lower().replace('_', ' ')
            last_char = len(text) - 1
            prefix_found = False

            for prefix in prefixes:
                prefix_start = text.find(prefix)
                if prefix_start == -1:
                    continue

                prefix_found = True
                prefix_end = prefix_start + len(prefix)

                # The sentence holds the label but (almost) nothing after it:
                # the label was split from its value, so there is no name here.
                if prefix_end + 5 >= len(text):
                    continue

                # Case "họ tên : ABCxyz". Only a colon located after the label
                # counts; a colon earlier in the sentence is unrelated to it.
                colon = text.find(': ', prefix_end)
                if colon != -1:
                    start = colon + 2
                else:
                    start = prefix_end + 1

                yield TemporarySpanMention(char_start=start,
                                           char_end=last_char,
                                           sentence=sentence)

            # No label at all: fall back to the whole sentence.
            if not prefix_found:
                yield TemporarySpanMention(char_start=0,
                                           char_end=last_char,
                                           sentence=sentence)
Ejemplo n.º 2
0
def bbox_from_span(span: TemporarySpanMention) -> Bbox:
    """
    Build the bounding box that encloses all tokens of a visual span.

    :param span: The span to measure.
    :return: A ``Bbox`` built from the page of the first token and the
        extreme ``top``/``bottom``/``left``/``right`` values over all tokens,
        or ``None`` when ``span`` is not a ``TemporarySpanMention`` or its
        sentence reports ``is_visual()`` as false.
    """
    # Guard clause: only visual TemporarySpanMentions have a bounding box.
    if not (isinstance(span, TemporarySpanMention) and span.sentence.is_visual()):
        return None

    page = span.get_attrib_tokens("page")[0]
    top = min(span.get_attrib_tokens("top"))
    bottom = max(span.get_attrib_tokens("bottom"))
    left = min(span.get_attrib_tokens("left"))
    right = max(span.get_attrib_tokens("right"))
    return Bbox(page, top, bottom, left, right)
Ejemplo n.º 3
0
    def apply(self, context: Sentence) -> Iterator[TemporarySpanMention]:
        """
        Generate every n-gram of ``context`` as a ``TemporarySpanMention``.

        N-grams of ``self.n_min`` to ``self.n_max`` tokens are produced,
        longest first. In addition, when ``self.split_rgx`` is set and
        unigrams are within the configured range, every single token longer
        than one character is split on the matches of that pattern, and each
        sub-span running from the start of a piece to the end of the same or a
        later piece is produced as well. No span is yielded twice.

        :param context: The ``Sentence`` to parse; its ``char_offsets``,
            ``words`` and ``text`` are read.
        :return: Generator of ``TemporarySpanMention`` objects.
        """
        # These are the character offsets--**relative to the sentence
        # start**--for each _token_
        offsets = context.char_offsets
        num_tokens = len(offsets)
        seen: Set[TemporarySpanMention] = set()

        # Splitting only ever applies to unigrams, so it is possible only when
        # a pattern is configured and 1 lies inside [n_min, n_max].
        can_split = (
            self.split_rgx is not None and self.n_min <= 1 and self.n_max >= 1
        )

        # Loop over all n-grams in **reverse** order (to facilitate
        # longest-match semantics)
        for j in range(self.n_max, self.n_min - 1, -1):
            for i in range(num_tokens - j + 1):
                last_word = context.words[i + j - 1]
                start = offsets[i]
                end = offsets[i + j - 1] + len(last_word) - 1
                ts = TemporarySpanMention(
                    char_start=start, char_end=end, sentence=context
                )
                if ts not in seen:
                    seen.add(ts)
                    yield ts

                # Check for split (single tokens of at least two characters)
                if not (j == 1 and can_split and end - start > 0):
                    continue

                text = context.text[start - offsets[0] : end - offsets[0] + 1]
                piece_starts = [0]
                piece_ends = []
                for m in re.finditer(self.split_rgx, text):
                    piece_starts.append(m.end())
                    piece_ends.append(m.start())
                piece_ends.append(len(text))

                for piece_start in piece_starts:
                    for piece_end in piece_ends:
                        if piece_start >= piece_end:
                            continue
                        # piece_start/piece_end index into the token text, so
                        # shift them by the token's own start to express the
                        # sub-span in the same coordinates as the whole-token
                        # span yielded above.
                        ts = TemporarySpanMention(
                            char_start=start + piece_start,
                            char_end=start + piece_end - 1,
                            sentence=context,
                        )
                        if ts not in seen and ts.get_span():
                            seen.add(ts)
                            yield ts
Ejemplo n.º 4
0
    def apply(self, doc):
        """
        Generate candidate phone-number spans from every sentence of ``doc``.

        A candidate starts at each digit, ``+`` or ``(+`` found in the sentence
        text and ends at the last digit that can be reached from there without
        crossing an ASCII letter. A start position with no later digit before
        the next ASCII letter produces no span.

        NOTE: the candidates are not validated against any phone-number format
        here.

        :param doc: Object exposing ``sentences``; each sentence exposes ``text``.
        :return: Generator of ``TemporarySpanMention`` objects.
        """
        start_rgx = re.compile(r'(\d|\+|\(\+)')
        letter_rgx = re.compile('[a-zA-Z]')

        for sentence in doc.sentences:
            text = sentence.text
            for match in start_rgx.finditer(text):
                start = match.start()

                # A span may not contain an ASCII letter, so it has to stop
                # before the first letter located after its start.
                letter = letter_rgx.search(text, start)
                limit = letter.start() if letter is not None else len(text)

                # Last digit strictly after `start` and before `limit`.
                end = next(
                    (i for i in range(limit - 1, start, -1) if text[i].isdigit()),
                    -1,
                )
                if end != -1:
                    yield TemporarySpanMention(
                        char_start=start, char_end=end, sentence=sentence
                    )
Ejemplo n.º 5
0
    def apply(self, doc: Document) -> Iterator[TemporarySpanMention]:
        """
        Yield one whole-sentence span for each Sentence of a Document.

        :param doc: The ``Document`` whose sentences become mentions.
        :return: Generator of ``TemporarySpanMention`` objects, each running
            from character 0 to the last character of its sentence's text.
        :raises TypeError: If ``doc`` is not an instance of ``Document``.
        """
        if isinstance(doc, Document):
            for sent in doc.sentences:
                last_char = len(sent.text) - 1
                yield TemporarySpanMention(
                    char_start=0, char_end=last_char, sentence=sent
                )
            return

        raise TypeError(
            "Input Contexts to MentionSentences.apply() must be of type Document"
        )
    def apply(self, doc):
        """
        Generate one span per date found in the Sentences of a Document.

        The text of every sentence is handed to ``self.extract_dates``. Each
        item it returns is read as a ``(start, end)`` pair of character
        offsets; the span's last character is ``end - 1``, i.e. ``end`` is
        treated as exclusive.

        :param doc: The ``Document`` to parse.
        :type doc: ``Document``
        :return: Generator of ``TemporarySpanMention`` objects.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            message = (
                "Input Contexts to MentionSentences.apply() must be of type Document"
            )
            raise TypeError(message)

        for sent in doc.sentences:
            found_dates = self.extract_dates(sent.text)
            for bounds in found_dates:
                first_char = bounds[0]
                last_char = bounds[1] - 1
                yield TemporarySpanMention(
                    char_start=first_char, char_end=last_char, sentence=sent
                )
Ejemplo n.º 7
0
    def apply(self, session, doc):
        """
        Yield one whole-sentence span for each Sentence of a Document.

        The document is looked up again by id through ``session`` before its
        sentences are read, so the sentences come from the queried object
        rather than from the instance that was passed in.

        :param session: The database session used for the lookup.
        :param doc: The ``Document`` to parse.
        :type doc: ``Document``
        :return: Generator of ``TemporarySpanMention`` objects, each running
            from character 0 to the last character of its sentence's text.
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if isinstance(doc, Document):
            query = session.query(Document).filter(Document.id == doc.id)
            stored_doc = query.one()
            for sent in stored_doc.sentences:
                last_char = len(sent.text) - 1
                yield TemporarySpanMention(
                    char_start=0, char_end=last_char, sentence=sent
                )
            return

        raise TypeError(
            "Input Contexts to MentionSentences.apply() must be of type Document"
        )