Ejemplo n.º 1
0
 def apply(self, doc):
     """Yield n-gram mentions of ``doc``, dropping a trailing ``".0"``.

     Spans whose text does not end in ``".0"`` are yielded untouched. A span
     that does is replaced by an implicit span mention over the same
     characters whose text is the span text without its last two characters.
     The token-level attributes of the implicit mention are copied from the
     last token of the original span; the visual attributes are ``[None]``
     when the sentence is not visual.

     :param doc: The context passed through to ``MentionNgrams.apply``.
     """
     token_fields = ("pos_tags", "ner_tags", "dep_parents", "dep_labels")
     visual_fields = ("page", "top", "left", "bottom", "right")

     for ts in MentionNgrams.apply(self, doc):
         if not ts.get_span().endswith(".0"):
             yield ts
             continue

         trimmed = ts.get_span()[:-2]

         # Linguistic attributes: value of the span's last token.
         token_attrs = {
             field: [ts.get_attrib_tokens(field)[-1]] for field in token_fields
         }
         # Visual attributes: same, but only available on visual sentences.
         visual_attrs = {
             field: (
                 [ts.get_attrib_tokens(field)[-1]]
                 if ts.sentence.is_visual()
                 else [None]
             )
             for field in visual_fields
         }

         yield TemporaryImplicitSpanMention(
             sentence=ts.sentence,
             char_start=ts.char_start,
             char_end=ts.char_end,
             expander_key="volt_expander",
             position=0,
             text=trimmed,
             words=[trimmed],
             lemmas=[trimmed],
             **token_attrs,
             **visual_attrs,
             meta=None,
         )
Ejemplo n.º 2
0
 def apply(self, session, context):
     """Yield n-gram spans, normalizing optionally signed integers.

     A span consisting of an optional sign, optional whitespace and digits is
     replaced by an implicit span over the same characters:

     * no sign, or ``+`` directly followed by the digits: the digits alone;
     * ``+`` separated from the digits by whitespace: nothing is yielded,
       because the digits are already produced as their own n-gram;
     * any of the accepted minus/dash characters: ``-`` plus the digits.

     Every other span is yielded unchanged. Token-level attributes of the
     implicit span come from the last token of the original span; visual
     attributes are ``[None]`` when the sentence is not visual.
     """
     signed_int = re.compile(
         r"^([\+\-\u2010\u2011\u2012\u2013\u2014\u2212\uf02d])?(\s*)(\d+)$",
         re.U,
     )

     def last_token(span, field):
         # Value of ``field`` for the span's last token, wrapped in a list.
         return [span.get_attrib_tokens(field)[-1]]

     def last_visual(span, field):
         # Same as ``last_token`` but ``[None]`` for non-visual sentences.
         return last_token(span, field) if span.sentence.is_visual() else [None]

     for ts in MentionNgrams.apply(self, session, context):
         matched = signed_int.match(ts.get_span())
         if matched is None:
             yield ts
             continue

         sign, gap, digits = matched.groups()
         if sign == "+" and gap != "":
             # If bigram '+ 150' is seen, accept the unigram '150', not both.
             continue

         # A bigram '- 150' is different from unigram '150', so any kind of
         # negative sign is kept as an implicit '-150'.
         prefix = "" if sign in (None, "+") else "-"
         number = prefix + digits

         yield TemporaryImplicitSpan(
             sentence=ts.sentence,
             char_start=ts.char_start,
             char_end=ts.char_end,
             expander_key="temp_expander",
             position=0,
             text=number,
             words=[number],
             lemmas=[number],
             pos_tags=last_token(ts, "pos_tags"),
             ner_tags=last_token(ts, "ner_tags"),
             dep_parents=last_token(ts, "dep_parents"),
             dep_labels=last_token(ts, "dep_labels"),
             page=last_visual(ts, "page"),
             top=last_visual(ts, "top"),
             left=last_visual(ts, "left"),
             bottom=last_visual(ts, "bottom"),
             right=last_visual(ts, "right"),
             meta=None,
         )
Ejemplo n.º 3
0
 def apply(self, session, context):
     """Yield part-number spans, expanding part ranges and known variants.

     For every n-gram produced by ``MentionNgrams.apply`` the span text is
     expanded with ``expand_part_range`` and upper-cased. When
     ``self.parts_by_doc`` is set, every known part of the span's document
     that starts with an expanded part of at least four characters is added
     too. The original span is yielded when a resulting part equals its
     text; every other part is yielded as an implicit span over the same
     characters. Parts containing a space are dropped.

     Parts are enumerated in sorted order so the ``position`` assigned to
     each implicit span is the same on every run; iterating the set directly
     would make it depend on string hash randomization.
     """
     for ts in MentionNgrams.apply(self, session, context):
         span_text = ts.get_span()
         enumerated_parts = [
             part.upper() for part in expand_part_range(span_text)
         ]
         parts = set(enumerated_parts)
         if self.parts_by_doc:
             possible_parts = self.parts_by_doc[ts.parent.document.name.upper()]
             for base_part in enumerated_parts:
                 # Only prefixes of at least four characters are used to
                 # look up known parts.
                 if len(base_part) < 4:
                     continue
                 parts.update(
                     part for part in possible_parts if part.startswith(base_part)
                 )

         # sorted() gives a stable order, hence stable ``position`` values.
         for i, part in enumerate(sorted(parts)):
             if " " in part:
                 continue  # it won't pass the part_matcher
             if part == span_text:
                 yield ts
                 continue

             is_visual = ts.sentence.is_visual()
             # NOTE(review): ``left`` uses max() and ``bottom`` uses min(),
             # unlike the min/max pairing of ``top``/``right``. Kept as in
             # the original; confirm this is intentional.
             yield TemporaryImplicitSpan(
                 sentence=ts.sentence,
                 char_start=ts.char_start,
                 char_end=ts.char_end,
                 expander_key="part_expander",
                 position=i,
                 text=part,
                 words=[part],
                 lemmas=[part],
                 pos_tags=[ts.get_attrib_tokens("pos_tags")[0]],
                 ner_tags=[ts.get_attrib_tokens("ner_tags")[0]],
                 dep_parents=[ts.get_attrib_tokens("dep_parents")[0]],
                 dep_labels=[ts.get_attrib_tokens("dep_labels")[0]],
                 page=[min(ts.get_attrib_tokens("page"))] if is_visual else [None],
                 top=[min(ts.get_attrib_tokens("top"))] if is_visual else [None],
                 left=[max(ts.get_attrib_tokens("left"))] if is_visual else [None],
                 bottom=[min(ts.get_attrib_tokens("bottom"))]
                 if is_visual
                 else [None],
                 right=[max(ts.get_attrib_tokens("right"))]
                 if is_visual
                 else [None],
                 meta=None,
             )
Ejemplo n.º 4
0
    def apply(self, doc):
        """Yield n-gram mentions of ``doc``, compacting spaced-out numbers.

        A span made of an optional ``±``, digits, an optional ``.`` and
        optional further digits, with arbitrary whitespace between those
        pieces (e.g. ``"± 2  . 3"``), is replaced by an implicit span mention
        over the same characters whose text is the pieces joined without
        whitespace. Every other span is yielded unchanged.

        Token-level attributes of the implicit mention come from the last
        token of the original span; visual attributes are ``[None]`` when the
        sentence is not visual.

        :param doc: The context passed through to ``MentionNgrams.apply``.
        """

        def last_token(span, field):
            # Value of ``field`` for the span's last token, wrapped in a list.
            return [span.get_attrib_tokens(field)[-1]]

        def last_visual(span, field):
            # Same as ``last_token`` but ``[None]`` for non-visual sentences.
            return last_token(span, field) if span.sentence.is_visual() else [None]

        for ts in MentionNgrams.apply(self, doc):
            matched = re.match(r"^(±)?\s*(\d+)\s*(\.)?\s*(\d*)$", ts.get_span())
            if matched is None:
                yield ts
                continue

            # Keep only the groups that captured something; the whitespace
            # between them is not captured and therefore disappears.
            # NOTE(review): the dot is optional, so "12 5" also matches and
            # is emitted as "125" - confirm that is intended.
            compact = "".join(piece for piece in matched.groups() if piece)

            yield TemporaryImplicitSpanMention(
                sentence=ts.sentence,
                char_start=ts.char_start,
                char_end=ts.char_end,
                expander_key="opamp_exp",
                position=0,
                text=compact,
                words=[compact],
                lemmas=[compact],
                pos_tags=last_token(ts, "pos_tags"),
                ner_tags=last_token(ts, "ner_tags"),
                dep_parents=last_token(ts, "dep_parents"),
                dep_labels=last_token(ts, "dep_labels"),
                page=last_visual(ts, "page"),
                top=last_visual(ts, "top"),
                left=last_visual(ts, "left"),
                bottom=last_visual(ts, "bottom"),
                right=last_visual(ts, "right"),
                meta=None,
            )
Ejemplo n.º 5
0
    def apply(self, doc: Document) -> Iterator[TemporaryContext]:
        """Generate the combined mentions of a Document.

        The results of ``MentionNgrams.apply``, ``MentionDocuments.apply`` and
        ``MentionFigures.apply`` are yielded one after another, in that order.

        :param doc: The ``Document`` to parse.
        :type doc: ``Document``
        :raises TypeError: If the input doc is not of type ``Document``.
        """
        if not isinstance(doc, Document):
            raise TypeError(
                "Input Contexts to MentionNgrams.apply() must be of type Document"
            )

        # Each mention space is exhausted before the next one is started.
        yield from MentionNgrams.apply(self, doc)
        yield from MentionDocuments.apply(self, doc)
        yield from MentionFigures.apply(self, doc)
Ejemplo n.º 6
0
def mention_setup():
    """Parse the ``md.html`` test document and return its 1-gram mentions.

    The first document produced by ``HTMLDocPreprocessor`` for
    ``tests/data/html_simple/md.html`` is parsed with structural, tabular,
    lingual and visual parsing enabled (visual data read from
    ``tests/data/pdf_simple/``), then every unigram span of the parsed
    document is collected.

    :return: A list with everything yielded by ``MentionNgrams.apply``.
    """
    html_source = "tests/data/html_simple/md.html"
    pdf_dir = "tests/data/pdf_simple/"

    # Only the first document emitted by the preprocessor is used.
    raw_doc = next(iter(HTMLDocPreprocessor(html_source)))

    # Create a parser and parse the md document.
    parser_options = dict(
        structural=True,
        tabular=True,
        lingual=True,
        visual=True,
        visual_parser=PdfVisualParser(pdf_dir),
        language="en",
    )
    parsed_doc = get_parser_udf(**parser_options).apply(raw_doc)

    # Collect all 1-gram span mentions.
    unigram_space = MentionNgrams(n_min=1, n_max=1)
    return list(unigram_space.apply(parsed_doc))