def apply(self, doc):
    """Expand ngrams ending in ".0" by emitting the truncated value.

    Every span produced by ``MentionNgrams`` whose text ends with ".0"
    (e.g. "5.0") is re-emitted as an implicit span carrying the shortened
    text (e.g. "5"); all other spans pass through unchanged.
    """
    for ts in MentionNgrams.apply(self, doc):
        span_text = ts.get_span()
        if not span_text.endswith(".0"):
            yield ts
            continue
        trimmed = span_text[:-2]
        # Visual attributes are only populated when the sentence carries
        # visual (layout) information; otherwise fall back to [None].
        visual = ts.sentence.is_visual()

        def _last(attrib):
            return [ts.get_attrib_tokens(attrib)[-1]]

        yield TemporaryImplicitSpanMention(
            sentence=ts.sentence,
            char_start=ts.char_start,
            char_end=ts.char_end,
            expander_key="volt_expander",
            position=0,
            text=trimmed,
            words=[trimmed],
            lemmas=[trimmed],
            pos_tags=_last("pos_tags"),
            ner_tags=_last("ner_tags"),
            dep_parents=_last("dep_parents"),
            dep_labels=_last("dep_labels"),
            page=_last("page") if visual else [None],
            top=_last("top") if visual else [None],
            left=_last("left") if visual else [None],
            bottom=_last("bottom") if visual else [None],
            right=_last("right") if visual else [None],
            meta=None,
        )
def apply(self, session, context):
    """Normalize signed integer ngrams into implicit spans.

    Spans of the form ``[sign][whitespace]digits`` — where the sign may be
    '+', '-', or one of several Unicode dash/minus characters — are
    re-emitted with the sign normalized: a '+' separated from the digits by
    whitespace is skipped entirely (only the bare unigram is kept), and any
    negative-sign variant becomes an ASCII '-'. Non-matching spans are
    yielded unchanged.
    """
    # Hoisted out of the loop; behavior identical to per-iteration re.match.
    sign_pattern = re.compile(
        r"^([\+\-\u2010\u2011\u2012\u2013\u2014\u2212\uf02d])?(\s*)(\d+)$",
        re.U,
    )
    for ts in MentionNgrams.apply(self, session, context):
        match = sign_pattern.match(ts.get_span())
        if match is None:
            yield ts
            continue
        sign, gap, digits = match.groups()
        if sign is None:
            normalized = digits
        elif sign == "+":
            if gap != "":
                # If bigram '+ 150' is seen, accept the unigram '150',
                # not both.
                continue
            normalized = digits
        else:
            # sign is a type of negative sign.  A bigram '- 150' is
            # different from unigram '150', so keep the implicit '-150'.
            normalized = "-" + digits
        visual = ts.sentence.is_visual()

        def _last(attrib):
            return [ts.get_attrib_tokens(attrib)[-1]]

        yield TemporaryImplicitSpan(
            sentence=ts.sentence,
            char_start=ts.char_start,
            char_end=ts.char_end,
            expander_key=u"temp_expander",
            position=0,
            text=normalized,
            words=[normalized],
            lemmas=[normalized],
            pos_tags=_last("pos_tags"),
            ner_tags=_last("ner_tags"),
            dep_parents=_last("dep_parents"),
            dep_labels=_last("dep_labels"),
            page=_last("page") if visual else [None],
            top=_last("top") if visual else [None],
            left=_last("left") if visual else [None],
            bottom=_last("bottom") if visual else [None],
            right=_last("right") if visual else [None],
            meta=None,
        )
def apply(self, session, context):
    """Expand part-number ranges and document-specific part variants.

    Enumerates the span's part range via ``expand_part_range`` (uppercased),
    then, when a parts-by-document index is available, also adds any known
    part in the same document that starts with an enumerated base part of
    at least 4 characters (shorter bases would match too many spurious
    prefixes). Parts containing spaces are dropped (they would not pass
    the part_matcher). The original span is yielded as-is when it equals
    one of the expanded parts; all others become implicit spans.

    NOTE(review): ``position`` comes from enumerating a ``set``, so its
    assignment is not deterministic across runs — confirm downstream code
    does not depend on stable positions.
    """
    for ts in MentionNgrams.apply(self, session, context):
        enumerated_parts = [
            part.upper() for part in expand_part_range(ts.get_span())
        ]
        parts = set(enumerated_parts)
        if self.parts_by_doc:
            possible_parts = self.parts_by_doc[ts.parent.document.name.upper()]
            for base_part in enumerated_parts:
                # Length guard hoisted out of the inner loop: it only
                # depends on base_part, not on each candidate part.
                if len(base_part) < 4:
                    continue
                for part in possible_parts:
                    if part.startswith(base_part):
                        parts.add(part)
        for i, part in enumerate(parts):
            if " " in part:
                continue  # it won't pass the part_matcher
            if part == ts.get_span():
                yield ts
            else:
                yield TemporaryImplicitSpan(
                    sentence=ts.sentence,
                    char_start=ts.char_start,
                    char_end=ts.char_end,
                    expander_key=u"part_expander",
                    position=i,
                    text=part,
                    words=[part],
                    lemmas=[part],
                    pos_tags=[ts.get_attrib_tokens("pos_tags")[0]],
                    ner_tags=[ts.get_attrib_tokens("ner_tags")[0]],
                    dep_parents=[ts.get_attrib_tokens("dep_parents")[0]],
                    dep_labels=[ts.get_attrib_tokens("dep_labels")[0]],
                    page=[min(ts.get_attrib_tokens("page"))]
                    if ts.sentence.is_visual()
                    else [None],
                    top=[min(ts.get_attrib_tokens("top"))]
                    if ts.sentence.is_visual()
                    else [None],
                    left=[max(ts.get_attrib_tokens("left"))]
                    if ts.sentence.is_visual()
                    else [None],
                    bottom=[min(ts.get_attrib_tokens("bottom"))]
                    if ts.sentence.is_visual()
                    else [None],
                    right=[max(ts.get_attrib_tokens("right"))]
                    if ts.sentence.is_visual()
                    else [None],
                    meta=None,
                )
def apply(self, doc):
    """Collapse stray whitespace inside numeric spans (e.g. "± 2 . 3").

    Spans matching an optional '±', an integer part, an optional decimal
    point, and optional fractional digits — possibly separated by random
    spaces — are re-emitted as a single implicit span with the whitespace
    removed. Everything else passes through untouched.
    """
    for ts in MentionNgrams.apply(self, doc):
        m = re.match(r"^(±)?\s*(\d+)\s*(\.)?\s*(\d*)$", ts.get_span())
        if m is None:
            yield ts
            continue
        # Concatenate only the non-empty captured pieces, dropping the
        # whitespace between them ("± 2 . 3" -> "±2.3").
        compact = "".join(g for g in m.groups() if g)
        visual = ts.sentence.is_visual()

        def _last(attrib):
            return [ts.get_attrib_tokens(attrib)[-1]]

        yield TemporaryImplicitSpanMention(
            sentence=ts.sentence,
            char_start=ts.char_start,
            char_end=ts.char_end,
            expander_key="opamp_exp",
            position=0,
            text=compact,
            words=[compact],
            lemmas=[compact],
            pos_tags=_last("pos_tags"),
            ner_tags=_last("ner_tags"),
            dep_parents=_last("dep_parents"),
            dep_labels=_last("dep_labels"),
            page=_last("page") if visual else [None],
            top=_last("top") if visual else [None],
            left=_last("left") if visual else [None],
            bottom=_last("bottom") if visual else [None],
            right=_last("right") if visual else [None],
            meta=None,
        )
def apply(self, doc: Document) -> Iterator[TemporaryContext]:
    """Generate mentions from a Document's ngrams, document, and figures.

    Delegates to ``MentionNgrams``, ``MentionDocuments``, and
    ``MentionFigures`` in turn, chaining their outputs.

    :param doc: The ``Document`` to parse.
    :raises TypeError: If the input doc is not of type ``Document``.
    """
    if not isinstance(doc, Document):
        raise TypeError(
            "Input Contexts to MentionNgrams.apply() must be of type Document"
        )
    yield from MentionNgrams.apply(self, doc)
    yield from MentionDocuments.apply(self, doc)
    yield from MentionFigures.apply(self, doc)
def mention_setup():
    """Parse the simple-markdown fixture and return its 1-gram mentions."""
    html_path = "tests/data/html_simple/md.html"
    pdf_dir = "tests/data/pdf_simple/"

    # Load the single fixture document from the HTML preprocessor.
    doc = next(iter(HTMLDocPreprocessor(html_path)))

    # Parse it with full structural, tabular, lingual, and visual info.
    parser_udf = get_parser_udf(
        structural=True,
        tabular=True,
        lingual=True,
        visual=True,
        visual_parser=PdfVisualParser(pdf_dir),
        language="en",
    )
    doc = parser_udf.apply(doc)

    # Collect every unigram span mention.
    ngram_space = MentionNgrams(n_min=1, n_max=1)
    return list(ngram_space.apply(doc))