def test_figure_matcher(doc_setup):
    """Check that the lambda figure matcher filters figure mentions."""
    document = doc_setup

    # Attach two placeholder figures to the document.
    Figure(id=2, document=document)
    Figure(id=3, document=document)
    assert len(document.figures) == 2

    figure_space = MentionFigures()
    assert len(list(figure_space.apply(document))) == 2

    def has_id_two(mention):
        """Accept only mentions whose figure has id 2."""
        return mention.figure.id == 2

    # Build a matcher that keeps only the figure with id == 2.
    matcher = LambdaFunctionFigureMatcher(func=has_id_two)

    # Exactly one figure (the first one) should survive the matcher.
    assert len(list(matcher.apply(figure_space.apply(document)))) == 1
    matched_ids = {
        mention.figure.id
        for mention in matcher.apply(figure_space.apply(document))
    }
    assert matched_ids == {2}

    # The predicate must be passed via the "func" keyword.
    with pytest.raises(Exception):
        LambdaFunctionFigureMatcher(function=has_id_two)

    # LambdaFunctionFigureMatcher only supports TemporaryFigureMention,
    # so feeding it n-gram mentions must be rejected.
    ngram_space = MentionNgrams(n_min=1, n_max=2)
    with pytest.raises(ValueError):
        list(matcher.apply(ngram_space.apply(document)))
def _parse_figure(self, node, state):
    """Parse the figure node.

    Handles both a bare ``img`` node and a ``figure`` node wrapping exactly
    one ``img`` child. On success a ``Figure`` is stored in
    ``state["context"][node]`` and ``state["figure"]["idx"]`` is advanced.
    In every other case (unsupported tag, img already covered by a parent
    Figure, unsupported parent, figure with zero or several images) the
    state is returned without creating a Figure.

    :param node: The lxml img or figure node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    :return: The state, updated in place when a Figure was created.
    """
    if node.tag not in ["img", "figure"]:
        return state

    # Process the Figure
    figure_idx = state["figure"]["idx"]
    stable_id = f"{state['document'].name}::figure:{figure_idx}"

    # Set name for Figure (None when the node carries no name attribute)
    name = node.attrib.get("name")

    parent = state["parent"][node]

    # img within a Figure gets processed in the parent Figure
    if node.tag == "img" and isinstance(parent, Figure):
        return state

    # NOTE: We currently do NOT support nested figures.
    parts = {}
    if isinstance(parent, Section):
        parts["section"] = parent
    elif isinstance(parent, Cell):
        parts["section"] = parent.table.section
        parts["cell"] = parent
    else:
        logger.warning(f"Figure is nested within {state['parent'][node]}")
        return state

    parts["document"] = state["document"]
    parts["stable_id"] = stable_id
    parts["name"] = name
    parts["position"] = figure_idx

    # If processing a raw img
    if node.tag == "img":
        # Create the Figure entry in the DB
        parts["url"] = node.get("src")
        state["context"][node] = Figure(**parts)
    elif node.tag == "figure":
        # Pull the image from a child img node, if one exists
        imgs = [child for child in node if child.tag == "img"]

        # A figure without any img child has no URL to record; skip it
        # instead of failing on imgs[0] below.
        if not imgs:
            logger.warning("No image found in Figure.")
            return state

        if len(imgs) > 1:
            logger.warning("Figure contains multiple images.")
            # Right now we don't support multiple URLs in the Figure context
            # As a workaround, just ignore the outer Figure and allow
            # processing of the individual images. We ignore the accompanying
            # figcaption by marking it as visited.
            captions = [child for child in node if child.tag == "figcaption"]
            state["visited"].update(captions)
            return state

        img = imgs[0]
        # The single img is represented by this Figure; don't parse it again.
        state["visited"].add(img)

        # Create the Figure entry in the DB
        parts["url"] = img.get("src")
        state["context"][node] = Figure(**parts)

    state["figure"]["idx"] += 1
    return state
def _parse_figure(self, node, state):
    """Parse the figure node.

    Handles both a bare ``img`` node and a ``figure`` node wrapping exactly
    one ``img`` child. On success a ``Figure`` is stored in
    ``state["context"][node]`` and ``state["figure"]["idx"]`` is advanced.
    In every other case (unsupported tag, img already covered by a parent
    Figure, parent that is not a Section, figure with zero or several
    images) the state is returned without creating a Figure.

    :param node: The lxml img or figure node to parse
    :param state: The global state necessary to place the node in context
        of the document as a whole.
    :return: The state, updated in place when a Figure was created.
    """
    if node.tag not in ["img", "figure"]:
        return state

    # Process the figure
    stable_id = "{}::{}:{}".format(
        state["document"].name, "figure", state["figure"]["idx"]
    )

    parent = state["parent"][node]

    # img within a Figure gets processed in the parent Figure
    if node.tag == "img" and isinstance(parent, Figure):
        return state

    # NOTE: We currently do NOT support nested figures.
    if not isinstance(parent, Section):
        logger.warning("Figure is nested within {}".format(state["parent"][node]))
        return state

    if node.tag == "img":
        # A raw img carries its own URL.
        url = node.get("src")
    else:
        # node.tag == "figure": pull the image from a child img node,
        # if one exists.
        imgs = [child for child in node if child.tag == "img"]

        # A figure without any img child has no URL to record; skip it
        # instead of failing on imgs[0] below.
        if not imgs:
            logger.warning("No image found in Figure.")
            return state

        if len(imgs) > 1:
            logger.warning("Figure contains multiple images.")
            # Right now we don't support multiple URLs in the Figure context
            # As a workaround, just ignore the outer Figure and allow
            # processing of the individual images. We ignore the accompanying
            # figcaption by marking it as visited.
            captions = [child for child in node if child.tag == "figcaption"]
            state["visited"].update(captions)
            return state

        img = imgs[0]
        # The single img is represented by this Figure; don't parse it again.
        state["visited"].add(img)
        url = img.get("src")

    # Create the Figure entry in the DB
    state["context"][node] = Figure(
        document=state["document"],
        section=parent,
        stable_id=stable_id,
        position=state["figure"]["idx"],
        url=url,
    )

    state["figure"]["idx"] += 1
    return state