def test_get_token_bounding_box():
    s = symbol(tokens=[Token("x", "atom", 0, 1)])
    token_locations = {
        token_id(0, 1): [BoundingBox(0.01, 0.01, 0.01, 0.01, 0)]
    }
    box = get_symbol_bounding_box(s, symbol_id(), token_locations)
    assert box == BoundingBox(0.01, 0.01, 0.01, 0.01, 0)
Example #2
def test_get_character_bounding_box():
    s = symbol(characters=[0])
    character_locations = {
        character_id(0): [BoundingBox(0.01, 0.01, 0.01, 0.01, 0)]
    }
    box = get_symbol_bounding_box(s, symbol_id(), character_locations)
    assert box == BoundingBox(0.01, 0.01, 0.01, 0.01, 0)
Example #3
def load_equation_token_locations(
    arxiv_id: ArxivId,
) -> Optional[Dict[TokenId, List[BoundingBox]]]:

    token_locations: Dict[TokenId, List[BoundingBox]] = {}
    token_locations_path = os.path.join(
        directories.arxiv_subdir("equation-tokens-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(token_locations_path):
        logging.warning(
            "Could not find bounding boxes information for %s. Skipping", arxiv_id,
        )
        return None

    for record in load_from_csv(token_locations_path, HueLocationInfo):
        equation_index, token_index = [int(t) for t in record.entity_id.split("-")]
        token_id = TokenId(record.tex_path, equation_index, token_index)
        box = BoundingBox(
            page=int(record.page),
            left=record.left,
            top=record.top,
            width=record.width,
            height=record.height,
        )
        if token_id not in token_locations:
            token_locations[token_id] = []
        token_locations[token_id].append(box)

    return token_locations
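
Throughout these examples, entity ids like the one split above are composite "equationIndex-tokenIndex" strings. A minimal standalone sketch of the convention (the id value here is made up for illustration):

# Minimal sketch of the composite entity_id convention parsed above.
entity_id = "3-7"  # equation 3, token 7 (made-up values)
equation_index, token_index = [int(t) for t in entity_id.split("-")]
assert (equation_index, token_index) == (3, 7)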
Example #4
def _to_pdf_coordinates(
    bounding_box: Rectangle,
    image_width: int,
    image_height: int,
    pdf_page_width: float,
    pdf_page_height: float,
    page: int,
) -> BoundingBox:
    """
    Convert a "bounding_box" in pixel coordinates in a raster image to PDF coordinates.
    """
    left = bounding_box.left
    top = bounding_box.top
    right = bounding_box.left + bounding_box.width
    bottom = bounding_box.top + bounding_box.height
    pdf_left = left * (pdf_page_width / float(image_width))
    pdf_right = right * (pdf_page_width / float(image_width))
    # Set PDF coordinates relative to the document bottom. Because image coordinates are relative
    # to the image's top, flip the y-coordinates.
    pdf_top = pdf_page_height - (top * (pdf_page_height / float(image_height)))
    pdf_bottom = pdf_page_height - (bottom *
                                    (pdf_page_height / float(image_height)))
    return BoundingBox(
        left=pdf_left,
        top=pdf_top,
        width=pdf_right - pdf_left,
        height=pdf_top - pdf_bottom,
        page=page,
    )
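
As a quick sanity check of the y-flip above, here is a standalone sketch (not the project's code) that applies the same mapping to one box, with illustrative dimensions:

# Standalone sketch of the pixel-to-PDF mapping above: a 100x200 px raster
# of a 612x792 pt page (all values chosen for illustration).
image_w, image_h = 100, 200
page_w, page_h = 612.0, 792.0
left, top, width, height = 10, 20, 30, 40  # pixel-space box

pdf_left = left * page_w / image_w                       # 61.2
pdf_top = page_h - top * page_h / image_h                # 712.8 (y flipped)
pdf_bottom = page_h - (top + height) * page_h / image_h  # 554.4
pdf_height = pdf_top - pdf_bottom                        # 158.4

assert abs(pdf_left - 61.2) < 1e-6 and abs(pdf_height - 158.4) < 1e-6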
Example #5
def get_symbol_bounding_box(
        symbol: Symbol, symbol_id: SymbolId,
        character_boxes: CharacterLocations) -> Optional[BoundingBox]:
    boxes = []
    for character_index in symbol.characters:
        character_id = CharacterId(symbol_id.tex_path,
                                   symbol_id.equation_index, character_index)
        boxes.extend(character_boxes.get(character_id, []))

    if len(boxes) == 0:
        return None

    # Boxes for a symbol should be on only one page.
    if len({box.page for box in boxes}) > 1:
        logging.warning(  # pylint: disable=logging-not-lazy
            ("Boxes found on more than one page for symbol %s. " +
             "Only the boxes for one page will be considered."),
            symbol,
        )

    page = boxes[0].page
    boxes_on_page = list(filter(lambda b: b.page == page, boxes))

    left = min([box.left for box in boxes_on_page])
    right = max([box.left + box.width for box in boxes_on_page])
    top = min([box.top for box in boxes_on_page])
    bottom = max([box.top + box.height for box in boxes_on_page])

    return BoundingBox(left, top, right - left, bottom - top, page)
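
The merge above is a plain min/max union over box edges on a single page; a minimal standalone sketch with made-up boxes:

# Minimal sketch of the box-union rule above, with made-up integer boxes
# in (left, top, width, height) form.
boxes = [(1, 1, 1, 1), (2, 1, 1, 1)]
left = min(l for l, t, w, h in boxes)
top = min(t for l, t, w, h in boxes)
right = max(l + w for l, t, w, h in boxes)
bottom = max(t + h for l, t, w, h in boxes)
assert (left, top, right - left, bottom - top) == (1, 1, 2, 1)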
Example #6
def load_locations(
        arxiv_id: ArxivId,
        entity_name: str) -> Optional[Dict[EntityId, List[BoundingBox]]]:
    """
    Load bounding boxes for each entity. Entities can have multiple bounding boxes (as will
    be the case if they are split over multiple lines).
    """

    boxes_by_entity_id: Dict[EntityId, List[BoundingBox]] = defaultdict(list)
    bounding_boxes_path = os.path.join(
        directories.arxiv_subdir(f"{entity_name}-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(bounding_boxes_path):
        logging.warning(
            "Could not find bounding boxes information for entity of type %s for paper %s. Skipping.",
            entity_name,
            arxiv_id,
        )
        return None

    for hue_info in load_from_csv(bounding_boxes_path, EntityLocationInfo):
        box = BoundingBox(
            page=hue_info.page,
            left=hue_info.left,
            top=hue_info.top,
            width=hue_info.width,
            height=hue_info.height,
        )
        boxes_by_entity_id[hue_info.entity_id].append(box)

    return boxes_by_entity_id
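
The defaultdict(list) grouping above is what lets an entity that spans several lines accumulate several boxes under one id; a minimal sketch with made-up rows:

from collections import defaultdict

# Minimal sketch of the grouping above, with made-up (entity_id, box) rows.
rows = [("s1", (0.1, 0.10, 0.2, 0.02)),
        ("s1", (0.1, 0.12, 0.1, 0.02)),  # same entity, second line
        ("s2", (0.5, 0.30, 0.1, 0.02))]
boxes_by_entity_id = defaultdict(list)
for entity_id, box in rows:
    boxes_by_entity_id[entity_id].append(box)
assert len(boxes_by_entity_id["s1"]) == 2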
Example #7
def test_merge_bounding_boxes():
    s = symbol(tokens=[0, 1])
    token_locations = {
        token_id(0): [
            BoundingBox(0.01, 0.01, 0.01, 0.01, 0),
            # Expand the bounding box downward .01 of the page
            BoundingBox(0.01, 0.02, 0.01, 0.01, 0),
        ],
        # Expand the bounding box rightward .01 of the page
        token_id(1): [BoundingBox(0.02, 0.01, 0.01, 0.01, 0)],
        # Ignore this bounding box for an irrelevant token
        token_id(2): [BoundingBox(0.03, 0.01, 0.01, 0.01, 0)],
    }
    box = get_symbol_bounding_box(s, symbol_id(), token_locations)
    assert box.left == 0.01
    assert box.top == 0.01
    assert abs(box.width - 0.02) < 0.0001
    assert abs(box.height - 0.02) < 0.0001
Example #8
def fetch_boxes(arxiv_id: ArxivId, schema: str, version: Optional[int],
                types: List[str]) -> Optional[RegionsByPageAndType]:
    # Discover the most recent version of data in the database for the paper.

    setup_database_connections(schema)
    if version is None:
        version_number = (Version.select(fn.Max(
            Version.index)).join(Paper).where(
                Paper.arxiv_id == arxiv_id).scalar())
        if version_number is None:
            logging.warning(  # pylint: disable=logging-not-lazy
                "There are no entities for paper %s in database schema %s",
                arxiv_id,
                schema,
            )
            return None
        version = int(version_number)

    # Load bounding boxes from rows in the tables.
    rows = (EntityModel.select(
        EntityModel.id,
        EntityModel.type,
        BoundingBoxModel.left,
        BoundingBoxModel.top,
        BoundingBoxModel.width,
        BoundingBoxModel.height,
        BoundingBoxModel.page,
    ).join(Paper).switch(EntityModel).join(BoundingBoxModel).where(
        EntityModel.version == version,
        Paper.arxiv_id == arxiv_id,
        EntityModel.type << types,
    ).dicts())
    boxes_by_entity_db_id: Dict[str, List[BoundingBox]] = defaultdict(list)
    types_by_entity_db_id: Dict[str, str] = {}
    for row in rows:
        boxes_by_entity_db_id[row["id"]].append(
            BoundingBox(
                row["left"],
                row["top"],
                row["width"],
                row["height"],
                row["page"],
            ))
        types_by_entity_db_id[row["id"]] = row["type"]

    regions: RegionsByPageAndType = defaultdict(list)
    for db_id, bounding_boxes in boxes_by_entity_db_id.items():
        by_page = group_by_page(bounding_boxes)
        for page, page_boxes in by_page.items():
            key = (page, types_by_entity_db_id[db_id])
            rectangles = frozenset([
                FloatRectangle(b.left, b.top, b.width, b.height)
                for b in page_boxes
            ])
            regions[key].append(rectangles)

    return regions
Example #9
    def on_next(self, payload):
        df = payload.dfs.get('detections', pd.DataFrame()).drop_duplicates()
        plt.title(f'{len(df)} Detections')

        h, w = payload.frame.shape[0], payload.frame.shape[1]
        plt.ylim(h, 0)
        plt.xlim(0, w)
        # if len(df) > 0:

        # df['frame'] = df.frame.astype('category')
        # sns.scatterplot(x='x', y='y', hue='frame', data=df)
        # sns.scatterplot(x='x', y='y', data=df)

        ## adjust limits
        # max_y = max(df.y)
        # max_x = max(df.x)

        vehicle_detections = list(payload.vehicle_detections)

        from common.types import BoundingBox
        boxes = (d.bounding_box for d in vehicle_detections)
        # boxes = (b.get_scaled(0.5) for b in boxes)
        # Take a thin strip (20% of the box height) just below each vehicle box.
        boxes = (BoundingBox(b.x, b.y + b.h, b.w, int(round(b.h * 0.2)))
                 for b in boxes)

        for box in boxes:
            # Create a Rectangle patch
            rect = patches.Rectangle((box.x, box.y),
                                     box.w,
                                     box.h,
                                     linewidth=1,
                                     edgecolor='r',
                                     facecolor='black')

            # Add the patch to the Axes
            self.ax.add_patch(rect)

        pause_time = 0.001
        plt.pause(pause_time)
Example #10
def extract_bounding_boxes(
    diff_image: np.ndarray,
    page_number: int,
    hue: float,
    masks: Optional[Iterable[FloatRectangle]] = None,
) -> List[BoundingBox]:
    """
    See 'PixelMerger' for a description of how bounding boxes are extracted.
    Masks are assumed to be non-intersecting and should be expressed as ratios
    relative to the page's width and height instead of pixel values (left, top,
    width, and height all have values in the range 0..1).
    """
    image_height, image_width, _ = diff_image.shape
    pixel_masks = None
    if masks is not None:
        pixel_masks = [
            Rectangle(
                left=round(m.left * image_width),
                top=round(m.top * image_height),
                width=round(m.width * image_width),
                height=round(m.height * image_height),
            ) for m in masks
        ]

    pixel_boxes = list(
        find_boxes_with_color(diff_image, hue, masks=pixel_masks))
    boxes = []
    for box in pixel_boxes:
        left_ratio = float(box.left) / image_width
        top_ratio = float(box.top) / image_height
        width_ratio = float(box.width) / image_width
        height_ratio = float(box.height) / image_height
        boxes.append(
            BoundingBox(left_ratio, top_ratio, width_ratio, height_ratio,
                        page_number))

    return boxes
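
The ratio convention described in the docstring is plain scaling by the image dimensions; a standalone sketch of the round trip (dimensions made up):

# Standalone sketch of the ratio convention above: a mask given as page
# ratios is scaled to pixels, and a detected pixel box is scaled back.
image_w, image_h = 800, 1000
mask_ratio = (0.25, 0.10, 0.50, 0.05)  # left, top, width, height in 0..1
mask_px = tuple(round(v * d) for v, d in
                zip(mask_ratio, (image_w, image_h, image_w, image_h)))
# mask_px == (200, 100, 400, 50)

pixel_box = (40, 250, 80, 20)  # a detected box, in pixels
ratio_box = tuple(v / d for v, d in
                  zip(pixel_box, (image_w, image_h, image_w, image_h)))
# ratio_box == (0.05, 0.25, 0.1, 0.02)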
Example #11
def load_hue_locations(
    arxiv_id: ArxivId, entity_name: str
) -> Optional[Dict[HueIteration, List[BoundingBox]]]:
    """
    Load bounding boxes for each entity. Entities are indexed by the hue they were colored and
    the iteration of coloring in which they were assigned that hue. Entities can have multiple
    bounding boxes (e.g., if they are split over multiple lines).
    """

    boxes_by_hue_iteration: Dict[HueIteration, List[BoundingBox]] = {}
    bounding_boxes_path = os.path.join(
        directories.arxiv_subdir(f"{entity_name}-locations", arxiv_id),
        "entity_locations.csv",
    )
    if not os.path.exists(bounding_boxes_path):
        logging.warning(
            "Could not find bounding boxes information entity of type %s for paper %s. Skipping.",
            entity_name,
            arxiv_id,
        )
        return None

    for hue_info in load_from_csv(bounding_boxes_path, HueLocationInfo):
        box = BoundingBox(
            page=hue_info.page,
            left=hue_info.left,
            top=hue_info.top,
            width=hue_info.width,
            height=hue_info.height,
        )
        hue_iteration = HueIteration(hue_info.hue, hue_info.iteration)
        if hue_iteration not in boxes_by_hue_iteration:
            boxes_by_hue_iteration[hue_iteration] = []
        boxes_by_hue_iteration[hue_iteration].append(box)

    return boxes_by_hue_iteration
Example #12
def upload_symbols(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:

    arxiv_id = processing_summary.arxiv_id
    entities = [es.entity for es in processing_summary.entities]
    symbols = cast(List[SerializableSymbol], entities)
    symbols_by_id = {sid(s): s for s in symbols}

    entity_infos: List[EntityUploadInfo] = []

    # Load MathML matches for partial matching of symbols.
    matches: Matches = {}
    matches_path = os.path.join(
        directories.arxiv_subdir("symbol-matches", processing_summary.arxiv_id),
        "matches.csv",
    )
    if os.path.exists(matches_path):
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)
    else:
        logging.warning(
            "Could not find symbol matches information for paper %s.", arxiv_id,
        )

    # Load parent-child relationships for symbols.
    children: Dict[SymbolId, List[SymbolId]] = defaultdict(list)
    parents: Dict[SymbolId, SymbolId] = {}
    children_path = os.path.join(
        directories.arxiv_subdir("detected-symbols", arxiv_id), "symbol_children.csv"
    )
    if os.path.exists(children_path):
        for parent in file_utils.load_from_csv(children_path, SerializableChild):
            pid = f"{parent.tex_path}-{parent.equation_index}-{parent.symbol_index}"
            cid = f"{parent.tex_path}-{parent.equation_index}-{parent.child_index}"
            parents[cid] = pid
            children[pid].append(cid)
    else:
        logging.warning(
            "Could not find file mapping from symbol to their children for paper %s.",
            arxiv_id,
        )

    # Load contexts that the symbols appear in. Sort them by the symbol MathML.
    context_data_missing = False
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-symbols", arxiv_id), "contexts.csv",
    )
    if not os.path.exists(contexts_path):
        logging.warning(  # pylint: disable=logging-not-lazy
            "Contexts have not been found for symbols for arXiv paper %s. "
            + "Symbol data will be uploaded without contexts.",
            arxiv_id,
        )
        context_data_missing = True

    symbol_contexts = {}
    mathml_contexts = defaultdict(list)
    if not context_data_missing:
        for context in file_utils.load_from_csv(contexts_path, Context):
            tex_path = context.tex_path
            symbol_id = f"{tex_path}-{context.entity_id}"
            symbol_contexts[symbol_id] = context
            symbol = symbols_by_id[symbol_id]
            mathml_contexts[symbol.mathml].append(context)

    # Prepare collections of formulae that each symbol was found in.
    symbol_formulas = {}
    mathml_formulas: Dict[str, Set[DefiningFormula]] = defaultdict(set)
    for symbol in symbols:
        if (
            symbol.is_definition
            and symbol.equation is not None
            and symbol.relative_start is not None
            and symbol.relative_end is not None
        ):
            highlighted = wrap_span(
                symbol.equation,
                symbol.relative_start,
                symbol.relative_end,
                before=r"\htmlClass{match-highlight}{",
                after="}",
                braces=True,
            )
            formula = DefiningFormula(
                tex=highlighted,
                tex_path=symbol.tex_path,
                equation_id=str(symbol.equation_index),
            )
            symbol_formulas[sid(symbol)] = formula
            mathml_formulas[symbol.mathml].add(formula)

    entity_infos = []
    for localized_entity in processing_summary.entities:

        symbol = cast(SerializableSymbol, localized_entity.entity)
        boxes = [
            BoundingBox(l.left, l.top, l.width, l.height, l.page)
            for l in localized_entity.locations
        ]

        # Get context and formula of the symbol, and other matching ones.
        symbol_context = symbol_contexts.get(sid(symbol))
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if matching_sentence_id not in other_context_sentence_ids:
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        matching_formulas = mathml_formulas.get(symbol.mathml, set())
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if equation_id not in other_formula_ids:
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        # Package up data for the symbol.
        tags: List[str] = []
        MAX_BOX_HEIGHT = 0.1
        for b in boxes:
            if b.height > MAX_BOX_HEIGHT:
                logging.debug(  # pylint: disable=logging-not-lazy
                    "Detected large bounding box for symbol with height %f for entity %s of paper "
                    + "%s. Entity will be given a tag indicating it is unexpectedly large.",
                    b.height,
                    f"{localized_entity.entity.tex_path}-{localized_entity.entity.id_}",
                    arxiv_id,
                )
                tags.append("large")
                break

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "type": symbol.type_,
            "mathml": symbol.mathml,
            "mathml_near_matches": [m.matching_mathml for m in matches[symbol.mathml]],
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
            "tags": tags,
        }

        # Create links between this symbol, its sentence, and related symbols.
        sentence_id = (
            f"{symbol_context.tex_path}-{symbol_context.sentence_id}"
            if symbol_context is not None
            else None
        )

        parent_id = parents.get(sid(symbol))
        child_ids = children.get(sid(symbol), [])

        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation", id_=f"{symbol.tex_path}-{symbol.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [EntityReference(type_="symbol", id_=id_) for id_ in child_ids],
            "sentence": EntityReference(type_="sentence", id_=sentence_id)
            if sentence_id is not None
            else EntityReference(type_="sentence", id_=None),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_) for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
        }

        # Save all data for this symbol
        entity_information = EntityUploadInfo(
            id_=sid(symbol),
            type_="symbol",
            bounding_boxes=boxes,
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(
        processing_summary.s2_id, arxiv_id, entity_infos, data_version,
    )
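
wrap_span above highlights the symbol's span inside its defining formula TeX. Here is an illustrative stand-in for the same span-wrapping idea (not the project's wrap_span, which also handles brace balancing via the braces=True argument):

# Illustrative stand-in for the span wrapping used above; brace balancing
# is omitted here.
def wrap(tex: str, start: int, end: int, before: str, after: str) -> str:
    return tex[:start] + before + tex[start:end] + after + tex[end:]

equation = "E = mc^2"
highlighted = wrap(equation, 0, 1, r"\htmlClass{match-highlight}{", "}")
assert highlighted == r"\htmlClass{match-highlight}{E} = mc^2"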
Example #13
    def _execute(self, payload):
        frame = payload.original_frame
        raw_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        frames = self.frames
        median = np.median(
            frames, axis=0).astype(float) if len(frames) else np.ndarray(
                (0, ))  # .reshape(100, 100)

        if len(self.detection_mask) == 0:
            self.detection_mask = np.zeros(raw_gray.shape, dtype='uint8')

        # df = payload.dfs.get('detections', pd.DataFrame()).drop_duplicates()
        # if len(df) > 0:

        vehicle_detections = list(payload.vehicle_detections)

        boxes = (d.bounding_box for d in vehicle_detections)
        # boxes = (b.get_scaled(0.5) for b in boxes)
        bb_h_percentage = 0.2
        boxes = [
            BoundingBox(b.x, int(round(b.y + b.h * (1 - bb_h_percentage))),
                        b.w, int(round(b.h * bb_h_percentage))) for b in boxes
        ]
        for box in boxes:
            # y_start = max(0, box.y)
            # y_end = max(box.y + box.h, 0)
            # x_start = max(0, box.x)
            # x_end = max(0, box.x + box.w)
            # self.detection_mask[y_start: y_end, x_start:x_end] = 1
            self.detection_mask[box.y:box.y + box.h, box.x:box.x + box.w] = 1

        # cv2.imshow('detection_mask',self.detection_mask*255)
        # cv2.waitKey(0)

        gray = raw_gray.copy()  # * (1 - self.detection_mask)
        if len(median):
            idxs = np.where(self.detection_mask == 1)
            detections_median = np.median(median[idxs])
            gray[idxs] = detections_median

        self.frames.append(gray)

        title = 'Median'
        cv2.namedWindow(title, cv2.WINDOW_NORMAL)
        if len(median):
            cv2.imshow(title, median / 255)
        # cv2.imshow(title, frame)
        # cv2.imshow(title, gray / 255)

        # cv2.imwrite(r'median.jpg', median, )

        return payload

        # NOTE: everything below this return is unreachable; it is left over
        # from an erosion/dilation experiment on the inverted median image.
        src = median / 255

        from experimental import demo_erosion_dilatation
        src = (1 - src) * 255
        demo_erosion_dilatation(src, iterations=2)

        erosion_size = 5
        erosion_type = cv2.MORPH_ELLIPSE
        element = cv2.getStructuringElement(
            erosion_type, (2 * erosion_size + 1, 2 * erosion_size + 1),
            (erosion_size, erosion_size))
        erosion_dst = cv2.erode(src, element, iterations=1)
        cv2.imshow('erosion', erosion_dst)

        erosion_dst = cv2.dilate(src, element, iterations=2)
        cv2.imshow('dialation', erosion_dst)

        cv2.waitKey(0)

        element = cv2.getStructuringElement(
            erosion_type, (2 * erosion_size + 1, 2 * erosion_size + 1),
            (erosion_size, erosion_size))

        cv2.waitKey(1)
        # ============================================

        return payload
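
The reachable part of the method above estimates a static background as the per-pixel median over recent frames, filling detected vehicle regions from the current median so moving objects do not bleed into the estimate; a compact standalone sketch:

import numpy as np

# Standalone sketch of the median-background idea above, on a made-up 4x4
# frame stack with one masked detection region.
frames = [np.full((4, 4), v, dtype=float) for v in (10, 12, 11)]
median = np.median(frames, axis=0)         # per-pixel background estimate

new_frame = np.full((4, 4), 200.0)         # a bright moving object...
mask = np.zeros((4, 4), dtype="uint8")
mask[1:3, 1:3] = 1                         # ...detected in this region

idxs = np.where(mask == 1)
new_frame[idxs] = np.median(median[idxs])  # fill detections with background
frames.append(new_frame)                   # the next median stays clean
assert np.median(frames, axis=0)[1, 1] == 11.0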
Example #14
    def load_hues(self, arxiv_id: ArxivId,
                  iteration: str) -> List[HueSearchRegion]:

        equation_boxes_path = os.path.join(
            directories.arxiv_subdir("hue-locations-for-equations", arxiv_id),
            "hue_locations.csv",
        )
        bounding_boxes: Dict[EquationId, BoundingBoxesByFile] = {}

        for location_info in file_utils.load_from_csv(equation_boxes_path,
                                                      HueLocationInfo):
            equation_id = EquationId(
                tex_path=location_info.tex_path,
                equation_index=int(location_info.entity_id),
            )
            if equation_id not in bounding_boxes:
                bounding_boxes[equation_id] = {}

            file_path = location_info.relative_file_path
            if file_path not in bounding_boxes[equation_id]:
                bounding_boxes[equation_id][file_path] = []

            box = BoundingBox(
                page=location_info.page,
                left=location_info.left,
                top=location_info.top,
                width=location_info.width,
                height=location_info.height,
            )
            bounding_boxes[equation_id][file_path].append(box)

        token_records_by_equation: Dict[EquationId, Dict[
            int, EquationTokenColorizationRecord]] = {}
        token_hues_path = os.path.join(
            directories.iteration(
                "sources-with-colorized-equation-tokens",
                arxiv_id,
                iteration,
            ),
            "entity_hues.csv",
        )
        for record in file_utils.load_from_csv(
                token_hues_path, EquationTokenColorizationRecord):
            equation_id = EquationId(tex_path=record.tex_path,
                                     equation_index=record.equation_index)
            token_index = int(record.token_index)

            if equation_id not in token_records_by_equation:
                token_records_by_equation[equation_id] = {}
            token_records_by_equation[equation_id][token_index] = record

        hue_searches = []
        for equation_id, boxes_by_file in bounding_boxes.items():
            for file_path, boxes in boxes_by_file.items():
                masks_by_page: MasksForPages = {}
                for box in boxes:
                    if box.page not in masks_by_page:
                        masks_by_page[box.page] = []
                    masks_by_page[box.page].append(
                        Rectangle(box.left, box.top, box.width, box.height))

                if equation_id in token_records_by_equation:
                    for token_index, record in token_records_by_equation[
                            equation_id].items():
                        hue_searches.append(
                            HueSearchRegion(
                                hue=record.hue,
                                record=record,
                                relative_file_path=file_path,
                                masks=masks_by_page,
                            ))

        return hue_searches
Example #15
def box(left: float, top: float, width: float, height: float, page: int) -> BoundingBox:
    return BoundingBox(left, top, width, height, page)
Example #16
def test_get_token_bounding_box():
    s = symbol(tokens=[0])
    token_locations = {token_id(0): [BoundingBox(0.01, 0.01, 0.01, 0.01, 0)]}
    box = get_symbol_bounding_box(s, symbol_id(), token_locations)
    assert box == BoundingBox(0.01, 0.01, 0.01, 0.01, 0)
Example #17
def fetch_boxes(arxiv_id: ArxivId, schema: str, version: Optional[int],
                types: List[str]) -> Optional[RegionsByPageAndType]:
    # Discover the most recent version of data in the database for the paper.

    setup_database_connections(schema)
    if version is None:
        version_number = (Version.select(fn.Max(
            Version.index)).join(Paper).where(
                Paper.arxiv_id == arxiv_id).scalar())
        if version_number is None:
            logging.warning(  # pylint: disable=logging-not-lazy
                "There are no entities for paper %s in database schema %s",
                arxiv_id,
                schema,
            )
            return None
        version = int(version_number)

    # Load bounding boxes from rows in the tables.
    # Note that filtering of entities occurs in two stages:
    # 1. In the 'where' function call in the query below.
    # 2. When iterating over the returned 'rows' object.
    # In general, it is encouraged to write filtering conditions in the 'where'
    # clause, both to consolidate conditions and because it can be faster. That
    # said, if a filter is particularly tricky to express (e.g., involving many
    # interrelated conditions), it can instead be applied in the loop over the rows.
    rows = (
        EntityModel.select(
            EntityModel.id,
            EntityModel.type,
            BoundingBoxModel.left,
            BoundingBoxModel.top,
            BoundingBoxModel.width,
            BoundingBoxModel.height,
            BoundingBoxModel.page,
            # Aggregate data for an entity into an array, where each element
            # is a dictionary: {"key": "...", "value": "..."}. All values will
            # be of type string.
            fn.json_agg(
                fn.json_build_object("key", EntityDataModel.key, "value",
                                     EntityDataModel.value)).alias("data"),
        ).join(Paper).switch(EntityModel).join(BoundingBoxModel).switch(
            EntityModel).join(EntityDataModel).where(
                EntityModel.version == version,
                Paper.arxiv_id == arxiv_id,
                EntityModel.type << types,
            ).group_by(
                EntityModel.id,
                EntityModel.type,
                BoundingBoxModel.left,
                BoundingBoxModel.top,
                BoundingBoxModel.width,
                BoundingBoxModel.height,
                BoundingBoxModel.page,
            ).dicts())

    boxes_by_entity_db_id: Dict[str, List[BoundingBox]] = defaultdict(list)
    types_by_entity_db_id: Dict[str, str] = {}
    for row in rows:
        if row["type"] == "symbol":
            if any([
                    d["key"] == "type" and d["value"] not in GOLD_SYMBOL_TYPES
                    for d in row["data"]
            ]):
                continue

        boxes_by_entity_db_id[row["id"]].append(
            BoundingBox(
                row["left"],
                row["top"],
                row["width"],
                row["height"],
                row["page"],
            ))
        types_by_entity_db_id[row["id"]] = row["type"]

    regions: RegionsByPageAndType = defaultdict(list)
    for db_id, bounding_boxes in boxes_by_entity_db_id.items():
        by_page = group_by_page(bounding_boxes)
        for page, page_boxes in by_page.items():
            key = (page, types_by_entity_db_id[db_id])
            rectangles = frozenset([
                FloatRectangle(b.left, b.top, b.width, b.height)
                for b in page_boxes
            ])
            regions[key].append(rectangles)

    return regions
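
The row-level filter above drops symbol rows whose aggregated key/value data marks a non-gold type; a minimal sketch with made-up rows and a made-up gold set:

# Minimal sketch of the row-level filter above, with made-up rows in the
# json_agg format ({"key": ..., "value": ...}) and a made-up gold set.
GOLD_SYMBOL_TYPES = {"identifier", "function"}
rows = [
    {"id": 1, "type": "symbol", "data": [{"key": "type", "value": "identifier"}]},
    {"id": 2, "type": "symbol", "data": [{"key": "type", "value": "operator"}]},
]
kept = [
    r for r in rows
    if not (r["type"] == "symbol" and any(
        d["key"] == "type" and d["value"] not in GOLD_SYMBOL_TYPES
        for d in r["data"]))
]
assert [r["id"] for r in kept] == [1]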
Example #18
    def detect(self, image):
        # load our input image and grab its spatial dimensions
        min_confidence = self.min_confidence
        threshold = self.threshold
        net = self.net

        labels = self.labels

        (H, W) = image.shape[:2]

        # determine only the *output* layer names that we need from YOLO
        ln = net.getLayerNames()
        ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]

        # construct a blob from the input image and then perform a forward
        # pass of the YOLO object detector, giving us our bounding boxes and
        # associated probabilities
        blob = cv2.dnn.blobFromImage(image,
                                     scalefactor=self.scale_factor,
                                     size=self.blob_size,
                                     swapRB=True,
                                     crop=False)
        net.setInput(blob)

        layer_outputs = net.forward(ln)

        # initialize our lists of detected bounding boxes, confidences, and
        # class IDs, respectively
        boxes = []
        confidences = []
        class_ids = []

        # loop over each of the layer outputs
        for output in layer_outputs:
            # loop over each of the detections
            for detection in output:
                # extract the class ID and confidence (i.e., probability) of
                # the current object detection
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]

                # filter out weak predictions by ensuring the detected
                # probability is greater than the minimum probability

                if confidence > min_confidence:
                    # scale the bounding box coordinates back relative to the
                    # size of the image, keeping in mind that YOLO actually
                    # returns the center (x, y)-coordinates of the bounding
                    # box followed by the boxes' width and height
                    box = detection[0:4] * np.array([W, H, W, H])
                    (centerX, centerY, width, height) = box.astype("int")

                    # use the center (x, y)-coordinates to derive the
                    # top-left corner of the bounding box
                    x = int(centerX - (width / 2))
                    y = int(centerY - (height / 2))

                    # update our list of bounding box coordinates, confidences,
                    # and class IDs
                    boxes.append([x, y, int(width), int(height)])
                    confidences.append(float(confidence))
                    class_ids.append(class_id)

        # apply non-maxima suppression to suppress weak, overlapping bounding
        # boxes
        idxs = cv2.dnn.NMSBoxes(boxes, confidences, min_confidence, threshold)

        detections = []
        # ensure at least one detection exists
        if len(idxs) > 0:
            # loop over the indexes we are keeping
            for i in idxs.flatten():
                # extract the bounding box coordinates
                (x, y) = (boxes[i][0], boxes[i][1])
                (w, h) = (boxes[i][2], boxes[i][3])

                class_id = class_ids[i]
                label = labels[class_id]
                confidence = confidences[i]
                bounding_box = BoundingBox(x, y, w, h)
                detection = Detection(label, confidence, bounding_box)
                detections.append(detection)

        return detections
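
The only coordinate math in the detector above is converting YOLO's center-format boxes back to top-left corners; a standalone sketch with one made-up detection vector:

import numpy as np

# Standalone sketch of the YOLO coordinate math above: only the 4 coordinate
# entries of a detection vector (made up), on a 640x480 image.
W, H = 640, 480
detection = np.array([0.5, 0.5, 0.25, 0.5])  # center x, center y, w, h (ratios)
center_x, center_y, width, height = detection * np.array([W, H, W, H])
x = int(center_x - width / 2)   # 240
y = int(center_y - height / 2)  # 120
assert (x, y, int(width), int(height)) == (240, 120, 160, 240)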
Example #19
    def load(self) -> Iterator[SymbolData]:
        for arxiv_id in self.arxiv_ids:

            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            boxes: Dict[SymbolId, BoundingBox] = {}
            boxes_path = os.path.join(
                directories.arxiv_subdir("symbol-locations", arxiv_id),
                "symbol_locations.csv",
            )
            if not os.path.exists(boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for location in file_utils.load_from_csv(boxes_path,
                                                     SymbolLocation):
                symbol_id = SymbolId(
                    tex_path=location.tex_path,
                    equation_index=location.equation_index,
                    symbol_index=location.symbol_index,
                )
                box = BoundingBox(
                    page=int(location.page),
                    left=location.left,
                    top=location.top,
                    width=location.width,
                    height=location.height,
                )
                boxes[symbol_id] = box

            matches: Matches = {}
            matches_path = os.path.join(
                directories.arxiv_subdir("symbol-matches", arxiv_id),
                "matches.csv")
            if not os.path.exists(matches_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for match in file_utils.load_from_csv(matches_path, Match):
                if match.queried_mathml not in matches:
                    matches[match.queried_mathml] = []
                matches[match.queried_mathml].append(match)

            sentence_data_missing = False
            sentences_path = os.path.join(
                directories.arxiv_subdir("sentences-for-symbols", arxiv_id),
                "entity_sentences.csv",
            )
            if not os.path.exists(sentences_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Symbols for arXiv paper %s have not been aligned to sentences. "
                    +
                    "Symbol data will be uploaded without links to sentences",
                    arxiv_id,
                )
                sentence_data_missing = True

            # Initialize outside the conditional so the yield below never
            # references an unbound name when sentence data is missing.
            symbol_sentences = {}
            if not sentence_data_missing:
                for pair in file_utils.load_from_csv(sentences_path,
                                                     EntitySentencePairIds):
                    tex_path = pair.tex_path
                    equation_index, symbol_index = [
                        int(t) for t in pair.entity_id.split("-")
                    ]
                    sentence_key = SentenceKey(pair.tex_path, pair.sentence_id)
                    symbol_id = SymbolId(tex_path, equation_index,
                                         symbol_index)
                    symbol_sentences[symbol_id] = sentence_key

            yield SymbolData(
                arxiv_id,
                s2_id,
                symbols_with_ids,
                boxes,
                symbol_sentences,
                matches,
            )
Example #20
    def load(self) -> Iterator[SymbolData]:
        for arxiv_id in self.arxiv_ids:

            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            symbols_by_id = {s.symbol_id: s.symbol for s in symbols_with_ids}

            boxes: Dict[SymbolId, BoundingBox] = {}
            boxes_path = os.path.join(
                directories.arxiv_subdir("symbol-locations", arxiv_id),
                "symbol_locations.csv",
            )
            if not os.path.exists(boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for location in file_utils.load_from_csv(boxes_path,
                                                     SymbolLocation):
                symbol_id = SymbolId(
                    tex_path=location.tex_path,
                    equation_index=location.equation_index,
                    symbol_index=location.symbol_index,
                )
                box = BoundingBox(
                    page=int(location.page),
                    left=location.left,
                    top=location.top,
                    width=location.width,
                    height=location.height,
                )
                boxes[symbol_id] = box

            matches: Matches = {}
            matches_path = os.path.join(
                directories.arxiv_subdir("symbol-matches", arxiv_id),
                "matches.csv")
            if not os.path.exists(matches_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for match in file_utils.load_from_csv(matches_path, Match):
                if match.queried_mathml not in matches:
                    matches[match.queried_mathml] = []
                matches[match.queried_mathml].append(match)

            context_data_missing = False
            contexts_path = os.path.join(
                directories.arxiv_subdir("contexts-for-symbols", arxiv_id),
                "contexts.csv",
            )
            if not os.path.exists(contexts_path):
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Contexts have not been found for symbols for arXiv paper %s. "
                    + "Symbol data will be uploaded without contexts.",
                    arxiv_id,
                )
                context_data_missing = True

            symbol_contexts = {}
            mathml_contexts = defaultdict(list)
            if not context_data_missing:
                for context in file_utils.load_from_csv(
                        contexts_path, Context):
                    tex_path = context.tex_path
                    equation_index, symbol_index = [
                        int(t) for t in context.entity_id.split("-")
                    ]
                    symbol_id = SymbolId(tex_path, equation_index,
                                         symbol_index)
                    symbol_contexts[symbol_id] = context
                    symbol = symbols_by_id[symbol_id]
                    mathml_contexts[symbol.mathml].append(context)

            symbol_formulas = {}
            mathml_formulas = defaultdict(set)
            for id_, symbol in symbols_by_id.items():
                if (symbol.is_definition and symbol.equation is not None
                        and symbol.relative_start is not None
                        and symbol.relative_end is not None):
                    highlighted = wrap_span(
                        symbol.equation,
                        symbol.relative_start,
                        symbol.relative_end,
                        before=r"\htmlClass{match-highlight}{",
                        after="}",
                        braces=True,
                    )
                    formula = DefiningFormula(
                        tex=highlighted,
                        tex_path=id_.tex_path,
                        equation_id=str(id_.equation_index),
                    )
                    symbol_formulas[id_] = formula
                    mathml_formulas[symbol.mathml].add(formula)

            yield SymbolData(
                arxiv_id,
                s2_id,
                symbols_with_ids,
                boxes,
                symbol_contexts,
                symbol_formulas,
                mathml_contexts,
                mathml_formulas,
                matches,
            )