Ejemplo n.º 1
0
def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower,
                          from_sentence):
    """Yield n-grams from table sentences aligned with the spans of ``c``.

    For every span of ``c`` whose sentence is both tabular and visual, the
    sentences of that span's table are scanned and n-grams are yielded for
    the ones whose bounding box lines up with the span's bounding box.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: A ``TemporarySpan`` (used as-is) or an object exposing
        ``get_contexts()`` that returns the spans to inspect.
    :param attrib: Name of the sentence attribute whose tokens are handed to
        ``tokens_to_ngrams`` (only used when ``from_sentence`` is true).
    :param n_min: Minimum n-gram length; only forwarded to
        ``tokens_to_ngrams``, i.e. only honoured when ``from_sentence`` is
        true.
    :param n_max: Maximum n-gram length.
    :param lower: If true, n-grams are lowercased.
    :param from_sentence: If true, alignment is tested on whole-sentence
        bounding boxes and the span's own sentence is skipped. Otherwise each
        candidate n-gram span is tested individually.
    :rtype: a generator of n-grams
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    span_ngrams = Ngrams(n_max=n_max, split_tokens=[])

    def normalize(text):
        return text.lower() if lower else text

    if isinstance(c, TemporarySpan):
        targets = [c]
    else:
        targets = c.get_contexts()

    for target in targets:
        # Alignment needs both table membership and visual coordinates.
        if not (target.sentence.is_tabular()
                and target.sentence.is_visual()):
            continue
        for sentence in target.sentence.table.sentences:
            if from_sentence:
                if not is_aligned(bbox_from_sentence(sentence),
                                  bbox_from_span(target)):
                    continue
                if sentence is target.sentence:
                    # Never report n-grams from the span's own sentence.
                    continue
                for ngram in tokens_to_ngrams(getattr(sentence, attrib),
                                              n_min=n_min,
                                              n_max=n_max,
                                              lower=lower):
                    yield ngram
                continue
            for ts in span_ngrams.apply(sentence):
                if not is_aligned(bbox_from_span(ts),
                                  bbox_from_span(target)):
                    continue
                # Skip n-grams that are part of the span's own text.
                if (sentence == target.sentence
                        and ts.get_span() in target.get_span()):
                    continue
                yield normalize(ts.get_span())
Ejemplo n.º 2
0
def is_vert_aligned(c: Candidate) -> bool:
    """Check that every component of ``c`` is vertically aligned with the first.

    Each component is converted with ``_to_span`` and its bounding box is
    compared with the bounding box of ``c[0]`` via ``bbox_vert_aligned``. A
    component whose sentence is not visual counts as not aligned. Every
    component is evaluated (there is no early exit), and an empty candidate
    yields ``True``.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    outcomes = []
    for idx in range(len(c)):
        if not _to_span(c[idx]).sentence.is_visual():
            # No visual information, so this component cannot be aligned.
            outcomes.append(False)
            continue
        component_box = bbox_from_span(_to_span(c[idx]))
        anchor_box = bbox_from_span(_to_span(c[0]))
        outcomes.append(bbox_vert_aligned(component_box, anchor_box))
    return all(outcomes)
Ejemplo n.º 3
0
def is_vert_aligned_center(c):
    """Check that all components of ``c`` are vertically aligned on their centers.

    The bounding box of each Span of ``c`` is compared with the bounding box
    of ``c[0]`` using ``bbox_vert_aligned_center``. A Span whose sentence is
    not visual counts as not aligned. Every component is evaluated (there is
    no early exit), and an empty candidate yields ``True``.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    outcomes = []
    for idx in range(len(c)):
        if not c[idx].sentence.is_visual():
            # No visual information, so this component cannot be aligned.
            outcomes.append(False)
            continue
        component_box = bbox_from_span(c[idx])
        anchor_box = bbox_from_span(c[0])
        outcomes.append(bbox_vert_aligned_center(component_box, anchor_box))
    return all(outcomes)
Ejemplo n.º 4
0
def is_horz_aligned(c):
    """Check that every component of ``c`` is horizontally aligned with the first.

    The bounding box of each Span of ``c`` is compared with the bounding box
    of ``c[0]`` using ``bbox_horz_aligned``. A Span whose sentence is not
    visual counts as not aligned. Every component is evaluated (there is no
    early exit), and an empty candidate yields ``True``.

    :param c: The candidate to evaluate
    :rtype: boolean
    """

    def aligned_with_first(idx):
        # Without visual information the bounding boxes are never consulted.
        if not c[idx].sentence.is_visual():
            return False
        return bbox_horz_aligned(bbox_from_span(c[idx]), bbox_from_span(c[0]))

    outcomes = [aligned_with_first(idx) for idx in range(len(c))]
    return all(outcomes)
Ejemplo n.º 5
0
def same_page(c: Candidate) -> bool:
    """Check that every component of ``c`` is on the same page as the first.

    Each component is converted with ``_to_span``; the ``page`` attribute of
    its bounding box is compared with that of ``c[0]``. A component whose
    sentence is not visual makes the result ``False``. Every component is
    evaluated (there is no early exit), and an empty candidate yields
    ``True``.

    NOTE(review): page numbers presumably come from the PDF rendering of the
    document -- confirm against the parser that fills in the bounding boxes.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    page_matches = []
    for idx in range(len(c)):
        if not _to_span(c[idx]).sentence.is_visual():
            # No visual information means no page number to compare.
            page_matches.append(False)
            continue
        component_page = bbox_from_span(_to_span(c[idx])).page
        anchor_page = bbox_from_span(_to_span(c[0])).page
        page_matches.append(component_page == anchor_page)
    return all(page_matches)
Ejemplo n.º 6
0
def is_vert_aligned_right(c: Candidate) -> bool:
    """Check that all components of ``c`` are vertically aligned on the right.

    Each component is converted with ``_to_span`` and its bounding box is
    compared with the bounding box of ``c[0]`` via
    ``bbox_vert_aligned_right``. A component whose sentence is not visual
    counts as not aligned. Every component is evaluated (there is no early
    exit), and an empty candidate yields ``True``.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    outcomes = []
    for idx in range(len(c)):
        if not _to_span(c[idx]).sentence.is_visual():
            # No visual information, so this component cannot be aligned.
            outcomes.append(False)
            continue
        component_box = bbox_from_span(_to_span(c[idx]))
        anchor_box = bbox_from_span(_to_span(c[0]))
        outcomes.append(bbox_vert_aligned_right(component_box, anchor_box))
    return all(outcomes)
Ejemplo n.º 7
0
def _get_direction_ngrams(
    direction: str,
    c: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str,
    n_min: int,
    n_max: int,
    lower: bool,
    from_sentence: bool,
) -> Iterator[str]:
    """Yield n-grams from table sentences aligned with the spans of ``c``.

    ``c`` is expanded with ``_to_spans``. For every resulting span whose
    sentence is both tabular and visual, the sentences of that span's table
    are scanned and n-grams are yielded for the ones whose bounding box lines
    up with the span's bounding box.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: The Candidate, Mention or span to evaluate.
    :param attrib: Name of the sentence attribute whose tokens are handed to
        ``tokens_to_ngrams`` (only used when ``from_sentence`` is true).
    :param n_min: Minimum n-gram length; only forwarded to
        ``tokens_to_ngrams``, i.e. only honoured when ``from_sentence`` is
        true.
    :param n_max: Maximum n-gram length.
    :param lower: If true, n-grams are lowercased.
    :param from_sentence: If true, alignment is tested on whole-sentence
        bounding boxes and the span's own sentence is skipped. Otherwise each
        candidate n-gram span is tested individually.
    :rtype: a generator of n-grams
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    span_ngrams = Ngrams(n_max=n_max, split_tokens=[])

    def normalize(text):
        return text.lower() if lower else text

    for target in _to_spans(c):
        # Alignment needs both table membership and visual coordinates.
        if not (target.sentence.is_tabular()
                and target.sentence.is_visual()):
            continue
        for sentence in target.sentence.table.sentences:
            if from_sentence:
                if not is_aligned(bbox_from_sentence(sentence),
                                  bbox_from_span(target)):
                    continue
                if sentence is target.sentence:
                    # Never report n-grams from the span's own sentence.
                    continue
                yield from tokens_to_ngrams(getattr(sentence, attrib),
                                            n_min=n_min,
                                            n_max=n_max,
                                            lower=lower)
                continue
            for ts in span_ngrams.apply(sentence):
                if not is_aligned(bbox_from_span(ts),
                                  bbox_from_span(target)):
                    continue
                # Skip n-grams that are part of the span's own text.
                if (sentence == target.sentence
                        and ts.get_span() in target.get_span()):
                    continue
                yield normalize(ts.get_span())
Ejemplo n.º 8
0
def get_page_vert_percentile(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    page_width: int = DEFAULT_WIDTH,
    page_height: int = DEFAULT_HEIGHT,
) -> float:
    """Return how far down the page, as a fraction, the Mention starts.

    The value is the ``top`` coordinate of the Mention's bounding box divided
    by ``page_height``: 0.0 is the top of the page and 1.0 is the bottom. For
    example, a Mention a quarter of the way down the page yields 0.25.

    Page width and height are based on pt values::

        Letter      612x792
        Tabloid     792x1224
        Ledger      1224x792
        Legal       612x1008
        Statement   396x612
        Executive   540x720
        A0          2384x3371
        A1          1685x2384
        A2          1190x1684
        A3          842x1190
        A4          595x842
        A4Small     595x842
        A5          420x595
        B4          729x1032
        B5          516x729
        Folio       612x936
        Quarto      610x780
        10x14       720x1008

    and should match the source documents. The defaults are ``DEFAULT_WIDTH``
    and ``DEFAULT_HEIGHT`` (documented as Letter size).

    The argument is reduced to a single span by ``_to_span``; for a candidate
    this is presumably its first Mention -- confirm against ``_to_span``.

    :param mention: The Mention to evaluate
    :param page_width: The width of the page. Accepted for symmetry with the
        horizontal variant; not used in this computation.
    :param page_height: The height of the page.
    :rtype: float, expected in [0.0, 1.0] when ``page_height`` matches the
        source document
    """
    target_span = _to_span(mention)
    top_edge = bbox_from_span(target_span).top
    return top_edge / page_height
Ejemplo n.º 9
0
def get_page_horz_percentile(span,
                             page_width=DEFAULT_WIDTH,
                             page_height=DEFAULT_HEIGHT):
    """Return which percentile from the LEFT in the page the Span is located in.

    The value is the ``left`` coordinate of the Span's bounding box divided by
    ``page_width``: 0.0 is the left edge of the page and 1.0 is the right
    edge.

    Page width and height are based on pt values::

        Letter      612x792
        Tabloid     792x1224
        Ledger      1224x792
        Legal       612x1008
        Statement   396x612
        Executive   540x720
        A0          2384x3371
        A1          1685x2384
        A2          1190x1684
        A3          842x1190
        A4          595x842
        A4Small     595x842
        A5          420x595
        B4          729x1032
        B5          516x729
        Folio       612x936
        Quarto      610x780
        10x14       720x1008

    and should match the source documents. The defaults are ``DEFAULT_WIDTH``
    and ``DEFAULT_HEIGHT`` (documented as Letter size).

    Note that if a candidate is passed in, only the horizontal percentile of
    its first Span is returned.

    :param span: The Span to evaluate, or a candidate whose first Span is
        used.
    :param page_width: The width of the page.
    :param page_height: The height of the page. Accepted for symmetry with the
        vertical variant; not used in this computation.
    :rtype: float, expected in [0.0, 1.0] when ``page_width`` matches the
        source document
    """
    # Anything that is not already a TemporarySpan is treated as a candidate
    # and reduced to its first component.
    span = span if isinstance(span, TemporarySpan) else span[0]
    # Divide by the page width; a comma here would build a
    # (left, page_width) tuple instead of the documented float ratio.
    return bbox_from_span(span).left / page_width