Ejemplo n.º 1
0
def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower,
                          from_sentence):
    """Yield n-grams from table sentences aligned with the spans of ``c``.

    For every span of ``c`` whose sentence is both tabular and visual, the
    sentences of that sentence's table are scanned for content whose
    bounding box is aligned with the span's bounding box.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: a ``TemporarySpan`` (used as the only span) or an object
        exposing ``get_contexts()`` that returns the spans to inspect.
    :param attrib: name of the sentence attribute whose tokens are turned
        into n-grams. Only read when ``from_sentence`` is true.
    :param n_min: minimum n-gram length. Only used when ``from_sentence``
        is true.
    :param n_max: maximum n-gram length (used in both modes).
    :param lower: if true, yielded n-grams are lower-cased.
    :param from_sentence: if true, alignment is tested on the bounding box
        of each whole sentence and the span's own sentence is excluded;
        otherwise it is tested on each candidate n-gram produced by
        ``Ngrams(...).apply(sentence)``.
    :return: a generator of n-grams.
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    candidate_space = Ngrams(n_max=n_max, split_tokens=[])

    def normalize(text):
        # Case folding is applied only to n-grams yielded in the
        # per-candidate mode; tokens_to_ngrams handles `lower` itself.
        return text.lower() if lower else text

    if isinstance(c, TemporarySpan):
        spans = [c]
    else:
        spans = c.get_contexts()

    for span in spans:
        # Only spans that live in a table and carry visual info qualify.
        if not (span.sentence.is_tabular() and span.sentence.is_visual()):
            continue

        for sentence in span.sentence.table.sentences:
            if from_sentence:
                # Sentence-level mode: compare whole-sentence bounding box.
                if not is_aligned(bbox_from_sentence(sentence),
                                  bbox_from_span(span)):
                    continue
                # Never report the sentence the span itself belongs to.
                if sentence is span.sentence:
                    continue
                for ngram in tokens_to_ngrams(getattr(sentence, attrib),
                                              n_min=n_min,
                                              n_max=n_max,
                                              lower=lower):
                    yield ngram
                continue

            # N-gram-level mode: compare each candidate's bounding box.
            for candidate in candidate_space.apply(sentence):
                if not is_aligned(bbox_from_span(candidate),
                                  bbox_from_span(span)):
                    continue
                # Skip candidates from the span's own sentence whose text
                # is contained in the span's text.
                if (sentence == span.sentence
                        and candidate.get_span() in span.get_span()):
                    continue
                yield normalize(candidate.get_span())
Ejemplo n.º 2
0
def _preprocess_visual_features(doc: Document) -> None:
    """Compute per-sentence layout coordinates and alignment groups once.

    The first call marks ``doc`` with a ``_visual_features`` attribute;
    any later call on the same document returns immediately.

    For every sentence in ``doc.sentences`` this resets
    ``sentence._aligned_lemmas`` to an empty set. Sentences are then
    grouped by ``sentence.page[0]`` and, page by page, each sentence
    receives ``bbox``, ``yc``, ``x0``, ``x1`` and ``xc`` attributes derived
    from ``bbox_from_sentence``. Sentences sharing exactly the same ``yc``,
    ``x0``, ``x1`` or ``xc`` value are grouped together, each group is
    sorted, and the groups are handed to ``_assign_alignment_features``
    with the prefixes ``"Y_"``, ``"LEFT_"``, ``"RIGHT_"`` and ``"CENTER_"``.

    :param doc: the document whose sentences are annotated in place.
    """
    if hasattr(doc, "_visual_features"):
        return
    # cache flag
    doc._visual_features = True

    def by_xc(item):
        return item.xc

    def by_yc(item):
        return item.yc

    sentence_by_page: DefaultDict[str, List[Sentence]] = defaultdict(list)
    for sentence in doc.sentences:
        sentence_by_page[sentence.page[0]].append(sentence)
        sentence._aligned_lemmas = set()

    # Alignments are computed independently for each page.
    for page_sentences in sentence_by_page.values():
        yc_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        x0_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        xc_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)
        x1_aligned: DefaultDict[int, List[Sentence]] = defaultdict(list)

        for sentence in page_sentences:
            box = bbox_from_sentence(sentence)
            sentence.bbox = box
            sentence.yc = (box.top + box.bottom) / 2
            sentence.x0 = box.left
            sentence.x1 = box.right
            sentence.xc = (box.left + box.right) / 2
            # index current sentence by different alignment keys
            yc_aligned[sentence.yc].append(sentence)
            x0_aligned[sentence.x0].append(sentence)
            x1_aligned[sentence.x1].append(sentence)
            xc_aligned[sentence.xc].append(sentence)

        # Sentences sharing a vertical centre are ordered by horizontal
        # centre; sentences sharing an x coordinate are ordered by vertical
        # centre.
        for row in yc_aligned.values():
            row.sort(key=by_xc)
        for column_index in (x0_aligned, x1_aligned, xc_aligned):
            for column in column_index.values():
                column.sort(key=by_yc)

        for index, prefix in (
            (yc_aligned, "Y_"),
            (x0_aligned, "LEFT_"),
            (x1_aligned, "RIGHT_"),
            (xc_aligned, "CENTER_"),
        ):
            _assign_alignment_features(index, prefix)
Ejemplo n.º 3
0
def _get_direction_ngrams(
    direction: str,
    c: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str,
    n_min: int,
    n_max: int,
    lower: bool,
    from_sentence: bool,
) -> Iterator[str]:
    """Yield n-grams from table sentences aligned with the spans of ``c``.

    ``c`` is converted to spans with ``_to_spans``. For each span whose
    sentence is both tabular and visual, every sentence of that sentence's
    table is scanned for content whose bounding box is aligned with the
    span's bounding box.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: the candidate, mention or span whose spans are inspected.
    :param attrib: name of the sentence attribute whose tokens are turned
        into n-grams. Only read when ``from_sentence`` is true.
    :param n_min: minimum n-gram length. Only used when ``from_sentence``
        is true.
    :param n_max: maximum n-gram length (used in both modes).
    :param lower: if true, yielded n-grams are lower-cased.
    :param from_sentence: if true, alignment is tested on the bounding box
        of each whole sentence and the span's own sentence is excluded;
        otherwise it is tested on each candidate n-gram produced by
        ``Ngrams(...).apply(sentence)``.
    :return: an iterator over the matching n-grams.
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        aligned = bbox_vert_aligned
    else:
        aligned = bbox_horz_aligned
    ngram_source = Ngrams(n_max=n_max, split_tokens=[])

    def cased(text):
        # Applied only in the per-candidate mode; tokens_to_ngrams receives
        # `lower` directly in the sentence-level mode.
        if lower:
            return text.lower()
        return text

    for span in _to_spans(c):
        # Only spans that live in a table and carry visual info qualify.
        if not (span.sentence.is_tabular() and span.sentence.is_visual()):
            continue

        for sentence in span.sentence.table.sentences:
            if not from_sentence:
                # N-gram-level mode: test each candidate's bounding box.
                for piece in ngram_source.apply(sentence):
                    if not aligned(bbox_from_span(piece),
                                   bbox_from_span(span)):
                        continue
                    # Skip candidates from the span's own sentence whose
                    # text is contained in the span's text.
                    if (sentence == span.sentence
                            and piece.get_span() in span.get_span()):
                        continue
                    yield cased(piece.get_span())
                continue

            # Sentence-level mode: test the whole-sentence bounding box.
            if not aligned(bbox_from_sentence(sentence),
                           bbox_from_span(span)):
                continue
            # Never report the sentence the span itself belongs to.
            if sentence is span.sentence:
                continue
            yield from tokens_to_ngrams(
                getattr(sentence, attrib),
                n_min=n_min,
                n_max=n_max,
                lower=lower,
            )