Esempio n. 1
0
def test_visual_linker_not_affected_by_order_of_sentences():
    """Check that the visual_linker result does not depend on sentence order.

    The first document of the preprocessor is parsed twice. On the first pass
    its sentences are handed to ``visual_linker.link`` sorted by ``position``;
    on the second pass they are shuffled first. Both results are then sorted
    by ``position`` and compared pairwise on ``position`` and ``left``.
    """
    docs_path = "tests/data/html/2N6427.html"
    pdf_path = "tests/data/pdf/2N6427.pdf"

    # The parser is created with `visual=False`; visual_linker is used to
    # attach the "visual" information to the sentences after parsing instead.
    preprocessor = HTMLDocPreprocessor(docs_path)
    parser_udf = get_parser_udf(
        structural=True,
        lingual=False,
        tabular=True,
        visual=False,
    )
    visual_linker = VisualLinker(pdf_path=pdf_path)

    by_position = attrgetter("position")

    # Pass 1: link the sentences in position order.
    doc = parser_udf.apply(next(iter(preprocessor)))
    doc.sentences = sorted(doc.sentences, key=by_position)
    # Sort the output again in case visual_linker.link changes the order.
    sentences0 = sorted(
        visual_linker.link(doc.name, doc.sentences, pdf_path), key=by_position
    )

    # Pass 2: link the sentences of a fresh parse in random order.
    doc = parser_udf.apply(next(iter(preprocessor)))
    random.shuffle(doc.sentences)
    sentences1 = sorted(
        visual_linker.link(doc.name, doc.sentences, pdf_path), key=by_position
    )

    # zip() stops at the shorter input, so without this check the pairwise
    # assertions below would silently pass if one pass lost sentences.
    assert len(sentences0) == len(sentences1)

    # This should hold as both lists are sorted by their position.
    assert all(
        sent0.position == sent1.position
        for sent0, sent1 in zip(sentences0, sentences1)
    )

    # This should hold if the visual_linker result is not affected by the
    # order of sentences.
    assert all(
        sent0.left == sent1.left for sent0, sent1 in zip(sentences0, sentences1)
    )
Esempio n. 2
0
def parse_doc(docs_path: str, file_name: str, pdf_path: Optional[str] = None):
    """Parse one HTML file and return the resulting document.

    Args:
        docs_path: Path handed to ``HTMLDocPreprocessor`` and to its
            ``_parse_file`` method.
        file_name: File name handed to ``_parse_file``.
        pdf_path: Optional PDF path forwarded to the parser. Visual parsing
            is switched on only when this value is truthy.

    Returns:
        Whatever ``parser_udf.apply`` returns for the first document yielded
        by ``_parse_file``.
    """
    logger.info("Parsing...")

    # Only the first document produced by the preprocessor is used.
    html_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=1)
    raw_doc = next(html_preprocessor._parse_file(docs_path, file_name))

    # Build the parser; visual features require a PDF to link against.
    use_visual = bool(pdf_path)
    udf = get_parser_udf(
        structural=True,
        tabular=True,
        lingual=True,
        visual=use_visual,
        pdf_path=pdf_path,
        language="en",
    )
    return udf.apply(raw_doc)
Esempio n. 3
0
def mention_setup():
    """Return the 1-gram mentions extracted from the parsed ``md.html`` document.

    The first document yielded by the HTML preprocessor is parsed with
    structural, tabular, lingual and visual options enabled (the visual
    parser is a ``PdfVisualParser`` built from the PDF directory), and every
    item produced by a ``MentionNgrams(n_min=1, n_max=1)`` space over that
    document is collected into a list.
    """
    html_path = "tests/data/html_simple/md.html"
    pdf_dir = "tests/data/pdf_simple/"

    # Take the first document produced by the preprocessor.
    raw_doc = next(iter(HTMLDocPreprocessor(html_path)))

    # Build the parser and parse the md document.
    udf = get_parser_udf(
        structural=True,
        tabular=True,
        lingual=True,
        visual=True,
        visual_parser=PdfVisualParser(pdf_dir),
        language="en",
    )
    parsed_doc = udf.apply(raw_doc)

    # Collect the 1-gram span mentions.
    unigram_space = MentionNgrams(n_min=1, n_max=1)
    return list(unigram_space.apply(parsed_doc))