Esempio n. 1
0
def test_pdf_word_list_is_sorted():
    """Check that the word list built from an HTML page is sorted as expected.

    The fixture no_image_unsorted.html was originally created from
    pdf_simple/no_image.pdf, but the order of its html elements (blocks and
    words) has been changed so that this test can tell whether pdf_word_list
    is sorted as expected.
    """
    html_path = "tests/data/html_simple/no_image_unsorted.html"
    linker = VisualLinker(pdf_path="dummy_path")
    with open(html_path) as html_file:
        parsed = BeautifulSoup(html_file, "html.parser")
    first_page = parsed.find_all("page")[0]
    pdf_word_list, _ = linker._coordinates_from_HTML(first_page, 1)

    # Only the first seven entries are inspected; keep just their text.
    leading_words = [content for (_, content) in pdf_word_list[:7]]
    title_words = leading_words[:2]

    # Words must be sorted by block top: the two title words come first,
    # in either order.
    assert {word for word in title_words} == {"Sample", "HTML"}
    # Words must be sorted by top.
    expected_body_start = ["This", "is", "an", "html", "that"]
    assert leading_words[2:7] == expected_body_start
    # Words must be sorted by left (#449): the title words are also in order.
    assert title_words == ["Sample", "HTML"]
Esempio n. 2
0
    def __init__(
        self,
        structural,
        blacklist,
        flatten,
        lingual,
        strip,
        replacements,
        tabular,
        visual,
        pdf_path,
        language,
        **kwargs
    ):
        """Configure the structural, lingual, tabular and visual parser options.

        :param blacklist: a list, or a single value that is wrapped in a list
        :param flatten: a list, or a single value that is wrapped in a list
        :param lingual: if truthy, NLP enrichment is enabled, provided the
            language parser reports NLP support; otherwise it is switched off
            with a warning
        :param replacements: a list of (_pattern_, _replace_) tuples where
            _pattern_ is a regex and _replace_ is a character string.
            All occurrences of _pattern_ in the text will be replaced by
            _replace_.
        :param visual: boolean, if True visual features are used in the model
        :param pdf_path: directory where pdf are saved, if a pdf file is not
            found, it will be created from the html document and saved in that
            directory
        :param language: language handed to the Spacy parser
        :param kwargs: forwarded unchanged to the parent initializer
        """
        super(ParserUDF, self).__init__(**kwargs)

        # structural (html) setup -- scalars are normalized to one-item lists
        self.structural = structural
        if isinstance(blacklist, list):
            self.blacklist = blacklist
        else:
            self.blacklist = [blacklist]
        if isinstance(flatten, list):
            self.flatten = flatten
        else:
            self.flatten = [flatten]

        # lingual setup
        self.language = language
        self.strip = strip
        # Compile every pattern once, up front.
        self.replacements = [
            (re.compile(pattern, flags=re.UNICODE), replace)
            for (pattern, replace) in replacements
        ]

        self.lingual = lingual
        self.lingual_parser = Spacy(self.language)
        if not self.lingual_parser.has_tokenizer_support():
            # No tokenizer for this language: use the simple tokenizer.
            self.tokenize_and_split_sentences = SimpleTokenizer().parse
        else:
            self.tokenize_and_split_sentences = self.lingual_parser.split_sentences
            self.lingual_parser.load_lang_model()

        if self.lingual and not self.lingual_parser.has_NLP_support():
            logger.warning("Lingual mode will be turned off, "
                           "as spacy doesn't provide support for this "
                           "language ({})".format(self.language))
            self.lingual = False
        elif self.lingual:
            self.enrich_tokenized_sentences_with_nlp = (
                self.lingual_parser.enrich_sentences_with_NLP)

        # tabular setup
        self.tabular = tabular

        # visual setup -- pdf_path and the linker are only set in visual mode
        self.visual = visual
        if self.visual:
            self.pdf_path = pdf_path
            self.vizlink = VisualLinker()
Esempio n. 3
0
    def __init__(
        self,
        structural,
        blacklist,
        flatten,
        flatten_delim,
        lingual,
        strip,
        replacements,
        tabular,
        visual,
        pdf_path,
        lingual_parser,
        **kwargs
    ):
        """Configure the structural, lingual, tabular and visual parser options.

        :param blacklist: a list, or a single value that is wrapped in a list
        :param flatten: a list, or a single value that is wrapped in a list
        :param lingual: if truthy, ``lingual_parser.parse`` is used for parsing;
        otherwise the parse method of a SimpleTokenizer is used
        :param replacements: a list of (_pattern_, _replace_) tuples where _pattern_ is
        a regex and _replace_ is a character string. All occurrences of _pattern_ in the
        text will be replaced by _replace_.
        :param visual: boolean, if True visual features are used in the model
        :param pdf_path: directory where pdf are saved, if a pdf file is not found,
        it will be created from the html document and saved in that directory
        :param lingual_parser: parser object; only stored when ``lingual`` is truthy
        :param kwargs: forwarded unchanged to the parent initializer
        """
        super(ParserUDF, self).__init__(**kwargs)

        # structural (html) setup -- scalars are normalized to one-item lists
        self.structural = structural
        if isinstance(blacklist, list):
            self.blacklist = blacklist
        else:
            self.blacklist = [blacklist]
        if isinstance(flatten, list):
            self.flatten = flatten
        else:
            self.flatten = [flatten]
        self.flatten_delim = flatten_delim

        # lingual setup
        self.lingual = lingual
        self.strip = strip
        # Compile every pattern once, up front.
        self.replacements = [
            (re.compile(pattern, flags=re.UNICODE), replace)
            for (pattern, replace) in replacements
        ]
        if not self.lingual:
            # Lingual mode is off: fall back to the simple tokenizer.
            self.lingual_parse = SimpleTokenizer().parse
        else:
            self.lingual_parser = lingual_parser
            self.lingual_parse = self.lingual_parser.parse

        # tabular setup
        self.tabular = tabular

        # visual setup -- pdf_path and the linker are only set in visual mode
        self.visual = visual
        if self.visual:
            self.pdf_path = pdf_path
            self.vizlink = VisualLinker()
Esempio n. 4
0
def test_visual_linker_not_affected_by_order_of_sentences():
    """Check that the visual_linker result does not depend on sentence order."""
    docs_path = "tests/data/html/2N6427.html"
    pdf_path = "tests/data/pdf/2N6427.pdf"
    by_position = attrgetter("position")

    # Build the preprocessor, parser and visual_linker.
    # The parser is deliberately created with `visual=False`: the
    # visual_linker is applied separately, after parsing, to attach the
    # "visual" information to the sentences.
    preprocessor = HTMLDocPreprocessor(docs_path)
    parser_udf = get_parser_udf(
        structural=True, lingual=False, tabular=True, visual=False
    )
    visual_linker = VisualLinker(pdf_path=pdf_path)

    # First pass: link the sentences in position order.
    doc = parser_udf.apply(next(iter(preprocessor)))
    doc.sentences = sorted(doc.sentences, key=by_position)
    # Sort the linked result again in case visual_linker.link changes the order.
    sentences0 = sorted(
        visual_linker.link(doc.name, doc.sentences, pdf_path), key=by_position
    )

    # Second pass: link the same document with its sentences shuffled.
    doc = parser_udf.apply(next(iter(preprocessor)))
    random.shuffle(doc.sentences)
    sentences1 = sorted(
        visual_linker.link(doc.name, doc.sentences, pdf_path), key=by_position
    )

    # Both results are sorted by position, so the pairs must line up.
    assert all(
        first.position == second.position
        for first, second in zip(sentences0, sentences1)
    )

    # If the visual_linker result is independent of the sentence order, the
    # linked coordinates must agree as well.
    assert all(
        first.left == second.left
        for first, second in zip(sentences0, sentences1)
    )
Esempio n. 5
0
    def __init__(
        self,
        structural: bool,
        blacklist: Union[str, List[str]],
        flatten: Union[str, List[str]],
        lingual: bool,
        lingual_parser: Optional[LingualParser],
        strip: bool,
        replacements: List[Tuple[str, str]],
        tabular: bool,
        visual: bool,
        vizlink: Optional[VisualLinker],
        pdf_path: Optional[str],
        language: Optional[str],
        **kwargs: Any,
    ) -> None:
        """Initialize Parser UDF.

        :param blacklist: a list, or a single value that is wrapped in a list
        :param flatten: a list, or a single value that is wrapped in a list
        :param lingual: if True, lingual mode is requested; it is switched off
            with a warning when the chosen parser reports no NLP support
        :param lingual_parser: parser to use; when falsy a SpacyParser for
            ``language`` is created, replaced by a SimpleParser if it reports
            no tokenizer support
        :param replacements: a list of (_pattern_, _replace_) tuples where
            _pattern_ is a regex and _replace_ is a character string.
            All occurrences of _pattern_ in the text will be replaced by
            _replace_.
        :param visual: boolean, if True visual features are used in the model
        :param vizlink: visual linker to use; when falsy and ``visual`` is
            True, one is created from ``pdf_path``
        :param pdf_path: directory where pdf are saved, if a pdf file is not
            found, it will be created from the html document and saved in that
            directory
        :param kwargs: forwarded unchanged to the parent initializer
        """
        super().__init__(**kwargs)

        # structural (html) setup -- scalars are normalized to one-item lists
        self.structural = structural
        if isinstance(blacklist, list):
            self.blacklist = blacklist
        else:
            self.blacklist = [blacklist]
        if isinstance(flatten, list):
            self.flatten = flatten
        else:
            self.flatten = [flatten]

        # lingual setup
        self.language = language
        self.strip = strip
        # Compile every pattern once, up front.
        self.replacements: List[Tuple[Pattern, str]] = [
            (re.compile(pattern, flags=re.UNICODE), replace)
            for (pattern, replace) in replacements
        ]

        self.lingual = lingual
        if lingual_parser:
            self.lingual_parser = lingual_parser
        else:
            spacy_parser = SpacyParser(self.language)
            # Fallback to SimpleParser if a tokenizer is not supported.
            self.lingual_parser = (
                spacy_parser
                if spacy_parser.has_tokenizer_support()
                else SimpleParser()
            )

        if self.lingual and not self.lingual_parser.has_NLP_support():
            logger.warning(
                "Lingual mode will be turned off, "
                "as spacy doesn't provide support for this "
                f"language ({self.language})"
            )
            self.lingual = False

        # tabular setup
        self.tabular = tabular

        # visual setup -- pdf_path is only stored in visual mode
        self.visual = visual
        self.vizlink = vizlink
        if self.visual:
            self.pdf_path = pdf_path
            linker_missing = not self.vizlink
            if linker_missing and self.pdf_path:
                # Use the provided pdf_path to build the linker.
                self.vizlink = VisualLinker(pdf_path)
            elif linker_missing:
                # Neither a linker nor a pdf_path: disable visual mode.
                warnings.warn(
                    "Visual parsing failed: pdf_path is required. "
                    "Proceeding without visual parsing.",
                    RuntimeWarning,
                )
                self.visual = False