Example #1
0
def test_table_cell_separator():
    """Check the text rendered for a two-cell table row.

    The same HTML is converted once with the default configuration and once
    with ``table_cell_separator='\\t'``.
    """
    html = '<html><body><table><tr><td>Hallo<br>Eins</td><td>Echo<br>Zwei</td></tr></table></html>'

    # (parser configuration, expected text) pairs
    expectations = (
        (ParserConfig(), 'Hallo  Echo\nEins   Zwei\n'),
        (ParserConfig(table_cell_separator='\t'),
         'Hallo\tEcho\nEins \tZwei\n'),
    )
    for parser_config, expected_text in expectations:
        assert get_text(html, parser_config) == expected_text
def test_html_annotations(filter_str=''):
    """Compare computed annotations with the stored reference annotations.

    For every file matching ``TESTCASE_PATTERN`` the reference is loaded, the
    sibling ``.html`` file is converted with the 'strict' and the 'relaxed'
    CSS profile, and the resulting ``[label, text]`` pairs are checked against
    ``reference['result']``.

    Args:
        filter_str: only files whose path contains this substring are
            processed; the default ``''`` matches every file.
    """
    for annotation_file in glob(TESTCASE_PATTERN):
        if filter_str not in annotation_file:
            continue

        with open(annotation_file) as reference_fh:
            reference = load(reference_fh)

        html_path = annotation_file.replace('.json', '.html')
        with open(html_path) as html_fh:
            print(html_fh.name)
            html = '<html><body>{}</body></html>'.format(html_fh.read())

        expected = reference['result']
        for indentation_strategy in ('strict', 'relaxed'):
            parser_config = ParserConfig(
                css=CSS_PROFILES[indentation_strategy],
                annotation_rules=reference['annotation_rules'])
            result = get_annotated_text(html, parser_config)

            # pair each label (index 2) with the text slice it covers
            converted = []
            for annotation in result['label']:
                covered = result['text'][annotation[0]:annotation[1]]
                converted.append([annotation[2], covered])

            # print both versions before asserting to ease debugging
            if expected != converted:
                print("Reference:")
                print(expected)
                print("\nConverted (indentation strategy: {})".format(
                    indentation_strategy))
                print(converted)

            if indentation_strategy != 'strict':
                assert_equal_ignoring_whitespace(expected, converted)
            else:
                assert expected == converted
Example #3
0
def test_html_snippets(filter_str=''):
    """Convert every HTML test case to text and compare it with its reference.

    For every file matching ``TESTCASE_PATTERN`` the reference text is read,
    the sibling ``.html`` file is wrapped into ``<html><body>`` and converted
    with the 'strict' CSS profile. Trailing whitespace is stripped from both
    texts before they are compared.

    On a mismatch, diagnostics are printed and both versions are written to
    ``reference.txt`` and ``converted.txt`` in the current working directory
    before the assertion fails.

    Args:
        filter_str: only test cases whose path contains this substring are
            run; the default ``''`` matches every test case.
    """
    for testcase_txt in glob(TESTCASE_PATTERN):
        if filter_str not in testcase_txt:
            continue

        with open(testcase_txt) as f:
            reference_txt = f.read().rstrip()

        html_file = testcase_txt.replace('.txt', '.html')
        with open(html_file) as f:
            print(f.name)
            html = '<html><body>{}</body></html>'.format(f.read())

        converted_txt = get_text(
            html, ParserConfig(css=CSS_PROFILES['strict'])).rstrip()

        if converted_txt != reference_txt:
            print('File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}'.
                  format(testcase_txt, html, reference_txt, converted_txt))
            print('HTML file:', html_file)
            print("Visualize differences with `vimdiff reference.txt "
                  "converted.txt`")
            # use context managers so both dumps are flushed and closed
            # before the assertion below aborts the test
            with open("reference.txt", "w") as f:
                f.write(reference_txt)
            with open("converted.txt", "w") as f:
                f.write(converted_txt)

        assert converted_txt == reference_txt
Example #4
0
def test_display_anchors():
    """Check the output for ``display_anchors=True``.

    The named anchor is expected as ``[first](first)``, the ``href`` link as
    plain text.
    """
    html = '''<html>
                 <body>
                   <a name="first">first</a>
                   <a href="second">second</a>
                 </body>
                </html>
            '''
    expected = '[first](first) second'
    text = get_text(html, ParserConfig(display_anchors=True))
    assert text.strip() == expected
Example #5
0
def test_display_links():
    """Check the output for ``display_links=True``.

    Both ``href`` links are expected in ``[text](target)`` form, the named
    anchor as plain text.
    """
    html = '''<html>
                 <body>
                   <a href="first">first</a>
                   <a href="second">second</a>
                   <a name="third">third</a>
                 </body>
                </html>
            '''
    expected = '[first](first) [second](second) third'
    text = get_text(html, ParserConfig(display_links=True))
    assert text.strip() == expected
Example #6
0
def test_display_images_deduplicated():
    """Check image captions with ``deduplicate_captions=True``.

    The first two images share the same ``alt`` text, so that caption is
    expected only once in the output.
    """
    html = '''<html>
                 <body>
                   <img src="test1" alt="Ein Test Bild" title="Hallo" />
                   <img src="test2" alt="Ein Test Bild" title="Juhu" />
                   <img src="test3" alt="Ein zweites Bild" title="Echo" />
                 </body>
                </html>
            '''
    expected = '[Ein Test Bild] [Ein zweites Bild]'
    parser_config = ParserConfig(display_images=True,
                                 deduplicate_captions=True)
    text = get_text(html, parser_config)
    assert text.strip() == expected
    def __init__(self, html_tree, config=None):
        """Set up the parser state and immediately process ``html_tree``.

        Args:
            html_tree: the HTML tree handed to ``_parse_html_tree``.
            config: the parser configuration; a default ``ParserConfig()``
                is used when no (or a falsy) config is passed.
        """
        if not config:
            # fall back to the default configuration
            config = ParserConfig()
        self.config = config

        # call table for opening tags; 'a' and 'img' map to None when the
        # configuration disables them
        self.start_tag_handler_dict = dict(
            table=self._start_table,
            tr=self._start_tr,
            td=self._start_td,
            th=self._start_td,
            ul=self._start_ul,
            ol=self._start_ol,
            li=self._start_li,
            br=self._newline,
            a=self._start_a if self.config.parse_a() else None,
            img=self._start_img if self.config.display_images else None,
        )
        # call table for closing tags
        self.end_tag_handler_dict = dict(
            table=self._end_table,
            ul=self._end_ul,
            ol=self._end_ol,
            td=self._end_td,
            th=self._end_td,
            a=self._end_a if self.config.parse_a() else None,
        )

        # parser state
        self.current_tag = [self.config.css['body']]
        self.current_line = [Line()]
        self.next_line = [Line()]

        # Canvases that receive the text: clean_text_lines[0] is the root
        # canvas; tables write into child canvases that are created for every
        # table line and merged with the root canvas at the end of a table.
        self.clean_text_lines = [[]]

        self.current_table = []
        self.li_counter = []
        self.li_level = 0
        self.last_caption = None

        # only relevant when display_links is enabled
        self.link_target = ''

        # walk the HTML tree, then write out a truthy last line
        self._parse_html_tree(html_tree)
        last_line = self.current_line[-1]
        if last_line:
            self._write_line()
def test_limit_whitespace_affixes():
    """Check whitespace around inline elements with the relaxed CSS profile.

    Covers a ``<span>`` that directly follows text and a ``<span>`` inside a
    ``<pre>`` block.
    """
    html = '''<html>
                 <body>
                   hallo<span>echo</span>
                   <pre>
def <span>hallo</span>():
   print("echo")
                   </pre>
                 </body>
                </html>
            '''
    expected = ('hallo echo\n\n'
                'def hallo():\n'
                '   print("echo")')
    text = get_text(html, ParserConfig(css=RELAXED_CSS_PROFILE))
    assert text.strip() == expected
Example #9
0
    def __init__(self, html_tree: lxml.html.HtmlElement,
                 config: ParserConfig = None):
        """Set up the parser state and immediately process ``html_tree``.

        Args:
            html_tree: the HTML tree handed to ``_parse_html_tree``.
            config: the parser configuration; a default ``ParserConfig()``
                is used when no (or a falsy) config is passed.
        """
        if not config:
            # fall back to the default configuration
            config = ParserConfig()
        self.config = config

        # call table for opening tags; 'a' and 'img' map to None when the
        # configuration disables them
        self.start_tag_handler_dict = dict(
            table=self._start_table,
            tr=self._start_tr,
            td=self._start_td,
            th=self._start_td,
            ul=self._start_ul,
            ol=self._start_ol,
            li=self._start_li,
            br=self._newline,
            a=self._start_a if self.config.parse_a() else None,
            img=self._start_img if self.config.display_images else None,
        )
        # call table for closing tags
        self.end_tag_handler_dict = dict(
            table=self._end_table,
            ul=self._end_ul,
            ol=self._end_ol,
            td=self._end_td,
            th=self._end_td,
            a=self._end_a if self.config.parse_a() else None,
        )

        # parser state
        self.canvas = Canvas()
        self.css = self.config.css
        self.apply_attributes = self.config.attribute_handler.apply_attributes

        # the tag stack starts with the 'body' css entry, bound to the canvas
        body_tag = self.css['body'].set_canvas(self.canvas)
        self.tags = [body_tag]
        self.current_table = []
        self.li_counter = []
        self.last_caption = None

        # only relevant when display_links is enabled
        self.link_target = ''

        # walk the HTML tree
        self._parse_html_tree(html_tree)
Example #10
0
def test_html_snippets(filter_str=''):
    """Convert every HTML test case to text and compare it with its reference.

    Each file matching ``TESTCASE_PATTERN`` holds the expected text; the
    sibling ``.html`` file is wrapped into ``<html><body>`` and converted with
    the 'strict' CSS profile. Trailing whitespace is ignored on both sides.

    Args:
        filter_str: only test cases whose path contains this substring are
            run; the default ``''`` matches every test case.
    """
    for testcase_txt in glob(TESTCASE_PATTERN):
        if filter_str not in testcase_txt:
            continue

        with open(testcase_txt) as reference_fh:
            reference_txt = reference_fh.read().rstrip()

        html_path = testcase_txt.replace(".txt", ".html")
        with open(html_path) as html_fh:
            print(html_fh.name)
            html = "<html><body>{}</body></html>".format(html_fh.read())

        parser_config = ParserConfig(css=CSS_PROFILES['strict'])
        converted_txt = get_text(html, parser_config).rstrip()

        # print the details of a mismatch before the assertion fails
        is_match = converted_txt == reference_txt
        if not is_match:
            report = "File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}"
            print(report.format(testcase_txt, html, reference_txt,
                                converted_txt))

        assert converted_txt == reference_txt
#!/usr/bin/env python
# encoding: utf-8

'''
Tests different white-space handling.
'''

from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig

# Parser configuration shared by the tests in this module ('strict' CSS
# profile).
config = ParserConfig(css=CSS_PROFILES['strict'])


def test_white_space():
    """Check how the ``white-space`` style treats a newline inside a span.

    With ``normal`` and ``nowrap`` the newline is expected as a space, with
    ``pre`` and ``pre-line`` it is expected to be kept.
    """
    # (html snippet, expected text) pairs
    cases = [
        (u'<body><span style="white-space: normal"><i>1</i>2\n3</span>'
         u'</body>',
         u'12 3'),
        (u'<body><span style="white-space: nowrap"><i>1</i>2\n3</span>'
         u'</body>',
         u'12 3'),
        (u'<body><span style="white-space: pre"><i>1</i>2\n3</span>'
         u'</body>',
         u'12\n3'),
        (u'<body><span style="white-space: pre-line"><i>1</i>2\n3</span>'
         u'</body>',
         u'12\n3'),
    ]
    for html, expected in cases:
        assert get_text(html, config) == expected
Example #12
0
    },
    # SemanticElementType.FIGURE: {
    # "img",
    # "figure",
    # "picture",
    # }
}
STYLE_HTML_ELEMENTS = {}
# Inverts SEMANTIC_HTML_ELEMENTS: every member of a value collection becomes a
# key that maps to a one-element tuple holding the original key.
INSCRIPTIS_ANNOTATION_RULES = {
    element: (semantic_type, )
    for semantic_type, elements in SEMANTIC_HTML_ELEMENTS.items()
    for element in elements
}
# Parser configuration built from the strict CSS profile and the annotation
# rules above.
INSCRIPTIS_CONFIG = ParserConfig(
    annotation_rules=INSCRIPTIS_ANNOTATION_RULES,
    css=STRICT_CSS_PROFILE,
    deduplicate_captions=True,
    display_images=False,
    display_links=False,
)


class StructuredHtmlParser(Inscriptis):
    """Subclass of ```inscriptis.Inscriptis``` to provide the position of structural elements."""

    __slots__ = [
        "link_range_to_target",
        "anchors",
        "styled_elements",
    ]

    @staticmethod
        print('\nInscript comes with ABSOLUTELY NO WARRANTY.')
        print('This is free software and you are welcome to redistribute it '
              'under the terms of the {}.'.format(__license__))
        sys.exit(0)

    if not args.input:
        html_content = sys.stdin.read()
    elif isfile(args.input):
        with open(args.input, encoding=args.encoding, errors='ignore') as f:
            html_content = f.read()
    elif args.input.startswith("http://") or args.input.startswith("https://"):
        html_content = requests.get(args.input).text
    else:
        print("ERROR: Cannot open input file '{}'.\n".format(args.input))
        parser.print_help()
        sys.exit(-1)

    css_profile = CSS_PROFILES['relaxed'] if args.indentation == 'extended' \
        else CSS_PROFILES['strict']
    config = ParserConfig(css=css_profile,
                          display_images=args.display_image_captions,
                          deduplicate_captions=args.deduplicate_image_captions,
                          display_links=args.display_link_targets,
                          display_anchors=args.display_anchor_urls)
    text = get_text(html_content, config)
    if args.output:
        with open(args.output, 'w', encoding=args.encoding) as open_file:
            open_file.write(text)
    else:
        print(text)
Example #14
0
            with Path(args.annotation_rules).open() as f:
                annotation_rules = load(f)
        except IOError:
            print("ERROR: Cannot open annotation rule file '{0}'.".format(
                args.annotation_rules
            ))
            sys.exit(-1)
    else:
        annotation_rules = None

    css_profile = CSS_PROFILES['relaxed'] if args.indentation == 'extended' \
        else CSS_PROFILES['strict']
    config = ParserConfig(css=css_profile,
                          display_images=args.display_image_captions,
                          deduplicate_captions=args.deduplicate_image_captions,
                          display_links=args.display_link_targets,
                          display_anchors=args.display_anchor_urls,
                          annotation_rules=annotation_rules,
                          table_cell_separator=args.table_cell_separator)
    if not annotation_rules:
        output = get_text(html_content, config)
    else:
        output = args.postprocessor(
            get_annotated_text(html_content, config))
        if hasattr(args.postprocessor, 'verbatim') \
           and not args.postprocessor.verbatim:
            output = dumps(output)

    if args.output:
        with Path(args.output).open('w', encoding=DEFAULT_ENCODING) as f:
            f.write(output)
Example #15
0
#!/usr/bin/env python3
# coding:utf-8
'''
Inscriptis Web Service
'''

from flask import request, Response, Flask
from inscriptis import get_text, __version__
from inscriptis.css_profiles import RELAXED_CSS_PROFILE
from inscriptis.model.config import ParserConfig

# Flask application object; the route decorators below register on it.
app = Flask(__name__)
# Module-level parser configuration: relaxed CSS profile, display_images and
# deduplicate_captions switched on, display_links switched off.
CONFIG = ParserConfig(css=RELAXED_CSS_PROFILE,
                      display_images=True,
                      deduplicate_captions=True,
                      display_links=False)


@app.route("/")
def index():
    """Answer requests to the root URL with a fixed greeting."""
    greeting = "Hello"
    return greeting


@app.route("/get_text", methods=['POST'])
def get_text_call():
    '''
    Returns:
        the text representation of the given HTML content.
    '''
    content_type = request.headers['Content-type']
    if '; encoding=' in content_type: