def test_input():
    '''Check that unusable or minimal inputs are handled as expected.'''
    # an integer is not a document: the loader is expected to give None
    from_number = utils.load_html(123)
    assert from_number is None

    # a small but complete HTML string is expected to be accepted
    from_string = utils.load_html('<html><body>XYZ</body></html>')
    assert from_string is not None

    # NOTE(review): check disabled in the original, reason not recorded
    #assert utils.load_html(b'0'*int(10e3)) is None

    # same keyword arguments for the current and the legacy entry point
    options = {
        'xml_output': False,
        'tei_output': False,
        'target_language': None,
    }
    assert extract(None, 'url', '0000', **options) is None
    # legacy
    assert process_record(None, 'url', '0000', **options) is None
def load_mock_page(url, xml_flag=False, langcheck=None):
    '''Load a cached sample page and run process_record() on its content.

    Args:
        url: key of MOCK_PAGES; the mapped value is the file name looked up
            in the 'cache' directory below TEST_DIR. The same value is
            passed on to process_record().
        xml_flag: forwarded to process_record() as ``xml_output``.
        langcheck: forwarded to process_record() as ``target_language``.

    Returns:
        The value returned by process_record() for the page content.

    Raises:
        KeyError: if ``url`` is not a key of MOCK_PAGES.
        OSError: if the mapped file cannot be opened.
    '''
    # build the path once so both decoding attempts target the same file
    sample_path = os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url])
    try:
        # explicit UTF-8: without it the codec depends on the locale, so
        # the fallback below would trigger on some platforms only
        with open(sample_path, 'r', encoding='utf-8') as inputf:
            htmlstring = inputf.read()
    except UnicodeDecodeError:
        # second attempt for samples that are not valid UTF-8
        with open(sample_path, 'r', encoding='ISO-8859-1') as inputf:
            htmlstring = inputf.read()
    return process_record(
        htmlstring,
        url,
        '0000',
        xml_output=xml_flag,
        tei_output=False,
        target_language=langcheck,
    )
def test_input():
    '''Check the handling of strings, byte strings and compressed input.'''
    # an integer is not a document: the loader is expected to give None
    assert utils.load_html(123) is None

    # text containing non-ASCII characters
    umlaut_doc = '<html><body>ÄÖÜ</body></html>'
    assert utils.load_html(umlaut_doc) is not None

    # byte strings holding a byte (0x9f) that is not valid UTF-8 on its own
    raw_doc = b'<html><body>\x2f\x2e\x9f</body></html>'
    assert utils.load_html(raw_doc) is not None
    latin_doc = '<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1')
    assert utils.load_html(latin_doc) is not None

    # NOTE(review): check disabled in the original, reason not recorded
    #assert utils.load_html(b'0'*int(10e3)) is None

    # missing input
    missing = extract(None, 'url', '0000', target_language=None)
    assert missing is None

    # GZip
    gz_path = os.path.join(RESOURCES_DIR, 'webpage.html.gz')
    with open(gz_path, 'rb') as gzfile:
        compressed = gzfile.read()
    extracted = extract(compressed)
    assert 'Long story short,' in extracted

    # legacy
    legacy = process_record(None, 'url', '0000', target_language=None)
    assert legacy is None
def test_input():
    '''Check input validation, decoding, decompression and normalization.

    Covers, in order: detection of strings that are not HTML, rejection of
    an unsupported input type, loading of text and byte strings, missing
    input for extract() and process_record(), gzip-compressed input and
    Unicode normalization of the extracted text.
    '''
    # plain sentences are flagged as dubious, as str and as bytes
    assert utils.is_dubious_html('This is a string.') is True
    assert utils.is_dubious_html(b'This is a string.') is True

    # unsupported type: call directly inside the context manager, wrapping
    # it in an assert would add a comparison that can never be evaluated
    with pytest.raises(TypeError) as err:
        utils.load_html(123)
    assert 'incompatible' in str(err.value)

    # text containing non-ASCII characters
    assert utils.load_html('<html><body>ÄÖÜ</body></html>') is not None
    # byte strings holding a byte (0x9f) that is not valid UTF-8 on its own
    assert utils.load_html(b'<html><body>\x2f\x2e\x9f</body></html>') is not None
    assert utils.load_html('<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1')) is not None

    # NOTE(review): check disabled in the original, reason not recorded
    #assert utils.load_html(b'0'*int(10e3)) is None

    # missing input: only the exception type matters here, so no binding
    with pytest.raises(TypeError):
        extract(None, 'url', '0000', target_language=None)
    # legacy
    assert process_record(None, 'url', '0000', target_language=None) is None

    # GZip
    with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
        myinput = gzfile.read()
    assert 'Long story short,' in extract(myinput)

    # unicode normalization: the input uses 'A' + combining diaeresis
    assert utils.normalize_unicode('A\u0308ffin') != 'A\u0308ffin'
    testresult = extract('<html><body><p>A\u0308ffin</p></body></html>', config=ZERO_CONFIG)
    # separate assertions so that a failure shows which condition broke
    assert testresult != 'A\u0308ffin'
    assert testresult == 'Äffin'