def test_comment2(self):
    """HTML comments between text nodes are left out of the description."""
    url = "http://example.com/test.html"
    markup = b""" <html> <head><title>Foo</title></head> <body> Some text. <!-- HTML comment --> Some more text. <p>Text</p> More text </body> </html> """

    parsed = decode_body(markup, url)
    open_graph = _calc_og(parsed, url)

    # The comment text must not appear; the surrounding text nodes are kept
    # as separate paragraphs.
    expected = {
        "og:title": "Foo",
        "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
    }
    self.assertEqual(open_graph, expected)
def test_windows_1252(self):
    """A cp1252 body that does not declare its encoding is still decoded."""
    url = "http://example.com/test.html"
    # The title is the single byte 0xF3, expected to come out as "ó".
    markup = b""" <html> <head><title>\xf3</title></head> <body> Some text. </body> </html> """

    parsed = decode_body(markup, url)
    open_graph = _calc_og(parsed, url)

    expected = {"og:title": "ó", "og:description": "Some text."}
    self.assertEqual(open_graph, expected)
def test_invalid_encoding(self):
    """An unknown character encoding is ignored; the body is handled as UTF-8 if possible."""
    markup = b""" <html> <head><title>Foo</title></head> <body> Some text. </body> </html> """

    # NOTE(review): other tests in this file pass a URL as decode_body's
    # second positional argument; here it is an encoding name. Confirm
    # against decode_body's signature that this slot is the encoding.
    parsed = decode_body(markup, "invalid-encoding")
    open_graph = _calc_og(parsed, "http://example.com/test.html")

    expected = {"og:title": "Foo", "og:description": "Some text."}
    self.assertEqual(open_graph, expected)
def test_missing_title(self):
    """A document without a <title> yields a None og:title."""
    markup = b""" <html> <body> Some text. </body> </html> """

    # NOTE(review): decode_body is called with one argument here, while other
    # tests in this file also pass a URL -- confirm against its signature.
    parsed = decode_body(markup)
    open_graph = _calc_og(parsed, "http://example.com/test.html")

    expected = {"og:title": None, "og:description": "Some text."}
    self.assertEqual(open_graph, expected)
def test_invalid_encoding2(self):
    """A body whose bytes do not match the sent character encoding."""
    # The title deliberately starts with the bytes 0xFF 0xFF, which are not a
    # valid UTF-8 sequence.
    markup = b""" <html> <head><title>\xff\xff Foo</title></head> <body> Some text. </body> </html> """

    # NOTE(review): decode_body is called with one argument here, while other
    # tests in this file also pass a URL -- confirm against its signature.
    parsed = decode_body(markup)
    open_graph = _calc_og(parsed, "http://example.com/test.html")

    expected = {"og:title": "ÿÿ Foo", "og:description": "Some text."}
    self.assertEqual(open_graph, expected)
def test_h1_as_title(self):
    """With no <title>, the <h1> text is expected as the og:title."""
    # The description is supplied by the og:description meta tag.
    markup = b""" <html> <meta property="og:description" content="Some text."/> <body> <h1>Title</h1> </body> </html> """

    # NOTE(review): decode_body is called with one argument here, while other
    # tests in this file also pass a URL -- confirm against its signature.
    parsed = decode_body(markup)
    open_graph = _calc_og(parsed, "http://example.com/test.html")

    expected = {"og:title": "Title", "og:description": "Some text."}
    self.assertEqual(open_graph, expected)
def test_simple(self):
    """A minimal page gives its <title> and body text as title and description."""
    markup = b""" <html> <head><title>Foo</title></head> <body> Some text. </body> </html> """

    # NOTE(review): decode_body is called with one argument here, while other
    # tests in this file also pass a URL -- confirm against its signature.
    parsed = decode_body(markup)
    open_graph = _calc_og(parsed, "http://example.com/test.html")

    expected = {"og:title": "Foo", "og:description": "Some text."}
    self.assertEqual(open_graph, expected)
def test_xml(self):
    """Check that an XHTML document with an XML declaration is decoded properly."""
    url = "http://example.com/test.html"
    raw = b""" <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head><title>Foo</title></head><body>Some text.</body></html> """
    # Stripping matters: the xml tag has to start at the very first byte.
    markup = raw.strip()

    parsed = decode_body(markup, url)
    open_graph = _calc_og(parsed, url)

    expected = {"og:title": "Foo", "og:description": "Some text."}
    self.assertEqual(open_graph, expected)
def test_missing_title_and_broken_h1(self):
    """No <title> and an <h1> holding only an empty link yields a None og:title."""
    url = "http://example.com/test.html"
    markup = b""" <html> <body> <h1><a href="foo"/></h1> Some text. </body> </html> """

    parsed = decode_body(markup, url)
    open_graph = _calc_og(parsed, url)

    expected = {"og:title": None, "og:description": "Some text."}
    self.assertEqual(open_graph, expected)