def get_link_embed_data(
    url: str,
    maxwidth: Optional[int] = 640,
    maxheight: Optional[int] = 480,
) -> Optional[Dict[str, Any]]:
    """Gather preview metadata for ``url`` from oEmbed, Open Graph and generic tags.

    Args:
        url: Address to build a preview for.
        maxwidth: Forwarded to ``get_oembed_data`` as ``maxwidth``.
        maxheight: Forwarded to ``get_oembed_data`` as ``maxheight``.

    Returns:
        ``None`` when ``is_link`` rejects ``url`` or the oEmbed lookup raises
        ``requests.exceptions.RequestException``; otherwise the merged
        metadata dict (which may be empty).
    """
    if not is_link(url):
        return None

    # Source 1: oEmbed.
    try:
        oembed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        # The target URL could not be fetched, so there is nothing to
        # preview for it.
        return None
    embed = oembed if oembed else {}

    # NOTE(review): this request is outside the try block above and passes
    # no timeout, so a RequestException raised here propagates to the caller.
    page = requests.get(url)
    if not page.ok:
        return embed

    # Source 2: Open Graph. Its values overwrite anything oEmbed supplied.
    open_graph = OpenGraphParser(page.text).extract_data()
    if open_graph:
        embed.update(open_graph)

    # Source 3: generic page metadata, used only for fields still empty.
    generic = GenericParser(page.text).extract_data() or {}
    for field in ("title", "description", "image"):
        if embed.get(field):
            continue
        candidate = generic.get(field)
        if candidate:
            embed[field] = candidate
    return embed
def get_link_embed_data(
    url: str,
    maxwidth: Optional[int] = 640,
    maxheight: Optional[int] = 480,
) -> Optional[Dict[Any, Any]]:
    """Build link-preview metadata for ``url``.

    Three sources are consulted in order: oEmbed, Open Graph tags, and
    generic page metadata.

    Args:
        url: Address to build a preview for.
        maxwidth: Forwarded to ``get_oembed_data`` as ``maxwidth``.
        maxheight: Forwarded to ``get_oembed_data`` as ``maxheight``.

    Returns:
        ``None`` when ``is_link`` rejects ``url`` or the oEmbed lookup raises
        ``requests.exceptions.RequestException`` (the failure is logged);
        otherwise the merged metadata dict (which may be empty).
    """
    if not is_link(url):
        return None

    try:
        oembed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        template = 'Unable to fetch information from url {0}, traceback: {1}'
        logging.error(template.format(url, traceback.format_exc()))
        return None
    preview = oembed if oembed else {}

    # NOTE(review): this request is not covered by the except clause above
    # and passes no timeout.
    page = requests.get(url)
    if not page.ok:
        return preview

    # Open Graph values overwrite whatever oEmbed provided.
    open_graph = OpenGraphParser(page.text).extract_data()
    if open_graph:
        preview.update(open_graph)

    # Generic metadata only fills the fields that are still empty.
    fallback = GenericParser(page.text).extract_data() or {}
    for field in ('title', 'description', 'image'):
        if preview.get(field):
            continue
        value = fallback.get(field)
        if value:
            preview[field] = value
    return preview
def get_link_embed_data(
    url: str,
    maxwidth: Optional[int] = 640,
    maxheight: Optional[int] = 480,
) -> Optional[Dict[str, Any]]:
    """Collect link-preview metadata for ``url``.

    Two mechanisms are used:

    1. oEmbed data, returned as-is when it carries a truthy ``"oembed"`` entry.
    2. Otherwise, ``title`` / ``description`` / ``image`` are filled in from
       Open Graph tags first and generic page metadata second; a field that
       already has a truthy value is never overwritten.

    Args:
        url: Address to build a preview for.
        maxwidth: Forwarded to ``get_oembed_data`` as ``maxwidth``.
        maxheight: Forwarded to ``get_oembed_data`` as ``maxheight``.

    Returns:
        ``None`` when ``is_link`` or ``valid_content_type`` rejects ``url``;
        otherwise the metadata dict (which may be empty).

    Exceptions raised by the HTTP request are not caught in this function.
    """
    if not is_link(url):
        return None
    if not valid_content_type(url):
        return None

    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    # presumably set by get_oembed_data for photo/video "type" results -- confirm
    if data.get("oembed"):
        return data

    # With stream=True the connection stays checked out until the body is
    # consumed or the response is closed, so close it explicitly. Otherwise a
    # non-OK response (whose body is never read) leaks the connection.
    with requests.get(
        mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT
    ) as response:
        if response.ok:
            # Open Graph is consulted before the generic parser, so it wins
            # for any field both of them provide.
            for parser_class in (OpenGraphParser, GenericParser):
                extracted = parser_class(response.text).extract_data() or {}
                for key in ("title", "description", "image"):
                    if not data.get(key) and extracted.get(key):
                        data[key] = extracted[key]
    return data
def get_link_embed_data(url, maxwidth=640, maxheight=480):
    # type: (Text, Optional[int], Optional[int]) -> Optional[Dict[Any, Any]]
    """Build link-preview metadata for ``url``.

    Sources are consulted in this order: oEmbed, Open Graph tags, generic
    page metadata.

    Returns ``None`` when ``is_link`` rejects ``url`` or when the oEmbed
    lookup raises ``requests.exceptions.RequestException`` (which is logged);
    otherwise returns the merged metadata dict (which may be empty).
    """
    if not is_link(url):
        return None

    try:
        oembed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        template = 'Unable to fetch information from url {0}, traceback: {1}'
        logging.error(template.format(url, traceback.format_exc()))
        return None
    merged = oembed if oembed else {}

    # NOTE(review): this request is not covered by the except clause above
    # and passes no timeout.
    page = requests.get(url)
    if not page.ok:
        return merged

    # Open Graph values overwrite whatever oEmbed provided.
    open_graph = OpenGraphParser(page.text).extract_data()
    if open_graph:
        merged.update(open_graph)

    # Generic metadata is a fallback for fields that are still empty.
    generic = GenericParser(page.text).extract_data() or {}
    for name in ('title', 'description', 'image'):
        if merged.get(name):
            continue
        found = generic.get(name)
        if found:
            merged[name] = found
    return merged
def get_link_embed_data(
    url: str,
    maxwidth: Optional[int] = 640,
    maxheight: Optional[int] = 480,
) -> Optional[Dict[str, Any]]:
    """Collect link-preview metadata for ``url``.

    Three sources are consulted in order:

    1. oEmbed (via ``get_oembed_data``).
    2. Open Graph tags, whose values overwrite the oEmbed ones.
    3. Generic page metadata, used only for ``title`` / ``description`` /
       ``image`` fields that are still empty.

    Args:
        url: Address to build a preview for.
        maxwidth: Forwarded to ``get_oembed_data`` as ``maxwidth``.
        maxheight: Forwarded to ``get_oembed_data`` as ``maxheight``.

    Returns:
        ``None`` when ``is_link`` or ``valid_content_type`` rejects ``url``;
        otherwise the merged metadata dict (which may be empty).

    Exceptions raised by the HTTP request are not caught in this function.
    """
    if not is_link(url):
        return None
    if not valid_content_type(url):
        return None

    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}

    # With stream=True the connection stays checked out until the body is
    # consumed or the response is closed, so close it explicitly. Otherwise a
    # non-OK response (whose body is never read) leaks the connection.
    with requests.get(
        url, stream=True, headers=HEADERS, timeout=TIMEOUT
    ) as response:
        if response.ok:
            og_data = OpenGraphParser(response.text).extract_data()
            if og_data:
                data.update(og_data)

            generic_data = GenericParser(response.text).extract_data() or {}
            for key in ("title", "description", "image"):
                if not data.get(key) and generic_data.get(key):
                    data[key] = generic_data[key]
    return data
def get_link_embed_data(
    url: str,
    maxwidth: Optional[int] = 640,
    maxheight: Optional[int] = 480,
) -> Optional[Dict[str, Any]]:
    """Assemble preview metadata for ``url``.

    oEmbed is queried first, then the page itself is fetched and parsed for
    Open Graph tags and, as a last resort, generic metadata.

    Returns ``None`` when ``is_link`` rejects ``url`` or the oEmbed lookup
    raises ``requests.exceptions.RequestException``; otherwise returns the
    merged metadata dict (which may be empty).
    """
    if not is_link(url):
        return None

    try:
        info = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        # The target URL cannot be fetched, so there is no data to return.
        return None
    if not info:
        info = {}

    # NOTE(review): unlike the oEmbed lookup, this request is not wrapped in
    # a try block and is made without a timeout.
    reply = requests.get(url)
    if reply.ok:
        # Open Graph values overwrite whatever oEmbed provided.
        graph_tags = OpenGraphParser(reply.text).extract_data()
        if graph_tags:
            info.update(graph_tags)

        # Generic metadata fills only the fields that are still empty.
        generic_tags = GenericParser(reply.text).extract_data() or {}
        info.update({
            name: generic_tags[name]
            for name in ('title', 'description', 'image')
            if not info.get(name) and generic_tags.get(name)
        })
    return info
def test_extract_description(self) -> None:
    """Check which parts of a page GenericParser reports as ``description``."""
    content_type = "text/html; charset=UTF-8"

    # First fixture: the text of a paragraph nested inside <div>s.
    # Second fixture: the content of a <meta name="description"> tag.
    # NOTE(review): that <meta> tag has no closing ">" -- presumably
    # deliberate, to exercise lenient parsing; confirm before "fixing" it.
    expectations = [
        (
            b""" <html> <body> <div> <div> <p>Description text</p> </div> </div> </body> </html> """,
            "Description text",
        ),
        (
            b""" <html> <head><meta name="description" content="description 123"</head> <body></body> </html> """,
            "description 123",
        ),
    ]
    for markup, expected in expectations:
        extracted = GenericParser(markup, content_type).extract_data()
        self.assertEqual(extracted.get("description"), expected)

    # An empty body yields no description at all.
    empty_page = b"<html><body></body></html>"
    extracted = GenericParser(empty_page, content_type).extract_data()
    self.assertIsNone(extracted.get("description"))
def test_extract_description(self) -> None:
    """Check which parts of a page GenericParser reports as ``description``."""
    # First fixture: the text of a paragraph nested inside <div>s.
    # Second fixture: the content of a <meta name="description"> tag.
    # NOTE(review): that <meta> tag has no closing ">" -- presumably
    # deliberate, to exercise lenient parsing; confirm before "fixing" it.
    expectations = [
        (
            """ <html> <body> <div> <div> <p>Description text</p> </div> </div> </body> </html> """,
            'Description text',
        ),
        (
            """ <html> <head><meta name="description" content="description 123"</head> <body></body> </html> """,
            'description 123',
        ),
    ]
    for markup, expected in expectations:
        extracted = GenericParser(markup).extract_data()
        self.assertEqual(extracted.get('description'), expected)

    # An empty body yields no description at all.
    extracted = GenericParser("<html><body></body></html>").extract_data()
    self.assertIsNone(extracted.get('description'))
def test_parser(self) -> None:
    """GenericParser reads the title from <title> and the description from <p>."""
    page = b""" <html> <head><title>Test title</title></head> <body> <h1>Main header</h1> <p>Description text</p> </body> </html> """
    extracted = GenericParser(page, "text/html; charset=UTF-8").extract_data()

    # The page also has an <h1>, but the <title> text is the expected title.
    expected_fields = (
        ("title", "Test title"),
        ("description", "Description text"),
    )
    for field, expected in expected_fields:
        self.assertEqual(extracted.get(field), expected)
def test_parser(self) -> None:
    """GenericParser reads the title from <title> and the description from <p>."""
    page = """ <html> <head><title>Test title</title></head> <body> <h1>Main header</h1> <p>Description text</p> </body> </html> """
    extracted = GenericParser(page).extract_data()

    # The page also has an <h1>, but the <title> text is the expected title.
    expected_fields = (
        ('title', 'Test title'),
        ('description', 'Description text'),
    )
    for field, expected in expected_fields:
        self.assertEqual(extracted.get(field), expected)
def get_link_embed_data(
    url: str, maxwidth: int = 640, maxheight: int = 480
) -> Optional[Dict[str, Any]]:
    """Collect link-preview metadata for ``url``.

    Two mechanisms are used:

    1. oEmbed data, returned as-is when it carries a truthy ``"oembed"`` entry.
    2. Otherwise, ``title`` / ``description`` / ``image`` are filled in from
       Open Graph tags first and generic page metadata second; a field that
       already has a truthy value is never overwritten.

    Args:
        url: Address to build a preview for.
        maxwidth: Forwarded to ``get_oembed_data`` as ``maxwidth``.
        maxheight: Forwarded to ``get_oembed_data`` as ``maxheight``.

    Returns:
        ``None`` when ``is_link`` or ``valid_content_type`` rejects ``url``;
        otherwise the metadata dict (which may be empty).

    Exceptions raised by the HTTP request are not caught in this function.
    """
    if not is_link(url):
        return None
    if not valid_content_type(url):
        return None

    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    # presumably set by get_oembed_data for photo/video "type" results -- confirm
    if data.get("oembed"):
        return data

    # With stream=True the connection stays checked out until the body is
    # consumed or the response is closed, so close it explicitly. Otherwise a
    # non-OK response (whose body is never read) leaks the connection.
    # NOTE(review): assumes PreviewSession.get returns a requests.Response
    # (which is usable as a context manager) -- confirm.
    with PreviewSession().get(mark_sanitized(url), stream=True) as response:
        if response.ok:
            content_type = response.headers.get("Content-Type")
            # Open Graph is consulted before the generic parser, so it wins
            # for any field both of them provide.
            for parser_class in (OpenGraphParser, GenericParser):
                extracted = (
                    parser_class(response.content, content_type).extract_data()
                    or {}
                )
                for key in ("title", "description", "image"):
                    if not data.get(key) and extracted.get(key):
                        data[key] = extracted[key]
    return data
def test_extract_image(self) -> None:
    """GenericParser reports the <img> ``src`` alongside title and description."""
    page = """ <html> <body> <h1>Main header</h1> <img src="http://test.com/test.jpg"> <div> <p>Description text</p> </div> </body> </html> """
    extracted = GenericParser(page).extract_data()

    # There is no <title> in this page; the <h1> text is the expected title.
    expected_fields = (
        ('title', 'Main header'),
        ('description', 'Description text'),
        ('image', 'http://test.com/test.jpg'),
    )
    for field, expected in expected_fields:
        self.assertEqual(extracted.get(field), expected)
def test_extract_image(self) -> None:
    """GenericParser reports the <img> ``src`` alongside title and description."""
    page = b""" <html> <body> <h1>Main header</h1> <img data-src="Not an image"> <img src="http://test.com/test.jpg"> <div> <p>Description text</p> </div> </body> </html> """
    extracted = GenericParser(page, "text/html; charset=UTF-8").extract_data()

    # There is no <title> in this page; the <h1> text is the expected title.
    # The first <img> only has ``data-src``, so the expected image is the
    # ``src`` of the second one.
    expected_fields = (
        ("title", "Main header"),
        ("description", "Description text"),
        ("image", "http://test.com/test.jpg"),
    )
    for field, expected in expected_fields:
        self.assertEqual(extracted.get(field), expected)
def test_extract_description(self):
    # type: () -> None
    """Check which parts of a page GenericParser reports as ``description``."""
    # First fixture: the text of a paragraph nested inside <div>s.
    # Second fixture: the content of a <meta name="description"> tag.
    # NOTE(review): that <meta> tag has no closing ">" -- presumably
    # deliberate, to exercise lenient parsing; confirm before "fixing" it.
    expectations = [
        (
            """ <html> <body> <div> <div> <p>Description text</p> </div> </div> </body> </html> """,
            'Description text',
        ),
        (
            """ <html> <head><meta name="description" content="description 123"</head> <body></body> </html> """,
            'description 123',
        ),
    ]
    for markup, expected in expectations:
        extracted = GenericParser(markup).extract_data()
        self.assertEqual(extracted.get('description'), expected)

    # An empty body yields no description at all.
    extracted = GenericParser("<html><body></body></html>").extract_data()
    self.assertIsNone(extracted.get('description'))