def get_link_embed_data(
    url: str, maxwidth: Optional[int] = 640, maxheight: Optional[int] = 480
) -> Optional[Dict[str, Any]]:
    """Collect link-preview (embed) data for ``url``.

    Two mechanisms are used:

    1. oEmbed data, if found, for photo and video "type" sites.
    2. Otherwise, a combination of Open Graph tags and generic meta tags.

    Args:
        url: The link to describe.
        maxwidth: Forwarded unchanged to ``get_oembed_data``.
        maxheight: Forwarded unchanged to ``get_oembed_data``.

    Returns:
        ``None`` when ``is_link`` or ``valid_content_type`` rejects ``url``.
        Otherwise a dict: the oEmbed result as-is when it carries a truthy
        ``"oembed"`` entry, else the oEmbed result (or ``{}``) with any
        missing ``title`` / ``description`` / ``image`` filled in from the
        fetched page when the response is OK.
    """
    if not is_link(url) or not valid_content_type(url):
        return None

    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    if data.get('oembed'):
        return data

    response = requests.get(mark_sanitized(url), stream=True, headers=HEADERS, timeout=TIMEOUT)
    try:
        if response.ok:
            # Decode the body once; every access to ``response.text``
            # re-runs the decoding.
            page = response.text
            # Sources are consulted in priority order: a key that is already
            # set (by oEmbed or an earlier parser) is never overwritten.
            for parser_class in (OpenGraphParser, GenericParser):
                extracted = parser_class(page).extract_data() or {}
                for key in ('title', 'description', 'image'):
                    if not data.get(key) and extracted.get(key):
                        data[key] = extracted[key]
    finally:
        # With stream=True the connection is only released once the body is
        # fully consumed or the response is closed; the non-OK path never
        # reads the body, so close explicitly.
        response.close()
    return data
def get_link_embed_data(
    url: str, maxwidth: int = 640, maxheight: int = 480
) -> Optional[Dict[str, Any]]:
    """Build the preview data dict for ``url``.

    The lookup happens in two stages:

    1. oEmbed data, if found, for photo and video "type" sites.
    2. Otherwise, Open Graph tags first and generic meta tags second, each
       only filling ``title`` / ``description`` / ``image`` entries that are
       still empty.

    Returns ``None`` when ``is_link`` or ``valid_content_type`` rejects the
    URL. If the page response is not OK, whatever the oEmbed lookup produced
    (possibly ``{}``) is returned unchanged.
    """
    if not (is_link(url) and valid_content_type(url)):
        return None

    embed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    if embed.get("oembed"):
        # A usable oEmbed payload wins outright; the page is never fetched.
        return embed

    response = PreviewSession().get(mark_sanitized(url), stream=True)
    if not response.ok:
        return embed

    og_parser = OpenGraphParser(response.content, response.headers.get("Content-Type"))
    og_tags = og_parser.extract_data()
    for field in ("title", "description", "image"):
        if embed.get(field):
            continue
        if og_tags.get(field):
            embed[field] = og_tags[field]

    meta_parser = GenericParser(response.content, response.headers.get("Content-Type"))
    meta_tags = meta_parser.extract_data() or {}
    for field in ("title", "description", "image"):
        if embed.get(field):
            continue
        if meta_tags.get(field):
            embed[field] = meta_tags[field]

    return embed
def test_charset_in_header(self) -> None:
    # Big5-encoded markup, paired with a Content-Type that declares
    # charset=Big5, must yield the og:title as the original text.
    markup = (
        "<html> <head> "
        '<meta property="og:title" content="中文" /> '
        "</head> </html>"
    )
    encoded = markup.encode("big5")

    extracted = OpenGraphParser(encoded, "text/html; charset=Big5").extract_data()

    self.assertEqual(extracted["title"], "中文")
def test_page_with_og(self) -> None:
    # A page carrying the common og:* tags; the parsed result exposes the
    # title and description as attributes.
    markup = (
        b"<html> <head> "
        b'<meta property="og:title" content="The Rock" /> '
        b'<meta property="og:type" content="video.movie" /> '
        b'<meta property="og:url" content="http://www.imdb.com/title/tt0117500/" /> '
        b'<meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" /> '
        b'<meta property="og:description" content="The Rock film" /> '
        b"</head> </html>"
    )

    extracted = OpenGraphParser(markup, "text/html; charset=UTF-8").extract_data()

    self.assertEqual(extracted.title, "The Rock")
    self.assertEqual(extracted.description, "The Rock film")
def test_page_with_og(self) -> None:
    # A page carrying the common og:* tags; the parsed result is checked
    # through its mapping interface.
    markup = (
        "<html> <head> "
        '<meta property="og:title" content="The Rock" /> '
        '<meta property="og:type" content="video.movie" /> '
        '<meta property="og:url" content="http://www.imdb.com/title/tt0117500/" /> '
        '<meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" /> '
        '<meta property="og:description" content="The Rock film" /> '
        "</head> </html>"
    )

    extracted = OpenGraphParser(markup).extract_data()

    self.assertIn('title', extracted)
    self.assertEqual(extracted['title'], 'The Rock')
    self.assertEqual(extracted.get('description'), 'The Rock film')
def get_link_embed_data(url, maxwidth=640, maxheight=480):
    # type: (Text, Optional[int], Optional[int]) -> Optional[Dict[Any, Any]]
    """Gather preview data for ``url`` from up to three sources.

    The sources are consulted in this order:

    1. oEmbed (``get_oembed_data``).
    2. Open Graph tags: merged with ``dict.update``, so they overwrite any
       oEmbed keys of the same name.
    3. Generic meta tags: only fill ``title`` / ``description`` / ``image``
       entries that are still empty.

    Returns ``None`` when ``url`` is not a link, or when the oEmbed lookup
    raises a ``requests`` exception (which is logged). Otherwise returns the
    merged dict, which is just the oEmbed result (or ``{}``) if the page
    response is not OK.
    """
    if not is_link(url):
        return None

    try:
        embed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        template = 'Unable to fetch information from url {0}, traceback: {1}'
        logging.error(template.format(url, traceback.format_exc()))
        return None
    if not embed:
        embed = {}

    # NOTE(review): no timeout is passed and this call sits outside the
    # try block above, so request errors propagate to the caller -- confirm
    # that is intended.
    response = requests.get(url)
    if not response.ok:
        return embed

    og_tags = OpenGraphParser(response.text).extract_data()
    if og_tags:
        embed.update(og_tags)

    meta_tags = GenericParser(response.text).extract_data() or {}
    for field in ('title', 'description', 'image'):
        if embed.get(field):
            continue
        if meta_tags.get(field):
            embed[field] = meta_tags[field]
    return embed
def get_link_embed_data(
    url: str, maxwidth: Optional[int] = 640, maxheight: Optional[int] = 480
) -> Optional[Dict[str, Any]]:
    """Gather preview data for ``url`` from up to three sources.

    The sources are consulted in this order:

    1. oEmbed (``get_oembed_data``).
    2. Open Graph tags: merged with ``dict.update``, so they overwrite any
       oEmbed keys of the same name.
    3. Generic meta tags: only fill ``title`` / ``description`` / ``image``
       entries that are still empty.

    Returns ``None`` when ``url`` is not a link, or when the oEmbed lookup
    raises a ``requests`` exception. Otherwise returns the merged dict,
    which is just the oEmbed result (or ``{}``) if the page response is
    not OK.
    """
    if not is_link(url):
        return None

    try:
        embed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        # The target URL could not be fetched, so there is nothing further
        # to do: this URL simply has no open graph data.
        return None
    if not embed:
        embed = {}

    # NOTE(review): no timeout is passed and this call sits outside the
    # try block above, so request errors propagate to the caller -- confirm
    # that is intended.
    response = requests.get(url)
    if not response.ok:
        return embed

    og_tags = OpenGraphParser(response.text).extract_data()
    if og_tags:
        embed.update(og_tags)

    meta_tags = GenericParser(response.text).extract_data() or {}
    for field in ('title', 'description', 'image'):
        if embed.get(field):
            continue
        if meta_tags.get(field):
            embed[field] = meta_tags[field]
    return embed
def get_link_embed_data(
    url: str, maxwidth: Optional[int] = 640, maxheight: Optional[int] = 480
) -> Optional[Dict[str, Any]]:
    """Gather preview data for ``url`` from up to three sources.

    The sources are consulted in this order:

    1. oEmbed (``get_oembed_data``).
    2. Open Graph tags: merged with ``dict.update``, so they overwrite any
       oEmbed keys of the same name.
    3. Generic meta tags: only fill ``title`` / ``description`` / ``image``
       entries that are still empty.

    Args:
        url: The link to describe.
        maxwidth: Forwarded unchanged to ``get_oembed_data``.
        maxheight: Forwarded unchanged to ``get_oembed_data``.

    Returns:
        ``None`` when ``is_link`` or ``valid_content_type`` rejects ``url``;
        otherwise the merged dict, which is just the oEmbed result (or
        ``{}``) if the page response is not OK.
    """
    if not is_link(url) or not valid_content_type(url):
        return None

    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}

    response = requests.get(url, stream=True, headers=HEADERS, timeout=TIMEOUT)
    try:
        if response.ok:
            # Decode the body once; every access to ``response.text``
            # re-runs the decoding.
            page = response.text

            og_data = OpenGraphParser(page).extract_data()
            if og_data:
                data.update(og_data)

            generic_data = GenericParser(page).extract_data() or {}
            for key in ('title', 'description', 'image'):
                if not data.get(key) and generic_data.get(key):
                    data[key] = generic_data[key]
    finally:
        # With stream=True the connection is only released once the body is
        # fully consumed or the response is closed; the non-OK path never
        # reads the body, so close explicitly.
        response.close()
    return data
def test_page_with_evil_og_tags(self) -> None:
    # Besides the ordinary og:* tags, this page declares og:html (with a
    # script payload) and og:oembed; neither may appear in the parsed result.
    markup = (
        b"<html> <head> "
        b'<meta property="og:title" content="The Rock" /> '
        b'<meta property="og:type" content="video.movie" /> '
        b'<meta property="og:url" content="http://www.imdb.com/title/tt0117500/" /> '
        b'<meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" /> '
        b'<meta property="og:description" content="The Rock film" /> '
        b'<meta property="og:html" content="<script>alert(window.location)</script>" /> '
        b'<meta property="og:oembed" content="True" /> '
        b"</head> </html>"
    )

    extracted = OpenGraphParser(markup, "text/html; charset=UTF-8").extract_data()

    self.assertIn("title", extracted)
    self.assertEqual(extracted["title"], "The Rock")
    self.assertEqual(extracted.get("description"), "The Rock film")
    for blocked_key in ("oembed", "html"):
        self.assertEqual(extracted.get(blocked_key), None)