Esempio n. 1
0
def get_link_embed_data(
        url: str,
        maxwidth: Optional[int] = 640,
        maxheight: Optional[int] = 480) -> Optional[Dict[str, Any]]:
    """Collect link-preview metadata for ``url``.

    Three sources are consulted, in this order:

    1. oEmbed (``get_oembed_data``).
    2. Open Graph tags (``OpenGraphParser``); these are merged over the
       oEmbed data.
    3. Generic page metadata (``GenericParser``); used only to fill in a
       ``title``, ``description`` or ``image`` that is still missing.

    Args:
        url: The address to inspect.
        maxwidth: Forwarded to ``get_oembed_data``.
        maxheight: Forwarded to ``get_oembed_data``.

    Returns:
        ``None`` when ``url`` fails ``is_link`` or the oEmbed lookup raises
        a ``requests`` error; otherwise a dict with whatever metadata was
        gathered (possibly empty).
    """
    if not is_link(url):
        return None

    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        # This is what happens if the target URL cannot be fetched; in
        # that case, there's nothing to do here, and this URL has no
        # open graph data.
        return None
    data = data or {}

    # Without a timeout an unresponsive host would block this call forever.
    fetch_timeout = 15  # seconds
    try:
        response = requests.get(url, timeout=fetch_timeout)
    except requests.exceptions.RequestException:
        # Treat a failed page fetch like a non-OK response: keep whatever
        # oEmbed already provided instead of letting the error escape.
        return data

    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        generic_data = GenericParser(response.text).extract_data() or {}
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
Esempio n. 2
0
def get_link_embed_data(url: str,
                        maxwidth: Optional[int]=640,
                        maxheight: Optional[int]=480) -> Optional[Dict[Any, Any]]:
    """Collect link-preview metadata for ``url``.

    Three sources are consulted, in this order:

    1. oEmbed (``get_oembed_data``).
    2. Open Graph tags (``OpenGraphParser``); these are merged over the
       oEmbed data.
    3. Generic page metadata (``GenericParser``); used only to fill in a
       ``title``, ``description`` or ``image`` that is still missing.

    Args:
        url: The address to inspect.
        maxwidth: Forwarded to ``get_oembed_data``.
        maxheight: Forwarded to ``get_oembed_data``.

    Returns:
        ``None`` when ``url`` fails ``is_link`` or the oEmbed lookup raises
        a ``requests`` error (which is logged); otherwise a dict with
        whatever metadata was gathered (possibly empty).
    """
    if not is_link(url):
        return None

    msg = 'Unable to fetch information from url {0}, traceback: {1}'
    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        logging.error(msg.format(url, traceback.format_exc()))
        return None
    data = data or {}

    # Without a timeout an unresponsive host would block this call forever.
    fetch_timeout = 15  # seconds
    try:
        response = requests.get(url, timeout=fetch_timeout)
    except requests.exceptions.RequestException:
        # Treat a failed page fetch like a non-OK response: log it and keep
        # whatever oEmbed already provided instead of letting the error
        # escape.
        logging.error(msg.format(url, traceback.format_exc()))
        return data

    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        generic_data = GenericParser(response.text).extract_data() or {}
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
Esempio n. 3
0
def get_link_embed_data(
        url: str,
        maxwidth: Optional[int] = 640,
        maxheight: Optional[int] = 480) -> Optional[Dict[str, Any]]:
    """Collect link-preview metadata for ``url``.

    Two mechanisms are used:

    1. oEmbed data; when the result carries a truthy ``'oembed'`` entry it
       is returned as-is.
    2. Otherwise Open Graph tags and then generic page metadata, each only
       filling in a ``title``, ``description`` or ``image`` that is still
       missing.

    Args:
        url: The address to inspect.
        maxwidth: Forwarded to ``get_oembed_data``.
        maxheight: Forwarded to ``get_oembed_data``.

    Returns:
        ``None`` when ``url`` fails ``is_link`` or ``valid_content_type``;
        otherwise a dict with whatever metadata was gathered (possibly
        empty).
    """
    if not is_link(url):
        return None

    if not valid_content_type(url):
        return None

    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    if data.get('oembed'):
        return data

    # With stream=True the connection is only released once the body has
    # been consumed or the response is closed.  The ``with`` block closes
    # it on every path, including a non-OK response whose body is never
    # read.
    with requests.get(mark_sanitized(url),
                      stream=True,
                      headers=HEADERS,
                      timeout=TIMEOUT) as response:
        if response.ok:
            # Decode the body once and share it between both parsers.
            page = response.text
            # ``or {}`` mirrors the guard used for the generic parser, in
            # case a parser yields nothing.
            og_data = OpenGraphParser(page).extract_data() or {}
            generic_data = GenericParser(page).extract_data() or {}
            # Open Graph takes precedence; generic data only fills gaps.
            for source in (og_data, generic_data):
                for key in ['title', 'description', 'image']:
                    if not data.get(key) and source.get(key):
                        data[key] = source[key]
    return data
Esempio n. 4
0
def get_link_embed_data(url, maxwidth=640, maxheight=480):
    # type: (Text, Optional[int], Optional[int]) -> Optional[Dict[Any, Any]]
    """Collect link-preview metadata for ``url``.

    Three sources are consulted, in this order:

    1. oEmbed (``get_oembed_data``).
    2. Open Graph tags (``OpenGraphParser``); these are merged over the
       oEmbed data.
    3. Generic page metadata (``GenericParser``); used only to fill in a
       ``title``, ``description`` or ``image`` that is still missing.

    Returns ``None`` when ``url`` fails ``is_link`` or the oEmbed lookup
    raises a ``requests`` error (which is logged); otherwise a dict with
    whatever metadata was gathered (possibly empty).
    """
    if not is_link(url):
        return None

    msg = 'Unable to fetch information from url {0}, traceback: {1}'
    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        logging.error(msg.format(url, traceback.format_exc()))
        return None
    data = data or {}

    # Without a timeout an unresponsive host would block this call forever.
    fetch_timeout = 15  # seconds
    try:
        response = requests.get(url, timeout=fetch_timeout)
    except requests.exceptions.RequestException:
        # Treat a failed page fetch like a non-OK response: log it and keep
        # whatever oEmbed already provided instead of letting the error
        # escape.
        logging.error(msg.format(url, traceback.format_exc()))
        return data

    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        generic_data = GenericParser(response.text).extract_data() or {}
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
Esempio n. 5
0
def get_link_embed_data(
        url: str,
        maxwidth: Optional[int] = 640,
        maxheight: Optional[int] = 480) -> Optional[Dict[str, Any]]:
    """Collect link-preview metadata for ``url``.

    Three sources are consulted, in this order:

    1. oEmbed (``get_oembed_data``).
    2. Open Graph tags (``OpenGraphParser``); these are merged over the
       oEmbed data.
    3. Generic page metadata (``GenericParser``); used only to fill in a
       ``title``, ``description`` or ``image`` that is still missing.

    Args:
        url: The address to inspect.
        maxwidth: Forwarded to ``get_oembed_data``.
        maxheight: Forwarded to ``get_oembed_data``.

    Returns:
        ``None`` when ``url`` fails ``is_link`` or ``valid_content_type``;
        otherwise a dict with whatever metadata was gathered (possibly
        empty).
    """
    if not is_link(url):
        return None

    if not valid_content_type(url):
        return None

    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}

    # With stream=True the connection is only released once the body has
    # been consumed or the response is closed.  The ``with`` block closes
    # it on every path, including a non-OK response whose body is never
    # read.
    with requests.get(url, stream=True, headers=HEADERS,
                      timeout=TIMEOUT) as response:
        if response.ok:
            # Decode the body once and share it between both parsers.
            page = response.text
            og_data = OpenGraphParser(page).extract_data()
            if og_data:
                data.update(og_data)
            generic_data = GenericParser(page).extract_data() or {}
            for key in ['title', 'description', 'image']:
                if not data.get(key) and generic_data.get(key):
                    data[key] = generic_data[key]
    return data
Esempio n. 6
0
def get_link_embed_data(url: str,
                        maxwidth: Optional[int]=640,
                        maxheight: Optional[int]=480) -> Optional[Dict[str, Any]]:
    """Collect link-preview metadata for ``url``.

    Three sources are consulted, in this order:

    1. oEmbed (``get_oembed_data``).
    2. Open Graph tags (``OpenGraphParser``); these are merged over the
       oEmbed data.
    3. Generic page metadata (``GenericParser``); used only to fill in a
       ``title``, ``description`` or ``image`` that is still missing.

    Args:
        url: The address to inspect.
        maxwidth: Forwarded to ``get_oembed_data``.
        maxheight: Forwarded to ``get_oembed_data``.

    Returns:
        ``None`` when ``url`` fails ``is_link`` or the oEmbed lookup raises
        a ``requests`` error; otherwise a dict with whatever metadata was
        gathered (possibly empty).
    """
    if not is_link(url):
        return None

    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        # This is what happens if the target URL cannot be fetched; in
        # that case, there's nothing to do here, and this URL has no
        # open graph data.
        return None
    data = data or {}

    # Without a timeout an unresponsive host would block this call forever.
    fetch_timeout = 15  # seconds
    try:
        response = requests.get(url, timeout=fetch_timeout)
    except requests.exceptions.RequestException:
        # Treat a failed page fetch like a non-OK response: keep whatever
        # oEmbed already provided instead of letting the error escape.
        return data

    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        generic_data = GenericParser(response.text).extract_data() or {}
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
Esempio n. 7
0
    def test_extract_description(self) -> None:
        """Check the ``description`` field GenericParser reports for three pages."""
        content_type = "text/html; charset=UTF-8"

        # A page whose only text is a paragraph nested inside two <div>s.
        nested_paragraph_page = b"""
          <html>
            <body>
                <div>
                    <div>
                        <p>Description text</p>
                    </div>
                </div>
            </body>
          </html>
        """
        extracted = GenericParser(nested_paragraph_page, content_type).extract_data()
        self.assertEqual(extracted.get("description"), "Description text")

        # A page with a description <meta> tag and an empty body.
        # NOTE(review): the <meta> tag below has no closing '>' -- presumably
        # deliberate to exercise lenient parsing; confirm.
        meta_tag_page = b"""
          <html>
            <head><meta name="description" content="description 123"</head>
            <body></body>
          </html>
        """
        extracted = GenericParser(meta_tag_page, content_type).extract_data()
        self.assertEqual(extracted.get("description"), "description 123")

        # A page with nothing to describe yields no description at all.
        empty_page = b"<html><body></body></html>"
        extracted = GenericParser(empty_page, content_type).extract_data()
        self.assertIsNone(extracted.get("description"))
Esempio n. 8
0
    def test_extract_description(self) -> None:
        """Check the ``description`` field GenericParser reports for three pages."""
        # A page whose only text is a paragraph nested inside two <div>s.
        nested_paragraph_page = """
          <html>
            <body>
                <div>
                    <div>
                        <p>Description text</p>
                    </div>
                </div>
            </body>
          </html>
        """
        extracted = GenericParser(nested_paragraph_page).extract_data()
        self.assertEqual(extracted.get('description'), 'Description text')

        # A page with a description <meta> tag and an empty body.
        # NOTE(review): the <meta> tag below has no closing '>' -- presumably
        # deliberate to exercise lenient parsing; confirm.
        meta_tag_page = """
          <html>
            <head><meta name="description" content="description 123"</head>
            <body></body>
          </html>
        """
        extracted = GenericParser(meta_tag_page).extract_data()
        self.assertEqual(extracted.get('description'), 'description 123')

        # A page with nothing to describe yields no description at all.
        empty_page = "<html><body></body></html>"
        extracted = GenericParser(empty_page).extract_data()
        self.assertIsNone(extracted.get('description'))
Esempio n. 9
0
 def test_parser(self) -> None:
     """GenericParser reports the <title> text and the paragraph text."""
     page = b"""
       <html>
         <head><title>Test title</title></head>
         <body>
             <h1>Main header</h1>
             <p>Description text</p>
         </body>
       </html>
     """
     extracted = GenericParser(page, "text/html; charset=UTF-8").extract_data()

     # The <title> element is expected as the title even though an <h1>
     # is also present.
     expected = {"title": "Test title", "description": "Description text"}
     for field, value in expected.items():
         self.assertEqual(extracted.get(field), value)
Esempio n. 10
0
 def test_parser(self) -> None:
     """GenericParser reports the <title> text and the paragraph text."""
     page = """
       <html>
         <head><title>Test title</title></head>
         <body>
             <h1>Main header</h1>
             <p>Description text</p>
         </body>
       </html>
     """
     extracted = GenericParser(page).extract_data()

     # The <title> element is expected as the title even though an <h1>
     # is also present.
     expected = {'title': 'Test title', 'description': 'Description text'}
     for field, value in expected.items():
         self.assertEqual(extracted.get(field), value)
Esempio n. 11
0
 def test_parser(self) -> None:
     """Parse a small page and verify its title and description fields."""
     markup = """
       <html>
         <head><title>Test title</title></head>
         <body>
             <h1>Main header</h1>
             <p>Description text</p>
         </body>
       </html>
     """
     fields = GenericParser(markup).extract_data()

     # Title is expected from <title>, description from the <p> element.
     title = fields.get('title')
     description = fields.get('description')
     self.assertEqual(title, 'Test title')
     self.assertEqual(description, 'Description text')
Esempio n. 12
0
def get_link_embed_data(url: str,
                        maxwidth: int = 640,
                        maxheight: int = 480) -> Optional[Dict[str, Any]]:
    """Collect link-preview metadata for ``url``.

    Two mechanisms are used:

    1. oEmbed data; when the result carries a truthy ``"oembed"`` entry it
       is returned as-is.
    2. Otherwise Open Graph tags and then generic page metadata, each only
       filling in a ``title``, ``description`` or ``image`` that is still
       missing.

    Args:
        url: The address to inspect.
        maxwidth: Forwarded to ``get_oembed_data``.
        maxheight: Forwarded to ``get_oembed_data``.

    Returns:
        ``None`` when ``url`` fails ``is_link`` or ``valid_content_type``;
        otherwise a dict with whatever metadata was gathered (possibly
        empty).
    """
    if not (is_link(url) and valid_content_type(url)):
        return None

    embed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    if embed.get("oembed"):
        return embed

    # NOTE(review): the response is requested with stream=True and is never
    # explicitly closed -- confirm whether PreviewSession or its response
    # needs closing.
    response = PreviewSession().get(mark_sanitized(url), stream=True)
    if not response.ok:
        return embed

    def fill_missing(source: Dict[str, Any]) -> None:
        # Copy over only the preview fields that are still empty.
        for field in ("title", "description", "image"):
            if embed.get(field):
                continue
            candidate = source.get(field)
            if candidate:
                embed[field] = candidate

    body = response.content
    content_type = response.headers.get("Content-Type")

    # Open Graph tags are consulted first; generic metadata fills the rest.
    fill_missing(OpenGraphParser(body, content_type).extract_data())
    fill_missing(GenericParser(body, content_type).extract_data() or {})
    return embed
Esempio n. 13
0
 def test_extract_image(self) -> None:
     """GenericParser reports the heading, paragraph and image of a page."""
     page = """
       <html>
         <body>
             <h1>Main header</h1>
             <img src="http://test.com/test.jpg">
             <div>
                 <p>Description text</p>
             </div>
         </body>
       </html>
     """
     extracted = GenericParser(page).extract_data()

     # With no <title> in the page, the <h1> text is the expected title.
     expected = {
         'title': 'Main header',
         'description': 'Description text',
         'image': 'http://test.com/test.jpg',
     }
     for field, value in expected.items():
         self.assertEqual(extracted.get(field), value)
Esempio n. 14
0
 def test_extract_image(self) -> None:
     """Parse a page with an <img> and verify title, description and image."""
     markup = """
       <html>
         <body>
             <h1>Main header</h1>
             <img src="http://test.com/test.jpg">
             <div>
                 <p>Description text</p>
             </div>
         </body>
       </html>
     """
     fields = GenericParser(markup).extract_data()

     # Title is expected from the <h1>, description from the nested <p>,
     # image from the <img> src attribute.
     title = fields.get('title')
     description = fields.get('description')
     image = fields.get('image')
     self.assertEqual(title, 'Main header')
     self.assertEqual(description, 'Description text')
     self.assertEqual(image, 'http://test.com/test.jpg')
Esempio n. 15
0
 def test_extract_image(self) -> None:
     """GenericParser picks the <img> that has a ``src`` attribute."""
     # The first <img> carries only ``data-src``; the expected image is
     # the ``src`` of the second one.
     page = b"""
       <html>
         <body>
             <h1>Main header</h1>
             <img data-src="Not an image">
             <img src="http://test.com/test.jpg">
             <div>
                 <p>Description text</p>
             </div>
         </body>
       </html>
     """
     extracted = GenericParser(page, "text/html; charset=UTF-8").extract_data()

     expected = {
         "title": "Main header",
         "description": "Description text",
         "image": "http://test.com/test.jpg",
     }
     for field, value in expected.items():
         self.assertEqual(extracted.get(field), value)
Esempio n. 16
0
    def test_extract_description(self):
        # type: () -> None
        """Check the ``description`` field GenericParser reports for three pages."""
        # A page whose only text is a paragraph nested inside two <div>s.
        nested_paragraph_page = """
          <html>
            <body>
                <div>
                    <div>
                        <p>Description text</p>
                    </div>
                </div>
            </body>
          </html>
        """
        extracted = GenericParser(nested_paragraph_page).extract_data()
        self.assertEqual(extracted.get('description'), 'Description text')

        # A page with a description <meta> tag and an empty body.
        # NOTE(review): the <meta> tag below has no closing '>' -- presumably
        # deliberate to exercise lenient parsing; confirm.
        meta_tag_page = """
          <html>
            <head><meta name="description" content="description 123"</head>
            <body></body>
          </html>
        """
        extracted = GenericParser(meta_tag_page).extract_data()
        self.assertEqual(extracted.get('description'), 'description 123')

        # A page with nothing to describe yields no description at all.
        empty_page = "<html><body></body></html>"
        extracted = GenericParser(empty_page).extract_data()
        self.assertIsNone(extracted.get('description'))