Example #1
0
def get_link_embed_data(url: str,
                        maxwidth: Optional[int]=640,
                        maxheight: Optional[int]=480) -> Optional[Dict[Any, Any]]:
    """Collect embed/preview metadata for ``url``.

    Sources are consulted in this order: oEmbed, Open Graph tags, then
    generic meta tags.  Open Graph values overwrite oEmbed values; generic
    values only fill in ``title``/``description``/``image`` when those are
    still missing or empty.

    Returns ``None`` when ``is_link(url)`` is false, or when the oEmbed
    lookup raises ``requests.exceptions.RequestException`` (the failure is
    logged).  Otherwise returns the merged dict, which may be empty.

    Raises:
        requests.exceptions.RequestException: if fetching the page itself
            fails or times out; that request is not wrapped in the ``try``.
    """
    # Seconds to wait for the server to connect or send data.  Without an
    # explicit timeout, requests may block indefinitely on a silent server.
    page_fetch_timeout = 15

    if not is_link(url):
        return None
    # Fetch information from URL.
    # We are using three sources in next order:
    # 1. OEmbed
    # 2. Open Graph
    # 3. Meta tags
    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        msg = 'Unable to fetch information from url {0}, traceback: {1}'
        logging.error(msg.format(url, traceback.format_exc()))
        return None
    data = data or {}
    response = requests.get(url, timeout=page_fetch_timeout)
    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        generic_data = GenericParser(response.text).extract_data() or {}
        # Generic meta tags are the last resort: never override a value
        # already supplied by oEmbed or Open Graph.
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
Example #2
0
def get_link_embed_data(url: str,
                        maxwidth: int = 640,
                        maxheight: int = 480) -> Optional[Dict[str, Any]]:
    """Collect embed/preview metadata for ``url``.

    Returns ``None`` when either ``is_link(url)`` or
    ``valid_content_type(url)`` is false.  When the oEmbed result carries a
    truthy ``"oembed"`` entry it is returned unchanged.  Otherwise the page
    is fetched and ``title``/``description``/``image`` are filled in, first
    from Open Graph tags and then from generic meta tags, without replacing
    a value that is already present.  If the page response is not OK, the
    oEmbed data gathered so far (possibly empty) is returned.
    """
    if not is_link(url):
        return None

    if not valid_content_type(url):
        return None

    # We are using two different mechanisms to get the embed data
    # 1. Use OEmbed data, if found, for photo and video "type" sites
    # 2. Otherwise, use a combination of Open Graph tags and Meta tags
    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    if data.get("oembed"):
        return data

    response = PreviewSession().get(mark_sanitized(url), stream=True)
    if not response.ok:
        return data

    content = response.content
    content_type = response.headers.get("Content-Type")
    # Parsers are tried in priority order; a later parser only fills the
    # keys that earlier sources left missing or empty.
    for parser_class in (OpenGraphParser, GenericParser):
        # Guard every parser result the same way, so a parser that yields
        # nothing cannot break the ``.get`` lookups below.
        extracted = parser_class(content, content_type).extract_data() or {}
        for key in ("title", "description", "image"):
            if not data.get(key) and extracted.get(key):
                data[key] = extracted[key]
    return data
Example #3
0
 def test_error_request(self, get):
     # type: (Any) -> None
     """get_oembed_data returns None when the HTTP response is not OK.

     ``get`` is presumably the patched HTTP getter supplied by a decorator
     outside this block; its return value stands in for the response.
     """
     failed_response = mock.Mock()
     failed_response.ok = False
     get.return_value = failed_response
     self.assertIsNone(get_oembed_data('http://instagram.com/p/BLtI2WdAymy'))
Example #4
0
def get_link_embed_data(
        url: str,
        maxwidth: Optional[int] = 640,
        maxheight: Optional[int] = 480) -> Optional[Dict[str, Any]]:
    """Gather embed metadata for ``url`` from oEmbed, Open Graph and meta tags.

    Returns ``None`` if ``is_link`` or ``valid_content_type`` rejects the
    URL.  Otherwise returns a dict seeded with the oEmbed data, overwritten
    by any Open Graph data, with ``title``/``description``/``image`` filled
    from generic meta tags only where still missing or empty.
    """
    if not is_link(url) or not valid_content_type(url):
        return None

    # Source priority: 1. OEmbed, 2. Open Graph, 3. Meta tags.
    embed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    page = requests.get(url, stream=True, headers=HEADERS, timeout=TIMEOUT)
    if not page.ok:
        return embed

    html = page.text
    open_graph = OpenGraphParser(html).extract_data()
    if open_graph:
        embed.update(open_graph)

    meta = GenericParser(html).extract_data() or {}
    for field in ('title', 'description', 'image'):
        fallback = meta.get(field)
        # Meta tags only plug holes; they never replace an existing value.
        if fallback and not embed.get(field):
            embed[field] = fallback
    return embed
Example #5
0
def get_link_embed_data(
        url: str,
        maxwidth: Optional[int] = 640,
        maxheight: Optional[int] = 480) -> Optional[Dict[str, Any]]:
    """Collect embed/preview metadata for ``url``.

    Sources, in order: oEmbed, Open Graph tags, generic meta tags.  Open
    Graph values overwrite oEmbed values; generic values only fill in
    ``title``/``description``/``image`` when still missing or empty.

    Returns ``None`` when ``is_link(url)`` is false or when the oEmbed
    lookup raises ``requests.exceptions.RequestException``; otherwise the
    merged dict (possibly empty).

    Raises:
        requests.exceptions.RequestException: if the page fetch itself
            fails or times out; that call sits outside the ``try`` block.
    """
    # Seconds to wait for the server to connect or send data; requests
    # applies no timeout by default and could otherwise hang forever.
    page_fetch_timeout = 15

    if not is_link(url):
        return None
    # Fetch information from URL.
    # We are using three sources in next order:
    # 1. OEmbed
    # 2. Open Graph
    # 3. Meta tags
    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        # This is what happens if the target URL cannot be fetched; in
        # that case, there's nothing to do here, and this URL has no
        # open graph data.
        return None
    data = data or {}
    response = requests.get(url, timeout=page_fetch_timeout)
    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        generic_data = GenericParser(response.text).extract_data() or {}
        # Last-resort source: fill gaps only, never override.
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
Example #6
0
def get_link_embed_data(url: str,
                        maxwidth: int = 640,
                        maxheight: int = 480) -> Optional[UrlEmbedData]:
    """Build the embed data for ``url``.

    Returns ``None`` when ``is_link`` or ``valid_content_type`` rejects the
    URL, or when the page response is not OK.  An oEmbed result that is a
    ``UrlOEmbedData`` is returned as-is.  Any other oEmbed result (or a
    fresh ``UrlEmbedData`` when there is none) is used as the base, into
    which Open Graph and then generic parser results are merged.  A
    non-empty ``image`` is resolved against the final response URL.
    """
    if not (is_link(url) and valid_content_type(url)):
        return None

    oembed = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    # isinstance() is already False for None, so this single check covers
    # both "no oEmbed data" and "oEmbed data of the plain kind".
    if isinstance(oembed, UrlOEmbedData):
        return oembed

    response = PreviewSession().get(mark_sanitized(url), stream=True)
    if not response.ok:
        return None

    embed = UrlEmbedData() if oembed is None else oembed

    # Successive fallbacks, most specific parser first.
    for parser_class in (OpenGraphParser, GenericParser):
        extracted = parser_class(
            response.content,
            response.headers.get("Content-Type"),
        ).extract_data()
        embed.merge(extracted)

    if embed.image:
        # Turn a possibly-relative image reference into an absolute URL.
        embed.image = urljoin(response.url, embed.image)
    return embed
Example #7
0
 def test_error_request(self, get):
     # type: (Any) -> None
     """A response whose ``ok`` flag is False must yield no oEmbed data."""
     # Keyword arguments to Mock() become attributes of the mock.
     get.return_value = mock.Mock(ok=False)
     result = get_oembed_data('http://instagram.com/p/BLtI2WdAymy')
     self.assertIsNone(result)
Example #8
0
def get_link_embed_data(url: str,
                        maxwidth: Optional[int]=640,
                        maxheight: Optional[int]=480) -> Optional[Dict[str, Any]]:
    """Collect embed/preview metadata for ``url``.

    oEmbed data is fetched first, Open Graph data then overwrites it, and
    generic meta tags finally fill in ``title``/``description``/``image``
    where those are still missing or empty.

    Returns ``None`` when ``is_link(url)`` is false or when the oEmbed
    lookup raises ``requests.exceptions.RequestException``; otherwise the
    merged dict (possibly empty).

    Raises:
        requests.exceptions.RequestException: if fetching the page fails or
            times out; only the oEmbed lookup is wrapped in ``try``.
    """
    if not is_link(url):
        return None
    # Fetch information from URL.
    # We are using three sources in next order:
    # 1. OEmbed
    # 2. Open Graph
    # 3. Meta tags
    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        # This is what happens if the target URL cannot be fetched; in
        # that case, there's nothing to do here, and this URL has no
        # open graph data.
        return None
    data = data or {}

    # requests applies no timeout unless told to; bound the wait (seconds
    # to connect / between bytes) so an unresponsive server cannot hang us.
    page_fetch_timeout = 15
    response = requests.get(url, timeout=page_fetch_timeout)
    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        generic_data = GenericParser(response.text).extract_data() or {}
        # Generic tags fill gaps only; existing values win.
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
Example #9
0
    def test_video_provider(self) -> None:
        """A "video" oEmbed payload is returned as ``UrlOEmbedData`` with its title."""
        video_url = "http://blip.tv/video/158727223"
        oembed_payload = {
            "type": "video",
            "thumbnail_url":
            "https://scontent.cdninstagram.com/t51.2885-15/n.jpg",
            "thumbnail_width": 640,
            "thumbnail_height": 426,
            "title": "NASA",
            "html": "<p>test</p>",
            "version": "1.0",
            "width": 658,
            "height": 400,
        }
        # Serve the payload from the URL that reconstruct_url() derives.
        responses.add(
            responses.GET,
            reconstruct_url(video_url),
            json=oembed_payload,
            status=200,
        )

        data = get_oembed_data(video_url)
        assert data is not None
        self.assertIsInstance(data, UrlOEmbedData)
        self.assertEqual(data.title, oembed_payload["title"])
Example #10
0
def get_link_embed_data(
        url: str,
        maxwidth: Optional[int] = 640,
        maxheight: Optional[int] = 480) -> Optional[Dict[str, Any]]:
    """Collect embed/preview metadata for ``url``.

    Returns ``None`` when either ``is_link(url)`` or
    ``valid_content_type(url)`` is false.  An oEmbed result with a truthy
    ``'oembed'`` entry is returned unchanged.  Otherwise the page is
    fetched and ``title``/``description``/``image`` are filled in, first
    from Open Graph tags and then from generic meta tags, never replacing
    a value that is already present.  If the page response is not OK, the
    oEmbed data gathered so far (possibly empty) is returned.
    """
    if not is_link(url):
        return None

    if not valid_content_type(url):
        return None

    # We are using two different mechanisms to get the embed data
    # 1. Use OEmbed data, if found, for photo and video "type" sites
    # 2. Otherwise, use a combination of Open Graph tags and Meta tags
    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight) or {}
    if data.get('oembed'):
        return data

    response = requests.get(mark_sanitized(url),
                            stream=True,
                            headers=HEADERS,
                            timeout=TIMEOUT)
    if not response.ok:
        return data

    html = response.text
    # Parsers run in priority order; each one only fills the keys that
    # earlier sources left missing or empty.
    for parser_class in (OpenGraphParser, GenericParser):
        # Apply the same empty-result guard to every parser so a parser
        # yielding nothing cannot break the ``.get`` lookups below.
        extracted = parser_class(html).extract_data() or {}
        for key in ('title', 'description', 'image'):
            if not data.get(key) and extracted.get(key):
                data[key] = extracted[key]
    return data
Example #11
0
 def test_photo_provider(self, get: Any) -> None:
     """A "photo" oEmbed payload yields a dict with its title and a truthy "oembed" entry."""
     oembed_payload = {
         "type": "photo",
         "thumbnail_url":
         "https://scontent.cdninstagram.com/t51.2885-15/n.jpg",
         "url": "https://scontent.cdninstagram.com/t51.2885-15/n.jpg",
         "thumbnail_width": 640,
         "thumbnail_height": 426,
         "title": "NASA",
         "html": "<p>test</p>",
         "version": "1.0",
         "width": 658,
         "height": 400,
     }
     # Stand-in for the HTTP response handed back by the mocked getter.
     fake_response = mock.Mock()
     fake_response.headers = {"content-type": "application/json"}
     fake_response.ok = True
     fake_response.text = orjson.dumps(oembed_payload).decode()
     get.return_value = fake_response

     data = get_oembed_data("http://imgur.com/photo/158727223")
     self.assertIsInstance(data, dict)
     self.assertIn("title", data)
     assert data is not None  # allow mypy to infer data is indexable
     self.assertEqual(data["title"], oembed_payload["title"])
     self.assertTrue(data["oembed"])
Example #12
0
def get_link_embed_data(url, maxwidth=640, maxheight=480):
    # type: (Text, Optional[int], Optional[int]) -> Optional[Dict[Any, Any]]
    """Collect embed/preview metadata for ``url``.

    Sources, in order: oEmbed, Open Graph tags, generic meta tags.  Open
    Graph values overwrite oEmbed values; generic values only fill in
    ``title``/``description``/``image`` when still missing or empty.

    Returns ``None`` when ``is_link(url)`` is false, or when the oEmbed
    lookup raises ``requests.exceptions.RequestException`` (which is
    logged); otherwise the merged dict (possibly empty).

    Raises ``requests.exceptions.RequestException`` if the page fetch
    itself fails or times out, since that call is outside the ``try``.
    """
    if not is_link(url):
        return None
    # Fetch information from URL.
    # We are using three sources in next order:
    # 1. OEmbed
    # 2. Open Graph
    # 3. Meta tags
    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        msg = 'Unable to fetch information from url {0}, traceback: {1}'
        logging.error(msg.format(url, traceback.format_exc()))
        return None
    data = data or {}

    # Seconds to wait for the server to connect or send data.  requests
    # has no default timeout, so an unresponsive server would block forever.
    page_fetch_timeout = 15
    response = requests.get(url, timeout=page_fetch_timeout)
    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        generic_data = GenericParser(response.text).extract_data() or {}
        # Generic meta tags never override values found earlier.
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
Example #13
0
 def test_photo_provider(self, get: Any) -> None:
     """A 'photo' oEmbed payload yields a dict with its title and a truthy 'oembed' entry."""
     photo_url = 'http://imgur.com/photo/158727223'
     oembed_payload = {
         'type': 'photo',
         'thumbnail_url':
         'https://scontent.cdninstagram.com/t51.2885-15/n.jpg',
         'url': 'https://scontent.cdninstagram.com/t51.2885-15/n.jpg',
         'thumbnail_width': 640,
         'thumbnail_height': 426,
         'title': 'NASA',
         'html': '<p>test</p>',
         'version': '1.0',
         'width': 658,
         'height': 400
     }
     # Keyword arguments to Mock() become attributes of the fake response.
     get.return_value = mock.Mock(
         headers={'content-type': 'application/json'},
         ok=True,
         text=ujson.dumps(oembed_payload),
     )

     data = get_oembed_data(photo_url)
     self.assertIsInstance(data, dict)
     self.assertIn('title', data)
     assert data is not None  # allow mypy to infer data is indexable
     self.assertEqual(data['title'], oembed_payload['title'])
     self.assertTrue(data['oembed'])
Example #14
0
    def test_photo_provider(self) -> None:
        """A "photo" oEmbed payload yields a dict with its title and a truthy "oembed" entry."""
        # pyoembed.providers.imgur only works with http:// URLs, not https:// (!)
        photo_url = "http://imgur.com/photo/158727223"
        oembed_payload = {
            "type": "photo",
            "thumbnail_url":
            "https://scontent.cdninstagram.com/t51.2885-15/n.jpg",
            "url": "https://scontent.cdninstagram.com/t51.2885-15/n.jpg",
            "thumbnail_width": 640,
            "thumbnail_height": 426,
            "title": "NASA",
            "html": "<p>test</p>",
            "version": "1.0",
            "width": 658,
            "height": 400,
        }
        # Serve the payload from the URL that reconstruct_url() derives.
        responses.add(
            responses.GET,
            reconstruct_url(photo_url),
            json=oembed_payload,
            status=200,
        )

        data = get_oembed_data(photo_url)
        self.assertIsInstance(data, dict)
        self.assertIn("title", data)
        assert data is not None  # allow mypy to infer data is indexable
        self.assertEqual(data["title"], oembed_payload["title"])
        self.assertTrue(data["oembed"])
Example #15
0
    def test_present_provider(self) -> None:
        """A "rich" oEmbed payload is returned as a dict that carries its title."""
        post_url = "http://instagram.com/p/BLtI2WdAymy"
        oembed_payload = {
            "type": "rich",
            "thumbnail_url":
            "https://scontent.cdninstagram.com/t51.2885-15/n.jpg",
            "thumbnail_width": 640,
            "thumbnail_height": 426,
            "title": "NASA",
            "html": "<p>test</p>",
            "version": "1.0",
            "width": 658,
            "height": 400,
        }
        # Serve the payload from the URL that reconstruct_url() derives.
        responses.add(
            responses.GET,
            reconstruct_url(post_url),
            json=oembed_payload,
            status=200,
        )

        data = get_oembed_data(post_url)
        self.assertIsInstance(data, dict)
        self.assertIn("title", data)
        assert data is not None  # allow mypy to infer data is indexable
        self.assertEqual(data["title"], oembed_payload["title"])
Example #16
0
 def test_invalid_json_in_response(self, get: Any) -> None:
     """An OK JSON-typed response whose text is not valid JSON yields None."""
     # Keyword arguments to Mock() become attributes of the fake response.
     get.return_value = mock.Mock(
         headers={"content-type": "application/json"},
         ok=True,
         text="{invalid json}",
     )
     self.assertIsNone(get_oembed_data("http://instagram.com/p/BLtI2WdAymy"))
Example #17
0
 def test_invalid_json_in_response(self) -> None:
     """get_oembed_data must return None when the response body is not valid JSON."""
     url = "http://instagram.com/p/BLtI2WdAymy"
     reconstructed_url = reconstruct_url(url)
     responses.add(
         responses.GET,
         reconstructed_url,
         # ``json=`` would JSON-encode this string into a *valid* JSON
         # document (a quoted string), so the malformed text has to be
         # sent as the raw body with an explicit JSON content type.
         body="{invalid json}",
         content_type="application/json",
         status=200,
     )
     data = get_oembed_data(url)
     self.assertIsNone(data)
Example #18
0
 def test_present_provider(self, get: Any) -> None:
     """A 'rich' oEmbed payload is returned as a dict that carries its title."""
     oembed_payload = {
         'type': 'rich',
         'thumbnail_url': 'https://scontent.cdninstagram.com/t51.2885-15/n.jpg',
         'thumbnail_width': 640,
         'thumbnail_height': 426,
         'title': 'NASA',
         'html': '<p>test</p>',
         'version': '1.0',
         'width': 658,
         'height': None}
     # Stand-in for the HTTP response handed back by the mocked getter.
     fake_response = mock.Mock()
     fake_response.headers = {'content-type': 'application/json'}
     fake_response.ok = True
     fake_response.text = ujson.dumps(oembed_payload)
     get.return_value = fake_response

     data = get_oembed_data('http://instagram.com/p/BLtI2WdAymy')
     self.assertIsInstance(data, dict)
     self.assertIn('title', data)
     self.assertEqual(data['title'], oembed_payload['title'])
Example #19
0
 def test_present_provider(self, get: Any) -> None:
     """A 'rich' oEmbed payload is returned as a dict that carries its title."""
     post_url = 'http://instagram.com/p/BLtI2WdAymy'
     oembed_payload = {
         'type': 'rich',
         'thumbnail_url':
         'https://scontent.cdninstagram.com/t51.2885-15/n.jpg',
         'thumbnail_width': 640,
         'thumbnail_height': 426,
         'title': 'NASA',
         'html': '<p>test</p>',
         'version': '1.0',
         'width': 658,
         'height': None
     }
     # Keyword arguments to Mock() become attributes of the fake response.
     get.return_value = mock.Mock(
         headers={'content-type': 'application/json'},
         ok=True,
         text=ujson.dumps(oembed_payload),
     )

     data = get_oembed_data(post_url)
     self.assertIsInstance(data, dict)
     self.assertIn('title', data)
     self.assertEqual(data['title'], oembed_payload['title'])
Example #20
0
 def test_500_error_request(self) -> None:
     """A 500 response for the reconstructed URL makes get_oembed_data return None."""
     post_url = "http://instagram.com/p/BLtI2WdAymy"
     responses.add(
         responses.GET,
         reconstruct_url(post_url),
         status=500,
     )
     self.assertIsNone(get_oembed_data(post_url))
Example #21
0
 def test_connect_error_request(self) -> None:
     """A ConnectionError raised for the reconstructed URL makes get_oembed_data return None."""
     post_url = "http://instagram.com/p/BLtI2WdAymy"
     # Passing an exception instance as ``body`` makes the request raise it.
     responses.add(
         responses.GET,
         reconstruct_url(post_url),
         body=ConnectionError(),
     )
     self.assertIsNone(get_oembed_data(post_url))