Example #1
0
    def _real_extract(self, url):
        """Build the info dict for the media item embedded in the page at *url*.

        The page is downloaded, the title and description are read from its
        ``<meta>`` tags, and the JSON stored in the page's
        ``data-hr-mediaplayer-loader`` attribute is handed to
        ``extract_formats`` / ``extract_airdate`` and also supplies the
        optional subtitle and preview-image URLs.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``formats`` and ``timestamp``.  ``subtitles`` and ``thumbnails`` are
        only added when the loader data contains them.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_meta(
            ['og:title', 'twitter:title', 'name'], webpage)
        description = self._html_search_meta(
            ['description'], webpage)

        # The loader configuration is HTML-escaped JSON stored in a
        # single-quoted attribute, so it is unescaped before being parsed.
        loader_str = unescapeHTML(self._search_regex(
            r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
        loader_data = json.loads(loader_str)

        info = {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': self.extract_formats(loader_data),
            'timestamp': self.extract_airdate(loader_data),
        }

        if "subtitle" in loader_data:
            info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]}

        # Drop duplicate preview URLs but keep them in first-seen order, so
        # the resulting list is the same on every run (a plain set() would
        # shuffle it).  `or {}` also covers an explicit JSON null.
        thumbnails = []
        seen_urls = set()
        for thumb_url in (loader_data.get("previewImageUrl") or {}).values():
            if thumb_url in seen_urls:
                continue
            seen_urls.add(thumb_url)
            thumbnails.append({"url": thumb_url})
        if thumbnails:
            info["thumbnails"] = thumbnails

        return info
Example #2
0
 def parse_page(self, url):
     """Download *url* and list the video links found in the page.

     Returns a list of dicts with the keys 'url', 'title' and 'thumbnail',
     one per anchor matched by the pattern below.
     """
     webpage = self._download_webpage(url, 'main')
     anchors = re.findall(r'<a href="([^"]+vid=[0-9]+)" onclick="[^"]+">\s*<img src="([^"]+)"\s+alt="([^"]+)"(?:\s+width="\d+"\s+height="\d+"\s+data-frz-src="([^"]+)")?', webpage)
     result = []
     # Every match is a 4-tuple; the last group is '' when the optional
     # width/height/data-frz-src part is absent.
     for href, img_src, alt_text, frz_src in anchors:
         result.append({
             'url': 'http://www.canalplus.fr' + href,
             'title': unescapeHTML(alt_text),
             # A captured data-frz-src takes precedence over the plain src.
             'thumbnail': frz_src if frz_src else img_src,
         })
     return result
Example #3
0
 def test_unescape_html(self):
     # Table of (input, expected unescapeHTML output), checked in order.
     cases = (
         # Not an entity: expected back unchanged.
         ('%20;', '%20;'),
         # Hexadecimal and decimal numeric references.
         ('&#x2F;', '/'),
         ('&#47;', '/'),
         # Named entity.
         ('&eacute;', 'é'),
         # Numeric reference that is expected to be left as written.
         ('&#2013266066;', '&#2013266066;'),
         # HTML5 entities
         ('&period;&apos;', '.\''),
     )
     for raw, expected in cases:
         self.assertEqual(unescapeHTML(raw), expected)
Example #4
0
 def test_unescape_html(self):
     # Text that is not an entity must come back untouched.
     plain = unescapeHTML("%20;")
     self.assertEqual(plain, "%20;")

     # Numeric references, hexadecimal then decimal.
     from_hex = unescapeHTML("&#x2F;")
     self.assertEqual(from_hex, "/")
     from_dec = unescapeHTML("&#47;")
     self.assertEqual(from_dec, "/")

     # Named entity.
     named = unescapeHTML("&eacute;")
     self.assertEqual(named, "é")

     # This numeric reference is expected to be returned as written.
     oversized = unescapeHTML("&#2013266066;")
     self.assertEqual(oversized, "&#2013266066;")

     # HTML5 entities
     html5 = unescapeHTML("&period;&apos;")
     self.assertEqual(html5, ".'")
Example #5
0
 def test_unescape_html(self):
     # Small local helper so each case reads as "input -> expected".
     def check(raw, expected):
         self.assertEqual(unescapeHTML(raw), expected)

     # Non-entity text is expected back unchanged.
     check('%20;', '%20;')
     # Numeric references (hex, decimal) and a named entity.
     check('&#x2F;', '/')
     check('&#47;', '/')
     check('&eacute;', 'é')
     # This numeric reference is expected to stay as written.
     check('&#2013266066;', '&#2013266066;')
     # HTML5 entities
     check('&period;&apos;', '.\'')
Example #6
0
 def parse_page(self, url):
     """Download *url* and pair each thumbnail link with its title.

     Two independent scans are run over the page: one collects
     (href, image src) pairs, the other (href, title text) pairs.  The
     n-th result of each scan is combined into one entry.

     Returns a list of dicts with the keys 'url', 'title' and 'thumbnail'.
     If the two scans find a different number of matches, only the
     positions present in both are returned.
     """
     webpage = self._download_webpage(url, 'main')
     url_thumb_list = re.findall(r'<a\s+href="([^"]+)"><img\s+width="\d+"\s+height="\d+"\s+src="([^"]+)"', webpage)
     url_title_list = re.findall(r'<h3 class="internet-title"><a href="([^"]+)">([^<]+)<', webpage)
     result = []
     # zip() stops at the shorter list, so a page with fewer titles than
     # thumbnails no longer raises IndexError.  The href captured by the
     # title scan is not used; the entry URL comes from the thumbnail scan.
     for (link, thumbnail), (_title_link, raw_title) in zip(url_thumb_list, url_title_list):
         result.append({
             'url': link,
             'title': unescapeHTML(raw_title),
             'thumbnail': thumbnail,
         })
     return result
Example #7
0
    def parse_page(self, url):
        """Download *url* and list the watch links found in the page.

        Increments ``self.parse_page_counter`` on every call.  Returns a
        list of dicts with the keys 'url', 'title' and 'thumbnail'.
        """
        webpage = self._download_webpage(url, 'main')
        # returns an incomplete page the first time, so the very first call
        # requests the page again and uses the second response
        if self.parse_page_counter == 0:
            webpage = self._download_webpage(url, 'main')
        self.parse_page_counter += 1

        matches = re.findall(r'<a href="/watch\?v=([^"]+)" class="[^"]+" data-sessionlink="[^"]+" title="([^"]+)"', webpage)
        # Each match is (value of the v= parameter, raw title attribute).
        return [
            {
                'url': 'https://www.youtube.com/watch?v=' + watch_id,
                'title': unescapeHTML(raw_title),
                'thumbnail': 'https://i.ytimg.com/vi/' + watch_id + '/mqdefault.jpg',
            }
            for watch_id, raw_title in matches
        ]
Example #8
0
 def parse_page(self, url):
     """Download *url* and combine its links, titles and thumbnails.

     Three independent scans are run over the page (video links, titles and
     ``data-srcset`` image URLs) and merged by position, driven by the
     title list.

     Returns a list of dicts with the keys 'url', 'title' and 'thumbnail'.
     A title for which no link or no thumbnail can be found at the expected
     position is left out rather than raising IndexError or being paired
     with an unrelated link.
     """
     webpage = self._download_webpage(url, 'main')
     url_list = re.findall(r'<a\s+href="([^"]+)"\s+class="videoLink', webpage)
     thumbnail_list = re.findall(r'data-srcset="([^ ]+) 1x', webpage)
     title_list = re.findall(r'<p class="title">([^<]+)</p><p class="stitle">', webpage)

     # Titles are matched against the LAST len(title_list) links; any extra
     # links at the start of the page are ignored.
     url_offset = len(url_list) - len(title_list)
     # Only every third "1x" srcset match is used.
     # NOTE(review): presumably each item carries three such images; confirm
     # against the page markup.
     thumbnail_step = 3

     result = []
     for i, raw_title in enumerate(title_list):
         url_index = url_offset + i
         thumbnail_index = i * thumbnail_step
         # A negative url_index would silently wrap around to the end of
         # url_list, and an oversized thumbnail_index would raise.
         if url_index < 0 or thumbnail_index >= len(thumbnail_list):
             continue
         result.append({
             'url': 'http://www.tf1.fr' + url_list[url_index],
             'title': unescapeHTML(raw_title),
             'thumbnail': 'http:' + thumbnail_list[thumbnail_index],
         })
     return result
Example #9
0
 def test_unescape_html(self):
     # Text that is not an entity must pass through unescapeHTML unchanged;
     # both sides are wrapped in _compat_str before comparing.
     decoded = unescapeHTML(_compat_str('%20;'))
     expected = _compat_str('%20;')
     self.assertEqual(decoded, expected)
 def test_unescape_html(self):
     # (input, expected unescapeHTML output): non-entity text is expected
     # back unchanged, the named entity is expected to be decoded.
     cases = (
         ('%20;', '%20;'),
         ('&eacute;', 'é'),
     )
     for raw, expected in cases:
         self.assertEqual(unescapeHTML(raw), expected)
Example #11
0
 def test_unescape_html(self):
     # Inputs and the results unescapeHTML is expected to give for them,
     # kept as two parallel tuples and compared position by position.
     inputs = ('%20;', '&#x2F;', '&#47;', '&eacute;', '&#2013266066;')
     # The first and last inputs are expected back unchanged.
     outputs = ('%20;', '/', '/', 'é', '&#2013266066;')
     for raw, expected in zip(inputs, outputs):
         self.assertEqual(unescapeHTML(raw), expected)
Example #12
0
	def test_unescape_html(self):
		# Text that is not an entity must come back from unescapeHTML unchanged.
		decoded = unescapeHTML(u"%20;")
		self.assertEqual(decoded, u"%20;")
Example #13
0
 def test_unescape_html(self):
     # unescapeHTML must leave non-entity text alone; the input and the
     # expected value are each built through _compat_str.
     actual = unescapeHTML(_compat_str('%20;'))
     wanted = _compat_str('%20;')
     self.assertEqual(actual, wanted)
 def test_unescape_html(self):
     # Non-entity text is expected back unchanged.
     untouched = unescapeHTML('%20;')
     self.assertEqual(untouched, '%20;')
     # A named entity is expected to be decoded to its character.
     accented = unescapeHTML('&eacute;')
     self.assertEqual(accented, 'é')
Example #15
0
 def test_unescape_html(self):
     # Local shorthand: assert that unescapeHTML(raw) equals expected.
     def expect(raw, expected):
         self.assertEqual(unescapeHTML(raw), expected)

     # Not an entity: unchanged.
     expect('%20;', '%20;')
     # Hexadecimal and decimal numeric references.
     expect('&#x2F;', '/')
     expect('&#47;', '/')
     # Named entity.
     expect('&eacute;', 'é')
     # Numeric reference that is expected to stay as written.
     expect('&#2013266066;', '&#2013266066;')
Example #16
0
 def test_unescape_html(self):
     # Pairs of (input, expected unescapeHTML output): the first is not an
     # entity and is expected unchanged, the second is a named entity.
     for raw, expected in (("%20;", "%20;"), ("&eacute;", "é")):
         self.assertEqual(unescapeHTML(raw), expected)
Example #17
0
 def test_unescape_html(self):
     # Table of (input, expected unescapeHTML output), checked in order.
     cases = [
         # Not an entity: expected back unchanged.
         ("%20;", "%20;"),
         # Hexadecimal and decimal numeric references.
         ("&#x2F;", "/"),
         ("&#47;", "/"),
         # Named entity.
         ("&eacute;", "é"),
         # Numeric reference that is expected to be left as written.
         ("&#2013266066;", "&#2013266066;"),
     ]
     for raw, expected in cases:
         self.assertEqual(unescapeHTML(raw), expected)