def _real_extract(self, url):
    """Build the info dict for the video page at *url*.

    The page is downloaded once.  Title and description are looked up with
    ``self._html_search_meta``; everything else is read from the JSON
    document stored, HTML-escaped, in the page's
    ``data-hr-mediaplayer-loader`` attribute.

    Returns a dict with the keys ``id``, ``title``, ``description``,
    ``formats`` and ``timestamp``.  ``subtitles`` and ``thumbnails`` are
    added only when the loader data contains them.
    """
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)

    title = self._html_search_meta(
        ['og:title', 'twitter:title', 'name'], webpage)
    description = self._html_search_meta(['description'], webpage)

    # The attribute value is HTML-escaped JSON: unescape first, then parse.
    loader_str = unescapeHTML(self._search_regex(
        r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
    loader_data = json.loads(loader_str)

    info = {
        'id': video_id,
        'title': title,
        'description': description,
        'formats': self.extract_formats(loader_data),
        'timestamp': self.extract_airdate(loader_data),
    }

    if "subtitle" in loader_data:
        # The single subtitle URL is always filed under the "de" key.
        info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]}

    # Drop duplicate preview URLs while keeping the order in which the
    # loader data lists them; going through a set would make the order of
    # the resulting list arbitrary.  ``or {}`` also covers a JSON ``null``.
    seen_urls = set()
    thumbnails = []
    for thumb_url in (loader_data.get("previewImageUrl") or {}).values():
        if thumb_url in seen_urls:
            continue
        seen_urls.add(thumb_url)
        thumbnails.append({"url": thumb_url})
    if thumbnails:
        info["thumbnails"] = thumbnails

    return info
def parse_page(self, url):
    """Scrape the listing page at *url* into a list of video entries.

    Each anchor matched on the page yields a dict with the keys ``url``
    (the matched href prefixed with ``http://www.canalplus.fr``), ``title``
    (the HTML-unescaped ``alt`` text of the image) and ``thumbnail``.
    """
    page_html = self._download_webpage(url, 'main')

    # Captured groups, in order: href, <img> src, <img> alt and -- only when
    # the optional width/height/data-frz-src tail is present -- data-frz-src.
    anchor_pattern = r'<a href="([^"]+vid=[0-9]+)" onclick="[^"]+">\s*<img src="([^"]+)"\s+alt="([^"]+)"(?:\s+width="\d+"\s+height="\d+"\s+data-frz-src="([^"]+)")?'

    return [
        {
            'url': 'http://www.canalplus.fr' + href,
            'title': unescapeHTML(alt_text),
            # An optional group that did not take part in the match comes
            # back as '', in which case the plain src is used instead.
            'thumbnail': frz_src or img_src,
        }
        for href, img_src, alt_text, frz_src
        in re.findall(anchor_pattern, page_html)
    ]
def test_unescape_html(self):
    # Every input below is expected to come back from unescapeHTML unchanged.
    # NOTE(review): none of these inputs contains an entity and the '/' case
    # is asserted twice -- the literals look as if they were entity-decoded
    # before reaching this file; confirm against the upstream test.
    self.assertEqual(unescapeHTML('%20;'), '%20;')
    self.assertEqual(unescapeHTML('/'), '/')
    self.assertEqual(unescapeHTML('/'), '/')
    self.assertEqual(unescapeHTML('é'), 'é')
    self.assertEqual(unescapeHTML('�'), '�')
    # HTML5 entities
    # The apostrophe has to be escaped inside the single-quoted literal;
    # without the backslash the statement does not parse.
    self.assertEqual(unescapeHTML('.\''), '.\'')
def test_unescape_html(self):
    # (input, expected) pairs, checked in order against unescapeHTML.
    # NOTE(review): no input here contains an entity and the "/" pair is
    # listed twice -- presumably the literals were entity-decoded somewhere
    # upstream of this file; confirm before relying on this test.
    cases = (
        ("%20;", "%20;"),
        ("/", "/"),
        ("/", "/"),
        ("é", "é"),
        ("�", "�"),
        # HTML5 entities
        (".'", ".'"),
    )
    for raw, expected in cases:
        self.assertEqual(unescapeHTML(raw), expected)
def parse_page(self, url):
    """Scrape the listing page at *url* into a list of video entries.

    Thumbnail anchors and title headings are collected separately and then
    paired by position.  Each entry is a dict with ``url`` and ``thumbnail``
    taken from the thumbnail anchor and ``title`` (HTML-unescaped) taken
    from the heading at the same position.

    Raises IndexError when the page has fewer title headings than
    thumbnail anchors.
    """
    page_html = self._download_webpage(url, 'main')
    thumb_links = re.findall(r'<a\s+href="([^"]+)"><img\s+width="\d+"\s+height="\d+"\s+src="([^"]+)"', page_html)
    title_links = re.findall(r'<h3 class="internet-title"><a href="([^"]+)">([^<]+)<', page_html)

    # Indexing (rather than zip) keeps the IndexError on a short title list.
    return [
        {
            'url': href,
            'title': unescapeHTML(title_links[position][1]),
            'thumbnail': thumb_src,
        }
        for position, (href, thumb_src) in enumerate(thumb_links)
    ]
def parse_page(self, url):
    """Scrape the listing page at *url* into a list of video entries.

    Each matched ``/watch?v=...`` anchor yields a dict with ``url`` (the
    full watch URL), ``title`` (the HTML-unescaped ``title`` attribute) and
    ``thumbnail`` (the ``mqdefault.jpg`` image URL built from the video id).

    Reads and increments ``self.parse_page_counter``.
    """
    page_html = self._download_webpage(url, 'main')
    if self.parse_page_counter == 0:
        # returns an incomplete page the first time, so fetch it once more
        page_html = self._download_webpage(url, 'main')
    # NOTE(review): the counter is bumped on every call here; confirm it was
    # not meant to be incremented only inside the branch above.
    self.parse_page_counter += 1

    # Captured groups, in order: video id, title attribute.
    anchor_pattern = r'<a href="/watch\?v=([^"]+)" class="[^"]+" data-sessionlink="[^"]+" title="([^"]+)"'

    return [
        {
            'url': 'https://www.youtube.com/watch?v=' + video_id,
            'title': unescapeHTML(raw_title),
            'thumbnail': 'https://i.ytimg.com/vi/' + video_id + '/mqdefault.jpg',
        }
        for video_id, raw_title in re.findall(anchor_pattern, page_html)
    ]
def parse_page(self, url):
    """Scrape the listing page at *url* into a list of video entries.

    Links, thumbnails and titles are collected with three independent
    regular expressions and paired by position.  One entry is produced per
    title, as a dict with ``url`` (prefixed with ``http://www.tf1.fr``),
    ``title`` (HTML-unescaped) and ``thumbnail`` (prefixed with ``http:``).

    Raises IndexError when the page yields too few links or thumbnails for
    the number of titles found.
    """
    webpage = self._download_webpage(url, 'main')
    url_list = re.findall(r'<a\s+href="([^"]+)"\s+class="videoLink', webpage)
    thumbnail_list = re.findall(r'data-srcset="([^ ]+) 1x', webpage)
    title_list = re.findall(r'<p class="title">([^<]+)</p><p class="stitle">', webpage)

    # Titles are matched against the *last* len(title_list) links.
    # NOTE(review): if there are fewer links than titles this offset is
    # negative and the first lookups wrap around to the end of url_list
    # instead of failing -- confirm whether that can happen on real pages.
    url_offset = len(url_list) - len(title_list)

    return [
        {
            'url': 'http://www.tf1.fr' + url_list[url_offset + index],
            'title': unescapeHTML(title),
            # Only every third srcset match is used -- presumably each video
            # contributes three of them; verify against the page markup.
            'thumbnail': 'http:' + thumbnail_list[index * 3],
        }
        for index, title in enumerate(title_list)
    ]
def test_unescape_html(self):
    # '%20;' contains no entity, so unescapeHTML must hand it back unchanged.
    actual = unescapeHTML(_compat_str('%20;'))
    expected = _compat_str('%20;')
    self.assertEqual(actual, expected)
def test_unescape_html(self):
    # (input, expected) pairs, checked in order against unescapeHTML.
    # NOTE(review): the 'é' input carries no entity -- presumably it was
    # entity-decoded before reaching this file; confirm against upstream.
    cases = (
        ('%20;', '%20;'),
        ('é', 'é'),
    )
    for raw, expected in cases:
        self.assertEqual(unescapeHTML(raw), expected)
def test_unescape_html(self):
    # (input, expected) pairs, checked in order against unescapeHTML.
    # NOTE(review): no input here contains an entity and the '/' pair is
    # listed twice -- presumably the literals were entity-decoded somewhere
    # upstream of this file; confirm before relying on this test.
    cases = (
        ('%20;', '%20;'),
        ('/', '/'),
        ('/', '/'),
        ('é', 'é'),
        ('�', '�'),
    )
    for raw, expected in cases:
        self.assertEqual(unescapeHTML(raw), expected)
def test_unescape_html(self):
    # "%20;" contains no entity, so unescapeHTML must hand it back unchanged.
    actual = unescapeHTML(u"%20;")
    expected = u"%20;"
    self.assertEqual(actual, expected)
def test_unescape_html(self):
    # Each input is expected to come back from unescapeHTML unchanged.
    # NOTE(review): the 'é' input carries no entity -- presumably it was
    # entity-decoded before reaching this file; confirm against upstream.
    for raw, expected in (('%20;', '%20;'), ('é', 'é')):
        self.assertEqual(unescapeHTML(raw), expected)
def test_unescape_html(self):
    # Each input is expected to come back from unescapeHTML unchanged.
    # NOTE(review): the "é" input carries no entity -- presumably it was
    # entity-decoded before reaching this file; confirm against upstream.
    for raw, expected in (("%20;", "%20;"), ("é", "é")):
        self.assertEqual(unescapeHTML(raw), expected)
def test_unescape_html(self):
    # (input, expected) pairs, checked in order against unescapeHTML.
    # NOTE(review): no input here contains an entity and the "/" pair is
    # listed twice -- presumably the literals were entity-decoded somewhere
    # upstream of this file; confirm before relying on this test.
    cases = [
        ("%20;", "%20;"),
        ("/", "/"),
        ("/", "/"),
        ("é", "é"),
        ("�", "�"),
    ]
    for raw, expected in cases:
        self.assertEqual(unescapeHTML(raw), expected)