def episodios(item):
    """Build the episode list for an RTPA programme.

    Fetches the JSON feed in item.url (forcing a page size of 1000)
    and returns one playable Item per entry of the "VOD" array.
    """
    logger.info("tvalacarta.channels.rtpa episodios")
    itemlist = []

    # Ask the API for up to 1000 entries in a single request
    if "&fin=" not in item.url:
        item.url = item.url + "&fin=1000"

    data = scrapertools.cache_page(item.url)
    json_object = jsontools.load_json(data)

    for vod in json_object["VOD"]:
        logger.info("vod="+repr(vod))

        # Title: programme name, optionally episode title and air date
        title = vod["nombre_programa"]
        if vod["titulo"] != "":
            title += " - " + vod["titulo"]
        if vod["fecha_emision"] != "":
            title += " ("+scrapertools.htmlclean(vod["fecha_emision"])+")"

        url = "http://www.rtpa.es/video:"+urllib.quote(vod["nombre_programa"])+"_"+vod["id_generado"]+".html"

        # Thumbnail may be missing; normalise slashes and restore the scheme
        try:
            thumbnail = urllib.quote(vod["url_imagen"]).replace("//","/").replace("http%3A/","http://")
        except:
            thumbnail = ""

        aired_date = scrapertools.parse_date(vod["fecha_emision"])
        plot = scrapertools.htmlclean(vod["sinopsis"])

        itemlist.append(Item(channel=CHANNELNAME, title=title, url=url, thumbnail=thumbnail,
                             plot=plot, fanart=thumbnail, server="rtpa", action="play",
                             show=item.show, viewmode="movie_with_plot",
                             aired_date=aired_date, folder=False))

    return itemlist
def episodios(item):
    """List episodes from a Canal Extremadura a-la-carta page.

    Each episode lives in a "modal-video-ajax" <div> whose data-*
    attributes carry the title, page url, thumbnail and a direct
    media url (stored in extra for the extremaduratv server).
    """
    logger.info("extremaduratv.episodios")
    itemlist = []

    data = scrapertools.cachePage(item.url)

    # One match per modal <div>, up to the closing synopsis blockquote
    bloques = re.findall('<div class="modal-video-ajax(.*?</blockquote>)', data, re.DOTALL)
    for bloque in bloques:
        title = scrapertools.find_single_match(bloque, 'data-video-titulo-modal="([^"]+)"')
        url = urlparse.urljoin(item.url, scrapertools.find_single_match(bloque, 'data-video-url="([^"]+)"'))
        thumbnail = urlparse.urljoin(item.url, scrapertools.find_single_match(bloque, 'data-video-imagen-modal="([^"]+)"'))
        plot = scrapertools.find_single_match(bloque, '<blockquote class="nomargin">(.*?)</blockquote>').strip()
        # The title usually embeds the broadcast date
        aired_date = scrapertools.parse_date(title)
        extra = urlparse.urljoin(item.url, scrapertools.find_single_match(bloque, 'data-video-video-modal="([^"]+)"'))

        itemlist.append(Item(channel=CHANNELNAME, title=title, action="play", server="extremaduratv",
                             plot=plot, url=url, thumbnail=thumbnail, fanart=thumbnail,
                             show=item.show, aired_date=aired_date, extra=extra,
                             view="videos", folder=False))

    return itemlist
def detalle_episodio(item): data = scrapertools.cache_page(item.url) scrapedplot = scrapertools.find_single_match( data, '<meta content="([^"]+)" property="og\:description"') item.plot = scrapertools.htmlclean(scrapedplot).strip() scrapedthumbnail = scrapertools.find_single_match( data, '<meta content="([^"]+)" property="og\:image"') item.thumbnail = scrapedthumbnail.strip() scrapeddate = scrapertools.find_single_match( data, '<span class="date">([^<]+)</span>') item.aired_date = scrapertools.parse_date(scrapeddate.strip()) item.duration = scrapertools.find_single_match( data, '<span class="duration">([^<]+)</span>') item.geolocked = "0" try: from servers import xiptv as servermodule video_urls = servermodule.get_video_url(item.url) item.media_url = video_urls[0][1] except: import traceback print traceback.format_exc() item.media_url = "" return item
def get_episodes(episodes, category, items):
    """Append one playable Item per episode dict to *items* (mutated in place).

    Optional metadata (synopsis, duration, creation_date) defaults to
    ""/None when absent from the episode's "info" dict.
    """
    logger.info("[mitele.py] get_episodes")
    for episode in episodes:
        info = episode["info"]
        title = "%s - %s" % (episode["subtitle"], episode["title"])
        thumbnail = episode["images"]["thumbnail"]["src"]
        url = "https://www.mitele.es" + episode["link"]["href"]

        plot = info.get("synopsis", "")
        duration = info.get("duration")
        creation_date = info.get("creation_date")
        aired_date = scrapertools.parse_date(creation_date) if creation_date else None

        items.append(Item(channel=CHANNEL, server=CHANNEL, action="play", title=title,
                          url=url, thumbnail=thumbnail, category=category, plot=plot,
                          duration=duration, aired_date=aired_date, folder=False))
def detalle_episodio(item): logger.info("tvalacarta.rtvcm.detalle_episodio") idvideo = scrapertools.find_single_match(item.url,"video-(\d+)$") url = "http://api.rtvcm.webtv.flumotion.com/pods/"+idvideo+"?extended=true" data = scrapertools.cache_page(url) try: json_object = jsontools.load_json(data) item.thumbnail = json_object["video_image_url"].split("?")[0] item.geolocked = "0" item.duration = scrapertools.parse_duration_secs( json_object["duration"] ) item.aired_date = scrapertools.parse_date(item.title) from servers import rtvcm as servermodule video_urls = servermodule.get_video_url(item.url) item.media_url = video_urls[0][1] except: import traceback print traceback.format_exc() item.media_url = "" return item
def detalle_episodio(item): data = scrapertools.cache_page(item.url) item.plot = scrapertools.htmlclean( scrapertools.find_single_match( data, '<meta content="([^"]+)" itemprop="description')).strip() item.thumbnail = scrapertools.find_single_match( data, '<meta content="([^"]+)" itemprop="thumbnailUrl') #<meta content="miércoles, 16 de septiembre de 2015 3:30" itemprop="datePublished" scrapeddate = scrapertools.find_single_match( data, '<meta content="([^"]+)" itemprop="datePublished') item.aired_date = scrapertools.parse_date(scrapeddate) item.geolocked = "0" media_item = play(item) try: item.media_url = media_item[0].url.replace("\\", "/") except: import traceback print traceback.format_exc() item.media_url = "" return item
def detalle_episodio(item): # Saca de conectate la duración y fecha rec_id = scrapertools.find_single_match(item.url,"videos/(\d+)") data = scrapertools.cache_page("http://www.conectate.gob.ar/sitios/conectate/busqueda/buscar?rec_id="+rec_id) scrapeddate = scrapertools.find_single_match(data,'"fecha_creacion"\:"([^"]+)"') if scrapeddate=="": scrapeddate = scrapertools.find_single_match(data,'"fecha"\:"([^"]+)"') item.aired_date = scrapertools.parse_date(scrapeddate.replace("\\/","/")) scrapedduration = scrapertools.find_single_match(data,'"duracion_segundos":"(\d+)"') item.duration = scrapertools.parse_duration_secs(scrapedduration) # Ahora saca de PakaPaka la URL data = scrapertools.cache_page(item.url) item.geolocked = "0" try: from servers import pakapaka as servermodule video_urls = servermodule.get_video_url(item.url) item.media_url = video_urls[0][1] except: import traceback print traceback.format_exc() item.media_url = "" return item
def episodios(item):
    """List previously-aired programmes from a UPV-TV programme page."""
    logger.info("tvalacarta.channels.upvtv episodios")
    itemlist = []

    data = scrapertools.cachePage(item.url)
    # Restrict parsing to the "Programas anteriores" list
    data = scrapertools.find_single_match(data, '<h1>Programas anteriores(.*?)</ul')

    patron = '<li[^<]+'
    patron += '<span class="enlace"><a href="([^"]+)" >([^<]+)</a>'
    for scrapedurl, scrapedtitle in re.compile(patron, re.DOTALL).findall(data):
        # Collapse whitespace and recode from latin-1 to utf-8
        title = re.compile("\s+", re.DOTALL).sub(" ", scrapedtitle.replace("\n", " "))
        title = title.decode('iso-8859-1').encode("utf8", "ignore")
        url = urlparse.urljoin(item.url, scrapedurl)
        thumbnail = ""
        plot = ""
        aired_date = scrapertools.parse_date(title)

        if (DEBUG):
            logger.info("title=["+title+"], url=["+url+"], thumbnail=["+thumbnail+"]")

        itemlist.append(Item(channel=CHANNELNAME, title=title, action="play", server="upvtv",
                             url=url, thumbnail=thumbnail, plot=plot, show=item.show,
                             fanart=thumbnail, aired_date=aired_date, folder=False))

    return itemlist
def episodios(item):
    """Build the episode list for an adn40 programme feed.

    item.url points to a JSON document with a "video" array; one playable
    Item is returned per entry.
    """
    logger.info("tvalacarta.channels.adn40 episodios")
    itemlist = []

    # Download and parse the JSON feed
    data = scrapertools.cache_page(item.url)
    json_data = jsontools.load_json(data)

    for json_item in json_data["video"]:
        title = json_item["title"]
        url = json_item["link"]
        thumbnail = json_item["image"]
        plot = json_item["teaser"]
        aired_date = scrapertools.parse_date(json_item["date"])

        # FIX: aired_date was computed but never stored on the Item,
        # so episodes lost their broadcast date downstream.
        itemlist.append(
            Item(channel=CHANNELNAME, title=title, url=url, thumbnail=thumbnail,
                 plot=plot, action="play", server="adn40", show=item.show,
                 aired_date=aired_date, folder=False))

    return itemlist
def detalle_episodio(item):
    """Complete an episode Item: the media url was already stored in
    item.extra at listing time; the air date is parsed from the title."""
    item.geolocked = "0"
    item.media_url = item.extra
    item.aired_date = scrapertools.parse_date(item.title)
    return item
def detalle_episodio(item): # Saca de conectate la duración y fecha rec_id = scrapertools.find_single_match(item.url, "videos/(\d+)") data = scrapertools.cache_page( "http://www.conectate.gob.ar/sitios/conectate/busqueda/buscar?rec_id=" + rec_id) scrapeddate = scrapertools.find_single_match( data, '"fecha_creacion"\:"([^"]+)"') if scrapeddate == "": scrapeddate = scrapertools.find_single_match(data, '"fecha"\:"([^"]+)"') item.aired_date = scrapertools.parse_date(scrapeddate.replace("\\/", "/")) scrapedduration = scrapertools.find_single_match( data, '"duracion_segundos":"(\d+)"') item.duration = scrapertools.parse_duration_secs(scrapedduration) # Ahora saca de PakaPaka la URL data = scrapertools.cache_page(item.url) item.geolocked = "0" try: from servers import pakapaka as servermodule video_urls = servermodule.get_video_url(item.url) item.media_url = video_urls[0][1] except: import traceback print traceback.format_exc() item.media_url = "" return item
def detalle_episodio(item): logger.info("tvalacarta.rtvcm.detalle_episodio") idvideo = scrapertools.find_single_match(item.url, "video-(\d+)$") url = "http://api.rtvcm.webtv.flumotion.com/pods/" + idvideo + "?extended=true" data = scrapertools.cache_page(url) try: json_object = jsontools.load_json(data) item.thumbnail = json_object["video_image_url"].split("?")[0] item.geolocked = "0" item.duration = scrapertools.parse_duration_secs( json_object["duration"]) item.aired_date = scrapertools.parse_date(item.title) from servers import rtvcm as servermodule video_urls = servermodule.get_video_url(item.url) item.media_url = video_urls[0][1] except: import traceback print traceback.format_exc() item.media_url = "" return item
def detalle_episodio(item): data = scrapertools.cache_page(item.url) scrapedplot = scrapertools.find_single_match(data,'<meta content="([^"]+)" property="og\:description"') item.plot = scrapertools.htmlclean( scrapedplot ).strip() scrapedthumbnail = scrapertools.find_single_match(data,'<meta content="([^"]+)" property="og\:image"') item.thumbnail = scrapedthumbnail.strip() scrapeddate = scrapertools.find_single_match(data,'<span class="date">([^<]+)</span>') item.aired_date = scrapertools.parse_date( scrapeddate.strip() ) item.duration = scrapertools.find_single_match(data,'<span class="duration">([^<]+)</span>') item.geolocked = "0" try: from servers import xiptv as servermodule video_urls = servermodule.get_video_url(item.url) item.media_url = video_urls[0][1] except: import traceback print traceback.format_exc() item.media_url = "" return item
def detalle_episodio(item):
    """Fill an episode from the channel's XML descriptor and resolve
    the media url via play()."""
    data = play_get_xml_data(item.url)

    item.plot = scrapertools.find_single_match(data, "<introduction><\!\[CDATA\[(.*?)\]\]><")
    item.thumbnail = scrapertools.find_single_match(data, "<picture>([^<]+)<")
    item.aired_date = scrapertools.parse_date(
        scrapertools.find_single_match(data, "<publication_date>([^<]+)<"))
    # Fall back to a date embedded in the title
    if item.aired_date == "":
        item.aired_date = scrapertools.parse_date(item.title)
    item.geolocked = "0"

    # play() returns the available urls; keep the last one
    items = play(item, page_data=data)
    item.media_url = items[-1].url

    return item
def episodios(item):
    """List every episode of a 7TV Andalucía programme, grouped by the
    season accordions ("wrap-toggle") on the programme page."""
    logger.info("tvalacarta.channels.sietetvandalucia episodios")
    itemlist = []

    data = scrapertools.cache_page(item.url)

    # Each season is a collapsible block: label + <ul> of episode links
    patron_temporada = '<div class="wrap-toggle active"[^<]+'
    patron_temporada += '<div[^<]+'
    patron_temporada += '<input type="checkbox"[^<]+'
    patron_temporada += '<div class="plus"[^<]+</div[^<]+'
    patron_temporada += '<label[^>]+>([^<]+)</label[^<]+'
    patron_temporada += '<div class="answers"[^<]+'
    patron_temporada += '<ul(.*?)</ul'

    for season_title, season_body in scrapertools.find_multiple_matches(data, patron_temporada):
        # "Listado de episodios de la temporada 5" -> "Temporada 5"
        season_label = season_title.strip().replace("Listado de episodios de la ", "").capitalize()

        for scraped_url, scraped_title in scrapertools.find_multiple_matches(
                season_body, '<li><a href="([^"]+)[^>]+>([^<]+)</a>'):
            url = urlparse.urljoin(item.url, scraped_url)
            title = season_label + " " + scraped_title.strip()
            aired_date = scrapertools.parse_date(title)

            itemlist.append(Item(channel=CHANNELNAME, action="play", server="sietetvandalucia",
                                 title=title, show=item.show, url=url,
                                 aired_date=aired_date, folder=False))

    return itemlist
def episodios_bloque_izquierdo(item):
    """List episodes from the left-hand column of a Canal Extremadura
    programme page, plus a "next page" item when pagination exists."""
    logger.info("extremaduratv.episodios_bloque_izquierdo")
    itemlist = []

    data = scrapertools.cachePage(item.url)
    data = scrapertools.get_match(data, '<div class="contenedor-izq(.*?)<div class="contenedor-der')

    patron = '<li class="views-row[^<]+'
    patron += '<div class="views-field views-field-title"[^<]+'
    patron += '<span class="field-content"[^<]+'
    patron += '<a href="([^"]+)">([^<]+)</a>'
    for url, titulo in re.findall(patron, data, re.DOTALL):
        scrapedtitle = titulo.strip()
        scrapedurl = urlparse.urljoin(item.url, url)
        scrapedthumbnail = ""
        scrapedplot = ""
        # Try to extract the broadcast date embedded in the title
        aired_date = scrapertools.parse_date(scrapedtitle)

        if (DEBUG):
            logger.info("title=[" + scrapedtitle + "], url=[" + scrapedurl + "], thumbnail=[" + scrapedthumbnail + "]")

        itemlist.append(Item(channel=CHANNELNAME, title=scrapedtitle, action="play",
                             server="extremaduratv", url=scrapedurl, thumbnail=scrapedthumbnail,
                             show=item.show, aired_date=aired_date, folder=False))

    # Pagination, e.g. <li class="pager-next last"><a href="...?page=1"
    for url in re.findall('<li class="pager-next[^<]+<a href="([^"]+)"', data, re.DOTALL):
        itemlist.append(Item(channel=CHANNELNAME, title=">> Página siguiente",
                             action="episodios_bloque_izquierdo",
                             url=urlparse.urljoin(item.url, url),
                             show=item.show, extra=item.extra))

    return itemlist
def episodios(item, load_all_pages=True):
    """List episodes from a Navarra TV programme page; each episode links
    to a YouTube video whose id appears in the /yt/ path segment."""
    logger.info("tvalacarta.channels.navarratv episodios")
    itemlist = []

    data = scrapertools.cache_page(item.url)

    # Each episode card is a "Bloque2Noticias" div: thumbnail (inline css),
    # link, title and teaser paragraph.
    patron = '<div class="Bloque2Noticias"[^<]+'
    patron += '<div class="ImpactoBloque W50 H120 FranjaRoja[^<]+'
    patron += '<div class="ImpactoBloqueImagen[^<]+'
    patron += "<div class=\"ImpactoContenedorImagen\".*?url\('([^']+)'\)[^<]+</div[^<]+"
    patron += '</div[^<]+'
    patron += '<div class="ImpactoBloqueContenido[^<]+'
    patron += '<h2><a href="([^"]+)" class="TextoNeutro">([^<]+)</a></h2[^<]+'
    patron += '<p>([^<]*)</p>'

    for scrapedthumbnail, scrapedurl, scrapedtitle, scrapedplot in re.compile(patron, re.DOTALL).findall(data):
        thumbnail = urlparse.urljoin(item.url, scrapedthumbnail)
        # Build the canonical YouTube url from the /yt/<id>/ path segment
        yt_id = scrapertools.find_single_match(scrapedurl, "/yt/([^/]+)/")
        url = "https://www.youtube.com/watch?v=" + yt_id
        title = scrapertools.safe_unicode(scrapedtitle).encode("utf-8").strip()
        plot = scrapedplot.strip()
        aired_date = scrapertools.parse_date(title)

        itemlist.append(Item(channel=__channel__, action="play", server="navarratv",
                             title=title, url=url, thumbnail=thumbnail, fanart=thumbnail,
                             show=item.show, aired_date=aired_date, plot=plot, folder=False))

    return itemlist
def episodios(item, load_all_pages=False):
    """List episodes of a Montecarlo TV programme, with pagination."""
    logger.info("tvalacarta.channels.montecarlo episodios")
    itemlist = []

    data = scrapertools.cachePage(item.url)

    # Episode card: link + thumbnail, then date span and title div
    patron = '<div class="div_videos_contenedor_item"[^<]+'
    patron += '<div class="div_videos_imagen"[^<]+'
    patron += '<a href="([^"]+)"[^<]+'
    patron += '<img class="img-responsive" typeof="foaf:Image" src="([^"]+)"[^<]+'
    patron += '</a[^<]+'
    patron += '</div>[^<]+'
    patron += '<a[^<]+'
    patron += '<div class="div_videos_contenedor_descripcion[^<]+'
    patron += '<div class="div_videos_contenedor_descripcion[^<]+'
    patron += '<div class="div_videos_fecha[^<]+'
    patron += '<span class="date-display-single[^>]+>([^<]+)</span[^<]+'
    patron += '</div></div><div class="div_videos_titulo">([^<]+)<'

    for scrapedurl, scrapedthumbnail, scrapedfecha, scrapedtitle in re.findall(patron, data, re.DOTALL):
        title = scrapedtitle
        url = urlparse.urljoin(item.url, scrapedurl)
        thumbnail = urlparse.urljoin(item.url, scrapedthumbnail)
        plot = ""
        aired_date = scrapertools.parse_date(scrapedfecha)

        if (DEBUG):
            logger.info("title=["+title+"], url=["+url+"], thumbnail=["+thumbnail+"]")

        itemlist.append(Item(channel=CHANNELNAME, title=title, action="play", server="montecarlo",
                             url=url, thumbnail=thumbnail, plot=plot, show=item.show,
                             aired_date=aired_date, folder=False))

    # "siguiente" pager link
    next_page_url = scrapertools.find_single_match(data, '<a title="Ir a la p[^"]+" href="([^>]+)">siguiente')
    if next_page_url != "":
        itemlist.append(Item(channel=CHANNELNAME, title=">> Página siguiente", action="episodios",
                             url=urlparse.urljoin(item.url, next_page_url), show=item.show))

    return itemlist
def episodios(item):
    """Scrape the episode modals of a Canal Extremadura a-la-carta page."""
    logger.info("extremaduratv.episodios")
    itemlist = []

    page = scrapertools.cachePage(item.url)

    # data-* attributes of each modal carry all the episode metadata
    for modal in re.findall('<div class="modal-video-ajax(.*?</blockquote>)', page, re.DOTALL):
        titulo = scrapertools.find_single_match(modal, 'data-video-titulo-modal="([^"]+)"')
        pagina = urlparse.urljoin(item.url, scrapertools.find_single_match(modal, 'data-video-url="([^"]+)"'))
        imagen = urlparse.urljoin(item.url, scrapertools.find_single_match(modal, 'data-video-imagen-modal="([^"]+)"'))
        sinopsis = scrapertools.find_single_match(modal, '<blockquote class="nomargin">(.*?)</blockquote>').strip()
        emision = scrapertools.parse_date(titulo)
        # Direct media url, resolved later by the extremaduratv server
        media = urlparse.urljoin(item.url, scrapertools.find_single_match(modal, 'data-video-video-modal="([^"]+)"'))

        itemlist.append(Item(channel=CHANNELNAME, title=titulo, action="play",
                             server="extremaduratv", plot=sinopsis, url=pagina,
                             thumbnail=imagen, fanart=imagen, show=item.show,
                             aired_date=emision, extra=media, view="videos", folder=False))

    return itemlist
def episodios(item):
    """Return the list of episodes for an RTPA programme JSON feed."""
    logger.info("tvalacarta.channels.rtpa episodios")
    itemlist = []

    # Request up to 1000 results in one go
    if "&fin=" not in item.url:
        item.url = item.url + "&fin=1000"

    json_object = jsontools.load_json(scrapertools.cache_page(item.url))

    for entry in json_object["VOD"]:
        logger.info("vod=" + repr(entry))

        # Compose "programme - episode (date)" as available
        titulo = entry["nombre_programa"]
        if entry["titulo"] != "":
            titulo = titulo + " - " + entry["titulo"]
        if entry["fecha_emision"] != "":
            titulo = titulo + " (" + scrapertools.htmlclean(entry["fecha_emision"]) + ")"

        video_url = "http://www.rtpa.es/video:" + urllib.quote(entry["nombre_programa"]) + "_" + entry["id_generado"] + ".html"

        # Thumbnail may be missing; normalise slashes and restore the scheme
        try:
            imagen = urllib.quote(entry["url_imagen"]).replace("//", "/").replace("http%3A/", "http://")
        except:
            imagen = ""

        itemlist.append(
            Item(channel=CHANNELNAME, title=titulo, url=video_url, thumbnail=imagen,
                 plot=scrapertools.htmlclean(entry["sinopsis"]), fanart=imagen,
                 server="rtpa", action="play", show=item.show,
                 viewmode="movie_with_plot",
                 aired_date=scrapertools.parse_date(entry["fecha_emision"]),
                 folder=False))

    return itemlist
def episodios_bloque_derecho(item, load_all_pages=False):
    """List episodes from the right-hand block of a Canal Extremadura page.

    When load_all_pages is True, pagination is followed inline (via
    episodios()); otherwise a ">> Página siguiente" item is appended.
    """
    logger.info("extremaduratv.episodios_bloque_derecho")
    itemlist = []

    data = scrapertools.cachePage(item.url)

    # Thumbnail link followed by the title span
    patron = '<a href="([^"]+)"[^<]+'
    patron += '<img src="([^"]+)"[^<]+'
    patron += '</a></div[^<]+</div[^<]+'
    patron += '<div class="views-field views-field-title"[^<]+'
    patron += '<span class="field-content">([^<]+)</span>'

    for url, thumbnail, titulo in re.findall(patron, data, re.DOTALL):
        scrapedtitle = titulo.strip()
        scrapedurl = urlparse.urljoin(item.url, url)
        scrapedthumbnail = thumbnail
        scrapedplot = ""
        # Try to extract the broadcast date from the title
        aired_date = scrapertools.parse_date(scrapedtitle)

        if (DEBUG):
            logger.info("title=["+scrapedtitle+"], url=["+scrapedurl+"], thumbnail=["+scrapedthumbnail+"]")

        itemlist.append(Item(channel=CHANNELNAME, title=scrapedtitle, action="play",
                             server="extremaduratv", url=scrapedurl, thumbnail=scrapedthumbnail,
                             show=item.show, aired_date=aired_date, folder=False))

    # Pagination: follow inline or append a "next page" item
    next_page_url = scrapertools.find_single_match(data, 'href="([^"]+)">siguiente')
    if next_page_url != "":
        next_page_url = urlparse.urljoin(item.url, next_page_url)
        next_page_item = Item(channel=CHANNELNAME, title=">> Página siguiente", action="episodios",
                              url=next_page_url, show=item.show, extra=item.extra)
        if load_all_pages:
            itemlist.extend(episodios(next_page_item, load_all_pages))
        else:
            itemlist.append(next_page_item)

    return itemlist
def episodios(item):
    """List episodes of a Canal Extremadura programme (modal-video divs),
    skipping the repeated featured videos on paginated requests and
    appending a ">> Página siguiente" item when a pager link exists."""
    logger.info("tvalacarta.channels.extremaduratv.episodios")
    itemlist = []

    data = scrapertools.cachePage(item.url)

    # One block per episode modal, up to its closing bar
    patron = '<div class="modal-video-ajax(.*?<div class="barra-cerrar-modal)'
    matches = re.findall(patron, data, re.DOTALL)

    # Pages after the first repeat the two featured videos: skip them
    if "?page" in item.url:
        saltar = 2
    else:
        saltar = 0

    for match in matches:
        if saltar > 0:
            saltar = saltar - 1
            continue

        title = scrapertools.find_single_match(match, 'data-video-titulo-modal="([^"]+)"')
        url = urlparse.urljoin(item.url, scrapertools.find_single_match(match, 'data-video-url="([^"]+)"'))
        thumbnail = urlparse.urljoin(item.url, scrapertools.find_single_match(match, 'data-video-imagen-modal="([^"]+)"'))
        plot = scrapertools.find_single_match(match, '<blockquote class="nomargin">(.*?)</blockquote>').strip()
        aired_date = scrapertools.parse_date(title)
        extra = urlparse.urljoin(item.url, scrapertools.find_single_match(match, 'data-video-video-modal="([^"]+)"'))

        itemlist.append(Item(channel=CHANNELNAME, title=title, action="play", server="extremaduratv",
                             plot=plot, url=url, thumbnail=thumbnail, fanart=thumbnail,
                             show=item.show, aired_date=aired_date, extra=extra,
                             view="videos", folder=False))

    if len(itemlist) > 0:
        next_page_url = scrapertools.find_single_match(data, '<li class="pager-next"><a title="[^"]+" href="([^"]+)"')
        # FIX: only add the pager item when a next-page link actually exists.
        # Previously an empty match made urljoin() return the current url,
        # producing a ">> Página siguiente" item that repeated the same page
        # forever on the last page.
        if next_page_url != "":
            next_page_url = urlparse.urljoin(item.url, next_page_url)
            itemlist.append(Item(channel=CHANNELNAME, title=">> Página siguiente",
                                 action="episodios", url=next_page_url))

    return itemlist
def episodios(item):
    """List episodes of an RTVE programme via the /videos.json API."""
    logger.info("tvalacarta.channels.rtve_api episodios")
    itemlist = []

    data = scrapertools.cache_page(item.url + "/videos.json")
    json_object = jsontools.load_json(data)

    for json_item in json_object["page"]["items"]:
        title = json_item["longTitle"]
        url = json_item["uri"]
        thumbnail = json_item["imageSEO"]
        if json_item["description"] is not None:
            plot = scrapertools.htmlclean(json_item["description"])
        else:
            plot = ""
        fanart = item.fanart
        page = json_item["htmlUrl"]
        aired_date = scrapertools.parse_date(json_item["publicationDate"])

        # Duration comes in milliseconds; format as [h:]m:s
        ms = json_item["duration"]
        if ms is None:
            duration = ""
        else:
            x = ms / 1000
            seconds = x % 60
            x /= 60
            minutes = x % 60
            x /= 60
            hours = x % 24
            if hours > 0:
                duration = str(hours)+":"+str(minutes)+":"+str(seconds)
            else:
                duration = str(minutes)+":"+str(seconds)

        if (DEBUG):
            logger.info(" title=["+repr(title)+"], url=["+repr(url)+"], thumbnail=["+repr(thumbnail)+"] plot=["+repr(plot)+"]")

        itemlist.append(Item(channel="rtve", title=title, action="play", server="rtve",
                             page=page, url=url, thumbnail=thumbnail, fanart=thumbnail,
                             show=item.show, plot=plot, duration=duration,
                             aired_date=aired_date, viewmode="movie_with_plot", folder=False))

    # Extra per-series options entry on XBMC/Kodi
    from core import config
    if config.is_xbmc() and len(itemlist) > 0:
        itemlist.append(Item(channel=item.channel, title=">> Opciones para esta serie",
                             url=item.url, action="serie_options##episodios",
                             thumbnail=item.thumbnail, show=item.show, folder=False))

    return itemlist
def episodios(item):
    """List episodes of a Clan TV programme via the RTVE /videos.json API."""
    logger.info("tvalacarta.channels.clantv episodios")
    itemlist = []

    feed = jsontools.load_json(scrapertools.cache_page(item.url + "/videos.json"))

    for entry in feed["page"]["items"]:
        title = entry["longTitle"]
        url = entry["uri"]
        thumbnail = entry["imageSEO"]
        if entry["description"] is not None:
            plot = scrapertools.htmlclean(entry["description"])
        else:
            plot = ""
        fanart = item.fanart
        page = entry["htmlUrl"]
        aired_date = scrapertools.parse_date(entry["publicationDate"])

        # Duration comes in milliseconds; format as [h:]m:s
        ms = entry["duration"]
        if ms is None:
            duration = ""
        else:
            x = ms / 1000
            seconds = x % 60
            x /= 60
            minutes = x % 60
            x /= 60
            hours = x % 24
            if hours > 0:
                duration = str(hours)+":"+str(minutes)+":"+str(seconds)
            else:
                duration = str(minutes)+":"+str(seconds)

        if (DEBUG):
            logger.info(" title=["+repr(title)+"], url=["+repr(url)+"], thumbnail=["+repr(thumbnail)+"] plot=["+repr(plot)+"]")

        # Items are played through the shared "rtve" channel/server
        itemlist.append(Item(channel="rtve", title=title, action="play", server="rtve",
                             page=page, url=url, thumbnail=thumbnail, fanart=thumbnail,
                             show=item.show, plot=plot, duration=duration,
                             aired_date=aired_date, viewmode="movie_with_plot", folder=False))

    from core import config
    if config.is_xbmc() and len(itemlist) > 0:
        itemlist.append(Item(channel=item.channel, title=">> Opciones para esta serie",
                             url=item.url, action="serie_options##episodios",
                             thumbnail=item.thumbnail, show=item.show, folder=False))

    return itemlist
def detalle_episodio(item): item.geolocked = "0" if item.aired_date == "": item.aired_date = scrapertools.parse_date(item.title,"mdy") try: from servers import cntv as servermodule video_urls = servermodule.get_video_url(item.url) item.media_url = video_urls[0][1] except: import traceback print traceback.format_exc() item.media_url = "" return item
def detalle_episodio(item): item.geolocked = "0" if item.aired_date == "": item.aired_date = scrapertools.parse_date(item.title, "mdy") try: from servers import cntv as servermodule video_urls = servermodule.get_video_url(item.url) item.media_url = video_urls[0][1] except: import traceback print traceback.format_exc() item.media_url = "" return item
def episodios(item):
    """List episodes of a CMM (RTVCM) programme, with pagination."""
    logger.info("tvalacarta.rtvcm.episodios")
    itemlist = []

    data = scrapertools.cache_page(item.url)

    # One <article> per episode: poster, play link, date, title, teaser
    patron = '<article[^<]+'
    patron += '<figure[^<]+'
    patron += '<img src="([^"]+)" alt="([^"]+)"[^<]+'
    patron += '<a class="icon-play-circle" href="([^"]+)"[^<]+<span[^<]+</span></a[^<]+'
    patron += '</figure[^<]+'
    patron += '<p class="date"><time>([^<]+)</time></p[^<]+'
    patron += '<h3><a[^<]+</a></h3[^<]+'
    patron += '<p>([^<]+)</p>'

    for scrapedthumbnail, scrapedtitle, scrapedurl, fecha, scrapedplot in re.compile(patron, re.DOTALL).findall(data):
        thumbnail = urlparse.urljoin(item.url, scrapedthumbnail)
        url = urlparse.urljoin(item.url, scrapedurl)
        title = scrapedtitle + " " + fecha
        plot = scrapedplot
        aired_date = scrapertools.parse_date(fecha)

        itemlist.append(Item(channel=__channel__, title=title, url=url, plot=plot,
                             thumbnail=thumbnail, fanart=thumbnail, action="play", server="rtvcm",
                             show=item.title, aired_date=aired_date, folder=False))

    # "Siguiente" pager link
    next_page_url = scrapertools.find_single_match(data, '<a href="([^"]+)" aria-label="Siguiente">')
    if next_page_url != "":
        itemlist.append(Item(channel=__channel__, action="episodios", title=">> Página siguiente",
                             url=urlparse.urljoin(item.url, next_page_url), folder=True))

    return itemlist
def detalle_episodio(item):
    """Scrape title, aired date and plot from the episode page and resolve its media URL."""
    page = scrapertools.cache_page(item.url)

    item.title = scrapertools.find_single_match(page, '<p id="title">([^<]+)</p>')
    # The emission date is embedded in the freshly scraped title
    item.aired_date = scrapertools.parse_date(item.title)
    item.plot = scrapertools.find_single_match(page, '<p id="desp">([^<]+)</p>')
    # Never geoblocked for this channel
    item.geolocked = "0"

    try:
        from servers import onceninos as servermodule
        candidatos = servermodule.get_video_url(item.url, page_data=page)
        item.media_url = candidatos[0][1]
    except:
        # Best effort: log and fall back to an empty media URL
        import traceback
        print(traceback.format_exc())
        item.media_url = ""

    return item
def episodios(item):
    """Build the episode list for an adn40 show from its JSON feed.

    Parses each entry of json_data["video"] into a playable Item.
    Returns the list of Items (empty when the feed has no videos).
    """
    logger.info("tvalacarta.channels.adn40 episodios")
    itemlist = []

    # Download and parse the JSON feed for this show
    data = scrapertools.cache_page(item.url)
    json_data = jsontools.load_json(data)

    for json_item in json_data["video"]:
        title = json_item["title"]
        url = json_item["link"]
        thumbnail = json_item["image"]
        plot = json_item["teaser"]
        aired_date = scrapertools.parse_date(json_item["date"])
        # FIX: aired_date was computed but never passed into the Item,
        # unlike every other channel's episodios() in this project.
        itemlist.append(Item(channel=CHANNELNAME, title=title, url=url, thumbnail=thumbnail,
                             plot=plot, action="play", server="adn40", show=item.show,
                             aired_date=aired_date, folder=False))

    return itemlist
def detalle_episodio(item):
    """Fill in playback details for a RTVCM episode.

    Marks the item as not geoblocked, derives the aired date from the title
    and resolves the media URL via the rtvcm server module. On any failure
    the media URL is left empty.
    """
    logger.info("tvalacarta.rtvcm.detalle_episodio")
    data = scrapertools.cache_page(item.url)
    try:
        # NOTE(review): json_object is never used afterwards; it seems to act
        # only as a sanity check that the page is valid JSON (a parse error
        # jumps to the except branch) -- confirm before removing.
        json_object = jsontools.load_json(data)
        item.geolocked = "0"
        item.aired_date = scrapertools.parse_date(item.title)
        from servers import rtvcm as servermodule
        video_urls = servermodule.get_video_url(item.url)
        # get_video_url returns (label, url) pairs; keep the first URL
        item.media_url = video_urls[0][1]
    except:
        # Best effort: log the traceback and degrade to an empty media URL
        import traceback
        print traceback.format_exc()
        item.media_url = ""
    return item
def detalle_episodio(item):
    """Complete an Aragón TV episode item: plot, title, aired date and media URL."""
    page = scrapertools.cache_page(item.url)

    resumen = scrapertools.find_single_match(page, '<span class="title">Resumen del v[^>]+</span>(.*?)</div>')
    item.plot = scrapertools.htmlclean(resumen).strip()
    item.title = scrapertools.find_single_match(page, '<span class="activo"><strong>([^<]+)</strong></span>')
    # The scraped title carries the emission date
    item.aired_date = scrapertools.parse_date(item.title)
    # Not geoblocked for this channel
    item.geolocked = "0"

    try:
        from servers import aragontv as servermodule
        enlaces = servermodule.get_video_url(item.url)
        item.media_url = enlaces[0][1]
    except:
        # Log and degrade gracefully to an empty media URL
        import traceback
        print(traceback.format_exc())
        item.media_url = ""

    return item
def detalle_episodio(item):
    """Extract metadata from the episode page's meta tags and resolve the media URL via play()."""
    page = scrapertools.cache_page(item.url)

    descripcion = scrapertools.find_single_match(page, '<meta content="([^"]+)" itemprop="description')
    item.plot = scrapertools.htmlclean(descripcion).strip()
    item.thumbnail = scrapertools.find_single_match(page, '<meta content="([^"]+)" itemprop="thumbnailUrl')
    # e.g. <meta content="miércoles, 16 de septiembre de 2015 3:30" itemprop="datePublished"
    scrapeddate = scrapertools.find_single_match(page, '<meta content="([^"]+)" itemprop="datePublished')
    item.aired_date = scrapertools.parse_date(scrapeddate)
    item.geolocked = "0"

    # Delegate URL resolution to this channel's play() and keep the first result
    media_item = play(item)
    try:
        item.media_url = media_item[0].url.replace("\\", "/")
    except:
        import traceback
        print(traceback.format_exc())
        item.media_url = ""

    return item
def episodios_bloque_izquierdo(item):
    """Parse the episodes listed in the left-hand column of an extremaduratv page."""
    logger.info("extremaduratv.episodios_bloque_izquierdo")
    itemlist = []

    # Download the page and keep only the left-hand column markup
    data = scrapertools.cachePage(item.url)
    data = scrapertools.get_match(data, '<div class="contenedor-izq(.*?)<div class="contenedor-der')

    # Every episode row: <li class="views-row..."> ... <a href="...">title</a>
    patron = '<li class="views-row[^<]+'
    patron += '<div class="views-field views-field-title"[^<]+'
    patron += '<span class="field-content"[^<]+'
    patron += '<a href="([^"]+)">([^<]+)</a>'
    for enlace, nombre in re.findall(patron, data, re.DOTALL):
        titulo = nombre.strip()
        direccion = urlparse.urljoin(item.url, enlace)
        # Try to pull the emission date out of the title text
        fecha = scrapertools.parse_date(titulo)
        if (DEBUG):
            logger.info("title=[" + titulo + "], url=[" + direccion + "], thumbnail=[]")
        itemlist.append(Item(channel=CHANNELNAME, title=titulo, action="play", server="extremaduratv",
                             url=direccion, thumbnail="", show=item.show, aired_date=fecha,
                             folder=False))

    # Pagination link: <li class="pager-next..."><a href="...">
    patron = '<li class="pager-next[^<]+<a href="([^"]+)"'
    for enlace in re.findall(patron, data, re.DOTALL):
        itemlist.append(Item(channel=CHANNELNAME, title=">> Página siguiente",
                             action="episodios_bloque_izquierdo",
                             url=urlparse.urljoin(item.url, enlace),
                             show=item.show, extra=item.extra))

    return itemlist
def episodios(item):
    """List extremaduratv episodes: highlighted cards (first page only) plus grid cards.

    The two card layouts differ only in their wrapping <div> and whether the
    title sits in an <h3> or an <h4>; both are parsed by _extremaduratv_cards
    instead of two copy-pasted loops. Appends a pagination item when any
    episode was found.
    """
    logger.info("tvalacarta.channels.extremaduratv.episodios")
    itemlist = []

    # Download the page
    data = scrapertools.cachePage(item.url)

    # On the first page, also parse the highlighted episodes (h3 titles)
    if "?page" not in item.url:
        patron = '<div class="ipost clearfix">(.*?<li><i class="icon-calendar3"></i[^<]+<span class="date-display-single">[^<]+</span>)'
        itemlist.extend(_extremaduratv_cards(item, data, patron, "h3"))

    # Regular grid cards (h4 titles)
    patron = '<div class="col-md-4 col-sm-4 col-xs-6">(.*?<li><i class="icon-calendar3"></i[^<]+<span class="date-display-single">[^<]+</span>)'
    itemlist.extend(_extremaduratv_cards(item, data, patron, "h4"))

    # Pagination, only when something was parsed
    if len(itemlist) > 0:
        next_page_url = scrapertools.find_single_match(
            data, '<li class="pager-next"><a title="[^"]+" href="([^"]+)"')
        next_page_url = urlparse.urljoin(item.url, next_page_url)
        itemlist.append(Item(channel=CHANNELNAME, title=">> Página siguiente",
                             action="episodios", url=next_page_url))

    return itemlist


def _extremaduratv_cards(item, data, patron, title_tag):
    """Parse one block of episode cards; title_tag is 'h3' or 'h4'.

    Returns a list of playable Items. The aired date comes from the
    date-display-single span, falling back to a date parsed from the title.
    """
    parsed = []
    matches = re.findall(patron, data, re.DOTALL)
    for match in matches:
        title = scrapertools.find_single_match(
            match, '<' + title_tag + '[^>]+>([^<]+)</' + title_tag + '>').strip()
        url = urlparse.urljoin(
            item.url, scrapertools.find_single_match(match, '<a href="([^"]+)"'))
        thumbnail = urlparse.urljoin(
            item.url,
            scrapertools.find_single_match(match, '<img class="image_fade" src="([^"]+)"'))
        aired_date = scrapertools.find_single_match(
            match, '<span class="date-display-single">([^<]+)</span>')
        aired_date = scrapertools.parse_date(aired_date).strip()
        if aired_date == "":
            aired_date = scrapertools.parse_date(title).strip()
        if title != "":
            parsed.append(Item(channel=CHANNELNAME, title=title, action="play",
                               server="extremaduratv", plot="", url=url,
                               thumbnail=thumbnail, fanart=thumbnail, show=item.show,
                               aired_date=aired_date, view="videos", folder=False))
    return parsed
def episodios(item):
    """List Deutsche Welle (Spanish) episodes via the mediafilter JSON-less feed.

    First call: scrape the program id from the show page and build the
    mediafilter URL at pagenumber=1. Subsequent calls receive the mediafilter
    URL directly (it contains "pagenumber="). Pagination is done by rewriting
    the pagenumber query parameter.
    """
    logger.info("tvalacarta.channels.dwspan episodios")
    itemlist = []
    #
    '''
    Sample card markup:
    <div class="col1"> <div class="news searchres hov"> <a href="/es/...">
    <div class="teaserImg tv"> <img ... src="/image/18378218_301.jpg" ... /> </div>
    <h2>Title <span class="date">30.04.2016 | 26:06 Minutos </span>
    <span class='icon tv'></span> </h2> <p>Plot</p> </a> </div> </div>
    '''
    if "pagenumber=" in item.url:
        data_url = item.url
    else:
        data = scrapertools.cache_page(item.url)
        # Show page:   http://www.dw.com/es/multimedia/todos-los-contenidos/s-100838?type=18&programs=15535663
        # Feed format: http://www.dw.com/mediafilter/research?lang=es&type=18&programs=15535663&sort=date&results=32&showteasers=true&pagenumber=1
        program_id = scrapertools.find_single_match( data, '<a href="http://www.dw.com/es/multimedia/todos-los-contenidos/s-100838.type=18&programs=([^"]+)"' )
        data_url = "http://www.dw.com/mediafilter/research?lang=es&type=18&programs=" + program_id + "&sort=date&results=32&showteasers=true&pagenumber=1"
    data = scrapertools.cache_page(data_url)
    # Groups: url, thumbnail, title, date (dd.mm.yyyy), duration (mm:ss), plot
    pattern = '<div class="col1"[^<]+'
    pattern += '<div class="news searchres hov"[^<]+'
    pattern += '<a href="([^"]+)"[^<]+'
    pattern += '<div class="teaserImg tv"[^<]+'
    pattern += '<img.*?src="([^"]+)"[^<]+</div>[^<]+'
    pattern += '<h2>([^<]+)'
    pattern += '<span class="date">(\d+\.\d+\.\d+)\s+\|\s+(\d+\:\d+)[^<]+'
    pattern += '</span>[^<]+'
    pattern += '<span[^<]+</span[^<]+</h2[^<]+'
    pattern += '<p>([^<]+)</p>'
    matches = re.compile(pattern, re.DOTALL).findall(data)
    logger.info(repr(matches))
    for scrapedurl, scrapedthumbnail, scrapedtitle, scrapeddate, duration, scrapedplot in matches:
        title = scrapedtitle.strip()
        thumbnail = urlparse.urljoin(item.url, scrapedthumbnail)
        url = urlparse.urljoin(item.url, scrapedurl.strip())
        plot = scrapedplot
        aired_date = scrapertools.parse_date(scrapeddate)
        # Appends a new item to the xbmc item list
        itemlist.append(
            Item(channel=CHANNELNAME, title=title, action="play", server="dwspan", url=url,
                 thumbnail=thumbnail, fanart=thumbnail, plot=plot, aired_date=aired_date,
                 duration=duration, show=item.show, view="videos", folder=False))
    # Build the next-page URL by bumping the pagenumber parameter in data_url
    if len(itemlist) > 0:
        current_page = scrapertools.find_single_match(data_url, "pagenumber=(\d+)")
        logger.info("current_page=" + current_page)
        next_page = str(int(current_page) + 1)
        logger.info("next_page=" + next_page)
        next_page_url = data_url.replace("pagenumber=" + current_page, "pagenumber=" + next_page)
        logger.info("next_page_url=" + next_page_url)
        itemlist.append(
            Item(channel=CHANNELNAME, title=">> Página siguiente", action="episodios",
                 url=next_page_url, show=item.show))
    return itemlist
def episodios(item):
    """List the episodes of an À Punt programme, following pagination."""
    logger.info("tvalacarta.channels.apunt episodios")
    itemlist = []

    # Download the listing page
    data = scrapertools.cache_page(item.url)

    # Each video card: <div class="module" id="video-NNN"> with a link, image,
    # a duration <span class='time'>, a <p title=...> plot and a newtitle="..."
    # attribute carrying the display title (which embeds the date).
    patron = '<div class="module" id="video-[^<]+'
    patron += '<a href="([^"]+)"[^<]+'
    patron += '<div class="imgcontainer"[^<]+'
    patron += '<img src="([^"]+)"[^<]+'
    patron += '</div[^<]+'
    patron += "<span class='time'>([^<]+)</span.*?"
    patron += '<p title=[^>]+>(.*?)</p>.*?'
    patron += 'newtitle="(.*?)" destiny'

    for href, img, tiempo, sinopsis, nombre in scrapertools.find_multiple_matches(data, patron):
        episodio = Item(channel=CHANNELNAME, action="play", server="apunt", title=nombre,
                        plot=scrapertools.htmlclean(sinopsis), show=item.show,
                        url=urlparse.urljoin(item.url, href), thumbnail=img,
                        duration=tiempo, aired_date=scrapertools.parse_date(nombre),
                        folder=False)
        itemlist.append(episodio)

    # Pagination arrow; skip self-links and the "/0" first page
    next_page_url = scrapertools.find_single_match(data, '<a class="flechapaginado" href="([^"]+)"')
    if next_page_url != "":
        next_page_url = urlparse.urljoin(item.url, next_page_url)
        if next_page_url != item.url and not next_page_url.endswith("/0"):
            itemlist.append(Item(channel=CHANNELNAME, title=">> Página siguiente",
                                 url=next_page_url, action="episodios", show=item.show,
                                 folder=True))

    return itemlist
def episodios(item, data=""):
    """List the episodes of an Aragón TV programme, with pagination.

    `data` may carry pre-fetched HTML; when empty the page is downloaded.
    """
    logger.info("tvalacarta.channels.aragontv episodios")
    logger.info("tvalacarta.channels.aragontv programa [item=" + item.tostring() + " show=" + item.show + "]")

    # Download the page unless the caller supplied the HTML
    if data == "":
        data = scrapertools.cachePage(item.url)

    # Each video card: <div id="..." class="vid bloque"> with an image, a play
    # link, and a <span class="fecha"> holding
    # "dd/mm/yyyy hh:mm h<br /> Duración: hh:mm:ss"
    patron = '<div id="[^"]+" class="vid bloque[^<]+'
    patron += '<div class="imagen[^<]+'
    patron += '<img title="[^"]+" alt="([^"]+)" src="([^"]+)"[^<]+'
    patron += '<div class="play">[^<]+'
    patron += '<a href="([^"]+)".*?'
    patron += '<span class="fecha">(.*?)</span>'

    itemlist = []
    for alt_text, img_src, video_href, fecha_html in re.compile(patron, re.DOTALL).findall(data):
        # Split the fecha block into the emission date and the duration
        patron_fecha = "\s*([^<]+)<br />\s*Duración\: ([^\s]+)"
        campos_fecha = re.compile(patron_fecha, re.DOTALL).findall(fecha_html)
        aired_date = scrapertools.parse_date(campos_fecha[0][0].strip())
        duration = campos_fecha[0][1].strip()

        episodio = Item(channel=CHANNELNAME, title=alt_text.strip(), action="play",
                        server="aragontv", url=urlparse.urljoin(item.url, video_href),
                        thumbnail=urlparse.urljoin(item.url, img_src), plot="",
                        show=item.show, aired_date=aired_date, duration=duration,
                        folder=False)
        if (DEBUG):
            logger.info("title=[" + episodio.title + "], url=[" + episodio.url + "], thumbnail=[" + episodio.thumbnail + "], show=[" + item.show + "]")
        itemlist.append(episodio)

    # Pagination: the link immediately after the active page number
    patron = "Paginación.*?<span class='activo'>[^<]+</span> \| <a href='([^']+)'"
    paginas = re.compile(patron, re.DOTALL).findall(data)
    scrapertools.printMatches(paginas)
    if len(paginas) > 0:
        itemlist.append(Item(channel=CHANNELNAME, title=">> Página siguiente",
                             action="episodios", url=urlparse.urljoin(item.url, paginas[0]),
                             thumbnail=item.thumbnail, plot=item.plot, show=item.show,
                             folder=True))

    return itemlist
def episodios(item, data=""):
    """List the episodes of an Aragón TV programme (variant that appends the
    aired date to the title for news bulletins and adds view="videos" to the
    pagination item).

    `data` may carry pre-fetched HTML; when empty the page is downloaded.
    Returns playable Items plus an optional ">> Página siguiente" item.
    """
    logger.info("tvalacarta.channels.aragontv episodios")
    logger.info("tvalacarta.channels.aragontv programa [item=" + item.tostring() + " show=" + item.show + "]")
    itemlist = []
    # Download the page unless the caller supplied the HTML
    if data == "":
        data = scrapertools.cachePage(item.url)
    # Extract the entries
    '''
    Sample card: <div id="idv_1186" class="vid bloque"> with an image, a play
    link, and <span class="fecha"> 27/12/2011 21:31 h<br /> Duración: 00:49:38 </span>
    '''
    patron = '<div id="[^"]+" class="vid bloque[^<]+'
    patron += '<div class="imagen[^<]+'
    patron += '<img title="[^"]+" alt="([^"]+)" src="([^"]+)"[^<]+'
    patron += '<div class="play">[^<]+'
    patron += '<a href="([^"]+)".*?'
    patron += '<span class="fecha">(.*?)</span>'
    matches = re.compile(patron, re.DOTALL).findall(data)
    #if DEBUG: scrapertools.printMatches(matches)
    itemlist = []
    for match in matches:
        # Split the fecha block into the emission date and the duration
        patron_fecha = "\s*([^<]+)<br />\s*Duración\: ([^\s]+)"
        campos_fecha = re.compile(patron_fecha, re.DOTALL).findall(match[3])
        fecha_string = campos_fecha[0][0].strip()
        #import time
        #fecha = time.strptime(fecha_string,"%d/%m/%y %H:%M")
        duracion_string = campos_fecha[0][1].strip()
        aired_date = scrapertools.parse_date(fecha_string)
        duration = duracion_string
        #scrapedtitle = match[0]+" "+fecha.strftime("%d/%m/%y")+" (Duración "+duracion_string+")"
        scrapedtitle = match[0].strip()
        # News bulletins share a title, so append the aired date to disambiguate
        if "informativos" in item.url:
            scrapedtitle = scrapedtitle + " (" + aired_date + ")"
        scrapedurl = urlparse.urljoin(item.url, match[2])
        scrapedthumbnail = urlparse.urljoin(item.url, match[1])
        scrapedplot = ""
        if (DEBUG):
            logger.info("title=[" + scrapedtitle + "], url=[" + scrapedurl + "], thumbnail=[" + scrapedthumbnail + "], show=[" + item.show + "]")
        # Add to the listing
        itemlist.append(
            Item(channel=CHANNELNAME, title=scrapedtitle, action="play", server="aragontv",
                 url=scrapedurl, thumbnail=scrapedthumbnail, plot=scrapedplot, show=item.show,
                 aired_date=aired_date, duration=duration, folder=False))
    # Pagination: the link immediately after the active page number
    patron = "Paginación.*?<span class='activo'>[^<]+</span> \| <a href='([^']+)'"
    matches = re.compile(patron, re.DOTALL).findall(data)
    scrapertools.printMatches(matches)
    if len(matches) > 0:
        pageitem = Item(channel=CHANNELNAME, title=">> Página siguiente", action="episodios",
                        url=urlparse.urljoin(item.url, matches[0]), thumbnail=item.thumbnail,
                        plot=item.plot, show=item.show, folder=True, view="videos")
        itemlist.append(pageitem)
    return itemlist
def episodios(item):
    """List CCTV Español episodes; the page mixes three card layouts, so three
    patterns are tried in turn and their matches accumulated. If nothing
    matched, fall back to the series-page parser.
    """
    logger.info("tvalacarta.cctvspan episodios")
    itemlist = []
    '''
    Layout 1: <div class="text_lt"> with an <a guid="..."> thumbnail link and
    <h3><a onclick="loadvideo('...')">TITLE mm/dd/yyyy ...</a></h3>
    '''
    # Download the page
    data = scrapertools.cachePage(item.url)
    patron = '<div class="text_lt"[^<]+'
    patron += '<a guid="([^"]+)"[^<]+<img src="([^"]+)"[^<]+</a[^<]+'
    patron += '<h3><a[^>]+>([^<]+)</a>'
    matches = re.compile(patron,re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)
    for guid,scrapedthumbnail,scrapedtitle in matches:
        title = scrapertools.htmlclean(scrapedtitle)
        # In this layout the video guid doubles as the url
        url = guid
        thumbnail = scrapedthumbnail
        # Titles embed the date in month-day-year order
        aired_date = scrapertools.parse_date(scrapedtitle,"mdy")
        itemlist.append( Item(channel=__channel__, action="play", server="cntv", title=title , url=url , thumbnail=thumbnail, show=item.show, aired_date=aired_date, folder=False) )
    '''
    Layout 2: <span class="text_lt"> with
    <h3><a href="http://cctv.cntv.cn/.../VIDE....shtml">TITLE mm/dd/yyyy hh:mm</a></h3>
    '''
    patron = '<span class="text_lt"[^<]+'
    patron += '<h3><a href="([^"]+)"[^>]+>([^<]+)</a>'
    matches = re.compile(patron,re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)
    for scrapedurl,scrapedtitle in matches:
        title = scrapertools.htmlclean(scrapedtitle)
        url = scrapedurl
        # No thumbnail available in this layout
        thumbnail = ""
        aired_date = scrapertools.parse_date(scrapedtitle,"mdy")
        itemlist.append( Item(channel=__channel__, action="play", server="cntv", title=title , url=url , thumbnail=thumbnail, show=item.show, aired_date=aired_date, folder=False) )
    '''
    Layout 3: <li> with href/img, an empty tp1 div, and the title inside
    <div class="tp2"><a ...>TITLE mm/dd/yyyy ...</a></div>
    '''
    patron = '<li[^<]+'
    patron += '<a href="([^"]+)"[^<]+'
    patron += '<img src="([^"]+)"[^<]+'
    patron += '</a[^<]+'
    patron += '<div class="tp1"><a[^<]+'
    patron += '</a[^<]+'
    patron += '</div[^<]+'
    patron += '<div class="tp2"[^<]+'
    patron += '<a[^>]+>([^<]+)</a>'
    matches = re.compile(patron,re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)
    for scrapedurl,scrapedthumbnail,scrapedtitle in matches:
        title = scrapertools.htmlclean(scrapedtitle)
        url = scrapedurl
        thumbnail = scrapedthumbnail
        aired_date = scrapertools.parse_date(scrapedtitle,"mdy")
        itemlist.append( Item(channel=__channel__, action="play", server="cntv", title=title , url=url , thumbnail=thumbnail, show=item.show, aired_date=aired_date, folder=False) )
    # Maybe it is a series page instead of an episode listing
    if len(itemlist)==0:
        itemlist = episodios_serie(item,data)
    return itemlist
def episodios(item):
    """List CCTV Español episodes (PEP8-spaced variant); the page mixes three
    card layouts, so three patterns are tried in turn and their matches
    accumulated. If nothing matched, fall back to the series-page parser.
    """
    logger.info("tvalacarta.cctvspan episodios")
    itemlist = []
    '''
    Layout 1: <div class="text_lt"> with an <a guid="..."> thumbnail link and
    <h3><a onclick="loadvideo('...')">TITLE mm/dd/yyyy ...</a></h3>
    '''
    # Download the page
    data = scrapertools.cachePage(item.url)
    patron = '<div class="text_lt"[^<]+'
    patron += '<a guid="([^"]+)"[^<]+<img src="([^"]+)"[^<]+</a[^<]+'
    patron += '<h3><a[^>]+>([^<]+)</a>'
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)
    for guid, scrapedthumbnail, scrapedtitle in matches:
        title = scrapertools.htmlclean(scrapedtitle)
        # In this layout the video guid doubles as the url
        url = guid
        thumbnail = scrapedthumbnail
        # Titles embed the date in month-day-year order
        aired_date = scrapertools.parse_date(scrapedtitle, "mdy")
        itemlist.append(
            Item(channel=__channel__, action="play", server="cntv", title=title, url=url,
                 thumbnail=thumbnail, show=item.show, aired_date=aired_date, folder=False))
    '''
    Layout 2: <span class="text_lt"> with
    <h3><a href="http://cctv.cntv.cn/.../VIDE....shtml">TITLE mm/dd/yyyy hh:mm</a></h3>
    '''
    patron = '<span class="text_lt"[^<]+'
    patron += '<h3><a href="([^"]+)"[^>]+>([^<]+)</a>'
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)
    for scrapedurl, scrapedtitle in matches:
        title = scrapertools.htmlclean(scrapedtitle)
        url = scrapedurl
        # No thumbnail available in this layout
        thumbnail = ""
        aired_date = scrapertools.parse_date(scrapedtitle, "mdy")
        itemlist.append(
            Item(channel=__channel__, action="play", server="cntv", title=title, url=url,
                 thumbnail=thumbnail, show=item.show, aired_date=aired_date, folder=False))
    '''
    Layout 3: <li> with href/img, an empty tp1 div, and the title inside
    <div class="tp2"><a ...>TITLE mm/dd/yyyy ...</a></div>
    '''
    patron = '<li[^<]+'
    patron += '<a href="([^"]+)"[^<]+'
    patron += '<img src="([^"]+)"[^<]+'
    patron += '</a[^<]+'
    patron += '<div class="tp1"><a[^<]+'
    patron += '</a[^<]+'
    patron += '</div[^<]+'
    patron += '<div class="tp2"[^<]+'
    patron += '<a[^>]+>([^<]+)</a>'
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG: scrapertools.printMatches(matches)
    for scrapedurl, scrapedthumbnail, scrapedtitle in matches:
        title = scrapertools.htmlclean(scrapedtitle)
        url = scrapedurl
        thumbnail = scrapedthumbnail
        aired_date = scrapertools.parse_date(scrapedtitle, "mdy")
        itemlist.append(
            Item(channel=__channel__, action="play", server="cntv", title=title, url=url,
                 thumbnail=thumbnail, show=item.show, aired_date=aired_date, folder=False))
    # Maybe it is a series page instead of an episode listing
    if len(itemlist) == 0:
        itemlist = episodios_serie(item, data)
    return itemlist
def episodios(item):
    """List the episodes of a RTVCM programme, trying the programme's
    "/programas-completos" listing first and falling back to "/videos".

    A URL that already carries "?pagina=" is a pagination link and is used
    as-is. The first candidate URL that yields episodes wins.
    """
    logger.info("tvalacarta.rtvcm.episodios")
    itemlist = []
    # Build the candidate listing URLs to download
    prueba_urls = []
    if "?pagina=" in item.url:
        prueba_urls.append(item.url)
    else:
        prueba_urls.append(item.url + "/programas-completos")
        prueba_urls.append(item.url + "/videos")
    for prueba_url in prueba_urls:
        data = scrapertools.cache_page(prueba_url)
        # NOTE(review): this logs the entire page body on every call --
        # looks like debug leftover worth removing.
        logger.info("tvalacarta.rtvcm.episodios data=" + data)
        '''
        Sample <article> card: figure with poster img and play link,
        <p class="date"><time></time></p> (time may be empty),
        <h3><a ...>title</a></h3>, <p>plot</p>
        '''
        patron = '<article[^<]+'
        patron += '<figure[^<]+'
        patron += '<img src="([^"]+)" alt="([^"]+)"[^<]+'
        patron += '<a class="icon-play-circle" href="([^"]+)"[^<]+'
        patron += '<span[^<]+</span></a[^<]+'
        patron += '</figure[^<]+'
        patron += '<p class="date"><time>([^<]*)</time></p[^<]+'
        patron += '<h3><a[^<]+</a></h3[^<]+'
        patron += '<p>(.*?)</p>'
        matches = re.compile(patron, re.DOTALL).findall(data)
        for scrapedthumbnail, scrapedtitle, scrapedurl, fecha, scrapedplot in matches:
            thumbnail = urlparse.urljoin(item.url, scrapedthumbnail)
            url = urlparse.urljoin(item.url, scrapedurl)
            title = scrapedtitle + " " + fecha
            plot = scrapedplot
            aired_date = scrapertools.parse_date(fecha)
            itemlist.append(
                Item(channel=__channel__, title=title, url=url, plot=plot, thumbnail=thumbnail,
                     fanart=thumbnail, action="play", server="rtvcm", show=item.title,
                     aired_date=aired_date, folder=False))
        # Stop at the first candidate URL that yields episodes
        if len(itemlist) > 0:
            break
    # Pagination: follow the "Siguiente" link of the page that matched
    next_page_url = scrapertools.find_single_match(
        data, '<a href="([^"]+)" aria-label="Siguiente">')
    if next_page_url != "":
        itemlist.append(
            Item(channel=__channel__, action="episodios", title=">> Página siguiente",
                 url=urlparse.urljoin(item.url, next_page_url), folder=True))
    return itemlist
def episodios(item):
    """List Telemundo episodes.

    Show pages embed the cards directly in HTML; pagination links point to a
    "video_feed" JSON endpoint whose "slide" key holds the next HTML fragment
    and whose "nextUrl" key holds the following feed URL.
    """
    logger.info("tvalacarta.channels.telemundo episodios")
    itemlist = []
    # Download the page (HTML, or JSON for feed URLs)
    data = scrapertools.cachePage(item.url)
    if "video_feed" in item.url:
        json_data = jsontools.load_json(data)
        data = json_data["slide"]
        next_page_url = json_data["nextUrl"]
    else:
        next_page_url = scrapertools.find_single_match(data,'data-feed-url-next="([^"]+)"')
    '''
    Sample card: <div class="media--SHOW-BRAND-VIDEO ..."> with cover image
    link, <h4><a class="media--title" ...>title</a></h4>,
    <p class="media--air-date">Emitido: lunes 02/8/16</p> and a
    <div class="media--description"> holding the plot.
    '''
    '''
    Feed discovery: <section class="video-carousel--BRAND" data-feed-url-prev=""
    data-feed-url-next="http://www.telemundo.com/node/.../video_feed?...">
    '''
    # Groups: url, thumbnail, title, air date, plot
    patron = '<div class="media--SHOW-BRAND-VIDEO[^<]+'
    patron += '<div class="media--media"[^<]+'
    patron += '<a href="([^"]+)"[^<]+'
    patron += '<img class="media--cover-image" src="([^"]+)"[^<]+</a[^<]+'
    patron += '</div[^<]+'
    patron += '<div class="media--content"[^<]+'
    patron += '<h4><a class="media--title" href="[^"]+">([^<]+)</a></h4[^<]+'
    patron += '<p class="media--air-date">([^<]+)</p[^<]+'
    patron += '<div class="media--description"[^<]+'
    patron += '<h3><a href="[^"]+" class="media--link">([^<]+)</a></h3>'
    matches = re.compile(patron,re.DOTALL).findall(data)
    for scrapedurl,scrapedthumbnail,scrapedtitle,aired_date,scrapedplot in matches:
        title = scrapedtitle
        url = scrapedurl
        thumbnail = scrapedthumbnail
        plot = scrapedplot
        # Air dates look like "Emitido: lunes 02/8/16" (month/day/year)
        aired_date = scrapertools.parse_date(aired_date,formato="mdy")
        # Episodes are split in parts, so this opens a "partes" listing
        itemlist.append( Item(channel=__channel__, action="partes", title=title, url=url, thumbnail=thumbnail, fanart=thumbnail, plot=plot, aired_date=aired_date, show=item.show, view="videos", folder=True))
    if next_page_url!="":
        itemlist.append( Item(channel=__channel__, title=">> Página siguiente" , url=urlparse.urljoin(item.url,next_page_url), action="episodios", show=item.show, folder=True) )
    return itemlist
def episodios(item, load_all_pages=False):
    """Lists the episodes of a 7RM (sieterm) show.

    Parses the program page for <dt class="alacarta-video">/<dd> pairs:
        <dt class="alacarta-video"><a href="detail-url" title="...">Title</a> · 12/05/2010 · (5411 veces visto)</dt>
        <dd style="..."><a href="..." title="Ver vídeo"><img src="thumb" ... /></a> plot <a href="....mp4">...</a></dd>

    When load_all_pages is True, pagination is followed recursively and a
    flat list is returned; otherwise a ">> Página siguiente" Item is appended.
    """
    logger.info("tvalacarta.channels.sieterm episodios")

    # Download the program page
    data = scrapertools.cachePage(item.url)

    patron = '<dt class="alacarta-video"><a href="([^"]+)" title="[^"]+">([^<]+)</a>.*?([0-9\/]+).*?</dt>[^<]+'
    patron += '<dd style="[^<]+">[^<]+'
    patron += '<a href="[^"]+" title="[^"]+">[^<]+'
    patron += '<img src="([^"]+)"[^<]+'
    patron += '</a>([^<]+)<a href="([^"]+)">'
    matches = re.compile(patron, re.DOTALL).findall(data)

    itemlist = []
    for match in matches:
        # Page is iso-8859-1 encoded; normalize to utf-8
        scrapedtitle = unicode(match[1].strip() + " (" + match[2] + ")", "iso-8859-1", errors="ignore").encode("utf-8")
        # FIX: the original called .replace("&","&") (a no-op); the scraped
        # URLs carry HTML-escaped ampersands that must be unescaped.
        scrapedurl = urlparse.urljoin(item.url, match[5]).replace("&amp;", "&")
        scrapedthumbnail = urlparse.urljoin(item.url, match[3]).replace("&amp;", "&")
        scrapedplot = unicode(match[4].strip(), "iso-8859-1", errors="ignore").encode("utf-8")
        scrapedpage = urlparse.urljoin(item.url, match[0]).replace("&amp;", "&")
        if (DEBUG):
            logger.info("title=[" + scrapedtitle + "], url=[" + scrapedurl + "], page=[" + scrapedpage + "], thumbnail=[" + scrapedthumbnail + "]")

        # Try to extract the air date from the title (it embeds dd/mm/yyyy)
        aired_date = scrapertools.parse_date(scrapedtitle)

        itemlist.append(Item(channel=CHANNELNAME, title=scrapedtitle, action="play", server="sieterm",
                             url=scrapedpage, thumbnail=scrapedthumbnail, fanart=scrapedthumbnail,
                             plot=scrapedplot, show=item.show, page=scrapedpage,
                             viewmode="movie_with_plot", aired_date=aired_date, folder=False))

    # Look for the next page of results
    next_page_url = scrapertools.find_single_match(data, '<a class="list-siguientes" href="([^"]+)" title="Ver siguientes archivos">')
    if next_page_url != "":
        next_page_url = urlparse.urljoin(item.url, next_page_url)
        next_page_item = Item(channel=CHANNELNAME, title=">> Página siguiente", action="episodios",
                              url=next_page_url, show=item.show, folder=True)
        if load_all_pages:
            itemlist.extend(episodios(next_page_item, load_all_pages))
        else:
            itemlist.append(next_page_item)

    return itemlist
def episodios(item, load_all_pages=False):
    """Lists the episodes of a CRTVG (tvg) show.

    Reads the program page (or the /ax/ AJAX endpoint) to find the
    program id, then queries the "tvgalacartabuscador" AJAX search for
    page 1 of its episodes. When load_all_pages is True, pagination is
    followed recursively; otherwise a ">>> Página siguiente" Item is
    appended when the page advertises one.
    """
    logger.info("[tvg.py] episodios")
    itemlist = []

    # Read the program page and extract id_programa.
    # The /ax/ endpoints are JSON-escaped HTML fragments and need
    # AJAX headers plus unescaping of \n, \" and \/.
    if "/ax/" in item.url:
        headers = []
        headers.append(["User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:17.0) Gecko/20100101 Firefox/17.0"])
        headers.append(["X-Requested-With", "XMLHttpRequest"])
        headers.append(["Referer", item.url])
        data = scrapertools.cache_page(item.url, post="", headers=headers)
        data = data.replace("\\n", " ")
        data = data.replace("\\\"", "\"")
        data = data.replace("\\/", "/")
    else:
        data = scrapertools.cache_page(item.url)

    try:
        id_programa = scrapertools.get_match(data, "initAlaCartaBuscador.(\d+)")
    except:
        id_programa = ""

    # Read the first page of episodes, e.g.
    # http://www.crtvg.es/ax/tvgalacartabuscador/programa:33517/pagina:1/seccion:294/titulo:/mes:null/ano:null/temporada:null
    logger.info("[tvg.py] videos - hay programa")
    url = "http://www.crtvg.es/ax/tvgalacartabuscador/programa:" + id_programa + "/pagina:1/seccion:294/titulo:/mes:null/ano:null/temporada:null"
    headers = []
    headers.append(["User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:17.0) Gecko/20100101 Firefox/17.0"])
    headers.append(["X-Requested-With", "XMLHttpRequest"])
    headers.append(["Referer", item.url])
    data = scrapertools.cache_page(url, post="", headers=headers)
    data = data.replace("\\n", " ")
    data = data.replace("\\\"", "\"")
    data = data.replace("\\/", "/")

    # One episode row: detail url, title (may hold \uXXXX escapes), air date
    patron = '<tr[^<]+'
    patron += '<td class="a-carta-resultado-titulo[^<]+'
    patron += '<a href="([^"]+)"\s+title="([^"]+)".*?'
    patron += '<td class="a-carta-resultado-data">(.*?)</td>'
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG:
        scrapertools.printMatches(matches)

    for scrapedurl, scrapedtitle, fecha in matches:
        title = scrapedtitle.strip()
        # The title comes with JSON \uXXXX escapes; decode them by round-tripping
        # through a JSON object.
        # FIX: a title containing '"' or '\' made the hand-built JSON invalid and
        # the original crashed subscripting the failed parse — fall back to the
        # raw title instead.
        try:
            json_title = jsontools.load_json('{"title":"' + title + '"}')
            if json_title:
                title = json_title["title"]
        except:
            pass
        title = scrapertools.htmlclean(title) + " - " + fecha.strip()
        url = urlparse.urljoin(item.url, scrapedurl)
        thumbnail = ""
        plot = ""
        aired_date = scrapertools.parse_date(fecha)
        if (DEBUG):
            logger.info("title=[" + title + "], url=[" + url + "], thumbnail=[" + thumbnail + "]")

        itemlist.append(Item(channel=CHANNELNAME, title=title, action="play", server="tvg",
                             url=url, thumbnail=thumbnail, plot=plot, show=item.show,
                             aired_date=aired_date, folder=False))

    # Pagination link looks like:
    # <a href=\"#\" title=\"Seguinte\" onclick=\"return posteriorpaginaclick(33517, 2, 294)
    patron = '<a href="\#" title="Seguinte" onclick="return posteriorpaginaclick\((\d+), (\d+), (\d+)'
    matches = re.compile(patron, re.DOTALL).findall(data)
    if DEBUG:
        scrapertools.printMatches(matches)

    for match in matches:
        scrapedtitle = ">>> Página siguiente"
        scrapedurl = "http://www.crtvg.es/ax/tvgalacartabuscador/programa:%s/pagina:%s/seccion:%s/titulo:/mes:null/ano:null/temporada:null" % (match[0], match[1], match[2])
        # NOTE(review): joining the section number as a thumbnail URL looks odd,
        # but it is harmless for a pagination item — kept for compatibility.
        scrapedthumbnail = urlparse.urljoin(item.url, match[2])
        scrapedplot = ""
        if (DEBUG):
            logger.info("title=[" + scrapedtitle + "], url=[" + scrapedurl + "], thumbnail=[" + scrapedthumbnail + "]")

        next_page_item = Item(channel=CHANNELNAME, title=scrapedtitle, action="episodios",
                              url=scrapedurl, thumbnail=scrapedthumbnail, plot=scrapedplot,
                              show=item.show, category=item.category, folder=True)
        if load_all_pages:
            itemlist.extend(episodios(next_page_item, load_all_pages))
        else:
            itemlist.append(next_page_item)
        break  # only the first "Seguinte" link is relevant

    return itemlist
def episodios(item):
    """Lists the episodes of a DW Español (dwspan) show.

    On the first call item.url is the show page: the program id is
    scraped and the "mediafilter/research" feed URL is built for page 1.
    Subsequent calls receive the feed URL directly (it contains
    "pagenumber="). A ">> Página siguiente" Item is appended whenever
    the current page yielded results.
    """
    logger.info("tvalacarta.channels.dwspan episodios")
    itemlist = []

    # Resolve the paginated feed URL
    if "pagenumber=" in item.url:
        data_url = item.url
    else:
        page_html = scrapertools.cache_page(item.url)
        # e.g. http://www.dw.com/es/multimedia/todos-los-contenidos/s-100838?type=18&programs=15535663
        program_id = scrapertools.find_single_match(page_html, '<a href="http://www.dw.com/es/multimedia/todos-los-contenidos/s-100838.type=18&programs=([^"]+)"')
        data_url = "http://www.dw.com/mediafilter/research?lang=es&type=18&programs=" + program_id + "&sort=date&results=32&showteasers=true&pagenumber=1"

    data = scrapertools.cache_page(data_url)

    # One result card: url, thumbnail, title, date, duration, plot
    pattern = ('<div class="col1"[^<]+'
               '<div class="news searchres hov"[^<]+'
               '<a href="([^"]+)"[^<]+'
               '<div class="teaserImg tv"[^<]+'
               '<img.*?src="([^"]+)"[^<]+</div>[^<]+'
               '<h2>([^<]+)'
               '<span class="date">(\d+\.\d+\.\d+)\s+\|\s+(\d+\:\d+)[^<]+'
               '</span>[^<]+'
               '<span[^<]+</span[^<]+</h2[^<]+'
               '<p>([^<]+)</p>')
    entries = re.compile(pattern, re.DOTALL).findall(data)
    logger.info(repr(entries))

    for scrapedurl, scrapedthumbnail, scrapedtitle, scrapeddate, duration, scrapedplot in entries:
        itemlist.append(Item(channel=CHANNELNAME,
                             title=scrapedtitle.strip(),
                             action="play",
                             server="dwspan",
                             url=urlparse.urljoin(item.url, scrapedurl.strip()),
                             thumbnail=urlparse.urljoin(item.url, scrapedthumbnail),
                             fanart=urlparse.urljoin(item.url, scrapedthumbnail),
                             plot=scrapedplot,
                             aired_date=scrapertools.parse_date(scrapeddate),
                             duration=duration,
                             show=item.show,
                             view="videos",
                             folder=False))

    # While results keep coming, offer the next feed page
    if itemlist:
        current_page = scrapertools.find_single_match(data_url, "pagenumber=(\d+)")
        logger.info("current_page=" + current_page)
        next_page = str(int(current_page) + 1)
        logger.info("next_page=" + next_page)
        next_page_url = data_url.replace("pagenumber=" + current_page, "pagenumber=" + next_page)
        logger.info("next_page_url=" + next_page_url)
        itemlist.append(Item(channel=CHANNELNAME, title=">> Página siguiente",
                             action="episodios", url=next_page_url, show=item.show))

    return itemlist
# -*- coding: utf-8 -*-
def episodios(item, load_all_pages=False):
    """Lists the episodes of a 7RM (sieterm) show.

    Parses the program page for <dt class="alacarta-video">/<dd> pairs:
        <dt class="alacarta-video"><a href="detail-url" title="...">Title</a> · 12/05/2010 · (5411 veces visto)</dt>
        <dd style="..."><a href="..." title="Ver vídeo"><img src="thumb" ... /></a> plot <a href="....mp4">...</a></dd>

    When load_all_pages is True, pagination is followed recursively and a
    flat list is returned; otherwise a ">> Página siguiente" Item is appended.
    """
    logger.info("tvalacarta.channels.sieterm episodios")

    # Download the program page
    data = scrapertools.cachePage(item.url)

    patron = '<dt class="alacarta-video"><a href="([^"]+)" title="[^"]+">([^<]+)</a>.*?([0-9\/]+).*?</dt>[^<]+'
    patron += '<dd style="[^<]+">[^<]+'
    patron += '<a href="[^"]+" title="[^"]+">[^<]+'
    patron += '<img src="([^"]+)"[^<]+'
    patron += '</a>([^<]+)<a href="([^"]+)">'
    matches = re.compile(patron, re.DOTALL).findall(data)

    itemlist = []
    for match in matches:
        # Page is iso-8859-1 encoded; normalize to utf-8
        scrapedtitle = unicode(match[1].strip() + " (" + match[2] + ")", "iso-8859-1", errors="ignore").encode("utf-8")
        # FIX: the original called .replace("&","&") (a no-op); the scraped
        # URLs carry HTML-escaped ampersands that must be unescaped.
        scrapedurl = urlparse.urljoin(item.url, match[5]).replace("&amp;", "&")
        scrapedthumbnail = urlparse.urljoin(item.url, match[3]).replace("&amp;", "&")
        scrapedplot = unicode(match[4].strip(), "iso-8859-1", errors="ignore").encode("utf-8")
        scrapedpage = urlparse.urljoin(item.url, match[0]).replace("&amp;", "&")
        if (DEBUG):
            logger.info("title=[" + scrapedtitle + "], url=[" + scrapedurl + "], page=[" + scrapedpage + "], thumbnail=[" + scrapedthumbnail + "]")

        # Try to extract the air date from the title (it embeds dd/mm/yyyy)
        aired_date = scrapertools.parse_date(scrapedtitle)

        itemlist.append(Item(channel=CHANNELNAME, title=scrapedtitle, action="play", server="sieterm",
                             url=scrapedpage, thumbnail=scrapedthumbnail, fanart=scrapedthumbnail,
                             plot=scrapedplot, show=item.show, page=scrapedpage,
                             aired_date=aired_date, folder=False))

    # Look for the next page of results
    next_page_url = scrapertools.find_single_match(data, '<a class="list-siguientes" href="([^"]+)" title="Ver siguientes archivos">')
    if next_page_url != "":
        next_page_url = urlparse.urljoin(item.url, next_page_url)
        next_page_item = Item(channel=CHANNELNAME, title=">> Página siguiente", action="episodios",
                              url=next_page_url, show=item.show, folder=True)
        if load_all_pages:
            itemlist.extend(episodios(next_page_item, load_all_pages))
        else:
            itemlist.append(next_page_item)

    return itemlist