def getChannels(page):
    """Build the channel list for the Reuters provider.

    page == '0' fetches the latest-news AJAX feed and returns one element
    per headline; any other value is treated as an article URL whose body
    text is extracted and returned as a single element.

    :param page: '0' for the index listing, otherwise an article URL.
    :return: list of dicts with 'title', 'link' and 'thumbnail' keys.
    """
    x = []
    if page == '0':
        # Cache-busting millisecond timestamp appended to the AJAX endpoint.
        url = Reuters.LAST_NEWS_RSS + str(time.time() * 1000)
        logger.debug("news rss url is: " + url)
        bruteResult = Reuters.getContentFromUrl(url=url, launchLocation=False, ajax=True)
        logger.debug("brute ajax response: " + bruteResult)
        results = json.loads(bruteResult)
        # Skip the first headline (the original code ignored index 0 via a counter).
        for result in results["headlines"][1:]:
            element = {}
            img = result["mainPicUrl"]
            link = Reuters.MAIN_URL + result["url"]
            title = result["formattedDate"] + " - " + result["headline"]
            logger.debug("appending result: " + title + ", url: " + link + ", img: " + img)
            element["title"] = title
            element["link"] = link
            element["thumbnail"] = img
            x.append(element)
    else:
        html = Reuters.getContentFromUrl(url=page)
        # Article markup uses either of two span ids depending on page version.
        if '<span id="article-text">' in html:
            startRegex = '<span id="article-text">'
        else:
            startRegex = '<span id="articleText">'
        body = Decoder.extract(startRegex, '<div class="linebreak"></div>', html)
        body = Decoder.removeHTML(body)
        if '|' in body:
            body = body[body.find('|') + 1:]
        try:
            # Insert a line break before the first lower-case character to
            # separate the upper-case city/agency prefix from the article text.
            lowerCaseIndex = re.search("[a-z]", body).start()
            body = body[:lowerCaseIndex - 1] + "\n" + body[lowerCaseIndex - 1:]
        except AttributeError:  # re.search matched nothing (no lower-case char)
            logger.error("No break for city was done. Something goes wrong")
        element = {}
        element["link"] = page
        element["title"] = body
        element["thumbnail"] = ''
        x.append(element)
    return x
def getChannels(page):
    """Route *page* to the matching MejorTorrent listing.

    '0' renders the main category menu plus a search entry, '.search'
    prompts the user with an on-screen keyboard, download pages are decoded
    to their final torrent link, and anything else is delegated to the
    generic content extractor.
    """
    x = []
    logger.debug("page: " + page)
    if str(page) == "0":
        mainHtml = MejorTorrent.getContentFromUrl(url=MejorTorrent.MAIN_URL)
        menuHtml = Decoder.extract("<table width='140' border='0' cellpadding='0' cellspacing='0' style='border-left:1px solid black; border-right:1px solid black; border-bottom:1px solid black;'>", '</table>', mainHtml)
        for anchorHtml in menuHtml.split("<a"):
            logger.debug("li --> HTML is: " + anchorHtml)
            if "href=" not in anchorHtml:
                continue
            entry = {}
            label = Decoder.removeHTML(Decoder.extract('">', '</a>', anchorHtml))
            if label:
                entry["title"] = label
                href = Decoder.extract("href='", "'", anchorHtml)
                # Skip music/games/misc categories; only relative links are real menu items.
                if 'musica' not in href and 'juegos' not in href and 'variados' not in href:
                    if "://" not in href:
                        entry["link"] = MejorTorrent.MAIN_URL + href
                        x.append(entry)
        searchEntry = {"title": XBMCUtils.getString(11018), "link": ".search"}
        x.append(searchEntry)
    elif page == '.search':
        # Display the keyboard; doModal blocks until the user finishes.
        keyboard = XBMCUtils.getKeyboard()
        keyboard.doModal()
        query = ""
        if keyboard.isConfirmed():
            query = keyboard.getText()
        x = MejorTorrent.search(query)
    elif '-descargar-' in page:
        logger.debug("-descargar- page detected...")
        x = MejorTorrent.extractProvidersFromLink(page)
    elif 'sec=descargas' in page and '&p=' not in page:
        logger.debug("decoding torrent..." + page)
        downloadHtml = MejorTorrent.getContentFromUrl(url=page)
        torrentLink = MejorTorrent.MAIN_URL + Decoder.extract("Pincha <a href='/", "'", downloadHtml)
        logger.debug("extracted torrent link: " + torrentLink)
        finalElement = {"link": torrentLink, "title": page, "finalLink": True}
        x.append(finalElement)
    else:
        x = MejorTorrent.extractContentFromLink(page)
    return x
def getChannels(page):
    """Build the channel list for the CNN provider.

    page == '0' parses the last-news RSS feed into one element per <item>;
    any other value is treated as an article URL whose body text is
    extracted and returned as a single element.

    :param page: '0' for the index listing, otherwise an article URL.
    :return: list of dicts with 'title', 'link' and (optionally) 'thumbnail'.
    """
    x = []
    if page == '0':
        url = CNN.LAST_NEWS_RSS
        logger.debug("news rss url is: " + url)
        bruteResult = CNN.getContentFromUrl(url=url, launchLocation=True, ajax=False)
        logger.debug("brute response: " + bruteResult)
        lists = common.parseDOM(bruteResult, "item")
        if len(lists) > 0:
            logger.info("counted: " + str(len(lists)))
            for item in lists:
                name = common.parseDOM(item, "title")[0].encode("utf-8")
                value = common.parseDOM(item, "guid")[0].encode("utf-8")
                logger.info("Added: " + name + ", url: " + value)
                element = {}
                element["title"] = name.replace('<![CDATA[', '').replace("]]>", "")
                # Rewrite to the edition host, which serves articles directly.
                element["link"] = value.replace("//www.cnn.com/", "//edition.cnn.com/")
                try:
                    img = common.parseDOM(item, "media:content", ret="url")[0].encode("utf-8")
                    element["thumbnail"] = img
                except Exception:  # best-effort: item may carry no media:content tag
                    logger.debug("Could not be extracted any img. :'(")
                x.append(element)
    else:
        html = CNN.getContentFromUrl(url=page, launchLocation=True, referer=CNN.MAIN_URL)
        startRegex = '<div class="el__leafmedia el__leafmedia--sourced-paragraph">'
        body = Decoder.extract(startRegex, '</div><p class="zn-body__paragraph zn-body__footer">', html)
        logger.debug("removing html: " + body)
        body = Decoder.removeHTML(body)
        logger.debug("html has removed from body!")
        if '|' in body:
            body = body[body.find('|') + 1:]
        try:
            # Insert a line break before the first lower-case character to
            # separate the upper-case city/agency prefix from the article text.
            lowerCaseIndex = re.search("[a-z]", body).start()
            body = body[:lowerCaseIndex - 1] + "\n" + body[lowerCaseIndex - 1:]
        except AttributeError:  # re.search matched nothing (no lower-case char)
            logger.error("No break for city was done. Something goes wrong")
        element = {}
        element["link"] = page
        element["title"] = body
        element["thumbnail"] = ''
        x.append(element)
    return x
def search(text):
    """Search MejorTorrent for *text* and return matching entries.

    :param text: user-supplied query string.
    :return: list of dicts with 'title' and 'link' keys (empty when the
        results table is missing from the response).
    """
    x = []
    searchUrl = MejorTorrent.SEARCH % urllib.quote_plus(text)
    resultsHtml = MejorTorrent.getContentFromUrl(url=searchUrl, referer=MejorTorrent.MAIN_URL)
    logger.debug("search html is: " + resultsHtml)
    tableMarker = "<table width='96%' border='0' cellspacing='0' cellpadding='4' align='center'>"
    if tableMarker not in resultsHtml:
        logger.debug("nothing done in search!")
        return x
    resultsTable = Decoder.extract(tableMarker, "</table>", resultsHtml)
    # The first chunk of the split is the table header, so drop it.
    for row in resultsTable.split("<tr height='22'>")[1:]:
        rowLink = Decoder.extract("<a href='", "'", row)
        rowTitle = Decoder.extract('onmouseout="style.textDecoration=\'none\';">', "</td>", row)
        rowTitle = Decoder.removeHTML(rowTitle)
        x.append({"title": rowTitle, "link": "http://www.mejortorrent.com" + rowLink})
    return x
def drawBbcCoUkNew(url):
    """Fetch a BBC article page and render its plain-text body via drawNew.

    Three page layouts are handled: the standard articleBody markup, the
    'text-wrapper' special layout, and the figcaption-led layout.

    :param url: absolute URL of the BBC article.
    """
    htmlContent = Downloader.getContentFromUrl(url=url)
    title = Decoder.extract('<p class="story-body__introduction">', '</p><div', htmlContent)
    if 'property="articleBody"' in htmlContent:
        body = Decoder.extract('property="articleBody"', ' </div>', htmlContent)
        # Strip recurring copyright/caption boilerplate snippets.
        for noise in (
                '<span class="off-screen">Image copyright</span>',
                '<span class="story-image-copyright">AFP</span>',
                '<span class="story-image-copyright">Reuters</span>',
                '<span class="off-screen">Image caption</span>',
                '<span class="off-screen">Media caption</span>'):
            body = body.replace(noise, '')
        # Remove every inline media caption, one occurrence per pass.
        while '<span class="media-caption__text">' in body:
            caption = Decoder.extractWithRegex('<span class="media-caption__text">', "</span>", body)
            body = body.replace(caption, "")
    elif 'class="text-wrapper"' in htmlContent:
        # Special content layout.
        body = Decoder.extract('class="text-wrapper"', '</p>\n', htmlContent)
        publishedDates = Decoder.extractWithRegex('<div class="date', "</div>", body)
        lastUpdate = Decoder.extractWithRegex('<p class="date ', "</p>", body)
        body = body.replace(publishedDates, "").replace(lastUpdate, "")
    elif '<figcaption class="sp-media-asset' in htmlContent:
        body = Decoder.extract('<figcaption class="sp-media-asset', '</p><div ', htmlContent)
        if '>' in body:
            body = body[body.find(">") + 1:]
    body = Decoder.removeHTML(body).replace(".", ".\n").replace(">", "")
    logger.debug("body is: " + body)
    drawNew(textContent=(body))
def getChannels(page):
    """Build the channel list for the El Mundo provider.

    page == '0' parses the last-news RSS feed into one element per <item>;
    any other value is treated as an article URL whose body text is
    extracted and returned as a single element.

    :param page: '0' for the index listing, otherwise an article URL.
    :return: list of dicts with 'title', 'link' and (optionally) 'thumbnail'.
    """
    x = []
    if page == '0':
        url = ElMundo.LAST_NEWS_RSS
        logger.debug("news rss url is: " + url)
        bruteResult = ElMundo.getContentFromUrl(url=url, launchLocation=True, ajax=False, referer=ElMundo.MAIN_URL)
        logger.debug("brute response: " + bruteResult)
        lists = common.parseDOM(bruteResult, "item")
        if len(lists) > 0:
            logger.info("counted: " + str(len(lists)))
            for item in lists:
                name = common.parseDOM(item, "title")[0].encode("utf-8")
                link = common.parseDOM(item, "link")[0].encode("utf-8")
                logger.info("Added: " + name + ", url: " + link)
                element = {}
                element["title"] = name.replace('<![CDATA[', '').replace("]]>", "")
                element["link"] = link
                try:
                    img = common.parseDOM(item, "media:content", ret="url")[0].encode("utf-8")
                    logger.debug("thumbnail is: " + img)
                    element["thumbnail"] = img
                except Exception:  # best-effort: item may carry no media:content tag
                    logger.debug("Could not be extracted any img. :'(")
                x.append(element)
    else:
        # Site serves ISO-8859-15; normalize to UTF-8 before parsing.
        html = ElMundo.getContentFromUrl(
            url=page, launchLocation=True, referer=ElMundo.MAIN_URL).decode('iso-8859-15').encode('utf8')
        startRegex = '<article class="news-item" itemscope itemtype="http://schema.org/NewsArticle">'
        body = Decoder.extract(startRegex, '<h3 class="list-header">', html)
        if 'class="comentarios ' in body:
            body = body[:body.find('class="comentarios ')]
        if '<a href="#ancla_comentarios">' in body:
            replacedBy = Decoder.extract('<a href="#ancla_comentarios">', "</a>", body)
            logger.debug("removing: " + replacedBy)
            body = body.replace(replacedBy, "")
        logger.debug("removing html: " + body)
        body = Decoder.removeHTML(body)
        if ' Twitter Facebook Enviar ' in body:
            body = body.replace(" Twitter Facebook Enviar ", "\n")
        if ":" in body:  # locate the publication time (hh:mm) and break the line after it
            index = body.find(":")
            try:
                int(body[index + 1])  # a digit after ':' means this is the time separator
            except (ValueError, IndexError):
                # Not a digit: the first ':' was not the time, move on to the next one.
                body2 = body[index + 1:]
                index += body2.find(":") + 1
            body = body[:index + 3] + "\n\n" + body[index + 3:]
        logger.debug("html has removed from body!")
        element = {}
        element["link"] = page
        element["title"] = body
        element["thumbnail"] = ''
        x.append(element)
    return x