Ejemplo n.º 1
0
 def getChannels(page):
     """Build the channel list for the Reuters provider.

     page == '0' requests the latest-news AJAX feed and returns one dict
     per headline (title/link/thumbnail); any other value is treated as an
     article URL whose extracted body text becomes a single entry.
     """
     x = []
     if page == '0':
         # Millisecond timestamp appended as a cache-buster on the feed URL.
         url = Reuters.LAST_NEWS_RSS + str(time.time() * 1000)
         logger.debug("news rss url is: " + url)
         bruteResult = Reuters.getContentFromUrl(url=url,
                                                 launchLocation=False,
                                                 ajax=True)
         logger.debug("brute ajax response: " + bruteResult)
         results = json.loads(bruteResult)
         # The first headline is intentionally skipped (same as the original
         # "i > 0" guard); slicing replaces the manual counter.
         for result in results["headlines"][1:]:
             img = result["mainPicUrl"]
             link = Reuters.MAIN_URL + result["url"]
             title = result["formattedDate"] + " - " + result["headline"]
             logger.debug("appending result: " + title + ", url: " +
                          link + ", img: " + img)
             element = {}
             element["title"] = title
             element["link"] = link
             element["thumbnail"] = img
             x.append(element)
     else:
         html = Reuters.getContentFromUrl(url=page)
         # Article markup differs between page layouts; pick whichever
         # start marker is actually present in the downloaded HTML.
         if '<span id="article-text">' in html:
             startRegex = '<span id="article-text">'
         else:
             startRegex = '<span id="articleText">'
         body = Decoder.extract(startRegex, '<div class="linebreak"></div>',
                                html)
         body = Decoder.removeHTML(body)
         if '|' in body:
             body = body[body.find('|') + 1:]
         try:
             # Insert a newline just before the first lowercase letter,
             # which separates the leading city/date header from the text.
             lowerCaseIndex = re.search("[a-z]", body).start()
             body = body[:lowerCaseIndex - 1] + "\n" + body[lowerCaseIndex -
                                                            1:]
         except AttributeError:  # re.search found no lowercase letter
             logger.error(
                 "No break for city was done. Something goes wrong")
         element = {}
         element["link"] = page
         element["title"] = body
         element["thumbnail"] = ''
         x.append(element)
     return x
Ejemplo n.º 2
0
    def getChannels(page):
        """Build the menu/result list for the MejorTorrent provider.

        page semantics:
          '0'                  -> main menu scraped from the site sidebar
          '.search'            -> show keyboard and run a text search
          contains '-descargar-' -> list download providers for the item
          'sec=descargas' page -> resolve the final .torrent link
          anything else        -> generic content listing for the page URL
        """
        x = []
        logger.debug("page: "+page)
        if str(page) == "0":
            html = MejorTorrent.getContentFromUrl(url=MejorTorrent.MAIN_URL)
            menuHtml = Decoder.extract("<table width='140' border='0' cellpadding='0' cellspacing='0' style='border-left:1px solid black; border-right:1px solid black; border-bottom:1px solid black;'>",'</table>',html)
            for itemHtml in menuHtml.split("<a"):
                logger.debug("li --> HTML is: "+itemHtml)
                if "href=" in itemHtml:
                    item = {}
                    title = Decoder.extract('">','</a>',itemHtml)
                    title = Decoder.removeHTML(title)
                    if len(title)>0:
                        item["title"] = title
                        link = Decoder.extract("href='", "'", itemHtml)
                        # Music/games/misc categories are not supported.
                        if 'musica' not in link and 'juegos' not in link and 'variados' not in link:
                            if "://" not in link:
                                item["link"] = MejorTorrent.MAIN_URL+link
                            else:
                                # FIX: absolute links used to be appended
                                # without a "link" key; keep them as-is.
                                item["link"] = link
                            x.append(item)
            # Trailing "search" entry opening the keyboard dialog.
            search = {}
            search["title"] = XBMCUtils.getString(11018)
            search["link"] = ".search"
            x.append(search)

        elif page=='.search':
            # Display keyboard; doModal() blocks until the user is done.
            keyboard = XBMCUtils.getKeyboard()
            keyboard.doModal()
            text = ""
            if keyboard.isConfirmed():
                text = keyboard.getText()
                x = MejorTorrent.search(text)
        elif '-descargar-' in page:
            logger.debug("-descargar- page detected...")
            x = MejorTorrent.extractProvidersFromLink(page)
        elif 'sec=descargas' in page and '&p=' not in page:
            logger.debug("decoding torrent..."+page)
            html = MejorTorrent.getContentFromUrl(url=page)
            link = MejorTorrent.MAIN_URL+Decoder.extract("Pincha <a href='/","'",html)
            logger.debug("extracted torrent link: "+link)
            element = {}
            element["link"] = link
            element["title"] = page
            element["finalLink"] = True  # marks a directly-playable link
            x.append(element)
        else:
            x = MejorTorrent.extractContentFromLink(page)

        return x
Ejemplo n.º 3
0
 def getChannels(page):
     """Build the channel list for the CNN provider.

     page == '0' parses the latest-news RSS feed into title/link/thumbnail
     dicts; any other value is treated as an article URL whose extracted
     body text is returned as a single entry.
     """
     x = []
     if page == '0':
         url = CNN.LAST_NEWS_RSS
         logger.debug("news rss url is: "+url)
         bruteResult = CNN.getContentFromUrl(url=url,launchLocation=True,ajax=False)
         logger.debug("brute response: "+bruteResult)
         lists = common.parseDOM(bruteResult, "item")
         if len(lists) > 0:
             logger.info("counted: " + str(len(lists)))
             for item in lists:
                 name = common.parseDOM(item, "title")[0].encode("utf-8")
                 value = common.parseDOM(item, "guid")[0].encode("utf-8")
                 logger.info("Added: " + name + ", url: " + value)
                 element = {}
                 element["title"] = name.replace('<![CDATA[','').replace("]]>","")
                 # Rewrite the www host to the edition host.
                 element["link"] = value.replace("//www.cnn.com/","//edition.cnn.com/")
                 try:
                     img = common.parseDOM(item, "media:content", ret="url")[0].encode("utf-8")
                     element["thumbnail"] = img
                 except Exception:  # best effort: item may lack media:content
                     logger.debug("Could not be extracted any img. :'(")
                 x.append(element)
     else:
         html = CNN.getContentFromUrl(url=page,launchLocation=True,referer=CNN.MAIN_URL)
         startRegex = '<div class="el__leafmedia el__leafmedia--sourced-paragraph">'
         body = Decoder.extract(startRegex,'</div><p class="zn-body__paragraph zn-body__footer">',html)
         logger.debug("removing html: "+body)
         body = Decoder.removeHTML(body)
         logger.debug("html has removed from body!")
         if '|' in body:
             body = body[body.find('|')+1:]
         try:
             # Insert a newline just before the first lowercase letter,
             # which separates the leading city header from the story text.
             lowerCaseIndex = re.search("[a-z]", body).start()
             body = body[:lowerCaseIndex-1]+"\n"+body[lowerCaseIndex-1:]
         except AttributeError:  # re.search found no lowercase letter
             logger.error("No break for city was done. Something goes wrong")
         element = {}
         element["link"] = page
         element["title"] = body
         element["thumbnail"] = ''
         x.append(element)
     return x
Ejemplo n.º 4
0
 def search(text):
     """Search MejorTorrent for *text*; return a list of title/link dicts."""
     found = []
     searchUrl = MejorTorrent.SEARCH % urllib.quote_plus(text)
     html = MejorTorrent.getContentFromUrl(url=searchUrl,referer=MejorTorrent.MAIN_URL)
     logger.debug("search html is: "+html)
     resultsTable = "<table width='96%' border='0' cellspacing='0' cellpadding='4' align='center'>"
     if resultsTable not in html:
         logger.debug("nothing done in search!")
         return found
     table = Decoder.extract(resultsTable, "</table>", html)
     # The chunk before the first row separator holds no result: drop it.
     for row in table.split("<tr height='22'>")[1:]:
         link = Decoder.extract("<a href='", "'", row)
         rawTitle = Decoder.extract('onmouseout="style.textDecoration=\'none\';">', "</td>", row)
         found.append({
             "title": Decoder.removeHTML(rawTitle),
             "link": "http://www.mejortorrent.com" + link,
         })
     return found
Ejemplo n.º 5
0
def drawBbcCoUkNew(url):
    """Download a BBC article page and render its cleaned body text.

    Tries the three known BBC article layouts in order; if none matches,
    an empty body is drawn instead of raising NameError (bug fix).
    """
    htmlContent = Downloader.getContentFromUrl(url=url)
    # NOTE(review): 'title' is extracted but never used; kept for parity
    # in case Decoder.extract has side effects — TODO confirm and remove.
    title = Decoder.extract('<p class="story-body__introduction">', '</p><div',
                            htmlContent)
    body = ''  # FIX: was undefined when no layout branch below matched
    if 'property="articleBody"' in htmlContent:
        body = Decoder.extract(
            'property="articleBody"',
            '                                                                                                </div>',
            htmlContent)
        # Strip boilerplate copyright/caption spans from the article text.
        body = body.replace('<span class="off-screen">Image copyright</span>',
                            '')
        body = body.replace('<span class="story-image-copyright">AFP</span>',
                            '')
        body = body.replace(
            '<span class="story-image-copyright">Reuters</span>', '')
        body = body.replace('<span class="off-screen">Image caption</span>',
                            '')
        body = body.replace('<span class="off-screen">Media caption</span>',
                            '')
        # Remove every media caption span, one occurrence at a time.
        while '<span class="media-caption__text">' in body:
            line = Decoder.extractWithRegex(
                '<span class="media-caption__text">', "</span>", body)
            body = body.replace(line, "")
    elif 'class="text-wrapper"' in htmlContent:
        # Special content layout: drop the date and last-update headers.
        body = Decoder.extract('class="text-wrapper"', '</p>\n', htmlContent)
        dates = Decoder.extractWithRegex('<div class="date', "</div>", body)
        lastUpdate = Decoder.extractWithRegex('<p class="date ', "</p>", body)
        body = body.replace(dates, "")
        body = body.replace(lastUpdate, "")
    elif '<figcaption class="sp-media-asset' in htmlContent:
        body = Decoder.extract('<figcaption class="sp-media-asset',
                               '</p><div ', htmlContent)
        # Skip past the opening tag remainder, keeping only the caption text.
        if '>' in body:
            body = body[body.find(">") + 1:]
    body = Decoder.removeHTML(body).replace(".", ".\n").replace(">", "")
    logger.debug("body is: " + body)
    drawNew(textContent=(body))
Ejemplo n.º 6
0
 def getChannels(page):
     """Build the channel list for the El Mundo provider.

     page == '0' parses the latest-news RSS feed into title/link/thumbnail
     dicts; any other value is treated as an article URL whose extracted
     body text is returned as a single entry.
     """
     x = []
     if page == '0':
         url = ElMundo.LAST_NEWS_RSS
         logger.debug("news rss url is: " + url)
         bruteResult = ElMundo.getContentFromUrl(url=url,
                                                 launchLocation=True,
                                                 ajax=False,
                                                 referer=ElMundo.MAIN_URL)
         logger.debug("brute response: " + bruteResult)
         lists = common.parseDOM(bruteResult, "item")
         if len(lists) > 0:
             logger.info("counted: " + str(len(lists)))
             for item in lists:
                 name = common.parseDOM(item, "title")[0].encode("utf-8")
                 link = common.parseDOM(item, "link")[0].encode("utf-8")
                 logger.info("Added: " + name + ", url: " + link)
                 element = {}
                 # Strip the CDATA wrapper around the feed title.
                 element["title"] = name.replace('<![CDATA[',
                                                 '').replace("]]>", "")
                 element["link"] = link
                 try:
                     img = common.parseDOM(item, "media:content",
                                           ret="url")[0].encode("utf-8")
                     logger.debug("thumbnail is: " + img)
                     element["thumbnail"] = img
                 except:
                     # Best effort: some items carry no media:content tag.
                     logger.debug("Could not be extracted any img. :'(")
                 x.append(element)
     else:
         # Article pages are served ISO-8859-15; re-encode to UTF-8.
         html = ElMundo.getContentFromUrl(
             url=page, launchLocation=True,
             referer=ElMundo.MAIN_URL).decode('iso-8859-15').encode('utf8')
         startRegex = '<article class="news-item" itemscope itemtype="http://schema.org/NewsArticle">'
         body = Decoder.extract(startRegex, '<h3 class="list-header">',
                                html)
         # Drop everything from the comments widget onwards.
         if 'class="comentarios ' in body:
             body = body[:body.find('class="comentarios ')]
         # Remove the "comments" anchor text if present.
         if '<a href="#ancla_comentarios">' in body:
             replacedBy = Decoder.extract('<a href="#ancla_comentarios">',
                                          "</a>", body)
             logger.debug("removing: " + replacedBy)
             body = body.replace(replacedBy, "")
         logger.debug("removing html: " + body)
         body = Decoder.removeHTML(body)
         # Collapse the social-sharing bar into a paragraph break.
         if ' Twitter Facebook Enviar ' in body:
             body = body.replace(" Twitter Facebook Enviar ", "\n")
         if ":" in body:  # locate the HH:MM timestamp to split header/body
             index = body.find(":")
             try:
                 figure = int(body[index + 1])  # it's a number: a time colon
             except:  # not a number, so the time is at the next ':'
                 body2 = body[index + 1:]
                 index += body2.find(":") + 1
                 pass
             # Keep the two minute digits, then break before the story text.
             body = body[:index + 3] + "\n\n" + body[index + 3:]
         logger.debug("html has removed from body!")
         element = {}
         element["link"] = page
         element["title"] = body
         element["thumbnail"] = ''
         x.append(element)
     return x