def getChannels(page):
    """Build the channel list for the Reuters provider.

    page == '0' fetches the latest-news AJAX feed and returns one element
    per headline; any other value is treated as an article URL whose body
    text is extracted and returned as a single element.

    :param page: '0' for the index listing, otherwise an article URL.
    :return: list of dicts with 'title', 'link' and 'thumbnail' keys.
    """
    x = []
    if page == '0':
        # Cache-busting millisecond timestamp appended to the AJAX endpoint.
        url = Reuters.LAST_NEWS_RSS + str(time.time() * 1000)
        logger.debug("news rss url is: " + url)
        bruteResult = Reuters.getContentFromUrl(url=url, launchLocation=False, ajax=True)
        logger.debug("brute ajax response: " + bruteResult)
        results = json.loads(bruteResult)
        # Skip the first headline (the original code ignored index 0 via a counter).
        for result in results["headlines"][1:]:
            element = {}
            img = result["mainPicUrl"]
            link = Reuters.MAIN_URL + result["url"]
            title = result["formattedDate"] + " - " + result["headline"]
            logger.debug("appending result: " + title + ", url: " + link + ", img: " + img)
            element["title"] = title
            element["link"] = link
            element["thumbnail"] = img
            x.append(element)
    else:
        html = Reuters.getContentFromUrl(url=page)
        # Article markup uses either of two span ids depending on page version.
        if '<span id="article-text">' in html:
            startRegex = '<span id="article-text">'
        else:
            startRegex = '<span id="articleText">'
        body = Decoder.extract(startRegex, '<div class="linebreak"></div>', html)
        body = Decoder.removeHTML(body)
        if '|' in body:
            body = body[body.find('|') + 1:]
        try:
            # Insert a line break before the first lower-case character to
            # separate the upper-case city/agency prefix from the article text.
            lowerCaseIndex = re.search("[a-z]", body).start()
            body = body[:lowerCaseIndex - 1] + "\n" + body[lowerCaseIndex - 1:]
        except AttributeError:  # re.search matched nothing (no lower-case char)
            logger.error("No break for city was done. Something goes wrong")
        element = {}
        element["link"] = page
        element["title"] = body
        element["thumbnail"] = ''
        x.append(element)
    return x
def getChannels(page):
    """Route *page* to the matching MejorTorrent listing.

    '0' renders the main category menu plus a search entry, '.search'
    prompts the user with an on-screen keyboard, download pages are decoded
    to their final torrent link, and anything else is delegated to the
    generic content extractor.
    """
    x = []
    logger.debug("page: " + page)
    if str(page) == "0":
        mainHtml = MejorTorrent.getContentFromUrl(url=MejorTorrent.MAIN_URL)
        menuHtml = Decoder.extract("<table width='140' border='0' cellpadding='0' cellspacing='0' style='border-left:1px solid black; border-right:1px solid black; border-bottom:1px solid black;'>", '</table>', mainHtml)
        for anchorHtml in menuHtml.split("<a"):
            logger.debug("li --> HTML is: " + anchorHtml)
            if "href=" not in anchorHtml:
                continue
            entry = {}
            label = Decoder.removeHTML(Decoder.extract('">', '</a>', anchorHtml))
            if label:
                entry["title"] = label
                href = Decoder.extract("href='", "'", anchorHtml)
                # Skip music/games/misc categories; only relative links are real menu items.
                if 'musica' not in href and 'juegos' not in href and 'variados' not in href:
                    if "://" not in href:
                        entry["link"] = MejorTorrent.MAIN_URL + href
                        x.append(entry)
        searchEntry = {"title": XBMCUtils.getString(11018), "link": ".search"}
        x.append(searchEntry)
    elif page == '.search':
        # Display the keyboard; doModal blocks until the user finishes.
        keyboard = XBMCUtils.getKeyboard()
        keyboard.doModal()
        query = ""
        if keyboard.isConfirmed():
            query = keyboard.getText()
        x = MejorTorrent.search(query)
    elif '-descargar-' in page:
        logger.debug("-descargar- page detected...")
        x = MejorTorrent.extractProvidersFromLink(page)
    elif 'sec=descargas' in page and '&p=' not in page:
        logger.debug("decoding torrent..." + page)
        downloadHtml = MejorTorrent.getContentFromUrl(url=page)
        torrentLink = MejorTorrent.MAIN_URL + Decoder.extract("Pincha <a href='/", "'", downloadHtml)
        logger.debug("extracted torrent link: " + torrentLink)
        finalElement = {"link": torrentLink, "title": page, "finalLink": True}
        x.append(finalElement)
    else:
        x = MejorTorrent.extractContentFromLink(page)
    return x
def getChannels(page):
    """Build the channel list for the CNN provider.

    page == '0' parses the last-news RSS feed into one element per <item>;
    any other value is treated as an article URL whose body text is
    extracted and returned as a single element.

    :param page: '0' for the index listing, otherwise an article URL.
    :return: list of dicts with 'title', 'link' and (optionally) 'thumbnail'.
    """
    x = []
    if page == '0':
        url = CNN.LAST_NEWS_RSS
        logger.debug("news rss url is: " + url)
        bruteResult = CNN.getContentFromUrl(url=url, launchLocation=True, ajax=False)
        logger.debug("brute response: " + bruteResult)
        lists = common.parseDOM(bruteResult, "item")
        if len(lists) > 0:
            logger.info("counted: " + str(len(lists)))
            for item in lists:
                name = common.parseDOM(item, "title")[0].encode("utf-8")
                value = common.parseDOM(item, "guid")[0].encode("utf-8")
                logger.info("Added: " + name + ", url: " + value)
                element = {}
                element["title"] = name.replace('<![CDATA[', '').replace("]]>", "")
                # Rewrite to the edition host, which serves articles directly.
                element["link"] = value.replace("//www.cnn.com/", "//edition.cnn.com/")
                try:
                    img = common.parseDOM(item, "media:content", ret="url")[0].encode("utf-8")
                    element["thumbnail"] = img
                except Exception:  # best-effort: item may carry no media:content tag
                    logger.debug("Could not be extracted any img. :'(")
                x.append(element)
    else:
        html = CNN.getContentFromUrl(url=page, launchLocation=True, referer=CNN.MAIN_URL)
        startRegex = '<div class="el__leafmedia el__leafmedia--sourced-paragraph">'
        body = Decoder.extract(startRegex, '</div><p class="zn-body__paragraph zn-body__footer">', html)
        logger.debug("removing html: " + body)
        body = Decoder.removeHTML(body)
        logger.debug("html has removed from body!")
        if '|' in body:
            body = body[body.find('|') + 1:]
        try:
            # Insert a line break before the first lower-case character to
            # separate the upper-case city/agency prefix from the article text.
            lowerCaseIndex = re.search("[a-z]", body).start()
            body = body[:lowerCaseIndex - 1] + "\n" + body[lowerCaseIndex - 1:]
        except AttributeError:  # re.search matched nothing (no lower-case char)
            logger.error("No break for city was done. Something goes wrong")
        element = {}
        element["link"] = page
        element["title"] = body
        element["thumbnail"] = ''
        x.append(element)
    return x
def search(text):
    """Search MejorTorrent for *text* and return matching entries.

    :param text: user-supplied query string.
    :return: list of dicts with 'title' and 'link' keys (empty when the
        results table is missing from the response).
    """
    x = []
    searchUrl = MejorTorrent.SEARCH % urllib.quote_plus(text)
    resultsHtml = MejorTorrent.getContentFromUrl(url=searchUrl, referer=MejorTorrent.MAIN_URL)
    logger.debug("search html is: " + resultsHtml)
    tableMarker = "<table width='96%' border='0' cellspacing='0' cellpadding='4' align='center'>"
    if tableMarker not in resultsHtml:
        logger.debug("nothing done in search!")
        return x
    resultsTable = Decoder.extract(tableMarker, "</table>", resultsHtml)
    # The first chunk of the split is the table header, so drop it.
    for row in resultsTable.split("<tr height='22'>")[1:]:
        rowLink = Decoder.extract("<a href='", "'", row)
        rowTitle = Decoder.extract('onmouseout="style.textDecoration=\'none\';">', "</td>", row)
        rowTitle = Decoder.removeHTML(rowTitle)
        x.append({"title": rowTitle, "link": "http://www.mejortorrent.com" + rowLink})
    return x
def drawBbcCoUkNew(url):
    """Fetch a BBC article page and render its plain-text body via drawNew.

    Three page layouts are handled: the standard articleBody markup, the
    'text-wrapper' special layout, and the figcaption-led layout.

    :param url: absolute URL of the BBC article.
    """
    htmlContent = Downloader.getContentFromUrl(url=url)
    title = Decoder.extract('<p class="story-body__introduction">', '</p><div', htmlContent)
    if 'property="articleBody"' in htmlContent:
        body = Decoder.extract('property="articleBody"', ' </div>', htmlContent)
        # Strip recurring copyright/caption boilerplate snippets.
        for noise in (
                '<span class="off-screen">Image copyright</span>',
                '<span class="story-image-copyright">AFP</span>',
                '<span class="story-image-copyright">Reuters</span>',
                '<span class="off-screen">Image caption</span>',
                '<span class="off-screen">Media caption</span>'):
            body = body.replace(noise, '')
        # Remove every inline media caption, one occurrence per pass.
        while '<span class="media-caption__text">' in body:
            caption = Decoder.extractWithRegex('<span class="media-caption__text">', "</span>", body)
            body = body.replace(caption, "")
    elif 'class="text-wrapper"' in htmlContent:
        # Special content layout.
        body = Decoder.extract('class="text-wrapper"', '</p>\n', htmlContent)
        publishedDates = Decoder.extractWithRegex('<div class="date', "</div>", body)
        lastUpdate = Decoder.extractWithRegex('<p class="date ', "</p>", body)
        body = body.replace(publishedDates, "").replace(lastUpdate, "")
    elif '<figcaption class="sp-media-asset' in htmlContent:
        body = Decoder.extract('<figcaption class="sp-media-asset', '</p><div ', htmlContent)
        if '>' in body:
            body = body[body.find(">") + 1:]
    body = Decoder.removeHTML(body).replace(".", ".\n").replace(">", "")
    logger.debug("body is: " + body)
    drawNew(textContent=(body))
def getChannels(page):
    """Build the channel list for the El Mundo provider.

    page == '0' parses the last-news RSS feed into one element per <item>;
    any other value is treated as an article URL whose body text is
    extracted and returned as a single element.

    :param page: '0' for the index listing, otherwise an article URL.
    :return: list of dicts with 'title', 'link' and (optionally) 'thumbnail'.
    """
    x = []
    if page == '0':
        url = ElMundo.LAST_NEWS_RSS
        logger.debug("news rss url is: " + url)
        bruteResult = ElMundo.getContentFromUrl(url=url, launchLocation=True, ajax=False, referer=ElMundo.MAIN_URL)
        logger.debug("brute response: " + bruteResult)
        lists = common.parseDOM(bruteResult, "item")
        if len(lists) > 0:
            logger.info("counted: " + str(len(lists)))
            for item in lists:
                name = common.parseDOM(item, "title")[0].encode("utf-8")
                link = common.parseDOM(item, "link")[0].encode("utf-8")
                logger.info("Added: " + name + ", url: " + link)
                element = {}
                element["title"] = name.replace('<![CDATA[', '').replace("]]>", "")
                element["link"] = link
                try:
                    img = common.parseDOM(item, "media:content", ret="url")[0].encode("utf-8")
                    logger.debug("thumbnail is: " + img)
                    element["thumbnail"] = img
                except Exception:  # best-effort: item may carry no media:content tag
                    logger.debug("Could not be extracted any img. :'(")
                x.append(element)
    else:
        # Site serves ISO-8859-15; normalize to UTF-8 before parsing.
        html = ElMundo.getContentFromUrl(
            url=page, launchLocation=True, referer=ElMundo.MAIN_URL).decode('iso-8859-15').encode('utf8')
        startRegex = '<article class="news-item" itemscope itemtype="http://schema.org/NewsArticle">'
        body = Decoder.extract(startRegex, '<h3 class="list-header">', html)
        if 'class="comentarios ' in body:
            body = body[:body.find('class="comentarios ')]
        if '<a href="#ancla_comentarios">' in body:
            replacedBy = Decoder.extract('<a href="#ancla_comentarios">', "</a>", body)
            logger.debug("removing: " + replacedBy)
            body = body.replace(replacedBy, "")
        logger.debug("removing html: " + body)
        body = Decoder.removeHTML(body)
        if ' Twitter Facebook Enviar ' in body:
            body = body.replace(" Twitter Facebook Enviar ", "\n")
        if ":" in body:  # locate the publication time (hh:mm) and break the line after it
            index = body.find(":")
            try:
                int(body[index + 1])  # a digit after ':' means this is the time separator
            except (ValueError, IndexError):
                # Not a digit: the first ':' was not the time, move on to the next one.
                body2 = body[index + 1:]
                index += body2.find(":") + 1
            body = body[:index + 3] + "\n\n" + body[index + 3:]
        logger.debug("html has removed from body!")
        element = {}
        element["link"] = page
        element["title"] = body
        element["thumbnail"] = ''
        x.append(element)
    return x