Python cleanHTML Exemples, model.util.cleanHTML Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : ted_talks_scraper.py Projet : dco-github/xbmc-plugin.video.ted.talks

 def getTalks(self):
     """
     Yield one metadata dict per talk listed by the page's JSON data source.

     The JSON URL is built from URLTED plus the first ``DataSource("...")``
     match found in ``self.html`` and is fetched through ``self.get_HTML``.
     Every yielded dict carries the keys ``'url'``, ``'Title'`` and
     ``'Thumb'``.

     Raises IndexError when ``self.html`` has no ``DataSource("...")`` match.
     """
     # themes loaded with a json call. Why are they not more consistent?
     from simplejson import loads
     # search HTML for the link to tedtalk's "api".  It is easier to use regex here than BS.
     # Raw string so that \( reaches the regex engine as an escape, not Python.
     jsonUrl = URLTED + re.findall(r'DataSource\("(.+?)"', self.html)[0]
     # make a dict from the json formatted string from above url
     talksMarkup = loads(self.get_HTML(jsonUrl))
     # The thumbnail pattern never changes, so compile it once up front
     # instead of once per talk.
     jpgSrc = re.compile(r'.+?\.jpg')
     # parse through said dict for all the metadata
     for markup in talksMarkup['resultSet']['result']:
         talk = BeautifulSoup(markup['markup'])
         anchor = talk.dt.a
         link = URLTED + anchor['href']
         title = cleanHTML(anchor['title'])
         pic = resizeImage(talk.find('img', attrs={'src': jpgSrc})['src'])
         yield {'url': link, 'Title': title, 'Thumb': pic}

Exemple #2

0

Afficher le fichier

Fichier : ted_talks_scraper.py Projet : dco-github/xbmc-plugin.video.ted.talks

 def getTalks(self):
     """
     Generate the talks advertised by the JSON feed referenced in the page.

     Looks up the first ``DataSource("...")`` reference in ``self.html``,
     downloads it with ``self.get_HTML`` and decodes it as JSON.  For each
     entry under ``['resultSet']['result']`` a dict with the keys ``'url'``,
     ``'Title'`` and ``'Thumb'`` is yielded.

     Raises IndexError if the page holds no ``DataSource("...")`` reference.
     """
     # themes loaded with a json call. Why are they not more consistent?
     from simplejson import loads
     # search HTML for the link to tedtalk's "api".  It is easier to use regex here than BS.
     # (raw string: the backslash belongs to the regex, not to Python)
     sources = re.findall(r'DataSource\("(.+?)"', self.html)
     jsonUrl = URLTED + sources[0]
     # make a dict from the json formatted string from above url
     talksMarkup = loads(self.get_HTML(jsonUrl))
     # Loop-invariant: build the .jpg matcher a single time.
     thumbPattern = re.compile(r'.+?\.jpg')
     # parse through said dict for all the metadata
     for markup in talksMarkup['resultSet']['result']:
         talk = BeautifulSoup(markup['markup'])
         link = URLTED + talk.dt.a['href']
         title = cleanHTML(talk.dt.a['title'])
         image = talk.find('img', attrs={'src': thumbPattern})
         pic = resizeImage(image['src'])
         yield {'url': link, 'Title': title, 'Thumb': pic}

Exemple #3

0

Afficher le fichier

Fichier : ted_talks_scraper.py Projet : dco-github/xbmc-plugin.video.ted.talks

    def getNewTalks(self, url=None):
        """
        Returns 2-tuples, first value is whether this is a folder, second is attributes dict

        url -- listing page to load; when None the default
               'http://www.ted.com/talks/list/page/' is used.

        Navigation entries ('next' first, then 'previous') are yielded with
        True before any talk; each talk is yielded with False and a dict
        holding 'mode', 'url', 'Title' and 'Thumb'.
        """
        # Identity test: None is a singleton, so `is` is the correct check.
        if url is None:
            url = 'http://www.ted.com/talks/list/page/'
        html = self.getHTML(url)

        # Forward/backwards
        navItems = getNavItems(html)
        if navItems['next']:
            yield True, {'mode': 'newTalks', 'Title': self.getLS(30020), 'url': navItems['next']}
        if navItems['previous']:
            yield True, {'mode': 'newTalks', 'Title': self.getLS(30021), 'url': navItems['previous']}

        talkContainers = SoupStrainer(attrs={'class': re.compile('talkMedallion')})
        # Compile the thumbnail matcher once rather than once per talk; raw
        # string keeps \. as a regex escape.
        jpgSrc = re.compile(r'.+?\.jpg')
        for talk in BeautifulSoup(html, parseOnlyThese=talkContainers):
            link = URLTED + talk.dt.a['href']
            title = cleanHTML(talk.dt.a['title'])
            pic = resizeImage(talk.find('img', attrs={'src': jpgSrc})['src'])
            yield False, {'mode': 'playVideo', 'url': link, 'Title': title, 'Thumb': pic}

Exemple #4

0

Afficher le fichier

Fichier : ted_talks_scraper.py Projet : dco-github/xbmc-plugin.video.ted.talks

    def getNewTalks(self, url=None):
        """
        Returns 2-tuples, first value is whether this is a folder, second is attributes dict

        When ``url`` is None, 'http://www.ted.com/talks/list/page/' is
        fetched instead.  Folder entries for the 'next' and 'previous'
        navigation links (in that order, each only if present) come first,
        followed by one non-folder entry per talk found in the page.
        """
        # `is None` rather than `== None`: compare against the singleton.
        if url is None:
            url = 'http://www.ted.com/talks/list/page/'
        html = self.getHTML(url)

        # Forward/backwards
        navItems = getNavItems(html)
        # (navItems key, localized-string id) in the order they are emitted.
        for direction, labelId in (('next', 30020), ('previous', 30021)):
            if navItems[direction]:
                yield True, {
                    'mode': 'newTalks',
                    'Title': self.getLS(labelId),
                    'url': navItems[direction]
                }

        talkContainers = SoupStrainer(
            attrs={'class': re.compile('talkMedallion')})
        # Loop-invariant pattern, compiled once; raw string so the backslash
        # is passed through to the regex engine untouched.
        thumbPattern = re.compile(r'.+?\.jpg')
        for talk in BeautifulSoup(html, parseOnlyThese=talkContainers):
            anchor = talk.dt.a
            link = URLTED + anchor['href']
            title = cleanHTML(anchor['title'])
            pic = resizeImage(
                talk.find('img', attrs={'src': thumbPattern})['src'])
            yield False, {
                'mode': 'playVideo',
                'url': link,
                'Title': title,
                'Thumb': pic
            }