def get_vidlink_dailymotion(url):
    """Return the URL of the best-quality MP4 stream for a Dailymotion page.

    Scrapes the embedded ``var config = {...};`` JSON blob out of the page's
    ninth <script> tag and walks ``metadata.qualities``, keeping the highest
    numeric quality found.  Falls back to the 'auto' stream otherwise.
    """
    # BUG FIX: the original did `re.compile("//").match(url, 0) > -1`, which
    # treats re.match like str.find.  re.match returns a Match object or None,
    # never an int; the comparison only "worked" through Python 2's arbitrary
    # mixed-type ordering.  A plain startswith check states the intent.
    if url.startswith('//'):
        # Protocol-relative URL: add an explicit scheme.
        url = "http:" + url
    html = getHTML(url)
    soup = BeautifulSoup(html)
    scripts = soup.findAll('script')
    # The player config lives in the 9th <script> tag as a JSON object literal.
    matchconfig = re.compile('var config = (\{.+?\})\;').findall(
        scripts[8].contents[0])
    parsed_json = json.loads(matchconfig[0])
    # Default to the adaptive ('auto') stream, then look for better qualities.
    fileurl = parsed_json['metadata']['qualities']['auto'][0]['url']
    lastquality = 0
    for q in parsed_json['metadata']['qualities']:
        if q == 'auto':
            continue
        if int(lastquality) > int(q):
            continue
        # Prefer the second variant of each quality, fall back to the first.
        try:
            fileurl = parsed_json['metadata']['qualities'][q][1]['url']
        except (KeyError, IndexError, TypeError):
            try:
                fileurl = parsed_json['metadata']['qualities'][q][0]['url']
            except (KeyError, IndexError, TypeError):
                continue
        lastquality = int(q)
    return fileurl
def listPage(url):
    # List every video post on one index page, then queue the "next page"
    # pagination entry.
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    for videobox in soup.findAll('div', 'videobox'):
        thumb = videobox.find('img', 'thumbnail')['src']
        try:
            title = videobox.find('a', 'title').contents
            title = title[0].encode("utf-8")
        except:
            title = "No title"
        # Derive the MP4 URL from the thumbnail path:
        # ".../jpg-s/<id>_N.jpg" -> ".../mp4/<id>.mp4?start=0"
        RE_ID = 'jpg-s/(\d*)_\d.jpg'
        RE_ID_obj = re.compile(RE_ID, re.IGNORECASE)
        url = RE_ID_obj.sub(r"mp4/\g<1>.mp4?start=0", thumb)
        listitem=xbmcgui.ListItem(title, iconImage="DefaultFolder.png", thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={ "Title": title })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    # Pagination: pick the nav link whose text contains 'sta'
    # (presumably the localized "next" label — TODO confirm).
    nav_page = soup.find('div', 'nav_page')
    for next in nav_page.findAll('a'):
        line = next.contents
        line = line[0].encode("utf-8")
        if 'sta' in line:
            url = next['href']
            url = BASE_SITE_URL + url
            addPosts(__language__(30000), urllib.quote_plus(url))
    return
def getVideoDownloadLink(self, url):
    """Return the video title and download link"""
    title = None
    link = None
    downloadPage = ''
    html = getHTML(url)
    soup = BeautifulSoup(html)
    # Look for the "bouton-telecharger" class (new version)
    telecharger = soup.find('a', attrs = {'class':'bouton-telecharger'})
    if telecharger:
        downloadPage = telecharger['href']
    else:
        # Look for the "bouton-telecharger" image (old version)
        img = soup.find('img', attrs = {'src':'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'})
        if img:
            downloadPage = img.findParent()['href']
    if downloadPage.endswith('.avi'):
        # The download page hides the real URL behind a "cliquer ici" anchor.
        print downloadPage
        title = downloadPage.split('/')[-1]
        print title
        html = getHTML(downloadPage)
        soup = BeautifulSoup(html)
        click = soup.find(text=re.compile('cliquer ici'))
        if click:
            link = click.findParent()['href']
            print link
        else:
            print "No \"cliquer ici\" found"
    else:
        print "bouton-telecharger not found"
    # Either value may still be None if the page layout was not recognized.
    return {'Title':title, 'url':link}
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = self.fetcher.getHTML(url) url = "" soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id':'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id':'tagline'}).string #get url #detectors for link to video in order of preference linkDetectors = [ lambda l: re.compile('High-res video \(MP4\)').match(str(l.string)), lambda l: re.compile('http://download.ted.com/talks/.+.mp4').match(str(l['href'])), ] for link in soup.findAll('a', href=True): for detector in linkDetectors: if detector(link): url = link['href'] linkDetectors = linkDetectors[:linkDetectors.index(detector)] # Only look for better matches than what we have break if url == "": # look for utub link utublinks = re.compile('http://(?:www.)?youtube.com/v/([^\&]*)\&').findall(html) for link in utublinks: url = 'plugin://plugin.video.youtube/?action=play_video&videoid=%s' %(link) #get id from url id = url.split('/')[-1] return {'Title':title, 'Director':speaker, 'Genre':'TED', 'Plot':plot, 'PlotOutline':plot, 'id':id, 'url':url}
def addSeasonList( self ):
    # Build the season directory for a show.  When the 'flat_season' setting
    # asks for it (or there is only one season), episodes are listed directly.
    tree=MinimalSoup(common.getHTML(common.args.url))
    seasons=tree.findAll('td', attrs={"class":re.compile('^vex')})
    #flatten seasons by settings
    if common.settings['flat_season'] == 1 or (len(seasons) == 1 and common.settings['flat_season'] == 0):
        common.args.mode='TV_Episodes'
        seasonNums=[]
        for season in seasons:
            common.args.name = season.contents[0]
            seasonNums.append(season.contents[0])
            self.addEpisodeList( )
        #add clips folder
        rss=tree.findAll('a', attrs={'class':'rss-link'})
        clipRSS = None
        for feed in rss:
            if feed['href'].split('/')[-1]=='clips':
                clipRSS = feed['href']
        if clipRSS != None:
            common.addDirectory(xbmc.getLocalizedString(30095), clipRSS, "TV_Clips")
        xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))
    else:
        #add one folder for each season
        for season in seasons:
            name=season.contents[0]
            # Season URL is embedded in the cell's onclick handler.
            p=re.compile('"(http://.+?)"')
            url=p.findall(season['onclick'])
            url=url[0].replace('&','&')
            # NOTE(review): the directory is added with common.args.url, not
            # the onclick-extracted url above — confirm this is intended.
            ok=common.addDirectory(name, common.args.url, "TV_Episodes")
        #add clips folder
        rss=tree.findAll('a', attrs={'class':'rss-link'})
        for feed in rss:
            if feed['href'].split('/')[-1]=='clips':
                clipRSS = feed['href']
                common.addDirectory(xbmc.getLocalizedString(30095), clipRSS, "TV_Clips")
        xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))
def listPage(url):
    # List videos on a pinkbike index page, plus a "Next page..." entry when
    # we are not yet on the last page.
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    currentPage = soup.find('li', 'current-page').a['href']
    nextPage = soup.find('li', 'next-page').a['href']
    # The last page link is the <li> immediately before "next-page".
    maxPage = soup.find('li', 'next-page').findPrevious('li').a['href']
    for inItem in soup.findAll('div', 'inItem'):
        try:
            title = inItem.findAll('a')[1].contents[0].replace('&','&')
        except:
            title = "No title"
        link = inItem.find('a')['href']
        re_pinkbike = 'video/(\d+)/'
        id = re.findall(re_pinkbike, link)[0]
        id = int(id)
        # Videos are sharded into directories by id/10000.
        partId = int(math.fabs(id/10000))
        url = 'http://lv1.pinkbike.org/vf/' + str(partId) + '/pbvid-' + str(id) + '.mp4'
        thumb = inItem.find('img', 'thimg')['src']
        time = inItem.find('span', 'fblack').contents[0]
        plot = inItem.find('p', 'uFullInfo f10 fgrey3').contents[0].strip()
        listitem=xbmcgui.ListItem(title, iconImage="DefaultFolder.png", thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={ "Title": title, "Plot" : plot, "Duration" : time })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    if currentPage != maxPage:
        item=xbmcgui.ListItem('Next page...', iconImage="DefaultFolder.png")
        xurl = sys.argv[0] + '?' + "next=true" + "&url=" + urllib.quote_plus(nextPage.replace('&','&'))
        item.setInfo(type="Video", infoLabels={ "Title": ""})
        item.setPath(xurl)
        folder = True
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=xurl, listitem=item, isFolder=folder)
    return
def firstPage(): html = getHTML(urllib.unquote_plus(BASE_URL)) # https://bugs.launchpad.net/beautifulsoup/+bug/838022 BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table'] soup = BeautifulSoup(html) # Favorites for links in soup.findAll('a','iconlink'): try: title = links.contents[0] except: title = "No title" try: link = links['href'] except: link = None if link and title and not "img" in str(title): addPosts(('Most faved ' + str(title)), urllib.quote_plus(link.replace('&','&'))) # Topics for table in soup.findAll('table'): for line in table.findAll('tr'): try: title = line.find('a').contents[0] except: title = None try: link = line.find('a')['href'] except: link = None if title and link: if BASE_URL in link: addPosts(str(title), urllib.quote_plus(link.replace('&','&'))) # Search addPosts('Search..', '&search=True') return
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = getHTML(url) soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id': 'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id': 'tagline'}).string #get url for link in soup.findAll('a'): if re.match('Watch.*high-res', str(link.string)): url = URLTED + link['href'] #get id from url id = url.split('/')[-1] return { 'Title': title, 'Director': speaker, 'Genre': 'TED', 'Plot': plot, 'PlotOutline': plot, 'id': id, 'url': url }
def http_response(self, request, response):
    """Mechanize response hook: prettify HTML bodies before returning them."""
    # Make sure the body can be re-read after we consume it here.
    if not hasattr(response, "seek"):
        response = mechanize.response_seek_wrapper(response)
    headers = response.info().dict
    # Only run the body through the soup when it is actually HTML.
    if headers.has_key('content-type'):
        if 'html' in headers['content-type']:
            cleaned = MinimalSoup(response.get_data())
            response.set_data(cleaned.prettify())
    return response
def addShowsList( self ):
    # Build the Hulu shows directory.  Optionally fetches plot/genre for each
    # show, and recurses once to cover both clips and full episodes.
    xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_LABEL)
    xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_GENRE)
    html=common.getHTML(common.args.url)
    tree=MinimalSoup(html)
    shows=tree.findAll('a', attrs={"class":"show-thumb info_hover"})
    del html
    del tree
    # with clips
    for show in shows:
        name = show.contents[0].replace('"','"').replace('&','&')
        url = show['href']
        # Third path segment is the show's slug, used to build asset URLs.
        tmp = show['href'].split('/')[3]
        art = "http://assets.hulu.com/shows/key_art_"+tmp.replace('-','_')+".jpg"
        #thumb = "http://assets.hulu.com/shows/show_thumbnail_"+tmp.replace('-','_')+".jpg"
        #icon = "http://assets.hulu.com/shows/show_thumbnail_"+tmp.replace('-','_')+".jpg"
        #Use higher res fanart (key_art) instead of lower res thumbs & icons
        thumb = art
        icon = art
        if common.settings['get_show_plot'] == True:
            json = common.getHTML("http://www.hulu.com/shows/info/"+tmp)
            try:
                #this needs better regex, or maybe some sort of json parser
                p = re.compile('description: "(.+?)"[,}]')
                match = p.findall(json)
                plot = match[0].replace('\\','')
            except:
                plot=xbmc.getLocalizedString(30090)
            try:
                p = re.compile('channel: "(.+?)"[,}]')
                match = p.findall(json)
                genre = match[0]
            except:
                genre=xbmc.getLocalizedString(30090)
            #hopefully deleting this will help with xbox memory problems
            del json
        else:
            plot=genre=xbmc.getLocalizedString(30090)
        try:
            # Shows without the full-episode marker are tagged as clips-only.
            if show.parent['class'] != "full-episode-icon":
                name += ' '+xbmc.getLocalizedString(30091)
                genre += ' '+xbmc.getLocalizedString(30091)
            elif common.args.url != common.BASE_TV_URL:
                common.addDirectory(name, url, "TV_Seasons", art, icon, art, plot, genre)
        except:
            name += ' '+xbmc.getLocalizedString(30091)
            genre += ' '+xbmc.getLocalizedString(30091)
            if common.settings['only_full_episodes'] == False:
                common.addDirectory(name, url, "TV_Seasons", art, icon, art, plot, genre)
    #if we're doing both clips & full episodes, we need to run through the function again.
    if common.args.url == common.BASE_TV_URL :
        common.args.url = common.BASE_FULLTV_URL
        self.addShowsList()
    xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))
def getallpages(url, useragent, referer):
    """Return the given page URL plus every pagination link in its <center> block."""
    page_url = urllib.unquote_plus(str(url)).replace(' ', '%20')
    markup = getHTML(page_url, useragent, referer)
    document = BeautifulSoup(markup)
    pager = document.find('center')
    # Start with the page itself, then append each linked part in order.
    collected = [url]
    for anchor in pager.findAll('a'):
        collected.append(anchor['href'])
    return collected
def firstPage(url): html = getHTML(urllib.unquote_plus(url)) # https://bugs.launchpad.net/beautifulsoup/+bug/838022 BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table'] soup = BeautifulSoup(html) thumbs = soup.findAll('div', 'thumb') lcount = 0 # Items for links in soup.findAll('h2', 'post-title entry-title'): script = thumbs[lcount].find('script') try: thumbnail_container = script.contents[0] except: thumbnail = "DefaultFolder.png" try: tmatch = re.compile( 'document.write\(bp_thumbnail_resize\(\"(.+?)\",').findall( thumbnail_container) except: thumbnail = "DefaultFolder.png" try: thumbnail = tmatch[0] except: thumbnail = "DefaultFolder.png" lcount = lcount + 1 for line in links.findAll('a'): try: title = links.find('a').contents[0].strip() except: title = "No title" try: link = links.find('a')['href'] except: link = None if title and link: if BASE_URL in link: addPosts(str(title), urllib.quote_plus(link.replace('&', '&')), thumbnail, 0) olderlinks = soup.find('a', 'blog-pager-older-link') try: title = olderlinks.contents[0] except: title = "Mga Lumang mga Post" try: link = olderlinks.attrs[1][1] except: link = None if title and link: addPosts(str(title), urllib.quote_plus(link.replace('&', '&')), "DefaultFolder.png", 1) return
def http_response(self, request, response):
    """Pretty-print HTML responses in place before mechanize hands them on."""
    if not hasattr(response, "seek"):
        # Wrap so the data can be re-read after being consumed here.
        response = mechanize.response_seek_wrapper(response)
    info = response.info().dict
    is_html = info.has_key('content-type') and ('html' in info['content-type'])
    if is_html:
        response.set_data(MinimalSoup(response.get_data()).prettify())
    return response
def getPrograms(self):
    """Yield every program in self.html as {'url', 'Title', 'Thumb'} dicts."""
    # Couldn't parse properly the file using "'div', {'class':'bloc-contenu-8'}"
    # BeautifulSoup returns nothing in that class
    # So use 'contenu-descr-8 ' and find previous tag
    soup = BeautifulSoup(cleanHTML(self.html))
    for media in soup.findAll('div', {'class':'contenu-descr-8 '}):
        aTag = media.findPrevious('a')
        # Get link, title and thumb
        mediaLink = URLASI + aTag['href']
        mediaTitle = aTag['title'].encode('utf-8')
        # BUG FIX: '[png|jpg]' was a character class matching a single letter
        # from {p,n,g,|,j}, not an alternation.  Use a non-capturing group to
        # actually require a .png or .jpg extension.
        mediaThumb = URLASI + aTag.find(
            'img', attrs={'src': re.compile('.+?\.(?:png|jpg)')})['src']
        yield {'url':mediaLink, 'Title':mediaTitle, 'Thumb':mediaThumb}
def isLoggedIn(self, username):
    """Return True if @username is already logged in, False otherwise"""
    html = getHTML(URLMONCOMPTE)
    soup = BeautifulSoup(html)
    # The account page title tells us whether any session is active.
    if soup.title.string == u'Arrêt sur images – Mon compte':
        # Already logged in, check that the username is still the same
        userText = soup.find(text=re.compile(u'L’e-mail que vous utilisez pour @si est.*'))
        if userText and userText.next.string == username:
            return True
        else:
            print "Already logged in, but username does not match..."
    return False
def isLoggedIn(self, username):
    """Return True if @username is already logged in, False otherwise"""
    html = getHTML(URLMONCOMPTE)
    soup = BeautifulSoup(html)
    # The account page title tells us whether any session is active.
    if soup.title.string == u'Arrêt sur images – Mon compte':
        # Already logged in, check that the username is still the same
        userText = soup.find(
            text=re.compile(u'L’e-mail que vous utilisez pour @si est.*'))
        if userText and userText.next.string == username:
            return True
        else:
            print "Already logged in, but username does not match..."
    return False
def getTalks(self):
    """Yield {'url', 'Title', 'Thumb'} for each talk listed in self.html."""
    # themes loaded with a json call. Why are they not more consistant?
    from simplejson import loads
    # search HTML for the link to tedtalk's "api". It is easier to use regex here than BS.
    jsonUrl = URLTED+re.findall('DataSource\("(.+?)"', self.html)[0]
    # make a dict from the json formatted string from above url
    talksMarkup = loads(getHTML(jsonUrl))
    # parse through said dict for all the metadata; each result carries an
    # HTML fragment in 'markup' that we soup individually.
    for markup in talksMarkup['resultSet']['result']:
        talk = BeautifulSoup(markup['markup'])
        link = URLTED+talk.dt.a['href']
        title = cleanHTML(talk.dt.a['title'])
        pic = resizeImage(talk.find('img', attrs = {'src':re.compile('.+?\.jpg')})['src'])
        yield {'url':link, 'Title':title, 'Thumb':pic}
def getTalks(self):
    """Yield {'url', 'Title', 'Thumb'} for each talk listed in self.html."""
    # themes loaded with a json call. Why are they not more consistant?
    from simplejson import loads
    # search HTML for the link to tedtalk's "api". It is easier to use regex here than BS.
    jsonUrl = URLTED+re.findall('DataSource\("(.+?)"', self.html)[0]
    # make a dict from the json formatted string from above url
    talksMarkup = loads(getHTML(jsonUrl))
    # parse through said dict for all the metadata; each result carries an
    # HTML fragment in 'markup' that we soup individually.
    for markup in talksMarkup['resultSet']['result']:
        talk = BeautifulSoup(markup['markup'])
        link = URLTED+talk.dt.a['href']
        title = cleanHTML(talk.dt.a['title'])
        pic = resizeImage(talk.find('img', attrs = {'src':re.compile('.+?\.jpg')})['src'])
        yield {'url':link, 'Title':title, 'Thumb':pic}
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = self.getHTML(url) url = "" soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id': 'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id': 'tagline'}).string #get url #detectors for link to video in order of preference linkDetectors = [ lambda l: re.compile('High-res video \(MP4\)').match(str(l.string) ), lambda l: re.compile('http://download.ted.com/talks/.+.mp4').match( str(l['href'])), ] for link in soup.findAll('a', href=True): for detector in linkDetectors: if detector(link): url = link['href'] linkDetectors = linkDetectors[:linkDetectors.index( detector )] # Only look for better matches than what we have break if url == "": # look for utub link utublinks = re.compile( 'http://(?:www.)?youtube.com/v/([^\&]*)\&').findall(html) for link in utublinks: url = 'plugin://plugin.video.youtube/?action=play_video&videoid=%s' % ( link) #get id from url id = url.split('/')[-1] return { 'Title': title, 'Director': speaker, 'Genre': 'TED', 'Plot': plot, 'PlotOutline': plot, 'id': id, 'url': url }
def listPage_teleserye(url,useragent,referer):
    # Resolve every part of a multi-page episode into a direct video link,
    # then list a "Play All Parts" item plus one item per part.
    # Returns the list of resolved links.
    #notify("yes")
    allurls = getallpages(url,useragent,referer)
    links = []
    for p in allurls:
        html = getHTML(urllib.unquote_plus(p),useragent,'')
        soup = BeautifulSoup(html)
        # Items
        thumbnail_meta = soup.find('meta', attrs={'property': 'og:image'})
        try:
            thumbnail = thumbnail_meta['content']
        except:
            thumbnail = "DefaultFolder.png"
        title_tag = soup.find('title')
        try:
            title = title_tag.contents[0]
        except:
            title = "no title"
        # Each page embeds its player in the first <iframe>.
        conurl = soup.find('iframe')['src']
        #turl = get_vidlink_disklinksharetvplay(conurl,useragent,referer)
        turl = get_vidlink(conurl,useragent,referer)
        links.append(str(turl))
    # NOTE(review): title/thumbnail below carry the values from the LAST page
    # of the loop above.
    if (len(links) > 0):
        durl = build_url({'url': links, 'mode': 'playAllVideos', 'foldername': title, 'thumbnail': thumbnail, 'title': title})
        itemname = 'Play All Parts'
        li = xbmcgui.ListItem(itemname, iconImage=thumbnail)
        li.setInfo(type="Video",infoLabels={"Title": title, "Plot" : "All parts of" + title})
        li.setProperty('fanart_image', thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=durl, listitem=li)
    hcnt = 0
    for l in links:
        partcnt = hcnt + 1
        ititle = "Part " + str(partcnt)
        url = links[hcnt]
        thumb = thumbnail
        plot = ititle + ' of ' + title
        listitem=xbmcgui.ListItem(ititle, iconImage=thumb, thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={ "Title": title, "Plot" : plot })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        listitem.setProperty("fanart_image", thumb)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
        hcnt = hcnt + 1
    return(links)
def getPrograms(self):
    """Yield every program in self.html as {'url', 'Title', 'Thumb'} dicts."""
    # Couldn't parse properly the file using "'div', {'class':'bloc-contenu-8'}"
    # BeautifulSoup returns nothing in that class
    # So use 'contenu-descr-8 ' and find previous tag
    soup = BeautifulSoup(cleanHTML(self.html))
    for media in soup.findAll('div', {'class': 'contenu-descr-8 '}):
        aTag = media.findPrevious('a')
        # Get link, title and thumb
        mediaLink = URLASI + aTag['href']
        mediaTitle = aTag['title'].encode('utf-8')
        # BUG FIX: '[png|jpg]' was a character class matching a single letter
        # from {p,n,g,|,j}, not an alternation.  Use a non-capturing group to
        # actually require a .png or .jpg extension.
        mediaThumb = URLASI + aTag.find(
            'img', attrs={'src': re.compile('.+?\.(?:png|jpg)')})['src']
        yield {
            'url': mediaLink,
            'Title': mediaTitle,
            'Thumb': mediaThumb
        }
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = getHTML(url) soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id':'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id':'tagline'}).string #get url for link in soup.findAll('a'): if re.match('Watch.*high-res' , str(link.string)): url = URLTED+link['href'] #get id from url id = url.split('/')[-1] return {'Title':title, 'Director':speaker, 'Genre':'TED', 'Plot':plot, 'PlotOutline':plot, 'id':id, 'url':url}
def addEpisodeList( self ):
    # List the episodes of the currently-selected season by reading the
    # show's "episodes" RSS feed.
    #initialize variables
    p=re.compile('(\d+)')#gets last number from "season ##"
    currentSeason=p.findall(common.args.name)[0]
    epRSS=None
    #parse html tree
    tree=MinimalSoup(common.getHTML(common.args.url))
    rss=tree.findAll('a', attrs={'class':'rss-link'})
    for feed in rss:
        if feed['href'].split('/')[-1]=='episodes':
            tree=BeautifulStoneSoup(common.getHTML(feed['href']))
            items=tree.findAll('item')
            for episode in items:
                # Item titles look like "Name (s##|...|e##)".
                p=re.compile('\(s([0-9]*).+?\|.+?e([0-9]*)\)')
                match=p.findall(episode.title.contents[0])[0]
                seasonNum = match[0]
                episodeNum = match[1]
                if seasonNum == currentSeason:
                    #add this episode to list
                    name = episode.title.contents[0].split('(')[0]
                    # Zero-pad to build a sortable s##e## prefix.
                    if len(seasonNum)<2:seasonNum='0'+seasonNum
                    if len(episodeNum)<2:episodeNum='0'+episodeNum
                    name = 's'+seasonNum+'e'+episodeNum+' '+name
                    url = episode.link.contents[0].split('#')[0]
                    try:
                        thumb = episode.findAll('media:thumbnail')[0]['url']
                    except:
                        thumb = ''
                    try:
                        airdate = episode.pubdate.contents[0]
                    except:
                        airdate = ''
                    try:
                        # Plot is the first <p> of the description; duration
                        # ("MM:SS") is converted to seconds, defaulting to 1.
                        p=re.compile('<p>(.+?)</p>.+?Added: ')
                        plot =''.join(p.findall(str(episode.findAll('description'))))
                        try:
                            p=re.compile('Duration: (.+?)\n')
                            duration=p.findall(plot)[0].split(':')
                            duration=(int(duration[0])*60)+int(duration[1])
                        except:
                            duration=1
                    except:
                        plot = ''
                    common.addDirectory(name,url,'TV_play', thumb, thumb, common.args.fanart, plot, 'genre')
def get_vidlink_disklinksharetvplay(url,useragent,referer):
    # Resolve a share-player page to a playable local .m3u8: fetch the page's
    # <source> playlist, rewrite relative segment URIs to absolute ones, and
    # save the result to a temp file whose path is returned.
    #notify(referer)
    html = getHTML(urllib.unquote_plus(url),useragent,referer)
    soup = BeautifulSoup(str(html))
    #randomname = random.randint(1,100000001)
    #xfile = 'special://temp/' + str(randomname) + '.txt'
    #x = xbmcvfs.File(xfile, 'w')
    #wres = x.write(str(soup))
    #x.close()
    vidlink = soup.find('source')['src']
    filehtml = getHTML(vidlink,useragent,referer)
    # Base URL of the playlist: strip the final path component.
    xserver = re.sub('(/)[a-zA-Z0-9\.\-]+$', r'\1', vidlink, flags=re.DOTALL)
    # Prefix every non-comment line (segments don't start with '#') with it.
    newm3u8 = re.sub(r'\n([^#])', '\n' + xserver + r'/\1', filehtml, flags=re.DOTALL)
    randomname = random.randint(1,100000001)
    tfile = 'special://temp/' + str(randomname) + '.m3u8'
    f = xbmcvfs.File(tfile, 'w')
    wres = f.write(str(newm3u8))
    f.close()
    #return(newm3u8)
    return(tfile)
def firstPage(url): html = getHTML(urllib.unquote_plus(url)) # https://bugs.launchpad.net/beautifulsoup/+bug/838022 BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table'] soup = BeautifulSoup(html) for article in soup.findAll('article', 'latestPost excerpt layout-1'): h2 = article.find('h2', 'title front-view-title') try: title = h2.find('a')['title'] except: title = "No title" try: link = h2.find('a')['href'] except: link = None try: div = article.find('div', 'featured-thumbnail') try: thumbnail = div.find('img')['data-layzr'] except: thumbnail = "DefaultFolder.png" except: div = None thumbnail = "DefaultFolder.png" if title and link: if BASE_URL in link: addPosts(title, link, thumbnail, 0) # Mga lumang mga post olderlinks = soup.find('a', 'next page-numbers') title = "Next Page" try: link = olderlinks.attrs[1][1] except: link = None if title and link: addPosts(str(title), urllib.quote_plus(link.replace('&', '&')), "DefaultFolder.png", 1) return
def getfirstPage_teleserye(url,useragent,referer):
    # Scrape the teleserye front page and return two dicts keyed by article
    # id (first URL path segment): {id: title} and {id: link}.
    tlinks = {}
    llinks = {}
    html = getHTML(urllib.unquote_plus(str(url)).replace(' ','%20'),useragent,referer)
    #BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(str(html))
    for article in soup.findAll('div','cat-hadding'):
        try:
            title = article.find('a')['title']
        except:
            title = "No title"
        try:
            link = article.find('a')['href']
        except:
            link = None
        try:
            thumbnail = article.find('img')['data-layzr']
        except:
            thumbnail = "DefaultFolder.png"
        if title and link:
            # addPosts(title, link, thumbnail, 0)
            # Use the first path segment of the article URL as its id.
            match_url = re.compile('http://www.teleserye.su/([^/]+?)/.+?$').findall(link)
            articleid = match_url[0]
            #alinks[title] = link
            #ilinks[title] = articleid
            tlinks[articleid] = title
            llinks[articleid] = link
    # Pagination link is located but not currently used (see commented code).
    olderlinks = soup.find('a', 'blog-pager-older-link')
    try:
        title = olderlinks.contents[0]
    except:
        title = "Older Posts"
    try:
        link = olderlinks.attrs[1][1]
    except:
        link = None
    #if title and link:
    #addPosts(str(title), urllib.quote_plus(link.replace('&','&')), "DefaultFolder.png", 1)
    return(tlinks,llinks)
def getProgramParts(self, url, name, icon):
    """Return all parts of a program (video id)
    video id allows to get video url with a json request"""
    html = getHTML(url)
    soup = BeautifulSoup(html)
    parts = []
    part = 1
    # Get all movie id
    for param in soup.findAll('param', attrs={'name': 'movie'}):
        try:
            videoId = param.parent["id"]
        except KeyError:
            continue
        title = name + ' - Acte %d' % part
        # Try to get the icon linked to the iPhone video on that page
        # That's faster than getting it from the json request (see getVideoDetails),
        # which would require one extra HTML request for each part
        try:
            media = param.parent.parent.find(text=re.compile(u'img src='))
            match = re.search(u'img src="(.*?)"', media)
            thumb = URLASI + match.group(1)
        except (TypeError, AttributeError):
            thumb = icon
        parts.append({'url': videoId, 'Title': title, 'Thumb': thumb})
        part += 1
    if u'ux sources' in soup.title.string and part == 3:
        # '@ux sources' is not cut in parts but getting the title is not
        # easy as it's not in a field linked to the video
        # Use a hack: since 20111110, "version intégrale" is first
        if re.search('Voici la version intégrale', html):
            parts[0]['Title'] = name + u' - intégrale'.encode('utf-8')
            parts[1]['Title'] = name + u' - aperçu'.encode('utf-8')
        else:
            # Before 20111104, the short video (version montée) was first
            parts[0]['Title'] = name + u' - montée'.encode('utf-8')
            parts[1]['Title'] = name + u' - intégrale'.encode('utf-8')
    return parts
def getProgramParts(self, url, name, icon):
    """Return all parts of a program (video id)
    video id allows to get video url with a json request"""
    html = getHTML(url)
    soup = BeautifulSoup(html)
    parts = []
    part = 1
    # Get all movie id
    for param in soup.findAll('param', attrs = {'name':'movie'}):
        try:
            videoId = param.parent["id"]
        except KeyError:
            continue
        title = name + ' - Acte %d' % part
        # Try to get the icon linked to the iPhone video on that page
        # That's faster than getting it from the json request (see getVideoDetails),
        # which would require one extra HTML request for each part
        try:
            media = param.parent.parent.find(text=re.compile(u'img src='))
            match = re.search(u'img src="(.*?)"', media)
            thumb = URLASI + match.group(1)
        except (TypeError, AttributeError):
            thumb = icon
        parts.append({'url':videoId, 'Title':title, 'Thumb':thumb})
        part += 1
    if u'ux sources' in soup.title.string and part == 3:
        # '@ux sources' is not cut in parts but getting the title is not
        # easy as it's not in a field linked to the video
        # Use a hack: since 20111110, "version intégrale" is first
        if re.search('Voici la version intégrale', html):
            parts[0]['Title'] = name + u' - intégrale'.encode('utf-8')
            parts[1]['Title'] = name + u' - aperçu'.encode('utf-8')
        else:
            # Before 20111104, the short video (version montée) was first
            parts[0]['Title'] = name + u' - montée'.encode('utf-8')
            parts[1]['Title'] = name + u' - intégrale'.encode('utf-8')
    return parts
def getVideoDownloadLink(self, url):
    """Return the video title and download link"""
    title = None
    link = None
    downloadPage = ''
    html = getHTML(url)
    soup = BeautifulSoup(html)
    # Look for the "bouton-telecharger" class (new version)
    telecharger = soup.find('a', attrs={'class': 'bouton-telecharger'})
    if telecharger:
        downloadPage = telecharger['href']
    else:
        # Look for the "bouton-telecharger" image (old version)
        img = soup.find(
            'img',
            attrs={
                'src': 'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'
            })
        if img:
            downloadPage = img.findParent()['href']
    if downloadPage.endswith('.avi'):
        # The download page hides the real URL behind a "cliquer ici" anchor.
        print downloadPage
        title = downloadPage.split('/')[-1]
        print title
        html = getHTML(downloadPage)
        soup = BeautifulSoup(html)
        click = soup.find(text=re.compile('cliquer ici'))
        if click:
            link = click.findParent()['href']
            print link
        else:
            print "No \"cliquer ici\" found"
    else:
        print "bouton-telecharger not found"
    # Either value may still be None if the page layout was not recognized.
    return {'Title': title, 'url': link}
from os.path import exists, getsize, dirname, join from urllib import urlretrieve, urlencode, quote from sys import argv import demjson import zlib folder = dirname(argv[0]) cache = Cache(debug=False) pages = [] for index in range(1,11): index = cache.get("http://www.escapistmagazine.com/videos/view/zero-punctuation?page=%d"%index, max_age=60*60*2).read() index = index.replace("''>","'>") index = BeautifulSoup(index) for link in index.findAll("a"): if not link.has_key("href"): continue if link["href"].find("http://www.escapistmagazine.com/videos/view/zero-punctuation/")!=-1: short_href = link["href"] slash = short_href.rfind("/") if short_href[slash:].find("-")!=-1: short_href = short_href[slash+1:slash+short_href[slash:].find("-")] else: short_href = short_href[slash+1:] assert len(short_href)>0, link["href"] if short_href not in pages:
ur = urllib.urlopen(index_page_url) fp = open(output_filename, 'w') fp.write(ur.read()) fp.close() ur.close() for year in range(1999, currentyear + 1): year_index_filename = output_directory + str(year) + ".html" if not os.path.exists(year_index_filename): raise Exception, "Missing the year index: '%s'" % year_index_filename fp = open(year_index_filename) html = fp.read() fp.close() soup = MinimalSoup(html) link_tags = soup.findAll('a') contents_pages = set() daily_pages = set() contents_hash = {} for t in link_tags: if t.has_key('href'): m = re.search('(^|/)(bb-[0-9]+/.*)$', t['href']) if m: page = m.group(2) subdir, leaf = page.split("/")
def get_vidlink_linksharetv(url, useragent, referer):
    """Return the direct video URL from a linkshare player page.

    Fetches the page and reads the src attribute of its first <source> tag.
    """
    decoded = urllib.unquote_plus(url)
    page = getHTML(decoded, useragent, referer)
    document = BeautifulSoup(str(page))
    source_tag = document.find('source')
    return source_tag['src']
def listPage(url):
    # List all parts of a multi-part video page (the 'tabs-1' section pairs
    # one <h3> heading with one <iframe> per part), adding a "Play All Parts"
    # entry when more than one part exists.
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    links = []
    # Items
    thumbnail_meta = soup.find('meta', attrs={'property': 'og:image'})
    try:
        thumbnail = thumbnail_meta['content']
    except:
        thumbnail = "DefaultFolder.png"
    title_tag = soup.find('title')
    try:
        title = title_tag.contents[0]
    except:
        title = "no title"
    tab1 = soup.find('div', attrs={'id': 'tabs-1'})
    headings = tab1.findAll('h3')
    iframes = tab1.findAll('iframe')
    hcnt = 0
    for heading in headings:
        # Resolve the iframe at the same index as this heading.
        lurl = tab1.findAll('iframe')[hcnt]['src']
        url = get_vidlink(lurl)
        links.append(str(url))
        hcnt = hcnt + 1
    if (len(links) > 1):
        durl = build_url({
            'url': links,
            'mode': 'playAllVideos',
            'foldername': title,
            'thumbnail': thumbnail,
            'title': title
        })
        itemname = 'Play All Parts'
        li = xbmcgui.ListItem(itemname, iconImage=thumbnail)
        li.setInfo(type="Video", infoLabels={
            "Title": title,
            "Plot": "All parts of" + title
        })
        li.setProperty('fanart_image', thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=durl, listitem=li)
    hcnt = 0
    for heading in headings:
        ititle = heading.contents[0]
        url = links[hcnt]
        thumb = thumbnail
        plot = ititle + ' of ' + title
        listitem = xbmcgui.ListItem(ititle, iconImage=thumb, thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={
            "Title": title,
            "Plot": plot
        })
        listitem.setProperty("IsPlayable", "true")
        listitem.setProperty("fanart_image", thumb)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
        hcnt = hcnt + 1
    return
else: filename_year = int('20' + two_digit_year, 10) try: date_from_filename = datetime.date(filename_year, filename_month, filename_day) except ValueError: date_from_filename = None if verbose: print "Date in filename %s-%s-%s" % (filename_year, filename_month, filename_day) # Don't soup it if we don't have to: if date_from_filename and date_from_filename < all_after_date: continue day_soup = MinimalSoup(day_html) day_body = day_soup.find('body') if day_body: page_as_text = non_tag_data_in(day_body) else: error = "File couldn't be parsed by MinimalSoup: " + day_filename raise Exception, error # Now guess the date from the file contents as well: m = re.search( '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?', page_as_text) if not m: m = re.search( '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',
filename_day = int(two_digit_day,10) if two_digit_year == '99': filename_year = 1999 else: filename_year = int('20'+two_digit_year,10) try: date_from_filename = datetime.date(filename_year,filename_month,filename_day) except ValueError: date_from_filename = None if verbose: print "Date in filename %s-%s-%s" % ( filename_year, filename_month, filename_day ) # Don't soup it if we don't have to: if date_from_filename and date_from_filename < all_after_date: continue day_soup = MinimalSoup(day_html) day_body = day_soup.find('body') if day_body: page_as_text = non_tag_data_in(day_body) else: error = "File couldn't be parsed by MinimalSoup: "+day_filename raise Exception, error # Now guess the date from the file contents as well: m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text) if not m: m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text) if m: day_of_week = m.group(2) day = m.group(3)
ur = urllib.urlopen(index_page_url) fp = open(output_filename, 'w') fp.write(ur.read()) fp.close() ur.close() for year in range(1999,currentyear+1): year_index_filename = output_directory + str(year) + ".html" if not os.path.exists(year_index_filename): raise Exception, "Missing the year index: '%s'" % year_index_filename fp = open(year_index_filename) html = fp.read() fp.close() soup = MinimalSoup( html ) link_tags = soup.findAll( 'a' ) contents_pages = set() daily_pages = set() contents_hash = {} for t in link_tags: if t.has_key('href'): m = re.search('(^|/)(bb-[0-9]+/.*)$',t['href']) if m: page = m.group(2) subdir, leaf = page.split("/")