def get_vidlink_dailymotion(url):
    """Return the URL of the best-quality MP4 stream for a Dailymotion page.

    Scrapes the embedded ``var config = {...};`` JSON blob out of the page's
    ninth <script> tag and walks ``metadata.qualities``, keeping the highest
    numeric quality found.  Falls back to the 'auto' stream otherwise.
    """
    # BUG FIX: the original did `re.compile("//").match(url, 0) > -1`, which
    # treats re.match like str.find.  re.match returns a Match object or None,
    # never an int; the comparison only "worked" through Python 2's arbitrary
    # mixed-type ordering.  A plain startswith check states the intent.
    if url.startswith('//'):
        # Protocol-relative URL: add an explicit scheme.
        url = "http:" + url
    html = getHTML(url)
    soup = BeautifulSoup(html)
    scripts = soup.findAll('script')
    # The player config lives in the 9th <script> tag as a JSON object literal.
    matchconfig = re.compile('var config = (\{.+?\})\;').findall(
        scripts[8].contents[0])
    parsed_json = json.loads(matchconfig[0])
    # Default to the adaptive ('auto') stream, then look for better qualities.
    fileurl = parsed_json['metadata']['qualities']['auto'][0]['url']
    lastquality = 0
    for q in parsed_json['metadata']['qualities']:
        if q == 'auto':
            continue
        if int(lastquality) > int(q):
            continue
        # Prefer the second variant of each quality, fall back to the first.
        try:
            fileurl = parsed_json['metadata']['qualities'][q][1]['url']
        except (KeyError, IndexError, TypeError):
            try:
                fileurl = parsed_json['metadata']['qualities'][q][0]['url']
            except (KeyError, IndexError, TypeError):
                continue
        lastquality = int(q)
    return fileurl
def listPage(url):
    # List every video post on one index page, then queue the "next page"
    # pagination entry.
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    for videobox in soup.findAll('div', 'videobox'):
        thumb = videobox.find('img', 'thumbnail')['src']
        try:
            title = videobox.find('a', 'title').contents
            title = title[0].encode("utf-8")
        except:
            title = "No title"
        # Derive the MP4 URL from the thumbnail path:
        # ".../jpg-s/<id>_N.jpg" -> ".../mp4/<id>.mp4?start=0"
        RE_ID = 'jpg-s/(\d*)_\d.jpg'
        RE_ID_obj = re.compile(RE_ID, re.IGNORECASE)
        url = RE_ID_obj.sub(r"mp4/\g<1>.mp4?start=0", thumb)
        listitem=xbmcgui.ListItem(title, iconImage="DefaultFolder.png", thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={ "Title": title })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    # Pagination: pick the nav link whose text contains 'sta'
    # (presumably the localized "next" label — TODO confirm).
    nav_page = soup.find('div', 'nav_page')
    for next in nav_page.findAll('a'):
        line = next.contents
        line = line[0].encode("utf-8")
        if 'sta' in line:
            url = next['href']
            url = BASE_SITE_URL + url
            addPosts(__language__(30000), urllib.quote_plus(url))
    return
def getVideoDownloadLink(self, url):
    """Return the video title and download link"""
    title = None
    link = None
    downloadPage = ''
    html = getHTML(url)
    soup = BeautifulSoup(html)
    # Look for the "bouton-telecharger" class (new version)
    telecharger = soup.find('a', attrs = {'class':'bouton-telecharger'})
    if telecharger:
        downloadPage = telecharger['href']
    else:
        # Look for the "bouton-telecharger" image (old version)
        img = soup.find('img', attrs = {'src':'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'})
        if img:
            downloadPage = img.findParent()['href']
    if downloadPage.endswith('.avi'):
        # The download page hides the real URL behind a "cliquer ici" anchor.
        print downloadPage
        title = downloadPage.split('/')[-1]
        print title
        html = getHTML(downloadPage)
        soup = BeautifulSoup(html)
        click = soup.find(text=re.compile('cliquer ici'))
        if click:
            link = click.findParent()['href']
            print link
        else:
            print "No \"cliquer ici\" found"
    else:
        print "bouton-telecharger not found"
    # Either value may still be None if the page layout was not recognized.
    return {'Title':title, 'url':link}
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = self.fetcher.getHTML(url) url = "" soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id':'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id':'tagline'}).string #get url #detectors for link to video in order of preference linkDetectors = [ lambda l: re.compile('High-res video \(MP4\)').match(str(l.string)), lambda l: re.compile('http://download.ted.com/talks/.+.mp4').match(str(l['href'])), ] for link in soup.findAll('a', href=True): for detector in linkDetectors: if detector(link): url = link['href'] linkDetectors = linkDetectors[:linkDetectors.index(detector)] # Only look for better matches than what we have break if url == "": # look for utub link utublinks = re.compile('http://(?:www.)?youtube.com/v/([^\&]*)\&').findall(html) for link in utublinks: url = 'plugin://plugin.video.youtube/?action=play_video&videoid=%s' %(link) #get id from url id = url.split('/')[-1] return {'Title':title, 'Director':speaker, 'Genre':'TED', 'Plot':plot, 'PlotOutline':plot, 'id':id, 'url':url}
def addSeasonList( self ):
    # Build the season directory for a show.  When the 'flat_season' setting
    # asks for it (or there is only one season), episodes are listed directly.
    tree=MinimalSoup(common.getHTML(common.args.url))
    seasons=tree.findAll('td', attrs={"class":re.compile('^vex')})
    #flatten seasons by settings
    if common.settings['flat_season'] == 1 or (len(seasons) == 1 and common.settings['flat_season'] == 0):
        common.args.mode='TV_Episodes'
        seasonNums=[]
        for season in seasons:
            common.args.name = season.contents[0]
            seasonNums.append(season.contents[0])
            self.addEpisodeList( )
        #add clips folder
        rss=tree.findAll('a', attrs={'class':'rss-link'})
        clipRSS = None
        for feed in rss:
            if feed['href'].split('/')[-1]=='clips':
                clipRSS = feed['href']
        if clipRSS != None:
            common.addDirectory(xbmc.getLocalizedString(30095), clipRSS, "TV_Clips")
        xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))
    else:
        #add one folder for each season
        for season in seasons:
            name=season.contents[0]
            # Season URL is embedded in the cell's onclick handler.
            p=re.compile('"(http://.+?)"')
            url=p.findall(season['onclick'])
            url=url[0].replace('&','&')
            # NOTE(review): the directory is added with common.args.url, not
            # the onclick-extracted url above — confirm this is intended.
            ok=common.addDirectory(name, common.args.url, "TV_Episodes")
        #add clips folder
        rss=tree.findAll('a', attrs={'class':'rss-link'})
        for feed in rss:
            if feed['href'].split('/')[-1]=='clips':
                clipRSS = feed['href']
                common.addDirectory(xbmc.getLocalizedString(30095), clipRSS, "TV_Clips")
        xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))
def listPage(url):
    # List videos on a pinkbike index page, plus a "Next page..." entry when
    # we are not yet on the last page.
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    currentPage = soup.find('li', 'current-page').a['href']
    nextPage = soup.find('li', 'next-page').a['href']
    # The last page link is the <li> immediately before "next-page".
    maxPage = soup.find('li', 'next-page').findPrevious('li').a['href']
    for inItem in soup.findAll('div', 'inItem'):
        try:
            title = inItem.findAll('a')[1].contents[0].replace('&','&')
        except:
            title = "No title"
        link = inItem.find('a')['href']
        re_pinkbike = 'video/(\d+)/'
        id = re.findall(re_pinkbike, link)[0]
        id = int(id)
        # Videos are sharded into directories by id/10000.
        partId = int(math.fabs(id/10000))
        url = 'http://lv1.pinkbike.org/vf/' + str(partId) + '/pbvid-' + str(id) + '.mp4'
        thumb = inItem.find('img', 'thimg')['src']
        time = inItem.find('span', 'fblack').contents[0]
        plot = inItem.find('p', 'uFullInfo f10 fgrey3').contents[0].strip()
        listitem=xbmcgui.ListItem(title, iconImage="DefaultFolder.png", thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={ "Title": title, "Plot" : plot, "Duration" : time })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    if currentPage != maxPage:
        item=xbmcgui.ListItem('Next page...', iconImage="DefaultFolder.png")
        xurl = sys.argv[0] + '?' + "next=true" + "&url=" + urllib.quote_plus(nextPage.replace('&','&'))
        item.setInfo(type="Video", infoLabels={ "Title": ""})
        item.setPath(xurl)
        folder = True
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=xurl, listitem=item, isFolder=folder)
    return
def firstPage(): html = getHTML(urllib.unquote_plus(BASE_URL)) # https://bugs.launchpad.net/beautifulsoup/+bug/838022 BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table'] soup = BeautifulSoup(html) # Favorites for links in soup.findAll('a','iconlink'): try: title = links.contents[0] except: title = "No title" try: link = links['href'] except: link = None if link and title and not "img" in str(title): addPosts(('Most faved ' + str(title)), urllib.quote_plus(link.replace('&','&'))) # Topics for table in soup.findAll('table'): for line in table.findAll('tr'): try: title = line.find('a').contents[0] except: title = None try: link = line.find('a')['href'] except: link = None if title and link: if BASE_URL in link: addPosts(str(title), urllib.quote_plus(link.replace('&','&'))) # Search addPosts('Search..', '&search=True') return
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = getHTML(url) soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id': 'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id': 'tagline'}).string #get url for link in soup.findAll('a'): if re.match('Watch.*high-res', str(link.string)): url = URLTED + link['href'] #get id from url id = url.split('/')[-1] return { 'Title': title, 'Director': speaker, 'Genre': 'TED', 'Plot': plot, 'PlotOutline': plot, 'id': id, 'url': url }
def http_response(self, request, response):
    """Mechanize response hook: prettify HTML bodies before returning them."""
    # Make sure the body can be re-read after we consume it here.
    if not hasattr(response, "seek"):
        response = mechanize.response_seek_wrapper(response)
    headers = response.info().dict
    # Only run the body through the soup when it is actually HTML.
    if headers.has_key('content-type'):
        if 'html' in headers['content-type']:
            cleaned = MinimalSoup(response.get_data())
            response.set_data(cleaned.prettify())
    return response
def addShowsList( self ):
    # Build the Hulu shows directory.  Optionally fetches plot/genre for each
    # show, and recurses once to cover both clips and full episodes.
    xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_LABEL)
    xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_GENRE)
    html=common.getHTML(common.args.url)
    tree=MinimalSoup(html)
    shows=tree.findAll('a', attrs={"class":"show-thumb info_hover"})
    del html
    del tree
    # with clips
    for show in shows:
        name = show.contents[0].replace('"','"').replace('&','&')
        url = show['href']
        # Third path segment is the show's slug, used to build asset URLs.
        tmp = show['href'].split('/')[3]
        art = "http://assets.hulu.com/shows/key_art_"+tmp.replace('-','_')+".jpg"
        #thumb = "http://assets.hulu.com/shows/show_thumbnail_"+tmp.replace('-','_')+".jpg"
        #icon = "http://assets.hulu.com/shows/show_thumbnail_"+tmp.replace('-','_')+".jpg"
        #Use higher res fanart (key_art) instead of lower res thumbs & icons
        thumb = art
        icon = art
        if common.settings['get_show_plot'] == True:
            json = common.getHTML("http://www.hulu.com/shows/info/"+tmp)
            try:
                #this needs better regex, or maybe some sort of json parser
                p = re.compile('description: "(.+?)"[,}]')
                match = p.findall(json)
                plot = match[0].replace('\\','')
            except:
                plot=xbmc.getLocalizedString(30090)
            try:
                p = re.compile('channel: "(.+?)"[,}]')
                match = p.findall(json)
                genre = match[0]
            except:
                genre=xbmc.getLocalizedString(30090)
            #hopefully deleting this will help with xbox memory problems
            del json
        else:
            plot=genre=xbmc.getLocalizedString(30090)
        try:
            # Shows without the full-episode marker are tagged as clips-only.
            if show.parent['class'] != "full-episode-icon":
                name += ' '+xbmc.getLocalizedString(30091)
                genre += ' '+xbmc.getLocalizedString(30091)
            elif common.args.url != common.BASE_TV_URL:
                common.addDirectory(name, url, "TV_Seasons", art, icon, art, plot, genre)
        except:
            name += ' '+xbmc.getLocalizedString(30091)
            genre += ' '+xbmc.getLocalizedString(30091)
            if common.settings['only_full_episodes'] == False:
                common.addDirectory(name, url, "TV_Seasons", art, icon, art, plot, genre)
    #if we're doing both clips & full episodes, we need to run through the function again.
    if common.args.url == common.BASE_TV_URL :
        common.args.url = common.BASE_FULLTV_URL
        self.addShowsList()
    xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))
def getallpages(url, useragent, referer):
    """Return the given page URL plus every pagination link in its <center> block."""
    page_url = urllib.unquote_plus(str(url)).replace(' ', '%20')
    markup = getHTML(page_url, useragent, referer)
    document = BeautifulSoup(markup)
    pager = document.find('center')
    # Start with the page itself, then append each linked part in order.
    collected = [url]
    for anchor in pager.findAll('a'):
        collected.append(anchor['href'])
    return collected
def firstPage(url): html = getHTML(urllib.unquote_plus(url)) # https://bugs.launchpad.net/beautifulsoup/+bug/838022 BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table'] soup = BeautifulSoup(html) thumbs = soup.findAll('div', 'thumb') lcount = 0 # Items for links in soup.findAll('h2', 'post-title entry-title'): script = thumbs[lcount].find('script') try: thumbnail_container = script.contents[0] except: thumbnail = "DefaultFolder.png" try: tmatch = re.compile( 'document.write\(bp_thumbnail_resize\(\"(.+?)\",').findall( thumbnail_container) except: thumbnail = "DefaultFolder.png" try: thumbnail = tmatch[0] except: thumbnail = "DefaultFolder.png" lcount = lcount + 1 for line in links.findAll('a'): try: title = links.find('a').contents[0].strip() except: title = "No title" try: link = links.find('a')['href'] except: link = None if title and link: if BASE_URL in link: addPosts(str(title), urllib.quote_plus(link.replace('&', '&')), thumbnail, 0) olderlinks = soup.find('a', 'blog-pager-older-link') try: title = olderlinks.contents[0] except: title = "Mga Lumang mga Post" try: link = olderlinks.attrs[1][1] except: link = None if title and link: addPosts(str(title), urllib.quote_plus(link.replace('&', '&')), "DefaultFolder.png", 1) return
def http_response(self, request, response):
    """Pretty-print HTML responses in place before mechanize hands them on."""
    if not hasattr(response, "seek"):
        # Wrap so the data can be re-read after being consumed here.
        response = mechanize.response_seek_wrapper(response)
    info = response.info().dict
    is_html = info.has_key('content-type') and ('html' in info['content-type'])
    if is_html:
        response.set_data(MinimalSoup(response.get_data()).prettify())
    return response
def getPrograms(self):
    """Yield every program in self.html as {'url', 'Title', 'Thumb'} dicts."""
    # Couldn't parse properly the file using "'div', {'class':'bloc-contenu-8'}"
    # BeautifulSoup returns nothing in that class
    # So use 'contenu-descr-8 ' and find previous tag
    soup = BeautifulSoup(cleanHTML(self.html))
    for media in soup.findAll('div', {'class':'contenu-descr-8 '}):
        aTag = media.findPrevious('a')
        # Get link, title and thumb
        mediaLink = URLASI + aTag['href']
        mediaTitle = aTag['title'].encode('utf-8')
        # BUG FIX: '[png|jpg]' was a character class matching a single letter
        # from {p,n,g,|,j}, not an alternation.  Use a non-capturing group to
        # actually require a .png or .jpg extension.
        mediaThumb = URLASI + aTag.find(
            'img', attrs={'src': re.compile('.+?\.(?:png|jpg)')})['src']
        yield {'url':mediaLink, 'Title':mediaTitle, 'Thumb':mediaThumb}
def isLoggedIn(self, username):
    """Return True if @username is already logged in, False otherwise"""
    html = getHTML(URLMONCOMPTE)
    soup = BeautifulSoup(html)
    # The account page title tells us whether any session is active.
    if soup.title.string == u'Arrêt sur images – Mon compte':
        # Already logged in, check that the username is still the same
        userText = soup.find(text=re.compile(u'L’e-mail que vous utilisez pour @si est.*'))
        if userText and userText.next.string == username:
            return True
        else:
            print "Already logged in, but username does not match..."
    return False
def isLoggedIn(self, username):
    """Return True if @username is already logged in, False otherwise"""
    html = getHTML(URLMONCOMPTE)
    soup = BeautifulSoup(html)
    # The account page title tells us whether any session is active.
    if soup.title.string == u'Arrêt sur images – Mon compte':
        # Already logged in, check that the username is still the same
        userText = soup.find(
            text=re.compile(u'L’e-mail que vous utilisez pour @si est.*'))
        if userText and userText.next.string == username:
            return True
        else:
            print "Already logged in, but username does not match..."
    return False
def getTalks(self):
    """Yield {'url', 'Title', 'Thumb'} for each talk listed in self.html."""
    # themes loaded with a json call. Why are they not more consistant?
    from simplejson import loads
    # search HTML for the link to tedtalk's "api". It is easier to use regex here than BS.
    jsonUrl = URLTED+re.findall('DataSource\("(.+?)"', self.html)[0]
    # make a dict from the json formatted string from above url
    talksMarkup = loads(getHTML(jsonUrl))
    # parse through said dict for all the metadata; each result carries an
    # HTML fragment in 'markup' that we soup individually.
    for markup in talksMarkup['resultSet']['result']:
        talk = BeautifulSoup(markup['markup'])
        link = URLTED+talk.dt.a['href']
        title = cleanHTML(talk.dt.a['title'])
        pic = resizeImage(talk.find('img', attrs = {'src':re.compile('.+?\.jpg')})['src'])
        yield {'url':link, 'Title':title, 'Thumb':pic}
def getTalks(self):
    """Yield {'url', 'Title', 'Thumb'} for each talk listed in self.html."""
    # themes loaded with a json call. Why are they not more consistant?
    from simplejson import loads
    # search HTML for the link to tedtalk's "api". It is easier to use regex here than BS.
    jsonUrl = URLTED+re.findall('DataSource\("(.+?)"', self.html)[0]
    # make a dict from the json formatted string from above url
    talksMarkup = loads(getHTML(jsonUrl))
    # parse through said dict for all the metadata; each result carries an
    # HTML fragment in 'markup' that we soup individually.
    for markup in talksMarkup['resultSet']['result']:
        talk = BeautifulSoup(markup['markup'])
        link = URLTED+talk.dt.a['href']
        title = cleanHTML(talk.dt.a['title'])
        pic = resizeImage(talk.find('img', attrs = {'src':re.compile('.+?\.jpg')})['src'])
        yield {'url':link, 'Title':title, 'Thumb':pic}
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = self.getHTML(url) url = "" soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id': 'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id': 'tagline'}).string #get url #detectors for link to video in order of preference linkDetectors = [ lambda l: re.compile('High-res video \(MP4\)').match(str(l.string) ), lambda l: re.compile('http://download.ted.com/talks/.+.mp4').match( str(l['href'])), ] for link in soup.findAll('a', href=True): for detector in linkDetectors: if detector(link): url = link['href'] linkDetectors = linkDetectors[:linkDetectors.index( detector )] # Only look for better matches than what we have break if url == "": # look for utub link utublinks = re.compile( 'http://(?:www.)?youtube.com/v/([^\&]*)\&').findall(html) for link in utublinks: url = 'plugin://plugin.video.youtube/?action=play_video&videoid=%s' % ( link) #get id from url id = url.split('/')[-1] return { 'Title': title, 'Director': speaker, 'Genre': 'TED', 'Plot': plot, 'PlotOutline': plot, 'id': id, 'url': url }
def listPage_teleserye(url,useragent,referer):
    # Resolve every part of a multi-page episode into a direct video link,
    # then list a "Play All Parts" item plus one item per part.
    # Returns the list of resolved links.
    #notify("yes")
    allurls = getallpages(url,useragent,referer)
    links = []
    for p in allurls:
        html = getHTML(urllib.unquote_plus(p),useragent,'')
        soup = BeautifulSoup(html)
        # Items
        thumbnail_meta = soup.find('meta', attrs={'property': 'og:image'})
        try:
            thumbnail = thumbnail_meta['content']
        except:
            thumbnail = "DefaultFolder.png"
        title_tag = soup.find('title')
        try:
            title = title_tag.contents[0]
        except:
            title = "no title"
        # Each page embeds its player in the first <iframe>.
        conurl = soup.find('iframe')['src']
        #turl = get_vidlink_disklinksharetvplay(conurl,useragent,referer)
        turl = get_vidlink(conurl,useragent,referer)
        links.append(str(turl))
    # NOTE(review): title/thumbnail below carry the values from the LAST page
    # of the loop above.
    if (len(links) > 0):
        durl = build_url({'url': links, 'mode': 'playAllVideos', 'foldername': title, 'thumbnail': thumbnail, 'title': title})
        itemname = 'Play All Parts'
        li = xbmcgui.ListItem(itemname, iconImage=thumbnail)
        li.setInfo(type="Video",infoLabels={"Title": title, "Plot" : "All parts of" + title})
        li.setProperty('fanart_image', thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=durl, listitem=li)
    hcnt = 0
    for l in links:
        partcnt = hcnt + 1
        ititle = "Part " + str(partcnt)
        url = links[hcnt]
        thumb = thumbnail
        plot = ititle + ' of ' + title
        listitem=xbmcgui.ListItem(ititle, iconImage=thumb, thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={ "Title": title, "Plot" : plot })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        listitem.setProperty("fanart_image", thumb)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
        hcnt = hcnt + 1
    return(links)
def getPrograms(self):
    """Yield every program in self.html as {'url', 'Title', 'Thumb'} dicts."""
    # Couldn't parse properly the file using "'div', {'class':'bloc-contenu-8'}"
    # BeautifulSoup returns nothing in that class
    # So use 'contenu-descr-8 ' and find previous tag
    soup = BeautifulSoup(cleanHTML(self.html))
    for media in soup.findAll('div', {'class': 'contenu-descr-8 '}):
        aTag = media.findPrevious('a')
        # Get link, title and thumb
        mediaLink = URLASI + aTag['href']
        mediaTitle = aTag['title'].encode('utf-8')
        # BUG FIX: '[png|jpg]' was a character class matching a single letter
        # from {p,n,g,|,j}, not an alternation.  Use a non-capturing group to
        # actually require a .png or .jpg extension.
        mediaThumb = URLASI + aTag.find(
            'img', attrs={'src': re.compile('.+?\.(?:png|jpg)')})['src']
        yield {
            'url': mediaLink,
            'Title': mediaTitle,
            'Thumb': mediaThumb
        }
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = getHTML(url) soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id':'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id':'tagline'}).string #get url for link in soup.findAll('a'): if re.match('Watch.*high-res' , str(link.string)): url = URLTED+link['href'] #get id from url id = url.split('/')[-1] return {'Title':title, 'Director':speaker, 'Genre':'TED', 'Plot':plot, 'PlotOutline':plot, 'id':id, 'url':url}
def addEpisodeList( self ):
    # List the episodes of the currently-selected season by reading the
    # show's "episodes" RSS feed.
    #initialize variables
    p=re.compile('(\d+)')#gets last number from "season ##"
    currentSeason=p.findall(common.args.name)[0]
    epRSS=None
    #parse html tree
    tree=MinimalSoup(common.getHTML(common.args.url))
    rss=tree.findAll('a', attrs={'class':'rss-link'})
    for feed in rss:
        if feed['href'].split('/')[-1]=='episodes':
            tree=BeautifulStoneSoup(common.getHTML(feed['href']))
            items=tree.findAll('item')
            for episode in items:
                # Item titles look like "Name (s##|...|e##)".
                p=re.compile('\(s([0-9]*).+?\|.+?e([0-9]*)\)')
                match=p.findall(episode.title.contents[0])[0]
                seasonNum = match[0]
                episodeNum = match[1]
                if seasonNum == currentSeason:
                    #add this episode to list
                    name = episode.title.contents[0].split('(')[0]
                    # Zero-pad to build a sortable s##e## prefix.
                    if len(seasonNum)<2:seasonNum='0'+seasonNum
                    if len(episodeNum)<2:episodeNum='0'+episodeNum
                    name = 's'+seasonNum+'e'+episodeNum+' '+name
                    url = episode.link.contents[0].split('#')[0]
                    try:
                        thumb = episode.findAll('media:thumbnail')[0]['url']
                    except:
                        thumb = ''
                    try:
                        airdate = episode.pubdate.contents[0]
                    except:
                        airdate = ''
                    try:
                        # Plot is the first <p> of the description; duration
                        # ("MM:SS") is converted to seconds, defaulting to 1.
                        p=re.compile('<p>(.+?)</p>.+?Added: ')
                        plot =''.join(p.findall(str(episode.findAll('description'))))
                        try:
                            p=re.compile('Duration: (.+?)\n')
                            duration=p.findall(plot)[0].split(':')
                            duration=(int(duration[0])*60)+int(duration[1])
                        except:
                            duration=1
                    except:
                        plot = ''
                    common.addDirectory(name,url,'TV_play', thumb, thumb, common.args.fanart, plot, 'genre')
def get_vidlink_disklinksharetvplay(url,useragent,referer):
    # Resolve a share-player page to a playable local .m3u8: fetch the page's
    # <source> playlist, rewrite relative segment URIs to absolute ones, and
    # save the result to a temp file whose path is returned.
    #notify(referer)
    html = getHTML(urllib.unquote_plus(url),useragent,referer)
    soup = BeautifulSoup(str(html))
    #randomname = random.randint(1,100000001)
    #xfile = 'special://temp/' + str(randomname) + '.txt'
    #x = xbmcvfs.File(xfile, 'w')
    #wres = x.write(str(soup))
    #x.close()
    vidlink = soup.find('source')['src']
    filehtml = getHTML(vidlink,useragent,referer)
    # Base URL of the playlist: strip the final path component.
    xserver = re.sub('(/)[a-zA-Z0-9\.\-]+$', r'\1', vidlink, flags=re.DOTALL)
    # Prefix every non-comment line (segments don't start with '#') with it.
    newm3u8 = re.sub(r'\n([^#])', '\n' + xserver + r'/\1', filehtml, flags=re.DOTALL)
    randomname = random.randint(1,100000001)
    tfile = 'special://temp/' + str(randomname) + '.m3u8'
    f = xbmcvfs.File(tfile, 'w')
    wres = f.write(str(newm3u8))
    f.close()
    #return(newm3u8)
    return(tfile)
def firstPage(url): html = getHTML(urllib.unquote_plus(url)) # https://bugs.launchpad.net/beautifulsoup/+bug/838022 BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table'] soup = BeautifulSoup(html) for article in soup.findAll('article', 'latestPost excerpt layout-1'): h2 = article.find('h2', 'title front-view-title') try: title = h2.find('a')['title'] except: title = "No title" try: link = h2.find('a')['href'] except: link = None try: div = article.find('div', 'featured-thumbnail') try: thumbnail = div.find('img')['data-layzr'] except: thumbnail = "DefaultFolder.png" except: div = None thumbnail = "DefaultFolder.png" if title and link: if BASE_URL in link: addPosts(title, link, thumbnail, 0) # Mga lumang mga post olderlinks = soup.find('a', 'next page-numbers') title = "Next Page" try: link = olderlinks.attrs[1][1] except: link = None if title and link: addPosts(str(title), urllib.quote_plus(link.replace('&', '&')), "DefaultFolder.png", 1) return
def getfirstPage_teleserye(url,useragent,referer):
    # Scrape the teleserye front page and return two dicts keyed by article
    # id (first URL path segment): {id: title} and {id: link}.
    tlinks = {}
    llinks = {}
    html = getHTML(urllib.unquote_plus(str(url)).replace(' ','%20'),useragent,referer)
    #BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(str(html))
    for article in soup.findAll('div','cat-hadding'):
        try:
            title = article.find('a')['title']
        except:
            title = "No title"
        try:
            link = article.find('a')['href']
        except:
            link = None
        try:
            thumbnail = article.find('img')['data-layzr']
        except:
            thumbnail = "DefaultFolder.png"
        if title and link:
            # addPosts(title, link, thumbnail, 0)
            # Use the first path segment of the article URL as its id.
            match_url = re.compile('http://www.teleserye.su/([^/]+?)/.+?$').findall(link)
            articleid = match_url[0]
            #alinks[title] = link
            #ilinks[title] = articleid
            tlinks[articleid] = title
            llinks[articleid] = link
    # Pagination link is located but not currently used (see commented code).
    olderlinks = soup.find('a', 'blog-pager-older-link')
    try:
        title = olderlinks.contents[0]
    except:
        title = "Older Posts"
    try:
        link = olderlinks.attrs[1][1]
    except:
        link = None
    #if title and link:
    #addPosts(str(title), urllib.quote_plus(link.replace('&','&')), "DefaultFolder.png", 1)
    return(tlinks,llinks)
def getProgramParts(self, url, name, icon):
    """Return all parts of a program (video id)
    video id allows to get video url with a json request"""
    html = getHTML(url)
    soup = BeautifulSoup(html)
    parts = []
    part = 1
    # Get all movie id
    for param in soup.findAll('param', attrs={'name': 'movie'}):
        try:
            videoId = param.parent["id"]
        except KeyError:
            continue
        title = name + ' - Acte %d' % part
        # Try to get the icon linked to the iPhone video on that page
        # That's faster than getting it from the json request (see getVideoDetails),
        # which would require one extra HTML request for each part
        try:
            media = param.parent.parent.find(text=re.compile(u'img src='))
            match = re.search(u'img src="(.*?)"', media)
            thumb = URLASI + match.group(1)
        except (TypeError, AttributeError):
            thumb = icon
        parts.append({'url': videoId, 'Title': title, 'Thumb': thumb})
        part += 1
    if u'ux sources' in soup.title.string and part == 3:
        # '@ux sources' is not cut in parts but getting the title is not
        # easy as it's not in a field linked to the video
        # Use a hack: since 20111110, "version intégrale" is first
        if re.search('Voici la version intégrale', html):
            parts[0]['Title'] = name + u' - intégrale'.encode('utf-8')
            parts[1]['Title'] = name + u' - aperçu'.encode('utf-8')
        else:
            # Before 20111104, the short video (version montée) was first
            parts[0]['Title'] = name + u' - montée'.encode('utf-8')
            parts[1]['Title'] = name + u' - intégrale'.encode('utf-8')
    return parts
def getProgramParts(self, url, name, icon):
    """Return all parts of a program (video id)
    video id allows to get video url with a json request"""
    html = getHTML(url)
    soup = BeautifulSoup(html)
    parts = []
    part = 1
    # Get all movie id
    for param in soup.findAll('param', attrs = {'name':'movie'}):
        try:
            videoId = param.parent["id"]
        except KeyError:
            continue
        title = name + ' - Acte %d' % part
        # Try to get the icon linked to the iPhone video on that page
        # That's faster than getting it from the json request (see getVideoDetails),
        # which would require one extra HTML request for each part
        try:
            media = param.parent.parent.find(text=re.compile(u'img src='))
            match = re.search(u'img src="(.*?)"', media)
            thumb = URLASI + match.group(1)
        except (TypeError, AttributeError):
            thumb = icon
        parts.append({'url':videoId, 'Title':title, 'Thumb':thumb})
        part += 1
    if u'ux sources' in soup.title.string and part == 3:
        # '@ux sources' is not cut in parts but getting the title is not
        # easy as it's not in a field linked to the video
        # Use a hack: since 20111110, "version intégrale" is first
        if re.search('Voici la version intégrale', html):
            parts[0]['Title'] = name + u' - intégrale'.encode('utf-8')
            parts[1]['Title'] = name + u' - aperçu'.encode('utf-8')
        else:
            # Before 20111104, the short video (version montée) was first
            parts[0]['Title'] = name + u' - montée'.encode('utf-8')
            parts[1]['Title'] = name + u' - intégrale'.encode('utf-8')
    return parts
def getVideoDownloadLink(self, url):
    """Return the video title and download link"""
    title = None
    link = None
    downloadPage = ''
    html = getHTML(url)
    soup = BeautifulSoup(html)
    # Look for the "bouton-telecharger" class (new version)
    telecharger = soup.find('a', attrs={'class': 'bouton-telecharger'})
    if telecharger:
        downloadPage = telecharger['href']
    else:
        # Look for the "bouton-telecharger" image (old version)
        img = soup.find(
            'img',
            attrs={
                'src': 'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'
            })
        if img:
            downloadPage = img.findParent()['href']
    if downloadPage.endswith('.avi'):
        # The download page hides the real URL behind a "cliquer ici" anchor.
        print downloadPage
        title = downloadPage.split('/')[-1]
        print title
        html = getHTML(downloadPage)
        soup = BeautifulSoup(html)
        click = soup.find(text=re.compile('cliquer ici'))
        if click:
            link = click.findParent()['href']
            print link
        else:
            print "No \"cliquer ici\" found"
    else:
        print "bouton-telecharger not found"
    # Either value may still be None if the page layout was not recognized.
    return {'Title': title, 'url': link}
from os.path import exists, getsize, dirname, join from urllib import urlretrieve, urlencode, quote from sys import argv import demjson import zlib folder = dirname(argv[0]) cache = Cache(debug=False) pages = [] for index in range(1,11): index = cache.get("http://www.escapistmagazine.com/videos/view/zero-punctuation?page=%d"%index, max_age=60*60*2).read() index = index.replace("''>","'>") index = BeautifulSoup(index) for link in index.findAll("a"): if not link.has_key("href"): continue if link["href"].find("http://www.escapistmagazine.com/videos/view/zero-punctuation/")!=-1: short_href = link["href"] slash = short_href.rfind("/") if short_href[slash:].find("-")!=-1: short_href = short_href[slash+1:slash+short_href[slash:].find("-")] else: short_href = short_href[slash+1:] assert len(short_href)>0, link["href"] if short_href not in pages:
ur = urllib.urlopen(index_page_url) fp = open(output_filename, 'w') fp.write(ur.read()) fp.close() ur.close() for year in range(1999, currentyear + 1): year_index_filename = output_directory + str(year) + ".html" if not os.path.exists(year_index_filename): raise Exception, "Missing the year index: '%s'" % year_index_filename fp = open(year_index_filename) html = fp.read() fp.close() soup = MinimalSoup(html) link_tags = soup.findAll('a') contents_pages = set() daily_pages = set() contents_hash = {} for t in link_tags: if t.has_key('href'): m = re.search('(^|/)(bb-[0-9]+/.*)$', t['href']) if m: page = m.group(2) subdir, leaf = page.split("/")
def get_vidlink_linksharetv(url, useragent, referer):
    """Return the direct video URL from a linkshare player page.

    Fetches the page and reads the src attribute of its first <source> tag.
    """
    decoded = urllib.unquote_plus(url)
    page = getHTML(decoded, useragent, referer)
    document = BeautifulSoup(str(page))
    source_tag = document.find('source')
    return source_tag['src']
def listPage(url):
    # List all parts of a multi-part video page (the 'tabs-1' section pairs
    # one <h3> heading with one <iframe> per part), adding a "Play All Parts"
    # entry when more than one part exists.
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    links = []
    # Items
    thumbnail_meta = soup.find('meta', attrs={'property': 'og:image'})
    try:
        thumbnail = thumbnail_meta['content']
    except:
        thumbnail = "DefaultFolder.png"
    title_tag = soup.find('title')
    try:
        title = title_tag.contents[0]
    except:
        title = "no title"
    tab1 = soup.find('div', attrs={'id': 'tabs-1'})
    headings = tab1.findAll('h3')
    iframes = tab1.findAll('iframe')
    hcnt = 0
    for heading in headings:
        # Resolve the iframe at the same index as this heading.
        lurl = tab1.findAll('iframe')[hcnt]['src']
        url = get_vidlink(lurl)
        links.append(str(url))
        hcnt = hcnt + 1
    if (len(links) > 1):
        durl = build_url({
            'url': links,
            'mode': 'playAllVideos',
            'foldername': title,
            'thumbnail': thumbnail,
            'title': title
        })
        itemname = 'Play All Parts'
        li = xbmcgui.ListItem(itemname, iconImage=thumbnail)
        li.setInfo(type="Video", infoLabels={
            "Title": title,
            "Plot": "All parts of" + title
        })
        li.setProperty('fanart_image', thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=durl, listitem=li)
    hcnt = 0
    for heading in headings:
        ititle = heading.contents[0]
        url = links[hcnt]
        thumb = thumbnail
        plot = ititle + ' of ' + title
        listitem = xbmcgui.ListItem(ititle, iconImage=thumb, thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={
            "Title": title,
            "Plot": plot
        })
        listitem.setProperty("IsPlayable", "true")
        listitem.setProperty("fanart_image", thumb)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
        hcnt = hcnt + 1
    return
else: filename_year = int('20' + two_digit_year, 10) try: date_from_filename = datetime.date(filename_year, filename_month, filename_day) except ValueError: date_from_filename = None if verbose: print "Date in filename %s-%s-%s" % (filename_year, filename_month, filename_day) # Don't soup it if we don't have to: if date_from_filename and date_from_filename < all_after_date: continue day_soup = MinimalSoup(day_html) day_body = day_soup.find('body') if day_body: page_as_text = non_tag_data_in(day_body) else: error = "File couldn't be parsed by MinimalSoup: " + day_filename raise Exception, error # Now guess the date from the file contents as well: m = re.search( '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?', page_as_text) if not m: m = re.search( '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',
filename_day = int(two_digit_day,10) if two_digit_year == '99': filename_year = 1999 else: filename_year = int('20'+two_digit_year,10) try: date_from_filename = datetime.date(filename_year,filename_month,filename_day) except ValueError: date_from_filename = None if verbose: print "Date in filename %s-%s-%s" % ( filename_year, filename_month, filename_day ) # Don't soup it if we don't have to: if date_from_filename and date_from_filename < all_after_date: continue day_soup = MinimalSoup(day_html) day_body = day_soup.find('body') if day_body: page_as_text = non_tag_data_in(day_body) else: error = "File couldn't be parsed by MinimalSoup: "+day_filename raise Exception, error # Now guess the date from the file contents as well: m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text) if not m: m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text) if m: day_of_week = m.group(2) day = m.group(3)
ur = urllib.urlopen(index_page_url) fp = open(output_filename, 'w') fp.write(ur.read()) fp.close() ur.close() for year in range(1999,currentyear+1): year_index_filename = output_directory + str(year) + ".html" if not os.path.exists(year_index_filename): raise Exception, "Missing the year index: '%s'" % year_index_filename fp = open(year_index_filename) html = fp.read() fp.close() soup = MinimalSoup( html ) link_tags = soup.findAll( 'a' ) contents_pages = set() daily_pages = set() contents_hash = {} for t in link_tags: if t.has_key('href'): m = re.search('(^|/)(bb-[0-9]+/.*)$',t['href']) if m: page = m.group(2) subdir, leaf = page.split("/")