def getVideoDownloadLink(self, url):
    """Return the video title and download link"""
    # Result dict: {'Title': <file name or None>, 'url': <direct link or None>}.
    title = None
    link = None
    downloadPage = ''
    html = getHTML(url)
    soup = BeautifulSoup(html)
    # Look for the "bouton-telecharger" class (new version)
    telecharger = soup.find('a', attrs = {'class':'bouton-telecharger'})
    if telecharger:
        downloadPage = telecharger['href']
    else:
        # Look for the "bouton-telecharger" image (old version)
        img = soup.find('img', attrs = {'src':'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'})
        if img:
            # assumes the button image is wrapped in the download <a> — the parent carries the href
            downloadPage = img.findParent()['href']
    if downloadPage.endswith('.avi'):
        print downloadPage
        # Title is the file-name component of the download URL
        title = downloadPage.split('/')[-1]
        print title
        # The .avi link leads to an interstitial page; the real file sits
        # behind a "cliquer ici" anchor on that page.
        html = getHTML(downloadPage)
        soup = BeautifulSoup(html)
        click = soup.find(text=re.compile('cliquer ici'))
        if click:
            link = click.findParent()['href']
            print link
        else:
            print "No \"cliquer ici\" found"
    else:
        print "bouton-telecharger not found"
    return {'Title':title, 'url':link}
def listPage(url):
    """List one page of Pinkbike videos and append a 'Next page...' entry.

    The direct .mp4 URL is derived from the numeric video id embedded in the
    item link; files are sharded into directories of 10000 ids each on
    lv1.pinkbike.org.
    """
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    currentPage = soup.find('li', 'current-page').a['href']
    nextPage = soup.find('li', 'next-page').a['href']
    # The highest page number sits just before the 'next-page' element
    maxPage = soup.find('li', 'next-page').findPrevious('li').a['href']
    # Hoisted out of the loop: same pattern for every item
    re_pinkbike = re.compile('video/(\d+)/')
    for inItem in soup.findAll('div', 'inItem'):
        try:
            # Undo '&amp;' escaping in the title (original replace was a no-op)
            title = inItem.findAll('a')[1].contents[0].replace('&amp;', '&')
        except (IndexError, AttributeError):
            title = "No title"
        link = inItem.find('a')['href']
        vid = int(re_pinkbike.findall(link)[0])
        # Videos are bucketed 10000 per directory (plain floor division;
        # ids are always positive so math.fabs was redundant)
        partId = vid // 10000
        url = 'http://lv1.pinkbike.org/vf/' + str(partId) + '/pbvid-' + str(vid) + '.mp4'
        thumb = inItem.find('img', 'thimg')['src']
        time = inItem.find('span', 'fblack').contents[0]
        plot = inItem.find('p', 'uFullInfo f10 fgrey3').contents[0].strip()
        listitem = xbmcgui.ListItem(title, iconImage="DefaultFolder.png", thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={"Title": title, "Plot": plot, "Duration": time})
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    # Only offer pagination while we are not on the last page
    if currentPage != maxPage:
        item = xbmcgui.ListItem('Next page...', iconImage="DefaultFolder.png")
        xurl = sys.argv[0] + '?' + "next=true" + "&url=" + urllib.quote_plus(nextPage.replace('&amp;', '&'))
        item.setInfo(type="Video", infoLabels={"Title": ""})
        item.setPath(xurl)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=xurl, listitem=item, isFolder=True)
    return
def getVideoDetails(self, url):
    """self.videoDetails={Title, Director, Genre, Plot, id, url}"""
    # TODO: get 'related tags' and list them under genre
    html = self.fetcher.getHTML(url)
    url = ""
    soup = BeautifulSoup(html)
    # Title / speaker / plot come straight from the talk page markup;
    # the speaker is the part of the title before the first colon.
    title = soup.find('span', attrs={'id': 'altHeadline'}).string
    speaker = title.split(':', 1)[0]
    plot = soup.find('p', attrs={'id': 'tagline'}).string
    # Detectors for the video link, strongest preference first.
    detectors = [
        lambda l: re.compile('High-res video \(MP4\)').match(str(l.string)),
        lambda l: re.compile('http://download.ted.com/talks/.+.mp4').match(str(l['href'])),
    ]
    for anchor in soup.findAll('a', href=True):
        for rank, probe in enumerate(detectors):
            if probe(anchor):
                url = anchor['href']
                # From now on only accept strictly better-ranked matches.
                detectors = detectors[:rank]
                break
    if url == "":
        # Fall back to an embedded youtube player if no download link exists.
        for vid in re.compile('http://(?:www.)?youtube.com/v/([^\&]*)\&').findall(html):
            url = 'plugin://plugin.video.youtube/?action=play_video&videoid=%s' % (vid)
    # The id is the trailing path component of whatever URL we ended up with.
    id = url.split('/')[-1]
    return {'Title': title, 'Director': speaker, 'Genre': 'TED', 'Plot': plot,
            'PlotOutline': plot, 'id': id, 'url': url}
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = getHTML(url) soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id': 'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id': 'tagline'}).string #get url for link in soup.findAll('a'): if re.match('Watch.*high-res', str(link.string)): url = URLTED + link['href'] #get id from url id = url.split('/')[-1] return { 'Title': title, 'Director': speaker, 'Genre': 'TED', 'Plot': plot, 'PlotOutline': plot, 'id': id, 'url': url }
def listPage(url):
    """List every video box on the page and queue the 'next page' link.

    The direct .mp4 URL is derived from the thumbnail URL, which embeds the
    video id (.../jpg-s/<id>_<n>.jpg  ->  .../mp4/<id>.mp4?start=0).
    """
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    # Hoisted out of the loop: same substitution for every item
    RE_ID_obj = re.compile('jpg-s/(\d*)_\d.jpg', re.IGNORECASE)
    for videobox in soup.findAll('div', 'videobox'):
        thumb = videobox.find('img', 'thumbnail')['src']
        try:
            title = videobox.find('a', 'title').contents[0].encode("utf-8")
        except (AttributeError, IndexError):
            title = "No title"
        # Rewrite the thumbnail URL into the playable video URL
        url = RE_ID_obj.sub(r"mp4/\g<1>.mp4?start=0", thumb)
        listitem = xbmcgui.ListItem(title, iconImage="DefaultFolder.png", thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={"Title": title})
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    # Pagination: follow the pager anchor whose label contains 'sta'
    # (renamed loop var: the original shadowed the builtin `next`)
    nav_page = soup.find('div', 'nav_page')
    for pager in nav_page.findAll('a'):
        label = pager.contents[0].encode("utf-8")
        if 'sta' in label:
            url = BASE_SITE_URL + pager['href']
            addPosts(__language__(30000), urllib.quote_plus(url))
    return
def getVideoDetails(self, url): """self.videoDetails={Title, Director, Genre, Plot, id, url}""" #TODO: get 'related tags' and list them under genre html = self.getHTML(url) url = "" soup = BeautifulSoup(html) #get title title = soup.find('span', attrs={'id': 'altHeadline'}).string #get speaker from title speaker = title.split(':', 1)[0] #get description: plot = soup.find('p', attrs={'id': 'tagline'}).string #get url #detectors for link to video in order of preference linkDetectors = [ lambda l: re.compile('High-res video \(MP4\)').match(str(l.string) ), lambda l: re.compile('http://download.ted.com/talks/.+.mp4').match( str(l['href'])), ] for link in soup.findAll('a', href=True): for detector in linkDetectors: if detector(link): url = link['href'] linkDetectors = linkDetectors[:linkDetectors.index( detector )] # Only look for better matches than what we have break if url == "": # look for utub link utublinks = re.compile( 'http://(?:www.)?youtube.com/v/([^\&]*)\&').findall(html) for link in utublinks: url = 'plugin://plugin.video.youtube/?action=play_video&videoid=%s' % ( link) #get id from url id = url.split('/')[-1] return { 'Title': title, 'Director': speaker, 'Genre': 'TED', 'Plot': plot, 'PlotOutline': plot, 'id': id, 'url': url }
def listPage_teleserye(url, useragent, referer):
    """Resolve every part of a teleserye episode and add directory items.

    Adds a 'Play All Parts' entry (when at least one part resolved) followed
    by one playable item per part. Returns the list of resolved part URLs.
    """
    allurls = getallpages(url, useragent, referer)
    links = []
    # Defaults so the item loops are safe even when no page yields metadata
    title = "no title"
    thumbnail = "DefaultFolder.png"
    for page in allurls:
        html = getHTML(urllib.unquote_plus(page), useragent, '')
        soup = BeautifulSoup(html)
        # og:image carries the show thumbnail; fall back to the default icon
        thumbnail_meta = soup.find('meta', attrs={'property': 'og:image'})
        try:
            thumbnail = thumbnail_meta['content']
        except (TypeError, KeyError):
            thumbnail = "DefaultFolder.png"
        title_tag = soup.find('title')
        try:
            title = title_tag.contents[0]
        except (AttributeError, IndexError):
            title = "no title"
        # The embedded player iframe points at the hoster page for this part
        conurl = soup.find('iframe')['src']
        turl = get_vidlink(conurl, useragent, referer)
        links.append(str(turl))
    if len(links) > 0:
        durl = build_url({'url': links, 'mode': 'playAllVideos',
                          'foldername': title, 'thumbnail': thumbnail,
                          'title': title})
        li = xbmcgui.ListItem('Play All Parts', iconImage=thumbnail)
        # Fixed: original plot read "All parts of<title>" with no space
        li.setInfo(type="Video", infoLabels={"Title": title,
                                             "Plot": "All parts of " + title})
        li.setProperty('fanart_image', thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=durl, listitem=li)
    # One playable entry per part (enumerate replaces the manual counter)
    for partidx, url in enumerate(links):
        ititle = "Part " + str(partidx + 1)
        plot = ititle + ' of ' + title
        listitem = xbmcgui.ListItem(ititle, iconImage=thumbnail, thumbnailImage=thumbnail)
        listitem.setInfo(type="Video", infoLabels={"Title": title, "Plot": plot})
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        listitem.setProperty("fanart_image", thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    return (links)
def getVideoDetails(self, url):
    """self.videoDetails={Title, Director, Genre, Plot, id, url}"""
    # TODO: get 'related tags' and list them under genre
    soup = BeautifulSoup(getHTML(url))
    # Talk title; the speaker is everything before the first colon.
    title = soup.find('span', attrs={'id': 'altHeadline'}).string
    speaker = title.split(':', 1)[0]
    plot = soup.find('p', attrs={'id': 'tagline'}).string
    # The last anchor labelled like 'Watch ... high-res' wins as video URL.
    watch_re = re.compile('Watch.*high-res')
    for anchor in soup.findAll('a'):
        if watch_re.match(str(anchor.string)):
            url = URLTED + anchor['href']
    # id is the trailing path component of the URL.
    id = url.split('/')[-1]
    return {'Title': title, 'Director': speaker, 'Genre': 'TED',
            'Plot': plot, 'PlotOutline': plot, 'id': id, 'url': url}
def getallpages(url, useragent, referer):
    """Return the given page URL followed by the URLs of all other parts
    linked from the page's <center> pager block."""
    page_html = getHTML(urllib.unquote_plus(str(url)).replace(' ', '%20'),
                        useragent, referer)
    pager = BeautifulSoup(page_html).find('center')
    # The requested page comes first, then every linked part in page order.
    urls = [url]
    for anchor in pager.findAll('a'):
        urls.append(anchor['href'])
    return urls
def firstPage(url):
    """List blogger posts from the landing page plus an 'older posts' entry.

    Thumbnails are pulled out of the inline bp_thumbnail_resize() script that
    blogger emits in the 'thumb' div paired with each post title.
    """
    html = getHTML(urllib.unquote_plus(url))
    # https://bugs.launchpad.net/beautifulsoup/+bug/838022
    BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(html)
    thumbs = soup.findAll('div', 'thumb')
    # Hoisted: one compiled pattern for all posts
    thumb_re = re.compile('document.write\(bp_thumbnail_resize\(\"(.+?)\",')
    for lcount, links in enumerate(soup.findAll('h2', 'post-title entry-title')):
        try:
            script = thumbs[lcount].find('script')
            thumbnail = thumb_re.findall(script.contents[0])[0]
        except (IndexError, AttributeError):
            thumbnail = "DefaultFolder.png"
        # Fixed: the original inner loop re-read links.find('a') — the FIRST
        # anchor — on every pass, adding duplicate posts when an h2 held
        # several anchors. Use each anchor itself instead.
        for anchor in links.findAll('a'):
            try:
                title = anchor.contents[0].strip()
            except (IndexError, AttributeError):
                title = "No title"
            try:
                link = anchor['href']
            except KeyError:
                link = None
            if title and link:
                if BASE_URL in link:
                    # Undo '&amp;' escaping (original replace was a no-op)
                    addPosts(str(title), urllib.quote_plus(link.replace('&amp;', '&')), thumbnail, 0)
    olderlinks = soup.find('a', 'blog-pager-older-link')
    try:
        title = olderlinks.contents[0]
    except (AttributeError, IndexError):
        title = "Mga Lumang mga Post"
    try:
        link = olderlinks.attrs[1][1]
    except (AttributeError, IndexError):
        link = None
    if title and link:
        addPosts(str(title), urllib.quote_plus(link.replace('&amp;', '&')), "DefaultFolder.png", 1)
    return
def isLoggedIn(self, username):
    """Return True if @username is already logged in, False otherwise"""
    html = getHTML(URLMONCOMPTE)
    soup = BeautifulSoup(html)
    # The account page title only reads "Mon compte" when a session is active
    if soup.title.string == u'Arrêt sur images – Mon compte':
        # Already logged in, check that the username is still the same
        userText = soup.find(text=re.compile(u'L’e-mail que vous utilisez pour @si est.*'))
        if userText and userText.next.string == username:
            return True
        else:
            print "Already logged in, but username does not match..."
    return False
def getTalks(self):
    """Yield {'url', 'Title', 'Thumb'} dicts for every talk on the page."""
    # Themes are loaded with a json call. Why are they not more consistent?
    from simplejson import loads
    # The page links to tedtalk's "api"; regex is simpler than BS here.
    jsonUrl = URLTED + re.findall('DataSource\("(.+?)"', self.html)[0]
    # The endpoint returns JSON whose payload is itself HTML markup per talk.
    talksMarkup = loads(getHTML(jsonUrl))
    for markup in talksMarkup['resultSet']['result']:
        talk = BeautifulSoup(markup['markup'])
        anchor = talk.dt.a
        yield {
            'url': URLTED + anchor['href'],
            'Title': cleanHTML(anchor['title']),
            'Thumb': resizeImage(talk.find('img', attrs={'src': re.compile('.+?\.jpg')})['src']),
        }
def isLoggedIn(self, username): """Return True if @username is already logged in, False otherwise""" html = getHTML(URLMONCOMPTE) soup = BeautifulSoup(html) if soup.title.string == u'Arrêt sur images – Mon compte': # Already logged in, check that the username is still the same userText = soup.find( text=re.compile(u'L’e-mail que vous utilisez pour @si est.*')) if userText and userText.next.string == username: return True else: print "Already logged in, but username does not match..." return False
def getVideoDownloadLink(self, url): """Return the video title and download link""" title = None link = None downloadPage = '' html = getHTML(url) soup = BeautifulSoup(html) # Look for the "bouton-telecharger" class (new version) telecharger = soup.find('a', attrs={'class': 'bouton-telecharger'}) if telecharger: downloadPage = telecharger['href'] else: # Look for the "bouton-telecharger" image (old version) img = soup.find( 'img', attrs={ 'src': 'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png' }) if img: downloadPage = img.findParent()['href'] if downloadPage.endswith('.avi'): print downloadPage title = downloadPage.split('/')[-1] print title html = getHTML(downloadPage) soup = BeautifulSoup(html) click = soup.find(text=re.compile('cliquer ici')) if click: link = click.findParent()['href'] print link else: print "No \"cliquer ici\" found" else: print "bouton-telecharger not found" return {'Title': title, 'url': link}
def get_vidlink_disklinksharetvplay(url, useragent, referer):
    """Resolve a hoster player page into a locally playable .m3u8 path.

    The hoster playlist references its segments relative to its own
    directory, so every non-comment line is rewritten to an absolute URL
    and the rewritten playlist is saved to a temp file for Kodi to play.
    """
    html = getHTML(urllib.unquote_plus(url), useragent, referer)
    soup = BeautifulSoup(str(html))
    vidlink = soup.find('source')['src']
    playlist = getHTML(vidlink, useragent, referer)
    # Server base = playlist URL with its trailing file name stripped
    server = re.sub('(/)[a-zA-Z0-9\.\-]+$', r'\1', vidlink, flags=re.DOTALL)
    # Prefix each bare segment line (not starting with '#') with the base URL
    rewritten = re.sub(r'\n([^#])', '\n' + server + r'/\1', playlist, flags=re.DOTALL)
    tmpname = 'special://temp/' + str(random.randint(1, 100000001)) + '.m3u8'
    out = xbmcvfs.File(tmpname, 'w')
    out.write(str(rewritten))
    out.close()
    return tmpname
def firstPage(url):
    """List the latest posts on the landing page plus a 'Next Page' entry."""
    html = getHTML(urllib.unquote_plus(url))
    # https://bugs.launchpad.net/beautifulsoup/+bug/838022
    BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(html)
    for article in soup.findAll('article', 'latestPost excerpt layout-1'):
        h2 = article.find('h2', 'title front-view-title')
        try:
            title = h2.find('a')['title']
        except (AttributeError, KeyError):
            title = "No title"
        try:
            link = h2.find('a')['href']
        except (AttributeError, KeyError):
            link = None
        # Thumbnails are lazy-loaded, hence the 'data-layzr' attribute
        try:
            div = article.find('div', 'featured-thumbnail')
            thumbnail = div.find('img')['data-layzr']
        except (AttributeError, KeyError):
            thumbnail = "DefaultFolder.png"
        if title and link:
            if BASE_URL in link:
                addPosts(title, link, thumbnail, 0)
    # Mga lumang mga post (older posts / pagination)
    olderlinks = soup.find('a', 'next page-numbers')
    title = "Next Page"
    try:
        link = olderlinks.attrs[1][1]
    except (AttributeError, IndexError):
        link = None
    if title and link:
        # Undo '&amp;' escaping before quoting (original replace was a no-op)
        addPosts(str(title), urllib.quote_plus(link.replace('&amp;', '&')),
                 "DefaultFolder.png", 1)
    return
def getfirstPage_teleserye(url,useragent,referer):
    # Scrape the teleserye.su front page. Returns two dicts keyed by the
    # article slug (first path component of the article URL):
    #   tlinks: {slug: title},  llinks: {slug: link}
    tlinks = {}
    llinks = {}
    html = getHTML(urllib.unquote_plus(str(url)).replace(' ','%20'),useragent,referer)
    #BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(str(html))
    for article in soup.findAll('div','cat-hadding'):
        try:
            title = article.find('a')['title']
        except:
            title = "No title"
        try:
            link = article.find('a')['href']
        except:
            link = None
        # Thumbnail is extracted but currently unused by this function
        try:
            thumbnail = article.find('img')['data-layzr']
        except:
            thumbnail = "DefaultFolder.png"
        if title and link:
            # addPosts(title, link, thumbnail, 0)
            # The slug is the first path component of the article URL
            match_url = re.compile('http://www.teleserye.su/([^/]+?)/.+?$').findall(link)
            articleid = match_url[0]
            #alinks[title] = link
            #ilinks[title] = articleid
            tlinks[articleid] = title
            llinks[articleid] = link
    # Pager lookup is computed but its result is unused (see commented code)
    olderlinks = soup.find('a', 'blog-pager-older-link')
    try:
        title = olderlinks.contents[0]
    except:
        title = "Older Posts"
    try:
        link = olderlinks.attrs[1][1]
    except:
        link = None
    #if title and link:
    #addPosts(str(title), urllib.quote_plus(link.replace('&','&')), "DefaultFolder.png", 1)
    return(tlinks,llinks)
def get_vidlink_linksharetv(url, useragent, referer):
    """Return the direct video URL from a linkshare.tv player page.

    The player page embeds the stream in a <source src=...> tag.
    """
    page = getHTML(urllib.unquote_plus(url), useragent, referer)
    source_tag = BeautifulSoup(str(page)).find('source')
    return source_tag['src']
filename_year = 1999 else: filename_year = int('20'+two_digit_year,10) try: date_from_filename = datetime.date(filename_year,filename_month,filename_day) except ValueError: date_from_filename = None if verbose: print "Date in filename %s-%s-%s" % ( filename_year, filename_month, filename_day ) # Don't soup it if we don't have to: if date_from_filename and date_from_filename < all_after_date: continue day_soup = MinimalSoup(day_html) day_body = day_soup.find('body') if day_body: page_as_text = non_tag_data_in(day_body) else: error = "File couldn't be parsed by MinimalSoup: "+day_filename raise Exception, error # Now guess the date from the file contents as well: m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text) if not m: m = re.search('(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?',page_as_text) if m: day_of_week = m.group(2) day = m.group(3) month = month_name_to_int(m.group(4)) if month == 0:
try: date_from_filename = datetime.date(filename_year, filename_month, filename_day) except ValueError: date_from_filename = None if verbose: print "Date in filename %s-%s-%s" % (filename_year, filename_month, filename_day) # Don't soup it if we don't have to: if date_from_filename and date_from_filename < all_after_date: continue day_soup = MinimalSoup(day_html) day_body = day_soup.find('body') if day_body: page_as_text = non_tag_data_in(day_body) else: error = "File couldn't be parsed by MinimalSoup: " + day_filename raise Exception, error # Now guess the date from the file contents as well: m = re.search( '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)(\d+)\w*\s+(\w+)(\s+(\d+))?', page_as_text) if not m: m = re.search( '(?ims)((Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?(\d+)\w*\s+(\w+)(\s+(\d+))?', page_as_text) if m:
def listPage(url):
    """Build the directory for a single multi-part show page.

    Each <h3> heading inside #tabs-1 is one part and pairs positionally with
    the <iframe> at the same index; every iframe src is resolved to a direct
    video link. When more than one part resolved, a 'Play All Parts' entry is
    added first.
    """
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    # Page-level metadata reused for every item
    thumbnail_meta = soup.find('meta', attrs={'property': 'og:image'})
    try:
        thumbnail = thumbnail_meta['content']
    except (TypeError, KeyError):
        thumbnail = "DefaultFolder.png"
    title_tag = soup.find('title')
    try:
        title = title_tag.contents[0]
    except (AttributeError, IndexError):
        title = "no title"
    tab1 = soup.find('div', attrs={'id': 'tabs-1'})
    headings = tab1.findAll('h3')
    # Fixed: reuse the iframe list (the original re-ran tab1.findAll('iframe')
    # on every loop pass, making the resolution step O(n^2))
    iframes = tab1.findAll('iframe')
    links = []
    for idx in range(len(headings)):
        links.append(str(get_vidlink(iframes[idx]['src'])))
    if len(links) > 1:
        durl = build_url({
            'url': links,
            'mode': 'playAllVideos',
            'foldername': title,
            'thumbnail': thumbnail,
            'title': title
        })
        li = xbmcgui.ListItem('Play All Parts', iconImage=thumbnail)
        # Fixed: original plot read "All parts of<title>" with no space
        li.setInfo(type="Video", infoLabels={"Title": title,
                                             "Plot": "All parts of " + title})
        li.setProperty('fanart_image', thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=durl, listitem=li)
    for idx, heading in enumerate(headings):
        ititle = heading.contents[0]
        url = links[idx]
        plot = ititle + ' of ' + title
        listitem = xbmcgui.ListItem(ititle, iconImage=thumbnail, thumbnailImage=thumbnail)
        listitem.setInfo(type="Video", infoLabels={"Title": title, "Plot": plot})
        # Consistency with the sibling implementation: set the playable path
        # (the original omitted setPath here)
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        listitem.setProperty("fanart_image", thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    return