def __retrieve_tv_shows__(tv_channel_url):
    tv_channel = {}
    tv_channel["running_tvshows"] = []
    tv_channel["finished_tvshows"] = []
    logging.getLogger().debug('TV Channel URL: %s' % tv_channel_url)  # %s formatting tolerates a None url
    tv_shows = tv_channel["running_tvshows"]
    if tv_channel_url is None:
        return tv_channel  # return the (empty) channel dict for a consistent return type
    tv_channel_url = BASE_WSITE_URL + tv_channel_url
    logging.getLogger().debug(tv_channel_url)
    contentDiv = BeautifulSoup.SoupStrainer('li', {'class': 'categories'})
    soup = HttpClient().get_beautiful_soup(url=tv_channel_url, parseOnlyThese=contentDiv)
    # soup = BeautifulSoup.BeautifulSoup(HttpClient().get_html_content(url=tv_channel_url)).findAll('div', {'id':'forumbits', 'class':'forumbits'})[0]
    for title_tag in soup.findAll('li'):
        aTag = title_tag.findNext('a')
        tv_show_url = str(aTag['href'])
        if tv_show_url[0:4] != "http":
            tv_show_url = BASE_WSITE_URL + '/' + tv_show_url
        tv_show_name = aTag.getText()
        if not re.search('Completed Shows', tv_show_name, re.IGNORECASE):
            tv_shows.append({"name": http.unescape(tv_show_name), "url": tv_show_url, "iconimage": ""})
        else:
            # Entries after the 'Completed Shows' marker belong to the finished list.
            tv_shows = tv_channel["finished_tvshows"]
    return tv_channel
def find_artist_url(artist_name):
    normalized = urllib.quote(artist_name, '')
    url = 'http://www.musicbrainz.org/ws/2/artist/?query=artist:"%s"&limit=10' % normalized
    print(' Searching MusicBrainz at URL %s' % url)
    resp = requests.get(url, headers=HEADERS)
    tree = ET.fromstring(resp.text.encode('utf-8'))[0]
    artist_node = tree[0]
    # prefer exact matches
    for node in tree:
        if node[0].text == artist_name:
            artist_node = node
            break
    mb_id = artist_node.attrib['id']
    print(' Found MusicBrainz ID %s' % mb_id)
    the_ugly = requests.get('http://musicbrainz.org/artist/%s' % mb_id, headers=HEADERS)
    for link in bs.BeautifulSoup(the_ugly.text, parseOnlyThese=bs.SoupStrainer('a')):
        if link.has_key('href') and WIKI_LINK.match(link['href']):
            if 'discography' not in link['href']:
                return link['href']
    raise NoWikiForArtistError(' MusicBrainz does not have a wikipedia page for %s' % artist_name)
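# A minimal sketch of the module-level names find_artist_url() depends on.
# These definitions are assumptions for illustration, not the original source:
# HEADERS identifies the client to MusicBrainz, WIKI_LINK matches Wikipedia
# hrefs, and NoWikiForArtistError signals that no page was found.
import re
import urllib
import requests
import BeautifulSoup as bs
import xml.etree.ElementTree as ET

HEADERS = {'User-Agent': 'artist-lookup-example/0.1'}                 # assumed
WIKI_LINK = re.compile(r'https?://([a-z-]+\.)?wikipedia\.org/wiki/')  # assumed

class NoWikiForArtistError(Exception):  # assumed
    pass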
def __retrieve_tv_shows__(tv_channel_url):
    tv_shows = []
    if tv_channel_url is None:
        return tv_shows
    tv_channel_url = BASE_WSITE_URL + tv_channel_url
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'all-tv-shows'})
    soup = HttpClient().get_beautiful_soup(url=tv_channel_url, parseOnlyThese=contentDiv, accept_500_error=True)
    list_item = soup.find('ul')
    for item in list_item.findChildren('li'):
        aTag = item.findChild('a')
        tv_show_url = str(aTag['href'])
        if tv_show_url[0:4] != "http":
            tv_show_url = BASE_WSITE_URL + '/' + tv_show_url
        tv_show_name = aTag.getText()
        tv_shows.append({"name": http.unescape(tv_show_name), "url": tv_show_url, "iconimage": ""})
    return tv_shows
def downloadShowList(self, Shows):
    '''
    Download a list of Show object's episodes.

    :param Shows: Shows to fetch data for
    :type Shows: list of api.dbapi.Show objects
    :returns: dictionary mapping each Show to its seasons ({ season : [ episode divs ] })
    :rtype: dict
    '''
    conn = httplib.HTTPConnection("www.imdb.com")
    #totalbytes = 0
    gzippedfiles = []
    Showdict = {}
    for Show in Shows:
        headers = {'User-Agent': 'veefire/1.0', 'Accept-encoding': 'gzip'}
        params = ''
        conn.request("GET", "/title/" + Show.url + "/episodes", params, headers)
        r1 = conn.getresponse()
        data1 = r1.read()
        seasons_select_form_id = BeautifulSoup.SoupStrainer('select', {"id": "bySeason"})
        seasons_select_form = BeautifulSoup.BeautifulSoup(
            gzip.GzipFile(fileobj=StringIO.StringIO(data1)).read(),
            parseOnlyThese=seasons_select_form_id).findAll('option')
        seasonsraw = [seasonnumber["value"] for seasonnumber in seasons_select_form]
        seasons = dict()
        for season in seasonsraw:
            conn.request("GET", "/title/" + Show.url + "/episodes?season=" + season, params, headers)
            r = conn.getresponse()
            d = r.read()
            filter = BeautifulSoup.SoupStrainer('div', {"class": "info"})
            if season not in seasons:
                seasons[season] = list()
            seasons[season].extend([{"season": season, "episode": episode}
                                    for episode in BeautifulSoup.BeautifulSoup(
                                        gzip.GzipFile(fileobj=StringIO.StringIO(d)).read(),
                                        parseOnlyThese=filter)])
        #print seasons
        #print r1.status, r1.reason, ' [ gzipped: ' + str(len(data1)) + ' bytes ]'
        #totalbytes += len(data1)
        #gzippedfiles.append(data1)
        Showdict[Show] = seasons
    conn.close()
    return Showdict
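# A minimal usage sketch for downloadShowList(). The stand-in class below is an
# assumption: the real api.dbapi.Show is not shown in this file, and only its
# 'url' attribute (an IMDb title id such as 'tt0903747') is relied upon here.
class _FakeShow(object):
    def __init__(self, url):
        self.url = url

# downloader = ...  # whichever object provides downloadShowList()
# shows = downloader.downloadShowList([_FakeShow('tt0903747')])
# for show, seasons in shows.items():
#     print show.url, sorted(seasons.keys())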
def displayRecentMovies(request_obj, response_obj):
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'sub-sidebar'})
    soup = HttpClient().getBeautifulSoup(url='http://www.pinoymovie.co/', parseOnlyThese=contentDiv)
    soup = soup.findChild('div', {'class': 'right'})
    movieLinkTags = soup.findChildren('a')
    recentMoviesItems = XBMCInterfaceUtils.callBackDialogProgressBar(
        getattr(sys.modules[__name__], '__retrieveRecentMovies__'), movieLinkTags,
        'Retrieving recent movies and their information',
        'Failed to retrieve video information, please try again later',
        line1='Takes about 5 minutes')
    response_obj.extendItemList(recentMoviesItems)
def retieveTrailerStream(request_obj, response_obj):
    soup = None
    title = request_obj.get_data()['movieTitle']
    if request_obj.get_data().has_key('movieInfo'):
        soup = BeautifulSoup.BeautifulSoup(request_obj.get_data()['movieInfo'])
    elif request_obj.get_data().has_key('moviePageUrl'):
        contentDiv = BeautifulSoup.SoupStrainer('div', {'dir': 'ltr'})
        soup = HttpUtils.HttpClient().getBeautifulSoup(url=request_obj.get_data()['moviePageUrl'], parseOnlyThese=contentDiv)
    if soup is None:
        return
    videoLink = None
    Logger.logDebug(soup.prettify())
    frameTag = soup.findChild('iframe', recursive=True)
    if frameTag is not None:
        videoLink = frameTag['src']
    else:
        paramTag = soup.findChild('param', attrs={'name': 'movie'}, recursive=True)
        if paramTag is not None:
            videoLink = paramTag['value']
        else:
            videoLink = soup.findChild('embed', recursive=True)['src']
    request_obj.set_data({'videoLink': videoLink, 'videoTitle': title})
def rafraichir(self):
    # Clear the list of shows
    self.listeEmissions.clear()
    # Fetch the description XML for every show
    for (chaine, urlChaine) in self.listeChaines.items():
        self.listeEmissions[chaine] = {}
        pageHtml = self.getPage(urlChaine)
        soupStrainer = BeautifulSoup.SoupStrainer("a", {"class": "visuel"})
        pageSoup = BeautifulSoup.BeautifulSoup(pageHtml, parseOnlyThese=soupStrainer)
        # List of the show pages
        listePagesUrl = map(lambda x: "%s%s" % ("http://www.franceinter.fr", x["href"]), pageSoup.contents)
        # Fetch every page
        listePagesData = self.getPages(listePagesUrl)
        for emission in pageSoup.contents:
            try:
                nomEmission = emission["title"]
                urlPageEmission = "%s%s" % ("http://www.franceinter.fr", emission["href"])
                # Extract the XML link from the show's page
                urlXml = re.findall("http://radiofrance-podcast.net/podcast09/rss_\d+?.xml", listePagesData[urlPageEmission])[0]
                # Add the show to the list
                self.listeEmissions[chaine][nomEmission] = urlXml
            except:
                continue
    # Save the list to the cache
    self.sauvegarderCache(self.listeEmissions)
def extract_links(html):
    """
    >>> html = '<a href="go">hey</a><br href="not" /><a href="w"></a>'
    >>> links = [str(link) for link in extract_links(html)] #unicode -> str
    >>> links
    ['go', 'w']
    >>> h = '<a href="javascript:poptastic(\\'event.php?eventID=922\\')"></a>'
    >>> l = [str(link) for link in extract_links(h)] #unicode -> str
    >>> l
    ['event.php?eventID=922']
    >>> h = "<a href='javascript:poptastic(\\"event.php?eventID=922\\")'></a>"
    >>> l = [str(link) for link in extract_links(h)] #unicode -> str
    >>> l #also works for double-quoted javascript
    ['event.php?eventID=922']
    >>> html = '<a name="bla" id="q">hello anchor</a>'
    >>> links = extract_links(html)
    >>> links
    []
    """
    a = BeautifulSoup.SoupStrainer('a')
    links = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=a)
    hrefs = [link['href'] for link in links if link.has_key('href')]

    # also unwrap javascript popup links
    def extract_js(link):
        if link.lower().startswith('javascript'):
            return get_match(link, r"\([\'\"](.*)[\'\"]\)")
        else:
            return link

    hrefs = [extract_js(href) for href in hrefs]
    return hrefs
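# The docstring above doubles as a doctest suite. A minimal runner sketch
# (assumes get_match() is defined elsewhere in this module, as the code implies):
if __name__ == '__main__':
    import doctest
    doctest.testmod()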
def displayTVShowEpisodes(request_obj, response_obj):
    url = request_obj.get_data()['tvChannelUrl']
    contentDiv = GetContent(url)
    newcontent = ''.join(contentDiv.encode("utf-8").splitlines()).replace('\t', '')
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'content'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    videoBoxes = re.compile("<div id='videobox'>(.+?)</h3><div style='clear: both;'>").findall(newcontent)
    for videoBox in videoBoxes:
        #imgTag = videoBox.findChild('img')
        imageUrl = re.compile('<img [^>]*src=["\']?([^>^"^\']+)["\']?[^>]*>').findall(str(videoBox))[0]
        match = re.compile('createSummaryThumb\("(.+?)","(.+?)","(.+?)",').findall(str(videoBox))
        if (len(match) > 0):
            episodeName = match[0][1]
            episodeUrl = str(match[0][2])
            item = ListItem()
            item.add_request_data('episodeName', episodeName)
            item.add_request_data('episodeUrl', episodeUrl)
            item.set_next_action_name('Show_Episode_VLinks')
            xbmcListItem = xbmcgui.ListItem(label=episodeName, iconImage=imageUrl, thumbnailImage=imageUrl)
            item.set_xbmc_list_item_obj(xbmcListItem)
            response_obj.addListItem(item)
    pageTag = soup.findChild('div', {'class': 'postnav'})
    if (pageTag != None):
        olderPageTag = pageTag.findChild('a', {'class': 'blog-pager-older-link'})
    else:
        olderPageTag = None
    if olderPageTag is not None:
        item = ListItem()
        item.add_request_data('tvChannelUrl', str(olderPageTag['href']))
        pageName = AddonUtils.getBoldString(' -> Next Page')
        item.set_next_action_name('Show_Episodes_Next_Page')
        xbmcListItem = xbmcgui.ListItem(label=pageName)
        item.set_xbmc_list_item_obj(xbmcListItem)
        response_obj.addListItem(item)
def displayMovies(request_obj, response_obj):
    url = request_obj.get_data()['movieCategoryUrl']
    print "indisplay" + url
    if request_obj.get_data().has_key('page'):
        url_parts = url.split('?')
        url_part_A = ''
        url_part_B = ''
        if len(url_parts) == 2:
            url_part_A = url_parts[0]
            url_part_B = '?' + url_parts[1]
        else:
            url_part_A = url
        if url_part_A[len(url_part_A) - 1] != '/':
            url_part_A = url_part_A + '/'
        url = url_part_A + 'page/' + request_obj.get_data()['page'] + url_part_B
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'content'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    movieTags = soup.findChildren('div', {'class': 'post'})
    print "intags" + str(movieTags)
    if len(movieTags) == 0:
        movieTags = soup.findChildren('div', {'class': 'videopost'})
    for movieTag in movieTags:
        item = __retrieveAndCreateMovieItem__(movieTag)
        response_obj.addListItem(item)
    response_obj.set_xbmc_content_type('movies')
    try:
        pagesInfoTag = soup.findChild('div', {'class': 'navigation'})
        current_page = int(pagesInfoTag.find('span', {'class': 'page current'}).getText())
        #print current_page
        pages = pagesInfoTag.findChildren('a', {'class': 'page'})
        #print pages
        last_page = int(pages[len(pages) - 1].getText())
        if current_page < last_page:
            for page in range(current_page + 1, last_page + 1):
                createItem = False
                if page == last_page:
                    pageName = AddonUtils.getBoldString(' -> Last Page #' + str(page))
                    createItem = True
                elif page <= current_page + 4:
                    pageName = AddonUtils.getBoldString(' -> Page #' + str(page))
                    createItem = True
                if createItem:
                    item = ListItem()
                    item.add_request_data('movieCategoryUrl', request_obj.get_data()['movieCategoryUrl'])
                    item.add_request_data('page', str(page))
                    item.set_next_action_name('Movies_List_Next_Page')
                    xbmcListItem = xbmcgui.ListItem(label=pageName)
                    item.set_xbmc_list_item_obj(xbmcListItem)
                    response_obj.addListItem(item)
    except:
        pass
def _parse_pagetitle(self, page, url):
    ''' Get the page title '''
    head_tag = BeautifulSoup.SoupStrainer('head')
    soup = BeautifulSoup.BeautifulSoup(page, parseOnlyThese=head_tag, convertEntities=['html', 'xml'])
    if soup.title is None:
        return '%s -- no title found' % url
    title = unicode(soup.title.string).encode('utf-8')
    return '%s -- "%s"' % (url, title)
def load_tv_show_episodes(req_attrib, modelMap):
    logging.getLogger().debug('load tv show episodes...')
    url = req_attrib['tv-show-url']
    tv_show_url = req_attrib['tv-show-url']
    tv_show_name = req_attrib['tv-show-name']
    channel_type = req_attrib['channel-type']
    channel_name = req_attrib['channel-name']
    currentPage = 1
    if req_attrib.has_key('tv-show-page') and req_attrib['tv-show-page'] != '':
        currentPage = int(req_attrib['tv-show-page'])
    if currentPage != 1:
        url = url + 'page/' + req_attrib['tv-show-page'] + '/'
    logging.getLogger().debug('load tv show episodes...' + url)
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'left-div'})
    soup = HttpClient().get_beautiful_soup(url=url + '?tag=video', parseOnlyThese=contentDiv)
    # soup = BeautifulSoup.BeautifulSoup(HttpClient().get_html_content(url=url)).findAll('div', {'id':'contentBody'})[0]
    tv_show_episode_items = []
    threads = soup.findAll('h2', {'class': 'titles'})
    tv_show_episode_items.extend(__retrieveTVShowEpisodes__(threads, tv_show_name, channel_type, channel_name))
    logging.getLogger().debug('In DTB: total tv show episodes: %s' % str(len(tv_show_episode_items)))
    pagesDiv = soup.findChild('p', {'class': 'pagination'})
    if pagesDiv is not None:
        pagesInfoTags = pagesDiv.findAllNext('a')
        for pagesInfoTag in pagesInfoTags:
            logging.getLogger().debug(pagesInfoTag)
            pageInfo = re.compile('page/(.+?)/').findall(pagesInfoTag['href'])
            if len(pageInfo) > 0:
                if re.search('Old', pagesInfoTag.getText(), re.IGNORECASE):
                    item = xbmcgui.ListItem(label='<< Older Entries')
                elif re.search('Next', pagesInfoTag.getText(), re.IGNORECASE):
                    item = xbmcgui.ListItem(label='Next Entries >>')
                # findall() with a single group returns strings, so pageInfo[0]
                # is the full page number (pageInfo[0][0] would truncate '12' to '1')
                item.setProperty('tv-show-page', pageInfo[0])
                item.setProperty('channel-type', channel_type)
                item.setProperty('channel-name', channel_name)
                item.setProperty('tv-show-name', tv_show_name)
                item.setProperty('tv-show-url', tv_show_url)
                tv_show_episode_items.append(item)
            else:
                item = xbmcgui.ListItem(label='Newest Entries >>')
                item.setProperty('tv-show-page', '1')
                item.setProperty('channel-type', channel_type)
                item.setProperty('channel-name', channel_name)
                item.setProperty('tv-show-name', tv_show_name)
                item.setProperty('tv-show-url', tv_show_url)
                tv_show_episode_items.append(item)
    modelMap['tv_show_episode_items'] = tv_show_episode_items
def get_urls(cls, content):
    # retrieve all link hrefs from html
    links = []
    try:
        link_soup = BeautifulSoup.BeautifulSoup(content, parseOnlyThese=BeautifulSoup.SoupStrainer('a'))
    except UnicodeEncodeError:
        return links
    for link in link_soup:
        if link.has_key('href'):
            links.append(link.get('href'))
    return cls.clean_urls(links)
def render(self, entry):
    soup = BeautifulSoup.BeautifulSoup(entry.body, parseOnlyThese=BeautifulSoup.SoupStrainer("img"))
    imgs = soup.findAll("img")
    thumbnails = []
    for img in imgs:
        if "nomediarss" in img.get("class", "").split():
            continue
        thumbnails.append({
            "url": img["src"],
            "title": img.get("title", img.get("alt", "")),
            "width": img.get("width", ""),
            "height": img.get("height", ""),
        })
    return self.render_string("modules/mediarss.html", entry=entry, thumbnails=thumbnails)
def displayAllTVShows(request_obj, response_obj):
    url = request_obj.get_data()['tvChannelUrl']
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'rightwidget'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    tvshows = soup.findChildren('a')
    for tvshow in tvshows:
        tvshowName = tvshow.getText()
        tvshowUrl = str(tvshow['href'])
        item = ListItem()
        item.add_request_data('tvshowName', tvshowName)
        item.add_request_data('tvshowUrl', tvshowUrl)
        item.add_request_data('tvChannelUrl', tvshowUrl)
        item.set_next_action_name('Show_Episodes')
        xbmcListItem = xbmcgui.ListItem(label=tvshowName)
        item.set_xbmc_list_item_obj(xbmcListItem)
        response_obj.addListItem(item)
def rafraichir(self):
    self.afficher(u"Récupération de la liste des émissions...")
    # Reset
    self.listeEmissions.clear()
    # Fetch the main page
    page = self.API.getPage(self.pageEmissions)
    soupStrainer = BeautifulSoup.SoupStrainer("div", {"class": "unit size1of5"})
    pageSoup = BeautifulSoup.BeautifulSoup(page, parseOnlyThese=soupStrainer)
    # Extract the shows
    for emissionBlock in pageSoup.contents:
        try:
            nomEmission = unicodedata.normalize('NFKD', emissionBlock.div.p.a["title"]).encode('ASCII', 'ignore')
            idEmission = emissionBlock.div.p.input["value"]
            self.listeEmissions[nomEmission] = idEmission
        except:
            continue
    self.sauvegarderCache(self.listeEmissions)
    self.afficher(u"Liste des émissions sauvegardées")
def listerFichiers(self, emission):
    if (self.listeEmissions.has_key(emission)):
        # Fetch the page that lists the files
        pageFichiers = self.API.getPage("http://www.lcp.fr/spip.php?page=lcp_page_videos_ajax&parent=%s" % (self.listeEmissions[emission]))
        soupStrainer = BeautifulSoup.SoupStrainer("div", {"class": "video-item"})
        pageFichiersSoup = BeautifulSoup.BeautifulSoup(pageFichiers, parseOnlyThese=soupStrainer)
        # First extract all the links to the pages that contain the files
        listeUrls = map(lambda x: "http://www.lcp.fr/%s" % (x["href"]), pageFichiersSoup.findAll("a"))
        # Fetch every page
        dicoPageFichier = self.API.getPages(listeUrls)
        # Extract the files
        for fichiersBlock in pageFichiersSoup.contents:
            try:
                urlPageFichier = "http://www.lcp.fr/%s" % (fichiersBlock.strong.a["href"])
                urlImage = "http://www.lcp.fr/%s" % (fichiersBlock.strong.img["src"])
                descriptif = fichiersBlock.p.contents[0].replace("\n", "").replace("\t", "")
                pageFichier = dicoPageFichier[urlPageFichier]
                if (pageFichier == ""):
                    continue
                soup = BeautifulSoup.BeautifulSoup(pageFichier)
                #
                # k3c code
                #
                nom = urlPageFichier.split('/')[-1:][0]
                player = soup.find('param', {'name': 'movie'})['value']
                info_video = soup.find('param', attrs={'name': 'flashvars'})['value']
                host = info_video.split('rtmp://')[1].split('/')[0]
                app = info_video.split('rtmp://')[1].split('/')[1]
                s2 = host + "/" + app + "/"
                playpath = info_video.split(s2)[1].split('/mp4')[0]
                playpath = playpath[:-4]
                cmds = "rtmpdump" + " --resume --live 0 --host " + host + " --swfVfy " + player + " --swfAge 0 -v --app " + app + " --playpath " + playpath + " -e -k 1 --flv " + str(nom) + ".mp4"
                #
                # End k3c code
                #
                # Add the file
                self.ajouterFichier(emission,
                                    Fichier(nom="%s - %s" % (emission, descriptif),
                                            lien=cmds,
                                            nomFichierSortie="%s %s.mp4" % (emission, descriptif),
                                            urlImage=urlImage,
                                            descriptif=descriptif))
            except:
                continue
def displayMoviesMenu(request_obj, response_obj):
    # All Movies
    movies_icon_filepath = AddonUtils.getCompleteFilePath(baseDirPath=AddonContext().addonPath, extraDirPath=AddonUtils.ADDON_ART_FOLDER, filename='movies.png')
    item = ListItem()
    item.set_next_action_name('Movies_List')
    item.add_request_data('movieCategoryUrl', 'http://www.pinoymovie.co/video')
    xbmcListItem = xbmcgui.ListItem(label='All Movies', iconImage=movies_icon_filepath, thumbnailImage=movies_icon_filepath)
    item.set_xbmc_list_item_obj(xbmcListItem)
    response_obj.addListItem(item)
    # Recently Added
    item = ListItem()
    item.set_next_action_name('Recent_Movies_List')
    item.add_request_data('movieCategoryUrl', 'http://www.pinoymovie.co/video')
    xbmcListItem = xbmcgui.ListItem(label='Recently Added', iconImage=movies_icon_filepath, thumbnailImage=movies_icon_filepath)
    item.set_xbmc_list_item_obj(xbmcListItem)
    #response_obj.addListItem(item)
    # Movie categories
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'sub-sidebar'})
    soup = HttpClient().getBeautifulSoup(url='http://www.pinoymovie.co/video', parseOnlyThese=contentDiv)
    soup = soup.findChild('div', {'class': 'right'})
    for liItemTag in soup.findChildren('li', {'class': re.compile(r'\bcat-item\b')}):
        aTag = liItemTag.findChild('a')
        categoryUrl = aTag['href']
        categoryName = aTag.getText()
        item = ListItem()
        item.set_next_action_name('Movies_List')
        item.add_request_data('movieCategoryUrl', categoryUrl)
        xbmcListItem = xbmcgui.ListItem(label=categoryName, iconImage=movies_icon_filepath, thumbnailImage=movies_icon_filepath)
        item.set_xbmc_list_item_obj(xbmcListItem)
        response_obj.addListItem(item)
    # Search Movies
    search_icon_filepath = AddonUtils.getCompleteFilePath(baseDirPath=AddonContext().addonPath, extraDirPath=AddonUtils.ADDON_ART_FOLDER, filename='search.png')
    item = ListItem()
    item.set_next_action_name('Search_Movies_List')
    item.add_request_data('movieCategoryUrl', 'http://www.pinoymovie.co/?s=')
    xbmcListItem = xbmcgui.ListItem(label='Search Movies', iconImage=search_icon_filepath, thumbnailImage=search_icon_filepath)
    item.set_xbmc_list_item_obj(xbmcListItem)
    response_obj.addListItem(item)
def get_top_K_pages(phrase, K):
    """
    In which we coax a mighty search engine into giving us what we want.

    TODO:

    References:
    - http://en.wikibooks.org/wiki/Python_Programming/Internet
    - http://docs.python.org/library/urllib2.html
    """
    global W, T_to_be_visited
    # TODO: use urllib.quote instead of str.replace
    search_url = yahoo_url % (phrase.replace(' ', '+'), str(K))
    # Sleep for a few seconds, just in case we are calling the search engine too frequently
    time.sleep(search_lag_time)
    search_results = urllib2.urlopen(urllib2.Request(search_url, None, headers))
    clickurls = BeautifulSoup.SoupStrainer('clickurl')
    results_soup = BeautifulSoup.BeautifulStoneSoup(search_results, parseOnlyThese=clickurls)
    logging.debug('Search results: ' + results_soup.prettify())
    # order of W is not important at the moment
    W = set([link.string for link in results_soup.findAll('clickurl')])
    T_to_be_visited = list(W.copy())
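# A minimal sketch of the module globals get_top_K_pages() relies on; every
# value below is an assumption for illustration (the real yahoo_url endpoint
# format, headers, and lag time are defined elsewhere in the original source).
import time
import logging
import urllib2
import BeautifulSoup

yahoo_url = 'http://search.yahooapis.com/WebSearchService/V1/webSearch?appid=demo&query=%s&results=%s'  # assumed format with two %s slots
headers = {'User-Agent': 'crawler-example/0.1'}  # assumed
search_lag_time = 5   # seconds to sleep between queries, assumed
W = set()             # result URL set filled by get_top_K_pages()
T_to_be_visited = []  # crawl frontier filled by get_top_K_pages()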
def soupify(url, model):
    """
    Gets html from string url
    Passes html through strainer rules contained in dict model
    Feeds strainer results back into model
    If we got redirected during the fetch, return None
    Else, return model with strainer results
    """
    assert type(url) is str and type(model) is dict
    contents = urllib2.urlopen(url)
    final_url = contents.geturl()
    html = contents.read()  # read once: the response object can only be consumed a single time
    # check for redirect
    if url == final_url:
        # if no redirect, run soup through strainers
        for k, v in model.iteritems():
            try:
                if v:
                    strainer = BeautifulSoup.SoupStrainer(v[0], attrs=v[1])
                    model[k] = BeautifulSoup.BeautifulSoup(html, strainer)
            except KeyError:
                print k, v
        return model
    else:
        return None
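# A minimal usage sketch for soupify(). The tag/attrs pairs below are
# illustrative assumptions, not part of the original source: each model value
# is a (tag_name, attrs_dict) pair, and each key is replaced with the strained
# soup. Note this performs a live fetch of the URL.
model = {
    'headings': ('h2', {}),
    'links': ('a', {}),
}
result = soupify('http://example.com/', model)
if result is not None:
    print result['headings']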
if __name__ == '__main__':
    print 'DTF actions...'
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'tv-channel-list'})
    soup = http.HttpClient().get_beautiful_soup(url='http://desitvforum.net/television/', parseOnlyThese=contentDiv, accept_500_error=True)
    list = soup.find('ul')
    for item in list.findChildren('li'):
        channel = item.findChild('a')
        link = channel['href']
        name = channel.getText()
        print name
        print link
        print '------------------------'
        try:
            retrieve_tv_shows(link)
        except:
            pass
def retrieveVideoLinks(request_obj, response_obj):
    video_source_id = 1
    video_source_img = None
    video_part_index = 0
    video_playlist_items = []
    #ignoreAllLinks = False
    url = request_obj.get_data()['episodeUrl']
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'entry'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    soup = soup.findChild('div')
    for child in soup.findChildren():
        if child.name == 'img' or child.name == 'param' or child.name == 'object' or child.name == 'b' or child.name == 'font' or child.name == 'br':
            pass
        elif child.name == 'span' and re.search('ALTERNATIVE VIDEO', child.getText(), re.IGNORECASE):
            if len(video_playlist_items) > 0:
                response_obj.addListItem(__preparePlayListItem__(video_source_id, video_source_img, video_playlist_items))
            video_source_id = video_source_id + 1
            video_source_img = None
            video_part_index = 0
            video_playlist_items = []
            #ignoreAllLinks = False
        elif child.name == 'embed' or child.name == 'iframe':
            if re.search('http://gdata.youtube.com/feeds/api/playlists/', str(child)) or re.search('http://www.youtubereloaded.com/playlists/', str(child)):
                playlistId = re.compile('/playlists/(.+?)(\&|\.xml)').findall(str(child))[0][0]
                videoUrls = YouTube.retrievePlaylistVideoItems(playlistId)
                for videoUrl in videoUrls:
                    try:
                        video_part_index = video_part_index + 1
                        video_link = {}
                        video_link['videoTitle'] = 'Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index)
                        video_link['videoLink'] = videoUrl
                        print "myvidlink" + videoUrl
                        video_hosting_info = SnapVideo.findVideoHostingInfo(video_link['videoLink'])
                        video_link['videoSourceImg'] = video_hosting_info.get_video_hosting_image()
                        video_playlist_items.append(video_link)
                        video_source_img = video_link['videoSourceImg']
                        item = ListItem()
                        item.add_request_data('videoLink', video_link['videoLink'])
                        item.add_request_data('videoTitle', video_link['videoTitle'])
                        item.set_next_action_name('SnapAndPlayVideo')
                        xbmcListItem = xbmcgui.ListItem(label='Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index),
                                                        iconImage=video_source_img, thumbnailImage=video_source_img)
                        item.set_xbmc_list_item_obj(xbmcListItem)
                        response_obj.addListItem(item)
                    except:
                        print 'Unable to recognize a source = ' + video_link['videoLink']
                        video_source_img = None
                        video_part_index = 0
                        video_playlist_items = []
                        #ignoreAllLinks = True
            else:
                videoUrl = str(child['src'])
                try:
                    video_part_index = video_part_index + 1
                    video_link = {}
                    video_link['videoTitle'] = 'Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index)
                    video_link['videoLink'] = videoUrl
                    print "myvidlink" + videoUrl
                    video_hosting_info = SnapVideo.findVideoHostingInfo(video_link['videoLink'])
                    video_link['videoSourceImg'] = video_hosting_info.get_video_hosting_image()
                    video_playlist_items.append(video_link)
                    video_source_img = video_link['videoSourceImg']
                    item = ListItem()
                    item.add_request_data('videoLink', video_link['videoLink'])
                    item.add_request_data('videoTitle', video_link['videoTitle'])
                    item.set_next_action_name('SnapAndPlayVideo')
                    xbmcListItem = xbmcgui.ListItem(label='Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index),
                                                    iconImage=video_source_img, thumbnailImage=video_source_img)
                    item.set_xbmc_list_item_obj(xbmcListItem)
                    response_obj.addListItem(item)
                except:
                    print 'Unable to recognize a source = ' + video_link['videoLink']
                    video_source_img = None
                    video_part_index = 0
                    video_playlist_items = []
                    #ignoreAllLinks = True
        else:
            print 'UNKNOWN child name'
            print child
    if len(video_playlist_items) > 0:
        response_obj.addListItem(__preparePlayListItem__(video_source_id, video_source_img, video_playlist_items))
def _retrieve_video_links_(req_attrib, modelMap):
    modelMap['channel-name'] = req_attrib['channel-name']
    modelMap['tv-show-name'] = req_attrib['tv-show-name']
    modelMap['episode-name'] = req_attrib['episode-name']
    video_source_id = 1
    video_source_img = None
    video_source_name = None
    video_part_index = 0
    video_playlist_items = []
    ignoreAllLinks = False
    list_items = []
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'left-div'})
    soup = HttpClient().get_beautiful_soup(url=req_attrib['episode-url'], parseOnlyThese=contentDiv)
    # soup = BeautifulSoup.BeautifulSoup(HttpClient().get_html_content(url=req_attrib['episode-url'])).findAll('blockquote', {'class':re.compile(r'\bpostcontent\b')})[0]
    centerTag = soup.findNext('center')
    logging.getLogger().debug(centerTag)
    prevChild = ''
    prevAFont = None
    isHD = 'false'
    videoSource = ''
    for child in soup.findChildren():
        if child.name == 'span':
            if len(video_playlist_items) > 0:
                list_items.append(__preparePlayListItem__(video_source_id, video_source_img, video_source_name,
                                                          video_playlist_items, modelMap, isHD))
            logging.getLogger().debug(videoSource)
            videoSource = child.getText()
            if (re.search('720p', videoSource, re.I)):
                isHD = 'true'
            else:
                isHD = 'false'
            if video_source_img is not None:
                video_source_id = video_source_id + 1
                video_source_img = None
                video_source_name = None
                video_part_index = 0
                video_playlist_items = []
            ignoreAllLinks = False
        elif not ignoreAllLinks and child.name == 'a':
            if (str(child['href']) != 'https://www.facebook.com/iamdesirulez'):
                video_part_index = video_part_index + 1
                video_link = {}
                video_link['videoTitle'] = 'Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index) + ' | ' + child.getText()
                video_link['videoLink'] = str(child['href'])
                video_link['videoSource'] = videoSource
                try:
                    try:
                        __prepareVideoLink__(video_link)
                    except Exception, e:
                        logging.getLogger().error(e)
                        video_hosting_info = SnapVideo().findVideoHostingInfo(video_link['videoLink'])
                        if video_hosting_info is None or video_hosting_info.get_name() == 'UrlResolver by t0mm0':
                            raise
                        video_link['videoSourceImg'] = video_hosting_info.get_icon()
                        video_link['videoSourceName'] = video_hosting_info.get_name()
                    video_playlist_items.append(video_link)
                    video_source_img = video_link['videoSourceImg']
                    video_source_name = video_link['videoSourceName']
                    item = xbmcgui.ListItem(label='Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index),
                                            iconImage=video_source_img, thumbnailImage=video_source_img)
                    item.setProperty('videoLink', video_link['videoLink'])
                    item.setProperty('videoTitle', video_link['videoTitle'])
                    item.setProperty('videoSourceName', video_source_name)
                    item.setProperty('isContinuousPlayItem', 'false')
                    list_items.append(item)
                    prevAFont = child.findChild('font')
                except:
                    logging.getLogger().error('Unable to recognize a source = ' + str(video_link['videoLink']))
                    video_source_img = None
                    video_source_name = None
                    video_part_index = 0
                    video_playlist_items = []
                    ignoreAllLinks = True
                    prevAFont = None
def retrieveVideoLinks(request_obj, response_obj):
    url = request_obj.get_data()['movieUrl']
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'video'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    if len(str(soup)) == 0:
        contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'content'})
        soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    decodedSoup = urllib.unquote(str(soup))
    videoFrameLinks = re.compile('http://www.pinoymovie.c(o|a)/ajaxtabs/(.+?).htm').findall(decodedSoup)
    if len(videoFrameLinks) > 0:
        video_source_id = 1
        for ignoreIt, videoFrameLink in videoFrameLinks:  #@UnusedVariable
            try:
                soup = HttpClient().getBeautifulSoup(url='http://www.pinoymovie.co/ajaxtabs/' + videoFrameLink + '.htm')
                video_url = str(soup.find('iframe')['src'])
                video_hosting_info = SnapVideo.findVideoHostingInfo(video_url)
                if video_hosting_info is None:
                    print 'UNKNOWN streaming link found: ' + video_url
                else:
                    video_source_img = video_hosting_info.get_video_hosting_image()
                    video_title = 'Source #' + str(video_source_id) + ' :: ' + video_hosting_info.get_video_hosting_name()
                    item = ListItem()
                    item.add_request_data('videoLink', video_url)
                    item.add_request_data('videoTitle', video_title)
                    item.set_next_action_name('SnapAndPlayVideo')
                    xbmcListItem = xbmcgui.ListItem(label=video_title, iconImage=video_source_img, thumbnailImage=video_source_img)
                    item.set_xbmc_list_item_obj(xbmcListItem)
                    response_obj.addListItem(item)
                    video_source_id = video_source_id + 1
            except:
                print 'UNKNOWN streaming link found'
    else:
        videoLinks = re.compile('flashvars=(.+?)file=(.+?)&').findall(decodedSoup)
        moreLinks = re.compile('<iframe(.+?)src="(.+?)"', flags=re.I).findall(decodedSoup)
        if len(moreLinks) > 0:
            videoLinks.extend(moreLinks)
        moreLinks = re.compile('<a(.+?)href="(.+?)"', flags=re.I).findall(decodedSoup)
        if len(moreLinks) > 0:
            videoLinks.extend(moreLinks)
        if len(videoLinks) > 0:
            video_source_id = 1
            video_source_img = None
            video_part_index = 0
            video_playlist_items = []
            for ignoreIt, videoLink in videoLinks:  #@UnusedVariable
                try:
                    if re.search('http://media.pinoymovie.ca/playlist/(.+?).xml', videoLink, re.I):
                        soupXml = HttpClient().getBeautifulSoup(url=videoLink)
                        for media in soupXml.findChildren('track'):
                            video_url = media.findChild('location').getText()
                            video_hosting_info = SnapVideo.findVideoHostingInfo(video_url)
                            if video_hosting_info is None:
                                print 'UNKNOWN streaming link found: ' + video_url
                            else:
                                video_part_index = video_part_index + 1
                                video_link = {}
                                video_link['videoTitle'] = 'Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index)
                                video_link['videoLink'] = video_url
                                video_link['videoSourceImg'] = video_hosting_info.get_video_hosting_image()
                                video_playlist_items.append(video_link)
                                video_source_img = video_link['videoSourceImg']
                                item = ListItem()
                                item.add_request_data('videoLink', video_link['videoLink'])
                                item.add_request_data('videoTitle', video_link['videoTitle'])
                                item.set_next_action_name('SnapAndPlayVideo')
                                xbmcListItem = xbmcgui.ListItem(label=video_link['videoTitle'], iconImage=video_source_img, thumbnailImage=video_source_img)
                                item.set_xbmc_list_item_obj(xbmcListItem)
                                response_obj.addListItem(item)
                        if len(video_playlist_items) > 0:
                            response_obj.addListItem(__preparePlayListItem__(video_source_id, video_source_img, video_playlist_items))
                        video_source_id = video_source_id + 1
                        video_source_img = None
                        video_part_index = 0
                        video_playlist_items = []
                    else:
                        print "insecond"
                        if re.search('http://media.pinoymovie.ca/playlist/(.+?).htm', videoLink, re.I):
                            html = HttpClient().getHtmlContent(url=videoLink).replace('\'', '"')
                            # group 2 of the pattern holds the src URL ([0][0] would return the attribute text before it)
                            videoLink = re.compile('<iframe(.+?)src="(.+?)"', flags=re.I).findall(html)[0][1]
                        video_hosting_info = SnapVideo.findVideoHostingInfo(videoLink)
                        if video_hosting_info is None:
                            print 'UNKNOWN streaming link found: ' + videoLink
                        else:
                            item = ListItem()
                            item.add_request_data('videoLink', videoLink)
                            print "source:" + videoLink
                            item.add_request_data('videoTitle', 'Source #' + str(video_source_id))
                            item.set_next_action_name('SnapAndPlayVideo')
                            xbmcListItem = xbmcgui.ListItem(label='Source #' + str(video_source_id),
                                                            iconImage=video_hosting_info.get_video_hosting_image(),
                                                            thumbnailImage=video_hosting_info.get_video_hosting_image())
                            item.set_xbmc_list_item_obj(xbmcListItem)
                            response_obj.addListItem(item)
                            video_source_id = video_source_id + 1
                except:
                    print 'UNKNOWN streaming link found'
                    video_source_img = None
                    video_part_index = 0
                    video_playlist_items = []
def scrape_episode(self, title, show_year, year, season, episode, imdb, tvdb, debrid=False):
    try:
        query = '%s %s' % (title, episode)
        query = self.search_link % (urllib.quote_plus(query))
        result = client.request(self.base_link + query)
        result = result.decode('iso-8859-1').encode('utf-8')
        result = result.replace('\n', '').replace('\t', '')
        items = client.parseDOM(result, 'content:encoded')[0]
        items = re.compile('class=\"single-heading\">(.+?)<span').findall(items)
        for i in range(0, len(items)):
            try:
                if '720p' in items[i]:
                    quality = 'HD'
                else:
                    quality = 'SD'
                urls = client.parseDOM(items[i], "a", ret="href")
                for j in range(0, len(urls)):
                    result = client.request(urls[j])
                    item = BeautifulSoup.BeautifulSoup(result, parseOnlyThese=BeautifulSoup.SoupStrainer("iframe"))
                    if len(item) == 0:
                        item = re.compile('data-config="(.+?)"').findall(result)[0]
                        item = [{"src": item}]
                    for links in item:
                        rUrl = links["src"]
                        if rUrl.startswith('//'):
                            rUrl = 'http:%s' % rUrl
                        urls[j] = rUrl
                host = client.host(urls[0])
                url = "##".join(urls)
                self.srcs.append({
                    'source': host,
                    'parts': str(len(urls)),
                    'quality': quality,
                    'scraper': self.name,
                    'url': url,
                    'direct': False
                })
                urls = []
            except:
                pass
        return self.srcs
    except:
        return self.srcs
import BeautifulSoup
from scraperwiki import scrape, sqlite
import datetime, re, json

html = scrape('http://tinyurl.com/tulevat-tanssit')
restrict = BeautifulSoup.SoupStrainer(["h2", "dl"])
conversion = BeautifulSoup.BeautifulStoneSoup.ALL_ENTITIES
page = BeautifulSoup.BeautifulStoneSoup(html, parseOnlyThese=restrict, convertEntities=conversion)
dates = page.findAll("h2")
for d in dates:
    wday, dinfo = d.a.contents[0].split()
    record = {'weekday': wday}
    mdate = re.match("(\d+)\.(\d+)\.(\d\d\d\d)", dinfo)
    date = datetime.date(int(mdate.group(3)), int(mdate.group(2)), int(mdate.group(1)))
    # handle date objects too, not just datetime (datetime.date is not an instance of datetime.datetime)
    dthandler = lambda obj: obj.isoformat() if isinstance(obj, (datetime.date, datetime.datetime)) else None
    date = json.dumps(date, default=dthandler)
    record['date'] = date
    data = d.nextSibling
    try:
        record['place'] = data.dt.a.text
    except Exception, e:
        print data
    record['artists'] = data.dd.text.replace('&', ' & ').replace(',', ', ')
    sqlite.save(unique_keys=['place', 'date'], data=record, date=date)
def __retrieveRecentMovies__(movieLinkTag):
    movieLink = movieLinkTag['href']
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'content'})
    soup = HttpClient().getBeautifulSoup(url=movieLink, parseOnlyThese=contentDiv)
    movieTag = soup.findChild('div', {'class': 'post'})
    return __retrieveAndCreateMovieItem__(movieTag)
def retrieve_tv_shows(link):
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'all-tv-shows'})
    soup = http.HttpClient().get_beautiful_soup(url=link, parseOnlyThese=contentDiv, accept_500_error=True)
    list = soup.find('ul')
    for item in list.findChildren('li'):
        tv_show = item.findChild('a')
        link = tv_show['href']
        name = tv_show.getText()
        print '>>>>>>>>' + name
        print '>>>>>>>>' + link