def _get_albums(self):
    """Scrape The Atlantic 'In Focus' front page into self._albums."""
    self._albums = []
    html = self._get_html('https://www.theatlantic.com/infocus/')
    # Each teaser's lead image URL lives in a CSS media-query rule keyed
    # by the container id (#river1, #river2, ...).
    css_tmpl = (r'@media\(min-width:\s*1632px\)\s*{\s*#river%d \.lead-image'
                r'\s*{\s*background-image:\s*url\((.+?)\)')
    for idx, item in enumerate(
            parseDOM(html, 'li', attrs={'class': 'article'})):
        headline = parseDOM(item, 'h1')[0]
        img_match = re.search(css_tmpl % (idx + 1), html)
        if not img_match:
            continue
        dek = parseDOM(item, 'p', attrs={'class': 'dek'})[0]
        self._albums.append({
            'title': parseDOM(headline, 'a')[0],
            'album_id': idx,
            'pic': img_match.group(1),
            'description': stripTags(self._parser.unescape(dek)),
            'album_url': 'https://www.theatlantic.com' +
            parseDOM(headline, 'a', ret='href')[0]
        })
    return self._albums
def _get_photos(self, album_url):
    """Scrape one 'In Focus' article's photos into self._photos[album_url]."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # Every photo is served through <source data-srcset="..."> variants.
    pattern = r'source data-srcset=\"(.+?)\"'
    match_image = re.findall(pattern, html)
    album_title = self._parser.unescape(parseDOM(html, 'title')[0])
    for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
        match_description = re.search('<span>(.+?)</span>', p)
        if match_description:
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                # NOTE(review): assumes exactly five <source> variants per
                # photo, taking the first of each group of five — breaks
                # if the page changes the variant count; confirm on live HTML.
                'pic': match_image[_id * 5],
                'description': stripTags(self._parser.unescape(
                    match_description.group(1))),
                'album_url': album_url
            })
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Collect the photos of one album into self._photos[album_url]."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = parseDOM(html, 'title')[0]
    captions = parseDOM(html, 'article', attrs={'class': 'pcaption'})
    photo_divs = parseDOM(html, 'div', attrs={'class': 'photo'})
    for idx, block in enumerate(photo_divs):
        src = urllib2.quote(parseDOM(block, 'img', ret='src')[0])
        caption = parseDOM(captions[idx], 'div',
                           attrs={'class': 'gcaption geor'})[0]
        entry = {
            'title': '%d - %s' % (idx + 1, album_title),
            'album_title': album_title,
            'photo_id': idx,
            # src is protocol-relative; prefix the scheme.
            'pic': 'http:' + src,
            'description': stripTags(self._parser.unescape(caption)),
            'album_url': album_url
        }
        self._photos[album_url].append(entry)
    return self._photos[album_url]
def _get_albums(self):
    """Scrape the Reading The Pictures 'Notes' category into self._albums."""
    self._albums = []
    home_url = 'https://www.readingthepictures.org'
    html = self._get_html(home_url + '/category/notes/')
    for idx, node in enumerate(
            parseDOM(html, 'div', attrs={'class': 'article'})):
        entry = {
            'title': self._parser.unescape(
                parseDOM(node, 'a', ret='title')[0]),
            'album_id': idx,
            'pic': parseDOM(node, 'img', ret='src')[0],
            'description': stripTags(
                self._parser.unescape(parseDOM(node, 'p')[0])),
            'album_url': parseDOM(node, 'a', ret='href')[0]
        }
        self._albums.append(entry)
    return self._albums
def _get_albums(self):
    """Scrape the TIME photography tag page into self._albums."""
    self._albums = []
    home_url = 'https://time.com'
    html = self._get_html(home_url + '/tag/photography/')
    touts = parseDOM(html, 'div', attrs={'class': 'taxonomy-tout'})
    for idx, tout in enumerate(touts):
        try:
            subtitle = parseDOM(tout, 'h3')[0]
        except Exception:
            subtitle = ''  # touts without a deck get an empty description
        self._albums.append({
            'title': self._parser.unescape(parseDOM(tout, 'h2')[0]),
            'album_id': idx,
            'pic': parseDOM(tout, 'img', ret='src')[0],
            'description': stripTags(self._parser.unescape(subtitle)),
            'album_url': home_url + parseDOM(tout, 'a', ret='href')[0]
        })
    return self._albums
def _get_albums(self):
    """Scrape the BBC News 'In Pictures' index into self._albums.

    Titles, pictures, descriptions and timestamps come from four parallel
    parseDOM scans paired positionally by index.
    """
    self._albums = []
    home_url = 'https://www.bbc.com'
    url = home_url + '/news/in_pictures'
    html = self._get_html(url)
    articles = parseDOM(html, 'div', attrs={'class': 'gs-o-media__body'})
    pictures = parseDOM(
        html, 'div',
        attrs={'class': 'gs-u-mb\+ gel-body-copy qa-post-body'})
    descriptions = parseDOM(html, 'div', attrs={'class': 'gel-5/8@l'})
    timestamp = parseDOM(html, 'span', attrs={'class': 'qa-post-auto-meta'})
    for _id, article in enumerate(articles):
        title = parseDOM(parseDOM(article, 'a')[0], 'span')[0]
        try:
            # srcset lists sizes ascending; keep the last (largest) URL.
            picture = parseDOM(pictures[_id], 'img', ret='srcset')[0]
            picture = re.search(r', (?P<bigger_url>https://[^ ]+) \d+w$',
                                picture).group('bigger_url')
            description = parseDOM(descriptions[_id], 'p')[0]
        except Exception:
            # Posts without a matching picture/description block are skipped.
            continue
        self._albums.append({
            'title': self._parser.unescape(title),
            'album_id': _id,
            'pic': picture,
            'description': stripTags(self._parser.unescape(description)) +
            "\n\nPosted @" + timestamp[_id],
            'album_url': home_url + parseDOM(article, 'a', ret='href')[0]
        })
    return self._albums
def _get_albums(self):
    """Scrape the Photojournalism Now blog front page into self._albums."""
    self._albums = []
    home_url = 'https://photojournalismnow43738385.wordpress.com'
    html = self._get_html(home_url + '/')
    for idx, post in enumerate(parseDOM(html, 'article')):
        heading = parseDOM(parseDOM(post, 'h1')[0], 'a')[0]
        thumb = parseDOM(post, 'img', ret='src')[0]
        # Strip the resize query string so the full-size image is used.
        thumb = re.search(r'^([^\?]+)', thumb).group(1)
        self._albums.append({
            'title': self._parser.unescape(heading),
            'album_id': idx,
            'pic': thumb,
            'description': stripTags(
                self._parser.unescape(parseDOM(post, 'p')[0])),
            'album_url': parseDOM(post, 'a', ret='href')[0]
        })
    return self._albums
def _get_photos(self, album_url):
    """Scrape one album; image URLs live in per-id CSS media-query rules."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # CSS rule template carrying each image URL; 'img01' is replaced with
    # the two-digit id extracted from the caption's '#imgNN' anchor.
    pattern = r'@media\(min-width:1592px\){#img01 \.img{background-image:url\((.+?)\)'
    id_pattern = re.compile(r'#img(\d\d)')
    album_title = parseDOM(html, 'title')[0]
    for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
        match = re.search(id_pattern, p)
        if match:
            img_id = match.group(1)
            match = re.search(pattern.replace('img01', 'img%s' % img_id),
                              html)
            if match:
                self._photos[album_url].append({
                    'title': '%d - %s' % (_id + 1, album_title),
                    'album_title': album_title,
                    'photo_id': _id,
                    'pic': match.group(1),
                    # The caption embeds the '#imgNN' anchor text; drop it.
                    'description': stripTags(
                        self._parser.unescape(p)).replace('\n #', ''),
                    'album_url': album_url,
                })
    return self._photos[album_url]
def _get_albums(self):
    """Scrape the Boston Globe Big Picture index into self._albums."""
    self._albums = []
    html = self._get_html('http://www.bostonglobe.com/news/bigpicture')
    for idx, section in enumerate(parseDOM(html, 'section')):
        title = parseDOM(section, 'a')[0]
        link = 'http://www.bostonglobe.com' + parseDOM(
            section, 'a', ret='href')[0]
        subhead = parseDOM(section, 'div',
                           attrs={'class': 'subhead geor'})[0]
        if not subhead:
            continue  # no description -> not a real album teaser
        src = urllib2.quote(parseDOM(section, 'img', ret='src')[0])
        if not src:
            continue  # no image -> skip
        self._albums.append({
            'title': title,
            'album_id': idx,
            'pic': 'http:' + src,
            'description': stripTags(self._parser.unescape(subhead)),
            'album_url': link
        })
    return self._albums
def _get_photos(self, album_url):
    """Collect an album's photos; may hand a leading video to Kodi.

    Fixes vs. the previous version: the unreachable statement after
    ``continue`` in the first-entry branch was removed, the bare
    ``except:`` was narrowed, the builtin-shadowing loop name ``id`` was
    renamed, and ``album_title`` is initialised so a first entry without
    an <img> can no longer leave it unbound.
    """
    self._photos[album_url] = []
    tree = self._get_tree(album_url)
    album_title = ''
    for idx, photo in enumerate(tree.findAll('div', {'class': 'image'})):
        img = photo.find('img')
        if not img:
            continue
        if idx == 0:
            # First entry repeats the album description: take its title
            # and skip it as a photo.
            album_title = photo.find('h2').string
            continue
        try:
            description = self._parser.unescape(
                photo.find('p', {'class': 'info-txt'}).string)
        except Exception:  # caption missing or not plain text
            description = ''
        self._photos[album_url].append({
            'title': '%d - %s' % (idx + 1, album_title),
            'album_title': album_title,
            'photo_id': idx,
            'pic': img['src'],
            'description': description,
            'album_url': album_url
        })
        if idx == 1:
            # The second slot may actually be a video: dispatch it to the
            # matching Kodi video plugin.
            video = tree.find('iframe')['src']
            xbmc.log('possible video = ' + video)
            if re.match(r'.+youtube.com/.+', video):
                video_id = re.sub('.+/', '', video)
                xbmc.log('youtube video = ' + video_id)
                xbmc.executebuiltin(
                    'PlayMedia(plugin://plugin.video.youtube/play/?video_id='
                    + video_id + ')')
            elif re.match(r'.+vimeo.com/.+', video):
                video_id = re.sub('.+/', '', video)
                xbmc.log('vimeo video = ' + video_id)
                xbmc.executebuiltin(
                    'PlayMedia(plugin://plugin.video.vimeo/play/?video_id='
                    + video_id + ')')
            # if no match: previous processing has retrieved images
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Scrape one article's lazy-loaded images into self._photos[album_url]."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # The article title comes from JSON-LD metadata embedded in the page.
    album_title = self._parser.unescape(
        re.findall(r'"headline":"(?P<title>[^"]+)"', html)[0])
    # Lead image plus in-body images (parseDOM class accepts a regex).
    images = parseDOM(
        html, 'div',
        attrs={
            'class': 'component lazy-image lead-media marquee_large_2x[^"]*'
        },
        ret='data-src')
    images += parseDOM(parseDOM(html, 'div',
                                attrs={'class': 'image-wrapper'}),
                       'div',
                       attrs={'class': 'component lazy-image[^"]*'},
                       ret='data-src')
    if len(images) == 0:
        # if there are no images that's because the article contains just
        # a video: so show its poster only
        images = [parseDOM(html, 'video', ret='poster')[0]]
        descriptions = ['']
    else:
        # data-alt carries each image's caption, in the same order.
        descriptions = parseDOM(
            html, 'div',
            attrs={
                'class':
                'component lazy-image lead-media marquee_large_2x[^"]*'
            },
            ret='data-alt')
        descriptions += parseDOM(
            parseDOM(html, 'div', attrs={'class': 'image-wrapper'}),
            'div',
            attrs={'class': 'component lazy-image[^"]*'},
            ret='data-alt')
    for _id, image in enumerate(images):
        self._photos[album_url].append({
            'title': '%d - %s' % (_id + 1, album_title),
            'album_title': album_title,
            'photo_id': _id,
            'pic': image,
            'description': stripTags(
                self._parser.unescape(descriptions[_id])),
            'album_url': album_url
        })
    return self._photos[album_url]
def get_topics():
    '''Returns a list of (topic_name, url) of available topics'''
    html = _get_html(BASE_URL)
    menu = parseDOM(html, 'div', attrs={'class': 'header-container[^\'"]*'})
    hrefs = parseDOM(menu, 'a', ret='href')
    labels = parseDOM(menu, 'a')
    # Keep only menu entries pointing into the video section.
    topics = [(stripTags(labels[i]), NYT_URL_BASE + hrefs[i][1:])
              for i, href in enumerate(hrefs)
              if href.startswith('/video/')]
    topics.insert(0, (LATEST_VIDEOS, _url('/video/latest-video/')))
    return topics
def _get_photos(self, album_url):
    """Scrape a BBC article's images into self._photos[album_url].

    Captions and pictures advance at different rates: ``id_picture``
    skips decorative spacer PNGs until a real photo is found for the
    current caption.
    """
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # Normalise React-style attribute casing so parseDOM can match it.
    html = html.replace('srcSet', 'srcset')
    album_title = self._parser.unescape(parseDOM(html, 'title')[0])
    pictures = parseDOM(html, 'img', attrs={'class': '.+Image[^"]+'},
                        ret='srcset')
    descriptions = parseDOM(html, 'figcaption')
    if (len(descriptions) == 0):
        descriptions = [''] * len(pictures)
    id_picture = 0
    for _id, description in enumerate(descriptions):
        try:
            description = stripTags(
                self._parser.unescape(description)).replace(
                    'image caption', '')
            condition = True
            while (condition):
                picture = pictures[id_picture]
                # srcset lists sizes ascending; keep the last (largest) URL.
                picture = re.search(
                    r', (?P<bigger_url>https://[^ ]+) \d+w$',
                    picture).group('bigger_url')
                id_picture += 1
                # Skip transparent/line spacer images.
                if (re.search(r'(transparent|line)[^\."]+\.png', picture)
                        == None):
                    condition = False
            # Caption-less banner images are decorative: skip them.
            if (description == ''
                    and re.search(r'banner[^\."]+\.png', picture) != None):
                continue
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                'pic': picture,
                'description': self._parser.unescape(description),
                'album_url': album_url
            })
        except Exception:
            # Ran out of pictures or the regex failed: skip this caption.
            continue
    return self._photos[album_url]
def _get_albums(self):
    """Scrape The Atlantic 'In Focus' front page into self._albums."""
    self._albums = []
    url = 'http://www.theatlantic.com/infocus/'
    html = self._get_html(url)
    # Each teaser's lead image URL sits in a CSS media-query rule keyed by
    # the container id (#river1, #river2, ...); 'river1' is substituted
    # with the current index below.
    pattern = r'@media\(min-width:1632px\){#river1 \.lead-image{background-image:url\((.+?)\)'
    for _id, li in enumerate(
            parseDOM(html, 'li', attrs={'class': 'article'})):
        headline = parseDOM(li, 'h1')[0]
        match = re.search(pattern.replace('river1', 'river%d' % (_id + 1)),
                          html)
        if match:
            self._albums.append({
                'title': parseDOM(headline, 'a')[0],
                'album_id': _id,
                'pic': match.group(1),
                'description': stripTags(self._parser.unescape(
                    parseDOM(li, 'p', attrs={'class': 'dek'})[0])),
                'album_url': 'http://www.theatlantic.com' +
                parseDOM(headline, 'a', ret='href')[0],
            })
    return self._albums
def _get_photos(self, album_url):
    """Collect the photos of one album into self._photos[album_url]."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = parseDOM(html, 'title')[0]
    captions = parseDOM(html, 'article', attrs={'class': 'pcaption'})
    for idx, block in enumerate(
            parseDOM(html, 'div', attrs={'class': 'photo'})):
        caption = parseDOM(captions[idx], 'div',
                           attrs={'class': 'gcaption geor'})[0]
        src = urllib2.quote(parseDOM(block, 'img', ret='src')[0])
        self._photos[album_url].append({
            'title': '%d - %s' % (idx + 1, album_title),
            'album_title': album_title,
            'photo_id': idx,
            # src is protocol-relative; prefix the scheme.
            'pic': 'http:' + src,
            'description': stripTags(caption),
            'album_url': album_url
        })
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Read the slideshow JSON embedded in the page into photo dicts."""
    self._photos[album_url] = []
    tree = self._get_tree(album_url)
    data = json.loads(tree.find('script', {'id': 'slideshow-json'}).string)
    summary = self._parser.unescape(data['summary'])
    headline = self._parser.unescape(data['headline'])
    for idx, slide in enumerate(data['imageslideshow']['slides']):
        self._photos[album_url].append({
            'title': summary,
            'album_title': headline,
            'photo_id': idx,
            'pic': slide['image_crops']['superJumbo']['url'],
            'description': stripTags(
                self._parser.unescape(slide['caption']['full'])),
            'album_url': album_url
        })
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Scrape one album; image URLs live in per-id CSS media-query rules."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # CSS rule template carrying each image URL; 'img01' is replaced with
    # the two-digit id extracted from the caption's '#imgNN' anchor.
    pattern = r'@media\(min-width:1592px\){#img01 \.img{background-image:url\((.+?)\)'
    id_pattern = re.compile(r'#img(\d\d)')
    album_title = parseDOM(html, 'title')[0]
    for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
        match = re.search(id_pattern, p)
        if match:
            img_id = match.group(1)
            match = re.search(pattern.replace('img01', 'img%s' % img_id),
                              html)
            if match:
                self._photos[album_url].append({
                    'title': '%d - %s' % (_id + 1, album_title),
                    'album_title': album_title,
                    'photo_id': _id,
                    'pic': match.group(1),
                    # The caption embeds the '#imgNN' anchor text; drop it.
                    'description': stripTags(
                        self._parser.unescape(p)).replace('\n #', ''),
                    'album_url': album_url,
                })
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Keep the highest-resolution srcset variant of every <figure>."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = self._parser.unescape(parseDOM(html, 'title')[0])
    for idx, figure in enumerate(parseDOM(html, 'figure')):
        try:
            best_url = parseDOM(figure, 'img', ret='srcset')[0]
            candidates = re.findall(
                r'(?P<url>https://[^ ]+) (?P<resolution>\d+)w', best_url)
            best_res = 0
            for url, res in candidates:
                # take the greater resolution
                if best_res < int(res):
                    best_url = url
                    best_res = int(res)
            try:
                caption = parseDOM(figure, 'figcaption')[0]
            except Exception:
                caption = ''
            self._photos[album_url].append({
                'title': '%d - %s' % (idx + 1, album_title),
                'album_title': album_title,
                'photo_id': idx,
                'pic': best_url,
                'description': stripTags(self._parser.unescape(caption)),
                'album_url': album_url
            })
        except Exception:
            continue  # figure without a usable <img srcset>
    return self._photos[album_url]
def _get_albums(self):
    """Scrape the Boston Globe Big Picture index into self._albums."""
    self._albums = []
    url = 'http://www.bostonglobe.com/news/bigpicture'
    html = self._get_html(url)
    for _id, album in enumerate(parseDOM(html, 'section')):
        title = parseDOM(album, 'a')[0]
        album_url = 'http://www.bostonglobe.com' + parseDOM(
            album, 'a', ret='href')[0]
        d = parseDOM(album, 'div', attrs={'class': 'subhead geor'})[0]
        if not d:
            # No subhead -> not a real album teaser: skip.
            continue
        description = stripTags(self._parser.unescape(d))
        # Percent-encode the (protocol-relative) image URL.
        pic = urllib2.quote(parseDOM(album, 'img', ret='src')[0])
        if not pic:
            continue
        self._albums.append({
            'title': title,
            'album_id': _id,
            'pic': 'http:' + pic,
            'description': description,
            'album_url': album_url})
    return self._albums
def _get_photos(self, album_url):
    """Scrape a Reading The Pictures article into self._photos[album_url].

    Two page layouts exist: the WordPress one ('wp-caption alignnone'
    divs) and one built from <section> blocks; ``alternative_distribution``
    records which was detected.

    Fixes vs. the previous version: ``_id`` is initialised before the
    loop, so the 'no more images' fallback no longer raises NameError
    when the page contains no recognizable image containers; the string
    literal broken across lines was repaired.
    """
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = self._parser.unescape(
        parseDOM(html, 'meta', attrs={'property': 'og:title'},
                 ret='content')[0])
    alternative_distribution = 0
    images = parseDOM(html, 'div', attrs={'class': 'wp-caption alignnone'})
    if len(images) > 0:
        # WordPress-style captioned images.
        alternative_distribution = 1
    else:
        images = parseDOM(html, 'section',
                          attrs={'class': 'single-intro wysiwyg'})
        images += parseDOM(html, 'section',
                           attrs={'class': 'single-large-photo'})
    _id = 0  # keeps the fallback below valid even when images is empty
    for _id, image in enumerate(images):
        try:
            if alternative_distribution == 0:
                try:
                    # description for the first image
                    description = parseDOM(image, 'div',
                                           attrs={'class': 'caption'})[0]
                except Exception:
                    # description for images after the first one
                    description = parseDOM(image, 'figcaption')[0]
            else:
                description = parseDOM(
                    image, 'p', attrs={'class': 'wp-caption-text'})[0]
            # Clean the description: drop tags/nbsp/tabs and reorder the
            # 'Author ... Caption: ...' text into 'caption\n\nauthor'.
            try:
                description = stripTags(description).replace(
                    ' ', '').replace(chr(9), '')
                description_items = re.search(
                    r'^(?P<author>.+)Caption: *(?P<caption>.+)',
                    description, re.DOTALL)
                description = re.sub(
                    r'\n+', '', description_items.group('caption')
                ) + "\n\n" + description_items.group('author')
            except Exception:
                description = ''
            picture = parseDOM(image, 'img', ret='src')[0]
            if alternative_distribution == 1:
                # The last srcset entry carries the largest variant.
                picture = re.search(
                    r'(?P<pic>https://[^ ]+) [^ ]+?$',
                    parseDOM(image, 'img', ret='srcset')[0]).group('pic')
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                'pic': picture,
                'description': self._parser.unescape(description),
                'album_url': album_url
            })
        except Exception:
            continue
    if len(self._photos[album_url]) == 0:
        # Nothing scraped: show a placeholder entry and tell the user.
        self._photos[album_url].append({
            'title': '%d - %s' % (_id + 1, album_title),
            'album_title': album_title,
            'photo_id': _id,
            'pic': 'https://www.readingthepictures.org/wp-content/uploads/2019/04/1_LAT_Mueller.jpg',
            'description': 'no more images here !',
            'album_url': album_url
        })
        if XBMC_MODE:
            dialog = xbmcgui.Dialog()
            dialog.notification('Reading the Pictures .org :',
                                'no more images here !',
                                xbmcgui.NOTIFICATION_INFO, int(2000))
    return self._photos[album_url]
def _get_albums(self):
    """Scrape the living-wild.net blog index into self._albums.

    Fixes vs. the previous version: ``self._albums`` is reset on every
    call (it used to accumulate across calls, unlike every sibling
    ``_get_albums``), the leftover debug ``print`` was removed, and the
    bare ``except:`` clauses were narrowed to ``Exception``.
    """
    self._albums = []
    url = 'http://living-wild.net/blog/'
    html = self._get_html(url)
    html = html.replace('\n', '').replace('\t', '').replace('\r', '')
    posts = re.compile(
        '<div class="tesseract-post tesseract-post-vertical">(.+?)</div></div></div>'
    ).findall(html)
    _id = 1
    for post in posts:
        date = re.compile(
            '<div class="author-and-date">(.+?)</div>').findall(post)[0]
        lnk_title = re.compile('<a href="(.+?)">(.+?)</a>').findall(post)[0]
        href = lnk_title[0]
        title = unicode(
            BeautifulStoneSoup(
                lnk_title[1],
                convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        desc = re.compile('<div class="content">(.+?)<img class=').findall(
            post)[0]
        desc = unicode(
            BeautifulStoneSoup(
                desc, convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        # Turn headings and list items into plain-text line breaks.
        desc = re.sub("<h.>", "\n", desc)
        desc = (desc.replace('<li>', '\n - ').replace('</li>', '. \n'))
        desc = desc.replace('..', '.').replace('\n\n', '\n')
        # Prefer the widest srcset entry; fall back to the plain src.
        best_img = ''
        best_img_size = 0
        try:
            clss, src, junk, size, srcset = re.compile(
                '<img class="(.+?)" src="(.+?)" alt(.+?)" (.+?) srcset="(.+?)"'
            ).findall(post)[0]
            for image in srcset.split('w, '):
                img = image.split(' ')
                img_size = int(str(img[1].replace('w', '')))
                if img_size > best_img_size:
                    best_img_size = img_size
                    best_img = img[0]
        except Exception:
            try:
                clss, src = re.compile(
                    '<img class="(.+?)" src="(.+?)"').findall(post)[0]
                best_img = src
            except Exception:
                pass
        self._albums.append({
            'title': '%s - (%s)' % (stripTags(title), date),
            'album_id': _id,
            'pic': best_img,
            'description': stripTags(desc).replace(u'Â', u''),
            'album_url': href
        })
        _id += 1
    return self._albums
def _get_photos(self, album_url):
    """Scrape one living-wild.net post into self._photos[album_url].

    Fixes vs. the previous version: ``album_title`` had already been
    reassigned from a findall list to an unescaped unicode string, so
    the old ``'album_title': album_title[0]`` stored only the FIRST
    CHARACTER of the title — it now stores the full string; bare
    ``except:`` narrowed; posts without any srcset no longer crash on
    ``images[-1]``.
    """
    descs = []
    self._photos[album_url] = []
    html = self._get_html(album_url)
    html = html.replace('\n', '').replace('\t', '').replace('\r', '')
    article = re.compile('<article(.+?)</article>').findall(html)
    album_title = re.compile(
        '<div id="blogpost_title"><h1 class="entry-title">(.+?)</h1>'
    ).findall(article[0])
    album_title = unicode(
        BeautifulStoneSoup(
            album_title[0],
            convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
    images = re.compile('srcset="(.+?)"').findall(article[0])
    len_images = len(images)
    # Pair every srcset with the caption text that follows it in the markup.
    for item in images:
        desc = re.compile(item + '(.+?)p>(.+?)<p><(.+?) class=').findall(
            article[0])
        if len(desc) > 0:
            descs.append(['', desc[0][1], ''])
        else:
            desc = re.compile(
                item + '(.+?)</figure>(.+?) class(.+?)').findall(article[0])
            if len(desc) > 0:
                descs.append(['', desc[0][1], ''])
    if len_images > 0:
        # The last image's caption sits just before the sharing widget.
        last_desc = re.compile(
            images[len_images - 1] +
            '" sizes="(.+?)" /></p>(.+?)</p>(.+?)<a class="synved').findall(
                article[0])
        if len(last_desc) > 0:
            descs.append(['', str(last_desc[0][2]), ''])
    for _id, srcset in enumerate(images):
        # Prefer the widest entry of the srcset.
        best_img = ''
        best_img_size = 0
        try:
            for image in srcset.split('w, '):
                img = image.split(' ')
                img_size = int(str(img[1].replace('w', '')))
                if img_size > best_img_size:
                    best_img_size = img_size
                    best_img = img[0]
            desc = unicode(
                BeautifulStoneSoup(
                    descs[_id][1],
                    convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        except Exception:
            desc = ''
        self._photos[album_url].append({
            'title': '%d - %s' % (_id + 1, stripTags(album_title)),
            'album_title': album_title,
            'photo_id': _id,
            'pic': best_img,
            'description': stripTags(desc).replace(u'Â', u''),
            'album_url': album_url
        })
    return self._photos[album_url]
def _get_albums(self):
    """Scrape The Atlantic '/photo/' front page into self._albums.

    The page mixes one header container (<div id="home-hero">) with
    <li class="article"> teasers; each branch below handles both shapes.
    """
    self._albums = []
    home_url = 'https://www.theatlantic.com'
    url = home_url + '/photo/'
    html = self._get_html(url)
    # Teaser images are declared in an inline stylesheet, one rule per
    # '#riverN .lead-image' container, at several resolutions (height is
    # captured so the largest can be chosen).
    css = parseDOM(html, 'style', attrs={'type': 'text/css'})[0]
    pictures = re.findall(
        r'#river(?P<river>[0-9]+) \.lead-image.?\{.{1,10}background-image: url\("(?P<url>.+?/.+?x(?P<height>[0-9]+)[^"]+)"',
        css, re.DOTALL)
    containers = parseDOM(html, 'div',
                          attrs={'id': 'home-hero'})  # header container
    containers += parseDOM(html, 'li',
                           attrs={'class': 'article'})  # <li> containers
    for _id, li in enumerate(containers):
        # this is the header container (<div id="home-hero">)
        title = parseDOM(li, 'h1')[0]
        try:
            # this is one of the <li id="river#"> containers
            title = parseDOM(title, 'a')[0]
        except Exception:
            pass
        # add date to description:
        try:
            date = parseDOM(parseDOM(li, 'ul'), 'li',
                            attrs={'class': 'date'})[0]
        except Exception:
            date = ''
        try:
            # this is the header container (<div id="home-hero">)
            picture = parseDOM(li, 'img', ret='src')[0]
        except Exception:
            # this is one of the <li id="river#"> containers
            # NOTE(review): if no CSS rule matches this _id, 'picture'
            # keeps the previous iteration's value (or is unbound on the
            # first pass) — confirm against live markup.
            resolution = 0
            for picture_line in pictures:
                if picture_line[0] == str(_id):
                    # take the greater resolution
                    if resolution < int(picture_line[2]):
                        picture = picture_line[1]
                        resolution = int(picture_line[2])
        try:
            description = parseDOM(li, 'p', attrs={'class': 'dek'})[0]
        except Exception:
            # description (<p></p>) may not exists:
            description = title
        self._albums.append({
            'title': self._parser.unescape(title),
            'album_id': _id,
            'pic': picture,
            'description': date + "\n" +
            stripTags(self._parser.unescape(description)),
            'album_url': home_url + parseDOM(li, 'a', ret='href')[0]
        })
    return self._albums