def _get_albums(self):
    """Scrape The Atlantic 'In Focus' front page into self._albums."""
    self._albums = []
    html = self._get_html('https://www.theatlantic.com/infocus/')
    # Each teaser's lead image URL lives in a CSS media-query rule keyed
    # by the container id (#river1, #river2, ...).
    css_tmpl = (r'@media\(min-width:\s*1632px\)\s*{\s*#river%d \.lead-image'
                r'\s*{\s*background-image:\s*url\((.+?)\)')
    for idx, item in enumerate(
            parseDOM(html, 'li', attrs={'class': 'article'})):
        headline = parseDOM(item, 'h1')[0]
        img_match = re.search(css_tmpl % (idx + 1), html)
        if not img_match:
            continue
        dek = parseDOM(item, 'p', attrs={'class': 'dek'})[0]
        self._albums.append({
            'title': parseDOM(headline, 'a')[0],
            'album_id': idx,
            'pic': img_match.group(1),
            'description': stripTags(self._parser.unescape(dek)),
            'album_url': 'https://www.theatlantic.com' +
            parseDOM(headline, 'a', ret='href')[0]
        })
    return self._albums
def _get_photos(self, album_url):
    """Scrape one 'In Focus' article's photos into self._photos[album_url]."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # Every photo is served through <source data-srcset="..."> variants.
    pattern = r'source data-srcset=\"(.+?)\"'
    match_image = re.findall(pattern, html)
    album_title = self._parser.unescape(parseDOM(html, 'title')[0])
    for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
        match_description = re.search('<span>(.+?)</span>', p)
        if match_description:
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                # NOTE(review): assumes exactly five <source> variants per
                # photo, taking the first of each group of five — breaks
                # if the page changes the variant count; confirm on live HTML.
                'pic': match_image[_id * 5],
                'description': stripTags(self._parser.unescape(
                    match_description.group(1))),
                'album_url': album_url
            })
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Collect the photos of one album into self._photos[album_url]."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = parseDOM(html, 'title')[0]
    captions = parseDOM(html, 'article', attrs={'class': 'pcaption'})
    photo_divs = parseDOM(html, 'div', attrs={'class': 'photo'})
    for idx, block in enumerate(photo_divs):
        src = urllib2.quote(parseDOM(block, 'img', ret='src')[0])
        caption = parseDOM(captions[idx], 'div',
                           attrs={'class': 'gcaption geor'})[0]
        entry = {
            'title': '%d - %s' % (idx + 1, album_title),
            'album_title': album_title,
            'photo_id': idx,
            # src is protocol-relative; prefix the scheme.
            'pic': 'http:' + src,
            'description': stripTags(self._parser.unescape(caption)),
            'album_url': album_url
        }
        self._photos[album_url].append(entry)
    return self._photos[album_url]
def _get_albums(self):
    """Scrape the Reading The Pictures 'Notes' category into self._albums."""
    self._albums = []
    home_url = 'https://www.readingthepictures.org'
    html = self._get_html(home_url + '/category/notes/')
    for idx, node in enumerate(
            parseDOM(html, 'div', attrs={'class': 'article'})):
        entry = {
            'title': self._parser.unescape(
                parseDOM(node, 'a', ret='title')[0]),
            'album_id': idx,
            'pic': parseDOM(node, 'img', ret='src')[0],
            'description': stripTags(
                self._parser.unescape(parseDOM(node, 'p')[0])),
            'album_url': parseDOM(node, 'a', ret='href')[0]
        }
        self._albums.append(entry)
    return self._albums
def _get_albums(self):
    """Scrape the TIME photography tag page into self._albums."""
    self._albums = []
    home_url = 'https://time.com'
    html = self._get_html(home_url + '/tag/photography/')
    touts = parseDOM(html, 'div', attrs={'class': 'taxonomy-tout'})
    for idx, tout in enumerate(touts):
        try:
            subtitle = parseDOM(tout, 'h3')[0]
        except Exception:
            subtitle = ''  # touts without a deck get an empty description
        self._albums.append({
            'title': self._parser.unescape(parseDOM(tout, 'h2')[0]),
            'album_id': idx,
            'pic': parseDOM(tout, 'img', ret='src')[0],
            'description': stripTags(self._parser.unescape(subtitle)),
            'album_url': home_url + parseDOM(tout, 'a', ret='href')[0]
        })
    return self._albums
def _get_albums(self):
    """Scrape the BBC News 'In Pictures' index into self._albums.

    Titles, pictures, descriptions and timestamps come from four parallel
    parseDOM scans paired positionally by index.
    """
    self._albums = []
    home_url = 'https://www.bbc.com'
    url = home_url + '/news/in_pictures'
    html = self._get_html(url)
    articles = parseDOM(html, 'div', attrs={'class': 'gs-o-media__body'})
    pictures = parseDOM(
        html, 'div',
        attrs={'class': 'gs-u-mb\+ gel-body-copy qa-post-body'})
    descriptions = parseDOM(html, 'div', attrs={'class': 'gel-5/8@l'})
    timestamp = parseDOM(html, 'span', attrs={'class': 'qa-post-auto-meta'})
    for _id, article in enumerate(articles):
        title = parseDOM(parseDOM(article, 'a')[0], 'span')[0]
        try:
            # srcset lists sizes ascending; keep the last (largest) URL.
            picture = parseDOM(pictures[_id], 'img', ret='srcset')[0]
            picture = re.search(r', (?P<bigger_url>https://[^ ]+) \d+w$',
                                picture).group('bigger_url')
            description = parseDOM(descriptions[_id], 'p')[0]
        except Exception:
            # Posts without a matching picture/description block are skipped.
            continue
        self._albums.append({
            'title': self._parser.unescape(title),
            'album_id': _id,
            'pic': picture,
            'description': stripTags(self._parser.unescape(description)) +
            "\n\nPosted @" + timestamp[_id],
            'album_url': home_url + parseDOM(article, 'a', ret='href')[0]
        })
    return self._albums
def _get_albums(self):
    """Scrape the Photojournalism Now blog front page into self._albums."""
    self._albums = []
    home_url = 'https://photojournalismnow43738385.wordpress.com'
    html = self._get_html(home_url + '/')
    for idx, post in enumerate(parseDOM(html, 'article')):
        heading = parseDOM(parseDOM(post, 'h1')[0], 'a')[0]
        thumb = parseDOM(post, 'img', ret='src')[0]
        # Strip the resize query string so the full-size image is used.
        thumb = re.search(r'^([^\?]+)', thumb).group(1)
        self._albums.append({
            'title': self._parser.unescape(heading),
            'album_id': idx,
            'pic': thumb,
            'description': stripTags(
                self._parser.unescape(parseDOM(post, 'p')[0])),
            'album_url': parseDOM(post, 'a', ret='href')[0]
        })
    return self._albums
def _get_photos(self, album_url):
    """Scrape one album; image URLs live in per-id CSS media-query rules."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # CSS rule template carrying each image URL; 'img01' is replaced with
    # the two-digit id extracted from the caption's '#imgNN' anchor.
    pattern = r'@media\(min-width:1592px\){#img01 \.img{background-image:url\((.+?)\)'
    id_pattern = re.compile(r'#img(\d\d)')
    album_title = parseDOM(html, 'title')[0]
    for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
        match = re.search(id_pattern, p)
        if match:
            img_id = match.group(1)
            match = re.search(pattern.replace('img01', 'img%s' % img_id),
                              html)
            if match:
                self._photos[album_url].append({
                    'title': '%d - %s' % (_id + 1, album_title),
                    'album_title': album_title,
                    'photo_id': _id,
                    'pic': match.group(1),
                    # The caption embeds the '#imgNN' anchor text; drop it.
                    'description': stripTags(
                        self._parser.unescape(p)).replace('\n #', ''),
                    'album_url': album_url,
                })
    return self._photos[album_url]
def _get_albums(self):
    """Scrape the Boston Globe Big Picture index into self._albums."""
    self._albums = []
    html = self._get_html('http://www.bostonglobe.com/news/bigpicture')
    for idx, section in enumerate(parseDOM(html, 'section')):
        title = parseDOM(section, 'a')[0]
        link = 'http://www.bostonglobe.com' + parseDOM(
            section, 'a', ret='href')[0]
        subhead = parseDOM(section, 'div',
                           attrs={'class': 'subhead geor'})[0]
        if not subhead:
            continue  # no description -> not a real album teaser
        src = urllib2.quote(parseDOM(section, 'img', ret='src')[0])
        if not src:
            continue  # no image -> skip
        self._albums.append({
            'title': title,
            'album_id': idx,
            'pic': 'http:' + src,
            'description': stripTags(self._parser.unescape(subhead)),
            'album_url': link
        })
    return self._albums
def _get_photos(self, album_url):
    """Collect an album's photos; may hand a leading video to Kodi.

    Fixes vs. the previous version: the unreachable statement after
    ``continue`` in the first-entry branch was removed, the bare
    ``except:`` was narrowed, the builtin-shadowing loop name ``id`` was
    renamed, and ``album_title`` is initialised so a first entry without
    an <img> can no longer leave it unbound.
    """
    self._photos[album_url] = []
    tree = self._get_tree(album_url)
    album_title = ''
    for idx, photo in enumerate(tree.findAll('div', {'class': 'image'})):
        img = photo.find('img')
        if not img:
            continue
        if idx == 0:
            # First entry repeats the album description: take its title
            # and skip it as a photo.
            album_title = photo.find('h2').string
            continue
        try:
            description = self._parser.unescape(
                photo.find('p', {'class': 'info-txt'}).string)
        except Exception:  # caption missing or not plain text
            description = ''
        self._photos[album_url].append({
            'title': '%d - %s' % (idx + 1, album_title),
            'album_title': album_title,
            'photo_id': idx,
            'pic': img['src'],
            'description': description,
            'album_url': album_url
        })
        if idx == 1:
            # The second slot may actually be a video: dispatch it to the
            # matching Kodi video plugin.
            video = tree.find('iframe')['src']
            xbmc.log('possible video = ' + video)
            if re.match(r'.+youtube.com/.+', video):
                video_id = re.sub('.+/', '', video)
                xbmc.log('youtube video = ' + video_id)
                xbmc.executebuiltin(
                    'PlayMedia(plugin://plugin.video.youtube/play/?video_id='
                    + video_id + ')')
            elif re.match(r'.+vimeo.com/.+', video):
                video_id = re.sub('.+/', '', video)
                xbmc.log('vimeo video = ' + video_id)
                xbmc.executebuiltin(
                    'PlayMedia(plugin://plugin.video.vimeo/play/?video_id='
                    + video_id + ')')
            # if no match: previous processing has retrieved images
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Scrape one article's lazy-loaded images into self._photos[album_url]."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # The article title comes from JSON-LD metadata embedded in the page.
    album_title = self._parser.unescape(
        re.findall(r'"headline":"(?P<title>[^"]+)"', html)[0])
    # Lead image plus in-body images (parseDOM class accepts a regex).
    images = parseDOM(
        html, 'div',
        attrs={
            'class': 'component lazy-image lead-media marquee_large_2x[^"]*'
        },
        ret='data-src')
    images += parseDOM(parseDOM(html, 'div',
                                attrs={'class': 'image-wrapper'}),
                       'div',
                       attrs={'class': 'component lazy-image[^"]*'},
                       ret='data-src')
    if len(images) == 0:
        # if there are no images that's because the article contains just
        # a video: so show its poster only
        images = [parseDOM(html, 'video', ret='poster')[0]]
        descriptions = ['']
    else:
        # data-alt carries each image's caption, in the same order.
        descriptions = parseDOM(
            html, 'div',
            attrs={
                'class':
                'component lazy-image lead-media marquee_large_2x[^"]*'
            },
            ret='data-alt')
        descriptions += parseDOM(
            parseDOM(html, 'div', attrs={'class': 'image-wrapper'}),
            'div',
            attrs={'class': 'component lazy-image[^"]*'},
            ret='data-alt')
    for _id, image in enumerate(images):
        self._photos[album_url].append({
            'title': '%d - %s' % (_id + 1, album_title),
            'album_title': album_title,
            'photo_id': _id,
            'pic': image,
            'description': stripTags(
                self._parser.unescape(descriptions[_id])),
            'album_url': album_url
        })
    return self._photos[album_url]
def get_topics():
    '''Returns a list of (topic_name, url) of available topics'''
    html = _get_html(BASE_URL)
    menu = parseDOM(html, 'div', attrs={'class': 'header-container[^\'"]*'})
    hrefs = parseDOM(menu, 'a', ret='href')
    labels = parseDOM(menu, 'a')
    # Keep only menu entries pointing into the video section.
    topics = [(stripTags(labels[i]), NYT_URL_BASE + hrefs[i][1:])
              for i, href in enumerate(hrefs)
              if href.startswith('/video/')]
    topics.insert(0, (LATEST_VIDEOS, _url('/video/latest-video/')))
    return topics
def _get_photos(self, album_url):
    """Scrape a BBC article's images into self._photos[album_url].

    Captions and pictures advance at different rates: ``id_picture``
    skips decorative spacer PNGs until a real photo is found for the
    current caption.
    """
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # Normalise React-style attribute casing so parseDOM can match it.
    html = html.replace('srcSet', 'srcset')
    album_title = self._parser.unescape(parseDOM(html, 'title')[0])
    pictures = parseDOM(html, 'img', attrs={'class': '.+Image[^"]+'},
                        ret='srcset')
    descriptions = parseDOM(html, 'figcaption')
    if (len(descriptions) == 0):
        descriptions = [''] * len(pictures)
    id_picture = 0
    for _id, description in enumerate(descriptions):
        try:
            description = stripTags(
                self._parser.unescape(description)).replace(
                    'image caption', '')
            condition = True
            while (condition):
                picture = pictures[id_picture]
                # srcset lists sizes ascending; keep the last (largest) URL.
                picture = re.search(
                    r', (?P<bigger_url>https://[^ ]+) \d+w$',
                    picture).group('bigger_url')
                id_picture += 1
                # Skip transparent/line spacer images.
                if (re.search(r'(transparent|line)[^\."]+\.png', picture)
                        == None):
                    condition = False
            # Caption-less banner images are decorative: skip them.
            if (description == ''
                    and re.search(r'banner[^\."]+\.png', picture) != None):
                continue
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                'pic': picture,
                'description': self._parser.unescape(description),
                'album_url': album_url
            })
        except Exception:
            # Ran out of pictures or the regex failed: skip this caption.
            continue
    return self._photos[album_url]
def _get_albums(self):
    """Scrape The Atlantic 'In Focus' front page into self._albums."""
    self._albums = []
    url = 'http://www.theatlantic.com/infocus/'
    html = self._get_html(url)
    # Each teaser's lead image URL sits in a CSS media-query rule keyed by
    # the container id (#river1, #river2, ...); 'river1' is substituted
    # with the current index below.
    pattern = r'@media\(min-width:1632px\){#river1 \.lead-image{background-image:url\((.+?)\)'
    for _id, li in enumerate(
            parseDOM(html, 'li', attrs={'class': 'article'})):
        headline = parseDOM(li, 'h1')[0]
        match = re.search(pattern.replace('river1', 'river%d' % (_id + 1)),
                          html)
        if match:
            self._albums.append({
                'title': parseDOM(headline, 'a')[0],
                'album_id': _id,
                'pic': match.group(1),
                'description': stripTags(self._parser.unescape(
                    parseDOM(li, 'p', attrs={'class': 'dek'})[0])),
                'album_url': 'http://www.theatlantic.com' +
                parseDOM(headline, 'a', ret='href')[0],
            })
    return self._albums
def _get_photos(self, album_url):
    """Collect the photos of one album into self._photos[album_url]."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = parseDOM(html, 'title')[0]
    captions = parseDOM(html, 'article', attrs={'class': 'pcaption'})
    for idx, block in enumerate(
            parseDOM(html, 'div', attrs={'class': 'photo'})):
        caption = parseDOM(captions[idx], 'div',
                           attrs={'class': 'gcaption geor'})[0]
        src = urllib2.quote(parseDOM(block, 'img', ret='src')[0])
        self._photos[album_url].append({
            'title': '%d - %s' % (idx + 1, album_title),
            'album_title': album_title,
            'photo_id': idx,
            # src is protocol-relative; prefix the scheme.
            'pic': 'http:' + src,
            'description': stripTags(caption),
            'album_url': album_url
        })
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Read the slideshow JSON embedded in the page into photo dicts."""
    self._photos[album_url] = []
    tree = self._get_tree(album_url)
    data = json.loads(tree.find('script', {'id': 'slideshow-json'}).string)
    summary = self._parser.unescape(data['summary'])
    headline = self._parser.unescape(data['headline'])
    for idx, slide in enumerate(data['imageslideshow']['slides']):
        self._photos[album_url].append({
            'title': summary,
            'album_title': headline,
            'photo_id': idx,
            'pic': slide['image_crops']['superJumbo']['url'],
            'description': stripTags(
                self._parser.unescape(slide['caption']['full'])),
            'album_url': album_url
        })
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Scrape one album; image URLs live in per-id CSS media-query rules."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    # CSS rule template carrying each image URL; 'img01' is replaced with
    # the two-digit id extracted from the caption's '#imgNN' anchor.
    pattern = r'@media\(min-width:1592px\){#img01 \.img{background-image:url\((.+?)\)'
    id_pattern = re.compile(r'#img(\d\d)')
    album_title = parseDOM(html, 'title')[0]
    for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
        match = re.search(id_pattern, p)
        if match:
            img_id = match.group(1)
            match = re.search(pattern.replace('img01', 'img%s' % img_id),
                              html)
            if match:
                self._photos[album_url].append({
                    'title': '%d - %s' % (_id + 1, album_title),
                    'album_title': album_title,
                    'photo_id': _id,
                    'pic': match.group(1),
                    # The caption embeds the '#imgNN' anchor text; drop it.
                    'description': stripTags(
                        self._parser.unescape(p)).replace('\n #', ''),
                    'album_url': album_url,
                })
    return self._photos[album_url]
def _get_photos(self, album_url):
    """Keep the highest-resolution srcset variant of every <figure>."""
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = self._parser.unescape(parseDOM(html, 'title')[0])
    for idx, figure in enumerate(parseDOM(html, 'figure')):
        try:
            best_url = parseDOM(figure, 'img', ret='srcset')[0]
            candidates = re.findall(
                r'(?P<url>https://[^ ]+) (?P<resolution>\d+)w', best_url)
            best_res = 0
            for url, res in candidates:
                # take the greater resolution
                if best_res < int(res):
                    best_url = url
                    best_res = int(res)
            try:
                caption = parseDOM(figure, 'figcaption')[0]
            except Exception:
                caption = ''
            self._photos[album_url].append({
                'title': '%d - %s' % (idx + 1, album_title),
                'album_title': album_title,
                'photo_id': idx,
                'pic': best_url,
                'description': stripTags(self._parser.unescape(caption)),
                'album_url': album_url
            })
        except Exception:
            continue  # figure without a usable <img srcset>
    return self._photos[album_url]
def _get_albums(self):
    """Scrape the Boston Globe Big Picture index into self._albums."""
    self._albums = []
    url = 'http://www.bostonglobe.com/news/bigpicture'
    html = self._get_html(url)
    for _id, album in enumerate(parseDOM(html, 'section')):
        title = parseDOM(album, 'a')[0]
        album_url = 'http://www.bostonglobe.com' + parseDOM(
            album, 'a', ret='href')[0]
        d = parseDOM(album, 'div', attrs={'class': 'subhead geor'})[0]
        if not d:
            # No subhead -> not a real album teaser: skip.
            continue
        description = stripTags(self._parser.unescape(d))
        # Percent-encode the (protocol-relative) image URL.
        pic = urllib2.quote(parseDOM(album, 'img', ret='src')[0])
        if not pic:
            continue
        self._albums.append({
            'title': title,
            'album_id': _id,
            'pic': 'http:' + pic,
            'description': description,
            'album_url': album_url})
    return self._albums
def _get_photos(self, album_url):
    """Scrape a Reading The Pictures article into self._photos[album_url].

    Two page layouts exist: the WordPress one ('wp-caption alignnone'
    divs) and one built from <section> blocks; ``alternative_distribution``
    records which was detected.

    Fixes vs. the previous version: ``_id`` is initialised before the
    loop, so the 'no more images' fallback no longer raises NameError
    when the page contains no recognizable image containers; the string
    literal broken across lines was repaired.
    """
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = self._parser.unescape(
        parseDOM(html, 'meta', attrs={'property': 'og:title'},
                 ret='content')[0])
    alternative_distribution = 0
    images = parseDOM(html, 'div', attrs={'class': 'wp-caption alignnone'})
    if len(images) > 0:
        # WordPress-style captioned images.
        alternative_distribution = 1
    else:
        images = parseDOM(html, 'section',
                          attrs={'class': 'single-intro wysiwyg'})
        images += parseDOM(html, 'section',
                           attrs={'class': 'single-large-photo'})
    _id = 0  # keeps the fallback below valid even when images is empty
    for _id, image in enumerate(images):
        try:
            if alternative_distribution == 0:
                try:
                    # description for the first image
                    description = parseDOM(image, 'div',
                                           attrs={'class': 'caption'})[0]
                except Exception:
                    # description for images after the first one
                    description = parseDOM(image, 'figcaption')[0]
            else:
                description = parseDOM(
                    image, 'p', attrs={'class': 'wp-caption-text'})[0]
            # Clean the description: drop tags/nbsp/tabs and reorder the
            # 'Author ... Caption: ...' text into 'caption\n\nauthor'.
            try:
                description = stripTags(description).replace(
                    ' ', '').replace(chr(9), '')
                description_items = re.search(
                    r'^(?P<author>.+)Caption: *(?P<caption>.+)',
                    description, re.DOTALL)
                description = re.sub(
                    r'\n+', '', description_items.group('caption')
                ) + "\n\n" + description_items.group('author')
            except Exception:
                description = ''
            picture = parseDOM(image, 'img', ret='src')[0]
            if alternative_distribution == 1:
                # The last srcset entry carries the largest variant.
                picture = re.search(
                    r'(?P<pic>https://[^ ]+) [^ ]+?$',
                    parseDOM(image, 'img', ret='srcset')[0]).group('pic')
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                'pic': picture,
                'description': self._parser.unescape(description),
                'album_url': album_url
            })
        except Exception:
            continue
    if len(self._photos[album_url]) == 0:
        # Nothing scraped: show a placeholder entry and tell the user.
        self._photos[album_url].append({
            'title': '%d - %s' % (_id + 1, album_title),
            'album_title': album_title,
            'photo_id': _id,
            'pic': 'https://www.readingthepictures.org/wp-content/uploads/2019/04/1_LAT_Mueller.jpg',
            'description': 'no more images here !',
            'album_url': album_url
        })
        if XBMC_MODE:
            dialog = xbmcgui.Dialog()
            dialog.notification('Reading the Pictures .org :',
                                'no more images here !',
                                xbmcgui.NOTIFICATION_INFO, int(2000))
    return self._photos[album_url]
def _get_albums(self):
    """Scrape the living-wild.net blog index into self._albums.

    Fixes vs. the previous version: ``self._albums`` is reset on every
    call (it used to accumulate across calls, unlike every sibling
    ``_get_albums``), the leftover debug ``print`` was removed, and the
    bare ``except:`` clauses were narrowed to ``Exception``.
    """
    self._albums = []
    url = 'http://living-wild.net/blog/'
    html = self._get_html(url)
    html = html.replace('\n', '').replace('\t', '').replace('\r', '')
    posts = re.compile(
        '<div class="tesseract-post tesseract-post-vertical">(.+?)</div></div></div>'
    ).findall(html)
    _id = 1
    for post in posts:
        date = re.compile(
            '<div class="author-and-date">(.+?)</div>').findall(post)[0]
        lnk_title = re.compile('<a href="(.+?)">(.+?)</a>').findall(post)[0]
        href = lnk_title[0]
        title = unicode(
            BeautifulStoneSoup(
                lnk_title[1],
                convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        desc = re.compile('<div class="content">(.+?)<img class=').findall(
            post)[0]
        desc = unicode(
            BeautifulStoneSoup(
                desc, convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        # Turn headings and list items into plain-text line breaks.
        desc = re.sub("<h.>", "\n", desc)
        desc = (desc.replace('<li>', '\n - ').replace('</li>', '. \n'))
        desc = desc.replace('..', '.').replace('\n\n', '\n')
        # Prefer the widest srcset entry; fall back to the plain src.
        best_img = ''
        best_img_size = 0
        try:
            clss, src, junk, size, srcset = re.compile(
                '<img class="(.+?)" src="(.+?)" alt(.+?)" (.+?) srcset="(.+?)"'
            ).findall(post)[0]
            for image in srcset.split('w, '):
                img = image.split(' ')
                img_size = int(str(img[1].replace('w', '')))
                if img_size > best_img_size:
                    best_img_size = img_size
                    best_img = img[0]
        except Exception:
            try:
                clss, src = re.compile(
                    '<img class="(.+?)" src="(.+?)"').findall(post)[0]
                best_img = src
            except Exception:
                pass
        self._albums.append({
            'title': '%s - (%s)' % (stripTags(title), date),
            'album_id': _id,
            'pic': best_img,
            'description': stripTags(desc).replace(u'Â', u''),
            'album_url': href
        })
        _id += 1
    return self._albums
def _get_photos(self, album_url):
    """Scrape one living-wild.net post into self._photos[album_url].

    Fixes vs. the previous version: ``album_title`` had already been
    reassigned from a findall list to an unescaped unicode string, so
    the old ``'album_title': album_title[0]`` stored only the FIRST
    CHARACTER of the title — it now stores the full string; bare
    ``except:`` narrowed; posts without any srcset no longer crash on
    ``images[-1]``.
    """
    descs = []
    self._photos[album_url] = []
    html = self._get_html(album_url)
    html = html.replace('\n', '').replace('\t', '').replace('\r', '')
    article = re.compile('<article(.+?)</article>').findall(html)
    album_title = re.compile(
        '<div id="blogpost_title"><h1 class="entry-title">(.+?)</h1>'
    ).findall(article[0])
    album_title = unicode(
        BeautifulStoneSoup(
            album_title[0],
            convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
    images = re.compile('srcset="(.+?)"').findall(article[0])
    len_images = len(images)
    # Pair every srcset with the caption text that follows it in the markup.
    for item in images:
        desc = re.compile(item + '(.+?)p>(.+?)<p><(.+?) class=').findall(
            article[0])
        if len(desc) > 0:
            descs.append(['', desc[0][1], ''])
        else:
            desc = re.compile(
                item + '(.+?)</figure>(.+?) class(.+?)').findall(article[0])
            if len(desc) > 0:
                descs.append(['', desc[0][1], ''])
    if len_images > 0:
        # The last image's caption sits just before the sharing widget.
        last_desc = re.compile(
            images[len_images - 1] +
            '" sizes="(.+?)" /></p>(.+?)</p>(.+?)<a class="synved').findall(
                article[0])
        if len(last_desc) > 0:
            descs.append(['', str(last_desc[0][2]), ''])
    for _id, srcset in enumerate(images):
        # Prefer the widest entry of the srcset.
        best_img = ''
        best_img_size = 0
        try:
            for image in srcset.split('w, '):
                img = image.split(' ')
                img_size = int(str(img[1].replace('w', '')))
                if img_size > best_img_size:
                    best_img_size = img_size
                    best_img = img[0]
            desc = unicode(
                BeautifulStoneSoup(
                    descs[_id][1],
                    convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
        except Exception:
            desc = ''
        self._photos[album_url].append({
            'title': '%d - %s' % (_id + 1, stripTags(album_title)),
            'album_title': album_title,
            'photo_id': _id,
            'pic': best_img,
            'description': stripTags(desc).replace(u'Â', u''),
            'album_url': album_url
        })
    return self._photos[album_url]
def _get_albums(self):
    """Scrape The Atlantic '/photo/' front page into self._albums.

    The page mixes one header container (<div id="home-hero">) with
    <li class="article"> teasers; each branch below handles both shapes.
    """
    self._albums = []
    home_url = 'https://www.theatlantic.com'
    url = home_url + '/photo/'
    html = self._get_html(url)
    # Teaser images are declared in an inline stylesheet, one rule per
    # '#riverN .lead-image' container, at several resolutions (height is
    # captured so the largest can be chosen).
    css = parseDOM(html, 'style', attrs={'type': 'text/css'})[0]
    pictures = re.findall(
        r'#river(?P<river>[0-9]+) \.lead-image.?\{.{1,10}background-image: url\("(?P<url>.+?/.+?x(?P<height>[0-9]+)[^"]+)"',
        css, re.DOTALL)
    containers = parseDOM(html, 'div',
                          attrs={'id': 'home-hero'})  # header container
    containers += parseDOM(html, 'li',
                           attrs={'class': 'article'})  # <li> containers
    for _id, li in enumerate(containers):
        # this is the header container (<div id="home-hero">)
        title = parseDOM(li, 'h1')[0]
        try:
            # this is one of the <li id="river#"> containers
            title = parseDOM(title, 'a')[0]
        except Exception:
            pass
        # add date to description:
        try:
            date = parseDOM(parseDOM(li, 'ul'), 'li',
                            attrs={'class': 'date'})[0]
        except Exception:
            date = ''
        try:
            # this is the header container (<div id="home-hero">)
            picture = parseDOM(li, 'img', ret='src')[0]
        except Exception:
            # this is one of the <li id="river#"> containers
            # NOTE(review): if no CSS rule matches this _id, 'picture'
            # keeps the previous iteration's value (or is unbound on the
            # first pass) — confirm against live markup.
            resolution = 0
            for picture_line in pictures:
                if picture_line[0] == str(_id):
                    # take the greater resolution
                    if resolution < int(picture_line[2]):
                        picture = picture_line[1]
                        resolution = int(picture_line[2])
        try:
            description = parseDOM(li, 'p', attrs={'class': 'dek'})[0]
        except Exception:
            # description (<p></p>) may not exists:
            description = title
        self._albums.append({
            'title': self._parser.unescape(title),
            'album_id': _id,
            'pic': picture,
            'description': date + "\n" +
            stripTags(self._parser.unescape(description)),
            'album_url': home_url + parseDOM(li, 'a', ret='href')[0]
        })
    return self._albums