Ejemplo n.º 1
0
    def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
                   base_url: URL):
        """Fill ``result`` from a parsed page and return it.

        The page is handled as a video page when the ``div#content`` element
        contains at least one inline ``<script>`` whose text includes
        ``jwplayer(``.  In that case the media URL, two uploader controls and
        the tag controls are added and the method returns early.  Otherwise
        the page is handled as a thumbnail listing: thumbs, tag controls and
        numbered pagination links are added.

        :param soup: parsed document to scan.
        :param result: accumulator receiving the video, thumbs, controls and
            pages found.
        :param base_url: forwarded to ``get_url`` with every href/src found.
        :return: the ``result`` object that was passed in.
        """
        # parse video page
        content = soup.find('div', {'id': 'content'})
        if content is not None:
            urls = UrlList()
            is_video = False
            # The text filter is called with ``tag.string``, which is None for
            # scripts without a single string child (e.g. external scripts),
            # so the membership test must be guarded.
            for script in _iter(
                    content.find_all(
                        'script',
                        text=lambda x: x is not None and 'jwplayer(' in x)):
                # Spaces are stripped so the '"file":"' marker matches
                # regardless of how the player config is formatted.
                data = str(script.string).replace(' ', '')
                file = quotes(data, '"file":"', '"')
                urls.add('DEFAULT', get_url(file, base_url))
                is_video = True

            if is_video:
                result.set_video(urls.get_media_data())

                # adding "user" to video
                user = soup.find('div', {'class': 'thumb-member-username'})
                # The username block may lack an anchor; skip it then.
                user_link = None if user is None else user.find('a')
                if user_link is not None:
                    # The username is the last path segment of the link.
                    username = user_link.attrs['href'].rpartition('/')[2]

                    result.add_control(
                        ControlInfo(
                            '"' + username + ' uploads"',
                            URL('http://motherless.com/u/' + username + '*')))
                    result.add_control(
                        ControlInfo(
                            '"' + username + ' gals"',
                            URL('http://motherless.com/galleries/member/' +
                                username + '*')))

                # adding tags to video
                for item in _iter(
                        soup.find_all('div', {'id': 'media-tags-container'})):
                    for href in _iter(item.find_all('a')):
                        if href.string is not None:
                            result.add_control(
                                ControlInfo(
                                    str(href.string),
                                    get_url(href.attrs['href'], base_url)))

                return result

        # parse thumbnail page
        for item in _iter(soup.find_all('div', {'class': ['content-inner']})):
            for thumbnail in _iter(item.find_all('div', {'class': 'thumb'})):
                href = get_url(thumbnail.a.attrs['href'], base_url)
                thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

                duration = thumbnail.find('div', {'class': 'caption left'})
                dur_time = '' if duration is None else str(duration.string)

                caption = thumbnail.find('h2', {'class': 'caption title'})
                label = '' if caption is None else str(caption.string)

                user = thumbnail.find('a', {'class': 'caption left'})
                username = '' if user is None else str(user.string)

                # Thumbs whose "duration" caption contains an 'x' are skipped.
                # NOTE(review): presumably such captions are image
                # dimensions (WxH) rather than a time - confirm.
                if 'x' not in dur_time:
                    result.add_thumb(
                        ThumbInfo(thumb_url=thumb_url,
                                  href=href,
                                  popup=label,
                                  labels=[{
                                      'text': dur_time,
                                      'align': 'top right'
                                  }, {
                                      'text': label,
                                      'align': 'bottom center'
                                  }, {
                                      'text': username,
                                      'align': 'top left'
                                  }]))

        # adding tags to thumbs
        tags = soup.find('div', {'class': 'dark-menu'})
        if tags is not None:
            for tag in _iter(tags.find_all('a')):
                result.add_control(
                    ControlInfo(
                        str(tag.string).strip(),
                        get_url(tag.attrs['href'], base_url)))

        # adding pages to thumbs
        pagination = soup.find('div', {'class': 'pagination_link'})
        if pagination is not None:
            for page in _iter(pagination.find_all('a')):
                # ``.string`` is None for anchors without a single string
                # child; only purely numeric links are page numbers.
                if page.string is not None and page.string.isdigit():
                    result.add_page(
                        ControlInfo(page.string,
                                    get_url(page.attrs['href'], base_url)))

        return result
Ejemplo n.º 2
0
    def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
                   base_url: URL):
        """Fill ``result`` from a parsed page and return it.

        Three page layouts are tried in order, and the first one that matches
        ends the method:

        1. video page - a ``div.video`` element exists; its ``<source>``
           children become media URLs and the links inside
           ``div.video_header`` become controls;
        2. thumbnail page - a ``div`` with class ``videos cf`` exists; its
           ``polaroid`` entries become thumbs, and tag and pagination links
           are added;
        3. categories page - every ``div.catbox`` becomes a thumb, with
           entries deduplicated by title.

        :param soup: parsed document to scan.
        :param result: accumulator receiving the video, thumbs, controls and
            pages found.
        :param base_url: forwarded to ``get_url`` with every href/src found.
        :return: the ``result`` object that was passed in.
        """
        # parse video page
        video = soup.find('div', {'class': 'video'})
        if video is not None:
            urls = UrlList()
            for source in _iter(video.find_all('source')):
                # Each source is registered under its 'res' attribute value.
                urls.add(source.attrs['res'],
                         get_url(source.attrs['src'], base_url))
            result.set_video(urls.get_media_data(-1))

            for tag_container in _iter(
                    soup.find_all('div', {'class': 'video_header'})):
                for href in _iter(tag_container.find_all('a')):
                    if href.string is not None:
                        result.add_control(
                            ControlInfo(str(href.string),
                                        get_url(href.attrs['href'], base_url)))
            return result

        # parse thumbnail page
        thumbs_container = soup.find('div', {'class': 'videos cf'})
        if thumbs_container is not None:
            for thumbnail in _iter(
                    thumbs_container.find_all('div', {'class': ['polaroid']})):
                href = get_url(thumbnail.a.attrs['href'], base_url)
                description = thumbnail.a.img.attrs['alt']
                # The thumbnail URL is read from 'data-src', not 'src'.
                thumb_url = get_url(thumbnail.img.attrs['data-src'], base_url)

                duration = thumbnail.find('div', {'class': "duration"})
                dur_time = '' if duration is None else str(duration.string)

                result.add_thumb(
                    ThumbInfo(thumb_url=thumb_url,
                              href=href,
                              popup=description,
                              labels=[{
                                  'text': dur_time,
                                  'align': 'top right'
                              }, {
                                  'text': description,
                                  'align': 'bottom center'
                              }]))

            tags = soup.find('ul', {'class': 'tags cf'})
            if tags is not None:
                for tag in tags.find_all('a'):
                    result.add_control(
                        ControlInfo(
                            str(tag.string).strip(),
                            get_url(tag.attrs['href'], base_url)))

            pagination = soup.find('div', {'class': 'pagination'})
            if pagination is not None:
                for page in pagination.find_all('a'):
                    # ``.string`` is None for anchors without a single string
                    # child; only purely numeric links are page numbers.
                    if page.string is not None and page.string.isdigit():
                        result.add_page(
                            ControlInfo(page.string,
                                        get_url(page.attrs['href'], base_url)))
            return result

        # parse categories page
        categories = set()
        for category in _iter(soup.find_all('div', {'class': 'catbox'})):
            href = get_url(category.a.attrs['href'], base_url)
            thumb_url = get_url(category.img.attrs['data-src'], base_url)
            title = str(category.find('div', {'class': 'title'}).string)

            # Only the first category box seen for each title is added.
            if title not in categories:
                result.add_thumb(
                    ThumbInfo(thumb_url=thumb_url,
                              href=href,
                              popup=title,
                              labels=[{
                                  'text': title,
                                  'align': 'top right'
                              }]))
                categories.add(title)
        return result
Ejemplo n.º 3
0
    def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
                   base_url: URL):
        """Fill ``result`` from a parsed page and return it.

        When the document contains a ``<video>`` element the page is handled
        as a video page: its ``<source>`` children become media URLs, an
        uploader control and the tag controls are added, and the method
        returns early.  Otherwise every ``div.video-thumb`` becomes a thumb
        and tag and pagination links are added.

        :param soup: parsed document to scan.
        :param result: accumulator receiving the video, thumbs, controls and
            pages found.
        :param base_url: forwarded to ``get_url`` with every href/src found.
        :return: the ``result`` object that was passed in.
        """
        # parse video page
        video = soup.find('video')
        if video is not None:
            urls = UrlList()
            for source in _iter(video.find_all('source')):
                # Each source is registered under its 'res' attribute value.
                urls.add(source.attrs['res'],
                         get_url(source.attrs['src'], base_url))
            result.set_video(urls.get_media_data(-1))

            user = soup.find('div', {'class': 'pull-left user-container'})
            if user is not None:
                # The label is built from the first two text fragments of the
                # container; joining a slice avoids an IndexError when fewer
                # than two are present.
                user_strings = list(user.stripped_strings)
                label = '"{0}"'.format(' '.join(user_strings[:2]))
                # The attribute filter is called with None for anchors that
                # have no href, so the membership test must be guarded.
                href = user.find(
                    'a', href=lambda x: x is not None and '#' not in x)
                if href is not None:
                    result.add_control(
                        ControlInfo(
                            label,
                            get_url(href.attrs['href'] + '/videos',
                                    base_url)))

            for tag_container in _iter(
                    soup.find_all('div', {'class': 'tags-container'})):
                for href in _iter(tag_container.find_all('a')):
                    if href.string is not None:
                        result.add_control(
                            ControlInfo(str(href.string),
                                        get_url(href.attrs['href'], base_url)))
            return result

        # parse thumbnail page
        for thumbnail in soup.find_all('div', {'class': 'video-thumb'}):
            href = get_url(thumbnail.a.attrs['href'], base_url)
            description = thumbnail.a.img.attrs['alt']
            thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

            duration = thumbnail.find('span', {'class': "time"})
            dur_time = '' if duration is None else str(duration.string)

            quality = thumbnail.find('span', {'class': "quality"})
            qual = '' if quality is None else str(quality.string)

            result.add_thumb(
                ThumbInfo(thumb_url=thumb_url,
                          href=href,
                          popup=description,
                          labels=[{
                              'text': dur_time,
                              'align': 'top right'
                          }, {
                              'text': description,
                              'align': 'bottom center'
                          }, {
                              'text': qual,
                              'align': 'top left'
                          }]))

        tags = soup.find('ul', {'class': 'drop2 hidden-xs'})
        if tags is not None:
            for tag in tags.find_all('a'):
                result.add_control(
                    ControlInfo(
                        str(tag.string).strip(),
                        get_url(tag.attrs['href'], base_url)))

        pagination = soup.find('ul', {'class': 'pagination'})
        if pagination is not None:
            for page in pagination.find_all('a'):
                # ``.string`` is None for anchors without a single string
                # child; only purely numeric links are page numbers.
                if page.string is not None and page.string.isdigit():
                    result.add_page(
                        ControlInfo(page.string,
                                    get_url(page.attrs['href'], base_url)))

        return result
Ejemplo n.º 4
0
    def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
                   base_url: URL):
        """Fill ``result`` from a parsed page and return it.

        The page is handled as a video page when the ``div#mediaspace``
        element contains at least one inline ``<script>`` whose text includes
        ``jwplayer(``.  In that case the media URL and the controls found in
        ``div.more-content`` are added (user links in document order, then
        all remaining links) and the method returns early.  Otherwise every
        ``div.post`` becomes a thumb and category and pagination links are
        added.

        :param soup: parsed document to scan.
        :param result: accumulator receiving the video, thumbs, controls and
            pages found.
        :param base_url: forwarded to ``get_url`` with every href/src found.
        :return: the ``result`` object that was passed in.
        """
        # parse video page
        content = soup.find('div', {'id': 'mediaspace'})
        if content is not None:
            urls = UrlList()
            is_video = False
            # The text filter is called with ``tag.string``, which is None for
            # scripts without a single string child (e.g. external scripts),
            # so the membership test must be guarded.
            for script in _iter(
                    content.find_all(
                        'script',
                        text=lambda x: x is not None and 'jwplayer(' in x)):
                # Spaces are stripped so the 'file:"' marker matches
                # regardless of how the player config is formatted.
                data = str(script.string).replace(' ', '')
                file = quotes(data, 'file:"', '"')
                urls.add('DEFAULT', get_url(file, base_url))
                is_video = True

            if is_video:
                result.set_video(urls.get_media_data())

                # adding tags to video
                # Non-user links are buffered so that every user control is
                # added to the result before any tag control.
                tags = []
                for item in _iter(
                        soup.find_all('div', {'class': 'more-content'})):
                    for href in _iter(item.find_all('a')):
                        if href.string is not None:
                            if '/user/' in href.attrs['href']:
                                result.add_control(
                                    ControlInfo(
                                        '"' + str(href.string) + '"',
                                        get_url(href.attrs['href'], base_url)))
                            else:
                                tags.append(
                                    ControlInfo(
                                        str(href.string),
                                        get_url(href.attrs['href'], base_url)))

                for item in tags:
                    result.add_control(item)

                return result

        # parse thumbnail page
        for thumbnail in _iter(soup.find_all('div', {'class': 'post'})):
            href = get_url(thumbnail.a.attrs['href'], base_url)
            description = thumbnail.a.img.attrs['alt']
            thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

            duration = thumbnail.find('b', {'class': 'post-duration'})
            dur_time = '' if duration is None else str(duration.string)

            result.add_thumb(
                ThumbInfo(thumb_url=thumb_url,
                          href=href,
                          popup=description,
                          labels=[{
                              'text': dur_time,
                              'align': 'top right'
                          }, {
                              'text': description,
                              'align': 'bottom center'
                          }]))

        tags_container = soup.find('div', {'class': 'site-cats'})
        if tags_container is not None:
            for tag in _iter(tags_container.find_all('a')):
                result.add_control(
                    ControlInfo(str(tag.string),
                                get_url(tag.attrs['href'], base_url)))

        pagination = soup.find('div', {'class': 'pagination'})
        if pagination is not None:
            for page in _iter(pagination.find_all('a')):
                # ``.string`` is None for anchors without a single string
                # child; only purely numeric links are page numbers.
                if page.string is not None and page.string.isdigit():
                    result.add_page(
                        ControlInfo(page.string,
                                    get_url(page.attrs['href'], base_url)))
        return result