Esempio n. 1
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'content-inner')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url) + '*')
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagination_link')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x + '*')
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('div', 'class', 'sub_menu dark-menu'),
            ('div', 'class', 'sub-menu dark-menu')
        ])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url) + '*')
        parser.add_rule(startpage_hrefs_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'content')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'jwplayer' in text)
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id',
                                                    'media-tags-container')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x + '*')
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class',
                                                    'thumb-member-username')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        # gallery_user_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x + '*')
        # gallery_user_rule.set_attribute_filter_function('href',lambda x:'/categories/' in x)
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                file = self.quotes(item['data'].replace(' ', ''), '"file":"',
                                   '"')
                urls.add('default', URL(file + '*'))

            result.set_video(urls.get_media_data())

            if gallery_user_rule.is_result():
                user = gallery_user_rule.get_result()[0]['href'].rpartition(
                    '/')[2]
                result.add_control(
                    ControlInfo('"' + user + ' uploads"',
                                URL('http://motherless.com/u/' + user + '*')))
                result.add_control(
                    ControlInfo(
                        '"' + user + ' gals"',
                        URL('http://motherless.com/galleries/member/' + user +
                            '*')))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item.get('title', item.get('data', '')),
                                URL(item['href'])))

        return result
Esempio n. 2
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'video-item compact')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('ul', 'class',
                                                       'pagination')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_categories_rule = ParserRule()
        startpage_categories_rule.add_activate_rule_level([
            ('nav', 'class', 'video-categories')
        ])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_categories_rule.add_process_rule_level('a', {'href'})
        # startpage_categories_rule.set_attribute_filter_function('href',lambda x:'/free_porn/' in x)
        startpage_categories_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_categories_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('div', 'class',
                                                       'cat-menu hidden-xs')])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_filter_function(
            'href', lambda x: '/free_porn/' in x)
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('video', '', '')])
        video_rule.add_process_rule_level('source', {'src'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer' in text)
        video_rule.set_attribute_modifier_function('src',
                                                   lambda txt: txt + '*')
        parser.add_rule(video_rule)

        gallery_rule = ParserRule()
        gallery_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
        gallery_rule.add_process_rule_level('a', {})
        gallery_rule.add_process_rule_level('img', {'src'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer' in text)
        gallery_rule.set_attribute_modifier_function(
            'src', lambda txt: txt.replace('/thumbs/', '/'))
        parser.add_rule(gallery_rule)

        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'tags')])
        gallery_href_rule.add_process_rule_level('a', {'href', 'title'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: (self.get_href(x, base_url)))
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class', 'uploaded')
                                                   ])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        # gallery_user_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x + '*')
        # gallery_user_rule.set_attribute_filter_function('href',lambda x:'/categories/' in x)
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():  # len(video_rule.get_result()) > 0:
            urls = UrlList()
            for item in video_rule.get_result():
                urls.add('default', URL(item['src']))
            result.set_video(urls.get_media_data(-1))

            if gallery_user_rule.is_result():
                user = gallery_user_rule.get_result()[0]['href'].rpartition(
                    '/')[2]
                result.add_control(
                    ControlInfo(
                        '"' + user + '"',
                        URL('http://www.heavy-r.com/user/' + user +
                            '?pro=videos*')))

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip()
                if label == '':
                    label = f['title']
                result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if gallery_rule.is_result():
            result.set_type('pictures')
            url = URL(gallery_rule.get_result()[0]['src'] + '*')
            base_dir = url.get_path(base=Setting.base_dir)
            result.set_gallery_path(base_dir)
            for f in gallery_rule.get_result():
                picture = FullPictureInfo(abs_href=URL(f['src'] + '*'),
                                          rel_name=f['src'].rpartition('/')[2])
                picture.set_base(base_dir)
                result.add_full(picture)

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip()
                if label == '':
                    label = f['title']
                if '/user/' in f['href']:
                    split = f['href'].rpartition('-')
                    base = split[0].partition('/user/')[0]
                    # print(split)
                    # print(base)
                    result.add_control(
                        ControlInfo(label + ' videos',
                                    URL(base + '/uploads-by-user/' +
                                        split[2])))
                    result.add_control(
                        ControlInfo(
                            label + ' gals',
                            URL(base + '/uploads-by-user/' + split[2] +
                                '?photos=1')))
                else:
                    result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            if len(startpage_categories_rule.get_result(['href'])) > 0:
                for item in startpage_categories_rule.get_result(
                    ['href', 'data']):
                    result.add_control(
                        ControlInfo(item.get('data', ''), URL(item['href'])))

            if len(startpage_hrefs_rule.get_result(['href'])) > 0:
                for item in startpage_hrefs_rule.get_result(['href', 'data']):
                    result.add_control(
                        ControlInfo(item.get('data', ''), URL(item['href'])))

        return result
Esempio n. 3
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'contents videos')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'id',
                                                       'pagination')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'contents')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'jwplayer' in text)
        # video_rule.set_attribute_modifier_function('src',lambda x:self.get_href(x,base_url))
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                    'info_holder')])
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'l')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_filter_function(
            'href', lambda x: '/profiles/' not in x)
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class',
                                                    'info_holder')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda x: '/profiles/' in x)
        gallery_user_rule.set_attribute_modifier_function(
            'href',
            lambda x: self.get_href(x.replace('.html', '/videos/'), base_url))
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = UrlList()
            for item in video_rule.get_result():
                file = self.quotes(item['data'], 'file:', ',').strip(' "')
                urls.add('default', URL(file + '*'))

            result.set_video(urls.get_media_data())

            for f in gallery_user_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo('"' + f['data'].strip() + '"', URL(f['href'])))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 4
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'well well-sm hover'),
            ('div', 'class', 'channelContainer')
        ])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('ul', 'class',
                                                       'pagination')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class',
                                                       'drop2 hidden-xs')])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        # startpage_hrefs_rule.set_attribute_filter_function('href',lambda x: '/videos/' in x)
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'video-container')
                                            ])
        video_rule.add_process_rule_level('source', {'src', 'label', 'res'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'video_url' in text)
        video_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                    'm-t-10 overflow-hidden')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule(collect_data=True)
        gallery_user_rule.add_activate_rule_level([
            ('div', 'class', 'pull-left user-container')
        ])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda x: '#' not in x)
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                urls.add(item['res'], URL(item['src']))

            result.set_video(urls.get_media_data(-1))

            for f in gallery_user_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo('"' + f['data'].strip() + '"', URL(f['href'])))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                label = item['href'].strip('*').rpartition('/')[2]
                result.add_control(ControlInfo(label, URL(item['href'])))

        return result
Esempio n. 5
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('span', 'class', 'thumb_container_box short'),
            ('span', 'class', 'thumb_container_box long')
        ])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'main-sectionpaging')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: util.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('ul', 'class', 'simple-list simple-list--channels')
        ])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        # startpage_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x,base_url) + '*')
        parser.add_rule(startpage_hrefs_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('video', '', '')])
        video_rule.add_process_rule_level('source', {'src', 'id'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer' in text)
        video_rule.set_attribute_modifier_function('src',
                                                   lambda txt: txt + '*')
        parser.add_rule(video_rule)

        video_script_rule = ParserRule()
        video_script_rule.add_activate_rule_level([('body', '', '')])
        video_script_rule.add_process_rule_level('script', {})
        video_script_rule.set_attribute_filter_function(
            'data', lambda text: 'shows:' in text)
        # video_script_rule.set_attribute_modifier_function('src',lambda txt:txt+'*')
        parser.add_rule(video_script_rule)

        gallery_rule = ParserRule()
        gallery_rule.add_activate_rule_level([('div', 'id', 'slideshow')])
        gallery_rule.add_process_rule_level('a', {'index'})
        gallery_rule.add_process_rule_level('img', {'src'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer' in text)
        gallery_rule.set_attribute_modifier_function(
            'src', lambda txt: txt.replace('/thumbs/', '/'))
        parser.add_rule(gallery_rule)

        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'added')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: util.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_script_rule.is_result() or video_rule.is_result():
            files = set()
            default_vid = None

            for item in video_script_rule.get_result():
                script = item['data'].replace(' ', '')
                streams = up.unquote(self.quotes(script, '"streams":[', ']'))
                while '"file":"' in streams:
                    split = streams.partition('"file":"')[2].partition('"')
                    streams = split[2]
                    files.add(split[0] + '*')

            for item in video_rule.get_result():
                files.add(item['src'])
                if 'id' not in item:
                    default_vid = item['src']

            if len(files) == 0:
                return result

            if default_vid is None:
                default_vid = files.pop()
            else:
                files.discard(default_vid)

            urls = UrlList()
            urls.add('Default', URL(default_vid))
            for item in files:
                urls.add(item[-5:-1], URL(item))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip()
                result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if gallery_rule.is_result():
            result.set_type('pictures')
            url = URL(gallery_rule.get_result()[0]['src'] + '*')
            base_dir = url.get_path(base=Setting.base_dir)
            result.set_gallery_path(base_dir)
            for f in gallery_rule.get_result():
                fname = int(f['index'])
                # print(fname)
                picture = FullPictureInfo(
                    abs_href=URL(f['src'] + '*'),
                    rel_name='pic{0:03d}.jpg'.format(fname))
                picture.set_base(base_dir)
                result.add_full(picture)

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip()
                result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if startpage_rule.is_result():  # len(startpage_rule.get_result()) > 0:
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('title', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            if len(startpage_hrefs_rule.get_result(['href'])) > 0:
                for item in startpage_hrefs_rule.get_result(['href', 'data']):
                    result.add_control(
                        ControlInfo(item.get('title', item.get('data', '')),
                                    URL(item['href'])))

        return result
Esempio n. 6
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        def star_get_url(txt=''):
            return txt.partition('(')[2].partition(')')[0]

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'videos_form')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'data-lazy-src'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'videos_page')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer(' in text)
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'Categories')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                file = self.quotes(item['data'].replace(' ', ''), "file:'", "'")
                urls.add('default', URL(file + '*'))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if startpage_rule.is_result():  # len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                # print(item)
                result.add_thumb(ThumbInfo(thumb_url=URL(item['data-lazy-src']), href=URL(item['href']),
                                           popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href']):
                href = item['href']
                label = href.split('/')[-2]
                # print(label,href)
                result.add_control(ControlInfo(label, URL(href)))

        return result
Esempio n. 7
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'video-thumb')
                                                ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('ul', 'class',
                                                       'pagination')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('video', '', '')])
        video_rule.add_process_rule_level('source', {'src', 'label', 'res'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        # gallery_href_rule.add_activate_rule_level([('div', 'class', 'option')])
        gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                    'tags-container')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        # gallery_href_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x + '*')
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = UrlList()
            for item in video_rule.get_result(['src', 'res']):
                urls.add(item['res'], URL(item['src']))

            result.set_video(urls.get_media_data(-1))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'],
                                               URL(f['href'] + '*')))
            return result

        if startpage_rule.is_result():
            # for item in startpage_rule.get_result():
            #     print(item)

            for item in startpage_rule.get_result(['href', 'src']):
                href = item['href']
                if '/category/' in href:
                    result.set_caption_visible(True)
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(href),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 8
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        # startpage_rule.add_activate_rule_level([('section', '', '')])
        startpage_rule.add_activate_rule_level([('div', 'class', 'inner-block')
                                                ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'id',
                                                       'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'stage-video')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'jwplayer(' in text)
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('ul', 'class', 'stats-list stats-list--plain')
        ])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = UrlList()
            for item in video_rule.get_result():
                # print(item['data'])
                file = self.quotes(item['data'].replace(' ', ''), 'file:"',
                                   '"')
                urls.add('DEFAULT', URL(file))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                if '/user/' in f['href']:
                    form = '"{0}"'
                else:
                    form = '{0}'

                result.add_control(
                    ControlInfo(form.format(f['data']), URL(f['href'])))

            return result

        if startpage_rule.is_result():

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt'
                                             '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 9
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'item-col col')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        # startpage_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x,base_url) + '*' )
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('nav', 'class', 'pagination-col col pagination')
        ])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: 'http:/' + base_url.get_path() + x + '*')
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('ul', 'class', 'simple-list simple-list--channels')
        ])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        # startpage_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x,base_url) + '*')
        parser.add_rule(startpage_hrefs_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('video', 'id', 'thisPlayer')])
        video_rule.add_process_rule_level('source', {'src'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer' in text)
        video_rule.set_attribute_modifier_function('src',
                                                   lambda txt: txt + '*')
        parser.add_rule(video_rule)

        gallery_rule = ParserRule()
        gallery_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
        gallery_rule.add_process_rule_level('a', {})
        gallery_rule.add_process_rule_level('img', {'src'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer' in text)
        gallery_rule.set_attribute_modifier_function(
            'src', lambda txt: txt.replace('/thumbs/', '/'))
        parser.add_rule(gallery_rule)

        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('div', 'class', 'tags-block'),
            ('div', 'class', 'submitter-container')
        ])
        gallery_href_rule.add_process_rule_level('a', {'href', 'title'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: (self.get_href(x, base_url)))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                urls.add('default', URL(item['src']))
            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip()
                if label == '':
                    label = f['title']
                if '/user/' in f['href']:
                    split = f['href'].rpartition('-')
                    base = split[0].partition('/user/')[0]
                    result.add_control(
                        ControlInfo('"' + label + ' videos"',
                                    URL(base + '/uploads-by-user/' +
                                        split[2])))
                    result.add_control(
                        ControlInfo(
                            '"' + label + ' gals"',
                            URL(base + '/uploads-by-user/' + split[2] +
                                '?photos=1')))
                else:
                    result.add_control(ControlInfo(label, URL(f['href'])))
            return result

        if gallery_rule.is_result():
            result.set_type('pictures')
            url = URL(gallery_rule.get_result()[0]['src'] + '*')
            base_dir = url.get_path(base=Setting.base_dir)
            result.set_gallery_path(base_dir)
            for f in gallery_rule.get_result():
                picture = FullPictureInfo(abs_href=URL(f['src'] + '*'),
                                          rel_name=f['src'].rpartition('/')[2])
                picture.set_base(base_dir)
                result.add_full(picture)

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip()
                if label == '':
                    label = f['title']
                if '/user/' in f['href']:
                    split = f['href'].rpartition('-')
                    base = split[0].partition('/user/')[0]
                    result.add_control(
                        ControlInfo('"' + label + ' videos"',
                                    URL(base + '/uploads-by-user/' +
                                        split[2])))
                    result.add_control(
                        ControlInfo(
                            '"' + label + ' gals"',
                            URL(base + '/uploads-by-user/' + split[2] +
                                '?photos=1')))
                else:
                    result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item.get('title', item.get('data', '')),
                                URL(item['href'])))

        return result
Esempio n. 10
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        # startpage_rule.add_activate_rule_level([('section', '', '')])
        startpage_rule.add_activate_rule_level([
            ('article', 'class', 'teaser singleLink hasButtonRow'),
            ('article', 'class', 'activity video hasButtonFooter')
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img',
                                              {'src', 'data-lazysrc', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('nav', 'class', 'clearfix pagination bottom'),
            ('nav', 'class', 'range rangeCount-2 clearfix')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href', 'data-href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_pages_rule.set_attribute_modifier_function(
            'data-href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_categories_rule = ParserRule()
        startpage_categories_rule.add_activate_rule_level([
            ('select', 'id', 'input_selectCategories')
        ])
        startpage_categories_rule.add_process_rule_level('option', {'value'})
        startpage_categories_rule.set_attribute_modifier_function(
            'value', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_categories_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'playerWrapper')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'sources:' in text)
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('dl', 'class', 'group')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([
            ('nav', 'class', 'profileNav clearfix buttonRow')
        ])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda x: '#videos' in x)
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = UrlList()
            for item in video_rule.get_result():
                script = item['data'].replace(' ', '').replace('\\', '')
                sources = self.quotes(script, 'sources:{"', '"},').split('","')
                for f in sources:
                    t = f.partition('":"')
                    label = t[0]
                    file = self.get_href(t[2], base_url)
                    urls.add(label, URL(file))

            result.set_video(urls.get_media_data())

            # for f in gallery_user_rule.get_result():
            #     print(f)
            #     name='"{0}"'.format(f['href'].rpartition('/')[2].partition('#')[0])
            #     result.add_control(ControlInfo(name, URL(f['href'])))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if startpage_rule.is_result():

            for item in startpage_rule.get_result(['href']):
                # print(item)
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(
                        item.get('data-lazysrc', item['src'])),
                              href=URL(item['href']),
                              popup=item.get('alt'
                                             '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                # print(item)
                href = item.get('data-href', item['href'])
                # print(href)
                result.add_page(
                    ControlInfo(href.rpartition('/')[2].strip('*'), URL(href)))

            for item in startpage_categories_rule.get_result():
                result.add_control(
                    ControlInfo(item['data'], URL(item['value'])))

        return result
Esempio n. 11
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'item  '),
                                                ('div', 'class',
                                                 'list-categories')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'data-original', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagination')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('li', {'class'})
        startpage_pages_rule.add_process_rule_level('a',
                                                    {'href', 'data-query'})
        startpage_pages_rule.set_attribute_filter_function(
            'class', lambda x: x == 'page')
        startpage_pages_rule.set_attribute_modifier_function(
            'data-query', lambda x: x.partition(':')[2])
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'flashvars' in text)
        # video_rule.set_attribute_modifier_function('src',lambda x:self.get_href(x,base_url))
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'info')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        # gallery_href_rule.set_attribute_filter_function('href',lambda x:'/profiles/' not in x)
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class',
                                                    'block-user')])
        gallery_user_rule.add_process_rule_level('a', {'href', 'title'})
        # gallery_user_rule.set_attribute_filter_function('href',lambda x:'/profiles/' in x)
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x + 'videos/', base_url))
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = UrlList()
            for item in video_rule.get_result():
                flashvars = self.quotes(
                    item['data'].replace(' ',
                                         '').replace('\n',
                                                     '').replace('\t', ''),
                    'flashvars={', '};').split(',')
                fv = dict()
                for flashvar in flashvars:
                    split = flashvar.partition(':')
                    fv[split[0]] = split[2].strip("'\"")

                # file=self.quotes(item['data'],'file:',',').strip(' "')
                urls.add('default', URL(fv['video_url'] + '*'))

            result.set_video(urls.get_media_data())

            for f in gallery_user_rule.get_result(['title', 'href']):
                result.add_control(
                    ControlInfo('"' + f['title'].strip() + '"',
                                URL(f['href'])))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['data-original']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

            for item in startpage_pages_rule.get_result(['href',
                                                         'data-query']):
                result.add_page(
                    ControlInfo(item['data-query'], URL(item['href'])))

        return result
Esempio n. 12
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'loop-nav-inner')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('ul', 'class', 'menu')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(tags_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class',
                                             'section-content'),
                                            ('div', 'id', 'video')])
        video_rule.add_process_rule_level('iframe', {'src'})
        # video_rule.set_attribute_filter_function('src',lambda x:'fileone.tv' in x)
        video_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                print(item)
                src = item['src']
                if '.video/embed' in src:
                    try:
                        r = load(URL(item['src']))
                        setup = self.quotes(r.text,
                                            'jwplayer("vplayer").setup(',
                                            ")").replace(' ', '')
                        sources = self.quotes(setup, 'sources:[{',
                                              '}],').split('},{')
                        for item in sources:
                            if '.mp4' in item:
                                file = self.quotes(item, 'file:"', '"')
                                label = self.quotes(item, 'label:"', '"')
                                urls.add(label, URL(file + '*'))
                    except LoaderError as err:
                        print(err)
                elif 'javfinder.com/' in src:
                    try:
                        r = load(URL(item['src']))
                        split1 = r.text.split('<source src="')[1:]
                        for f in split1:
                            f1 = f.partition('>')[0]
                            if '.mp4' in f1:
                                file = f1.partition('"')[0]
                                label = self.quotes(f1, 'res="', '"')
                                urls.add(label, URL(file + '*'))
                    except LoaderError as err:
                        print(err)

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(f['data'].strip(), URL(f['href'])))
            return result

        if startpage_rule.is_result():  # len(startpage_rule.get_result()) > 0:
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result
Esempio n. 13
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        # def star_get_url(txt=''):
        #     return txt.partition('(')[2].partition(')')[0]

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'image ')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagination')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('div', 'class', 'sub_menu dark-menu'),
            ('div', 'class', 'sub-menu dark-menu')
        ])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'flashvars' in text)
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                    'block_content')])
        # gallery_href_rule.add_activate_rule_level([('td', 'colspan', '2')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_filter_function(
            'href', lambda x: '/tags/' in x or '/categories/' in x)
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class',
                                                    'block_content')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda x: '/members/' in x)
        # gallery_user_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x,base_url))
        # gallery_user_rule.set_attribute_filter_function('href',lambda x:'/categories/' in x)
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                file = self.quotes(item['data'].replace(' ', ''),
                                   "video_url:'", "'")
                urls.add('default', URL(file))

            result.set_video(urls.get_media_data())

            if gallery_user_rule.is_result():
                username = gallery_user_rule.get_result()[0].get('data', '***')
                user = gallery_user_rule.get_result()[0]['href'].rstrip(
                    '/').rpartition('/')[2]
                result.add_control(
                    ControlInfo(
                        '"' + username + '"',
                        URL('http://gobdsm.com/members/' + user +
                            '/public_videos/')))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item.get('title', item.get('data', '')),
                                URL(item['href'])))

        return result
Esempio n. 14
0
    def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
                   base_url: URL):
        # parce video page
        content = soup.find('div', {'id': 'mediaspace'})
        if content is not None:
            urls = UrlList()
            is_video = False
            for script in _iter(
                    content.find_all('script',
                                     text=lambda x: 'jwplayer(' in x)):
                data = str(script.string).replace(' ', '')
                file = quotes(data, 'file:"', '"')
                urls.add('DEFAULT', get_url(file, base_url))
                is_video = True

            if is_video:
                result.set_video(urls.get_media_data())

                #adding tags to video
                tags = list()
                for item in _iter(
                        soup.find_all('div', {'class': 'more-content'})):
                    for href in _iter(item.find_all('a')):
                        if href.string is not None:
                            if '/user/' in href.attrs['href']:
                                result.add_control(
                                    ControlInfo(
                                        '"' + str(href.string) + '"',
                                        get_url(href.attrs['href'], base_url)))
                            else:
                                tags.append(
                                    ControlInfo(
                                        str(href.string),
                                        get_url(href.attrs['href'], base_url)))

                for item in tags:
                    result.add_control(item)

                return result

        # parce thumbnail page
        for thumbnail in _iter(soup.find_all('div', {'class': 'post'})):
            href = get_url(thumbnail.a.attrs['href'], base_url)
            description = thumbnail.a.img.attrs['alt']
            thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

            duration = thumbnail.find('b', {'class': 'post-duration'})
            dur_time = '' if duration is None else str(duration.string)

            result.add_thumb(
                ThumbInfo(thumb_url=thumb_url,
                          href=href,
                          popup=description,
                          labels=[{
                              'text': dur_time,
                              'align': 'top right'
                          }, {
                              'text': description,
                              'align': 'bottom center'
                          }]))

        tags_container = soup.find('div', {'class': 'site-cats'})
        if tags_container is not None:
            for tag in _iter(tags_container.find_all('a')):
                result.add_control(
                    ControlInfo(str(tag.string),
                                get_url(tag.attrs['href'], base_url)))

        pagination = soup.find('div', {'class': 'pagination'})
        if pagination is not None:
            for page in _iter(pagination.find_all('a')):
                if page.string is not None and page.string.isdigit():
                    result.add_page(
                        ControlInfo(page.string,
                                    get_url(page.attrs['href'], base_url)))
        return result
Esempio n. 15
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'item  ')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'data-original', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagination-holder')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'flashvars' in text)
        # video_rule.set_attribute_modifier_function('src',lambda x:self.get_href(x,base_url))
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'info')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = UrlList()
            for item in video_rule.get_result():
                flashvars = self.quotes(
                    item['data'].replace(' ',
                                         '').replace('\n',
                                                     '').replace('\t', ''),
                    'flashvars={', '};').split(',')
                fv = dict()
                for flashvar in flashvars:
                    split = flashvar.partition(':')
                    fv[split[0]] = split[2].strip("'\"")
                files = dict()
                for f in fv:
                    if fv[f].startswith('http://') and fv[f].rpartition(
                            '.')[2].strip('/') == 'mp4':
                        file = fv[f]
                        label = fv.get(f + '_text', f)
                        files[label] = file

                for key in sorted(files.keys(), reverse=True):
                    urls.add(key, URL(files[key]))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if startpage_rule.is_result():

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['data-original']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 16
0
    def parse_index_file(self, fname, base_url=URL()):

        print('This site is very slow, be patient')

        parser = SiteParser()

        def star_get_url(txt=''):
            return txt.partition('(')[2].partition(')')[0]

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'item  ')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pagination-holder')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)
        #
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'flashvars' in text)
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'item')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                file = self.quotes(item['data'].replace(' ', ''), "video_url:'", "'")
                urls.add('default', URL(file))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href']):
                href = item['href']
                label = href.split('/')[-2]
                # print(label,href)
                result.add_control(ControlInfo(label, URL(href)))

        return result
Esempio n. 17
0
    def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
                   base_url: URL):
        # parce video page
        video = soup.find('video')
        if video is not None:
            urls = UrlList()
            for source in _iter(video.find_all('source')):
                urls.add(source.attrs['res'],
                         get_url(source.attrs['src'], base_url))
            result.set_video(urls.get_media_data(-1))

            user = soup.find('div', {'class': 'pull-left user-container'})
            if user is not None:
                user_strings = [string for string in user.stripped_strings]
                label = '"{0} {1}"'.format(user_strings[0], user_strings[1])
                href = user.find('a', href=lambda x: '#' not in x)
                result.add_control(
                    ControlInfo(
                        label, get_url(href.attrs['href'] + '/videos',
                                       base_url)))

            for tag_container in _iter(
                    soup.find_all('div', {'class': 'tags-container'})):
                for href in _iter(tag_container.find_all('a')):
                    if href.string is not None:
                        result.add_control(
                            ControlInfo(str(href.string),
                                        get_url(href.attrs['href'], base_url)))
            return result

        # parce thumbnail page
        for thumbnail in soup.find_all('div', {'class': 'video-thumb'}):
            href = get_url(thumbnail.a.attrs['href'], base_url)
            description = thumbnail.a.img.attrs['alt']
            thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

            duration = thumbnail.find('span', {'class': "time"})
            dur_time = '' if duration is None else str(duration.string)

            quality = thumbnail.find('span', {'class': "quality"})
            qual = '' if quality is None else str(quality.string)

            result.add_thumb(
                ThumbInfo(thumb_url=thumb_url,
                          href=href,
                          popup=description,
                          labels=[{
                              'text': dur_time,
                              'align': 'top right'
                          }, {
                              'text': description,
                              'align': 'bottom center'
                          }, {
                              'text': qual,
                              'align': 'top left'
                          }]))

        tags = soup.find('ul', {'class': 'drop2 hidden-xs'})
        if tags is not None:
            for tag in tags.find_all('a'):
                result.add_control(
                    ControlInfo(
                        str(tag.string).strip(),
                        get_url(tag.attrs['href'], base_url)))

        pagination = soup.find('ul', {'class': 'pagination'})
        if pagination is not None:
            for page in pagination.find_all('a'):
                if page.string.isdigit():
                    result.add_page(
                        ControlInfo(page.string,
                                    get_url(page.attrs['href'], base_url)))

        return result
Esempio n. 18
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('article', '', '')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('ul', 'class',
                                                       'pagination2')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('section', 'class',
                                                       'categories')])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        # startpage_hrefs_rule.set_attribute_modifier_function('title', lambda x: x.replace('Sex films in de categorie ',''))
        parser.add_rule(startpage_hrefs_rule)
        #

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('video', '', '')])
        video_rule.add_process_rule_level('source', {'src'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer' in text)
        video_rule.set_attribute_modifier_function('src',
                                                   lambda txt: txt + '*')
        parser.add_rule(video_rule)

        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'tags')])
        gallery_href_rule.add_process_rule_level('a', {'href', 'title'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                urls.add('default', URL(item['src']))
            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['href']):
                result.add_control(ControlInfo(f['title'], URL(f['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            def href_simple(txt=''):
                txt = txt.lower().replace(' ',
                                          '').replace('sexfilmsindecategorie',
                                                      '')
                txt = txt.replace('insexfilms',
                                  '').replace('sexfilms',
                                              '').replace('inhd', 'HD')
                return txt

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(href_simple(item.get('title', '')),
                                URL(item['href'] + '*')))

        return result
Esempio n. 19
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('div', 'class', 'tagcloud')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(tags_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'videoContainer')])
        video_rule.add_process_rule_level('iframe', {'src'})
        video_rule.set_attribute_modifier_function('src', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                try:
                    r = load(URL(item['src']))
                    r = load(URL(self.quotes(r.text, "jwplayer().load('", "'") + '*'))
                    source = self.quotes(r.text, '<item>', '</item>').strip()
                    split = source.split('<jwplayer:source file="')
                    for l in split:
                        if l is '':
                            continue
                        url = l.partition('"')[0]
                        label = self.quotes(l, 'label="', '"')
                        urls.add(label, URL(url + '*'))

                except LoaderError as err:
                    print(err)

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'].strip(), URL(f['href'])))
            return result

        if startpage_rule.is_result():  # len(startpage_rule.get_result()) > 0:
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 20
0
    def parse_index_file(self, fname, base_url=URL()):
        if base_url.contain('format=json'):
            xhr_page = True
        else:
            xhr_page = False

        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class',
             'relative margin-auto video-box-wrapper normal-box')
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'alt', 'data-srcmedium'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'video-container')
                                            ])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'flashvars=' in text.replace(' ', ''))
        # video_rule.set_attribute_modifier_function('src', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_rule)
        #
        video_href_rule = ParserRule()
        video_href_rule.add_activate_rule_level([('div', 'class',
                                                  'ibInfo js_ibInfo')])
        video_href_rule.add_process_rule_level('a', {'href'})
        video_href_rule.set_attribute_filter_function(
            'href', lambda x: 'javascript' not in x)
        video_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_href_rule)

        video_user_rule = ParserRule()
        video_user_rule.add_activate_rule_level([('div', 'class', 'ibLine1')])
        video_user_rule.add_process_rule_level('a', {'href'})
        # video_user_rule.set_attribute_filter_function('href',lambda x: 'javascript' not in x)
        video_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_user_rule)

        if not xhr_page:
            self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                flashvars = self.quotes(item['data'].replace(' ', ''),
                                        'flashvars={',
                                        '};').replace('\\', '').split(',"')
                for v in flashvars:
                    if v.startswith('quality_'):
                        label = self.quotes(v, 'quality_', '"')
                        file = self.quotes(v, ':"', '"')
                        urls.add(label, URL(file + '*'))

            result.set_video(urls.get_media_data(-1))

            for f in video_user_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo('"' + f['data'] + '"', URL(f['href'])))

            for f in video_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if xhr_page:

            def parce_data(data, buttons_added, base_url, items_name='items'):
                xhr_data = base_url.xhr_data
                items = data[items_name]
                nav = data['navigation']
                last_page = nav['lastPage']
                if last_page is None: last_page = 1
                curr_page = int(base_url.get().rpartition('page=')[2])
                pattern = nav['urlPattern'].replace('[%pageId%]', '{}') + '*'

                for item in items:
                    thumb_url = item['thumb_url']
                    title = item['specialchars_title']
                    url = item['video_link']
                    result.add_thumb(
                        ThumbInfo(thumb_url=URL(thumb_url),
                                  href=URL(url + '*'),
                                  popup=title))

                # print('Page {} of {}'.format(curr_page, last_page), pattern.format(curr_page))
                if not buttons_added:
                    result.add_page(ControlInfo('1', xhr_data['base_url']))

                    p_from = curr_page - 5
                    if p_from < 2: p_from = 2

                    p_to = curr_page + 5
                    if p_to > last_page: p_to = last_page

                    for x in range(p_from, p_to):
                        url = URL(pattern.format(x), xhr_data=xhr_data)
                        if x == curr_page: x = str(x) + '(this)'
                        result.add_page(ControlInfo(str(x), url))

                    last_url = URL(pattern.format(last_page),
                                   xhr_data=xhr_data)
                    result.add_page(ControlInfo(str(last_page), last_url))
                    buttons_added = True

                return buttons_added

            with open(fname) as fp:
                try:
                    json_data = json.load(fp)
                    buttons_added = False
                    if base_url.contain('/keyword/'):
                        for i in json_data:
                            buttons_added = parce_data(i, buttons_added,
                                                       base_url)
                    elif base_url.contain('/users/'):
                        parce_data(json_data['response'],
                                   buttons_added,
                                   base_url,
                                   items_name='videos')
                    else:
                        for i in json_data:
                            buttons_added = parce_data(json_data[i],
                                                       buttons_added, base_url)
                except ValueError:
                    pass
            return result

        if startpage_rule.is_result():

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['data-srcmedium']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            xhr_data = {'base_url': base_url}

            next_url = URL(
                base_url.get() + '*',
                xhr_data=xhr_data)  # +'?format=json&number_pages=1&page=2*'
            next_url.add_query([('format', 'json'), ('number_pages', '1'),
                                ('page', '2')])
            result.add_page(ControlInfo('next', next_url))

        return result
Esempio n. 21
0
    def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
                   base_url: URL):
        # parce video page
        content = soup.find('div', {'id': 'content'})
        if content is not None:
            urls = UrlList()
            is_video = False
            for script in _iter(
                    content.find_all('script',
                                     text=lambda x: 'jwplayer(' in x)):
                data = str(script.string).replace(' ', '')
                file = quotes(data, '"file":"', '"')
                urls.add('DEFAULT', get_url(file, base_url))
                is_video = True

            if is_video:
                result.set_video(urls.get_media_data())

                #adding "user" to video
                user = soup.find('div', {'class': 'thumb-member-username'})
                if user is not None:
                    href = user.find('a').attrs['href']
                    username = href.rpartition('/')[2]

                    result.add_control(
                        ControlInfo(
                            '"' + username + ' uploads"',
                            URL('http://motherless.com/u/' + username + '*')))
                    result.add_control(
                        ControlInfo(
                            '"' + username + ' gals"',
                            URL('http://motherless.com/galleries/member/' +
                                username + '*')))

                #adding tags to video
                for item in _iter(
                        soup.find_all('div', {'id': 'media-tags-container'})):
                    for href in _iter(item.find_all('a')):
                        if href.string is not None:
                            result.add_control(
                                ControlInfo(
                                    str(href.string),
                                    get_url(href.attrs['href'], base_url)))

                return result

        # parce thumbnail page
        for item in _iter(soup.find_all('div', {'class': ['content-inner']})):
            for thumbnail in _iter(item.find_all('div', {'class': 'thumb'})):
                href = get_url(thumbnail.a.attrs['href'], base_url)
                thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

                duration = thumbnail.find('div', {'class': 'caption left'})
                dur_time = '' if duration is None else str(duration.string)

                caption = thumbnail.find('h2', {'class': 'caption title'})
                label = '' if caption is None else str(caption.string)

                user = thumbnail.find('a', {'class': 'caption left'})
                username = '' if user is None else str(user.string)

                if not 'x' in dur_time:
                    result.add_thumb(
                        ThumbInfo(thumb_url=thumb_url,
                                  href=href,
                                  popup=label,
                                  labels=[{
                                      'text': dur_time,
                                      'align': 'top right'
                                  }, {
                                      'text': label,
                                      'align': 'bottom center'
                                  }, {
                                      'text': username,
                                      'align': 'top left'
                                  }]))
        #adding tags to thumbs
        tags = soup.find('div', {'class': 'dark-menu'})
        if tags is not None:
            for tag in _iter(tags.find_all('a')):
                # print(tag)
                result.add_control(
                    ControlInfo(
                        str(tag.string).strip(),
                        get_url(tag.attrs['href'], base_url)))
        #adding pages to thumbs
        pagination = soup.find('div', {'class': 'pagination_link'})
        if pagination is not None:
            for page in _iter(pagination.find_all('a')):
                # print(page)
                if page.string.isdigit():
                    result.add_page(
                        ControlInfo(page.string,
                                    get_url(page.attrs['href'], base_url)))

        return result
Esempio n. 22
0
    def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
                   base_url: URL):
        # parce video page
        video = soup.find('div', {'class': 'video'})
        if video is not None:
            urls = UrlList()
            for source in _iter(video.find_all('source')):
                urls.add(source.attrs['res'],
                         get_url(source.attrs['src'], base_url))
            result.set_video(urls.get_media_data(-1))

            for tag_container in _iter(
                    soup.find_all('div', {'class': 'video_header'})):
                for href in _iter(tag_container.find_all('a')):
                    if href.string is not None:
                        result.add_control(
                            ControlInfo(str(href.string),
                                        get_url(href.attrs['href'], base_url)))
            return result

        # parce thumbnail page
        thumbs_container = soup.find('div', {'class': 'videos cf'})
        if thumbs_container is not None:
            for thumbnail in _iter(
                    thumbs_container.find_all('div', {'class': ['polaroid']})):
                href = get_url(thumbnail.a.attrs['href'], base_url)
                description = thumbnail.a.img.attrs['alt']
                thumb_url = get_url(thumbnail.img.attrs['data-src'], base_url)

                duration = thumbnail.find('div', {'class': "duration"})
                dur_time = '' if duration is None else str(duration.string)

                result.add_thumb(
                    ThumbInfo(thumb_url=thumb_url,
                              href=href,
                              popup=description,
                              labels=[{
                                  'text': dur_time,
                                  'align': 'top right'
                              }, {
                                  'text': description,
                                  'align': 'bottom center'
                              }]))

            tags = soup.find('ul', {'class': 'tags cf'})
            if tags is not None:
                for tag in tags.find_all('a'):
                    result.add_control(
                        ControlInfo(
                            str(tag.string).strip(),
                            get_url(tag.attrs['href'], base_url)))

            pagination = soup.find('div', {'class': 'pagination'})
            if pagination is not None:
                for page in pagination.find_all('a'):
                    if page.string.isdigit():
                        result.add_page(
                            ControlInfo(page.string,
                                        get_url(page.attrs['href'], base_url)))
            return result

        # parce categories page
        categories = set()
        for category in _iter(soup.find_all('div', {'class': 'catbox'})):
            href = get_url(category.a.attrs['href'], base_url)
            thumb_url = get_url(category.img.attrs['data-src'], base_url)
            title = str(category.find('div', {'class': 'title'}).string)

            if title not in categories:
                result.add_thumb(
                    ThumbInfo(thumb_url=thumb_url,
                              href=href,
                              popup=title,
                              labels=[{
                                  'text': title,
                                  'align': 'top right'
                              }]))
                categories.add(title)
        return result
Esempio n. 23
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'link-3col')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'wp-pagenavi')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('div', 'class', 'tagcloud')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(tags_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('iframe', {'src'})
        video_rule.set_attribute_filter_function('src',
                                                 lambda x: 'fileone.tv' in x)
        video_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                print(item)
                try:
                    r = load(URL(item['src']))
                    setup = self.quotes(r.text, "jwplayer('player').setup(",
                                        ")").replace(' ', '')
                    file = self.quotes(setup, "file:'", "'")
                    urls.add("default", URL(file + '*'))
                except LoaderError as err:
                    print(err)

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(f['data'].strip(), URL(f['href'])))
            return result

        if startpage_rule.is_result():  # len(startpage_rule.get_result()) > 0:
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result
Esempio n. 24
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('section', '', '')])
        startpage_rule.add_activate_rule_level([('div', 'class', 'videos cf')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'data-src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'data-src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        categories_rule = ParserRule()
        # categories_rule.add_activate_rule_level([('section', '', '')])
        categories_rule.add_activate_rule_level([('div', 'class', 'catbox')])
        categories_rule.add_process_rule_level('a', {'href'})
        categories_rule.add_process_rule_level('img', {'data-src', 'alt'})
        categories_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        categories_rule.set_attribute_modifier_function(
            'data-src', lambda x: self.get_href(x, base_url))
        parser.add_rule(categories_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_tags_rule = ParserRule()
        startpage_tags_rule.add_activate_rule_level([('ul', 'class', 'tags cf')
                                                     ])
        startpage_tags_rule.add_process_rule_level('a', {'href'})
        startpage_tags_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_tags_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('video', 'id', 'video_id')])
        video_rule.add_process_rule_level('source', {'src', 'res'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'flashvars' in text)
        video_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                    'video_header')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = UrlList()
            for item in video_rule.get_result():
                urls.add(item['res'], URL(item['src']))
            result.set_video(urls.get_media_data(-1))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if startpage_rule.is_result():

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['data-src']),
                              href=URL(item['href']),
                              popup=item.get('alt'
                                             '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

            if base_url.contain('/categories/'):
                result.set_caption_visible(True)

            return result

        if categories_rule.is_result():
            urls = list()
            for item in categories_rule.get_result(['href']):
                if item['href'] in urls:
                    continue
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['data-src']),
                              href=URL(item['href']),
                              popup=item.get('alt'
                                             '')))
                urls.append(item['href'])

            result.set_caption_visible(True)
        return result
Esempio n. 25
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        # print(base_url.domain())
        def star_get_url(txt=''):
            return txt.partition('(')[2].partition(')')[0]

        startpage_rule = ParserRule(debug=False)
        startpage_rule.add_activate_rule_level([('ul', 'class', 'video-listing'),
                                                ('ul', 'class', 'video-listing two-in-row'),
                                                ('ul', 'class', 'video-listing four-in-row'),
                                                ('ul', 'class', 'video-listing two-in-row id-recommended-list'),
                                                ('ul', 'class', 'video-listing four-in-row id-recommended-list')
                                                ])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'data-src', 'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function('data-src', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function('src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pageNumbersHolder')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'categories-listing'),
                                                      ('ul', 'class', 'categories-popular-listing'),
                                                      ('ul', 'class', 'abc-categories newAbcCategories')])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)

        pornstars_rule = ParserRule()
        pornstars_rule.add_activate_rule_level([('div', 'id', 'all_pornstars')])
        pornstars_rule.add_process_rule_level('a', {'href'})
        pornstars_rule.add_process_rule_level('img', {'src', 'alt'})
        # pornstars_rule.set_attribute_filter_function('href',lambda x: '/channel/' in x or '/prime/' in x)
        pornstars_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(pornstars_rule)

        pornstars_hrefs_rule = ParserRule()
        pornstars_hrefs_rule.add_activate_rule_level([('ul', 'class', 'abc-categories newAbcCategories')])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        pornstars_hrefs_rule.add_process_rule_level('a', {'href'})
        pornstars_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(pornstars_hrefs_rule)

        channels_rule = ParserRule(debug=False)
        channels_rule.add_activate_rule_level([('ul', 'class', 'channels-list three-in-row')])
        channels_rule.add_process_rule_level('a', {'href'})
        channels_rule.add_process_rule_level('img', {'src', 'data-src', 'alt'})
        channels_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        channels_rule.set_attribute_modifier_function('src', lambda x: self.get_href(x, base_url))
        channels_rule.set_attribute_modifier_function('data-src', lambda x: self.get_href(x, base_url))
        parser.add_rule(channels_rule)

        channels_hrefs_rule = ParserRule()
        channels_hrefs_rule.add_activate_rule_level([('div', 'class', 'channel-filters-categories')])
        # channels_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        channels_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        channels_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(channels_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'watch')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'redtube_flv_player' in text)
        parser.add_rule(video_rule)
        #
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'video-details')])
        # gallery_href_rule.add_activate_rule_level([('td', 'class', 'links')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        gallery_href_rule.set_attribute_filter_function('href', lambda x: x != '*')
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                script = item['data'].replace(' ', '').replace('\\', '')
                sources = self.quotes(script, 'sources:{"', '"},').split('","')
                for f in sources:
                    t = f.partition('":"')
                    label = t[0]
                    file = self.get_href(t[2], base_url)
                    urls.add(label, URL(file))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                href = f['href']
                label = f['data']
                if '/redtube/' in href or '/tag/' in href:
                    result.add_control(ControlInfo(label, URL(href)))
                elif '/pornstar/' in href:
                    ps_name = href.rstrip('*').rpartition('/')[2].replace('+', ' ').title()
                    result.add_control(ControlInfo(ps_name, URL(href)))
                else:
                    # adding user
                    result.add_control(ControlInfo("'" + label + "'", URL(href.replace('*', '/videos*'))))
            return result

        if pornstars_rule.is_result():
            result.set_caption_visible(True)
            for item in pornstars_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in pornstars_hrefs_rule.get_result(['href']):
                result.add_control(ControlInfo(item['data'], URL(item['href'])))
            return result

        if channels_rule.is_result():
            result.set_caption_visible(True)
            for item in channels_rule.get_result(['href']):
                thumb_href = item.get('data-src', item.get('src'))
                descr = item.get('alt', '').title()
                result.add_thumb(ThumbInfo(thumb_url=URL(thumb_href), href=URL(item['href']), popup=descr))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in channels_hrefs_rule.get_result(['href']):
                result.add_control(ControlInfo(item['title'], URL(item['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                thumb_href = item.get('data-src', item.get('src'))
                descr = item.get('title', item.get('alt', ''))
                result.add_thumb(ThumbInfo(thumb_url=URL(thumb_href), href=URL(item['href']), popup=descr))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'title']):
                result.add_control(ControlInfo(item['title'], URL(item['href'])))

        return result
Esempio n. 26
0
    def parse_index_file(self, fname, base_url=URL()):
        parser = SiteParser()

        def star_get_url(txt=''):
            return txt.partition('(')[2].partition(')')[0]

        startpage_rule = ParserRule(debug=False)
        startpage_rule.add_activate_rule_level([('div', 'class', 'main l170'),
                                                ('div', 'class', 'main l200'),
                                                ('div', 'class', 'main'),
                                                ('div', 'class', 'profileRight'),
                                                ('div', 'class', 'main l200 r300')])
        startpage_rule.add_activate_rule_level([('ul', 'class', 'listThumbs'),
                                                ('ul', 'class', 'listProfiles'),
                                                ('ul', 'class', 'listChannels'),
                                                ('ul', 'class', 'listGalleries')])
        startpage_rule.add_process_rule_level('a', {'href', 'class', 'style'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x + '*')
        startpage_rule.set_attribute_modifier_function('style', star_get_url)
        startpage_rule.set_attribute_filter_function('href', lambda x: not '/pictures/' in x)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pager')])
        # startpage_pages_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x + '*')
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'sFilters initial'),
                                                      ('ul', 'class', 'sFilters'),
                                                      ('div', 'class', 'listSearches searchOption'),
                                                      ('div', 'class', 'alpha')
                                                      ])
        # startpage_hrefs_rule.add_activate_rule_level([('a', 'class', 'current')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        startpage_hrefs_rule.set_attribute_filter_function('title', lambda x: 'Combine Category' not in x)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('head', '', '')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'streams:[' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('p', 'class', 'source tags'),
                                                   ('p', 'class', 'source categories')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('p', 'class', 'source')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_filter_function('href', lambda x: '/profile/' in x)
        gallery_user_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x + '/videos', base_url))
        parser.add_rule(gallery_user_rule)

        gallery_actor_rule = ParserRule()
        gallery_actor_rule.add_activate_rule_level([('p', 'class', 'source')])
        gallery_actor_rule.add_process_rule_level('a', {'href'})
        gallery_actor_rule.set_attribute_filter_function('href', lambda x: '/pornstars/' in x)
        gallery_actor_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x + '/videos', base_url))
        parser.add_rule(gallery_actor_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = UrlList()
            for item in video_rule.get_result():
                script = item['data'].replace(' ', '')
                sources = self.quotes(script, 'streams:[{', '}]').split('},{')
                for f in sources:
                    label = self.quotes(f, 'id:"', '"')
                    file = self.quotes(f, 'url:"', '"')
                    urls.add(label, URL(file + '*'))

            result.set_video(urls.get_media_data(-1))

            for f in gallery_user_rule.get_result(['href']):
                result.add_control(ControlInfo('"' + f['data'] + '"', URL(f['href'])))

            for f in gallery_actor_rule.get_result(['href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if startpage_rule.is_result():
            #
            # for item in startpage_rule.get_result():
            #     print(item)

            for item in startpage_rule.get_result(['href', 'src']):
                caption = ''
                href = item['href']
                if '/channels/' in href or '/pornstars/' in href:
                    result.set_caption_visible(True)
                    caption = item.get('alt', href.rpartition('/')[2].strip('*').replace('-', ' ').title())
                result.add_thumb(ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=caption))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href']):
                result.add_control(ControlInfo(item.get('title', item.get('data', '')), URL(item['href'])))

        return result