Esempio n. 1
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page and build a ParseResult from it.

        Three rules are registered on a SiteParser and the file is fed to
        the parser line by line. The result depends on which rule matched:

        * 'pictures' -- the ``gallery-thumbs`` rule collected images; every
          image becomes a FullPictureInfo and the method returns at once.
        * 'hrefs' -- the thumbnail rule matched; thumbnails and pagination
          links are added.

        If neither rule matched, an untouched ParseResult is returned.

        :param fname: path of the file to parse.
        :param base_url: URL whose ``domain()`` is prepended to pagination
            hrefs.
        :return: the ParseResult built from the page.
        """
        parser = SiteParser()

        # Listing thumbnails: <a href> plus <img src alt> inside the
        # thumbs200 / thumbs300 blocks.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs200'),
                                                ('div', 'class', 'thumbs300')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', self.get_href)
        parser.add_rule(startpage_rule)

        # Pagination links inside the menu block; only hrefs containing
        # '/st/' are kept, and the domain of base_url is prepended.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'menu')
                                                      ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        startpage_pages_rule.set_attribute_filter_function(
            'href', lambda txt: '/st/' in txt)
        parser.add_rule(startpage_pages_rule)

        # Gallery images; the src is rewritten by process_picture_address.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class',
                                               'gallery-thumbs')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function(
            'src', self.process_picture_address)
        parser.add_rule(picture_rule)

        # The context manager closes the file even if the parser raises;
        # the bare open() used previously leaked the handle.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        pictures = picture_rule.get_result()
        if len(pictures) > 0:
            result.set_type('pictures')
            for f in pictures:
                # The file name is whatever follows the last '/' of the src.
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['src']),
                                    rel_name=f['src'].rpartition('/')[2]))
            return result

        thumbs = startpage_rule.get_result()
        if len(thumbs) > 0:
            result.set_type('hrefs')
            for item in thumbs:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 2
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a video or a listing ParseResult.

        Rules are registered on a SiteParser and the file is handed to
        ``self.proceed_parcing``. Afterwards:

        * If the player script rule matched, video sources are extracted
          either from an inline ``sources:[{...}]`` JSON fragment or, failing
          that, from a secondary script URL that is downloaded with
          ``load``. The result type is 'video' and links from the gallery
          href rule are added as controls. When no source could be
          extracted an empty ParseResult is returned.
        * Otherwise, if the thumbnail or channel rule matched, the result
          type is 'hrefs' with thumbnails, pagination pages and navigation
          controls.

        :param fname: path of the file to parse.
        :param base_url: URL passed to ``self.get_href`` when resolving
            href/src attributes.
        :return: the ParseResult built from the page.
        """
        parser = SiteParser()

        # Video thumbnails on listing pages.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'video')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        # Channel thumbnails; '*' in the resolved href is replaced by '/'.
        channels_rule = ParserRule()
        channels_rule.add_activate_rule_level([('ul', 'class', 'channels')])
        channels_rule.add_process_rule_level('a', {'href', 'title'})
        channels_rule.add_process_rule_level('div', {})
        channels_rule.add_process_rule_level('img', {'src', 'alt'})
        channels_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url).replace('*', '/'))
        channels_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(channels_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('ul', 'class', 'pagination pagination-lg')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        # Side navigation links.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('ul', 'class', 'nav nav-stacked navigation')
        ])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)

        # Inline player script: only scripts mentioning 'jwplayer' are kept.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'jwplayer' in text)
        parser.add_rule(video_rule)

        # External script whose src points at the site's own host; it is
        # downloaded later when the inline script has no JSON source list.
        video2_rule = ParserRule()
        video2_rule.add_activate_rule_level([('div', 'id', 'video')])
        video2_rule.add_process_rule_level('script', {'src'})
        video2_rule.set_attribute_filter_function(
            'src', lambda text: 'pornbraze.com/' in text)
        video2_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(video2_rule)

        # Links shown next to a video (turned into controls below).
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('div', 'class', 'col-xs-12 col-sm-12 col-md-12')
        ])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():

            urls = list()
            for item in video_rule.get_result():
                # Spaces are stripped so the marker checks below do not
                # depend on the script's formatting.
                script = item['data'].replace(' ', '')
                if 'sources:[{' in script:
                    # self.quotes presumably returns the text between the
                    # two markers; the brackets are re-added to form JSON.
                    txt = '[{' + self.quotes(script, 'sources:[{',
                                             '}]') + '}]'
                    j = json.loads(txt)
                    for j_data in j:
                        # Compare by value: "is not ''" tested object
                        # identity, which is implementation-dependent and
                        # raises a SyntaxWarning on Python 3.8+.
                        if j_data['file'] != '':
                            data = dict(text=j_data['label'],
                                        url=URL(j_data['file'] + '*'))
                            urls.append(data)
                elif 'sources:' in script:
                    if video2_rule.is_result(['src']):
                        php_url = URL(
                            video2_rule.get_result(['src'])[0]['src'])
                        res = load(php_url)
                        bitrates = self.quotes(res.text, "'bitrates':[{",
                                               "}]").split('},{')
                        for line in bitrates:
                            # NOTE(review): looks like a leftover debug
                            # print; kept so stdout output is unchanged.
                            print(line)
                            video_url = self.quotes(line, "'file':'", "'")
                            label = self.quotes(line, 'label:"', '"')
                            data = dict(text=label, url=URL(video_url + '*'))
                            urls.append(data)

            if not urls:
                return result

            # The first source is the default; with several sources every
            # one of them (the first included) is registered as alternate.
            video = MediaData(urls[0]['url'])
            if len(urls) > 1:
                for item in urls:
                    video.add_alternate(item)

            result.set_type('video')
            result.set_video(video)

            for f in gallery_href_rule.get_result(['data', 'href']):
                href = f['href'].replace('*', '/')
                label = f['data']
                if '/users/' in href:
                    # User links are pointed at the user's public videos
                    # and the label is wrapped in double quotes.
                    href = href + '/videos/public/'
                    label = '"' + label + '"'

                result.add_control(ControlInfo(label, URL(href)))

            return result

        if startpage_rule.is_result() or channels_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

            for item in channels_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result():
                # The label is the last path segment of the href.
                label = item['href'].strip('*/').rpartition('/')[2]
                result.add_control(ControlInfo(label, URL(item['href'])))

        return result
Esempio n. 3
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page and build a ParseResult from it.

        Rules are registered on a SiteParser and the file is fed to the
        parser line by line. Afterwards:

        * If the player script rule matched, the result is a 'video' built
          from ``self.get_attr_from_script`` applied to the first matching
          script, with crumb/tag links as controls; the method returns at
          once.
        * If the thumbnail rule matched, the type is set to 'hrefs' and
          thumbnails, pages and controls are added.
        * If the picture rule matched, the type is set to 'pictures' and
          the pictures plus crumb/tag links are added. This check also runs
          after the 'hrefs' branch, so it can override the type.

        :param fname: path of the file to parse.
        :param base_url: URL whose ``domain()`` is prepended to pagination
            hrefs.
        :return: the ParseResult built from the page.
        """
        parser = SiteParser()

        # Listing thumbnails: <a href> plus <img src alt>.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs'),
                                                ('div', 'class', 'content_box domain2'),
                                                ('div', 'class', 'video_list'),
                                                ('div', 'class', 'video_list_models'),
                                                ('div', 'class', 'pics_list'),
                                                ('div', 'class', 'movie_thumbs')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', self.get_href)
        parser.add_rule(startpage_rule)

        # Pagination links; the domain of base_url is prepended.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pages'),
                                                      ('div', 'class', 'pg')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        # Drop-down menu links; only hrefs containing '/st/' are kept.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('li', 'class', 'orange dropdown')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_filter_function('href', lambda txt: '/st/' in txt)
        parser.add_rule(startpage_hrefs_rule)

        # Player script: only scripts containing 'video_url:' are kept.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'video_url:' in text)
        parser.add_rule(video_rule)

        # Gallery images; 't.jpg' in the src is replaced by '.jpg'.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'thumb_box')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function('src', lambda text: text.replace('t.jpg', '.jpg'))
        parser.add_rule(picture_rule)

        # Crumb and tag links shown next to a video or gallery.
        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class', 'crumbles'),
                                                   ('div', 'class', 'tags')])
        picture_href_rule.add_process_rule_level('a', {'href', 'title'})
        parser.add_rule(picture_href_rule)

        # The context manager closes the file even if the parser raises;
        # the bare open() used previously leaked the handle.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if video_rule.is_result():

            video = MediaData(URL(self.get_attr_from_script(video_rule.get_result()[0]['data'])))
            result.set_video(video)
            result.set_type('video')

            for f in picture_href_rule.get_result():
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'], URL(item['href'])))

        # No return above: a page matching both the thumbnail and picture
        # rules ends up typed 'pictures'.
        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for f in picture_rule.get_result():
                result.add_full(FullPictureInfo(rel_name=f['src']))

            for f in picture_href_rule.get_result():
                result.add_control(ControlInfo(f['title'], URL(f['href'])))

        return result
Esempio n. 4
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a video, picture or listing ParseResult.

        Rules are registered on a SiteParser and the file is handed to
        ``self.proceed_parcing``. The first matching case wins:

        1. player script found -> type 'video', plus uploader/category
           controls;
        2. zoom-gallery links found -> type 'pictures', plus the same
           controls;
        3. listing thumbnails found -> type 'hrefs' with thumbs, pages and
           alphabet controls.

        When nothing matched an empty ParseResult is returned.

        :param fname: path of the file to parse.
        :param base_url: URL passed to ``self.get_href`` and used to derive
            the gallery directory.
        :return: the ParseResult built from the page.
        """

        def to_absolute(link):
            # Shared href/src modifier: resolve through get_href.
            return self.get_href(link, base_url)

        def from_members_area(link):
            return '/members/' in link

        def mentions_video_url(text):
            return 'video_url' in text

        parser = SiteParser()

        # Listing thumbnails.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'list_videos'),
            ('div', 'class', 'list_albums'),
            ('div', 'class', 'list_videos model-girls-list'),
            ('div', 'class', 'list_videos list_channel'),
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', to_absolute)
        startpage_rule.set_attribute_modifier_function('src', to_absolute)
        parser.add_rule(startpage_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level(
            [('div', 'class', 'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', to_absolute)
        parser.add_rule(startpage_pages_rule)

        # Alphabet index links.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level(
            [('div', 'class', 'model-alpha')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', to_absolute)
        parser.add_rule(startpage_hrefs_rule)

        # Player script containing 'video_url'.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'vids')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', mentions_video_url)
        parser.add_rule(video_rule)

        # Category links shown with a video or album.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level(
            [('div', 'class', 'video-categories')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', to_absolute)
        parser.add_rule(gallery_href_rule)

        # Uploader link; hrefs are left unmodified and must contain
        # '/members/'.
        gallery_user_rule = ParserRule(collect_data=True)
        gallery_user_rule.add_activate_rule_level(
            [('div', 'class', 'video-added-info')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_filter_function(
            'href', from_members_area)
        parser.add_rule(gallery_user_rule)

        # Full-size picture links of an album.
        photo_rule = ParserRule()
        photo_rule.add_activate_rule_level([('div', 'class', 'zoom-gallery')])
        photo_rule.add_process_rule_level('a', {'href'})
        photo_rule.set_attribute_modifier_function('href', to_absolute)
        parser.add_rule(photo_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        def attach_user_and_category_controls():
            # Uploader controls first (videos, then photos), then the
            # category links.
            if gallery_user_rule.is_result(['href']):
                for entry in gallery_user_rule.get_result(['href']):
                    # The name is the first word after 'Added by '.
                    caption = entry['data'].strip()
                    username = caption.partition('Added by ')[2]
                    username = username.partition(' ')[0]
                    if username == '':
                        continue
                    result.add_control(
                        ControlInfo('"' + username + ' videos"',
                                    URL(entry['href'] + 'public_videos/')))
                    result.add_control(
                        ControlInfo('"' + username + ' photos"',
                                    URL(entry['href'] + 'albums/')))

            for entry in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(entry['data'], URL(entry['href'])))

        if video_rule.is_result():
            # The address is the quoted value following "video_url:'" in
            # the first matching script, with all spaces removed.
            compact = video_rule.get_result()[0]['data'].replace(' ', '')
            after_marker = compact.partition("video_url:'")[2]
            address = after_marker.partition("'")[0]

            video = MediaData(URL(address))
            result.set_type('video')
            result.set_video(video)

            attach_user_and_category_controls()
            return result

        if photo_rule.is_result():
            result.set_type('pictures')
            # Gallery directory: base path plus the last segment of
            # base_url, with a trailing slash.
            gallery_dir = (base_url.get_path(base=Setting.base_dir)
                           + base_url.get().rpartition('/')[2]
                           + '/')
            result.set_gallery_path(gallery_dir)

            for entry in photo_rule.get_result():
                file_name = entry['href'].rpartition('/')[2].strip('*')
                picture = FullPictureInfo(abs_href=URL(entry['href']),
                                          rel_name=file_name)
                picture.set_base(gallery_dir)
                result.add_full(picture)

            attach_user_and_category_controls()
            return result

        if not startpage_rule.is_result():
            return result

        result.set_type('hrefs')

        for entry in startpage_rule.get_result(['href']):
            thumb = ThumbInfo(thumb_url=URL(entry['src']),
                              href=URL(entry['href']),
                              popup=entry.get('alt', ''))
            result.add_thumb(thumb)

        for entry in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        if len(startpage_hrefs_rule.get_result(['href'])) > 0:
            for entry in startpage_hrefs_rule.get_result(['href', 'data']):
                caption = entry.get('data', entry.get('title', ''))
                result.add_control(ControlInfo(caption, URL(entry['href'])))

        return result
Esempio n. 5
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a video, slideshow or listing ParseResult.

        Rules are registered on a SiteParser and the file is handed to
        ``self.proceed_parcing``. The first matching case wins:

        1. a stream script or <video><source> tags were found -> the
           collected file URLs are put into a UrlList and set as the video;
        2. slideshow images were found -> type 'pictures';
        3. listing thumbnails were found -> thumbs, pages and channel
           controls are added.

        When nothing matched an empty ParseResult is returned.

        :param fname: path of the file to parse.
        :param base_url: URL used when resolving href attributes.
        :return: the ParseResult built from the page.
        """

        def via_self(link):
            return self.get_href(link, base_url)

        def via_util(link):
            return util.get_href(link, base_url)

        def with_star(address):
            return address + '*'

        def full_size(address):
            return address.replace('/thumbs/', '/')

        def has_shows(text):
            return 'shows:' in text

        parser = SiteParser()

        # Listing thumbnails.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('span', 'class', 'thumb_container_box short'),
            ('span', 'class', 'thumb_container_box long'),
        ])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src'})
        startpage_rule.set_attribute_modifier_function('href', via_self)
        parser.add_rule(startpage_rule)

        # Pagination links (resolved through util.get_href).
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level(
            [('div', 'class', 'main-sectionpaging')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', via_util)
        parser.add_rule(startpage_pages_rule)

        # Channel list links; hrefs are left unmodified.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('ul', 'class', 'simple-list simple-list--channels'),
        ])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        parser.add_rule(startpage_hrefs_rule)

        # <source> tags inside <video>; '*' is appended to every src.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('video', '', '')])
        video_rule.add_process_rule_level('source', {'src', 'id'})
        video_rule.set_attribute_modifier_function('src', with_star)
        parser.add_rule(video_rule)

        # Scripts in <body> that contain 'shows:'.
        video_script_rule = ParserRule()
        video_script_rule.add_activate_rule_level([('body', '', '')])
        video_script_rule.add_process_rule_level('script', {})
        video_script_rule.set_attribute_filter_function('data', has_shows)
        parser.add_rule(video_script_rule)

        # Slideshow images; '/thumbs/' is removed from the src.
        gallery_rule = ParserRule()
        gallery_rule.add_activate_rule_level([('div', 'id', 'slideshow')])
        gallery_rule.add_process_rule_level('a', {'index'})
        gallery_rule.add_process_rule_level('img', {'src'})
        gallery_rule.set_attribute_modifier_function('src', full_size)
        parser.add_rule(gallery_rule)

        # Links in the 'added' block, used as controls.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'added')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', via_util)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_script_rule.is_result() or video_rule.is_result():
            # NOTE(review): no set_type call is made in this branch --
            # confirm that set_video or ParseResult's default covers it.
            candidates = set()
            preferred = None

            for entry in video_script_rule.get_result():
                compact = entry['data'].replace(' ', '')
                remaining = up.unquote(
                    self.quotes(compact, '"streams":[', ']'))
                # Consume one '"file":"..."' value per iteration.
                while '"file":"' in remaining:
                    after_key = remaining.partition('"file":"')[2]
                    address, _, remaining = after_key.partition('"')
                    candidates.add(address + '*')

            for entry in video_rule.get_result():
                candidates.add(entry['src'])
                # A <source> without an id becomes the default stream.
                if 'id' not in entry:
                    preferred = entry['src']

            if not candidates:
                return result

            if preferred is None:
                preferred = candidates.pop()
            else:
                candidates.discard(preferred)

            urls = UrlList()
            urls.add('Default', URL(preferred))
            for address in candidates:
                # Label: the four characters before the final one.
                urls.add(address[-5:-1], URL(address))

            result.set_video(urls.get_media_data())

            for entry in gallery_href_rule.get_result(['href']):
                caption = entry['data'].strip()
                result.add_control(ControlInfo(caption, URL(entry['href'])))

            return result

        if gallery_rule.is_result():
            result.set_type('pictures')
            # The gallery directory is derived from the first image URL.
            first_url = URL(gallery_rule.get_result()[0]['src'] + '*')
            gallery_dir = first_url.get_path(base=Setting.base_dir)
            result.set_gallery_path(gallery_dir)

            for entry in gallery_rule.get_result():
                index = int(entry['index'])
                picture = FullPictureInfo(
                    abs_href=URL(entry['src'] + '*'),
                    rel_name='pic{0:03d}.jpg'.format(index))
                picture.set_base(gallery_dir)
                result.add_full(picture)

            for entry in gallery_href_rule.get_result(['href']):
                caption = entry['data'].strip()
                result.add_control(ControlInfo(caption, URL(entry['href'])))

            return result

        if not startpage_rule.is_result():
            return result

        # NOTE(review): no set_type call here either -- confirm intended.
        for entry in startpage_rule.get_result(['href']):
            thumb = ThumbInfo(thumb_url=URL(entry['src']),
                              href=URL(entry['href']),
                              popup=entry.get('title', ''))
            result.add_thumb(thumb)

        for entry in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        if len(startpage_hrefs_rule.get_result(['href'])) > 0:
            for entry in startpage_hrefs_rule.get_result(['href', 'data']):
                caption = entry.get('title', entry.get('data', ''))
                result.add_control(ControlInfo(caption, URL(entry['href'])))

        return result
Esempio n. 6
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a video, picture or listing ParseResult.

        Rules are registered on a SiteParser and the file is handed to
        ``self.proceed_parcing``. The first matching case wins:

        1. player script found -> type 'video' built from its
           ``var video_<label>=<url>`` statements, plus uploader and
           category controls (empty result if no such statement exists);
        2. photo links found -> type 'pictures';
        3. listing thumbnails found -> type 'hrefs' with thumbs, pages and
           button-group controls.

        When nothing matched an empty ParseResult is returned.

        :param fname: path of the file to parse.
        :param base_url: URL passed to ``self.get_href`` and used to derive
            the gallery directory.
        :return: the ParseResult built from the page.
        """

        def to_absolute(link):
            return self.get_href(link, base_url)

        def to_user_videos(link):
            return self.get_href(link + '/videos', base_url)

        parser = SiteParser()

        # Listing thumbnails; 'data-original' is collected as well.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'well wellov well-sm'),
            ('div', 'class', 'col-sm-4 m-t-15'),
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level(
            'img', {'src', 'alt', 'data-original'})
        startpage_rule.set_attribute_modifier_function('href', to_absolute)
        parser.add_rule(startpage_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level(
            [('ul', 'class', 'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', to_absolute)
        parser.add_rule(startpage_pages_rule)

        # Button-group links.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level(
            [('div', 'class', 'btn-group')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', to_absolute)
        parser.add_rule(startpage_hrefs_rule)

        # Player script mentioning 'nuevoplayer'.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'container')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'nuevoplayer' in text)
        parser.add_rule(video_rule)

        # Category menu links.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level(
            [('div', 'class', 'm-t-10 overflow-hidden catmenu')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', to_absolute)
        parser.add_rule(gallery_href_rule)

        # Uploader link: '/videos' is appended before resolving, and only
        # hrefs containing '/user/' are kept.
        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level(
            [('div', 'class', 'pull-left user-container')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', to_user_videos)
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda link: '/user/' in link)
        parser.add_rule(gallery_user_rule)

        # Picture links; only hrefs containing '/photos/' are kept.
        photo_rule = ParserRule()
        photo_rule.add_activate_rule_level([('div', 'class', 'panel-body')])
        photo_rule.add_process_rule_level('a', {'href'})
        photo_rule.set_attribute_filter_function(
            'href', lambda text: '/photos/' in text)
        photo_rule.set_attribute_modifier_function('href', to_absolute)
        parser.add_rule(photo_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            player_script = video_rule.get_result()[0]['data']

            sources = []
            for statement in player_script.split(';'):
                if 'var video' not in statement:
                    continue
                # '... video_<label>=<quoted url>' -> label and url.
                assignment = statement.strip().partition('video_')[2]
                label, _, quoted = assignment.partition('=')
                sources.append(
                    dict(text=label, url=URL(quoted.strip("'") + '*')))

            if not sources:
                return result

            # The last source is the default (with one source, last and
            # first coincide); alternates are only registered when there
            # are several.
            video = MediaData(sources[-1]['url'])
            if len(sources) > 1:
                for source in sources:
                    video.add_alternate(source)

            result.set_type('video')
            result.set_video(video)

            if gallery_user_rule.is_result():
                for entry in gallery_user_rule.get_result():
                    username = entry['data'].strip()
                    if username == '':
                        continue
                    result.add_control(
                        ControlInfo('"' + username + '"', URL(entry['href'])))

            for entry in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(entry['data'], URL(entry['href'])))
            return result

        if photo_rule.is_result():
            result.set_type('pictures')
            gallery_dir = base_url.get_path(base=Setting.base_dir)
            result.set_gallery_path(gallery_dir)

            for entry in photo_rule.get_result():
                file_name = entry['href'].rpartition('/')[2].strip('*')
                picture = FullPictureInfo(abs_href=URL(entry['href']),
                                          rel_name=file_name)
                picture.set_base(gallery_dir)
                result.add_full(picture)

            return result

        if not startpage_rule.is_result():
            return result

        result.set_type('hrefs')

        for entry in startpage_rule.get_result(['href']):
            # 'data-original' wins over 'src'; 'src' is still read first,
            # exactly as a dict.get default would evaluate it.
            plain_src = entry['src']
            thumb_address = self.get_href(
                entry.get('data-original', plain_src), base_url)

            target = entry['href']
            if '/album/' in target:
                # Album links are redirected to the slideshow view.
                target = target.replace('/album/', '/album/slideshow/')

            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb_address),
                          href=URL(target),
                          popup=entry.get('alt', '')))

        for entry in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        if len(startpage_hrefs_rule.get_result(['href'])) > 0:
            for entry in startpage_hrefs_rule.get_result(['href', 'data']):
                caption = entry.get('title', entry.get('data', ''))
                result.add_control(ControlInfo(caption, URL(entry['href'])))

        return result
Esempio n. 7
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and describe it as a ParseResult.

        Parser rules are registered for thumbnails, pager entries, channel
        thumbnails, channel categories, the player script and video tags;
        the file is then processed by ``self.proceed_parcing``.  The first
        rule group that produced data decides the result:

        * player script found -> 'video' result, tag links become controls;
        * channel thumbnails found -> 'hrefs' result plus category pages;
        * ordinary thumbnails found -> 'hrefs' result plus pager pages;
        * nothing found -> the empty ParseResult is returned.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page; passed to ``self.get_href`` for
            every collected link and used as the base address for the
            generated pager/category links.
        :return: the populated (possibly empty) ParseResult.
        """
        parser = SiteParser()

        def absolute(href):
            # Shared 'href' modifier for every rule below.
            return self.get_href(href, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('ul', 'class', 'thumbs'),
                                                ('li', 'class', 'category')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img',
                                              {'src', 'alt', 'data-original'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_rule)

        # Pager entries are <span> elements carrying a query key/value pair
        # instead of a ready-made link.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pager paging')])
        startpage_pages_rule.add_process_rule_level(
            'span', {'data-query-key', 'data-query-value'})
        parser.add_rule(startpage_pages_rule)

        channels_rule = ParserRule()
        channels_rule.add_activate_rule_level([('ul', 'class', 'tag-150-list')
                                               ])
        channels_rule.add_process_rule_level('a', {'href'})
        channels_rule.add_process_rule_level('img', {'src'})
        channels_rule.set_attribute_filter_function(
            'href', lambda x: '/channel/' in x or '/prime/' in x)
        channels_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(channels_rule)

        channel_categories_rule = ParserRule()
        channel_categories_rule.add_activate_rule_level([
            ('ul', 'class', 'link-tag-list long-col')
        ])
        channel_categories_rule.add_process_rule_level(
            'span', {'data-query-key', 'data-query-value'})
        parser.add_rule(channel_categories_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class',
                                             'player-container')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'players.push' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('ul', 'class',
                                                    'video-tag-list')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            # Drop spaces and backslashes so the "'sources':{...}" literal
            # can be cut out of the script with plain string operations.
            script = video_rule.get_result()[0]['data']
            script = script.replace(' ', '').replace('\\', '')
            sources = script.partition("'sources':{")[2].partition(
                '}')[0].split(',')

            urls = list()
            for item in sources:
                part = item.strip("\n\t'").partition("':'")
                # NOTE(review): only plain http:// sources are accepted;
                # https:// entries are skipped -- confirm this is intended.
                if part[2].startswith('http://'):
                    urls.append(dict(text=part[0],
                                     url=URL(part[2].strip("'") + '*')))

            if not urls:
                return result

            # The last listed source is used as the main one; with several
            # sources every one of them is also registered as an alternate.
            video = MediaData(urls[-1]['url'])
            if len(urls) > 1:
                for item in urls:
                    video.add_alternate(item)

            result.set_type('video')
            result.set_video(video)

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(f['data'].strip(), URL(f['href'])))
            return result

        def add_key(old, key, value):
            """Return address ``old`` with query parameter ``key`` set to ``value``.

            An existing ``key=...`` pair is replaced in place, otherwise the
            pair is appended.  The parameter name is compared exactly, so
            setting ``page`` leaves a pair such as ``pagesize=10`` untouched.
            """
            addr, _, query = old.partition('?')
            replacement = key + '=' + value
            pairs = []
            found = False
            for pair in query.split('&'):
                if pair.partition('=')[0] == key:
                    pairs.append(replacement)
                    found = True
                else:
                    pairs.append(pair)

            if not found:
                pairs.append(replacement)
            # strip('&') removes the separators left over by empty pairs at
            # either end (e.g. when ``old`` had no query string at all).
            return addr + '?' + '&'.join(pairs).strip('&')

        def add_pages_info_to_result(rule, description_key='data-query-value'):
            """Add one page link to ``result`` per key/value item of ``rule``."""
            for item in rule.get_result(['data-query-key',
                                         'data-query-value']):
                key = item['data-query-key']
                val = item['data-query-value']
                description = item[description_key].strip('\t')
                addr = add_key(base_url.get(), key, val)

                result.add_page(ControlInfo(description, URL(addr + '*')))

        if channels_rule.is_result():
            result.set_type('hrefs')

            for item in channels_rule.get_result():
                # The last path segment of the link serves as popup text.
                info = item['href'].rpartition('/')[2].strip('*')
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=info))

            add_pages_info_to_result(channel_categories_rule,
                                     description_key='data')

            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                # Prefer the 'data-original' address when the image has one.
                t_url = item.get('data-original', item['src'])
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(t_url),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            add_pages_info_to_result(startpage_pages_rule)

        return result
Esempio n. 8
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and describe it as a ParseResult.

        Rules are registered for thumbnails, pager links, category links,
        the jwplayer script and the video's category links; the file is then
        processed by ``self.proceed_parcing``.

        * If a player script was found, the result carries the video built
          from the extracted ``file:'...'`` values, with the category links
          as controls.
        * Otherwise, if thumbnails were found, the result is typed 'hrefs'
          and carries thumbnails, pager pages and category controls.
        * Otherwise the empty ParseResult is returned.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page; passed to ``self.get_href`` for
            every collected link.
        :return: the populated (possibly empty) ParseResult.
        """
        parser = SiteParser()

        def absolute(href):
            # Shared 'href' modifier for every link rule below.
            return self.get_href(href, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'videos_form')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'data-lazy-src'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'videos_page')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer(' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'Categories')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                # Spaces are removed first so the "file:'" marker matches
                # regardless of how the script was formatted.
                video_address = self.quotes(item['data'].replace(' ', ''), "file:'", "'")
                urls.add('default', URL(video_address + '*'))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(ThumbInfo(thumb_url=URL(item['data-lazy-src']),
                                           href=URL(item['href']),
                                           popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href']):
                href = item['href']
                # The label is the second-to-last '/'-separated part of the
                # link.
                label = href.split('/')[-2]
                result.add_control(ControlInfo(label, URL(href)))

        return result
Esempio n. 9
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and describe it as a ParseResult.

        A notice about the site's speed is printed first.  Rules are then
        registered for thumbnails, pager links, category links, the player
        script (identified by the text 'flashvars') and the links inside
        ``div.item`` blocks; the file is processed by
        ``self.proceed_parcing``.

        * If a player script was found, the result carries the video built
          from the extracted ``video_url:'...'`` values, with the collected
          item links as controls.
        * Otherwise, if thumbnails were found, the result carries
          thumbnails, pager pages and category controls.
        * Otherwise the empty ParseResult is returned.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page; passed to ``self.get_href`` for
            every collected link.
        :return: the populated (possibly empty) ParseResult.
        """
        print('This site is very slow, be patient')

        parser = SiteParser()

        def absolute(href):
            # Shared 'href' modifier for every link rule below.
            return self.get_href(href, base_url)

        startpage_rule = ParserRule()
        # The class value is matched with its two trailing spaces.
        startpage_rule.add_activate_rule_level([('div', 'class', 'item  ')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pagination-holder')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'flashvars' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'item')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                # Spaces are removed first so the "video_url:'" marker
                # matches regardless of how the script was formatted.
                video_address = self.quotes(item['data'].replace(' ', ''), "video_url:'", "'")
                urls.add('default', URL(video_address))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href']):
                href = item['href']
                # The label is the second-to-last '/'-separated part of the
                # link.
                label = href.split('/')[-2]
                result.add_control(ControlInfo(label, URL(href)))

        return result
Esempio n. 10
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and describe it as a ParseResult.

        Rules are registered for thumbnails, pager links, the jwplayer
        script and the links of the video's stats list; the file is then
        processed by ``self.proceed_parcing``.

        * If a player script was found, the result carries the video built
          from the extracted ``file:"..."`` values; the stats-list links
          become controls, with user links (``/user/`` in the address)
          shown in double quotes.
        * Otherwise, if thumbnails were found, the result carries
          thumbnails and pager pages.
        * Otherwise the empty ParseResult is returned.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page; passed to ``self.get_href`` for
            every collected link and thumbnail address.
        :return: the populated (possibly empty) ParseResult.
        """
        parser = SiteParser()

        def absolute(address):
            # Shared modifier for 'href' and 'src' attributes.
            return self.get_href(address, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'inner-block')
                                                ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        startpage_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'id',
                                                       'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'stage-video')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'jwplayer(' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('ul', 'class', 'stats-list stats-list--plain')
        ])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                # Spaces are removed first so the 'file:"' marker matches
                # regardless of how the script was formatted.
                video_address = self.quotes(item['data'].replace(' ', ''),
                                            'file:"', '"')
                urls.add('DEFAULT', URL(video_address))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                # User links are shown in double quotes.
                if '/user/' in f['href']:
                    form = '"{0}"'
                else:
                    form = '{0}'

                result.add_control(
                    ControlInfo(form.format(f['data']), URL(f['href'])))

            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                # Fall back to an empty popup when the image has no alt text
                # (a missing comma used to turn the default into part of the
                # key, yielding None instead).
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 11
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and describe it as a ParseResult.

        Five rules are registered (thumbnails, pager, player script, info
        links, uploader links) and the file is processed by
        ``self.proceed_parcing``.

        * If a player script was found, its ``flashvars`` object is split
          into key/value pairs and ``video_url`` becomes the video; uploader
          links (title in double quotes) and info links become controls.
        * Otherwise, if thumbnails were found, the result carries
          thumbnails and pager pages.
        * Otherwise the empty ParseResult is returned.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page; passed to ``self.get_href`` for
            every collected link.
        :return: the populated (possibly empty) ParseResult.
        """
        def absolute(address):
            # Modifier shared by the link rules below.
            return self.get_href(address, base_url)

        parser = SiteParser()

        thumbs = ParserRule()
        thumbs.add_activate_rule_level([
            ('div', 'class', 'item  '),
            ('div', 'class', 'list-categories'),
        ])
        thumbs.add_process_rule_level('a', {'href', 'title'})
        thumbs.add_process_rule_level('img', {'data-original', 'alt'})
        thumbs.set_attribute_modifier_function('href', absolute)
        # NOTE(review): 'src' is not among the attributes requested from
        # <img> above, so this modifier may never fire -- confirm whether
        # 'data-original' was meant.
        thumbs.set_attribute_modifier_function('src', absolute)
        parser.add_rule(thumbs)

        pager = ParserRule()
        pager.add_activate_rule_level([('div', 'class', 'pagination')])
        pager.add_process_rule_level('li', {'class'})
        pager.add_process_rule_level('a', {'href', 'data-query'})
        pager.set_attribute_filter_function(
            'class', lambda css_class: css_class == 'page')
        # Only the part of 'data-query' after the first ':' is kept.
        pager.set_attribute_modifier_function(
            'data-query', lambda query: query.partition(':')[2])
        pager.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pager)

        player = ParserRule()
        player.add_activate_rule_level([('div', 'class', 'player')])
        player.add_process_rule_level('script', {})
        player.set_attribute_filter_function(
            'data', lambda script: 'flashvars' in script)
        parser.add_rule(player)

        info_links = ParserRule()
        info_links.add_activate_rule_level([('div', 'class', 'info')])
        info_links.add_process_rule_level('a', {'href'})
        info_links.set_attribute_modifier_function('href', absolute)
        parser.add_rule(info_links)

        uploader_links = ParserRule()
        uploader_links.add_activate_rule_level([('div', 'class', 'block-user')])
        uploader_links.add_process_rule_level('a', {'href', 'title'})
        # Uploader links get 'videos/' appended before being resolved.
        uploader_links.set_attribute_modifier_function(
            'href', lambda address: self.get_href(address + 'videos/', base_url))
        parser.add_rule(uploader_links)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if player.is_result():
            sources = UrlList()
            for found in player.get_result():
                # Strip spaces, newlines and tabs so the flashvars object
                # can be cut out and split with plain string operations.
                compact = found['data'].replace(' ', '')
                compact = compact.replace('\n', '').replace('\t', '')
                body = self.quotes(compact, 'flashvars={', '};')

                flash_vars = {}
                for entry in body.split(','):
                    name, _, value = entry.partition(':')
                    flash_vars[name] = value.strip("'\"")

                sources.add('default', URL(flash_vars['video_url'] + '*'))

            result.set_video(sources.get_media_data())

            for link in uploader_links.get_result(['title', 'href']):
                quoted_title = '"' + link['title'].strip() + '"'
                result.add_control(ControlInfo(quoted_title, URL(link['href'])))

            for link in info_links.get_result(['data', 'href']):
                result.add_control(ControlInfo(link['data'], URL(link['href'])))

            return result

        if thumbs.is_result():
            for entry in thumbs.get_result(['href']):
                # Popup text: 'alt' if present, else 'title', else empty.
                caption = entry.get('alt', entry.get('title', ''))
                thumbnail = ThumbInfo(thumb_url=URL(entry['data-original']),
                                      href=URL(entry['href']),
                                      popup=caption)
                result.add_thumb(thumbnail)

            for entry in pager.get_result(['href', 'data-query']):
                page = ControlInfo(entry['data-query'], URL(entry['href']))
                result.add_page(page)

        return result
Esempio n. 12
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and describe it as a ParseResult.

        Rules are registered for thumbnails, pager links, the embed script,
        gallery pictures and category/model links; the file is then read as
        UTF-8 and fed to the parser line by line.

        * If the embed script was found, the result is typed 'video' with
          the address taken by ``self.get_attr_from_script`` and the
          category/model links as controls; it is returned at once.
        * If thumbnails were found, the result is typed 'hrefs' and gets
          thumbnails and pager pages.
        * If gallery pictures were found, the result is typed 'pictures'
          (this overrides 'hrefs' when both matched), gets an
          ELSitePictureCollector, the numbered pictures and the
          category/model links as controls.

        :param fname: path of the saved HTML file.
        :param base_url: URL of the page; its domain is prepended to pager
            links.
        :return: the populated (possibly empty) ParseResult.
        """
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'image-delete'),
                                                ('div', 'class', 'thumbs'),
                                                ('div', 'class', 'image')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pager')
                                                      ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'main')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'function getEmbed()' in text)
        parser.add_rule(video_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'thumbs2')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.add_process_rule_level('img', {'src'})
        parser.add_rule(picture_rule)

        picture_tags_rule = ParserRule()
        picture_tags_rule.add_activate_rule_level([('div', 'class', 'main')])
        picture_tags_rule.add_process_rule_level('a', {'href'})
        picture_tags_rule.set_attribute_filter_function(
            'href', lambda txt: '/categories/' in txt or '/model/' in txt)
        parser.add_rule(picture_tags_rule)

        # The context manager closes the file even if the parser raises.
        with open(fname, encoding='utf-8') as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(video_rule.get_result()) > 0:
            script = video_rule.get_result()[0]['data']
            result.set_video(MediaData(URL(self.get_attr_from_script(script))))
            result.set_type('video')

            for item in picture_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))
            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        # Deliberately not exclusive with the thumbnail branch above: a page
        # matching both rules ends up typed 'pictures'.
        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')

            result.set_picture_collector(ELSitePictureCollector())

            # Pictures are named by their 1-based position: 001.jpg, 002.jpg...
            for number, f in enumerate(picture_rule.get_result(), start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name='%03d.jpg' % number))

            for item in picture_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result
Esempio n. 13
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and describe it as a ParseResult.

        Rules are registered for gallery thumbnails, page numbers, the
        alphabetical model index, "more" links, full-size pictures and
        gallery-related links; the file is then fed to the parser line by
        line.

        * If links ending in '.jpg' were found, the result is typed
          'pictures' with the numbered pictures and the gallery-related
          links (labelled by their title) as controls; it is returned at
          once.
        * Otherwise, if thumbnails were found, the result is typed 'hrefs'
          with thumbnails, "more"/index links as controls and page numbers
          as pages.
        * Otherwise the empty ParseResult is returned.

        :param fname: path of the saved HTML file.
        :param base_url: URL of the page; passed to ``self.get_href`` for
            every collected address.
        :return: the populated (possibly empty) ParseResult.
        """
        parser = SiteParser()

        def absolute(address):
            # Shared modifier for 'href' and 'src' attributes.
            return self.get_href(address, base_url)

        href_rule = ParserRule()  # startpage & model's page
        href_rule.add_activate_rule_level([('div', 'id', 'lst-galleries'),
                                           ('div', 'class', 'lblock'),
                                           ('div', 'class', 'modal_info_full')
                                           ])
        href_rule.add_process_rule_level('a', {'href'})
        href_rule.add_process_rule_level('img', {'src', 'alt'})
        href_rule.set_attribute_modifier_function('href', absolute)
        href_rule.set_attribute_modifier_function('src', absolute)
        href_rule.set_attribute_filter_function('href', self.thumb_href_filter)
        href_rule.set_attribute_filter_function('src', self.thumb_src_filter)
        parser.add_rule(href_rule)

        href_page_rule = ParserRule()  # page number in model's page
        href_page_rule.add_activate_rule_level([('div', 'class', 'pages'),
                                                ('div', 'class', 'cat')])
        href_page_rule.add_process_rule_level('a', {'href'})
        href_page_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(href_page_rule)

        model_litera_rule = ParserRule()
        model_litera_rule.add_activate_rule_level([('div', 'class',
                                                    'babe_index')])
        model_litera_rule.add_process_rule_level('a', {'href'})
        model_litera_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(model_litera_rule)

        model_more_rule = ParserRule()
        model_more_rule.add_activate_rule_level([('div', 'class', 'more'),
                                                 ('div', 'id', 'MoreCont')])
        model_more_rule.add_process_rule_level('a', {'href'})
        model_more_rule.set_attribute_modifier_function('href', absolute)
        model_more_rule.set_attribute_filter_function('href',
                                                      self.thumb_href_filter)
        parser.add_rule(model_more_rule)

        picture_rule = ParserRule()  # gallery rule
        picture_rule.add_activate_rule_level([('div', 'class', 'lblock')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.add_process_rule_level('img', {'alt'})
        picture_rule.set_attribute_modifier_function('href', absolute)
        picture_rule.set_attribute_filter_function(
            'href', lambda x: x.endswith('.jpg'))
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()  # gallery href's rule
        picture_href_rule.add_activate_rule_level([('div', 'id', 'ModelMenu'),
                                                   ('div', 'class', 'lblock')])
        picture_href_rule.add_process_rule_level('a', {'href', 'title'})
        picture_href_rule.set_attribute_modifier_function('href', absolute)
        picture_href_rule.set_attribute_filter_function(
            'href', self.gal_href_filter)
        parser.add_rule(picture_href_rule)

        # The context manager closes the file even if the parser raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            # Pictures are named by their 1-based position: 001.jpg, 002.jpg...
            for number, f in enumerate(picture_rule.get_result(), start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name='%03d.jpg' % number))

            for f in picture_href_rule.get_result(['href', 'title']):
                result.add_control(
                    ControlInfo(text=f['title'], url=URL(f['href'])))
            return result

        if len(href_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in href_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in model_more_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))
            for item in model_litera_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))
            for item in href_page_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 14
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and build a ParseResult from it.

        Five parser rules are registered and the file is run through
        ``self.proceed_parcing``. What is returned depends on which rules
        produced results:

        * A ``<script>`` containing ``jwplayer("jw_video").setup`` was
          captured: its ``"sources"`` list is split into entries, the last
          entry is passed to ``MediaData`` as the main URL and, when there is
          more than one entry, every entry is added as an alternate. Links
          found under ``ul.stickerNav`` become controls; type is 'video'.
          If no usable source entry is found an empty result is returned.
        * Otherwise, thumbnails under ``div.thumb`` were captured: type is
          'hrefs' with thumbs, pagination pages and tag controls.
        * Otherwise the empty ParseResult is returned.

        :param fname: path of the downloaded page to parse.
        :param base_url: URL of the page; its ``domain()`` is prepended to
            every collected href.
        :return: the populated ParseResult.
        """
        parser = SiteParser()

        def with_domain(href):
            # Thumbnail links: domain + href, no suffix.
            return base_url.domain() + href

        def with_domain_starred(href):
            # Navigation links: domain + href with a trailing '*'.
            return base_url.domain() + href + '*'

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'data-original'})
        startpage_rule.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('ul', 'class',
                                                       'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', with_domain_starred)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('div', 'class',
                                                       'listTags listTags5')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', with_domain_starred)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('body', '', '')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'jwplayer("jw_video").setup' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('ul', 'class',
                                                    'stickerNav')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', with_domain_starred)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        video_scripts = video_rule.get_result()
        if len(video_scripts) > 0:
            script = video_scripts[0]['data']

            raw_sources = script.partition('"sources":[{')[2].partition(
                '}]')[0]
            # str.split() always returns at least one item (possibly ''), so
            # empty fragments are dropped here; otherwise a script without a
            # "sources" list would produce a video pointing at the URL '*'.
            sources = [chunk for chunk in raw_sources.split('},{') if chunk]

            def parce(txt):
                # Pull the "label" and "file" string values out of one entry.
                label = txt.partition('"label":"')[2].partition('"')[0]
                file = txt.partition('"file":"')[2].partition('"')[0]
                return dict(text=label, url=URL(file + '*'))

            if not sources:
                return result

            # The last listed source is used as the default video.
            video = MediaData(parce(sources[-1])['url'])
            if len(sources) > 1:
                for item in sources:
                    video.add_alternate(parce(item))

            result.set_type('video')
            result.set_video(video)

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href', 'data-original']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['data-original']),
                              href=URL(item['href']),
                              popup=item.get('title', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            # Iterating an empty result is a no-op, so no length guard needed.
            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 15
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and build a ParseResult from it.

        Seven parser rules are registered and the file is run through
        ``self.proceed_parcing``. What is returned depends on which rules
        produced results:

        * A ``<script>`` containing ``playerData.cdnPath`` was captured inside
          ``div#videoContainer``: every non-empty ``playerData.cdnPath...``
          assignment becomes a source. The last one is passed to
          ``MediaData`` as the main URL and, when there are several, each is
          added as an alternate. User, category and tag links are added as
          controls; type is 'video'. With no source an empty result is
          returned.
        * Otherwise, thumbnails under ``div.thmb-wrapper`` were captured:
          type is 'hrefs' with thumbs, pagination pages and tag controls.
        * Otherwise the empty ParseResult is returned.

        :param fname: path of the downloaded page to parse.
        :param base_url: URL of the page; its ``domain()`` is prepended to
            every collected href.
        :return: the populated ParseResult.
        """
        parser = SiteParser()

        def with_domain(href):
            # Thumbnail links: domain + href, no suffix.
            return base_url.domain() + href

        def with_domain_starred(href):
            # Navigation links: domain + href with a trailing '*'.
            return base_url.domain() + href + '*'

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'thmb-wrapper')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level(
            'img', {'src', 'alt', 'data-src', 'data-original'})
        startpage_rule.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'id',
                                                       'divPagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', with_domain_starred)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('div', 'class',
                                                       'listTags listTags5')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', with_domain_starred)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'videoContainer')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'playerData.cdnPath' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('div', 'class', 'video-info-tags float-left')
        ])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', with_domain_starred)
        parser.add_rule(gallery_href_rule)

        # The channel and user rules watch the same block and differ only in
        # which hrefs they keep.
        gallery_channel_rule = ParserRule()
        gallery_channel_rule.add_activate_rule_level([
            ('div', 'class', 'video-info-uploaded float-right')
        ])
        gallery_channel_rule.add_process_rule_level('a', {'href'})
        gallery_channel_rule.set_attribute_modifier_function(
            'href', with_domain_starred)
        gallery_channel_rule.set_attribute_filter_function(
            'href', lambda x: '/categories/' in x)
        parser.add_rule(gallery_channel_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([
            ('div', 'class', 'video-info-uploaded float-right')
        ])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', with_domain_starred)
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda x: '/user/' in x)
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            script = video_rule.get_result()[0]['data'].replace(' ', '')

            # Keep only the cdnPath assignments whose value is not the empty
            # string literal ''.
            data = [
                line.strip() for line in script.split('\n')
                if line.strip().startswith('playerData.cdnPath')
                and "''" not in line
            ]

            def parce(txt):
                # Label is whatever sits between 'playerData.cdnPath' and '='.
                label = txt.partition('playerData.cdnPath')[2].partition(
                    '=')[0]
                # The value is single-quoted; only these three escapes are
                # decoded.
                file = txt.partition("'")[2].partition("'")[0].replace(
                    '%3A', ':').replace('%2F', '/').replace('%26', '&')
                return dict(text=label, url=URL(file + '*'))

            if not data:
                return result

            # The last listed source is used as the default video.
            video = MediaData(parce(data[-1])['url'])
            if len(data) > 1:
                for item in data:
                    video.add_alternate(parce(item))

            result.set_type('video')
            result.set_video(video)

            for f in gallery_user_rule.get_result(['data', 'href']):
                # User names are shown in double quotes.
                result.add_control(
                    ControlInfo('"' + f['data'] + '"', URL(f['href'])))

            for f in gallery_channel_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                # The thumbnail address may sit in any of these attributes;
                # the first one present wins.
                for key in ('data-src', 'data-original', 'src'):
                    if key in item:
                        src = item[key]
                        break
                else:
                    print('New key found. Need rewrite "startpage_rule"')
                    continue
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(src),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            # Iterating an empty result is a no-op, so no length guard needed.
            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 16
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and build a ParseResult from it.

        Four parser rules are registered and the file is run through
        ``self.proceed_parcing``:

        * If a ``<script>`` mentioning ``flashvars`` was captured inside
          ``div.player``, its ``flashvars={...};`` body is split into
          ``key:value`` pairs. Every value that starts with ``http://`` and
          ends in ``.mp4`` is collected under the label ``<key>_text`` (or
          the key itself when that entry is absent) and added to a UrlList
          in descending label order. Links from ``div.info`` become
          controls.
        * Otherwise, if thumbnails were captured, thumbs and pagination
          pages are added.

        :param fname: path of the downloaded page to parse.
        :param base_url: URL of the page, handed to ``self.get_href``.
        :return: the populated ParseResult.
        """

        def absolute(href):
            # presumably resolves href against base_url; see self.get_href
            return self.get_href(href, base_url)

        parser = SiteParser()

        # Thumbnails of the index page.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'item  ')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'data-original', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        startpage_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(startpage_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagination-holder')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        # Player script of a video page.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'flashvars' in text)
        parser.add_rule(video_rule)

        # Links shown next to the video.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'info')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()

            for script in video_rule.get_result():
                compact = script['data'].replace(' ', '')
                compact = compact.replace('\n', '').replace('\t', '')
                pairs = self.quotes(compact, 'flashvars={', '};').split(',')

                # key -> value with surrounding quotes removed
                fv = {}
                for pair in pairs:
                    name, _, value = pair.partition(':')
                    fv[name] = value.strip("'\"")

                # label -> mp4 address
                files = {}
                for name, value in fv.items():
                    is_http = value.startswith('http://')
                    if is_http and value.rpartition('.')[2].strip('/') == 'mp4':
                        files[fv.get(name + '_text', name)] = value

                for label in sorted(files, reverse=True):
                    urls.add(label, URL(files[label]))

            result.set_video(urls.get_media_data())

            for link in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(link['data'],
                                               URL(link['href'])))

            return result

        if startpage_rule.is_result():

            for thumb in startpage_rule.get_result(['href']):
                # Fall back from 'alt' to 'title' to an empty popup text.
                popup_text = thumb.get('alt', thumb.get('title', ''))
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(thumb['data-original']),
                              href=URL(thumb['href']),
                              popup=popup_text))

            for page in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(page['data'], URL(page['href'])))

        return result
Esempio n. 17
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page or JSON response ``fname`` into a ParseResult.

        When ``base_url`` contains ``format=json`` the file is treated as a
        JSON (XHR) response and is not fed to the HTML parser. Otherwise
        four parser rules are applied through ``self.proceed_parcing``.

        * Video page (a ``flashvars=`` script was captured): every
          ``quality_...`` entry of the flashvars is added to a UrlList, and
          user and tag links become controls.
        * JSON response: thumbs are read from the item lists and, once per
          call, page buttons are generated from the navigation block. A
          ``ValueError`` raised while loading or processing the JSON is
          silently ignored.
        * Index page: thumbs are added plus a single 'next' page whose URL
          requests page 2 as JSON.

        :param fname: path of the downloaded file.
        :param base_url: URL the file was loaded from.
        :return: the populated ParseResult.
        """
        xhr_page = True if base_url.contain('format=json') else False

        def absolute(href):
            # presumably resolves href against base_url; see self.get_href
            return self.get_href(href, base_url)

        parser = SiteParser()

        # Thumbnails of the index page.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class',
             'relative margin-auto video-box-wrapper normal-box')
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'alt', 'data-srcmedium'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_rule)

        # Player script of a video page.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class',
                                             'video-container')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'flashvars=' in text.replace(' ', ''))
        parser.add_rule(video_rule)

        # Links in the video info box, javascript pseudo-links excluded.
        video_href_rule = ParserRule()
        video_href_rule.add_activate_rule_level([('div', 'class',
                                                  'ibInfo js_ibInfo')])
        video_href_rule.add_process_rule_level('a', {'href'})
        video_href_rule.set_attribute_filter_function(
            'href', lambda x: 'javascript' not in x)
        video_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(video_href_rule)

        # Uploader link.
        video_user_rule = ParserRule()
        video_user_rule.add_activate_rule_level([('div', 'class', 'ibLine1')])
        video_user_rule.add_process_rule_level('a', {'href'})
        video_user_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(video_user_rule)

        if not xhr_page:
            self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for script in video_rule.get_result():
                compact = script['data'].replace(' ', '')
                body = self.quotes(compact, 'flashvars={', '};')
                for entry in body.replace('\\', '').split(',"'):
                    if not entry.startswith('quality_'):
                        continue
                    label = self.quotes(entry, 'quality_', '"')
                    file = self.quotes(entry, ':"', '"')
                    urls.add(label, URL(file + '*'))

            result.set_video(urls.get_media_data(-1))

            for link in video_user_rule.get_result(['data', 'href']):
                # User names are shown in double quotes.
                result.add_control(
                    ControlInfo('"' + link['data'] + '"', URL(link['href'])))

            for link in video_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(link['data'],
                                               URL(link['href'])))

            return result

        if xhr_page:

            def collect(data, buttons_added, base_url, items_name='items'):
                # Add the thumbs of one JSON section to ``result`` and, if
                # not done yet, the page buttons. Returns the updated flag.
                xhr_data = base_url.xhr_data
                entries = data[items_name]
                nav = data['navigation']
                last_page = nav['lastPage']
                if last_page is None:
                    last_page = 1
                curr_page = int(base_url.get().rpartition('page=')[2])
                pattern = nav['urlPattern'].replace('[%pageId%]', '{}') + '*'

                for entry in entries:
                    thumb_url = entry['thumb_url']
                    title = entry['specialchars_title']
                    link = entry['video_link']
                    result.add_thumb(
                        ThumbInfo(thumb_url=URL(thumb_url),
                                  href=URL(link + '*'),
                                  popup=title))

                if buttons_added:
                    return buttons_added

                # First page always points back at the original HTML page.
                result.add_page(ControlInfo('1', xhr_data['base_url']))

                # Window of pages around the current one, never below 2 and
                # stopping before the last page (added separately below).
                first = max(curr_page - 5, 2)
                stop = min(curr_page + 5, last_page)

                for number in range(first, stop):
                    page_url = URL(pattern.format(number), xhr_data=xhr_data)
                    if number == curr_page:
                        caption = str(number) + '(this)'
                    else:
                        caption = str(number)
                    result.add_page(ControlInfo(caption, page_url))

                last_url = URL(pattern.format(last_page), xhr_data=xhr_data)
                result.add_page(ControlInfo(str(last_page), last_url))
                return True

            with open(fname) as fp:
                try:
                    json_data = json.load(fp)
                    buttons_added = False
                    if base_url.contain('/keyword/'):
                        # A list of sections.
                        for section in json_data:
                            buttons_added = collect(section, buttons_added,
                                                    base_url)
                    elif base_url.contain('/users/'):
                        # A single section under 'response', items in 'videos'.
                        collect(json_data['response'],
                                buttons_added,
                                base_url,
                                items_name='videos')
                    else:
                        # A mapping of sections.
                        for key in json_data:
                            buttons_added = collect(json_data[key],
                                                    buttons_added, base_url)
                except ValueError:
                    pass
            return result

        if startpage_rule.is_result():

            for thumb in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(thumb['data-srcmedium']),
                              href=URL(thumb['href']),
                              popup=thumb.get('alt', '')))

            # The JSON pages carry the original page URL along so that the
            # page-1 button can link back to it.
            xhr_data = {'base_url': base_url}

            next_url = URL(base_url.get() + '*', xhr_data=xhr_data)
            next_url.add_query([('format', 'json'), ('number_pages', '1'),
                                ('page', '2')])
            result.add_page(ControlInfo('next', next_url))

        return result
Esempio n. 18
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and build a ParseResult from it.

        Five parser rules are registered and the file is fed to the parser
        line by line:

        * Gallery page (a script containing ``unescape`` was captured inside
          ``div.imagelinks``): the picture base address is taken from the
          ``unescape('//...')`` call of the first such script, and each
          script containing ``'src'`` contributes one picture name. Pictures
          are numbered ``001.jpg``, ``002.jpg``, ... in the order found. Tag
          links become controls; type is 'pictures'.
        * Otherwise, thumbnails under ``div.thumblinks`` were captured: type
          is 'hrefs' with thumbs, tag controls and pagination pages.
        * Otherwise the empty ParseResult is returned.

        :param fname: path of the downloaded page to parse.
        :param base_url: URL of the page, used to build absolute links.
        :return: the populated ParseResult.
        """
        parser = SiteParser()

        def starred(address):
            # Thumbnail hrefs and image sources get a trailing '*'.
            return self.get_href(address, base_url) + '*'

        def with_domain(href):
            return base_url.domain() + href

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'thumblinks')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', starred)
        startpage_rule.set_attribute_modifier_function('src', starred)
        parser.add_rule(startpage_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('div', 'id', 'divTags')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(tags_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('td', 'class',
                                                       'pages')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', with_domain)
        parser.add_rule(startpage_pages_rule)

        # Both picture rules watch the same block; they differ only in which
        # script bodies they keep.
        picture_base_addr_rule = ParserRule()
        picture_base_addr_rule.add_activate_rule_level([('div', 'class',
                                                         'imagelinks')])
        picture_base_addr_rule.add_process_rule_level('script', {})
        picture_base_addr_rule.set_attribute_filter_function(
            'data', lambda x: 'unescape' in x)
        parser.add_rule(picture_base_addr_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'imagelinks')])
        picture_rule.add_process_rule_level('script', {})
        picture_rule.set_attribute_filter_function('data',
                                                   lambda x: "'src'" in x)
        parser.add_rule(picture_rule)

        # The context manager closes the file even if feeding raises; the
        # bare open() used previously left the handle to the garbage
        # collector.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(picture_base_addr_rule.get_result()) > 0:
            result.set_type('pictures')

            base_script = picture_base_addr_rule.get_result()[0]['data']
            # Decode '%2f' to '/', then take what follows "unescape('//" up
            # to the closing quote.
            base = base_script.replace('%2f', '/').partition(
                "unescape('//")[2].partition("'")[0]

            for number, f in enumerate(picture_rule.get_result(), start=1):
                # The picture name is the quoted string right after "+'".
                picname = f['data'].partition("+'")[2].partition("'")[0]
                result.add_full(
                    FullPictureInfo(abs_href=URL(base + picname + '*'),
                                    rel_name='%03d.jpg' % number))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 19
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and build a ParseResult from it.

        Six parser rules are registered and the file is run through
        ``self.proceed_parcing``:

        * Video page (a ``<script>`` containing ``angular.`` was captured):
          the address following ``host:'`` in the first such script is
          loaded with ``requests_loader.load`` and its ``mediaSources`` are
          read, each distinct source once. The first source is passed to
          ``MediaData``; with several sources each is also added as an
          alternate. A ``LoaderError`` is printed and the result is returned
          without a video. Links from ``div.row.tag-area`` become controls.
          If the source list is empty the result is returned immediately,
          without controls.
        * Otherwise, thumbnails were captured: type is 'hrefs' with thumbs,
          non-blank pagination pages, category controls and tag controls.

        :param fname: path of the downloaded page to parse.
        :param base_url: URL of the page, handed to ``self.get_href``.
        :return: the populated ParseResult.
        """

        def absolute(href):
            # presumably resolves href against base_url; see self.get_href
            return self.get_href(href, base_url)

        def background_image(style):
            # Address between "url('" and "')" of an inline style.
            return style.partition("url('")[2].partition("')")[0]

        parser = SiteParser()

        # Thumbnails: anchors of class 'thumbnail' with a styled <div>.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'fixed-content')])
        startpage_rule.add_process_rule_level('a', {'href', 'class'})
        startpage_rule.add_process_rule_level('div', {'style'})
        startpage_rule.set_attribute_filter_function(
            'class', lambda x: x == 'thumbnail')
        startpage_rule.set_attribute_modifier_function('href', absolute)
        startpage_rule.set_attribute_modifier_function('style',
                                                       background_image)
        parser.add_rule(startpage_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('div', 'class', 'col-xs-12 content-pagination')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        # Tag links in the footer.
        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('section', 'id', 'footer-tag')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tags_rule)

        # Category links in the navigation bar, anchors-only links excluded.
        categories_rule = ParserRule()
        categories_rule.add_activate_rule_level([('ul', 'class',
                                                  'nav navbar-nav')])
        categories_rule.add_process_rule_level('a', {'href'})
        categories_rule.set_attribute_filter_function(
            'href', lambda x: '/Category/' in x and "#" not in x)
        categories_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(categories_rule)

        # Player script of a video page.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('body', '', '')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'angular.' in text)
        parser.add_rule(video_rule)

        # Tag links shown next to the video.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                    'row tag-area')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            script = video_rule.get_result()[0]['data'].replace(' ', '')
            host = self.quotes(script, "host:'", "'")
            json_file_url = self.get_href(host, base_url)

            # Imported here so that non-video pages do not need the loader.
            from requests_loader import load, LoaderError

            json_file = Setting.base_dir + 'tsp_video.json'

            urls = []
            result.set_type('video')

            try:
                response = load(URL(json_file_url), json_file)

                seen = set()
                for source in response.json()['mediaSources']:
                    if source['source'] in seen:
                        continue
                    urls.append(
                        dict(text=source['quality'],
                             url=URL(source['source'] + '*')))
                    seen.add(source['source'])

                if len(urls) == 0:
                    return result

                # The first listed source is the default video.
                video = MediaData(urls[0]['url'])
                if len(urls) > 1:
                    for alternate in urls:
                        video.add_alternate(alternate)

                result.set_video(video)

            except LoaderError as err:
                print(err)

            for link in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(link['data'].strip(), URL(link['href'])))
            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for thumb in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(thumb['style']),
                              href=URL(thumb['href']),
                              popup=thumb.get('alt', '')))

            for page in startpage_pages_rule.get_result(['href', 'data']):
                # Skip links whose caption is only spaces.
                label = page['data'].replace(' ', '')
                if label:
                    result.add_page(ControlInfo(label, URL(page['href'])))

            if categories_rule.is_result(['href']):
                for category in categories_rule.get_result(['href', 'data']):
                    result.add_control(
                        ControlInfo(category['data'], URL(category['href'])))

            if tags_rule.is_result(['href']):
                for tag in tags_rule.get_result(['href', 'data']):
                    result.add_control(
                        ControlInfo(tag['data'], URL(tag['href'])))

        return result
Esempio n. 20
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult (video page or thumbnail listing).

        Registers the rules for this site on a SiteParser, runs them through
        ``self.proceed_parcing(parser, fname)`` and then builds the result:

        * if the player script was captured, a ``'video'`` result holding the
          media URL plus uploader / category / tag controls;
        * otherwise, if thumbnails were captured, an ``'hrefs'`` result with
          thumbnails, pagination entries and extra controls.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page; only ``base_url.domain()`` is used,
            to prefix pagination hrefs.
        :return: the ParseResult; it is returned untouched when neither the
            video rule nor the thumbnail rule produced anything.
        """
        parser = SiteParser()

        # Thumbnails of a listing page; 'data-original' is the attribute the
        # thumbnail URL is later read from.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'inner_wrap')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img',
                                              {'src', 'alt', 'data-original'})
        parser.add_rule(startpage_rule)

        # Pagination links, prefixed with the site domain.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        # Extra navigation links shown next to a listing.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('div', 'class',
                                                       'holder_list')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        parser.add_rule(startpage_hrefs_rule)

        # Inline player script; only scripts mentioning one of the two known
        # player configurations are kept.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'video_url:' in text or 'jwplayer(' in text)
        parser.add_rule(video_rule)

        # Uploader link of a video page, redirected to the member's videos.
        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class', 'descrow')])
        gallery_user_rule.add_process_rule_level('a', {'href', 'title'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: x + 'public_videos/')
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda x: '/members/' in x)
        parser.add_rule(gallery_user_rule)

        # Category and tag links of a video page.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'descrow')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_filter_function(
            'href', lambda x: '/categories/' in x or '/tags/' in x)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            # Spaces are stripped so the markers below match regardless of
            # how the script is formatted.
            script = video_rule.get_result()[0]['data'].replace(' ', '')
            media_url = 'Not found!!!!'  # placeholder kept when no marker matches
            if 'jwplayer(' in script:
                media_url = script.partition('file:"')[2].partition('",')[0]
            elif "video_url:'" in script:
                media_url = script.partition("video_url:'")[2].partition("',")[0]
            video = MediaData(URL(media_url))

            result.set_type('video')
            result.set_video(video)

            # Ask for both keys that are read below and make sure something
            # came back before indexing; the previous unfiltered is_result()
            # check did not guarantee either.
            # NOTE(review): assumes get_result(keys) only returns items that
            # carry all listed keys, as the other call sites suggest - confirm.
            user_links = gallery_user_rule.get_result(['href', 'data'])
            if len(user_links) > 0:
                user = user_links[0]
                print(user)
                result.add_control(
                    ControlInfo('"' + user['data'] + '"', URL(user['href'])))

            for link in gallery_href_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(link['data'], URL(link['href'])))
            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['data-original']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            # Prefer the link title, fall back to its text.
            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item.get('title', item.get('data', '')),
                                URL(item['href'])))

        return result
Esempio n. 21
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult.

        A page on which ``<source>`` tags were captured inside the video
        container yields the video plus uploader and gallery controls; any
        other page with thumbnails yields thumbnails, pagination entries
        and controls.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page, passed to ``self.get_href`` for
            every collected ``href`` / ``src`` value.
        :return: the populated (or empty) ParseResult.
        """

        def absolute(x):
            # Single resolver shared by every rule below.
            return self.get_href(x, base_url)

        parser = SiteParser()

        # Thumbnails: anchor plus image inside one of two container classes.
        thumbs = ParserRule()
        thumbs.add_activate_rule_level([('div', 'class', 'well well-sm hover'),
                                        ('div', 'class', 'channelContainer')])
        thumbs.add_process_rule_level('a', {'href', 'title'})
        thumbs.add_process_rule_level('img', {'src', 'alt'})
        thumbs.set_attribute_modifier_function('href', absolute)
        thumbs.set_attribute_modifier_function('src', absolute)
        parser.add_rule(thumbs)

        # Pagination links.
        pages = ParserRule()
        pages.add_activate_rule_level([('ul', 'class', 'pagination')])
        pages.add_process_rule_level('a', {'href'})
        pages.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pages)

        # Links of the drop-down list on listing pages.
        dropdown = ParserRule()
        dropdown.add_activate_rule_level([('ul', 'class', 'drop2 hidden-xs')])
        dropdown.add_process_rule_level('a', {'href'})
        dropdown.set_attribute_modifier_function('href', absolute)
        parser.add_rule(dropdown)

        # <source> tags of the player; 'res' is later used as the label.
        sources = ParserRule()
        sources.add_activate_rule_level([('div', 'class', 'video-container')])
        sources.add_process_rule_level('source', {'src', 'label', 'res'})
        sources.set_attribute_modifier_function('src', absolute)
        parser.add_rule(sources)

        # Links shown under the video.
        video_links = ParserRule()
        video_links.add_activate_rule_level([('div', 'class',
                                              'm-t-10 overflow-hidden')])
        video_links.add_process_rule_level('a', {'href'})
        video_links.set_attribute_modifier_function('href', absolute)
        parser.add_rule(video_links)

        # Uploader link; hrefs containing '#' are filtered out.
        uploader = ParserRule(collect_data=True)
        uploader.add_activate_rule_level([('div', 'class',
                                           'pull-left user-container')])
        uploader.add_process_rule_level('a', {'href'})
        uploader.set_attribute_modifier_function('href', absolute)
        uploader.set_attribute_filter_function('href', lambda x: '#' not in x)
        parser.add_rule(uploader)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if sources.is_result():
            variants = UrlList()
            for src in sources.get_result():
                variants.add(src['res'], URL(src['src']))
            result.set_video(variants.get_media_data(-1))

            for entry in uploader.get_result(['data', 'href']):
                caption = '"' + entry['data'].strip() + '"'
                result.add_control(ControlInfo(caption, URL(entry['href'])))

            for entry in video_links.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(entry['data'], URL(entry['href'])))

            return result

        if not thumbs.is_result():
            return result

        for entry in thumbs.get_result(['href']):
            caption = entry.get('alt', entry.get('title', ''))
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['src']),
                          href=URL(entry['href']),
                          popup=caption))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        for entry in dropdown.get_result(['href', 'data']):
            # The control is labelled with the last path segment of the href.
            target = entry['href']
            caption = target.strip('*').rpartition('/')[2]
            result.add_control(ControlInfo(caption, URL(target)))

        return result
Esempio n. 22
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult (video page or thumbnail listing).

        On a video page the player script is inspected for its ``vid`` and
        ``hash`` values and a candidate file URL is printed for every download
        link; the result is typed ``'video'`` and gets the category / search
        controls. No media URL is attached to the result in this branch.
        On a listing page the result is typed ``'hrefs'`` and receives the
        thumbnails and pagination entries.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page; ``base_url.domain()`` is prepended
            to collected hrefs.
        :return: the ParseResult (empty when no rule matched).
        """
        parser = SiteParser()

        # Thumbnails: several container classes share the same a/img layout.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'mb'),
                                                ('div', 'class', 'mbhd'),
                                                ('div', 'class', 'mb mbr'),
                                                ('div', 'class', 'mbhd mbr'),
                                                ('div', 'class', 'categoriesbox')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'numlist2')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        # Player script of a video page.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'moviexxx')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda x: "videojs('EPvideo'" in x)
        parser.add_rule(video_rule)

        # Download links; only the file name part of each href is used.
        # NOTE(review): the id below contains '**' and looks censored by
        # whatever published this snippet - verify against the real page.
        video_fname_rule = ParserRule()
        video_fname_rule.add_activate_rule_level([('div', 'id', 'hd-p**n-dload')])
        video_fname_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(video_fname_rule)

        # Category / search links of a video page.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'tab-1')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x)
        gallery_href_rule.set_attribute_filter_function('href', lambda x: '/category/' in x or '/search/' in x)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            script = video_rule.get_result()[0]['data'].replace(' ', '')
            vid = script.partition("vid:'")[2].partition("',")[0]
            # Named video_hash so the builtin hash() is not shadowed.
            video_hash = script.partition("hash:'")[2].partition("',")[0]
            print(vid, video_hash)

            for item in video_fname_rule.get_result():
                print(item)
                # Separate local: the 'fname' parameter is no longer rebound.
                video_fname = item['href'].rpartition('/')[2]
                print(video_fname)
                print('http://v1.s1.n10.nl.cdn.eporner.com/{0}/{1}/{2}'.format(video_hash, vid, video_fname))
            # NOTE: the candidate URL is only printed; result.set_video() is
            # never called here.
            result.set_type('video')

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                # 'alt' is optional on an image; fall back to an empty popup
                # instead of raising KeyError.
                result.add_thumb(ThumbInfo(thumb_url=URL(item['src']),
                                           href=URL(item['href']),
                                           popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 23
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult.

        When a player iframe pointing at 'fileone.tv' was captured, each
        iframe page is fetched with ``load`` and the media file is read out
        of its jwplayer setup call; the result then gets the video and the
        links of the 'extras' block. Otherwise thumbnails, pagination and
        tag controls are collected.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page, passed to ``self.get_href`` for
            every collected ``href`` / ``src`` value.
        :return: the populated (or empty) ParseResult.
        """

        def absolute(x):
            # Single resolver shared by every rule below.
            return self.get_href(x, base_url)

        parser = SiteParser()

        # Thumbnails of a listing page.
        thumbs = ParserRule()
        thumbs.add_activate_rule_level([('div', 'class', 'link-3col')])
        thumbs.add_process_rule_level('a', {'href'})
        thumbs.add_process_rule_level('img', {'src', 'alt'})
        thumbs.set_attribute_modifier_function('href', absolute)
        thumbs.set_attribute_modifier_function('src', absolute)
        parser.add_rule(thumbs)

        # Pagination links.
        pages = ParserRule()
        pages.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
        pages.add_process_rule_level('a', {'href'})
        pages.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pages)

        # Tag cloud links.
        tags = ParserRule()
        tags.add_activate_rule_level([('div', 'class', 'tagcloud')])
        tags.add_process_rule_level('a', {'href'})
        tags.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tags)

        # Player iframe; only 'fileone.tv' sources are kept.
        player = ParserRule()
        player.add_activate_rule_level([('div', 'class', 'player')])
        player.add_process_rule_level('iframe', {'src'})
        player.set_attribute_filter_function('src', lambda x: 'fileone.tv' in x)
        player.set_attribute_modifier_function('src', absolute)
        parser.add_rule(player)

        # Links of the 'extras' block on a video page.
        extras = ParserRule()
        extras.add_activate_rule_level([('div', 'id', 'extras')])
        extras.add_process_rule_level('a', {'href'})
        extras.set_attribute_modifier_function('href', absolute)
        parser.add_rule(extras)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if player.is_result():
            media = UrlList()
            for frame in player.get_result():
                print(frame)
                try:
                    response = load(URL(frame['src']))
                    # Cut the setup(...) argument out of the iframe page and
                    # drop spaces before looking for the file entry.
                    config = self.quotes(response.text,
                                         "jwplayer('player').setup(", ")")
                    config = config.replace(' ', '')
                    media_file = self.quotes(config, "file:'", "'")
                    media.add("default", URL(media_file + '*'))
                except LoaderError as err:
                    # A failed iframe load is reported and skipped.
                    print(err)

            result.set_video(media.get_media_data())

            for entry in extras.get_result(['data', 'href']):
                caption = entry['data'].strip()
                result.add_control(ControlInfo(caption, URL(entry['href'])))
            return result

        if not thumbs.is_result():
            return result

        for entry in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['src']),
                          href=URL(entry['href']),
                          popup=entry.get('alt', '')))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        for entry in tags.get_result(['href', 'data']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

        return result
Esempio n. 24
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult.

        When a script mentioning 'jwplayer' was captured, the ``file:`` entry
        of each such script becomes a media URL and the result also gets the
        uploader and the remaining info links as controls. Otherwise
        thumbnails and pagination entries are collected.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page, passed to ``self.get_href`` for
            every collected ``href`` / ``src`` value.
        :return: the populated (or empty) ParseResult.
        """

        def absolute(x):
            # Single resolver shared by the rules below.
            return self.get_href(x, base_url)

        def profile_videos(x):
            # Profile links are pointed at the profile's videos page.
            return self.get_href(x.replace('.html', '/videos/'), base_url)

        parser = SiteParser()

        # Thumbnails of a listing page.
        thumbs = ParserRule()
        thumbs.add_activate_rule_level([('div', 'class', 'contents videos')])
        thumbs.add_process_rule_level('a', {'href', 'title'})
        thumbs.add_process_rule_level('img', {'src', 'alt'})
        thumbs.set_attribute_modifier_function('href', absolute)
        thumbs.set_attribute_modifier_function('src', absolute)
        parser.add_rule(thumbs)

        # Pagination links.
        pages = ParserRule()
        pages.add_activate_rule_level([('div', 'id', 'pagination')])
        pages.add_process_rule_level('a', {'href'})
        pages.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pages)

        # Player script of a video page.
        player = ParserRule()
        player.add_activate_rule_level([('div', 'class', 'contents')])
        player.add_process_rule_level('script', {})
        player.set_attribute_filter_function('data',
                                             lambda text: 'jwplayer' in text)
        parser.add_rule(player)

        # Info links of a video page, profile links excluded.
        info_links = ParserRule()
        info_links.add_activate_rule_level([('div', 'class', 'info_holder')])
        info_links.add_activate_rule_level([('div', 'class', 'l')])
        info_links.add_process_rule_level('a', {'href'})
        info_links.set_attribute_filter_function(
            'href', lambda x: '/profiles/' not in x)
        info_links.set_attribute_modifier_function('href', absolute)
        parser.add_rule(info_links)

        # Profile (uploader) links of a video page.
        uploader = ParserRule()
        uploader.add_activate_rule_level([('div', 'class', 'info_holder')])
        uploader.add_process_rule_level('a', {'href'})
        uploader.set_attribute_filter_function(
            'href', lambda x: '/profiles/' in x)
        uploader.set_attribute_modifier_function('href', profile_videos)
        parser.add_rule(uploader)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if player.is_result():
            media = UrlList()
            for script in player.get_result():
                # Text between 'file:' and the next comma, without the
                # surrounding spaces and double quotes.
                raw = self.quotes(script['data'], 'file:', ',')
                media_file = raw.strip(' "')
                media.add('default', URL(media_file + '*'))

            result.set_video(media.get_media_data())

            for entry in uploader.get_result(['data', 'href']):
                caption = '"' + entry['data'].strip() + '"'
                result.add_control(ControlInfo(caption, URL(entry['href'])))

            for entry in info_links.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(entry['data'], URL(entry['href'])))

            return result

        if not thumbs.is_result():
            return result

        for entry in thumbs.get_result(['href']):
            caption = entry.get('alt', entry.get('title', ''))
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['src']),
                          href=URL(entry['href']),
                          popup=caption))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        return result
Esempio n. 25
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult.

        The first branch whose rule captured something decides the page kind:

        1. player script found: video sources plus tag / pornstar / user
           controls;
        2. pornstar listing: captioned thumbnails, pages and letter links;
        3. channel listing: captioned thumbnails, pages and category filters;
        4. ordinary video listing: thumbnails, pages and category controls.

        :param fname: file name handed to ``self.proceed_parcing``.
        :param base_url: URL of the page, passed to ``self.get_href`` for
            every collected link-like attribute.
        :return: the populated (or empty) ParseResult.
        """

        def absolute(x):
            # Single resolver used by every rule instead of one lambda each.
            return self.get_href(x, base_url)

        parser = SiteParser()

        # Video thumbnails; the list container comes in several class variants.
        startpage_rule = ParserRule(debug=False)
        startpage_rule.add_activate_rule_level([('ul', 'class', 'video-listing'),
                                                ('ul', 'class', 'video-listing two-in-row'),
                                                ('ul', 'class', 'video-listing four-in-row'),
                                                ('ul', 'class', 'video-listing two-in-row id-recommended-list'),
                                                ('ul', 'class', 'video-listing four-in-row id-recommended-list')
                                                ])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'data-src', 'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        startpage_rule.set_attribute_modifier_function('data-src', absolute)
        startpage_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(startpage_rule)

        # Pagination links (shared by all listing kinds).
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pageNumbersHolder')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        # Category links of a video listing.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'categories-listing'),
                                                      ('ul', 'class', 'categories-popular-listing'),
                                                      ('ul', 'class', 'abc-categories newAbcCategories')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_hrefs_rule)

        # Pornstar listing thumbnails.
        pornstars_rule = ParserRule()
        pornstars_rule.add_activate_rule_level([('div', 'id', 'all_pornstars')])
        pornstars_rule.add_process_rule_level('a', {'href'})
        pornstars_rule.add_process_rule_level('img', {'src', 'alt'})
        pornstars_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pornstars_rule)

        # Links shown next to the pornstar listing.
        pornstars_hrefs_rule = ParserRule()
        pornstars_hrefs_rule.add_activate_rule_level([('ul', 'class', 'abc-categories newAbcCategories')])
        pornstars_hrefs_rule.add_process_rule_level('a', {'href'})
        pornstars_hrefs_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pornstars_hrefs_rule)

        # Channel listing thumbnails.
        channels_rule = ParserRule(debug=False)
        channels_rule.add_activate_rule_level([('ul', 'class', 'channels-list three-in-row')])
        channels_rule.add_process_rule_level('a', {'href'})
        channels_rule.add_process_rule_level('img', {'src', 'data-src', 'alt'})
        channels_rule.set_attribute_modifier_function('href', absolute)
        channels_rule.set_attribute_modifier_function('src', absolute)
        channels_rule.set_attribute_modifier_function('data-src', absolute)
        parser.add_rule(channels_rule)

        # Category filters of the channel listing.
        channels_hrefs_rule = ParserRule()
        channels_hrefs_rule.add_activate_rule_level([('div', 'class', 'channel-filters-categories')])
        channels_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
        channels_hrefs_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(channels_hrefs_rule)

        # Player script of a video page.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'watch')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'redtube_flv_player' in text)
        parser.add_rule(video_rule)

        # Detail links of a video page; a bare '*' href is discarded.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'video-details')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        gallery_href_rule.set_attribute_filter_function('href', lambda x: x != '*')
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                # Strip spaces and backslashes, then split the sources:{...}
                # object into '<label>":"<url>' pieces.
                script = item['data'].replace(' ', '').replace('\\', '')
                sources = self.quotes(script, 'sources:{"', '"},').split('","')
                for source in sources:
                    label, _, raw_url = source.partition('":"')
                    urls.add(label, URL(self.get_href(raw_url, base_url)))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                href = f['href']
                label = f['data']
                if '/redtube/' in href or '/tag/' in href:
                    result.add_control(ControlInfo(label, URL(href)))
                elif '/pornstar/' in href:
                    # The caption is rebuilt from the last path segment.
                    ps_name = href.rstrip('*').rpartition('/')[2].replace('+', ' ').title()
                    result.add_control(ControlInfo(ps_name, URL(href)))
                else:
                    # Anything else is treated as a user link.
                    result.add_control(ControlInfo("'" + label + "'", URL(href.replace('*', '/videos*'))))
            return result

        if pornstars_rule.is_result():
            result.set_caption_visible(True)
            for item in pornstars_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            # 'data' is read below, so it is requested together with 'href'.
            # NOTE(review): assumes get_result(keys) only returns items that
            # carry all listed keys, as the other call sites suggest - confirm.
            for item in pornstars_hrefs_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'], URL(item['href'])))
            return result

        if channels_rule.is_result():
            result.set_caption_visible(True)
            for item in channels_rule.get_result(['href']):
                # Prefer 'data-src' over 'src' for the thumbnail.
                thumb_href = item.get('data-src', item.get('src'))
                descr = item.get('alt', '').title()
                result.add_thumb(ThumbInfo(thumb_url=URL(thumb_href), href=URL(item['href']), popup=descr))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            # 'title' is read below, so it is requested together with 'href'
            # (same form as the category loop of the last branch).
            for item in channels_hrefs_rule.get_result(['href', 'title']):
                result.add_control(ControlInfo(item['title'], URL(item['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                thumb_href = item.get('data-src', item.get('src'))
                descr = item.get('title', item.get('alt', ''))
                result.add_thumb(ThumbInfo(thumb_url=URL(thumb_href), href=URL(item['href']), popup=descr))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'title']):
                result.add_control(ControlInfo(item['title'], URL(item['href'])))

        return result
Esempio n. 26
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult (thumbnail index and/or picture gallery).

        The file is opened and fed to a SiteParser line by line. Thumbnails
        and pagination are added when the thumbnail rule matched; full-size
        pictures and category controls are added when the gallery rule
        matched. The two checks are independent, and the gallery check runs
        last, so its ``'pictures'`` type wins if both match.

        :param fname: path of the file to open and parse.
        :param base_url: URL of the page; used through ``self.get_href`` and
            ``base_url.domain()`` to build hrefs.
        :return: the populated (or empty) ParseResult.
        """
        print(base_url.domain())
        parser = SiteParser()

        # Thumbnails of an index page.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'ownpost')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        # Pagination links: domain + '/' + href + '*'.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'nav')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + '/' + x + '*')
        parser.add_rule(startpage_pages_rule)

        # Gallery images; '/tn_' in the src is replaced by '/'
        # (presumably turning a thumbnail name into the full-size one).
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'gallery')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src', 'class'})
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: text.replace('/tn_', '/'))
        parser.add_rule(picture_rule)

        # Category links inside the gallery block.
        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class', 'gallery')])
        picture_href_rule.add_process_rule_level('a', {'href'})
        picture_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        picture_href_rule.set_attribute_filter_function(
            'href', lambda x: '/?category=' in x)
        parser.add_rule(picture_href_rule)

        # The context manager closes the file even if the parser raises;
        # the bare open() used before never closed it.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href'] + '*'),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            # Pictures are named 001.jpg, 002.jpg, ... in document order.
            pictures = picture_rule.get_result(['src', 'class'])
            for number, picture in enumerate(pictures, start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(picture['src']),
                                    rel_name='%03d.jpg' % number))

            for link in picture_href_rule.get_result():
                # Commas are stripped from the link text.
                result.add_control(
                    ControlInfo(link['data'].replace(',', ''),
                                URL(link['href'])))

        return result
Esempio n. 27
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and describe it in a ``ParseResult``.

        Five rules are registered on a ``SiteParser`` and the file is fed to
        it line by line.  When the video rule captured a script containing
        ``var flashvars`` the result is typed ``'video'``; otherwise, when the
        thumbnail rule captured anything, it is typed ``'hrefs'``.

        :param fname: path of the HTML file; it is read as UTF-8 and
            undecodable bytes are ignored.
        :param base_url: URL used to build absolute links for the tag and
            user controls.
        :return: the ``ParseResult``.  No type is set on it when neither
            branch matched or when the video script held no usable URL.
        """
        print('VP parsing')

        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'bx'),
                                                ('div', 'class', 'bx lastrow')
                                                ])
        startpage_rule.add_process_rule_level('a', {'href', 'class'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagerwrap')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_pages_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'video_panel')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'var flashvars' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                    'tagas-secondrow')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x + '*')
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class', 'info')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        # User links are redirected from '/user/' to '/submitted/'.
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x.replace('/user/', '/submitted/'),
                                            base_url))
        parser.add_rule(gallery_user_rule)

        # The context manager closes the file even if the parser raises;
        # a bare open() in the for-statement would leave that to the GC.
        with open(fname, encoding='utf-8', errors='ignore') as page:
            for s in page:
                # Every '</b>' is handed to the parser as '</a>'.
                parser.feed(s.replace('</b>', '</a>'))

        result = ParseResult()

        if len(video_rule.get_result()) > 0:
            # Spaces are dropped so the 'flashvars.<name>="' marker below
            # matches however the assignment was spaced in the page.
            script = video_rule.get_result()[0]['data'].replace(' ', '')

            def get_url_from_script(script='', var=''):
                """Return the https URL assigned to ``flashvars.<var>``, else None."""
                data = script.partition('flashvars.' + var +
                                        '="')[2].partition('"')[0]
                if data.startswith('https://'):
                    return URL(data)
                return None

            videoUrlLow = get_url_from_script(script, 'videoUrlLow')
            videoUrlLow2 = get_url_from_script(script, 'videoUrlLow2')
            videoUrlMedium = get_url_from_script(script, 'videoUrlMedium')
            videoUrlMedium2 = get_url_from_script(script, 'videoUrlMedium2')
            videoUrlHD = get_url_from_script(script, 'videoUrlHD')
            videoUrlHD2 = get_url_from_script(script, 'videoUrlHD2')

            def add_alternate(video, txt, url):
                """Register ``url`` under label ``txt`` unless it is missing."""
                if url is not None:
                    video.add_alternate(dict(text=txt, url=url))

            # The medium quality is the preferred default, low the fallback.
            if videoUrlMedium is not None:
                video = MediaData(videoUrlMedium)
            elif videoUrlLow is not None:
                video = MediaData(videoUrlLow)
            else:
                print('No url found')
                return result

            # NOTE(review): the '*2' variants of Medium and HD reuse the
            # labels 'Medium' and 'HD' (only Low has a distinct 'Low2');
            # kept as-is, confirm whether that is intended.
            add_alternate(video, 'Low', videoUrlLow)
            add_alternate(video, 'Low2', videoUrlLow2)
            add_alternate(video, 'Medium', videoUrlMedium)
            add_alternate(video, 'Medium', videoUrlMedium2)
            add_alternate(video, 'HD', videoUrlHD)
            add_alternate(video, 'HD', videoUrlHD2)

            result.set_type('video')
            result.set_video(video)

            for f in gallery_user_rule.get_result():
                result.add_control(
                    ControlInfo('"' + f['data'] + '"', URL(f['href'])))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                # An image without an 'alt' attribute gets an empty popup
                # instead of aborting the whole parse with a KeyError.
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Esempio n. 28
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and return a ``ParseResult``.

        Six rules are registered on a ``SiteParser`` and the file is parsed
        through ``self.proceed_parcing``.  If the video rule captured a script
        mentioning ``jwplayer`` the result carries the video plus user and tag
        controls; otherwise, if thumbnails were captured, it carries the
        thumbnails, pager links and menu controls.

        :param fname: path of the file handed to ``self.proceed_parcing``.
        :param base_url: URL used to build absolute links.
        :return: the populated ``ParseResult`` (left empty when neither the
            video rule nor the thumbnail rule produced anything).
        """

        def starred_href(raw):
            # Link resolved through get_href, with a trailing '*'.
            return self.get_href(raw, base_url) + '*'

        def starred_domain_href(raw):
            # Domain-prefixed link with a trailing '*'.
            return base_url.domain() + raw + '*'

        def mentions_jwplayer(text):
            return 'jwplayer' in text

        parser = SiteParser()

        # Thumbnails of the index page.
        thumbs = ParserRule()
        thumbs.add_activate_rule_level([('div', 'class', 'content-inner')])
        thumbs.add_process_rule_level('a', {'href'})
        thumbs.add_process_rule_level('img', {'src', 'alt'})
        thumbs.set_attribute_modifier_function('href', starred_href)
        parser.add_rule(thumbs)

        # Pager links.
        pager = ParserRule()
        pager.add_activate_rule_level([('div', 'class', 'pagination_link')])
        pager.add_process_rule_level('a', {'href'})
        pager.set_attribute_modifier_function('href', starred_domain_href)
        parser.add_rule(pager)

        # Sub-menu links; both spellings of the class name are accepted.
        menu = ParserRule()
        menu.add_activate_rule_level([
            ('div', 'class', 'sub_menu dark-menu'),
            ('div', 'class', 'sub-menu dark-menu'),
        ])
        menu.add_process_rule_level('a', {'href', 'title'})
        menu.set_attribute_modifier_function('href', starred_href)
        parser.add_rule(menu)

        # Player script of a video page.
        player = ParserRule()
        player.add_activate_rule_level([('div', 'id', 'content')])
        player.add_process_rule_level('script', {})
        player.set_attribute_filter_function('data', mentions_jwplayer)
        parser.add_rule(player)

        # Tag links of a video page.
        tags = ParserRule()
        tags.add_activate_rule_level([('div', 'id', 'media-tags-container')])
        tags.add_process_rule_level('a', {'href'})
        tags.set_attribute_modifier_function('href', starred_domain_href)
        parser.add_rule(tags)

        # Uploader link of a video page (href is taken unmodified).
        uploader = ParserRule()
        uploader.add_activate_rule_level([('div', 'class',
                                           'thumb-member-username')])
        uploader.add_process_rule_level('a', {'href'})
        parser.add_rule(uploader)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if player.is_result():
            urls = UrlList()
            for entry in player.get_result():
                compact = entry['data'].replace(' ', '')
                file = self.quotes(compact, '"file":"', '"')
                urls.add('default', URL(file + '*'))

            result.set_video(urls.get_media_data())

            if uploader.is_result():
                # The user name is whatever follows the last '/' of the link.
                _, _, user = uploader.get_result()[0]['href'].rpartition('/')
                uploads_caption = '"' + user + ' uploads"'
                uploads_url = URL('http://motherless.com/u/' + user + '*')
                result.add_control(ControlInfo(uploads_caption, uploads_url))
                gals_caption = '"' + user + ' gals"'
                gals_url = URL('http://motherless.com/galleries/member/' +
                               user + '*')
                result.add_control(ControlInfo(gals_caption, gals_url))

            for tag in tags.get_result(['data', 'href']):
                result.add_control(ControlInfo(tag['data'], URL(tag['href'])))
            return result

        if not thumbs.is_result():
            return result

        for entry in thumbs.get_result(['href']):
            thumb = ThumbInfo(thumb_url=URL(entry['src']),
                              href=URL(entry['href']),
                              popup=entry.get('alt', ''))
            result.add_thumb(thumb)

        for entry in pager.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        for entry in menu.get_result(['href', 'data']):
            # The 'title' attribute wins over the link text when present.
            caption = entry.get('title', entry.get('data', ''))
            result.add_control(ControlInfo(caption, URL(entry['href'])))

        return result
Esempio n. 29
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and return a ``ParseResult``.

        Five rules are registered on a ``SiteParser`` and the file is parsed
        through ``self.proceed_parcing``.  A captured script containing
        ``function playStart()`` makes the result a ``'video'``; otherwise
        captured thumbnails make it an ``'hrefs'`` index with pager links and
        category controls.

        :param fname: path of the file handed to ``self.proceed_parcing``.
        :param base_url: URL whose domain is prefixed to the pager links.
        :return: the populated ``ParseResult``; no type is set on it when
            neither a player script nor thumbnails were captured.
        """

        def with_domain(raw):
            return base_url.domain() + raw

        def has_play_start(text):
            return 'function playStart()' in text

        parser = SiteParser()

        # Thumbnails of the index page (hrefs are taken unmodified).
        thumbs = ParserRule()
        thumbs.add_activate_rule_level([('div', 'class', 'block-video')])
        thumbs.add_process_rule_level('a', {'href'})
        thumbs.add_process_rule_level('img', {'src'})
        parser.add_rule(thumbs)

        # Pager links.
        pager = ParserRule()
        pager.add_activate_rule_level([('ul', 'class', 'pagination')])
        pager.add_process_rule_level('a', {'title', 'href'})
        pager.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(pager)

        # Category links of the index page (hrefs are taken unmodified).
        categories = ParserRule()
        categories.add_activate_rule_level([('li', 'id', 'categories')])
        categories.add_process_rule_level('a', {'href'})
        parser.add_rule(categories)

        # Player script of a video page.
        player = ParserRule()
        player.add_activate_rule_level([('div', 'class', 'player_body')])
        player.add_process_rule_level('script', {})
        player.set_attribute_filter_function('data', has_play_start)
        parser.add_rule(player)

        # Category / tag / cast links of a video page.
        info_links = ParserRule()
        info_links.add_activate_rule_level([
            ('p', 'class', 'info_category'),
            ('p', 'class', 'info_tags'),
            ('p', 'class', 'info_cast'),
        ])
        info_links.add_process_rule_level('a', {'href'})
        parser.add_rule(info_links)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if len(player.get_result()) > 0:
            # Only the first captured script is used for the media address.
            script_text = player.get_result()[0]['data']
            media_url = URL(self.get_attr_from_script(script_text))
            result.set_video(MediaData(media_url))
            result.set_type('video')

            for link in info_links.get_result():
                result.add_control(
                    ControlInfo(link['data'], URL(link['href'])))
            return result

        if len(thumbs.get_result()) > 0:
            result.set_type('hrefs')

            for entry in thumbs.get_result():
                thumb = ThumbInfo(thumb_url=URL(entry['src']),
                                  href=URL(entry['href']))
                result.add_thumb(thumb)

            for entry in pager.get_result(['href', 'data']):
                # Only links carrying a '?from=' query get the '*' suffix.
                link = entry['href']
                target = link + '*' if '?from=' in link else link
                result.add_page(ControlInfo(entry['data'], URL(target)))

            for entry in categories.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(entry['data'], URL(entry['href'])))

        return result
Esempio n. 30
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` and return a ``ParseResult``.

        Six rules are registered on a ``SiteParser`` and the file is parsed
        through ``self.proceed_parcing``.  If the video rule captured a script
        containing ``sources:`` the result carries the video and its tag
        controls; otherwise, if thumbnails were captured, it carries the
        thumbnails, pager links and category controls.

        :param fname: path of the file handed to ``self.proceed_parcing``.
        :param base_url: URL passed to ``self.get_href`` to resolve links.
        :return: the populated ``ParseResult`` (left empty when neither the
            video rule nor the thumbnail rule produced anything).
        """
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('article', 'class', 'teaser singleLink hasButtonRow'),
            ('article', 'class', 'activity video hasButtonFooter')
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img',
                                              {'src', 'data-lazysrc', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('nav', 'class', 'clearfix pagination bottom'),
            ('nav', 'class', 'range rangeCount-2 clearfix')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href', 'data-href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_pages_rule.set_attribute_modifier_function(
            'data-href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_categories_rule = ParserRule()
        startpage_categories_rule.add_activate_rule_level([
            ('select', 'id', 'input_selectCategories')
        ])
        startpage_categories_rule.add_process_rule_level('option', {'value'})
        startpage_categories_rule.set_attribute_modifier_function(
            'value', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_categories_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'playerWrapper')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'sources:' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('dl', 'class', 'group')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        # This rule is still registered with the parser, but its results are
        # not consumed anywhere below.
        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([
            ('nav', 'class', 'profileNav clearfix buttonRow')
        ])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda x: '#videos' in x)
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                # Spaces and backslashes are stripped before the
                # 'sources:{"label":"url","label":"url"},' object is cut out
                # and split into its '"label":"url"' pairs.
                script = item['data'].replace(' ', '').replace('\\', '')
                sources = self.quotes(script, 'sources:{"', '"},').split('","')
                for source in sources:
                    label, _, address = source.partition('":"')
                    urls.add(label, URL(self.get_href(address, base_url)))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                # Prefer the lazy-load address; 'src' is only looked up (and
                # therefore only required) when 'data-lazysrc' is absent.
                if 'data-lazysrc' in item:
                    thumb_src = item['data-lazysrc']
                else:
                    thumb_src = item['src']
                # The default must be a separate argument: 'alt' '' would be
                # concatenated into the single key 'alt' and yield None for
                # images without an alt text.
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(thumb_src),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                href = item.get('data-href', item['href'])
                # The pager caption is the last path segment of the link,
                # with any '*' characters trimmed from its ends.
                result.add_page(
                    ControlInfo(href.rpartition('/')[2].strip('*'), URL(href)))

            for item in startpage_categories_rule.get_result():
                result.add_control(
                    ControlInfo(item['data'], URL(item['value'])))

        return result