def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    Rules are registered on a fresh ``SiteParser`` and executed through
    ``self.proceed_parcing(parser, fname)``.  If the video rule matched, a
    video result (media URLs plus uploader/tag controls) is returned;
    otherwise, if the thumbnail rule matched, thumbnails, pages and menu
    controls are returned.  With no match an empty ``ParseResult`` is
    returned.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used to resolve the hrefs found on the page.
    :return: the populated ``ParseResult``.
    """
    def starred_href(href):
        # Resolved via self.get_href, then suffixed with '*'.
        return self.get_href(href, base_url) + '*'

    def starred_domain_href(href):
        # Prefixed with the base URL's domain, then suffixed with '*'.
        return base_url.domain() + href + '*'

    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level([('div', 'class', 'content-inner')])
    thumbs_rule.add_process_rule_level('a', {'href'})
    thumbs_rule.add_process_rule_level('img', {'src', 'alt'})
    thumbs_rule.set_attribute_modifier_function('href', starred_href)
    parser.add_rule(thumbs_rule)

    pages_rule = ParserRule()
    pages_rule.add_activate_rule_level([('div', 'class', 'pagination_link')])
    pages_rule.add_process_rule_level('a', {'href'})
    pages_rule.set_attribute_modifier_function('href', starred_domain_href)
    parser.add_rule(pages_rule)

    menu_rule = ParserRule()
    menu_rule.add_activate_rule_level([
        ('div', 'class', 'sub_menu dark-menu'),
        ('div', 'class', 'sub-menu dark-menu'),
    ])
    menu_rule.add_process_rule_level('a', {'href', 'title'})
    menu_rule.set_attribute_modifier_function('href', starred_href)
    parser.add_rule(menu_rule)

    # Only <script> blocks whose text mentions 'jwplayer' are kept.
    player_rule = ParserRule()
    player_rule.add_activate_rule_level([('div', 'id', 'content')])
    player_rule.add_process_rule_level('script', {})
    player_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer' in text)
    parser.add_rule(player_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'id', 'media-tags-container')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', starred_domain_href)
    parser.add_rule(tags_rule)

    uploader_rule = ParserRule()
    uploader_rule.add_activate_rule_level(
        [('div', 'class', 'thumb-member-username')])
    uploader_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(uploader_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player_rule.is_result():
        media = UrlList()
        for script in player_rule.get_result():
            compact = script['data'].replace(' ', '')
            media_href = self.quotes(compact, '"file":"', '"')
            media.add('default', URL(media_href + '*'))
        result.set_video(media.get_media_data())

        if uploader_rule.is_result():
            # The user name is the last path segment of the first link.
            first_link = uploader_rule.get_result()[0]
            user = first_link['href'].rpartition('/')[2]
            result.add_control(ControlInfo(
                '"' + user + ' uploads"',
                URL('http://motherless.com/u/' + user + '*')))
            result.add_control(ControlInfo(
                '"' + user + ' gals"',
                URL('http://motherless.com/galleries/member/' + user + '*')))

        for tag in tags_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))
        return result

    if not thumbs_rule.is_result():
        return result

    for thumb in thumbs_rule.get_result(['href']):
        result.add_thumb(ThumbInfo(
            thumb_url=URL(thumb['src']),
            href=URL(thumb['href']),
            popup=thumb.get('alt', '')))

    for page in pages_rule.get_result(['href', 'data']):
        result.add_page(ControlInfo(page['data'], URL(page['href'])))

    for entry in menu_rule.get_result(['href', 'data']):
        caption = entry.get('title', entry.get('data', ''))
        result.add_control(ControlInfo(caption, URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    After ``self.proceed_parcing(parser, fname)`` the first matching rule
    decides the kind of result, checked in this order: video page, picture
    gallery, thumbnail listing.  With no match an empty ``ParseResult`` is
    returned.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used by ``self.get_href`` to resolve hrefs.
    :return: the populated ``ParseResult``.
    """
    def absolute(href):
        return self.get_href(href, base_url)

    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level(
        [('div', 'class', 'video-item compact')])
    thumbs_rule.add_process_rule_level('a', {'href'})
    thumbs_rule.add_process_rule_level('img', {'src', 'alt'})
    thumbs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs_rule)

    pages_rule = ParserRule()
    pages_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    pages_rule.add_process_rule_level('a', {'href'})
    pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages_rule)

    categories_rule = ParserRule()
    categories_rule.add_activate_rule_level(
        [('nav', 'class', 'video-categories')])
    categories_rule.add_process_rule_level('a', {'href'})
    categories_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(categories_rule)

    menu_rule = ParserRule()
    menu_rule.add_activate_rule_level([('div', 'class', 'cat-menu hidden-xs')])
    menu_rule.add_process_rule_level('a', {'href'})
    menu_rule.set_attribute_filter_function(
        'href', lambda href: '/free_porn/' in href)
    menu_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(menu_rule)

    player_rule = ParserRule()
    player_rule.add_activate_rule_level([('video', '', '')])
    player_rule.add_process_rule_level('source', {'src'})
    player_rule.set_attribute_modifier_function('src', lambda src: src + '*')
    parser.add_rule(player_rule)

    # Thumbnail sources are rewritten by dropping the '/thumbs/' segment.
    pictures_rule = ParserRule()
    pictures_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
    pictures_rule.add_process_rule_level('a', {})
    pictures_rule.add_process_rule_level('img', {'src'})
    pictures_rule.set_attribute_modifier_function(
        'src', lambda src: src.replace('/thumbs/', '/'))
    parser.add_rule(pictures_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'tags')])
    tags_rule.add_process_rule_level('a', {'href', 'title'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    uploader_rule = ParserRule()
    uploader_rule.add_activate_rule_level([('div', 'class', 'uploaded')])
    uploader_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(uploader_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player_rule.is_result():
        media = UrlList()
        for source in player_rule.get_result():
            media.add('default', URL(source['src']))
        result.set_video(media.get_media_data(-1))

        if uploader_rule.is_result():
            # The user name is the last path segment of the first link.
            first_link = uploader_rule.get_result()[0]
            user = first_link['href'].rpartition('/')[2]
            result.add_control(ControlInfo(
                '"' + user + '"',
                URL('http://www.heavy-r.com/user/' + user + '?pro=videos*')))

        for tag in tags_rule.get_result(['href']):
            # Fall back to the title attribute when the link text is blank.
            caption = tag['data'].strip() or tag['title']
            result.add_control(ControlInfo(caption, URL(tag['href'])))
        return result

    if pictures_rule.is_result():
        result.set_type('pictures')
        first_picture = URL(pictures_rule.get_result()[0]['src'] + '*')
        gallery_dir = first_picture.get_path(base=Setting.base_dir)
        result.set_gallery_path(gallery_dir)

        for image in pictures_rule.get_result():
            source = image['src']
            picture = FullPictureInfo(
                abs_href=URL(source + '*'),
                rel_name=source.rpartition('/')[2])
            picture.set_base(gallery_dir)
            result.add_full(picture)

        for tag in tags_rule.get_result(['href']):
            caption = tag['data'].strip() or tag['title']
            target = tag['href']
            if '/user/' not in target:
                result.add_control(ControlInfo(caption, URL(target)))
                continue
            # User links become two controls: the user's videos and galleries.
            head, _, user_id = target.rpartition('-')
            site = head.partition('/user/')[0]
            result.add_control(ControlInfo(
                caption + ' videos',
                URL(site + '/uploads-by-user/' + user_id)))
            result.add_control(ControlInfo(
                caption + ' gals',
                URL(site + '/uploads-by-user/' + user_id + '?photos=1')))
        return result

    if not thumbs_rule.is_result():
        return result

    for thumb in thumbs_rule.get_result(['href']):
        result.add_thumb(ThumbInfo(
            thumb_url=URL(thumb['src']),
            href=URL(thumb['href']),
            popup=thumb.get('alt', '')))

    for page in pages_rule.get_result(['href', 'data']):
        result.add_page(ControlInfo(page['data'], URL(page['href'])))

    # Category links first, then the menu links, each only when present.
    for links_rule in (categories_rule, menu_rule):
        if len(links_rule.get_result(['href'])) > 0:
            for entry in links_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(entry.get('data', ''), URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    After ``self.proceed_parcing(parser, fname)`` a video result is returned
    when the player-script rule matched; otherwise a thumbnail listing is
    returned when the thumbnail rule matched.  With no match an empty
    ``ParseResult`` is returned.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used by ``self.get_href`` to resolve hrefs.
    :return: the populated ``ParseResult``.
    """
    def absolute(href):
        return self.get_href(href, base_url)

    def profile_videos(href):
        # Profile links are pointed at the '/videos/' listing of the profile.
        return self.get_href(href.replace('.html', '/videos/'), base_url)

    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level([('div', 'class', 'contents videos')])
    thumbs_rule.add_process_rule_level('a', {'href', 'title'})
    thumbs_rule.add_process_rule_level('img', {'src', 'alt'})
    thumbs_rule.set_attribute_modifier_function('href', absolute)
    thumbs_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(thumbs_rule)

    pages_rule = ParserRule()
    pages_rule.add_activate_rule_level([('div', 'id', 'pagination')])
    pages_rule.add_process_rule_level('a', {'href'})
    pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages_rule)

    # Only <script> blocks whose text mentions 'jwplayer' are kept.
    player_rule = ParserRule()
    player_rule.add_activate_rule_level([('div', 'class', 'contents')])
    player_rule.add_process_rule_level('script', {})
    player_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer' in text)
    parser.add_rule(player_rule)

    # Links in the info holder that do NOT point at a profile.
    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'info_holder')])
    tags_rule.add_activate_rule_level([('div', 'class', 'l')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_filter_function(
        'href', lambda href: '/profiles/' not in href)
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    # Links in the info holder that DO point at a profile.
    uploader_rule = ParserRule()
    uploader_rule.add_activate_rule_level([('div', 'class', 'info_holder')])
    uploader_rule.add_process_rule_level('a', {'href'})
    uploader_rule.set_attribute_filter_function(
        'href', lambda href: '/profiles/' in href)
    uploader_rule.set_attribute_modifier_function('href', profile_videos)
    parser.add_rule(uploader_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player_rule.is_result():
        media = UrlList()
        for script in player_rule.get_result():
            raw_value = self.quotes(script['data'], 'file:', ',')
            media_href = raw_value.strip(' "')
            media.add('default', URL(media_href + '*'))
        result.set_video(media.get_media_data())

        for uploader in uploader_rule.get_result(['data', 'href']):
            quoted_name = '"' + uploader['data'].strip() + '"'
            result.add_control(ControlInfo(quoted_name, URL(uploader['href'])))

        for tag in tags_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))
        return result

    if not thumbs_rule.is_result():
        return result

    for thumb in thumbs_rule.get_result(['href']):
        result.add_thumb(ThumbInfo(
            thumb_url=URL(thumb['src']),
            href=URL(thumb['href']),
            popup=thumb.get('alt', thumb.get('title', ''))))

    for page in pages_rule.get_result(['href', 'data']):
        result.add_page(ControlInfo(page['data'], URL(page['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    After ``self.proceed_parcing(parser, fname)`` a video result is returned
    when the ``<source>`` rule matched; otherwise a thumbnail listing is
    returned when the thumbnail rule matched.  With no match an empty
    ``ParseResult`` is returned.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used by ``self.get_href`` to resolve hrefs.
    :return: the populated ``ParseResult``.
    """
    def absolute(href):
        return self.get_href(href, base_url)

    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level([
        ('div', 'class', 'well well-sm hover'),
        ('div', 'class', 'channelContainer'),
    ])
    thumbs_rule.add_process_rule_level('a', {'href', 'title'})
    thumbs_rule.add_process_rule_level('img', {'src', 'alt'})
    thumbs_rule.set_attribute_modifier_function('href', absolute)
    thumbs_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(thumbs_rule)

    pages_rule = ParserRule()
    pages_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    pages_rule.add_process_rule_level('a', {'href'})
    pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages_rule)

    menu_rule = ParserRule()
    menu_rule.add_activate_rule_level([('ul', 'class', 'drop2 hidden-xs')])
    menu_rule.add_process_rule_level('a', {'href'})
    menu_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(menu_rule)

    player_rule = ParserRule()
    player_rule.add_activate_rule_level([('div', 'class', 'video-container')])
    player_rule.add_process_rule_level('source', {'src', 'label', 'res'})
    player_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(player_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level(
        [('div', 'class', 'm-t-10 overflow-hidden')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    # Links containing '#' are filtered out of the uploader rule.
    uploader_rule = ParserRule(collect_data=True)
    uploader_rule.add_activate_rule_level(
        [('div', 'class', 'pull-left user-container')])
    uploader_rule.add_process_rule_level('a', {'href'})
    uploader_rule.set_attribute_modifier_function('href', absolute)
    uploader_rule.set_attribute_filter_function(
        'href', lambda href: '#' not in href)
    parser.add_rule(uploader_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player_rule.is_result():
        media = UrlList()
        for source in player_rule.get_result():
            # Each source is stored under its 'res' attribute.
            media.add(source['res'], URL(source['src']))
        result.set_video(media.get_media_data(-1))

        for uploader in uploader_rule.get_result(['data', 'href']):
            quoted_name = '"' + uploader['data'].strip() + '"'
            result.add_control(ControlInfo(quoted_name, URL(uploader['href'])))

        for tag in tags_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))
        return result

    if not thumbs_rule.is_result():
        return result

    for thumb in thumbs_rule.get_result(['href']):
        result.add_thumb(ThumbInfo(
            thumb_url=URL(thumb['src']),
            href=URL(thumb['href']),
            popup=thumb.get('alt', thumb.get('title', ''))))

    for page in pages_rule.get_result(['href', 'data']):
        result.add_page(ControlInfo(page['data'], URL(page['href'])))

    for entry in menu_rule.get_result(['href', 'data']):
        target = entry['href']
        # The caption is the last path segment of the link.
        caption = target.strip('*').rpartition('/')[2]
        result.add_control(ControlInfo(caption, URL(target)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    After ``self.proceed_parcing(parser, fname)`` the first matching rule
    decides the kind of result, checked in this order: video page (player
    script or ``<source>`` tags), picture gallery, thumbnail listing.  With
    no match an empty ``ParseResult`` is returned.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used by ``self.get_href`` / ``util.get_href`` to
        resolve hrefs.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level([
        ('span', 'class', 'thumb_container_box short'),
        ('span', 'class', 'thumb_container_box long'),
    ])
    thumbs_rule.add_process_rule_level('a', {'href', 'title'})
    thumbs_rule.add_process_rule_level('img', {'src'})
    thumbs_rule.set_attribute_modifier_function(
        'href', lambda href: self.get_href(href, base_url))
    parser.add_rule(thumbs_rule)

    pages_rule = ParserRule()
    pages_rule.add_activate_rule_level(
        [('div', 'class', 'main-sectionpaging')])
    pages_rule.add_process_rule_level('a', {'href'})
    pages_rule.set_attribute_modifier_function(
        'href', lambda href: util.get_href(href, base_url))
    parser.add_rule(pages_rule)

    menu_rule = ParserRule()
    menu_rule.add_activate_rule_level(
        [('ul', 'class', 'simple-list simple-list--channels')])
    menu_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(menu_rule)

    player_rule = ParserRule()
    player_rule.add_activate_rule_level([('video', '', '')])
    player_rule.add_process_rule_level('source', {'src', 'id'})
    player_rule.set_attribute_modifier_function('src', lambda src: src + '*')
    parser.add_rule(player_rule)

    # Only <script> blocks whose text contains 'shows:' are kept.
    script_rule = ParserRule()
    script_rule.add_activate_rule_level([('body', '', '')])
    script_rule.add_process_rule_level('script', {})
    script_rule.set_attribute_filter_function(
        'data', lambda text: 'shows:' in text)
    parser.add_rule(script_rule)

    # Thumbnail sources are rewritten by dropping the '/thumbs/' segment.
    pictures_rule = ParserRule()
    pictures_rule.add_activate_rule_level([('div', 'id', 'slideshow')])
    pictures_rule.add_process_rule_level('a', {'index'})
    pictures_rule.add_process_rule_level('img', {'src'})
    pictures_rule.set_attribute_modifier_function(
        'src', lambda src: src.replace('/thumbs/', '/'))
    parser.add_rule(pictures_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'added')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function(
        'href', lambda href: util.get_href(href, base_url))
    parser.add_rule(tags_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if script_rule.is_result() or player_rule.is_result():
        stream_hrefs = set()
        default_href = None

        for script in script_rule.get_result():
            compact = script['data'].replace(' ', '')
            pending = up.unquote(self.quotes(compact, '"streams":[', ']'))
            # Consume one "file":"..." entry per iteration.
            while '"file":"' in pending:
                after_marker = pending.partition('"file":"')[2]
                value, _, pending = after_marker.partition('"')
                stream_hrefs.add(value + '*')

        for source in player_rule.get_result():
            stream_hrefs.add(source['src'])
            # A <source> without an 'id' attribute becomes the default.
            if 'id' not in source:
                default_href = source['src']

        if not stream_hrefs:
            return result

        if default_href is not None:
            stream_hrefs.discard(default_href)
        else:
            default_href = stream_hrefs.pop()

        media = UrlList()
        media.add('Default', URL(default_href))
        for href in stream_hrefs:
            # Label: the four characters before the final character.
            media.add(href[-5:-1], URL(href))
        result.set_video(media.get_media_data())

        for tag in tags_rule.get_result(['href']):
            caption = tag['data'].strip()
            result.add_control(ControlInfo(caption, URL(tag['href'])))
        return result

    if pictures_rule.is_result():
        result.set_type('pictures')
        first_picture = URL(pictures_rule.get_result()[0]['src'] + '*')
        gallery_dir = first_picture.get_path(base=Setting.base_dir)
        result.set_gallery_path(gallery_dir)

        for image in pictures_rule.get_result():
            number = int(image['index'])
            picture = FullPictureInfo(
                abs_href=URL(image['src'] + '*'),
                rel_name='pic{0:03d}.jpg'.format(number))
            picture.set_base(gallery_dir)
            result.add_full(picture)

        for tag in tags_rule.get_result(['href']):
            caption = tag['data'].strip()
            result.add_control(ControlInfo(caption, URL(tag['href'])))
        return result

    if not thumbs_rule.is_result():
        return result

    for thumb in thumbs_rule.get_result(['href']):
        result.add_thumb(ThumbInfo(
            thumb_url=URL(thumb['src']),
            href=URL(thumb['href']),
            popup=thumb.get('title', '')))

    for page in pages_rule.get_result(['href', 'data']):
        result.add_page(ControlInfo(page['data'], URL(page['href'])))

    if len(menu_rule.get_result(['href'])) > 0:
        for entry in menu_rule.get_result(['href', 'data']):
            caption = entry.get('title', entry.get('data', ''))
            result.add_control(ControlInfo(caption, URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    After ``self.proceed_parcing(parser, fname)`` a video result is returned
    when the player-script rule matched; otherwise an ``'hrefs'`` listing
    (thumbnails, pages, menu controls) is returned when the thumbnail rule
    matched.  With no match an empty ``ParseResult`` is returned.

    The previously defined nested helper ``star_get_url`` was never called
    and has been removed.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used by ``self.get_href`` to resolve hrefs.
    :return: the populated ``ParseResult``.
    """
    def absolute(href):
        return self.get_href(href, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'videos_form')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'data-lazy-src'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'wp-pagenavi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_hrefs_rule)

    # Only <script> blocks whose text contains 'jwplayer(' are kept.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'videos_page')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer(' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'Categories')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Spaces are removed first so the "file:'" marker matches.
            media_href = self.quotes(
                item['data'].replace(' ', ''), "file:'", "'")
            urls.add('default', URL(media_href + '*'))
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href']):
            result.add_thumb(ThumbInfo(
                thumb_url=URL(item['data-lazy-src']),
                href=URL(item['href']),
                popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href']):
            href = item['href']
            # The caption is the second-to-last '/'-separated piece.
            label = href.split('/')[-2]
            result.add_control(ControlInfo(label, URL(href)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    After ``self.proceed_parcing(parser, fname)`` a video result is returned
    when the ``<source>`` rule matched; otherwise a thumbnail listing is
    returned when the thumbnail rule matched.  With no match an empty
    ``ParseResult`` is returned.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used by ``self.get_href`` to resolve hrefs.
    :return: the populated ``ParseResult``.
    """
    def absolute(href):
        return self.get_href(href, base_url)

    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level([('div', 'class', 'video-thumb')])
    thumbs_rule.add_process_rule_level('a', {'href'})
    thumbs_rule.add_process_rule_level('img', {'src', 'alt'})
    thumbs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs_rule)

    pages_rule = ParserRule()
    pages_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    pages_rule.add_process_rule_level('a', {'href'})
    pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages_rule)

    player_rule = ParserRule()
    player_rule.add_activate_rule_level([('video', '', '')])
    player_rule.add_process_rule_level('source', {'src', 'label', 'res'})
    parser.add_rule(player_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'tags-container')])
    tags_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(tags_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player_rule.is_result():
        media = UrlList()
        for source in player_rule.get_result(['src', 'res']):
            # Each source is stored under its 'res' attribute.
            media.add(source['res'], URL(source['src']))
        result.set_video(media.get_media_data(-1))

        for tag in tags_rule.get_result(['data', 'href']):
            # Tag hrefs are used as found, with '*' appended.
            target = URL(tag['href'] + '*')
            result.add_control(ControlInfo(tag['data'], target))
        return result

    if not thumbs_rule.is_result():
        return result

    for thumb in thumbs_rule.get_result(['href', 'src']):
        target = thumb['href']
        # Any category link on the page switches captions on.
        if '/category/' in target:
            result.set_caption_visible(True)
        result.add_thumb(ThumbInfo(
            thumb_url=URL(thumb['src']),
            href=URL(target),
            popup=thumb.get('alt', '')))

    for page in pages_rule.get_result(['href', 'data']):
        result.add_page(ControlInfo(page['data'], URL(page['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    After ``self.proceed_parcing(parser, fname)`` a video result is returned
    when the player-script rule matched; otherwise a thumbnail listing is
    returned when the thumbnail rule matched.  With no match an empty
    ``ParseResult`` is returned.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used by ``self.get_href`` to resolve hrefs.
    :return: the populated ``ParseResult``.
    """
    def absolute(href):
        return self.get_href(href, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'inner-block')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    startpage_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'id', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    # Only <script> blocks whose text contains 'jwplayer(' are kept.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'stage-video')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer(' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('ul', 'class', 'stats-list stats-list--plain')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Spaces are removed first so the 'file:"' marker matches.
            media_href = self.quotes(
                item['data'].replace(' ', ''), 'file:"', '"')
            urls.add('DEFAULT', URL(media_href))
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            # User links get their caption wrapped in double quotes.
            form = '"{0}"' if '/user/' in f['href'] else '{0}'
            result.add_control(
                ControlInfo(form.format(f['data']), URL(f['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(ThumbInfo(
                thumb_url=URL(item['src']),
                href=URL(item['href']),
                # Fixed: this was item.get('alt' ''), where the two adjacent
                # literals concatenate to 'alt' and no default is passed, so
                # a missing alt produced None instead of ''.
                popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build the parser rules, run them over ``fname`` and assemble a result.

    After ``self.proceed_parcing(parser, fname)`` the first matching rule
    decides the kind of result, checked in this order: video page, picture
    gallery, thumbnail listing.  With no match an empty ``ParseResult`` is
    returned.

    :param fname: passed unchanged to ``self.proceed_parcing``.
    :param base_url: URL used to resolve the hrefs found on the page.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level([('div', 'class', 'item-col col')])
    thumbs_rule.add_process_rule_level('a', {'href'})
    thumbs_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(thumbs_rule)

    pages_rule = ParserRule()
    pages_rule.add_activate_rule_level(
        [('nav', 'class', 'pagination-col col pagination')])
    pages_rule.add_process_rule_level('a', {'href'})
    pages_rule.set_attribute_modifier_function(
        'href', lambda href: 'http:/' + base_url.get_path() + href + '*')
    parser.add_rule(pages_rule)

    menu_rule = ParserRule()
    menu_rule.add_activate_rule_level(
        [('ul', 'class', 'simple-list simple-list--channels')])
    menu_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(menu_rule)

    player_rule = ParserRule()
    player_rule.add_activate_rule_level([('video', 'id', 'thisPlayer')])
    player_rule.add_process_rule_level('source', {'src'})
    player_rule.set_attribute_modifier_function('src', lambda src: src + '*')
    parser.add_rule(player_rule)

    # Thumbnail sources are rewritten by dropping the '/thumbs/' segment.
    pictures_rule = ParserRule()
    pictures_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
    pictures_rule.add_process_rule_level('a', {})
    pictures_rule.add_process_rule_level('img', {'src'})
    pictures_rule.set_attribute_modifier_function(
        'src', lambda src: src.replace('/thumbs/', '/'))
    parser.add_rule(pictures_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([
        ('div', 'class', 'tags-block'),
        ('div', 'class', 'submitter-container'),
    ])
    tags_rule.add_process_rule_level('a', {'href', 'title'})
    tags_rule.set_attribute_modifier_function(
        'href', lambda href: self.get_href(href, base_url))
    parser.add_rule(tags_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    def add_tag_controls():
        # Shared by the video and gallery branches: one control per tag
        # link, two (videos and galleries) for links to a user.
        for tag in tags_rule.get_result(['href']):
            # Fall back to the title attribute when the link text is blank.
            caption = tag['data'].strip() or tag['title']
            target = tag['href']
            if '/user/' not in target:
                result.add_control(ControlInfo(caption, URL(target)))
                continue
            head, _, user_id = target.rpartition('-')
            site = head.partition('/user/')[0]
            result.add_control(ControlInfo(
                '"' + caption + ' videos"',
                URL(site + '/uploads-by-user/' + user_id)))
            result.add_control(ControlInfo(
                '"' + caption + ' gals"',
                URL(site + '/uploads-by-user/' + user_id + '?photos=1')))

    if player_rule.is_result():
        media = UrlList()
        for source in player_rule.get_result():
            media.add('default', URL(source['src']))
        result.set_video(media.get_media_data())
        add_tag_controls()
        return result

    if pictures_rule.is_result():
        result.set_type('pictures')
        first_picture = URL(pictures_rule.get_result()[0]['src'] + '*')
        gallery_dir = first_picture.get_path(base=Setting.base_dir)
        result.set_gallery_path(gallery_dir)

        for image in pictures_rule.get_result():
            source = image['src']
            picture = FullPictureInfo(
                abs_href=URL(source + '*'),
                rel_name=source.rpartition('/')[2])
            picture.set_base(gallery_dir)
            result.add_full(picture)

        add_tag_controls()
        return result

    if not thumbs_rule.is_result():
        return result

    for thumb in thumbs_rule.get_result(['href']):
        result.add_thumb(ThumbInfo(
            thumb_url=URL(thumb['src']),
            href=URL(thumb['href']),
            popup=thumb.get('alt', '')))

    for page in pages_rule.get_result(['href', 'data']):
        result.add_page(ControlInfo(page['data'], URL(page['href'])))

    for entry in menu_rule.get_result(['href', 'data']):
        caption = entry.get('title', entry.get('data', ''))
        result.add_control(ControlInfo(caption, URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and return a ``ParseResult``.

    Rules are registered for thumbnails, pagination, the category
    ``<select>``, the player script, the video info links and the profile
    navigation, then ``self.proceed_parcing`` runs them over ``fname``.

    If the video rule matched, the result carries the video sources plus
    one control per info link. Otherwise thumbnails, pages and category
    controls are added when the thumbnail rule matched.

    :param fname: file handed to ``self.proceed_parcing``.
    :param base_url: URL passed to ``self.get_href`` for every captured link.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        # Single place where captured links are run through get_href.
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('article', 'class', 'teaser singleLink hasButtonRow'),
        ('article', 'class', 'activity video hasButtonFooter'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'data-lazysrc', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    startpage_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([
        ('nav', 'class', 'clearfix pagination bottom'),
        ('nav', 'class', 'range rangeCount-2 clearfix'),
    ])
    startpage_pages_rule.add_process_rule_level('a', {'href', 'data-href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    startpage_pages_rule.set_attribute_modifier_function('data-href', absolute)
    parser.add_rule(startpage_pages_rule)

    startpage_categories_rule = ParserRule()
    startpage_categories_rule.add_activate_rule_level([
        ('select', 'id', 'input_selectCategories'),
    ])
    startpage_categories_rule.add_process_rule_level('option', {'value'})
    startpage_categories_rule.set_attribute_modifier_function('value', absolute)
    parser.add_rule(startpage_categories_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'playerWrapper')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'sources:' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('dl', 'class', 'group')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    # NOTE: this rule is registered, but its matches are not turned into
    # controls anywhere below.
    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([
        ('nav', 'class', 'profileNav clearfix buttonRow'),
    ])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    gallery_user_rule.set_attribute_modifier_function('href', absolute)
    gallery_user_rule.set_attribute_filter_function(
        'href', lambda x: '#videos' in x)
    parser.add_rule(gallery_user_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Drop spaces and backslashes so the sources map can be cut out
            # with plain string markers.
            script = item['data'].replace(' ', '').replace('\\', '')
            sources = self.quotes(script, 'sources:{"', '"},').split('","')
            for source in sources:
                label, _, link = source.partition('":"')
                urls.add(label, URL(self.get_href(link, base_url)))
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(
                    thumb_url=URL(item.get('data-lazysrc', item['src'])),
                    href=URL(item['href']),
                    # Fixed: the original read item.get('alt' ''), i.e.
                    # item.get('alt'), so a missing alt produced None.
                    popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            href = item.get('data-href', item['href'])
            # The page label is the last path segment of the link.
            result.add_page(
                ControlInfo(href.rpartition('/')[2].strip('*'), URL(href)))

        for item in startpage_categories_rule.get_result():
            result.add_control(
                ControlInfo(item['data'], URL(item['value'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and return a ``ParseResult``.

    Registers rules for thumbnails, pagination, the player script, the
    info links and the uploader block, then runs them over ``fname`` via
    ``self.proceed_parcing``.

    A matched video rule yields the ``video_url`` flashvar as the single
    'default' source plus uploader and info-link controls. Otherwise
    thumbnails and pages are added when the thumbnail rule matched.

    :param fname: file handed to ``self.proceed_parcing``.
    :param base_url: URL passed to ``self.get_href`` for captured links.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    # Thumbnails.
    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'item '),
                                    ('div', 'class', 'list-categories')])
    thumbs.add_process_rule_level('a', {'href', 'title'})
    thumbs.add_process_rule_level('img', {'data-original', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('src', absolute)
    parser.add_rule(thumbs)

    # Pagination: only <li class="page"> entries are kept.
    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'pagination')])
    pages.add_process_rule_level('li', {'class'})
    pages.add_process_rule_level('a', {'href', 'data-query'})
    pages.set_attribute_filter_function('class', lambda x: x == 'page')
    pages.set_attribute_modifier_function(
        'data-query', lambda x: x.partition(':')[2])
    pages.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages)

    # Player script containing the flashvars object.
    video = ParserRule()
    video.add_activate_rule_level([('div', 'class', 'player')])
    video.add_process_rule_level('script', {})
    video.set_attribute_filter_function(
        'data', lambda text: 'flashvars' in text)
    parser.add_rule(video)

    # Links in the video info block.
    info_links = ParserRule()
    info_links.add_activate_rule_level([('div', 'class', 'info')])
    info_links.add_process_rule_level('a', {'href'})
    info_links.set_attribute_modifier_function('href', absolute)
    parser.add_rule(info_links)

    # Uploader block; 'videos/' is appended to the link before resolving.
    uploader = ParserRule()
    uploader.add_activate_rule_level([('div', 'class', 'block-user')])
    uploader.add_process_rule_level('a', {'href', 'title'})
    uploader.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x + 'videos/', base_url))
    parser.add_rule(uploader)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video.is_result():
        urls = UrlList()
        for entry in video.get_result():
            compact = (entry['data'].replace(' ', '')
                       .replace('\n', '').replace('\t', ''))
            # Split the flashvars body into key/value pairs.
            fv = {}
            for pair in self.quotes(compact, 'flashvars={', '};').split(','):
                key, _, value = pair.partition(':')
                fv[key] = value.strip("'\"")
            urls.add('default', URL(fv['video_url'] + '*'))
        result.set_video(urls.get_media_data())

        for entry in uploader.get_result(['title', 'href']):
            caption = '"' + entry['title'].strip() + '"'
            result.add_control(ControlInfo(caption, URL(entry['href'])))

        for entry in info_links.get_result(['data', 'href']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))
        return result

    if thumbs.is_result():
        for entry in thumbs.get_result(['href']):
            popup = entry.get('alt', entry.get('title', ''))
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['data-original']),
                          href=URL(entry['href']),
                          popup=popup))

        for entry in pages.get_result(['href', 'data-query']):
            result.add_page(
                ControlInfo(entry['data-query'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and return a ``ParseResult``.

    Registers rules for thumbnails, pagination, the menu links, the player
    ``<iframe>`` and the extras links, then runs them over ``fname`` via
    ``self.proceed_parcing``.

    When an iframe was found, its page is downloaded with ``load`` and
    scanned for ``.mp4`` sources. Two embed flavours are recognised by
    their URL. Loader errors are printed and that iframe is skipped.
    Otherwise thumbnails, pages and menu controls are added when the
    thumbnail rule matched.

    :param fname: file handed to ``self.proceed_parcing``.
    :param base_url: URL passed to ``self.get_href`` for captured links.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    startpage_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'loop-nav-inner')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('ul', 'class', 'menu')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'section-content'),
                                        ('div', 'id', 'video')])
    video_rule.add_process_rule_level('iframe', {'src'})
    video_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        # The leftover debug print of every iframe item was removed, and the
        # inner loops no longer rebind the outer loop variable.
        for item in video_rule.get_result():
            src = item['src']
            if '.video/embed' in src:
                try:
                    r = load(URL(src))
                    setup = self.quotes(r.text,
                                        'jwplayer("vplayer").setup(',
                                        ")").replace(' ', '')
                    sources = self.quotes(setup, 'sources:[{',
                                          '}],').split('},{')
                    for source in sources:
                        if '.mp4' in source:
                            file = self.quotes(source, 'file:"', '"')
                            label = self.quotes(source, 'label:"', '"')
                            urls.add(label, URL(file + '*'))
                except LoaderError as err:
                    print(err)
            elif 'javfinder.com/' in src:
                try:
                    r = load(URL(src))
                    # Everything before the first <source src="..."> is
                    # irrelevant, hence the [1:].
                    for chunk in r.text.split('<source src="')[1:]:
                        tag = chunk.partition('>')[0]
                        if '.mp4' in tag:
                            file = tag.partition('"')[0]
                            label = self.quotes(tag, 'res="', '"')
                            urls.add(label, URL(file + '*'))
                except LoaderError as err:
                    print(err)
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(f['data'].strip(), URL(f['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and return a ``ParseResult``.

    Registers rules for thumbnails, pagination, the sub-menu links, the
    player script, tag/category links and member links, then runs them
    over ``fname`` via ``self.proceed_parcing``.

    A matched video rule yields the ``video_url`` value as the 'default'
    source, a control for the first member link (if any) and one control
    per tag/category link. Otherwise thumbnails, pages and sub-menu
    controls are added when the thumbnail rule matched.

    :param fname: file handed to ``self.proceed_parcing``.
    :param base_url: URL passed to ``self.get_href`` for captured links.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    # Thumbnails.
    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'image ')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs)

    # Pagination.
    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'pagination')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages)

    # Sub-menu links; both class spellings are accepted.
    menu_links = ParserRule()
    menu_links.add_activate_rule_level([
        ('div', 'class', 'sub_menu dark-menu'),
        ('div', 'class', 'sub-menu dark-menu'),
    ])
    menu_links.add_process_rule_level('a', {'href', 'title'})
    menu_links.set_attribute_modifier_function('href', absolute)
    parser.add_rule(menu_links)

    # Player script.
    video = ParserRule()
    video.add_activate_rule_level([('div', 'class', 'player')])
    video.add_process_rule_level('script', {})
    video.set_attribute_filter_function(
        'data', lambda text: 'flashvars' in text)
    parser.add_rule(video)

    # Tag and category links of the video page.
    tag_links = ParserRule()
    tag_links.add_activate_rule_level([('div', 'class', 'block_content')])
    tag_links.add_process_rule_level('a', {'href'})
    tag_links.set_attribute_filter_function(
        'href', lambda x: '/tags/' in x or '/categories/' in x)
    tag_links.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tag_links)

    # Member links of the video page (href is left unmodified).
    member_links = ParserRule()
    member_links.add_activate_rule_level([('div', 'class', 'block_content')])
    member_links.add_process_rule_level('a', {'href'})
    member_links.set_attribute_filter_function(
        'href', lambda x: '/members/' in x)
    parser.add_rule(member_links)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video.is_result():
        urls = UrlList()
        for entry in video.get_result():
            compact = entry['data'].replace(' ', '')
            urls.add('default', URL(self.quotes(compact, "video_url:'", "'")))
        result.set_video(urls.get_media_data())

        if member_links.is_result():
            first = member_links.get_result()[0]
            username = first.get('data', '***')
            # The member id is the last path segment of the link.
            user = first['href'].rstrip('/').rpartition('/')[2]
            target = URL('http://gobdsm.com/members/' + user +
                         '/public_videos/')
            result.add_control(ControlInfo('"' + username + '"', target))

        for entry in tag_links.get_result(['data', 'href']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))
        return result

    if thumbs.is_result():
        for entry in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['src']),
                          href=URL(entry['href']),
                          popup=entry.get('alt', '')))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        for entry in menu_links.get_result(['href', 'data']):
            caption = entry.get('title', entry.get('data', ''))
            result.add_control(ControlInfo(caption, URL(entry['href'])))

    return result
def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
               base_url: URL):
    """Fill ``result`` from a parsed page and return it.

    If ``div#mediaspace`` holds a script mentioning ``jwplayer(``, the page
    is treated as a video page: the ``file:"..."`` value becomes the video
    source and links from ``div.more-content`` become controls, with
    ``/user/`` links first (quoted) and the remaining links after them.

    Otherwise the page is treated as a thumbnail page: ``div.post`` blocks
    become thumbnails, ``div.site-cats`` links become controls and numeric
    ``div.pagination`` links become pages.

    :param soup: parsed document.
    :param result: object that receives video, controls, thumbs and pages.
    :param base_url: URL passed to ``get_url`` for every link.
    :return: the same ``result`` object.
    """

    def has_player_setup(text):
        # BeautifulSoup passes tag.string, which is None for a <script>
        # without a single text child; the unguarded `in` raised TypeError.
        return text is not None and 'jwplayer(' in text

    # Video page.
    content = soup.find('div', {'id': 'mediaspace'})
    if content is not None:
        urls = UrlList()
        is_video = False
        for script in _iter(content.find_all('script',
                                             text=has_player_setup)):
            data = str(script.string).replace(' ', '')
            file = quotes(data, 'file:"', '"')
            urls.add('DEFAULT', get_url(file, base_url))
            is_video = True

        if is_video:
            result.set_video(urls.get_media_data())

            # User links are added immediately; tag links are collected and
            # appended afterwards so that they follow the user controls.
            tags = []
            for item in _iter(soup.find_all('div',
                                            {'class': 'more-content'})):
                for href in _iter(item.find_all('a')):
                    if href.string is None:
                        continue
                    target = get_url(href.attrs['href'], base_url)
                    if '/user/' in href.attrs['href']:
                        result.add_control(
                            ControlInfo('"' + str(href.string) + '"', target))
                    else:
                        tags.append(ControlInfo(str(href.string), target))
            for item in tags:
                result.add_control(item)
            return result

    # Thumbnail page.
    for thumbnail in _iter(soup.find_all('div', {'class': 'post'})):
        href = get_url(thumbnail.a.attrs['href'], base_url)
        description = thumbnail.a.img.attrs['alt']
        thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

        duration = thumbnail.find('b', {'class': 'post-duration'})
        dur_time = '' if duration is None else str(duration.string)

        result.add_thumb(
            ThumbInfo(thumb_url=thumb_url,
                      href=href,
                      popup=description,
                      labels=[{
                          'text': dur_time,
                          'align': 'top right'
                      }, {
                          'text': description,
                          'align': 'bottom center'
                      }]))

    tags_container = soup.find('div', {'class': 'site-cats'})
    if tags_container is not None:
        for tag in _iter(tags_container.find_all('a')):
            result.add_control(
                ControlInfo(str(tag.string),
                            get_url(tag.attrs['href'], base_url)))

    pagination = soup.find('div', {'class': 'pagination'})
    if pagination is not None:
        for page in _iter(pagination.find_all('a')):
            if page.string is not None and page.string.isdigit():
                result.add_page(
                    ControlInfo(page.string,
                                get_url(page.attrs['href'], base_url)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and return a ``ParseResult``.

    Registers rules for thumbnails, pagination, the player script and the
    info links, then runs them over ``fname`` via ``self.proceed_parcing``.

    For a video page every flashvar whose value starts with ``http://``
    and ends in an ``mp4`` extension becomes a source. It is labelled by
    the matching ``<name>_text`` flashvar when present, else by the
    flashvar name. Sources are added in reverse-sorted label order.
    Otherwise thumbnails and pages are added when the thumbnail rule
    matched.

    :param fname: file handed to ``self.proceed_parcing``.
    :param base_url: URL passed to ``self.get_href`` for captured links.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    # Thumbnails.
    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'item ')])
    thumbs.add_process_rule_level('a', {'href', 'title'})
    thumbs.add_process_rule_level('img', {'data-original', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('src', absolute)
    parser.add_rule(thumbs)

    # Pagination.
    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'pagination-holder')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages)

    # Player script containing the flashvars object.
    video = ParserRule()
    video.add_activate_rule_level([('div', 'class', 'player')])
    video.add_process_rule_level('script', {})
    video.set_attribute_filter_function(
        'data', lambda text: 'flashvars' in text)
    parser.add_rule(video)

    # Links in the video info block.
    info_links = ParserRule()
    info_links.add_activate_rule_level([('div', 'class', 'info')])
    info_links.add_process_rule_level('a', {'href'})
    info_links.set_attribute_modifier_function('href', absolute)
    parser.add_rule(info_links)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video.is_result():
        urls = UrlList()
        for entry in video.get_result():
            compact = (entry['data'].replace(' ', '')
                       .replace('\n', '').replace('\t', ''))
            fv = {}
            for pair in self.quotes(compact, 'flashvars={', '};').split(','):
                key, _, value = pair.partition(':')
                fv[key] = value.strip("'\"")

            # Collect the mp4 links, keyed by their display label.
            files = {}
            for name, value in fv.items():
                is_http = value.startswith('http://')
                if is_http and value.rpartition('.')[2].strip('/') == 'mp4':
                    files[fv.get(name + '_text', name)] = value

            for label in sorted(files, reverse=True):
                urls.add(label, URL(files[label]))
        result.set_video(urls.get_media_data())

        for entry in info_links.get_result(['data', 'href']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))
        return result

    if thumbs.is_result():
        for entry in thumbs.get_result(['href']):
            popup = entry.get('alt', entry.get('title', ''))
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['data-original']),
                          href=URL(entry['href']),
                          popup=popup))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and return a ``ParseResult``.

    Prints a notice that the site is slow, registers rules for thumbnails,
    pagination, the link list, the player script and the video-page item
    links, then runs them over ``fname`` via ``self.proceed_parcing``.

    A matched video rule yields the ``video_url`` value as the 'default'
    source plus one control per item link. Otherwise thumbnails, pages and
    list-link controls are added when the thumbnail rule matched. List
    links are labelled with the second-to-last ``/``-separated piece of
    their URL.

    :param fname: file handed to ``self.proceed_parcing``.
    :param base_url: URL passed to ``self.get_href`` for captured links.
    :return: the populated ``ParseResult``.
    """
    print('This site is very slow, be patient')

    def absolute(link):
        return self.get_href(link, base_url)

    # The nested helper star_get_url() was never called and has been removed.
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'item ')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'pagination-holder')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'player')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'flashvars' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'item')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            file = self.quotes(item['data'].replace(' ', ''),
                               "video_url:'", "'")
            urls.add('default', URL(file))
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href']):
            href = item['href']
            # Label taken from the URL itself: second-to-last piece after
            # splitting on '/'.
            label = href.split('/')[-2]
            result.add_control(ControlInfo(label, URL(href)))

    return result
def parse_soup(self, soup: BeautifulSoup, result: ParseResult,
               base_url: URL):
    """Fill ``result`` from a parsed page and return it.

    If the page has a ``<video>`` element it is treated as a video page:
    each ``<source>`` becomes a video source keyed by its ``res``
    attribute, the uploader block becomes a quoted control pointing at
    ``<profile>/videos`` and the ``div.tags-container`` links become
    controls.

    Otherwise ``div.video-thumb`` blocks become thumbnails with duration,
    description and quality labels, the ``ul.drop2`` links become controls
    and numeric ``ul.pagination`` links become pages.

    :param soup: parsed document.
    :param result: object that receives video, controls, thumbs and pages.
    :param base_url: URL passed to ``get_url`` for every link.
    :return: the same ``result`` object.
    """
    # Video page.
    video = soup.find('video')
    if video is not None:
        urls = UrlList()
        for source in _iter(video.find_all('source')):
            urls.add(source.attrs['res'],
                     get_url(source.attrs['src'], base_url))
        result.set_video(urls.get_media_data(-1))

        user = soup.find('div', {'class': 'pull-left user-container'})
        if user is not None:
            user_strings = list(user.stripped_strings)
            # Same text as '"{0} {1}"' for two or more strings, but does
            # not raise IndexError when the block holds fewer.
            label = '"{0}"'.format(' '.join(user_strings[:2]))
            # The filter receives None for <a> tags without href, so the
            # None case is excluded before the substring test.
            href = user.find(
                'a', href=lambda x: x is not None and '#' not in x)
            if href is not None:
                result.add_control(
                    ControlInfo(
                        label,
                        get_url(href.attrs['href'] + '/videos', base_url)))

        for tag_container in _iter(
                soup.find_all('div', {'class': 'tags-container'})):
            for href in _iter(tag_container.find_all('a')):
                if href.string is not None:
                    result.add_control(
                        ControlInfo(str(href.string),
                                    get_url(href.attrs['href'], base_url)))
        return result

    # Thumbnail page.
    for thumbnail in soup.find_all('div', {'class': 'video-thumb'}):
        href = get_url(thumbnail.a.attrs['href'], base_url)
        description = thumbnail.a.img.attrs['alt']
        thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

        duration = thumbnail.find('span', {'class': "time"})
        dur_time = '' if duration is None else str(duration.string)

        quality = thumbnail.find('span', {'class': "quality"})
        qual = '' if quality is None else str(quality.string)

        result.add_thumb(
            ThumbInfo(thumb_url=thumb_url,
                      href=href,
                      popup=description,
                      labels=[{
                          'text': dur_time,
                          'align': 'top right'
                      }, {
                          'text': description,
                          'align': 'bottom center'
                      }, {
                          'text': qual,
                          'align': 'top left'
                      }]))

    tags = soup.find('ul', {'class': 'drop2 hidden-xs'})
    if tags is not None:
        for tag in tags.find_all('a'):
            result.add_control(
                ControlInfo(str(tag.string).strip(),
                            get_url(tag.attrs['href'], base_url)))

    pagination = soup.find('ul', {'class': 'pagination'})
    if pagination is not None:
        for page in pagination.find_all('a'):
            # page.string is None for links with nested markup (e.g. icon
            # arrows); those cannot be page numbers.
            if page.string is not None and page.string.isdigit():
                result.add_page(
                    ControlInfo(page.string,
                                get_url(page.attrs['href'], base_url)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and return a ``ParseResult``.

    Registers rules for thumbnails (``<article>``), pagination, category
    links, ``<video>`` sources and tag links, then runs them over
    ``fname`` via ``self.proceed_parcing``.

    A matched video rule yields each ``<source>`` as a 'default' source
    plus one control per tag link, labelled by its ``title``. Otherwise
    thumbnails, pages and category controls are added when the thumbnail
    rule matched. Category labels are the link titles with the site's
    boilerplate phrases stripped.

    :param fname: file handed to ``self.proceed_parcing``.
    :param base_url: URL passed to ``self.get_href`` for captured links.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    def short_title(txt=''):
        # Lower-case, drop spaces, then remove the boilerplate phrases in
        # this exact order; 'inhd' is finally turned into 'HD'.
        txt = txt.lower()
        for old, new in ((' ', ''),
                         ('sexfilmsindecategorie', ''),
                         ('insexfilms', ''),
                         ('sexfilms', ''),
                         ('inhd', 'HD')):
            txt = txt.replace(old, new)
        return txt

    parser = SiteParser()

    # Thumbnails.
    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('article', '', '')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs)

    # Pagination.
    pages = ParserRule()
    pages.add_activate_rule_level([('ul', 'class', 'pagination2')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages)

    # Category links (href is left unmodified by the rule).
    categories = ParserRule()
    categories.add_activate_rule_level([('section', 'class', 'categories')])
    categories.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(categories)

    # <source> elements of the <video> tag; '*' is appended to each src.
    video = ParserRule()
    video.add_activate_rule_level([('video', '', '')])
    video.add_process_rule_level('source', {'src'})
    video.set_attribute_modifier_function('src', lambda txt: txt + '*')
    parser.add_rule(video)

    # Tag links of the video page.
    tag_links = ParserRule()
    tag_links.add_activate_rule_level([('div', 'class', 'tags')])
    tag_links.add_process_rule_level('a', {'href', 'title'})
    tag_links.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tag_links)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video.is_result():
        urls = UrlList()
        for entry in video.get_result():
            urls.add('default', URL(entry['src']))
        result.set_video(urls.get_media_data())

        for entry in tag_links.get_result(['href']):
            result.add_control(ControlInfo(entry['title'], URL(entry['href'])))
        return result

    if thumbs.is_result():
        for entry in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['src']),
                          href=URL(entry['href']),
                          popup=entry.get('alt', '')))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        for entry in categories.get_result(['href', 'data']):
            caption = short_title(entry.get('title', ''))
            result.add_control(ControlInfo(caption, URL(entry['href'] + '*')))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and return a ``ParseResult``.

    Registers rules for thumbnails, pagination, the tag cloud, the player
    ``<iframe>`` and the extras links, then runs them over ``fname`` via
    ``self.proceed_parcing``.

    For a video page the iframe page is downloaded with ``load``, the
    playlist URL is taken from its ``jwplayer().load('...')`` call, and
    every ``<jwplayer:source>`` in the playlist's first ``<item>`` becomes
    a source. Loader errors are printed and that iframe is skipped.
    Otherwise thumbnails, pages and tag controls are added when the
    thumbnail rule matched.

    :param fname: file handed to ``self.proceed_parcing``.
    :param base_url: URL passed to ``self.get_href`` for captured links.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'wp-pagenavi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'tagcloud')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'videoContainer')])
    video_rule.add_process_rule_level('iframe', {'src'})
    video_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            try:
                # First request: the iframe page. Second: the playlist it
                # references.
                r = load(URL(item['src']))
                r = load(
                    URL(self.quotes(r.text, "jwplayer().load('", "'") + '*'))
                source = self.quotes(r.text, '<item>', '</item>').strip()
                for piece in source.split('<jwplayer:source file="'):
                    # Fixed: the original tested `l is ''`, an identity
                    # comparison with a literal that only works by
                    # interpreter accident; test for emptiness instead.
                    if not piece:
                        continue
                    url = piece.partition('"')[0]
                    label = self.quotes(piece, 'label="', '"')
                    urls.add(label, URL(url + '*'))
            except LoaderError as err:
                print(err)
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'].strip(), URL(f['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page or JSON reply stored in ``fname``.

    Three kinds of input are handled:

    * video page: ``quality_*`` flashvars become sources, followed by
      uploader and info-link controls;
    * JSON page (``base_url`` contains ``format=json``): ``fname`` is read
      as JSON and its items become thumbnails; page buttons are added once;
    * first thumbnail page: thumbnails from HTML plus a single 'next' page
      whose URL requests page 2 in JSON format and carries the original
      URL in ``xhr_data``.

    :param fname: HTML file for ``self.proceed_parcing``, or a JSON file
        when ``base_url`` contains ``format=json``.
    :param base_url: URL of the page; passed to ``self.get_href`` and used
        to detect the JSON mode and the current page number.
    :return: the populated ``ParseResult``.
    """
    xhr_page = base_url.contain('format=json')

    def absolute(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('div', 'class',
         'relative margin-auto video-box-wrapper normal-box'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'alt', 'data-srcmedium'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'video-container')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'flashvars=' in text.replace(' ', ''))
    parser.add_rule(video_rule)

    video_href_rule = ParserRule()
    video_href_rule.add_activate_rule_level(
        [('div', 'class', 'ibInfo js_ibInfo')])
    video_href_rule.add_process_rule_level('a', {'href'})
    video_href_rule.set_attribute_filter_function(
        'href', lambda x: 'javascript' not in x)
    video_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(video_href_rule)

    video_user_rule = ParserRule()
    video_user_rule.add_activate_rule_level([('div', 'class', 'ibLine1')])
    video_user_rule.add_process_rule_level('a', {'href'})
    video_user_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(video_user_rule)

    # JSON replies are not HTML, so the rule parser is skipped for them.
    if not xhr_page:
        self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            flashvars = self.quotes(item['data'].replace(' ', ''),
                                    'flashvars={',
                                    '};').replace('\\', '').split(',"')
            for v in flashvars:
                if v.startswith('quality_'):
                    label = self.quotes(v, 'quality_', '"')
                    file = self.quotes(v, ':"', '"')
                    urls.add(label, URL(file + '*'))
        result.set_video(urls.get_media_data(-1))

        for f in video_user_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo('"' + f['data'] + '"', URL(f['href'])))

        for f in video_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if xhr_page:

        def parce_data(data, buttons_added, base_url, items_name='items'):
            # Add thumbs from one JSON section; add page buttons only the
            # first time. Returns the updated buttons_added flag.
            xhr_data = base_url.xhr_data
            items = data[items_name]
            nav = data['navigation']
            last_page = nav['lastPage']
            if last_page is None:
                last_page = 1
            curr_page = int(base_url.get().rpartition('page=')[2])
            pattern = nav['urlPattern'].replace('[%pageId%]', '{}') + '*'

            for item in items:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['thumb_url']),
                              href=URL(item['video_link'] + '*'),
                              popup=item['specialchars_title']))

            if not buttons_added:
                # First page, a window of pages around the current one,
                # then the last page.
                result.add_page(ControlInfo('1', xhr_data['base_url']))
                p_from = max(curr_page - 5, 2)
                p_to = min(curr_page + 5, last_page)
                for x in range(p_from, p_to):
                    url = URL(pattern.format(x), xhr_data=xhr_data)
                    caption = str(x) + '(this)' if x == curr_page else str(x)
                    result.add_page(ControlInfo(caption, url))
                last_url = URL(pattern.format(last_page), xhr_data=xhr_data)
                result.add_page(ControlInfo(str(last_page), last_url))
                buttons_added = True
            return buttons_added

        with open(fname) as fp:
            try:
                json_data = json.load(fp)
                buttons_added = False
                if base_url.contain('/keyword/'):
                    for section in json_data:
                        buttons_added = parce_data(section, buttons_added,
                                                   base_url)
                elif base_url.contain('/users/'):
                    buttons_added = parce_data(json_data['response'],
                                               buttons_added,
                                               base_url,
                                               items_name='videos')
                else:
                    for key in json_data:
                        buttons_added = parce_data(json_data[key],
                                                   buttons_added, base_url)
            except ValueError as err:
                # Malformed JSON or an unparsable page number: keep what was
                # collected so far, but report it instead of hiding it.
                print(err)
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['data-srcmedium']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        # Further pages are fetched as JSON; the original URL travels along
        # in xhr_data so that the JSON branch can link back to page 1.
        xhr_data = {'base_url': base_url}
        next_url = URL(base_url.get() + '*', xhr_data=xhr_data)
        next_url.add_query([('format', 'json'), ('number_pages', '1'),
                            ('page', '2')])
        result.add_page(ControlInfo('next', next_url))

    return result
def parse_soup(self, soup: BeautifulSoup, result: ParseResult, base_url: URL):
    """Populate *result* from a parsed page and return it.

    A page whose ``div#content`` holds an inline script containing
    ``jwplayer(`` is handled as a video page: the stream URL, two controls
    for the uploader and the tag links are added. Any other page is handled
    as a thumbnail listing: thumbnails, menu links and numeric pagination
    links are added.

    :param soup: parsed document to read from.
    :param result: accumulator receiving video, thumbs, controls and pages.
    :param base_url: passed to ``get_url`` together with every link found.
    :return: the same *result* object.
    """
    # parse video page
    content = soup.find('div', {'id': 'content'})
    if content is not None:
        urls = UrlList()
        is_video = False
        # A <script> without inline text may hand None to the filter;
        # guard before the substring test so it cannot raise TypeError.
        player_scripts = content.find_all(
            'script',
            text=lambda x: x is not None and 'jwplayer(' in x)
        for script in _iter(player_scripts):
            data = str(script.string).replace(' ', '')
            file = quotes(data, '"file":"', '"')
            urls.add('DEFAULT', get_url(file, base_url))
            is_video = True

        if is_video:
            result.set_video(urls.get_media_data())

            # adding "user" to video
            user = soup.find('div', {'class': 'thumb-member-username'})
            # The username block may exist without an anchor inside it.
            user_link = None if user is None else user.find('a')
            if user_link is not None:
                href = user_link.attrs['href']
                username = href.rpartition('/')[2]
                result.add_control(
                    ControlInfo(
                        '"' + username + ' uploads"',
                        URL('http://motherless.com/u/' + username + '*')))
                result.add_control(
                    ControlInfo(
                        '"' + username + ' gals"',
                        URL('http://motherless.com/galleries/member/' +
                            username + '*')))

            # adding tags to video
            for item in _iter(
                    soup.find_all('div', {'id': 'media-tags-container'})):
                for href in _iter(item.find_all('a')):
                    if href.string is not None:
                        result.add_control(
                            ControlInfo(
                                str(href.string),
                                get_url(href.attrs['href'], base_url)))
            return result

    # parse thumbnail page
    for item in _iter(soup.find_all('div', {'class': ['content-inner']})):
        for thumbnail in _iter(item.find_all('div', {'class': 'thumb'})):
            href = get_url(thumbnail.a.attrs['href'], base_url)
            thumb_url = get_url(thumbnail.img.attrs['src'], base_url)

            duration = thumbnail.find('div', {'class': 'caption left'})
            dur_time = '' if duration is None else str(duration.string)

            caption = thumbnail.find('h2', {'class': 'caption title'})
            label = '' if caption is None else str(caption.string)

            user = thumbnail.find('a', {'class': 'caption left'})
            username = '' if user is None else str(user.string)

            # Entries whose duration caption contains 'x' are skipped
            # (presumably non-video items -- TODO confirm).
            if 'x' not in dur_time:
                result.add_thumb(
                    ThumbInfo(thumb_url=thumb_url,
                              href=href,
                              popup=label,
                              labels=[{
                                  'text': dur_time,
                                  'align': 'top right'
                              }, {
                                  'text': label,
                                  'align': 'bottom center'
                              }, {
                                  'text': username,
                                  'align': 'top left'
                              }]))

    # adding tags to thumbs
    tags = soup.find('div', {'class': 'dark-menu'})
    if tags is not None:
        for tag in _iter(tags.find_all('a')):
            result.add_control(
                ControlInfo(
                    str(tag.string).strip(),
                    get_url(tag.attrs['href'], base_url)))

    # adding pages to thumbs
    pagination = soup.find('div', {'class': 'pagination_link'})
    if pagination is not None:
        for page in _iter(pagination.find_all('a')):
            # .string is None when the anchor has no single text child
            # (e.g. an icon plus text), so check before calling isdigit().
            if page.string is not None and page.string.isdigit():
                result.add_page(
                    ControlInfo(page.string,
                                get_url(page.attrs['href'], base_url)))
    return result
def parse_soup(self, soup: BeautifulSoup, result: ParseResult, base_url: URL):
    """Populate *result* from a parsed page and return it.

    Three page shapes are tried in order, and the first one found wins:

    1. video page (``div.video``): every ``<source>`` becomes a stream keyed
       by its ``res`` attribute, and links in ``div.video_header`` become
       controls;
    2. thumbnail page (``div.videos cf``): thumbnails, tag links and numeric
       pagination links are added;
    3. categories page: one thumbnail per distinct ``div.catbox`` title.

    :param soup: parsed document to read from.
    :param result: accumulator receiving video, thumbs, controls and pages.
    :param base_url: passed to ``get_url`` together with every link found.
    :return: the same *result* object.
    """
    # parse video page
    video = soup.find('div', {'class': 'video'})
    if video is not None:
        urls = UrlList()
        for source in _iter(video.find_all('source')):
            urls.add(source.attrs['res'],
                     get_url(source.attrs['src'], base_url))
        result.set_video(urls.get_media_data(-1))

        for tag_container in _iter(
                soup.find_all('div', {'class': 'video_header'})):
            for href in _iter(tag_container.find_all('a')):
                if href.string is not None:
                    result.add_control(
                        ControlInfo(str(href.string),
                                    get_url(href.attrs['href'], base_url)))
        return result

    # parse thumbnail page
    thumbs_container = soup.find('div', {'class': 'videos cf'})
    if thumbs_container is not None:
        for thumbnail in _iter(
                thumbs_container.find_all('div', {'class': ['polaroid']})):
            href = get_url(thumbnail.a.attrs['href'], base_url)
            description = thumbnail.a.img.attrs['alt']
            thumb_url = get_url(thumbnail.img.attrs['data-src'], base_url)

            duration = thumbnail.find('div', {'class': "duration"})
            dur_time = '' if duration is None else str(duration.string)

            result.add_thumb(
                ThumbInfo(thumb_url=thumb_url,
                          href=href,
                          popup=description,
                          labels=[{
                              'text': dur_time,
                              'align': 'top right'
                          }, {
                              'text': description,
                              'align': 'bottom center'
                          }]))

        tags = soup.find('ul', {'class': 'tags cf'})
        if tags is not None:
            for tag in tags.find_all('a'):
                result.add_control(
                    ControlInfo(
                        str(tag.string).strip(),
                        get_url(tag.attrs['href'], base_url)))

        pagination = soup.find('div', {'class': 'pagination'})
        if pagination is not None:
            for page in pagination.find_all('a'):
                # .string is None when the anchor has no single text child
                # (e.g. an icon plus text), so check before calling isdigit().
                if page.string is not None and page.string.isdigit():
                    result.add_page(
                        ControlInfo(page.string,
                                    get_url(page.attrs['href'], base_url)))
        return result

    # parse categories page
    categories = set()  # titles already emitted, to drop duplicate boxes
    for category in _iter(soup.find_all('div', {'class': 'catbox'})):
        href = get_url(category.a.attrs['href'], base_url)
        thumb_url = get_url(category.img.attrs['data-src'], base_url)
        title = str(category.find('div', {'class': 'title'}).string)

        if title not in categories:
            result.add_thumb(
                ThumbInfo(thumb_url=thumb_url,
                          href=href,
                          popup=title,
                          labels=[{
                              'text': title,
                              'align': 'top right'
                          }]))
            categories.add(title)

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page *fname* and return a ``ParseResult``.

    Five rules are registered on one ``SiteParser`` (thumbnails, pagination,
    tag cloud, embedded player iframe, gallery links). If the player rule
    matched, each iframe is fetched with ``load`` and the jwplayer ``file``
    entry is extracted as the stream; otherwise the thumbnail listing is
    reported.

    :param fname: path handed to ``self.proceed_parcing``.
    :param base_url: passed to ``self.get_href`` for every link.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        # Single modifier shared by every rule below.
        return self.get_href(link, base_url)

    parser = SiteParser()

    # Thumbnails on listing pages.
    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'link-3col')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('src', absolute)
    parser.add_rule(thumbs)

    # Pagination links.
    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages)

    # Tag cloud links.
    tag_cloud = ParserRule()
    tag_cloud.add_activate_rule_level([('div', 'class', 'tagcloud')])
    tag_cloud.add_process_rule_level('a', {'href'})
    tag_cloud.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tag_cloud)

    # Player iframe on video pages; only fileone.tv embeds are kept.
    player = ParserRule()
    player.add_activate_rule_level([('div', 'class', 'player')])
    player.add_process_rule_level('iframe', {'src'})
    player.set_attribute_filter_function('src',
                                         lambda x: 'fileone.tv' in x)
    player.set_attribute_modifier_function('src', absolute)
    parser.add_rule(player)

    # Links shown next to the video.
    extras = ParserRule()
    extras.add_activate_rule_level([('div', 'id', 'extras')])
    extras.add_process_rule_level('a', {'href'})
    extras.set_attribute_modifier_function('href', absolute)
    parser.add_rule(extras)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player.is_result():
        streams = UrlList()
        for frame in player.get_result():
            print(frame)
            try:
                response = load(URL(frame['src']))
                setup = self.quotes(response.text,
                                    "jwplayer('player').setup(",
                                    ")").replace(' ', '')
                stream = self.quotes(setup, "file:'", "'")
                streams.add("default", URL(stream + '*'))
            except LoaderError as err:
                print(err)
        result.set_video(streams.get_media_data())

        for link in extras.get_result(['data', 'href']):
            control = ControlInfo(link['data'].strip(), URL(link['href']))
            result.add_control(control)
        return result

    # Not a video page: nothing more to add unless thumbnails were found.
    if not thumbs.is_result():
        return result

    for entry in thumbs.get_result(['href']):
        thumb = ThumbInfo(thumb_url=URL(entry['src']),
                          href=URL(entry['href']),
                          popup=entry.get('alt', ''))
        result.add_thumb(thumb)

    for entry in pages.get_result(['href', 'data']):
        result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

    for entry in tag_cloud.get_result(['href', 'data']):
        result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page *fname* and return a ``ParseResult``.

    Rules are registered for thumbnails, category boxes, pagination, tags,
    video sources and video-header links. After parsing, the first matching
    page shape is reported: video page, thumbnail listing, then categories
    page.

    :param fname: path handed to ``self.proceed_parcing``.
    :param base_url: passed to ``self.get_href`` for every link; also
        checked for ``/categories/`` to decide caption visibility.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('section', '', '')])
    startpage_rule.add_activate_rule_level([('div', 'class', 'videos cf')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'data-src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    startpage_rule.set_attribute_modifier_function(
        'data-src', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    categories_rule = ParserRule()
    categories_rule.add_activate_rule_level([('div', 'class', 'catbox')])
    categories_rule.add_process_rule_level('a', {'href'})
    categories_rule.add_process_rule_level('img', {'data-src', 'alt'})
    categories_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    categories_rule.set_attribute_modifier_function(
        'data-src', lambda x: self.get_href(x, base_url))
    parser.add_rule(categories_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                   'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_pages_rule)

    startpage_tags_rule = ParserRule()
    startpage_tags_rule.add_activate_rule_level([('ul', 'class', 'tags cf')])
    startpage_tags_rule.add_process_rule_level('a', {'href'})
    startpage_tags_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_tags_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('video', 'id', 'video_id')])
    video_rule.add_process_rule_level('source', {'src', 'res'})
    video_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url))
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                'video_header')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    # Video page: one stream per <source>, keyed by its 'res' attribute.
    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            urls.add(item['res'], URL(item['src']))
        result.set_video(urls.get_media_data(-1))

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    # Thumbnail listing.
    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            # A comma is required here: 'alt' '' is implicit literal
            # concatenation, i.e. get('alt'), which yields None rather
            # than '' when the image has no alt text.
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['data-src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

        if base_url.contain('/categories/'):
            result.set_caption_visible(True)
        return result

    # Categories page: one thumbnail per distinct href.
    if categories_rule.is_result():
        urls = []
        for item in categories_rule.get_result(['href']):
            if item['href'] in urls:
                continue
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['data-src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))
            urls.append(item['href'])
        result.set_caption_visible(True)

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page *fname* and return a ``ParseResult``.

    Rules are registered for video listings, pagination, category links,
    pornstar listings and their links, channel listings and their filters,
    the player script and the video-details links. After parsing, the first
    matching page shape is reported, in this order: video page, pornstars
    page, channels page, plain thumbnail listing.

    :param fname: path handed to ``self.proceed_parcing``.
    :param base_url: passed to ``self.get_href`` for every link.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule(debug=False)
    startpage_rule.add_activate_rule_level([
        ('ul', 'class', 'video-listing'),
        ('ul', 'class', 'video-listing two-in-row'),
        ('ul', 'class', 'video-listing four-in-row'),
        ('ul', 'class', 'video-listing two-in-row id-recommended-list'),
        ('ul', 'class', 'video-listing four-in-row id-recommended-list'),
    ])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'data-src', 'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    startpage_rule.set_attribute_modifier_function(
        'data-src', lambda x: self.get_href(x, base_url))
    startpage_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'pageNumbersHolder')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([
        ('ul', 'class', 'categories-listing'),
        ('ul', 'class', 'categories-popular-listing'),
        ('ul', 'class', 'abc-categories newAbcCategories'),
    ])
    startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_hrefs_rule)

    pornstars_rule = ParserRule()
    pornstars_rule.add_activate_rule_level([('div', 'id', 'all_pornstars')])
    pornstars_rule.add_process_rule_level('a', {'href'})
    pornstars_rule.add_process_rule_level('img', {'src', 'alt'})
    pornstars_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(pornstars_rule)

    pornstars_hrefs_rule = ParserRule()
    pornstars_hrefs_rule.add_activate_rule_level(
        [('ul', 'class', 'abc-categories newAbcCategories')])
    pornstars_hrefs_rule.add_process_rule_level('a', {'href'})
    pornstars_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(pornstars_hrefs_rule)

    channels_rule = ParserRule(debug=False)
    channels_rule.add_activate_rule_level(
        [('ul', 'class', 'channels-list three-in-row')])
    channels_rule.add_process_rule_level('a', {'href'})
    channels_rule.add_process_rule_level('img', {'src', 'data-src', 'alt'})
    channels_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    channels_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url))
    channels_rule.set_attribute_modifier_function(
        'data-src', lambda x: self.get_href(x, base_url))
    parser.add_rule(channels_rule)

    channels_hrefs_rule = ParserRule()
    channels_hrefs_rule.add_activate_rule_level(
        [('div', 'class', 'channel-filters-categories')])
    channels_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
    channels_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(channels_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'watch')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'redtube_flv_player' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('div', 'class', 'video-details')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    gallery_href_rule.set_attribute_filter_function(
        'href', lambda x: x != '*')
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    # Video page: streams come from the 'sources:{...}' map in the script.
    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Drop spaces and backslashes so the quoted map can be split.
            script = item['data'].replace(' ', '').replace('\\', '')
            sources = self.quotes(script, 'sources:{"', '"},').split('","')
            for f in sources:
                label, _, raw_url = f.partition('":"')
                file = self.get_href(raw_url, base_url)
                urls.add(label, URL(file))
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            href = f['href']
            label = f['data']
            if '/redtube/' in href or '/tag/' in href:
                result.add_control(ControlInfo(label, URL(href)))
            elif '/pornstar/' in href:
                # Build the display name from the last path segment.
                ps_name = href.rstrip('*').rpartition('/')[2].replace(
                    '+', ' ').title()
                result.add_control(ControlInfo(ps_name, URL(href)))
            else:
                # adding user
                result.add_control(
                    ControlInfo("'" + label + "'",
                                URL(href.replace('*', '/videos*'))))
        return result

    # Pornstars page.
    if pornstars_rule.is_result():
        result.set_caption_visible(True)
        for item in pornstars_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        # Request every key read below, as the sibling calls do
        # (presumably the key list restricts results to items carrying
        # those keys -- TODO confirm against ParserRule.get_result).
        for item in pornstars_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        return result

    # Channels page.
    if channels_rule.is_result():
        result.set_caption_visible(True)
        for item in channels_rule.get_result(['href']):
            thumb_href = item.get('data-src', item.get('src'))
            descr = item.get('alt', '').title()
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb_href),
                          href=URL(item['href']),
                          popup=descr))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        # 'title' is read below, so it is requested alongside 'href'
        # (same assumption about the key list as noted above).
        for item in channels_hrefs_rule.get_result(['href', 'title']):
            result.add_control(ControlInfo(item['title'], URL(item['href'])))
        return result

    # Plain thumbnail listing.
    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            thumb_href = item.get('data-src', item.get('src'))
            descr = item.get('title', item.get('alt', ''))
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb_href),
                          href=URL(item['href']),
                          popup=descr))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href', 'title']):
            result.add_control(ControlInfo(item['title'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page *fname* and return a ``ParseResult``.

    Rules are registered for thumbnail lists, the pager, filter/search
    links, the ``streams:[`` script in ``<head>``, tag/category links and
    the uploader / actor links. If the stream script matched, the page is
    reported as a video page; otherwise as a thumbnail listing.

    :param fname: path handed to ``self.proceed_parcing``.
    :param base_url: supplies ``domain()`` for relative links and is passed
        to ``self.get_href``.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    def star_get_url(txt=''):
        # Return the text between the first '(' and the next ')'.
        return txt.partition('(')[2].partition(')')[0]

    startpage_rule = ParserRule(debug=False)
    startpage_rule.add_activate_rule_level([
        ('div', 'class', 'main l170'),
        ('div', 'class', 'main l200'),
        ('div', 'class', 'main'),
        ('div', 'class', 'profileRight'),
        ('div', 'class', 'main l200 r300'),
    ])
    startpage_rule.add_activate_rule_level([
        ('ul', 'class', 'listThumbs'),
        ('ul', 'class', 'listProfiles'),
        ('ul', 'class', 'listChannels'),
        ('ul', 'class', 'listGalleries'),
    ])
    startpage_rule.add_process_rule_level('a', {'href', 'class', 'style'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x + '*')
    startpage_rule.set_attribute_modifier_function('style', star_get_url)
    startpage_rule.set_attribute_filter_function(
        'href', lambda x: '/pictures/' not in x)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pager')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x + '*')
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([
        ('ul', 'class', 'sFilters initial'),
        ('ul', 'class', 'sFilters'),
        ('div', 'class', 'listSearches searchOption'),
        ('div', 'class', 'alpha'),
    ])
    startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    startpage_hrefs_rule.set_attribute_filter_function(
        'title', lambda x: 'Combine Category' not in x)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('head', '', '')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'streams:[' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([
        ('p', 'class', 'source tags'),
        ('p', 'class', 'source categories'),
    ])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(gallery_href_rule)

    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([('p', 'class', 'source')])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    gallery_user_rule.set_attribute_filter_function(
        'href', lambda x: '/profile/' in x)
    gallery_user_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x + '/videos', base_url))
    parser.add_rule(gallery_user_rule)

    gallery_actor_rule = ParserRule()
    gallery_actor_rule.add_activate_rule_level([('p', 'class', 'source')])
    gallery_actor_rule.add_process_rule_level('a', {'href'})
    gallery_actor_rule.set_attribute_filter_function(
        'href', lambda x: '/pornstars/' in x)
    gallery_actor_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x + '/videos', base_url))
    parser.add_rule(gallery_actor_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    # Video page: each '{...}' entry of 'streams:[...]' is one stream.
    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            script = item['data'].replace(' ', '')
            sources = self.quotes(script, 'streams:[{', '}]').split('},{')
            for f in sources:
                label = self.quotes(f, 'id:"', '"')
                file = self.quotes(f, 'url:"', '"')
                urls.add(label, URL(file + '*'))
        result.set_video(urls.get_media_data(-1))

        # Request every key read below, as the gallery_href_rule call
        # does (presumably the key list restricts results to items
        # carrying those keys -- TODO confirm against get_result).
        for f in gallery_user_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo('"' + f['data'] + '"', URL(f['href'])))

        for f in gallery_actor_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    # Thumbnail listing.
    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href', 'src']):
            caption = ''
            href = item['href']
            if '/channels/' in href or '/pornstars/' in href:
                result.set_caption_visible(True)
                # Fall back to a name built from the last path segment
                # when the image carries no alt text.
                caption = item.get(
                    'alt',
                    href.rpartition('/')[2].strip('*').replace(
                        '-', ' ').title())
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=caption))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href']):
            result.add_control(
                ControlInfo(item.get('title', item.get('data', '')),
                            URL(item['href'])))

    return result