def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page and describe it as a ``ParseResult``.

    Five ``ParserRule`` objects are registered on one ``SiteParser`` and the
    file is processed by ``self.proceed_parcing``.  When the video rule
    matched, every matched ``iframe`` address is downloaded with ``load`` and
    scanned for ``.mp4`` sources; otherwise, when the thumbnail rule matched,
    thumbnails, pager links and menu links are collected.

    :param fname: name of the saved page, handed to ``self.proceed_parcing``.
    :param base_url: page address, handed to ``self.get_href`` together with
        every collected ``href``/``src`` value.
    :return: the populated ``ParseResult`` (untouched when no rule matched).
    """

    def absolute(link):
        # One shared attribute modifier for every rule below.
        return self.get_href(link, base_url)

    parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'thumb')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('src', absolute)
    parser.add_rule(thumbs)

    pager = ParserRule()
    pager.add_activate_rule_level([('div', 'class', 'loop-nav-inner')])
    pager.add_process_rule_level('a', {'href'})
    pager.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pager)

    menu = ParserRule()
    menu.add_activate_rule_level([('ul', 'class', 'menu')])
    menu.add_process_rule_level('a', {'href'})
    menu.set_attribute_modifier_function('href', absolute)
    parser.add_rule(menu)

    player = ParserRule()
    player.add_activate_rule_level([('div', 'class', 'section-content'),
                                    ('div', 'id', 'video')])
    player.add_process_rule_level('iframe', {'src'})
    player.set_attribute_modifier_function('src', absolute)
    parser.add_rule(player)

    extras = ParserRule()
    extras.add_activate_rule_level([('div', 'id', 'extras')])
    extras.add_process_rule_level('a', {'href'})
    extras.set_attribute_modifier_function('href', absolute)
    parser.add_rule(extras)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player.is_result():
        urls = UrlList()
        for frame in player.get_result():
            print(frame)
            src = frame['src']
            is_embed = '.video/embed' in src
            if not is_embed and 'javfinder.com/' not in src:
                # Unknown player host: nothing to extract from this iframe.
                continue
            try:
                page = load(URL(src))
                if is_embed:
                    # The sources sit inside the jwplayer setup(...) call.
                    setup = self.quotes(
                        page.text, 'jwplayer("vplayer").setup(', ")").replace(' ', '')
                    entries = self.quotes(setup, 'sources:[{', '}],').split('},{')
                    for entry in entries:
                        if '.mp4' not in entry:
                            continue
                        file = self.quotes(entry, 'file:"', '"')
                        label = self.quotes(entry, 'label:"', '"')
                        urls.add(label, URL(file + '*'))
                else:
                    # The sources are plain <source src="..." res="..."> tags.
                    for piece in page.text.split('<source src="')[1:]:
                        tag = piece.partition('>')[0]
                        if '.mp4' not in tag:
                            continue
                        file = tag.partition('"')[0]
                        label = self.quotes(tag, 'res="', '"')
                        urls.add(label, URL(file + '*'))
            except LoaderError as err:
                print(err)

        result.set_video(urls.get_media_data())

        for link in extras.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if thumbs.is_result():
        for thumb in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page_link in pager.get_result(['href', 'data']):
            result.add_page(ControlInfo(page_link['data'], URL(page_link['href'])))

        for tag_link in menu.get_result(['href', 'data']):
            result.add_control(ControlInfo(tag_link['data'], URL(tag_link['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` of type 'video' or 'hrefs'.

    Six ``ParserRule`` objects are registered and the file is processed by
    ``self.proceed_parcing``.  If a script containing ``flashvars`` was found
    inside the player container, its ``"quality_<label>":"<url>"`` entries
    become the video alternatives; otherwise the thumbnail listing, pager and
    category links are collected.

    :param fname: name of the saved page, handed to ``self.proceed_parcing``.
    :param base_url: accepted for interface compatibility; not read here.
    :return: the populated ``ParseResult``; it is returned untouched when the
        script yields no ``http://`` source or when no rule matched.
    """
    parser = SiteParser()

    thumbs = ParserRule(debug=False)
    thumbs.add_activate_rule_level([('div', 'class', 'video_box'),
                                    ('div', 'class', 'box-thumbnail')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(thumbs)

    pager = ParserRule()
    pager.add_activate_rule_level([('ul', 'id', 'pagination')])
    pager.add_process_rule_level('a', {'href'})
    parser.add_rule(pager)

    categories = ParserRule()
    categories.add_activate_rule_level([('div', 'id', 'videos_categories')])
    categories.add_process_rule_level('a', {'href'})
    parser.add_rule(categories)

    player = ParserRule()
    player.add_activate_rule_level([('div', 'id', 'playerContainer')])
    player.add_process_rule_level('script', {})
    player.set_attribute_filter_function('data', lambda text: 'flashvars' in text)
    parser.add_rule(player)

    tags = ParserRule()
    tags.add_activate_rule_level([('li', 'class', 'tag-list'),
                                  ('li', 'class', 'video-category')])
    tags.add_process_rule_level('a', {'href'})
    parser.add_rule(tags)

    uploader = ParserRule()
    uploader.add_activate_rule_level([('span', 'id', 'videoUsername')])
    uploader.add_process_rule_level('a', {'href'})
    uploader.set_attribute_modifier_function(
        'href', lambda x: x.replace('/user/', '/user-videos/'))
    parser.add_rule(uploader)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    scripts = player.get_result()
    if len(scripts) > 0:
        # Drop spaces and backslashes so the markers below match literally.
        script = scripts[0]['data'].replace(' ', '').replace('\\', '')
        remaining = script.partition('flashvars={')[2].partition('};')[0]

        urls = []
        while '"quality_' in remaining:
            remaining = remaining.partition('"quality_')[2]
            label, _, tail = remaining.partition('":"')
            file = tail.partition('",')[0]
            if file.startswith('http://'):
                urls.append(dict(text=label, url=URL(file + '*')))

        if not urls:
            return result

        # The last entry found is the default; with several entries all of
        # them are also registered as alternates.
        video = MediaData(urls[-1]['url'])
        if len(urls) > 1:
            for alternate in urls:
                video.add_alternate(alternate)

        result.set_type('video')
        result.set_video(video)

        for link in uploader.get_result(['data', 'href']):
            username = '******' + link['href'].split('/')[-2] + '"'
            result.add_control(ControlInfo(username, URL(link['href'])))

        for link in tags.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if len(thumbs.get_result()) > 0:
        result.set_type('hrefs')

        for thumb in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page_link in pager.get_result(['href', 'data']):
            result.add_page(ControlInfo(page_link['data'], URL(page_link['href'])))

        for category in categories.get_result(['href', 'data']):
            result.add_control(ControlInfo(category['data'], URL(category['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a 'video', 'pictures' or 'hrefs' ``ParseResult``.

    Six ``ParserRule`` objects are registered and the file is processed by
    ``self.proceed_parcing``.  The first rule that produced results decides
    the page type, checked in this order: video player, picture gallery,
    thumbnail listing.

    :param fname: name of the saved page, handed to ``self.proceed_parcing``.
    :param base_url: page address, handed to ``self.get_href`` together with
        the pager, menu and info-box ``href`` values.
    :return: the populated ``ParseResult`` (untouched when no rule matched).
    """
    parser = SiteParser()

    # NOTE(review): debug=True is kept from the original; confirm whether the
    # rule's debug mode is still wanted here.
    startpage_rule = ParserRule(debug=True)
    startpage_rule.add_activate_rule_level([('div', 'class', 'boxC videoList clearfix'),
                                            ('div', 'class', 'gallery')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pager')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('div', 'id', 'menuLeft')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    startpage_hrefs_rule.set_attribute_filter_function(
        'href', lambda text: '/channels/' in text or '/photos/niches/' in text)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'player')])
    video_rule.add_process_rule_level('video', {'file'})
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'videoInfoBox'),
                                               ('div', 'id', 'galleryInfoBox')])
    gallery_href_rule.add_activate_rule_level([('td', 'class', 'btnList')])
    gallery_href_rule.add_process_rule_level('a', {'href', 'title'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(gallery_href_rule)

    picture_rule = ParserRule()  # gallery rule
    picture_rule.add_activate_rule_level([('div', 'class', 'gallery iItem ')])
    picture_rule.add_activate_rule_level([('div', 'class', 'img vam')])
    picture_rule.add_process_rule_level('img', {'src'})
    # Thumbnails carry '_160' in their address; the full picture uses '_1000'.
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('_160', '_1000'))
    parser.add_rule(picture_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if len(video_rule.get_result()) > 0:
        result.set_video(
            MediaData(URL(video_rule.get_result()[0]['file'] + '*')))
        result.set_type('video')
        for f in gallery_href_rule.get_result():
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        # Pictures are saved under sequential names: 001.jpg, 002.jpg, ...
        for number, f in enumerate(picture_rule.get_result(), start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name='%03d.jpg' % number))
        for f in gallery_href_rule.get_result():
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result():
            # An <img> without an alt attribute must not abort the whole
            # listing with a KeyError; fall back to an empty popup text.
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` (video page or listing).

    Six ``ParserRule`` objects are registered and the file is processed by
    ``self.proceed_parcing``.  If a script containing ``flashvars`` was found
    in the player block, its ``video_url:'...'`` value becomes the video and
    member/tag/category links become controls; otherwise, if thumbnails were
    found, thumbnails, pager links and sub-menu links are collected.

    :param fname: name of the saved page, handed to ``self.proceed_parcing``.
    :param base_url: page address, handed to ``self.get_href`` together with
        collected ``href`` values (the member rule keeps its raw ``href``).
    :return: the populated ``ParseResult`` (untouched when no rule matched).
    """

    def absolute(link):
        # Shared ``href`` modifier for the rules below.
        return self.get_href(link, base_url)

    parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'image ')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs)

    pager = ParserRule()
    pager.add_activate_rule_level([('div', 'class', 'pagination')])
    pager.add_process_rule_level('a', {'href'})
    pager.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pager)

    submenu = ParserRule()
    submenu.add_activate_rule_level([('div', 'class', 'sub_menu dark-menu'),
                                     ('div', 'class', 'sub-menu dark-menu')])
    submenu.add_process_rule_level('a', {'href', 'title'})
    submenu.set_attribute_modifier_function('href', absolute)
    parser.add_rule(submenu)

    player = ParserRule()
    player.add_activate_rule_level([('div', 'class', 'player')])
    player.add_process_rule_level('script', {})
    player.set_attribute_filter_function('data', lambda text: 'flashvars' in text)
    parser.add_rule(player)

    tags = ParserRule()
    tags.add_activate_rule_level([('div', 'class', 'block_content')])
    tags.add_process_rule_level('a', {'href'})
    tags.set_attribute_filter_function(
        'href', lambda x: '/tags/' in x or '/categories/' in x)
    tags.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags)

    member = ParserRule()
    member.add_activate_rule_level([('div', 'class', 'block_content')])
    member.add_process_rule_level('a', {'href'})
    member.set_attribute_filter_function('href', lambda x: '/members/' in x)
    parser.add_rule(member)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player.is_result():
        urls = UrlList()
        for script in player.get_result():
            compact = script['data'].replace(' ', '')
            file = self.quotes(compact, "video_url:'", "'")
            urls.add('default', URL(file))
        result.set_video(urls.get_media_data())

        if member.is_result():
            first_member = member.get_result()[0]
            username = first_member.get('data', '***')
            # The member id is the last path component of the profile link.
            user = first_member['href'].rstrip('/').rpartition('/')[2]
            member_videos = URL('http://gobdsm.com/members/' + user + '/public_videos/')
            result.add_control(ControlInfo('"' + username + '"', member_videos))

        for link in tags.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if thumbs.is_result():
        for thumb in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page_link in pager.get_result(['href', 'data']):
            result.add_page(ControlInfo(page_link['data'], URL(page_link['href'])))

        for entry in submenu.get_result(['href', 'data']):
            # Prefer the title attribute; fall back to the link text.
            caption = entry.get('title', entry.get('data', ''))
            result.add_control(ControlInfo(caption, URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` of type 'video' or 'hrefs'.

    Five ``ParserRule`` objects are registered and the file is processed by
    ``self.proceed_parcing``.  If a script containing ``videoVars`` was found,
    its ``"quality_<label>":"<url>"`` entries become the video alternatives
    (an entry whose label contains ``720p`` is preferred as default);
    otherwise the thumbnail listing, pager and category links are collected.

    :param fname: name of the saved page, handed to ``self.proceed_parcing``.
    :param base_url: page address, handed to ``self.get_href`` together with
        every collected ``href`` value.
    :return: the populated ``ParseResult``; it is returned untouched when the
        script holds no ``"quality_`` entry or when no rule matched.
    """

    def starred_href(link):
        # Shared ``href`` modifier: resolved link with a trailing '*'.
        return self.get_href(link, base_url) + '*'

    def with_scheme(src):
        # Only protocol-relative addresses ('//host/path') lack a scheme.
        # Replacing every '//' would turn 'https://host/x' into
        # 'https:https://host/x' and corrupt '//' inside the path.
        return 'https:' + src if src.startswith('//') else src

    parser = SiteParser()

    startpage_rule = ParserRule(debug=False)
    startpage_rule.add_activate_rule_level([('ul', 'class', 'responsiveListing')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'data-original'})
    startpage_rule.set_attribute_modifier_function('href', starred_href)
    startpage_rule.set_attribute_modifier_function('data-original', with_scheme)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('section', 'class', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', starred_href)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'categoryList')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', starred_href)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('body', '', '')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function('data', lambda text: 'videoVars' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'videoInfoTop')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', starred_href)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if len(video_rule.get_result()) > 0:
        script = video_rule.get_result()[0]['data'].replace(' ', '')

        urls = []
        while '"quality_' in script:
            nxt = script.partition('"quality_')[2]
            label, _, tail = nxt.partition('":"')
            # The address is percent-encoded in the script; only these four
            # escapes are decoded.
            file = (tail.partition('",')[0]
                    .replace('%2F', '/')
                    .replace('%3F', '?')
                    .replace('%26', '&')
                    .replace('%3D', '='))
            urls.append(dict(text=label, url=URL('https:' + file + '*')))
            script = nxt

        if len(urls) == 1:
            video = MediaData(urls[0]['url'])
        elif len(urls) > 1:
            # Default to the last entry unless a 720p entry exists
            # (the last 720p match wins).
            default = urls[-1]['url']
            for candidate in urls:
                if '720p' in candidate['text']:
                    default = candidate['url']
            video = MediaData(default)
            for item in urls:
                video.add_alternate(item)
        else:
            return result

        result.set_type('video')
        result.set_video(video)

        # The same link may be matched more than once; add each href once.
        links = set()
        for f in gallery_href_rule.get_result(['data', 'href']):
            if f['href'] in links:
                continue
            label = f['data'].replace('\t', '')
            if label == '':
                label = f['href'].rpartition('/')[2]
            result.add_control(ControlInfo(label, URL(f['href'])))
            links.add(f['href'])
        return result

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['data-original']),
                          href=URL(item['href']),
                          popup=item.get('title', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        if len(startpage_hrefs_rule.get_result(['href'])) > 0:
            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                href = item['href']
                # Category caption: last path component without the '*'.
                txt = href.rstrip('*').rpartition('/')[2]
                result.add_control(ControlInfo(txt, URL(href)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a 'video', 'pictures' or 'hrefs' ``ParseResult``.

    Eight ``ParserRule`` objects are registered on one ``SiteParser``; the
    file is read line by line, fed to the parser, and also kept as one string
    with all spaces removed.  A video page is recognised by the literal
    ``urls.push({`` in that string, a picture gallery by results of the
    gallery rule, and a listing by results of the thumbnail rule.

    :param fname: path of the saved page; opened as UTF-8 with undecodable
        bytes ignored.
    :param base_url: page address, handed to ``self.get_href`` together with
        collected ``href`` values.
    :return: the populated ``ParseResult`` (untouched when nothing matched).
    """

    def absolute(link):
        # Shared ``href`` modifier for the rules below.
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'clearfix'),
                                            ('div', 'class', 'row clearfix video-container')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'btn-group clearfix full-width pagination-block')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    startpage_categories_rule = ParserRule()
    startpage_categories_rule.add_activate_rule_level(
        [('ul', 'class', 'main-nav unstyled-list subCategories')])
    startpage_categories_rule.add_process_rule_level('a', {'href'})
    startpage_categories_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_categories_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('div', 'class', 'cat-menu hidden-xs')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_filter_function('href', lambda x: '/free_porn/' in x)
    startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_hrefs_rule)

    # NOTE(review): this rule is registered but its results are never read
    # below; the video page is detected from the raw text instead.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('body', '', '')])
    video_rule.add_process_rule_level('script', {''})
    video_rule.set_attribute_filter_function('data', lambda text: 'var urls' in text)
    parser.add_rule(video_rule)

    gallery_rule = ParserRule()
    gallery_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
    gallery_rule.add_process_rule_level('a', {})
    gallery_rule.add_process_rule_level('img', {'src'})
    # The full-size picture address is the thumbnail address without '/thumbs'.
    gallery_rule.set_attribute_modifier_function(
        'src', lambda txt: txt.replace('/thumbs/', '/'))
    parser.add_rule(gallery_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('div', 'class', 'video-player-list tag-list-block')])
    gallery_href_rule.add_process_rule_level('a', {'href', 'title'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([('div', 'class', 'video-player-info row')])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(gallery_user_rule)

    # Feed the parser and keep a space-free copy of the page for the raw-text
    # video lookup.  The context manager closes the file even if feed() fails.
    compact_lines = []
    with open(fname, encoding='utf-8', errors='ignore') as page:
        for line in page:
            parser.feed(line)
            compact_lines.append(line.replace(' ', ''))
    page_text = ''.join(compact_lines)

    result = ParseResult()

    if 'urls.push({' in page_text:
        video_url = (page_text.partition('urls.push({')[2]
                     .partition('"});')[0]
                     .partition('file:"')[2])
        video = MediaData(URL(video_url + '*'))
        result.set_type('video')
        result.set_video(video)

        if gallery_user_rule.is_result():
            uploader = gallery_user_rule.get_result()[0]
            user_name = uploader['data'].strip()
            # The numeric user id follows the last '-' of the profile link.
            user_number = uploader['href'].rpartition('-')[2].rstrip('/')
            result.add_control(
                ControlInfo('"' + user_name + '"',
                            URL('http://shockingmovies.com/uploads-by-user/' + user_number + '/')))

        for f in gallery_href_rule.get_result(['href']):
            label = f['data'].strip().strip(',')
            if label == '':
                label = f['title']
            result.add_control(ControlInfo(label, URL(f['href'])))
        return result

    if gallery_rule.is_result():
        result.set_type('pictures')
        url = URL(gallery_rule.get_result()[0]['src'] + '*')
        base_dir = url.get_path(base=Setting.base_dir)
        result.set_gallery_path(base_dir)

        for f in gallery_rule.get_result():
            picture = FullPictureInfo(abs_href=URL(f['src'] + '*'),
                                      rel_name=f['src'].rpartition('/')[2])
            picture.set_base(base_dir)
            result.add_full(picture)

        for f in gallery_href_rule.get_result(['href']):
            label = f['data'].strip()
            if label == '':
                label = f['title']
            if '/user/' in f['href']:
                # A user link yields two controls: the user's videos and the
                # user's photo galleries.
                split = f['href'].rpartition('-')
                base = split[0].partition('/user/')[0]
                uploads = base + '/uploads-by-user/' + split[2]
                result.add_control(ControlInfo(label + ' videos', URL(uploads)))
                result.add_control(ControlInfo(label + ' gals', URL(uploads + '?photos=1')))
            else:
                result.add_control(ControlInfo(label, URL(f['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            href = item['href']
            # The page caption is taken from the link: '.../page<N>.<ext>'.
            page_number = href.rpartition('/page')[2].rpartition('.')[0]
            result.add_page(ControlInfo(page_number, URL(href)))

        if len(startpage_categories_rule.get_result(['href'])) > 0:
            for item in startpage_categories_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item.get('data', ''), URL(item['href'])))

        if len(startpage_hrefs_rule.get_result(['href'])) > 0:
            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item.get('data', ''), URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` of type 'pictures' or 'hrefs'.

    Six ``ParserRule`` objects are registered on one ``SiteParser`` and the
    file is fed to it line by line.  If an ``<a class="fancybox">`` with an
    image was found the page is a picture gallery; otherwise, if the body
    container produced results, it is a thumbnail listing.

    :param fname: path of the saved page.
    :param base_url: page address; only its network location is used, to
        build the ``http://<host>`` prefix for relative links.
    :return: the populated ``ParseResult`` (untouched when nothing matched).
    """
    site_url = 'http://' + urlparse(base_url.get())[1].strip('/')
    print('site url=', site_url)

    def on_site(link):
        # Shared ``href`` modifier: prefix the link with the site address.
        return site_url + link

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt', 'class'})
    # NOTE(review): get_href is a free function defined outside this block.
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: site_url + get_href(x))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('td', 'align', 'right')])
    startpage_pages_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_pages_rule.set_attribute_modifier_function('href', on_site)
    parser.add_rule(startpage_pages_rule)

    site_rule = ParserRule()
    site_rule.add_activate_rule_level([('div', 'class', 'headerlinetext')])
    site_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(site_rule)

    # Only used as a marker: any result means the page is a picture gallery.
    picture_trigger_rule = ParserRule()
    picture_trigger_rule.add_activate_rule_level([('a', 'class', 'fancybox')])
    picture_trigger_rule.add_process_rule_level('img', {'src'})
    parser.add_rule(picture_trigger_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')])
    picture_rule.add_process_rule_level('a', {'href', 'class'})
    picture_rule.add_process_rule_level('img', {'alt'})
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')])
    picture_href_rule.add_activate_rule_level([('h2', 'style', 'font-size:18px')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    picture_href_rule.set_attribute_modifier_function('href', on_site)
    parser.add_rule(picture_href_rule)

    # The context manager closes the file even when feed() raises.
    # NOTE(review): the file is decoded with the platform default encoding.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(picture_trigger_rule.get_result()) > 0:
        result.set_type('pictures')
        # Only fancybox links are pictures; they are numbered consecutively.
        number = 1
        for f in picture_rule.get_result(['href', 'alt', 'class']):
            if f['class'] == 'fancybox':
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name='%03d.jpg' % number))
                number += 1

        for item in picture_href_rule.get_result(['href', 'data']):
            result.add_control(
                ControlInfo(text=item['data'], url=URL(item['href'])))
        return result

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href', 'src', 'class']):
            if item['class'] == 'thumb':
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

        for item in site_rule.get_result(['href', 'data']):
            result.add_site(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_pages_rule.get_result(['href', 'data', 'title']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` (scene page or listing).

    Five ``ParserRule`` objects are registered and the file is processed by
    ``self.proceed_parcing``.  If the scene rule matched, every scene becomes
    a numbered control and the scene whose link satisfies
    ``base_url.contain`` supplies the video source; otherwise, if thumbnails
    were found, thumbnails, pager links and menu links are collected.

    :param fname: name of the saved page, handed to ``self.proceed_parcing``.
    :param base_url: page address; handed to ``self.get_href`` and used to
        pick the scene that belongs to this page.
    :return: the populated ``ParseResult`` (untouched when no rule matched).
    """

    def absolute(link):
        # Shared ``href`` modifier for the rules below.
        return self.get_href(link, base_url)

    parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level(
        [('div', 'class', 'fancy-thumbnails-container'),
         ('div', 'class', 'fancy-thumbnails-container inner-content'),
         ('div', 'class', 'dvd-cover-inner')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src'})
    thumbs.set_attribute_filter_function('src', lambda x: '.jpg' in x)
    thumbs.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs)

    pager = ParserRule()
    pager.add_activate_rule_level([('ul', 'class', 'pagination')])
    pager.add_process_rule_level('a', {'href'})
    pager.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pager)

    menu = ParserRule()
    menu.add_activate_rule_level([('ul', 'class', 'dropdown-menu columns')])
    menu.add_process_rule_level('a', {'href'})
    menu.set_attribute_modifier_function('href', absolute)
    parser.add_rule(menu)

    scenes = ParserRule()
    scenes.add_activate_rule_level([('div', 'class', 'scene')])
    scenes.add_process_rule_level('a', {'href'})
    scenes.add_process_rule_level('video', {'data-src'})
    parser.add_rule(scenes)

    info = ParserRule()
    info.add_activate_rule_level([('ul', 'class', 'info')])
    info.add_process_rule_level('a', {'href'})
    info.set_attribute_filter_function('href', lambda x: '#' not in x)
    info.set_attribute_modifier_function('href', absolute)
    parser.add_rule(info)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if scenes.is_result():
        # Stays empty when no scene link matches the current page.
        source = ''
        for number, scene in enumerate(scenes.get_result(), start=1):
            print(scene)
            caption = 'Scene {0}'.format(number)
            if base_url.contain(scene['href']):
                source = scene['data-src']
                caption += '(this)'
            target = URL(self.get_href(scene['href'], base_url))
            result.add_control(ControlInfo(caption, target))

        result.set_video(MediaData(URL(source)))

        for link in info.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if thumbs.is_result():
        for thumb in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page_link in pager.get_result(['href', 'data']):
            result.add_page(ControlInfo(page_link['data'], URL(page_link['href'])))

        for entry in menu.get_result(['href', 'data']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and return a populated ``ParseResult``.

    The page may be a listing (type ``'hrefs'``: thumbnails, sidebar links
    and pager links) and/or a gallery (type ``'pictures'``: full-size
    images numbered ``001.jpg``, ``002.jpg``, ...). Both checks run, so a
    page matching both ends up typed ``'pictures'``.

    :param fname: path of the downloaded HTML file to parse.
    :param base_url: URL the page was loaded from; its domain is prepended
        to site-relative links.
    :return: the ``ParseResult`` (left empty when no rule matched).
    """
    print(base_url.get(), base_url.domain())
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('div', 'class', 'ogpost'),
        ('div', 'class', 'post300'),
        ('div', 'class', 'galelement'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: get_href(x, base_url.domain()))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([
        ('span', 'class', 'pager'),
        ('div', 'class', 'pager'),
    ])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    startpage_href_rule = ParserRule()
    startpage_href_rule.add_activate_rule_level([('div', 'id', 'right')])
    startpage_href_rule.add_activate_rule_level([('div', 'class', 'rightbox')])
    startpage_href_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_href_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'galcontentpics')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: _del_thumb(text))
    parser.add_rule(picture_rule)

    # Context manager guarantees the file handle is closed even if
    # parser.feed() raises; the original bare open() leaked it.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href', 'src']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href'] + '*'),
                          popup=item.get('alt', '')))

        for item in startpage_href_rule.get_result(['href', 'data']):
            # Only site-relative links are kept; they get the domain prefix.
            if item['href'].startswith('/'):
                result.add_control(
                    ControlInfo(item['data'],
                                URL(base_url.domain() + item['href'] + '*')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'] + '*')))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        for index, picture in enumerate(picture_rule.get_result(['src']), 1):
            result.add_full(
                FullPictureInfo(abs_href=URL(picture['src']),
                                rel_name='%03d.jpg' % index))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and return a populated ``ParseResult``.

    Page shapes are checked in this order; the first match wins:

    * video page (type ``'video'``) -- the player script containing
      ``video_url:`` supplies the media URL.
    * photo album (type ``'pictures'``) -- every ``data-image`` link becomes
      a full picture stored under a gallery path derived from ``base_url``.
    * listing (type ``'hrefs'``) -- thumbnails, pagination and categories.

    Video and album pages also get uploader and tag controls.

    :param fname: path of the downloaded HTML file to parse.
    :param base_url: URL the page was loaded from; used to resolve hrefs.
    :return: the ``ParseResult`` (left empty when no rule matched).
    """

    def absolute(link):
        # Resolve a scraped href against the page URL.
        return self.get_href(link, base_url)

    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level([
        ('ul', 'class', 'thumbs-items'),
        ('ul', 'class', 'thumbs-albums'),
        ('ul', 'class', 'thumbs-categories'),
    ])
    thumbs_rule.add_process_rule_level('a', {'href'})
    thumbs_rule.add_process_rule_level('img', {'data-original', 'alt'})
    thumbs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs_rule)

    pager_rule = ParserRule()
    pager_rule.add_activate_rule_level([('div', 'class', 'pagination')])
    pager_rule.add_process_rule_level('a', {'href'})
    pager_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pager_rule)

    categories_rule = ParserRule()
    categories_rule.add_activate_rule_level([('div', 'class', 'list-categories')])
    categories_rule.add_process_rule_level('a', {'href', 'title'})
    categories_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(categories_rule)

    player_rule = ParserRule()
    player_rule.add_activate_rule_level([('div', 'class', 'player-holder')])
    player_rule.add_process_rule_level('script', {})
    player_rule.set_attribute_filter_function(
        'data', lambda text: 'video_url:' in text)
    parser.add_rule(player_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'specification')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    uploader_rule = ParserRule(collect_data=True)
    uploader_rule.add_activate_rule_level([('div', 'class', 'user-info')])
    uploader_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(uploader_rule)

    album_rule = ParserRule()
    album_rule.add_activate_rule_level([('div', 'class', 'ad-thumbs')])
    album_rule.add_process_rule_level('a', {'data-image'})
    album_rule.set_attribute_modifier_function('data-image', absolute)
    parser.add_rule(album_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    def append_uploader_and_tag_controls():
        # Shared tail of the video and album branches.
        if uploader_rule.is_result(['href']):
            for uploader in uploader_rule.get_result(['href']):
                username = uploader['title']
                if username == '':
                    continue
                result.add_control(
                    ControlInfo('"' + username + ' videos"',
                                URL(uploader['href'] + 'public_videos/')))
                result.add_control(
                    ControlInfo('"' + username + ' photos"',
                                URL(uploader['href'] + 'albums/')))

        for tag in tags_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))

    if player_rule.is_result():
        # Spaces are stripped so the "video_url:'" marker matches
        # regardless of how the script is formatted.
        script = player_rule.get_result()[0]['data'].replace(' ', '')
        after_marker = script.partition("video_url:'")[2]
        url = after_marker.partition("'")[0].rstrip('/')
        print(url)
        result.set_type('video')
        result.set_video(MediaData(URL(url)))
        append_uploader_and_tag_controls()
        return result

    if album_rule.is_result():
        result.set_type('pictures')
        root = base_url.get_path(base=Setting.base_dir)
        album_name = base_url.get().rpartition('/')[2]
        base_dir = root + album_name + '/'
        result.set_gallery_path(base_dir)
        for photo in album_rule.get_result():
            image = photo['data-image']
            picture = FullPictureInfo(
                abs_href=URL(image),
                rel_name=image.rpartition('/')[2].strip('*'))
            picture.set_base(base_dir)
            result.add_full(picture)
        append_uploader_and_tag_controls()
        return result

    if thumbs_rule.is_result():
        result.set_type('hrefs')
        for thumb in thumbs_rule.get_result(['href', 'data-original']):
            href = thumb['href']
            # Popup text is the second-to-last path segment of the href,
            # upper-cased with dashes turned into spaces.
            label = href.split('/')[-2].upper().replace('-', ' ')
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['data-original']),
                          href=URL(href),
                          popup=label))

        for page in pager_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        if len(categories_rule.get_result(['href'])) > 0:
            for category in categories_rule.get_result(['href', 'title']):
                caption = category.get('title', category.get('data', ''))
                result.add_control(ControlInfo(caption, URL(category['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and return a populated ``ParseResult``.

    Page shapes are checked in this order; the first match wins:

    * video page -- every ``<video><source src>`` becomes a ``'default'``
      stream; uploader and tag links become controls.
    * picture gallery (type ``'pictures'``) -- thumbnails under
      ``#galleryImages`` are rewritten to full-size URLs; ``/user/`` tag
      links are expanded into "videos" and "gals" controls.
    * listing -- thumbnails, pagination, categories and menu links.

    :param fname: path of the downloaded HTML file to parse.
    :param base_url: URL the page was loaded from; used to resolve hrefs.
    :return: the ``ParseResult`` (left empty when no rule matched).
    """

    def absolute(link):
        # Resolve a scraped href against the page URL.
        return self.get_href(link, base_url)

    parser = SiteParser()

    thumbs_rule = ParserRule()
    thumbs_rule.add_activate_rule_level([('div', 'class', 'video-item compact')])
    thumbs_rule.add_process_rule_level('a', {'href'})
    thumbs_rule.add_process_rule_level('img', {'src', 'alt'})
    thumbs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs_rule)

    pager_rule = ParserRule()
    pager_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    pager_rule.add_process_rule_level('a', {'href'})
    pager_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pager_rule)

    categories_rule = ParserRule()
    categories_rule.add_activate_rule_level([('nav', 'class', 'video-categories')])
    categories_rule.add_process_rule_level('a', {'href'})
    categories_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(categories_rule)

    menu_rule = ParserRule()
    menu_rule.add_activate_rule_level([('div', 'class', 'cat-menu hidden-xs')])
    menu_rule.add_process_rule_level('a', {'href'})
    menu_rule.set_attribute_filter_function('href', lambda x: '/free_porn/' in x)
    menu_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(menu_rule)

    stream_rule = ParserRule()
    stream_rule.add_activate_rule_level([('video', '', '')])
    stream_rule.add_process_rule_level('source', {'src'})
    stream_rule.set_attribute_modifier_function('src', lambda txt: txt + '*')
    parser.add_rule(stream_rule)

    images_rule = ParserRule()
    images_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
    images_rule.add_process_rule_level('a', {})
    images_rule.add_process_rule_level('img', {'src'})
    # Dropping the "/thumbs/" path segment turns a thumbnail URL into the
    # full-size image URL.
    images_rule.set_attribute_modifier_function(
        'src', lambda txt: txt.replace('/thumbs/', '/'))
    parser.add_rule(images_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'tags')])
    tags_rule.add_process_rule_level('a', {'href', 'title'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    uploader_rule = ParserRule()
    uploader_rule.add_activate_rule_level([('div', 'class', 'uploaded')])
    uploader_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(uploader_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if stream_rule.is_result():
        urls = UrlList()
        for stream in stream_rule.get_result():
            urls.add('default', URL(stream['src']))
        result.set_video(urls.get_media_data(-1))

        if uploader_rule.is_result():
            profile_href = uploader_rule.get_result()[0]['href']
            user = profile_href.rpartition('/')[2]
            result.add_control(
                ControlInfo('"' + user + '"',
                            URL('http://www.heavy-r.com/user/' + user + '?pro=videos*')))

        for tag in tags_rule.get_result(['href']):
            # Fall back to the title attribute when the link has no text.
            label = tag['data'].strip() or tag['title']
            result.add_control(ControlInfo(label, URL(tag['href'])))
        return result

    if images_rule.is_result():
        result.set_type('pictures')
        first_image = URL(images_rule.get_result()[0]['src'] + '*')
        base_dir = first_image.get_path(base=Setting.base_dir)
        result.set_gallery_path(base_dir)
        for image in images_rule.get_result():
            src = image['src']
            picture = FullPictureInfo(abs_href=URL(src + '*'),
                                      rel_name=src.rpartition('/')[2])
            picture.set_base(base_dir)
            result.add_full(picture)

        for tag in tags_rule.get_result(['href']):
            label = tag['data'].strip() or tag['title']
            href = tag['href']
            if '/user/' not in href:
                result.add_control(ControlInfo(label, URL(href)))
                continue
            # User links are rewritten to the uploads-by-user listings,
            # keyed by whatever follows the last '-' in the href.
            head, _, user_key = href.rpartition('-')
            site_root = head.partition('/user/')[0]
            uploads = site_root + '/uploads-by-user/' + user_key
            result.add_control(ControlInfo(label + ' videos', URL(uploads)))
            result.add_control(
                ControlInfo(label + ' gals', URL(uploads + '?photos=1')))
        return result

    if thumbs_rule.is_result():
        for thumb in thumbs_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page in pager_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        if len(categories_rule.get_result(['href'])) > 0:
            for category in categories_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(category.get('data', ''), URL(category['href'])))

        if len(menu_rule.get_result(['href'])) > 0:
            for entry in menu_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(entry.get('data', ''), URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and return a populated ``ParseResult``.

    Page shapes are checked in this order; the first match wins:

    * video page -- a ``<head>`` script containing ``streams:[`` is split
      into stream entries, each contributing an ``id``/``url`` pair;
      uploader, performer and tag links become controls.
    * listing -- thumbnails (captions are switched on as soon as a channel
      or pornstar link is seen), pagination and filter links.

    :param fname: path of the downloaded HTML file to parse.
    :param base_url: URL the page was loaded from; used to resolve hrefs.
    :return: the ``ParseResult`` (left empty when no rule matched).
    """

    def absolute(link):
        # Resolve a scraped href against the page URL.
        return self.get_href(link, base_url)

    def absolute_videos(link):
        # Profile / performer links point at their "/videos" sub-page.
        return self.get_href(link + '/videos', base_url)

    def domain_relative(link):
        return base_url.domain() + link + '*'

    def star_get_url(txt=''):
        # Text between the first '(' and the next ')' of a style attribute.
        return txt.partition('(')[2].partition(')')[0]

    parser = SiteParser()

    thumbs_rule = ParserRule(debug=False)
    thumbs_rule.add_activate_rule_level([
        ('div', 'class', 'main l170'),
        ('div', 'class', 'main l200'),
        ('div', 'class', 'main'),
        ('div', 'class', 'profileRight'),
        ('div', 'class', 'main l200 r300'),
    ])
    thumbs_rule.add_activate_rule_level([
        ('ul', 'class', 'listThumbs'),
        ('ul', 'class', 'listProfiles'),
        ('ul', 'class', 'listChannels'),
        ('ul', 'class', 'listGalleries'),
    ])
    thumbs_rule.add_process_rule_level('a', {'href', 'class', 'style'})
    thumbs_rule.add_process_rule_level('img', {'src', 'alt'})
    thumbs_rule.set_attribute_modifier_function('href', domain_relative)
    thumbs_rule.set_attribute_modifier_function('style', star_get_url)
    thumbs_rule.set_attribute_filter_function(
        'href', lambda x: '/pictures/' not in x)
    parser.add_rule(thumbs_rule)

    pager_rule = ParserRule()
    pager_rule.add_activate_rule_level([('div', 'class', 'pager')])
    pager_rule.add_process_rule_level('a', {'href'})
    pager_rule.set_attribute_modifier_function('href', domain_relative)
    parser.add_rule(pager_rule)

    filters_rule = ParserRule()
    filters_rule.add_activate_rule_level([
        ('ul', 'class', 'sFilters initial'),
        ('ul', 'class', 'sFilters'),
        ('div', 'class', 'listSearches searchOption'),
        ('div', 'class', 'alpha'),
    ])
    filters_rule.add_process_rule_level('a', {'href', 'title'})
    filters_rule.set_attribute_modifier_function('href', absolute)
    filters_rule.set_attribute_filter_function(
        'title', lambda x: 'Combine Category' not in x)
    parser.add_rule(filters_rule)

    streams_rule = ParserRule()
    streams_rule.add_activate_rule_level([('head', '', '')])
    streams_rule.add_process_rule_level('script', {})
    streams_rule.set_attribute_filter_function(
        'data', lambda text: 'streams:[' in text)
    parser.add_rule(streams_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([
        ('p', 'class', 'source tags'),
        ('p', 'class', 'source categories'),
    ])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    uploader_rule = ParserRule()
    uploader_rule.add_activate_rule_level([('p', 'class', 'source')])
    uploader_rule.add_process_rule_level('a', {'href'})
    uploader_rule.set_attribute_filter_function(
        'href', lambda x: '/profile/' in x)
    uploader_rule.set_attribute_modifier_function('href', absolute_videos)
    parser.add_rule(uploader_rule)

    performer_rule = ParserRule()
    performer_rule.add_activate_rule_level([('p', 'class', 'source')])
    performer_rule.add_process_rule_level('a', {'href'})
    performer_rule.set_attribute_filter_function(
        'href', lambda x: '/pornstars/' in x)
    performer_rule.set_attribute_modifier_function('href', absolute_videos)
    parser.add_rule(performer_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if streams_rule.is_result():
        urls = UrlList()
        for found in streams_rule.get_result():
            # Strip spaces so the 'streams:[{' / 'id:"' markers match
            # regardless of how the script is formatted.
            script = found['data'].replace(' ', '')
            entries = self.quotes(script, 'streams:[{', '}]').split('},{')
            for entry in entries:
                label = self.quotes(entry, 'id:"', '"')
                file = self.quotes(entry, 'url:"', '"')
                urls.add(label, URL(file + '*'))
        result.set_video(urls.get_media_data(-1))

        for uploader in uploader_rule.get_result(['href']):
            result.add_control(
                ControlInfo('"' + uploader['data'] + '"', URL(uploader['href'])))

        for performer in performer_rule.get_result(['href']):
            result.add_control(
                ControlInfo(performer['data'], URL(performer['href'])))

        for tag in tags_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))
        return result

    if thumbs_rule.is_result():
        for thumb in thumbs_rule.get_result(['href', 'src']):
            href = thumb['href']
            caption = ''
            if '/channels/' in href or '/pornstars/' in href:
                result.set_caption_visible(True)
                # Default caption: last path segment of the href with the
                # trailing '*' removed, dashes as spaces, title-cased.
                slug_caption = (href.rpartition('/')[2]
                                .strip('*').replace('-', ' ').title())
                caption = thumb.get('alt', slug_caption)
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=caption))

        for page in pager_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for entry in filters_rule.get_result(['href']):
            text = entry.get('title', entry.get('data', ''))
            result.add_control(ControlInfo(text, URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and return a populated ``ParseResult``.

    Page shapes are checked in this order; the first match wins:

    * video page (type ``'video'``) -- the jwplayer setup script supplies
      the media URL, either from its ``sources:`` list or from the
      ``filefallback`` entry. If neither marker is present the result is
      returned empty. The uploader control is added only when both the
      user-card link and the user-name span were found.
    * listing (type ``'hrefs'``) -- thumbnails, pagination, menu links.

    :param fname: path of the downloaded HTML file to parse.
    :param base_url: URL the page was loaded from; used to resolve hrefs.
    :return: the ``ParseResult`` (left empty when no rule matched).
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumb vidItem')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x + '*')
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'left-menu-box')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_filter_function(
        'href', lambda x: '/videos/' in x)
    startpage_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'block videoDetail vidItem')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'content-tags')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(gallery_href_rule)

    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([('div', 'class', 'user-card')])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    gallery_user_rule.add_process_rule_level('span', {'class'})
    gallery_user_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(gallery_user_rule)

    gallery_user_name_rule = ParserRule()
    gallery_user_name_rule.add_activate_rule_level([('div', 'class', 'user-data')])
    gallery_user_name_rule.add_process_rule_level('span', {'class'})
    parser.add_rule(gallery_user_name_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if len(video_rule.get_result()) > 0:
        script = video_rule.get_result()[0]['data'].replace('\t', '').replace('\n', '')
        if 'sources:' in script:
            sources = script.partition('sources:')[2].partition(']')[0]
            video_url = (sources.partition('file: "')[2].partition('",')[0]
                         .strip('"').replace(' ', '%20'))
        elif "filefallback':" in script:
            video_url = (script.replace(' ', '')
                         .partition("filefallback':\"")[2].partition('",')[0])
        else:
            # Player script without a recognisable source: nothing to play.
            return result

        result.set_type('video')
        result.set_video(MediaData(URL(video_url)))

        # The uploader block is optional on the page. The original indexed
        # [0] unconditionally and raised IndexError after the video had
        # already been extracted; skip the control when it is missing.
        users = gallery_user_rule.get_result(['href'])
        names = gallery_user_name_rule.get_result(['data'])
        if len(users) > 0 and len(names) > 0:
            result.add_control(
                ControlInfo('"' + names[0]['data'] + '"', URL(users[0]['href'])))

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href', 'src']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src'].replace(' ', '%20')),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and return a populated ``ParseResult``.

    Page shapes are checked in this order; the first match wins:

    * single video -- the ``<video src>`` inside ``div[itemprop=video]``.
    * multipart video (type ``'video'``) -- the part selected by the
      ``?s=N`` suffix of ``base_url`` (part 1 when absent) is resolved by
      POSTing its ``data-*`` attributes plus the ``usss[0]`` value to
      ``/php/get_vlink.php``; one ``S<i>`` control is added per part.
    * listing -- thumbnails from the combo and regular post rules,
      pagination and menu links.

    Both video shapes also get author and tag controls.

    :param fname: path of the downloaded HTML file to parse.
    :param base_url: URL the page was loaded from; used to resolve hrefs.
    :return: the ``ParseResult`` (left empty when no rule matched).
    """

    def absolute(link):
        # Resolve a scraped href against the page URL.
        return self.get_href(link, base_url)

    parser = SiteParser()

    posts_rule = ParserRule()
    posts_rule.add_activate_rule_level([('div', 'class', 'vid_container')])
    posts_rule.add_process_rule_level('img', {'src'})
    posts_rule.add_process_rule_level('a', {'href', 'title'})
    posts_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(posts_rule)

    combo_rule = ParserRule()
    combo_rule.add_activate_rule_level([('div', 'class', 'combo_post_wrap')])
    combo_rule.add_process_rule_level('a', {'href', 'title'})
    combo_rule.add_process_rule_level('img', {'src'})
    combo_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(combo_rule)

    pager_rule = ParserRule()
    pager_rule.add_activate_rule_level([('div', 'id', 'center_control')])
    pager_rule.add_process_rule_level('a', {'href'})
    pager_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pager_rule)

    menu_rule = ParserRule()
    menu_rule.add_activate_rule_level([('ul', 'class', 'dropdown-menu columns')])
    menu_rule.add_process_rule_level('a', {'href'})
    menu_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(menu_rule)

    single_rule = ParserRule()
    single_rule.add_activate_rule_level([('div', 'itemprop', 'video')])
    single_rule.add_process_rule_level('video', {'src'})
    parser.add_rule(single_rule)

    parts_rule = ParserRule()
    parts_rule.add_activate_rule_level([('div', 'id', 'videos_container')])
    parts_rule.add_process_rule_level(
        'div', {'data-source', 'data-hash', 'data-x', 'data-oid', 'data-pid'})
    parser.add_rule(parts_rule)

    usss_rule = ParserRule()
    usss_rule.add_activate_rule_level([('body', '', '')])
    usss_rule.add_process_rule_level('script', {})
    usss_rule.set_attribute_filter_function('data', lambda x: 'usss' in x)
    parser.add_rule(usss_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'popular_block_header_rl')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    author_rule = ParserRule()
    author_rule.add_activate_rule_level([('div', 'id', 'posts_container')])
    author_rule.add_activate_rule_level([('div', 'class', 'post_author_name')])
    author_rule.add_process_rule_level('a', {'href'})
    author_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(author_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    def append_author_and_tag_controls():
        # Shared tail of both video branches: quoted author names first,
        # then the plain tag links.
        for author in author_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo('"' + author['data'].strip() + '"', URL(author['href'])))
        for tag in tags_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(tag['data'].strip(), URL(tag['href'])))

    if single_rule.is_result():
        result.set_video(MediaData(URL(single_rule.get_result()[0]['src'])))
        append_author_and_tag_controls()
        return result

    if parts_rule.is_result():
        parts = parts_rule.get_result()
        total = len(parts)
        requested = base_url.get().partition('?s=')[2]
        serie = 1 if requested == '' else int(requested)

        usss_script = usss_rule.get_result()[0]['data'].replace(' ', '')
        uid = self.quotes(usss_script, 'usss[0]="', '"')
        chosen = parts[serie - 1]
        data = {
            'uid': uid,
            'source': chosen['data-source'],
            'hash': chosen['data-hash'],
            'x': chosen['data-x'],
            'oid': chosen['data-oid'],
            'pid': chosen['data-pid'],
        }
        # The response body of the POST is used verbatim as the media URL.
        request = URL(absolute('/php/get_vlink.php'), 'POST', post_data=data)
        response = load(request)
        result.set_type('video')
        result.set_video(MediaData(URL(response.text)))

        for index in range(1, total + 1):
            label = 'S{0}'.format(index)
            if index == serie:
                label += '(this)'
            part_url = base_url.get().partition('?')[0] + '?s={0}'.format(index)
            result.add_control(ControlInfo(label, URL(part_url + '*')))

        append_author_and_tag_controls()
        return result

    if posts_rule.is_result() or combo_rule.is_result():
        for post in combo_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(post['src']),
                          href=URL(post['href']),
                          popup=post.get('title', '')))

        for post in posts_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(post['src']),
                          href=URL(post['href']),
                          popup=post.get('title', '')))

        for page in pager_rule.get_result(['href', 'data']):
            href = page['href']
            data = page['data']
            # Page label is the href's file name without its extension;
            # the link text is passed to format() but not shown.
            n = href.rpartition('/')[2].partition('.')[0]
            result.add_page(ControlInfo('{1}'.format(data, n), URL(href)))

        for entry in menu_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and return a populated ``ParseResult``.

    The page may be a listing (type ``'hrefs'``: thumbnails, pager links
    and tag links) and/or a gallery (type ``'pictures'``: full-size images
    numbered ``001.jpg``, ``002.jpg``, ..., plus model and tag controls).
    Both checks run, so a page matching both ends up typed ``'pictures'``.

    :param fname: path of the downloaded HTML file (read as UTF-8).
    :param base_url: URL the page was loaded from (not used by this parser).
    :return: the ``ParseResult`` (left empty when no rule matched).
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'item photo-item')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([
        ('ul', 'class', 'justified-pagination'),
    ])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_pages_rule)

    startpage_tags_rule = ParserRule()
    startpage_tags_rule.add_activate_rule_level([('ul', 'class', 'tags')])
    startpage_tags_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_tags_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'photo-item')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    # Thumbnail URLs become full-size URLs by swapping 'thumb' for 'origin'.
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('thumb', 'origin'))
    parser.add_rule(picture_rule)

    picture_model_rule = ParserRule()
    picture_model_rule.add_activate_rule_level([('div', 'class', 'block attached-model')])
    picture_model_rule.add_process_rule_level('a', {'href'})
    picture_model_rule.add_process_rule_level('img', {'alt'})
    parser.add_rule(picture_model_rule)

    picture_tags_rule = ParserRule()
    picture_tags_rule.add_activate_rule_level([('div', 'class', 'block gallery-tags')])
    picture_tags_rule.add_activate_rule_level([('ul', 'class', 'tags')])
    picture_tags_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(picture_tags_rule)

    # Context manager guarantees the file handle is closed even if
    # parser.feed() raises; the original bare open() leaked it.
    with open(fname, encoding='utf-8') as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        for index, picture in enumerate(picture_rule.get_result(), 1):
            result.add_full(
                FullPictureInfo(abs_href=URL(picture['src']),
                                rel_name='%03d.jpg' % index))

        for item in picture_model_rule.get_result(['href', 'alt']):
            result.add_control(
                ControlInfo(item['alt'], URL(item['href'] + '/galleries')))

        for item in picture_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into either video sources or a thumbnail listing.

    Rules for thumbnails, pagination, categories, the video player script
    and gallery links are registered, the file is processed through
    ``self.proceed_parcing``, and the rule results are copied into a
    ``ParseResult``.

    Args:
        fname: Path of the file to parse (passed to
            ``self.proceed_parcing``).
        base_url: URL handed to ``self.get_href`` for every extracted
            address.

    Returns:
        A ``ParseResult``. When the video rule matched it carries the
        video sources plus gallery controls and the listing rules are
        ignored; otherwise it carries thumbnails, pages and category
        controls when the start-page rule matched.
    """
    def absolute(href):
        # Shared attribute modifier: resolve an address against base_url.
        return self.get_href(href, base_url)

    parser = SiteParser()

    # Thumbnails on a listing page.
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('article', 'class', 'teaser singleLink hasButtonRow'),
        ('article', 'class', 'activity video hasButtonFooter')
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level(
        'img', {'src', 'data-lazysrc', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    startpage_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(startpage_rule)

    # Pagination links.
    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([
        ('nav', 'class', 'clearfix pagination bottom'),
        ('nav', 'class', 'range rangeCount-2 clearfix')
    ])
    startpage_pages_rule.add_process_rule_level('a', {'href', 'data-href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    startpage_pages_rule.set_attribute_modifier_function(
        'data-href', absolute)
    parser.add_rule(startpage_pages_rule)

    # Category <option> entries.
    startpage_categories_rule = ParserRule()
    startpage_categories_rule.add_activate_rule_level([
        ('select', 'id', 'input_selectCategories')
    ])
    startpage_categories_rule.add_process_rule_level('option', {'value'})
    startpage_categories_rule.set_attribute_modifier_function(
        'value', absolute)
    parser.add_rule(startpage_categories_rule)

    # Player script; only scripts whose text contains 'sources:' are kept.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'playerWrapper')])
    # NOTE(review): `{}` is an empty dict, not an empty set; it is left
    # unchanged here because ParserRule's handling of it is not visible.
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'sources:' in text)
    parser.add_rule(video_rule)

    # Links shown next to a video.
    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('dl', 'class', 'group')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    # Registered with the parser, but its results are not read below.
    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([
        ('nav', 'class', 'profileNav clearfix buttonRow')
    ])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    gallery_user_rule.set_attribute_modifier_function('href', absolute)
    gallery_user_rule.set_attribute_filter_function(
        'href', lambda x: '#videos' in x)
    parser.add_rule(gallery_user_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Drop spaces and backslashes so the quoted markers below match.
            script = item['data'].replace(' ', '').replace('\\', '')
            sources = self.quotes(script, 'sources:{"', '"},').split('","')
            for source in sources:
                # Each entry has the form  label":"address
                label, _, address = source.partition('":"')
                file_url = self.get_href(address, base_url)
                urls.add(label, URL(file_url))
        result.set_video(urls.get_media_data())

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            # Prefer the lazy-load address; look up 'src' only when it is
            # absent, so an <img> carrying only data-lazysrc does not raise
            # KeyError.
            if 'data-lazysrc' in item:
                thumb_src = item['data-lazysrc']
            else:
                thumb_src = item['src']
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb_src),
                          href=URL(item['href']),
                          # The original wrote item.get('alt' ''), which is
                          # item.get('alt') and yields None when 'alt' is
                          # missing; default to '' explicitly.
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            href = item.get('data-href', item['href'])
            # The page label is the last path segment without '*' padding.
            result.add_page(
                ControlInfo(href.rpartition('/')[2].strip('*'), URL(href)))

        for item in startpage_categories_rule.get_result():
            result.add_control(
                ControlInfo(item['data'], URL(item['value'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page and collect thumbnails or full-size pictures.

    Rules for thumbnails, pager links, pictures and tag links are
    registered, the file is fed to the parser line by line, and the rule
    results are copied into a ``ParseResult``.

    Args:
        fname: Path of the file to parse.
        base_url: URL handed to ``self.get_href`` for extracted ``href``
            values; its domain is printed when the method starts.

    Returns:
        A ``ParseResult``. Its type is set to ``'hrefs'`` when the
        start-page rule produced results and to ``'pictures'`` when the
        picture rule produced results (the latter is set last).
    """
    print(base_url.domain())

    def absolute(href):
        # Shared attribute modifier: resolve an address against base_url.
        return self.get_href(href, base_url)

    parser = SiteParser()

    # Thumbnails on a listing page.
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'image')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    # Pager links.
    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pager')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    # Pictures inside a gallery.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'block center')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    # Every '/tn_' in the image address is replaced by '/'; presumably
    # this turns a thumbnail file name into the full-size file name.
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('/tn_', '/'))
    parser.add_rule(picture_rule)

    # Tag links attached to a gallery.
    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'list tags')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    picture_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(picture_href_rule)

    # Use a context manager so the file handle is closed even if
    # parser.feed() raises.
    # NOTE(review): no encoding is passed, so the platform default applies.
    with open(fname) as source:
        for line in source:
            parser.feed(line)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href'] + '*'),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        # Pictures are named 001.jpg, 002.jpg, ... in result order.
        pictures = picture_rule.get_result(['src'])
        for number, picture in enumerate(pictures, start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(picture['src']),
                                rel_name='%03d.jpg' % number))

        for link in picture_href_rule.get_result():
            result.add_control(ControlInfo(link['data'], URL(link['href'])))

    return result