def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and build a ``ParseResult``.

    Three rules are registered on a ``SiteParser``: thumbnails, pagination
    links and gallery pictures. The file is fed to the parser line by line.
    If the picture rule produced results, the result is typed ``'pictures'``;
    otherwise, if the thumbnail rule produced results, it is typed
    ``'hrefs'`` and also receives the pagination links.

    :param fname: path of the file to parse (passed to ``open``).
    :param base_url: URL of the page; ``base_url.domain()`` is prepended to
        pagination hrefs.
    :return: the ``ParseResult``; left unfilled when no rule matched.
    """
    parser = SiteParser()

    # Thumbnail rule: activated by div.thumbs200 / div.thumbs300,
    # processes 'a' (href) and 'img' (src, alt).
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs200'),
                                            ('div', 'class', 'thumbs300')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', self.get_href)
    parser.add_rule(startpage_rule)

    # Pagination rule: only hrefs containing '/st/' are kept, and the
    # domain of base_url is prepended to them.
    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'menu')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    startpage_pages_rule.set_attribute_filter_function(
        'href', lambda txt: '/st/' in txt)
    parser.add_rule(startpage_pages_rule)

    # Picture rule: 'img' src values are rewritten by
    # self.process_picture_address.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'gallery-thumbs')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', self.process_picture_address)
    parser.add_rule(picture_rule)

    # The context manager guarantees the file is closed even if the parser
    # raises part-way through the page.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    pictures = picture_rule.get_result()
    if pictures:
        result.set_type('pictures')
        for f in pictures:
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name=f['src'].rpartition('/')[2]))
        return result

    thumbs = startpage_rule.get_result()
    if thumbs:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and build a ``ParseResult``.

    Rules are registered for video thumbnails, channel thumbnails,
    pagination, navigation links, the player script, a secondary player
    script referenced by ``src``, and links shown next to a video.

    * If the player-script rule matched and at least one video URL could be
      extracted, the result is typed ``'video'`` and receives the video plus
      the related links as controls.
    * Otherwise, if the thumbnail or channel rule matched, the result is
      typed ``'hrefs'`` and receives thumbs, pages and controls.

    :param fname: path of the file to parse (handed to
        ``self.proceed_parcing``).
    :param base_url: URL of the page, passed to ``self.get_href`` for every
        collected ``href``/``src``.
    :return: the ``ParseResult``; left unfilled when nothing matched or no
        video URL was found.
    """

    def absolute(x):
        # Single place for the href/src modifier shared by all rules.
        return self.get_href(x, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'video')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    startpage_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(startpage_rule)

    channels_rule = ParserRule()
    channels_rule.add_activate_rule_level([('ul', 'class', 'channels')])
    channels_rule.add_process_rule_level('a', {'href', 'title'})
    channels_rule.add_process_rule_level('div', {})
    channels_rule.add_process_rule_level('img', {'src', 'alt'})
    # Channel hrefs additionally get every '*' replaced by '/'.
    channels_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url).replace('*', '/'))
    channels_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(channels_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([
        ('ul', 'class', 'pagination pagination-lg')
    ])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([
        ('ul', 'class', 'nav nav-stacked navigation')
    ])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_hrefs_rule)

    # Player script: only script data mentioning 'jwplayer' is kept.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'player')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer' in text)
    parser.add_rule(video_rule)

    # External script whose response lists the bitrates (fallback source).
    video2_rule = ParserRule()
    video2_rule.add_activate_rule_level([('div', 'id', 'video')])
    video2_rule.add_process_rule_level('script', {'src'})
    video2_rule.set_attribute_filter_function(
        'src', lambda text: 'pornbraze.com/' in text)
    video2_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(video2_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([
        ('div', 'class', 'col-xs-12 col-sm-12 col-md-12')
    ])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = []
        for item in video_rule.get_result():
            script = item['data'].replace(' ', '')
            if 'sources:[{' in script:
                # Inline source list: rebuild the JSON array and decode it.
                txt = '[{' + self.quotes(script, 'sources:[{', '}]') + '}]'
                for j_data in json.loads(txt):
                    # Value comparison; the former `is not ''` tested object
                    # identity, which is not guaranteed for strings.
                    if j_data['file'] != '':
                        urls.append(dict(text=j_data['label'],
                                         url=URL(j_data['file'] + '*')))
            elif 'sources:' in script:
                if video2_rule.is_result(['src']):
                    # Sources are not inline: fetch the external script and
                    # read its 'bitrates' list.
                    php_url = URL(video2_rule.get_result(['src'])[0]['src'])
                    res = load(php_url)
                    bitrates = self.quotes(
                        res.text, "'bitrates':[{", "}]").split('},{')
                    for line in bitrates:
                        video_url = self.quotes(line, "'file':'", "'")
                        label = self.quotes(line, 'label:"', '"')
                        urls.append(dict(text=label,
                                         url=URL(video_url + '*')))

        if not urls:
            return result

        video = MediaData(urls[0]['url'])
        if len(urls) > 1:
            # With several variants, every one (including the first) is
            # registered as an alternate.
            for alternate in urls:
                video.add_alternate(alternate)

        result.set_type('video')
        result.set_video(video)

        for f in gallery_href_rule.get_result(['data', 'href']):
            href = f['href'].replace('*', '/')
            label = f['data']
            if '/users/' in href:
                # User links point at the user's public videos and are
                # shown quoted.
                href = href + '/videos/public/'
                label = '"' + label + '"'
            result.add_control(ControlInfo(label, URL(href)))
        return result

    if startpage_rule.is_result() or channels_rule.is_result():
        result.set_type('hrefs')

        # Video thumbs first, then channel thumbs; both use the same mapping.
        for rule in (startpage_rule, channels_rule):
            for item in rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result():
            # The label is the last path component of the link.
            label = item['href'].strip('*/').rpartition('/')[2]
            result.add_control(ControlInfo(label, URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and build a ``ParseResult``.

    Rules are registered for thumbnails, pagination, navigation links, the
    player script, gallery pictures and the links shown next to a
    video/gallery. The file is fed to the parser line by line.

    * Player script found: the result is typed ``'video'``, receives the
      video and the related links, and is returned immediately.
    * Thumbnails found: the result is typed ``'hrefs'`` and receives
      thumbs, pages and controls.
    * Pictures found: the result is typed ``'pictures'`` (overriding
      ``'hrefs'`` if both matched) and receives pictures and controls.

    :param fname: path of the file to parse (passed to ``open``).
    :param base_url: URL of the page; ``base_url.domain()`` is prepended to
        pagination hrefs.
    :return: the ``ParseResult``; left unfilled when no rule matched.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('div', 'class', 'thumbs'),
        ('div', 'class', 'content_box domain2'),
        ('div', 'class', 'video_list'),
        ('div', 'class', 'video_list_models'),
        ('div', 'class', 'pics_list'),
        ('div', 'class', 'movie_thumbs'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', self.get_href)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pages'),
                                                  ('div', 'class', 'pg')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    # Navigation links: only hrefs containing '/st/' are kept.
    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level(
        [('li', 'class', 'orange dropdown')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_filter_function(
        'href', lambda txt: '/st/' in txt)
    parser.add_rule(startpage_hrefs_rule)

    # Player script: only script data containing 'video_url:' is kept.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'player')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'video_url:' in text)
    parser.add_rule(video_rule)

    # Pictures: 't.jpg' in the src is replaced by '.jpg'.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'thumb_box')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('t.jpg', '.jpg'))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'crumbles'),
                                               ('div', 'class', 'tags')])
    picture_href_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(picture_href_rule)

    # The context manager guarantees the file is closed even if the parser
    # raises part-way through the page.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if video_rule.is_result():
        video = MediaData(
            URL(self.get_attr_from_script(video_rule.get_result()[0]['data'])))
        result.set_video(video)
        result.set_type('video')

        for f in picture_href_rule.get_result():
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if thumbs:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    pictures = picture_rule.get_result()
    if pictures:
        result.set_type('pictures')
        for f in pictures:
            result.add_full(FullPictureInfo(rel_name=f['src']))

        # Unlike the video branch, gallery controls are labelled by 'title'.
        for f in picture_href_rule.get_result():
            result.add_control(ControlInfo(f['title'], URL(f['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and build a ``ParseResult``.

    Rules are registered for thumbnails, pagination, navigation links, the
    player script, category links, the uploader link and gallery photos.
    The first matching branch decides the result type:

    * player script -> ``'video'`` (video plus uploader/category controls)
    * photos        -> ``'pictures'`` (pictures plus the same controls)
    * thumbnails    -> ``'hrefs'`` (thumbs, pages and navigation controls)

    :param fname: path of the file to parse (handed to
        ``self.proceed_parcing``).
    :param base_url: URL of the page, passed to ``self.get_href`` and used
        to derive the gallery directory.
    :return: the ``ParseResult``; left unfilled when no rule matched.
    """

    def absolute(x):
        return self.get_href(x, base_url)

    parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([
        ('div', 'class', 'list_videos'),
        ('div', 'class', 'list_albums'),
        ('div', 'class', 'list_videos model-girls-list'),
        ('div', 'class', 'list_videos list_channel'),
    ])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('src', absolute)
    parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'pagination')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages)

    nav_links = ParserRule()
    nav_links.add_activate_rule_level([('div', 'class', 'model-alpha')])
    nav_links.add_process_rule_level('a', {'href'})
    nav_links.set_attribute_modifier_function('href', absolute)
    parser.add_rule(nav_links)

    # Player script: only script data mentioning 'video_url' is kept.
    player = ParserRule()
    player.add_activate_rule_level([('div', 'class', 'vids')])
    player.add_process_rule_level('script', {})
    player.set_attribute_filter_function(
        'data', lambda text: 'video_url' in text)
    parser.add_rule(player)

    categories = ParserRule()
    categories.add_activate_rule_level([('div', 'class', 'video-categories')])
    categories.add_process_rule_level('a', {'href'})
    categories.set_attribute_modifier_function('href', absolute)
    parser.add_rule(categories)

    # Uploader link: hrefs are left unmodified, only '/members/' ones pass.
    uploader = ParserRule(collect_data=True)
    uploader.add_activate_rule_level([('div', 'class', 'video-added-info')])
    uploader.add_process_rule_level('a', {'href'})
    uploader.set_attribute_filter_function(
        'href', lambda x: '/members/' in x)
    parser.add_rule(uploader)

    photos = ParserRule()
    photos.add_activate_rule_level([('div', 'class', 'zoom-gallery')])
    photos.add_process_rule_level('a', {'href'})
    photos.set_attribute_modifier_function('href', absolute)
    parser.add_rule(photos)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    def append_controls():
        """Add uploader (videos/photos) and category links as controls."""
        if uploader.is_result(['href']):
            for entry in uploader.get_result(['href']):
                # The user name is the first word after 'Added by '.
                after_marker = entry['data'].strip().partition('Added by ')[2]
                username = after_marker.partition(' ')[0]
                if username == '':
                    continue
                result.add_control(
                    ControlInfo('"' + username + ' videos"',
                                URL(entry['href'] + 'public_videos/')))
                result.add_control(
                    ControlInfo('"' + username + ' photos"',
                                URL(entry['href'] + 'albums/')))

        for entry in categories.get_result(['data', 'href']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

    if player.is_result():
        compact = player.get_result()[0]['data'].replace(' ', '')
        # Text between "video_url:'" and the next single quote.
        address = compact.partition("video_url:'")[2].partition("'")[0]
        media = MediaData(URL(address))

        result.set_type('video')
        result.set_video(media)

        append_controls()
        return result

    if photos.is_result():
        result.set_type('pictures')
        # Gallery directory: base path plus the last component of the URL.
        last_part = base_url.get().rpartition('/')[2]
        gallery_dir = (base_url.get_path(base=Setting.base_dir)
                       + last_part + '/')
        result.set_gallery_path(gallery_dir)

        for entry in photos.get_result():
            file_name = entry['href'].rpartition('/')[2].strip('*')
            full = FullPictureInfo(abs_href=URL(entry['href']),
                                   rel_name=file_name)
            full.set_base(gallery_dir)
            result.add_full(full)

        append_controls()
        return result

    if thumbs.is_result():
        result.set_type('hrefs')

        for entry in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['src']),
                          href=URL(entry['href']),
                          popup=entry.get('alt', '')))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        if len(nav_links.get_result(['href'])) > 0:
            for entry in nav_links.get_result(['href', 'data']):
                caption = entry.get('data', entry.get('title', ''))
                result.add_control(ControlInfo(caption, URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and build a ``ParseResult``.

    Rules are registered for thumbnails, pagination, channel links,
    ``<video>`` sources, a body script containing ``'shows:'``, slideshow
    pictures and the links shown next to a video/gallery. The first
    matching branch fills the result:

    * video sources / script -> video with alternates plus controls
      (the result type is not set in this branch)
    * slideshow              -> ``'pictures'`` plus controls
    * thumbnails             -> thumbs, pages and channel controls
      (the result type is not set in this branch)

    :param fname: path of the file to parse (handed to
        ``self.proceed_parcing``).
    :param base_url: URL of the page, passed to ``self.get_href`` /
        ``util.get_href``.
    :return: the ``ParseResult``; left unfilled when no rule matched or no
        video file was found.
    """
    parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([
        ('span', 'class', 'thumb_container_box short'),
        ('span', 'class', 'thumb_container_box long'),
    ])
    thumbs.add_process_rule_level('a', {'href', 'title'})
    thumbs.add_process_rule_level('img', {'src'})
    thumbs.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'main-sectionpaging')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function(
        'href', lambda x: util.get_href(x, base_url))
    parser.add_rule(pages)

    # Channel links are collected without any href modification.
    channels = ParserRule()
    channels.add_activate_rule_level([
        ('ul', 'class', 'simple-list simple-list--channels')
    ])
    channels.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(channels)

    # <video><source src id>: '*' is appended to every src.
    sources = ParserRule()
    sources.add_activate_rule_level([('video', '', '')])
    sources.add_process_rule_level('source', {'src', 'id'})
    sources.set_attribute_modifier_function('src', lambda txt: txt + '*')
    parser.add_rule(sources)

    # Body script: only script data containing 'shows:' is kept.
    shows_script = ParserRule()
    shows_script.add_activate_rule_level([('body', '', '')])
    shows_script.add_process_rule_level('script', {})
    shows_script.set_attribute_filter_function(
        'data', lambda text: 'shows:' in text)
    parser.add_rule(shows_script)

    # Slideshow: '/thumbs/' is replaced by '/' in the image src.
    slideshow = ParserRule()
    slideshow.add_activate_rule_level([('div', 'id', 'slideshow')])
    slideshow.add_process_rule_level('a', {'index'})
    slideshow.add_process_rule_level('img', {'src'})
    slideshow.set_attribute_modifier_function(
        'src', lambda txt: txt.replace('/thumbs/', '/'))
    parser.add_rule(slideshow)

    related = ParserRule()
    related.add_activate_rule_level([('div', 'class', 'added')])
    related.add_process_rule_level('a', {'href'})
    related.set_attribute_modifier_function(
        'href', lambda x: util.get_href(x, base_url))
    parser.add_rule(related)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if shows_script.is_result() or sources.is_result():
        files = set()
        default_vid = None
        marker = '"file":"'

        for entry in shows_script.get_result():
            compact = entry['data'].replace(' ', '')
            streams = up.unquote(self.quotes(compact, '"streams":[', ']'))
            # Consume one "file":"..." value per iteration.
            while marker in streams:
                address, _, streams = (
                    streams.partition(marker)[2].partition('"'))
                files.add(address + '*')

        for entry in sources.get_result():
            files.add(entry['src'])
            # A <source> without an id becomes the default video.
            if 'id' not in entry:
                default_vid = entry['src']

        if not files:
            return result

        if default_vid is not None:
            files.discard(default_vid)
        else:
            default_vid = files.pop()

        variants = UrlList()
        variants.add('Default', URL(default_vid))
        for address in files:
            # Label: the four characters before the trailing one.
            variants.add(address[-5:-1], URL(address))

        result.set_video(variants.get_media_data())

        for entry in related.get_result(['href']):
            caption = entry['data'].strip()
            result.add_control(ControlInfo(caption, URL(entry['href'])))
        return result

    if slideshow.is_result():
        result.set_type('pictures')

        first = URL(slideshow.get_result()[0]['src'] + '*')
        gallery_dir = first.get_path(base=Setting.base_dir)
        result.set_gallery_path(gallery_dir)

        for entry in slideshow.get_result():
            number = int(entry['index'])
            full = FullPictureInfo(
                abs_href=URL(entry['src'] + '*'),
                rel_name='pic{0:03d}.jpg'.format(number))
            full.set_base(gallery_dir)
            result.add_full(full)

        for entry in related.get_result(['href']):
            caption = entry['data'].strip()
            result.add_control(ControlInfo(caption, URL(entry['href'])))
        return result

    if thumbs.is_result():
        for entry in thumbs.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['src']),
                          href=URL(entry['href']),
                          popup=entry.get('title', '')))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        if len(channels.get_result(['href'])) > 0:
            for entry in channels.get_result(['href', 'data']):
                caption = entry.get('title', entry.get('data', ''))
                result.add_control(ControlInfo(caption, URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and build a ``ParseResult``.

    Rules are registered for thumbnails, pagination, navigation links, the
    player script, category links, the uploader link and photo links. The
    first matching branch decides the result type:

    * player script -> ``'video'`` (video plus uploader/category controls)
    * photos        -> ``'pictures'``
    * thumbnails    -> ``'hrefs'`` (thumbs, pages and navigation controls)

    :param fname: path of the file to parse (handed to
        ``self.proceed_parcing``).
    :param base_url: URL of the page, passed to ``self.get_href`` and used
        to derive the gallery directory.
    :return: the ``ParseResult``; left unfilled when no rule matched or the
        player script contained no video variable.
    """

    def absolute(x):
        return self.get_href(x, base_url)

    parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([
        ('div', 'class', 'well wellov well-sm'),
        ('div', 'class', 'col-sm-4 m-t-15'),
    ])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt', 'data-original'})
    thumbs.set_attribute_modifier_function('href', absolute)
    parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level([('ul', 'class', 'pagination')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    parser.add_rule(pages)

    nav_links = ParserRule()
    nav_links.add_activate_rule_level([('div', 'class', 'btn-group')])
    nav_links.add_process_rule_level('a', {'href'})
    nav_links.set_attribute_modifier_function('href', absolute)
    parser.add_rule(nav_links)

    # Player script: only script data mentioning 'nuevoplayer' is kept.
    player = ParserRule()
    player.add_activate_rule_level([('div', 'class', 'container')])
    player.add_process_rule_level('script', {})
    player.set_attribute_filter_function(
        'data', lambda text: 'nuevoplayer' in text)
    parser.add_rule(player)

    categories = ParserRule()
    categories.add_activate_rule_level(
        [('div', 'class', 'm-t-10 overflow-hidden catmenu')])
    categories.add_process_rule_level('a', {'href'})
    categories.set_attribute_modifier_function('href', absolute)
    parser.add_rule(categories)

    # Uploader link: '/videos' is appended before resolving the href.
    uploader = ParserRule()
    uploader.add_activate_rule_level(
        [('div', 'class', 'pull-left user-container')])
    uploader.add_process_rule_level('a', {'href'})
    uploader.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x + '/videos', base_url))
    uploader.set_attribute_filter_function('href', lambda x: '/user/' in x)
    parser.add_rule(uploader)

    photos = ParserRule()
    photos.add_activate_rule_level([('div', 'class', 'panel-body')])
    photos.add_process_rule_level('a', {'href'})
    photos.set_attribute_filter_function(
        'href', lambda text: '/photos/' in text)
    photos.set_attribute_modifier_function('href', absolute)
    parser.add_rule(photos)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player.is_result():
        script = player.get_result()[0]['data']

        # Every statement containing 'var video' yields one variant: the
        # text after 'video_' is the label, the quoted value is the URL.
        variants = []
        for statement in script.split(';'):
            if 'var video' not in statement:
                continue
            label, _, value = (
                statement.strip().partition('video_')[2].partition('='))
            variants.append(
                dict(text=label, url=URL(value.strip("'") + '*')))

        if not variants:
            return result

        # The last variant is the primary one; with several variants all of
        # them are registered as alternates.
        media = MediaData(variants[-1]['url'])
        if len(variants) > 1:
            for variant in variants:
                media.add_alternate(variant)

        result.set_type('video')
        result.set_video(media)

        if uploader.is_result():
            for entry in uploader.get_result():
                username = entry['data'].strip()
                if username != '':
                    result.add_control(
                        ControlInfo('"' + username + '"',
                                    URL(entry['href'])))

        for entry in categories.get_result(['data', 'href']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))
        return result

    if photos.is_result():
        result.set_type('pictures')
        gallery_dir = base_url.get_path(base=Setting.base_dir)
        result.set_gallery_path(gallery_dir)

        for entry in photos.get_result():
            file_name = entry['href'].rpartition('/')[2].strip('*')
            full = FullPictureInfo(abs_href=URL(entry['href']),
                                   rel_name=file_name)
            full.set_base(gallery_dir)
            result.add_full(full)
        return result

    if thumbs.is_result():
        result.set_type('hrefs')

        for entry in thumbs.get_result(['href']):
            # 'data-original' is preferred over 'src' for the thumbnail.
            image = self.get_href(
                entry.get('data-original', entry['src']), base_url)
            target = entry['href']
            if '/album/' in target:
                # Album links are redirected to the slideshow page.
                target = target.replace('/album/', '/album/slideshow/')
            result.add_thumb(
                ThumbInfo(thumb_url=URL(image),
                          href=URL(target),
                          popup=entry.get('alt', '')))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

        if len(nav_links.get_result(['href'])) > 0:
            for entry in nav_links.get_result(['href', 'data']):
                caption = entry.get('title', entry.get('data', ''))
                result.add_control(ControlInfo(caption, URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and build a ``ParseResult``.

    Rules are registered for thumbnails, pagination spans, channel
    thumbnails, channel-category spans, the player script and video tag
    links. The first matching branch decides the result:

    * player script -> ``'video'`` (video plus tag links as controls)
    * channels      -> ``'hrefs'`` (channel thumbs plus category pages)
    * thumbnails    -> ``'hrefs'`` (thumbs plus pagination pages)

    Pagination and category entries carry ``data-query-key`` /
    ``data-query-value`` attributes instead of an href; the page address is
    built by setting that query parameter on ``base_url``.

    :param fname: path of the file to parse (handed to
        ``self.proceed_parcing``).
    :param base_url: URL of the page, passed to ``self.get_href``;
        ``base_url.get()`` is the address the query parameters are applied
        to.
    :return: the ``ParseResult``; left unfilled when no rule matched or no
        video source was found.
    """

    def absolute(x):
        return self.get_href(x, base_url)

    query_attrs = {'data-query-key', 'data-query-value'}

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('ul', 'class', 'thumbs'),
                                            ('li', 'class', 'category')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level(
        'img', {'src', 'alt', 'data-original'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'pager paging')])
    startpage_pages_rule.add_process_rule_level('span', set(query_attrs))
    parser.add_rule(startpage_pages_rule)

    # Channel thumbs: only '/channel/' and '/prime/' links are kept.
    channels_rule = ParserRule()
    channels_rule.add_activate_rule_level([('ul', 'class', 'tag-150-list')])
    channels_rule.add_process_rule_level('a', {'href'})
    channels_rule.add_process_rule_level('img', {'src'})
    channels_rule.set_attribute_filter_function(
        'href', lambda x: '/channel/' in x or '/prime/' in x)
    channels_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(channels_rule)

    channel_categories_rule = ParserRule()
    channel_categories_rule.add_activate_rule_level([
        ('ul', 'class', 'link-tag-list long-col')
    ])
    channel_categories_rule.add_process_rule_level('span', set(query_attrs))
    parser.add_rule(channel_categories_rule)

    # Player script: only script data mentioning 'players.push' is kept.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'player-container')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'players.push' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('ul', 'class', 'video-tag-list')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        # Drop spaces and backslashes, then take the body of 'sources':{...}.
        script = video_rule.get_result()[0]['data'].replace(
            ' ', '').replace('\\', '')
        sources = script.partition("'sources':{")[2].partition(
            '}')[0].split(',')

        urls = []
        for item in sources:
            part = item.strip("\n\t'").partition("':'")
            # NOTE(review): only plain 'http://' sources are accepted;
            # confirm whether 'https://' should be accepted as well.
            if part[2].startswith('http://'):
                urls.append(dict(text=part[0],
                                 url=URL(part[2].strip("'") + '*')))

        if not urls:
            return result

        # The last source is the primary one; with several sources all of
        # them are registered as alternates.
        video = MediaData(urls[-1]['url'])
        if len(urls) > 1:
            for alternate in urls:
                video.add_alternate(alternate)

        result.set_type('video')
        result.set_video(video)

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(f['data'].strip(), URL(f['href'])))
        return result

    def add_key(old, key, value):
        """Return address ``old`` with query parameter ``key`` set to
        ``value`` (replaced if present, appended otherwise)."""
        addr, _, query = old.partition('?')
        replacement = key + '=' + value
        found = False
        pairs = []
        for pair in query.split('&'):
            # Compare the whole parameter name: a prefix test would also
            # overwrite e.g. 'pagesize=..' when setting 'page'.
            if pair.partition('=')[0] == key:
                pairs.append(replacement)
                found = True
            else:
                pairs.append(pair)
        if not found:
            pairs.append(replacement)
        # strip('&') removes the separator left by an empty query string.
        return addr + '?' + '&'.join(pairs).strip('&')

    def add_pages_info_to_result(rule, description_key='data-query-value'):
        """Add one page per query-key/value entry collected by ``rule``;
        ``description_key`` selects the item field used as label."""
        for item in rule.get_result(['data-query-key', 'data-query-value']):
            key = item['data-query-key']
            val = item['data-query-value']
            description = item[description_key].strip('\t')
            addr = add_key(base_url.get(), key, val)
            result.add_page(ControlInfo(description, URL(addr + '*')))

    if channels_rule.is_result():
        result.set_type('hrefs')

        for item in channels_rule.get_result():
            # Popup text is the last path component of the channel link.
            info = item['href'].rpartition('/')[2].strip('*')
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=info))

        add_pages_info_to_result(channel_categories_rule,
                                 description_key='data')
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href']):
            # 'data-original' is preferred over 'src' for the thumbnail.
            t_url = item.get('data-original', item['src'])
            result.add_thumb(
                ThumbInfo(thumb_url=URL(t_url),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        add_pages_info_to_result(startpage_pages_rule)

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the page stored in ``fname`` and build a ``ParseResult``.

    Rules are registered for thumbnails, pagination, navigation links, the
    player script and category links. The first matching branch fills the
    result:

    * player script -> video built from every ``file:'...'`` value, plus
      category links as controls (the result type is not set here)
    * thumbnails    -> ``'hrefs'`` (thumbs, pages and navigation controls)

    :param fname: path of the file to parse (handed to
        ``self.proceed_parcing``).
    :param base_url: URL of the page, passed to ``self.get_href`` for every
        collected href.
    :return: the ``ParseResult``; left unfilled when no rule matched.
    """

    def absolute(x):
        return self.get_href(x, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'videos_form')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'data-lazy-src'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'wp-pagenavi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_hrefs_rule)

    # Player script: only script data containing 'jwplayer(' is kept.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'videos_page')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer(' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'Categories')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Value of file:'...' in the script with spaces removed.
            video_file = self.quotes(
                item['data'].replace(' ', ''), "file:'", "'")
            urls.add('default', URL(video_file + '*'))

        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['data-lazy-src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href']):
            href = item['href']
            # Label is the second-to-last '/'-separated part of the link.
            label = href.split('/')[-2]
            result.add_control(ControlInfo(label, URL(href)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` (video page or thumb list).

    Prints a notice that the site is slow, registers five ``ParserRule``
    objects on one ``SiteParser`` and runs it through
    ``self.proceed_parcing(parser, fname)``.

    * If the video rule matched (a ``<script>`` containing ``flashvars``
      under ``div.player``), the text after ``video_url:'`` is added to a
      ``UrlList``, the media data is stored with ``set_video`` and the links
      of the gallery rule become controls.
    * Otherwise, if the thumbnail rule matched, thumbs, pages and controls
      are added.

    Args:
        fname: handed to ``self.proceed_parcing`` (presumably the path of the
            downloaded page -- confirm against that helper).
        base_url: handed to ``self.get_href`` for every collected ``href``.

    Returns:
        The populated ``ParseResult``; untouched when no rule matched.
    """
    print('This site is very slow, be patient')
    parser = SiteParser()

    def absolute(href):
        # Single place for the href rewrite every rule below shares.
        return self.get_href(href, base_url)

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'item ')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pagination-holder')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'player')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function('data', lambda text: 'flashvars' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'item')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Spaces are removed first so the "video_url:'" marker matches
            # regardless of how the script was formatted.
            video_file = self.quotes(item['data'].replace(' ', ''), "video_url:'", "'")
            urls.add('default', URL(video_file))
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result(['href']):
            href = item['href']
            # The control label is the second-to-last '/'-separated piece
            # of the link.
            label = href.split('/')[-2]
            result.add_control(ControlInfo(label, URL(href)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` (video page or thumb list).

    Four ``ParserRule`` objects are registered on one ``SiteParser`` and the
    parser is run through ``self.proceed_parcing(parser, fname)``.

    * If the video rule matched (a ``<script>`` containing ``jwplayer(``
      under ``div.stage-video``), the text after ``file:"`` is added to a
      ``UrlList`` under the label ``'DEFAULT'`` and stored with
      ``set_video``; links from the stats list become controls, with the
      label wrapped in double quotes when the link contains ``/user/``.
    * Otherwise, if the thumbnail rule matched, thumbs and pages are added.

    Args:
        fname: handed to ``self.proceed_parcing`` (presumably the path of the
            downloaded page -- confirm against that helper).
        base_url: handed to ``self.get_href`` for every collected ``href``
            and ``src``.

    Returns:
        The populated ``ParseResult``; untouched when no rule matched.
    """
    parser = SiteParser()

    def absolute(link):
        # Single place for the link rewrite every rule below shares.
        return self.get_href(link, base_url)

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'inner-block')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    startpage_rule.set_attribute_modifier_function('src', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'id', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'stage-video')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer(' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([
        ('ul', 'class', 'stats-list stats-list--plain')
    ])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            video_file = self.quotes(item['data'].replace(' ', ''), 'file:"', '"')
            urls.add('DEFAULT', URL(video_file))
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            # User links are shown quoted to tell them apart from the rest.
            form = '"{0}"' if '/user/' in f['href'] else '{0}'
            result.add_control(
                ControlInfo(form.format(f['data']), URL(f['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          # Fixed: the original read item.get('alt' ''), which
                          # Python parses as item.get('alt') and therefore
                          # yielded None instead of '' for a missing alt.
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build a ``ParseResult`` from a saved page (video page or thumb list).

    Five ``ParserRule`` objects are attached to one ``SiteParser`` which is
    then run through ``self.proceed_parcing(parser, fname)``.

    * Video page: the ``flashvars={...};`` text of each matched script is
      split on ``,`` and ``:`` into a dict, and its ``video_url`` entry
      (plus ``'*'``) is added to a ``UrlList``.  Uploader links (label in
      double quotes) and info links are added as controls.
    * Thumb list: one thumb per matched item and one page per pagination
      link, labelled with the text after ``:`` in ``data-query``.

    Args:
        fname: handed to ``self.proceed_parcing`` (presumably the path of the
            downloaded page -- confirm against that helper).
        base_url: handed to ``self.get_href`` for collected links.

    Returns:
        The populated ``ParseResult``; untouched when no rule matched.
    """

    def to_full(link):
        return self.get_href(link, base_url)

    def user_videos(link):
        # Uploader links are pointed at their 'videos/' sub-page.
        return self.get_href(link + 'videos/', base_url)

    parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'item '),
                                    ('div', 'class', 'list-categories')])
    thumbs.add_process_rule_level('a', {'href', 'title'})
    thumbs.add_process_rule_level('img', {'data-original', 'alt'})
    thumbs.set_attribute_modifier_function('href', to_full)
    thumbs.set_attribute_modifier_function('src', to_full)
    parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'pagination')])
    pages.add_process_rule_level('li', {'class'})
    pages.add_process_rule_level('a', {'href', 'data-query'})
    pages.set_attribute_filter_function('class', lambda x: x == 'page')
    pages.set_attribute_modifier_function('data-query', lambda x: x.partition(':')[2])
    pages.set_attribute_modifier_function('href', to_full)
    parser.add_rule(pages)

    video = ParserRule()
    video.add_activate_rule_level([('div', 'class', 'player')])
    video.add_process_rule_level('script', {})
    video.set_attribute_filter_function('data', lambda text: 'flashvars' in text)
    parser.add_rule(video)

    info_links = ParserRule()
    info_links.add_activate_rule_level([('div', 'class', 'info')])
    info_links.add_process_rule_level('a', {'href'})
    info_links.set_attribute_modifier_function('href', to_full)
    parser.add_rule(info_links)

    uploader = ParserRule()
    uploader.add_activate_rule_level([('div', 'class', 'block-user')])
    uploader.add_process_rule_level('a', {'href', 'title'})
    uploader.set_attribute_modifier_function('href', user_videos)
    parser.add_rule(uploader)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video.is_result():
        urls = UrlList()
        for entry in video.get_result():
            compact = entry['data'].replace(' ', '').replace('\n', '').replace('\t', '')
            pairs = self.quotes(compact, 'flashvars={', '};').split(',')
            # "key:value" pieces -> dict, with surrounding quotes removed
            # from the values.
            fv = {
                key: value.strip("'\"")
                for key, _, value in (pair.partition(':') for pair in pairs)
            }
            urls.add('default', URL(fv['video_url'] + '*'))
        result.set_video(urls.get_media_data())

        for entry in uploader.get_result(['title', 'href']):
            quoted = '"' + entry['title'].strip() + '"'
            result.add_control(ControlInfo(quoted, URL(entry['href'])))

        for entry in info_links.get_result(['data', 'href']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))
        return result

    if thumbs.is_result():
        for entry in thumbs.get_result(['href']):
            # Fall back to the link title when the image has no alt text.
            caption = entry.get('alt', entry.get('title', ''))
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['data-original']),
                          href=URL(entry['href']),
                          popup=caption))

        for entry in pages.get_result(['href', 'data-query']):
            result.add_page(ControlInfo(entry['data-query'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` (video, thumbs or pictures).

    Five ``ParserRule`` objects are registered on one ``SiteParser`` and the
    file is fed to the parser line by line (decoded as UTF-8).

    * Video page: the first matched script is passed to
      ``self.get_attr_from_script`` and wrapped in ``MediaData``; the type
      is set to ``'video'`` and tag links become controls.
    * Thumb list: type ``'hrefs'``, one thumb per item and one page per
      pager link.  This branch does not return, so the picture branch is
      still evaluated afterwards.
    * Picture gallery: type ``'pictures'``, an ``ELSitePictureCollector``
      is installed, each picture is named ``001.jpg``, ``002.jpg``, ... and
      tag links become controls.

    Args:
        fname: path of the file to read.
        base_url: its ``domain()`` is prefixed to the pager links.

    Returns:
        The populated ``ParseResult``; untouched when no rule matched.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'image-delete'),
                                            ('div', 'class', 'thumbs'),
                                            ('div', 'class', 'image')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pager')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'main')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'function getEmbed()' in text)
    parser.add_rule(video_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'thumbs2')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'src'})
    parser.add_rule(picture_rule)

    picture_tags_rule = ParserRule()
    picture_tags_rule.add_activate_rule_level([('div', 'class', 'main')])
    picture_tags_rule.add_process_rule_level('a', {'href'})
    picture_tags_rule.set_attribute_filter_function(
        'href', lambda txt: '/categories/' in txt or '/model/' in txt)
    parser.add_rule(picture_tags_rule)

    # Fixed: the original iterated over a bare open(...) and never closed
    # the file handle.
    with open(fname, encoding='utf-8') as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(video_rule.get_result()) > 0:
        script = video_rule.get_result()[0]['data']
        result.set_video(MediaData(URL(self.get_attr_from_script(script))))
        result.set_type('video')
        for item in picture_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        return result

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        result.set_picture_collector(ELSitePictureCollector())
        # Pictures get sequential, zero-padded file names starting at 001.
        for number, f in enumerate(picture_rule.get_result(), start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['href']),
                                rel_name='%03d.jpg' % number))

        for item in picture_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` (picture gallery or thumbs).

    Six ``ParserRule`` objects are registered on one ``SiteParser`` and the
    file is fed to the parser line by line.

    * Gallery page (the picture rule matched links ending in ``.jpg``):
      type ``'pictures'``, each picture is named ``001.jpg``, ``002.jpg``,
      ... and the gallery links become controls labelled with their
      ``title``.
    * Thumb list: type ``'hrefs'``, one thumb per item; "more" links and
      the letter index become controls; page links become pages.

    Args:
        fname: path of the file to read.
        base_url: handed to ``self.get_href`` for every collected link.

    Returns:
        The populated ``ParseResult``; untouched when no rule matched.
    """
    parser = SiteParser()

    def absolute(link):
        # Single place for the link rewrite every rule below shares.
        return self.get_href(link, base_url)

    # Start page and model page thumbnails.
    href_rule = ParserRule()
    href_rule.add_activate_rule_level([('div', 'id', 'lst-galleries'),
                                       ('div', 'class', 'lblock'),
                                       ('div', 'class', 'modal_info_full')])
    href_rule.add_process_rule_level('a', {'href'})
    href_rule.add_process_rule_level('img', {'src', 'alt'})
    href_rule.set_attribute_modifier_function('href', absolute)
    href_rule.set_attribute_modifier_function('src', absolute)
    href_rule.set_attribute_filter_function('href', self.thumb_href_filter)
    href_rule.set_attribute_filter_function('src', self.thumb_src_filter)
    parser.add_rule(href_rule)

    # Page numbers on a model page.
    href_page_rule = ParserRule()
    href_page_rule.add_activate_rule_level([('div', 'class', 'pages'),
                                            ('div', 'class', 'cat')])
    href_page_rule.add_process_rule_level('a', {'href'})
    href_page_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(href_page_rule)

    model_litera_rule = ParserRule()
    model_litera_rule.add_activate_rule_level([('div', 'class', 'babe_index')])
    model_litera_rule.add_process_rule_level('a', {'href'})
    model_litera_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(model_litera_rule)

    model_more_rule = ParserRule()
    model_more_rule.add_activate_rule_level([('div', 'class', 'more'),
                                             ('div', 'id', 'MoreCont')])
    model_more_rule.add_process_rule_level('a', {'href'})
    model_more_rule.set_attribute_modifier_function('href', absolute)
    model_more_rule.set_attribute_filter_function('href', self.thumb_href_filter)
    parser.add_rule(model_more_rule)

    # Full-size pictures of a gallery: only links ending in '.jpg'.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'lblock')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'alt'})
    picture_rule.set_attribute_modifier_function('href', absolute)
    picture_rule.set_attribute_filter_function('href', lambda x: x.endswith('.jpg'))
    parser.add_rule(picture_rule)

    # Navigation links shown next to a gallery.
    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'id', 'ModelMenu'),
                                               ('div', 'class', 'lblock')])
    picture_href_rule.add_process_rule_level('a', {'href', 'title'})
    picture_href_rule.set_attribute_modifier_function('href', absolute)
    picture_href_rule.set_attribute_filter_function('href', self.gal_href_filter)
    parser.add_rule(picture_href_rule)

    # Fixed: the original iterated over a bare open(...) and never closed
    # the file handle.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        # Pictures get sequential, zero-padded file names starting at 001.
        for number, f in enumerate(picture_rule.get_result(), start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['href']),
                                rel_name='%03d.jpg' % number))

        for f in picture_href_rule.get_result(['href', 'title']):
            result.add_control(ControlInfo(text=f['title'], url=URL(f['href'])))
        return result

    if len(href_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in href_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in model_more_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

        for item in model_litera_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

        for item in href_page_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` (video page or thumb list).

    Five ``ParserRule`` objects are registered on one ``SiteParser`` and the
    parser is run through ``self.proceed_parcing(parser, fname)``.

    * Video page: the ``"sources":[{...}]`` section of the first matched
      script is split into entries; the last entry becomes the main
      ``MediaData`` and, when there are several, every entry is added as
      an alternate.  Type is set to ``'video'`` and sticker-nav links
      become controls.  When the script has no source entries the result
      is returned untouched.
    * Thumb list: type ``'hrefs'``, thumbs, pages and tag controls.

    Args:
        fname: handed to ``self.proceed_parcing`` (presumably the path of the
            downloaded page -- confirm against that helper).
        base_url: its ``domain()`` is prefixed to every collected ``href``.

    Returns:
        The populated ``ParseResult``; untouched when nothing matched.
    """
    parser = SiteParser()

    def with_domain(href):
        return base_url.domain() + href

    def with_domain_star(href):
        return base_url.domain() + href + '*'

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'data-original'})
    startpage_rule.set_attribute_modifier_function('href', with_domain)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', with_domain_star)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('div', 'class', 'listTags listTags5')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', with_domain_star)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('body', '', '')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer("jw_video").setup' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('ul', 'class', 'stickerNav')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', with_domain_star)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if len(video_rule.get_result()) > 0:
        script = video_rule.get_result()[0]['data']
        raw_sources = script.partition('"sources":[{')[2].partition('}]')[0]
        # Fixed: str.split() never returns an empty list, so the original
        # "no sources" branch was unreachable and an empty entry produced a
        # bogus URL('*').  Empty entries are dropped before the check.
        sources = [chunk for chunk in raw_sources.split('},{') if chunk]

        def parce(txt):
            # Pull the "label" and "file" string values out of one entry.
            label = txt.partition('"label":"')[2].partition('"')[0]
            file = txt.partition('"file":"')[2].partition('"')[0]
            return dict(text=label, url=URL(file + '*'))

        if not sources:
            return result

        # The last listed source is the one played by default.
        video = MediaData(parce(sources[-1])['url'])
        if len(sources) > 1:
            for item in sources:
                video.add_alternate(parce(item))

        result.set_type('video')
        result.set_video(video)

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href', 'data-original']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['data-original']),
                          href=URL(item['href']),
                          popup=item.get('title', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        # Looping over an empty result is a no-op, so no separate length
        # check (and second query of the rule) is needed.
        for item in startpage_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a ``ParseResult`` (video page or thumb list).

    Seven ``ParserRule`` objects are registered on one ``SiteParser`` and
    the parser is run through ``self.proceed_parcing(parser, fname)``.

    * Video page: every line of the first matched script that starts with
      ``playerData.cdnPath`` and does not contain ``''`` is treated as one
      source.  The last source becomes the main ``MediaData``; when there
      are several, each is also added as an alternate.  With no such line
      the result is returned untouched.  Uploader links (label in double
      quotes), channel links and tag links become controls, in that order.
    * Thumb list: type ``'hrefs'``; the thumbnail address is taken from
      ``data-src``, else ``data-original``, else ``src``.

    Args:
        fname: handed to ``self.proceed_parcing`` (presumably the path of the
            downloaded page -- confirm against that helper).
        base_url: its ``domain()`` is prefixed to every collected ``href``.

    Returns:
        The populated ``ParseResult``; untouched when nothing matched.
    """
    parser = SiteParser()

    def with_domain(href):
        return base_url.domain() + href

    def with_domain_star(href):
        return base_url.domain() + href + '*'

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thmb-wrapper')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level(
        'img', {'src', 'alt', 'data-src', 'data-original'})
    startpage_rule.set_attribute_modifier_function('href', with_domain)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'id', 'divPagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', with_domain_star)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('div', 'class', 'listTags listTags5')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', with_domain_star)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'videoContainer')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'playerData.cdnPath' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([
        ('div', 'class', 'video-info-tags float-left')
    ])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', with_domain_star)
    parser.add_rule(gallery_href_rule)

    gallery_channel_rule = ParserRule()
    gallery_channel_rule.add_activate_rule_level([
        ('div', 'class', 'video-info-uploaded float-right')
    ])
    gallery_channel_rule.add_process_rule_level('a', {'href'})
    gallery_channel_rule.set_attribute_modifier_function('href', with_domain_star)
    gallery_channel_rule.set_attribute_filter_function(
        'href', lambda x: '/categories/' in x)
    parser.add_rule(gallery_channel_rule)

    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([
        ('div', 'class', 'video-info-uploaded float-right')
    ])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    gallery_user_rule.set_attribute_modifier_function('href', with_domain_star)
    gallery_user_rule.set_attribute_filter_function('href', lambda x: '/user/' in x)
    parser.add_rule(gallery_user_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        script = video_rule.get_result()[0]['data'].replace(' ', '')
        # Keep only the cdnPath assignments that carry a non-empty value.
        data = [
            line.strip()
            for line in script.split('\n')
            if line.strip().startswith('playerData.cdnPath') and "''" not in line
        ]

        def parce(txt):
            # Label: text between 'playerData.cdnPath' and '='.
            # File: first single-quoted value, with the three escapes the
            # page uses (%3A, %2F, %26) decoded.
            label = txt.partition('playerData.cdnPath')[2].partition('=')[0]
            file = txt.partition("'")[2].partition("'")[0].replace(
                '%3A', ':').replace('%2F', '/').replace('%26', '&')
            return dict(text=label, url=URL(file + '*'))

        if not data:
            return result

        # The last listed source is the one played by default.
        video = MediaData(parce(data[-1])['url'])
        if len(data) > 1:
            for item in data:
                video.add_alternate(parce(item))

        result.set_type('video')
        result.set_video(video)

        for f in gallery_user_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo('"' + f['data'] + '"', URL(f['href'])))

        for f in gallery_channel_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href']):
            # First attribute present wins, in this order of preference.
            for key in ('data-src', 'data-original', 'src'):
                if key in item:
                    src = item[key]
                    break
            else:
                print('New key found. Need rewrite "startpage_rule"')
                continue

            result.add_thumb(
                ThumbInfo(thumb_url=URL(src),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        # Looping over an empty result is a no-op, so no separate length
        # check (and second query of the rule) is needed.
        for item in startpage_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Build a ``ParseResult`` from a saved page (video page or thumb list).

    Four ``ParserRule`` objects are attached to one ``SiteParser`` which is
    then run through ``self.proceed_parcing(parser, fname)``.

    * Video page: the ``flashvars={...};`` text of each matched script is
      split on ``,`` and ``:`` into a dict.  Every value that starts with
      ``http://`` and whose last ``.``-suffix (slashes stripped) is ``mp4``
      is kept, labelled with the matching ``<key>_text`` entry or the key
      itself, and added to a ``UrlList`` in descending label order.  Links
      under ``div.info`` become controls.
    * Thumb list: one thumb per matched item and one page per pagination
      link.

    Args:
        fname: handed to ``self.proceed_parcing`` (presumably the path of the
            downloaded page -- confirm against that helper).
        base_url: handed to ``self.get_href`` for collected links.

    Returns:
        The populated ``ParseResult``; untouched when no rule matched.
    """

    def to_full(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'item ')])
    thumbs.add_process_rule_level('a', {'href', 'title'})
    thumbs.add_process_rule_level('img', {'data-original', 'alt'})
    thumbs.set_attribute_modifier_function('href', to_full)
    thumbs.set_attribute_modifier_function('src', to_full)
    parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'pagination-holder')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', to_full)
    parser.add_rule(pages)

    video = ParserRule()
    video.add_activate_rule_level([('div', 'class', 'player')])
    video.add_process_rule_level('script', {})
    video.set_attribute_filter_function('data', lambda text: 'flashvars' in text)
    parser.add_rule(video)

    info_links = ParserRule()
    info_links.add_activate_rule_level([('div', 'class', 'info')])
    info_links.add_process_rule_level('a', {'href'})
    info_links.set_attribute_modifier_function('href', to_full)
    parser.add_rule(info_links)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video.is_result():
        urls = UrlList()
        for entry in video.get_result():
            compact = entry['data'].replace(' ', '').replace('\n', '').replace('\t', '')
            pairs = self.quotes(compact, 'flashvars={', '};').split(',')
            # "key:value" pieces -> dict, with surrounding quotes removed
            # from the values.
            fv = {
                key: value.strip("'\"")
                for key, _, value in (pair.partition(':') for pair in pairs)
            }

            files = {}
            for name, value in fv.items():
                if not value.startswith('http://'):
                    continue
                if value.rpartition('.')[2].strip('/') != 'mp4':
                    continue
                files[fv.get(name + '_text', name)] = value

            for label in sorted(files, reverse=True):
                urls.add(label, URL(files[label]))
        result.set_video(urls.get_media_data())

        for entry in info_links.get_result(['data', 'href']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))
        return result

    if thumbs.is_result():
        for entry in thumbs.get_result(['href']):
            # Fall back to the link title when the image has no alt text.
            caption = entry.get('alt', entry.get('title', ''))
            result.add_thumb(
                ThumbInfo(thumb_url=URL(entry['data-original']),
                          href=URL(entry['href']),
                          popup=caption))

        for entry in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page (HTML) or a saved JSON listing into a ``ParseResult``.

    Three paths, checked in this order:

    1. Video page -- the video rule matched a ``<script>`` containing
       ``flashvars=``: every ``quality_*`` entry is added to a ``UrlList``
       and user/info links become controls.
    2. JSON listing -- ``base_url`` contains ``format=json``: the HTML
       parser is not run; ``fname`` is loaded with ``json.load`` and thumbs
       and page buttons are built from it.  Invalid JSON is silently
       ignored and the result is returned as built so far.
    3. HTML thumb list -- thumbs are added plus a single ``'next'`` page
       whose URL carries the JSON query parameters.

    Args:
        fname: handed to ``self.proceed_parcing`` for HTML pages, or opened
            directly and read as JSON for XHR pages.
        base_url: the page address; used to choose the path, to rewrite
            links via ``self.get_href`` and, on the JSON path, as the
            carrier of ``xhr_data``.

    Returns:
        The populated ``ParseResult``; untouched when nothing matched.
    """
    # True when this call handles a JSON (XHR) listing instead of HTML.
    if base_url.contain('format=json'):
        xhr_page = True
    else:
        xhr_page = False

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('div', 'class', 'relative margin-auto video-box-wrapper normal-box')
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'alt', 'data-srcmedium'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'video-container')
                                        ])
    video_rule.add_process_rule_level('script', {})
    # Spaces are ignored so both 'flashvars=' and 'flashvars =' match.
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'flashvars=' in text.replace(' ', ''))
    parser.add_rule(video_rule)

    video_href_rule = ParserRule()
    video_href_rule.add_activate_rule_level([('div', 'class',
                                              'ibInfo js_ibInfo')])
    video_href_rule.add_process_rule_level('a', {'href'})
    video_href_rule.set_attribute_filter_function(
        'href', lambda x: 'javascript' not in x)
    video_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(video_href_rule)

    video_user_rule = ParserRule()
    video_user_rule.add_activate_rule_level([('div', 'class', 'ibLine1')])
    video_user_rule.add_process_rule_level('a', {'href'})
    video_user_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(video_user_rule)

    # The HTML rules are only run for real HTML pages; for a JSON listing
    # they stay empty.
    if not xhr_page:
        self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Take the body of 'flashvars={...};', drop backslashes and
            # split it into entries on ',"'.
            flashvars = self.quotes(item['data'].replace(' ', ''),
                                    'flashvars={',
                                    '};').replace('\\', '').split(',"')
            for v in flashvars:
                if v.startswith('quality_'):
                    label = self.quotes(v, 'quality_', '"')
                    file = self.quotes(v, ':"', '"')
                    urls.add(label, URL(file + '*'))

        result.set_video(urls.get_media_data(-1))

        for f in video_user_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo('"' + f['data'] + '"', URL(f['href'])))

        for f in video_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if xhr_page:

        def parce_data(data, buttons_added, base_url, items_name='items'):
            # Adds one thumb per entry of data[items_name] to the enclosing
            # ``result`` and, unless buttons_added is already true, the page
            # buttons.  Returns the (possibly updated) buttons_added flag.
            xhr_data = base_url.xhr_data
            items = data[items_name]
            nav = data['navigation']
            last_page = nav['lastPage']
            if last_page is None:
                last_page = 1
            # The current page number is whatever follows the last 'page='
            # in the address.
            curr_page = int(base_url.get().rpartition('page=')[2])
            pattern = nav['urlPattern'].replace('[%pageId%]', '{}') + '*'

            for item in items:
                thumb_url = item['thumb_url']
                title = item['specialchars_title']
                url = item['video_link']
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(thumb_url),
                              href=URL(url + '*'),
                              popup=title))

            if not buttons_added:
                # Page '1' points back at the original (HTML) address kept
                # in xhr_data.
                result.add_page(ControlInfo('1', xhr_data['base_url']))
                # Window of pages around the current one: from
                # max(2, curr-5) up to, but not including,
                # min(last, curr+5).
                p_from = curr_page - 5
                if p_from < 2:
                    p_from = 2
                p_to = curr_page + 5
                if p_to > last_page:
                    p_to = last_page
                for x in range(p_from, p_to):
                    url = URL(pattern.format(x), xhr_data=xhr_data)
                    if x == curr_page:
                        x = str(x) + '(this)'
                    result.add_page(ControlInfo(str(x), url))
                # The last page is always added as its own button.
                last_url = URL(pattern.format(last_page), xhr_data=xhr_data)
                result.add_page(ControlInfo(str(last_page), last_url))
                buttons_added = True
            return buttons_added

        with open(fname) as fp:
            try:
                json_data = json.load(fp)
                buttons_added = False
                # The JSON layout differs per listing kind: a sequence of
                # blocks for '/keyword/', a single 'response' block with
                # 'videos' for '/users/', otherwise a mapping of blocks.
                if base_url.contain('/keyword/'):
                    for i in json_data:
                        buttons_added = parce_data(i, buttons_added, base_url)
                elif base_url.contain('/users/'):
                    parce_data(json_data['response'],
                               buttons_added,
                               base_url,
                               items_name='videos')
                else:
                    for i in json_data:
                        buttons_added = parce_data(json_data[i],
                                                   buttons_added, base_url)
            except ValueError:
                # Best effort: unparsable JSON leaves the result as it is.
                pass
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['data-srcmedium']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        # The original address travels with the 'next' URL so the JSON path
        # can link back to page 1.
        xhr_data = {'base_url': base_url}
        # NOTE(review): add_query presumably yields the equivalent of
        # '?format=json&number_pages=1&page=2' -- confirm against URL.
        next_url = URL(
            base_url.get() + '*',
            xhr_data=xhr_data)
        next_url.add_query([('format', 'json'), ('number_pages', '1'),
                            ('page', '2')])
        result.add_page(ControlInfo('next', next_url))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a picture gallery or a thumbnail index.

    The file is fed line by line to a ``SiteParser`` carrying five rules.
    If a script containing ``unescape`` was collected inside
    ``div.imagelinks`` the page is reported as type ``'pictures'``;
    otherwise, if ``div.thumblinks`` produced entries, it is reported as
    type ``'hrefs'``.  When neither rule matched, the ``ParseResult`` is
    returned without a type or entries.

    :param fname: path of the saved page to parse.
    :param base_url: URL the page was loaded from; used to build absolute
        links (``self.get_href`` / ``base_url.domain()``).
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    # Thumbnails of an index page: gallery link plus preview image.
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumblinks')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url) + '*')
    startpage_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url) + '*')
    parser.add_rule(startpage_rule)

    # Tag links, offered as controls on both page kinds.
    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'id', 'divTags')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(tags_rule)

    # Pagination links of an index page.
    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('td', 'class', 'pages')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    # Script that carries the escaped base address of the full pictures.
    picture_base_addr_rule = ParserRule()
    picture_base_addr_rule.add_activate_rule_level(
        [('div', 'class', 'imagelinks')])
    picture_base_addr_rule.add_process_rule_level('script', {})
    picture_base_addr_rule.set_attribute_filter_function(
        'data', lambda x: 'unescape' in x)
    parser.add_rule(picture_base_addr_rule)

    # One script per picture, each carrying the picture file name.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'imagelinks')])
    picture_rule.add_process_rule_level('script', {})
    picture_rule.set_attribute_filter_function(
        'data', lambda x: "'src'" in x)
    parser.add_rule(picture_rule)

    # Use a context manager so the file handle is closed even if the
    # parser raises; the original iterated over a never-closed open().
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    base_scripts = picture_base_addr_rule.get_result()
    if base_scripts:
        result.set_type('pictures')
        # Un-escape slashes, then take the text between "unescape('//"
        # and the closing quote as the common address prefix.
        base = (base_scripts[0]['data']
                .replace('%2f', '/')
                .partition("unescape('//")[2]
                .partition("'")[0])

        for number, f in enumerate(picture_rule.get_result(), 1):
            picname = f['data'].partition("+'")[2].partition("'")[0]
            result.add_full(
                FullPictureInfo(abs_href=URL(base + picname + '*'),
                                rel_name='%03d.jpg' % number))

        for item in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if thumbs:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video description or a thumbnail index.

    A page whose body holds a script containing ``angular.`` is treated as
    a video page: the ``host:'...'`` address found in that script is
    loaded with ``requests_loader.load`` and its ``mediaSources`` entries
    become the video and its alternates.  Otherwise, entries found in
    ``div.fixed-content`` make the page a thumbnail index of type
    ``'hrefs'``.

    :param fname: saved page, handed to ``self.proceed_parcing``.
    :param base_url: URL the page was loaded from; every link is resolved
        against it through ``self.get_href``.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    def background_image(style):
        # Keep only the address inside a CSS ``url('...')`` value.
        return style.partition("url('")[2].partition("')")[0]

    site_parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'fixed-content')])
    thumbs.add_process_rule_level('a', {'href', 'class'})
    thumbs.add_process_rule_level('div', {'style'})
    thumbs.set_attribute_filter_function(
        'class', lambda value: value == 'thumbnail')
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('style', background_image)
    site_parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level(
        [('div', 'class', 'col-xs-12 content-pagination')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(pages)

    tags = ParserRule()
    tags.add_activate_rule_level([('section', 'id', 'footer-tag')])
    tags.add_process_rule_level('a', {'href'})
    tags.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(tags)

    categories = ParserRule()
    categories.add_activate_rule_level([('ul', 'class', 'nav navbar-nav')])
    categories.add_process_rule_level('a', {'href'})
    categories.set_attribute_filter_function(
        'href', lambda link: '/Category/' in link and "#" not in link)
    categories.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(categories)

    player_script = ParserRule()
    player_script.add_activate_rule_level([('body', '', '')])
    player_script.add_process_rule_level('script', {})
    player_script.set_attribute_filter_function(
        'data', lambda text: 'angular.' in text)
    site_parser.add_rule(player_script)

    related = ParserRule()
    related.add_activate_rule_level([('div', 'class', 'row tag-area')])
    related.add_process_rule_level('a', {'href'})
    related.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(related)

    self.proceed_parcing(site_parser, fname)

    result = ParseResult()

    if player_script.is_result():
        compact = player_script.get_result()[0]['data'].replace(' ', '')
        json_file_url = self.get_href(
            self.quotes(compact, "host:'", "'"), base_url)

        from requests_loader import load, LoaderError

        cache_path = Setting.base_dir + 'tsp_video.json'
        variants = []
        result.set_type('video')
        try:
            response = load(URL(json_file_url), cache_path)
            seen = set()
            # Keep the first entry for every distinct source address.
            for source in response.json()['mediaSources']:
                if source['source'] in seen:
                    continue
                variants.append(dict(text=source['quality'],
                                     url=URL(source['source'] + '*')))
                seen.add(source['source'])

            if not variants:
                return result

            video = MediaData(variants[0]['url'])
            # Alternates are only registered when there is a choice.
            if len(variants) > 1:
                for variant in variants:
                    video.add_alternate(variant)
            result.set_video(video)
        except LoaderError as err:
            print(err)

        for link in related.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if not thumbs.is_result():
        return result

    result.set_type('hrefs')

    for entry in thumbs.get_result(['href']):
        result.add_thumb(
            ThumbInfo(thumb_url=URL(entry['style']),
                      href=URL(entry['href']),
                      popup=entry.get('alt', '')))

    for entry in pages.get_result(['href', 'data']):
        caption = entry['data'].replace(' ', '')
        # Links whose text is empty once spaces are removed are skipped.
        if caption:
            result.add_page(ControlInfo(caption, URL(entry['href'])))

    if categories.is_result(['href']):
        for entry in categories.get_result(['href', 'data']):
            result.add_control(
                ControlInfo(entry['data'], URL(entry['href'])))

    if tags.is_result(['href']):
        for entry in tags.get_result(['href', 'data']):
            result.add_control(
                ControlInfo(entry['data'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video description or a thumbnail index.

    A page with a player script (containing ``video_url:`` or
    ``jwplayer(``) yields a result of type ``'video'`` plus controls for
    the uploader and for category/tag links.  Otherwise, entries found in
    ``div.inner_wrap`` yield a result of type ``'hrefs'`` with thumbnails,
    page links and controls.

    :param fname: saved page, handed to ``self.proceed_parcing``.
    :param base_url: URL the page was loaded from; only its domain is
        used, to prefix pagination links.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'inner_wrap')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level(
        'img', {'src', 'alt', 'data-original'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level(
        [('div', 'class', 'holder_list')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'player')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'video_url:' in text or 'jwplayer(' in text)
    parser.add_rule(video_rule)

    # Uploader link; the modifier points it at the member's public videos.
    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([('div', 'class', 'descrow')])
    gallery_user_rule.add_process_rule_level('a', {'href', 'title'})
    gallery_user_rule.set_attribute_modifier_function(
        'href', lambda x: x + 'public_videos/')
    gallery_user_rule.set_attribute_filter_function(
        'href', lambda x: '/members/' in x)
    parser.add_rule(gallery_user_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'descrow')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_filter_function(
        'href', lambda x: '/categories/' in x or '/tags/' in x)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        script = video_rule.get_result()[0]['data'].replace(' ', '')

        # Placeholder kept from the original for scripts matching neither
        # known player layout.
        file = 'Not found!!!!'
        if 'jwplayer(' in script:
            file = script.partition('file:"')[2].partition('",')[0]
        elif "video_url:'" in script:
            file = script.partition("video_url:'")[2].partition("',")[0]

        video = MediaData(URL(file))
        result.set_type('video')
        result.set_video(video)

        # Index only what the same filtered query returned: the original
        # tested an unfiltered is_result() and then took element [0] of a
        # filtered list, which could raise IndexError.  'data' is added to
        # the key list because f['data'] is read below.
        # NOTE(review): assumes get_result(keys) keeps entries holding all
        # listed keys -- confirm against ParserRule.
        users = gallery_user_rule.get_result(['data', 'href'])
        if users:
            f = users[0]
            print(f)
            result.add_control(
                ControlInfo('"' + f['data'] + '"', URL(f['href'])))

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['data-original']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        # Prefer the link's title and fall back to its text.
        for item in startpage_hrefs_rule.get_result(['href', 'data']):
            result.add_control(
                ControlInfo(item.get('title', item.get('data', '')),
                            URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video description or a thumbnail index.

    When ``<source>`` elements are found inside ``div.video-container``
    the result receives a video built from them, followed by uploader and
    related-link controls.  Otherwise, entries from
    ``div.well.well-sm.hover`` / ``div.channelContainer`` are added as
    thumbnails together with page links and controls.

    :param fname: saved page, handed to ``self.proceed_parcing``.
    :param base_url: URL the page was loaded from; every link is resolved
        against it through ``self.get_href``.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    site_parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([
        ('div', 'class', 'well well-sm hover'),
        ('div', 'class', 'channelContainer'),
    ])
    thumbs.add_process_rule_level('a', {'href', 'title'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('src', absolute)
    site_parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level([('ul', 'class', 'pagination')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(pages)

    menu = ParserRule()
    menu.add_activate_rule_level([('ul', 'class', 'drop2 hidden-xs')])
    menu.add_process_rule_level('a', {'href'})
    menu.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(menu)

    sources = ParserRule()
    sources.add_activate_rule_level([('div', 'class', 'video-container')])
    sources.add_process_rule_level('source', {'src', 'label', 'res'})
    sources.set_attribute_modifier_function('src', absolute)
    site_parser.add_rule(sources)

    related = ParserRule()
    related.add_activate_rule_level(
        [('div', 'class', 'm-t-10 overflow-hidden')])
    related.add_process_rule_level('a', {'href'})
    related.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(related)

    uploader = ParserRule(collect_data=True)
    uploader.add_activate_rule_level(
        [('div', 'class', 'pull-left user-container')])
    uploader.add_process_rule_level('a', {'href'})
    uploader.set_attribute_modifier_function('href', absolute)
    uploader.set_attribute_filter_function(
        'href', lambda link: '#' not in link)
    site_parser.add_rule(uploader)

    self.proceed_parcing(site_parser, fname)

    result = ParseResult()

    if sources.is_result():
        variants = UrlList()
        for source in sources.get_result():
            # Each variant is labelled with its ``res`` attribute.
            variants.add(source['res'], URL(source['src']))
        result.set_video(variants.get_media_data(-1))

        for link in uploader.get_result(['data', 'href']):
            quoted = '"{}"'.format(link['data'].strip())
            result.add_control(ControlInfo(quoted, URL(link['href'])))

        for link in related.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if not thumbs.is_result():
        return result

    for entry in thumbs.get_result(['href']):
        # Caption: the image alt text, else the link title, else empty.
        caption = entry.get('alt', entry.get('title', ''))
        result.add_thumb(
            ThumbInfo(thumb_url=URL(entry['src']),
                      href=URL(entry['href']),
                      popup=caption))

    for entry in pages.get_result(['href', 'data']):
        result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

    for entry in menu.get_result(['href', 'data']):
        # The control is named after the last path segment of its link.
        target = entry['href']
        caption = target.strip('*').rpartition('/')[2]
        result.add_control(ControlInfo(caption, URL(target)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video stub or a thumbnail index.

    When a ``videojs('EPvideo'`` script is found in ``div#moviexxx`` the
    result is typed ``'video'`` and receives category/search controls.
    No media is attached in that branch: it only prints the ``vid`` and
    ``hash`` values and a candidate address for every download link.
    Otherwise, entries from the thumbnail containers produce a result of
    type ``'hrefs'`` with thumbnails and page links.

    :param fname: saved page, handed to ``self.proceed_parcing``.
    :param base_url: URL the page was loaded from; its domain is
        prepended to the collected links.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('div', 'class', 'mb'),
        ('div', 'class', 'mbhd'),
        ('div', 'class', 'mb mbr'),
        ('div', 'class', 'mbhd mbr'),
        ('div', 'class', 'categoriesbox'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'numlist2')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'moviexxx')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda x: "videojs('EPvideo'" in x)
    parser.add_rule(video_rule)

    # Download links; only the file name part of each href is used.
    video_fname_rule = ParserRule()
    video_fname_rule.add_activate_rule_level(
        [('div', 'id', 'hd-p**n-dload')])
    video_fname_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(video_fname_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'tab-1')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    gallery_href_rule.set_attribute_filter_function(
        'href', lambda x: '/category/' in x or '/search/' in x)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        script = video_rule.get_result()[0]['data'].replace(' ', '')
        vid = script.partition("vid:'")[2].partition("',")[0]
        # Named video_hash / file_name so that neither the builtin hash()
        # nor the fname parameter is clobbered, as the original did.
        video_hash = script.partition("hash:'")[2].partition("',")[0]
        print(vid, video_hash)

        # Diagnostic output only: the address is printed, not stored.
        for item in video_fname_rule.get_result():
            print(item)
            file_name = item['href'].rpartition('/')[2]
            print(file_name)
            print('http://v1.s1.n10.nl.cdn.eporner.com/{0}/{1}/{2}'.format(
                video_hash, vid, file_name))

        result.set_type('video')

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if thumbs:
        result.set_type('hrefs')
        for item in thumbs:
            # An image without alt text gets an empty caption instead of
            # raising KeyError.
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video description or a thumbnail index.

    When ``div.player`` holds an iframe whose address contains
    ``fileone.tv``, each such iframe is loaded and the ``file:'...'``
    value of its ``jwplayer('player').setup(...)`` call becomes a video
    address.  Otherwise, entries from ``div.link-3col`` are added as
    thumbnails together with page links and tag controls.

    :param fname: saved page, handed to ``self.proceed_parcing``.
    :param base_url: URL the page was loaded from; every link is resolved
        against it through ``self.get_href``.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    site_parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'link-3col')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('src', absolute)
    site_parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(pages)

    tags = ParserRule()
    tags.add_activate_rule_level([('div', 'class', 'tagcloud')])
    tags.add_process_rule_level('a', {'href'})
    tags.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(tags)

    frames = ParserRule()
    frames.add_activate_rule_level([('div', 'class', 'player')])
    frames.add_process_rule_level('iframe', {'src'})
    frames.set_attribute_filter_function(
        'src', lambda link: 'fileone.tv' in link)
    frames.set_attribute_modifier_function('src', absolute)
    site_parser.add_rule(frames)

    extras = ParserRule()
    extras.add_activate_rule_level([('div', 'id', 'extras')])
    extras.add_process_rule_level('a', {'href'})
    extras.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(extras)

    self.proceed_parcing(site_parser, fname)

    result = ParseResult()

    if frames.is_result():
        variants = UrlList()
        for frame in frames.get_result():
            print(frame)
            try:
                response = load(URL(frame['src']))
                # Spaces are removed from the setup call so the
                # ``file:'`` marker can be matched regardless of layout.
                setup_call = self.quotes(
                    response.text, "jwplayer('player').setup(", ")")
                setup_call = setup_call.replace(' ', '')
                address = self.quotes(setup_call, "file:'", "'")
                variants.add("default", URL(address + '*'))
            except LoaderError as err:
                # A frame that cannot be loaded is reported and skipped.
                print(err)
        result.set_video(variants.get_media_data())

        for link in extras.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if not thumbs.is_result():
        return result

    for entry in thumbs.get_result(['href']):
        result.add_thumb(
            ThumbInfo(thumb_url=URL(entry['src']),
                      href=URL(entry['href']),
                      popup=entry.get('alt', '')))

    for entry in pages.get_result(['href', 'data']):
        result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

    for entry in tags.get_result(['href', 'data']):
        result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video description or a thumbnail index.

    When a script containing ``jwplayer`` is found in ``div.contents``,
    the value following ``file:`` in each such script becomes a video
    address, and profile / other links from ``div.info_holder`` become
    controls.  Otherwise, entries from ``div.contents.videos`` are added
    as thumbnails together with page links.

    :param fname: saved page, handed to ``self.proceed_parcing``.
    :param base_url: URL the page was loaded from; every link is resolved
        against it through ``self.get_href``.
    :return: the populated ``ParseResult``.
    """

    def absolute(link):
        return self.get_href(link, base_url)

    def profile_videos(link):
        # Point a profile page link at that profile's video listing.
        return self.get_href(link.replace('.html', '/videos/'), base_url)

    site_parser = SiteParser()

    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'contents videos')])
    thumbs.add_process_rule_level('a', {'href', 'title'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', absolute)
    thumbs.set_attribute_modifier_function('src', absolute)
    site_parser.add_rule(thumbs)

    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'id', 'pagination')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(pages)

    player_script = ParserRule()
    player_script.add_activate_rule_level([('div', 'class', 'contents')])
    player_script.add_process_rule_level('script', {})
    player_script.set_attribute_filter_function(
        'data', lambda text: 'jwplayer' in text)
    site_parser.add_rule(player_script)

    # Two activation levels: div.info_holder, then div.l.
    related = ParserRule()
    related.add_activate_rule_level([('div', 'class', 'info_holder')])
    related.add_activate_rule_level([('div', 'class', 'l')])
    related.add_process_rule_level('a', {'href'})
    related.set_attribute_filter_function(
        'href', lambda link: '/profiles/' not in link)
    related.set_attribute_modifier_function('href', absolute)
    site_parser.add_rule(related)

    uploader = ParserRule()
    uploader.add_activate_rule_level([('div', 'class', 'info_holder')])
    uploader.add_process_rule_level('a', {'href'})
    uploader.set_attribute_filter_function(
        'href', lambda link: '/profiles/' in link)
    uploader.set_attribute_modifier_function('href', profile_videos)
    site_parser.add_rule(uploader)

    self.proceed_parcing(site_parser, fname)

    result = ParseResult()

    if player_script.is_result():
        variants = UrlList()
        for script in player_script.get_result():
            raw = self.quotes(script['data'], 'file:', ',')
            # Trim the spaces and double quotes around the address.
            address = raw.strip(' "')
            variants.add('default', URL(address + '*'))
        result.set_video(variants.get_media_data())

        for link in uploader.get_result(['data', 'href']):
            quoted = '"{}"'.format(link['data'].strip())
            result.add_control(ControlInfo(quoted, URL(link['href'])))

        for link in related.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if not thumbs.is_result():
        return result

    for entry in thumbs.get_result(['href']):
        # Caption: the image alt text, else the link title, else empty.
        caption = entry.get('alt', entry.get('title', ''))
        result.add_thumb(
            ThumbInfo(thumb_url=URL(entry['src']),
                      href=URL(entry['href']),
                      popup=caption))

    for entry in pages.get_result(['href', 'data']):
        result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into one of four result kinds.

    The first matching case wins, checked in this order:

    1. a ``redtube_flv_player`` script in ``div.watch``: a video built
       from the script's ``sources`` map, plus controls for the links in
       ``div.video-details``;
    2. entries in ``div#all_pornstars``: captioned thumbnails, page links
       and alphabet controls;
    3. entries in ``ul.channels-list.three-in-row``: captioned
       thumbnails, page links and filter controls;
    4. entries in one of the ``ul.video-listing`` variants: thumbnails,
       page links and category controls.

    When nothing matched, the ``ParseResult`` is returned without entries.

    :param fname: saved page, handed to ``self.proceed_parcing``.
    :param base_url: URL the page was loaded from; every link is resolved
        against it through ``self.get_href``.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule(debug=False)
    startpage_rule.add_activate_rule_level([
        ('ul', 'class', 'video-listing'),
        ('ul', 'class', 'video-listing two-in-row'),
        ('ul', 'class', 'video-listing four-in-row'),
        ('ul', 'class', 'video-listing two-in-row id-recommended-list'),
        ('ul', 'class', 'video-listing four-in-row id-recommended-list'),
    ])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'data-src', 'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    startpage_rule.set_attribute_modifier_function(
        'data-src', lambda x: self.get_href(x, base_url))
    startpage_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'pageNumbersHolder')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([
        ('ul', 'class', 'categories-listing'),
        ('ul', 'class', 'categories-popular-listing'),
        ('ul', 'class', 'abc-categories newAbcCategories'),
    ])
    startpage_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_hrefs_rule)

    pornstars_rule = ParserRule()
    pornstars_rule.add_activate_rule_level([('div', 'id', 'all_pornstars')])
    pornstars_rule.add_process_rule_level('a', {'href'})
    pornstars_rule.add_process_rule_level('img', {'src', 'alt'})
    pornstars_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(pornstars_rule)

    pornstars_hrefs_rule = ParserRule()
    pornstars_hrefs_rule.add_activate_rule_level(
        [('ul', 'class', 'abc-categories newAbcCategories')])
    pornstars_hrefs_rule.add_process_rule_level('a', {'href'})
    pornstars_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(pornstars_hrefs_rule)

    channels_rule = ParserRule(debug=False)
    channels_rule.add_activate_rule_level(
        [('ul', 'class', 'channels-list three-in-row')])
    channels_rule.add_process_rule_level('a', {'href'})
    channels_rule.add_process_rule_level('img', {'src', 'data-src', 'alt'})
    channels_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    channels_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url))
    channels_rule.set_attribute_modifier_function(
        'data-src', lambda x: self.get_href(x, base_url))
    parser.add_rule(channels_rule)

    channels_hrefs_rule = ParserRule()
    channels_hrefs_rule.add_activate_rule_level(
        [('div', 'class', 'channel-filters-categories')])
    channels_hrefs_rule.add_process_rule_level('a', {'href', 'title'})
    channels_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(channels_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'watch')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'redtube_flv_player' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('div', 'class', 'video-details')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    gallery_href_rule.set_attribute_filter_function(
        'href', lambda x: x != '*')
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    def add_pages():
        # Page links are collected identically for every listing kind.
        for page in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Drop spaces and backslashes, then split the
            # ``sources:{"label":"address",...}`` map into its pairs.
            script = item['data'].replace(' ', '').replace('\\', '')
            sources = self.quotes(script, 'sources:{"', '"},').split('","')
            for f in sources:
                label, _, address = f.partition('":"')
                urls.add(label, URL(self.get_href(address, base_url)))
        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            href = f['href']
            label = f['data']
            if '/redtube/' in href or '/tag/' in href:
                result.add_control(ControlInfo(label, URL(href)))
            elif '/pornstar/' in href:
                # Name taken from the last path segment of the link.
                ps_name = (href.rstrip('*').rpartition('/')[2]
                           .replace('+', ' ').title())
                result.add_control(ControlInfo(ps_name, URL(href)))
            else:
                # Any other link is treated as the uploader.
                result.add_control(
                    ControlInfo("'" + label + "'",
                                URL(href.replace('*', '/videos*'))))
        return result

    if pornstars_rule.is_result():
        result.set_caption_visible(True)
        for item in pornstars_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))
        add_pages()
        # 'data' is requested too because item['data'] is read below; the
        # original asked for 'href' only.
        # NOTE(review): assumes get_result(keys) keeps entries holding all
        # listed keys -- confirm against ParserRule.
        for item in pornstars_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        return result

    if channels_rule.is_result():
        result.set_caption_visible(True)
        for item in channels_rule.get_result(['href']):
            # Prefer data-src over src for the thumbnail address.
            thumb_href = item.get('data-src', item.get('src'))
            descr = item.get('alt', '').title()
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb_href),
                          href=URL(item['href']),
                          popup=descr))
        add_pages()
        # 'title' is requested too because item['title'] is read below.
        for item in channels_hrefs_rule.get_result(['href', 'title']):
            result.add_control(ControlInfo(item['title'], URL(item['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            thumb_href = item.get('data-src', item.get('src'))
            descr = item.get('title', item.get('alt', ''))
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb_href),
                          href=URL(item['href']),
                          popup=descr))
        add_pages()
        for item in startpage_hrefs_rule.get_result(['href', 'title']):
            result.add_control(ControlInfo(item['title'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved HTML page and collect thumbnails, pages and pictures.

    Four rules are registered on a ``SiteParser``:

    * thumbnails from ``<div class="ownpost">`` (``a`` href + ``img`` src/alt),
    * pagination links from ``<div class="nav">``,
    * full pictures from ``<div class="gallery">`` (``img`` src with the
      ``/tn_`` thumbnail prefix stripped),
    * category links from ``<div class="gallery">`` whose href contains
      ``/?category=``.

    :param fname: path of the HTML file to parse; read line by line.
    :param base_url: URL of the page, used to resolve relative links.
    :return: a ``ParseResult``; its type is set to ``'hrefs'`` when thumbnails
        were found and to ``'pictures'`` when gallery pictures were found
        (the latter wins if both are present).
    """
    # Pre-existing diagnostic output, kept so console behaviour is unchanged.
    print(base_url.domain())

    parser = SiteParser()

    # Thumbnails on an index page.
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'ownpost')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    # Pagination links; the href is rebuilt as "<domain>/<href>*".
    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'nav')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + '/' + x + '*')
    parser.add_rule(startpage_pages_rule)

    # Gallery pictures; dropping "/tn_" turns the thumbnail address into
    # the address used for the full picture.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'gallery')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src', 'class'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('/tn_', '/'))
    parser.add_rule(picture_rule)

    # Category links shown next to a gallery.
    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'gallery')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    picture_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    picture_href_rule.set_attribute_filter_function(
        'href', lambda x: x.find('/?category=') != -1)
    parser.add_rule(picture_href_rule)

    # The context manager guarantees the file is closed even if the
    # parser raises part-way through the page.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href'] + '*'),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        # Pictures are named by their 1-based position: 001.jpg, 002.jpg, ...
        pictures = picture_rule.get_result(['src', 'class'])
        for number, picture in enumerate(pictures, start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(picture['src']),
                                rel_name='%03d.jpg' % number))

        for link in picture_href_rule.get_result():
            result.add_control(
                ControlInfo(link['data'].replace(',', ''), URL(link['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved HTML page into either a video result or a thumbnail list.

    The file is read as UTF-8 (undecodable bytes ignored) and every
    ``</b>`` is rewritten to ``</a>`` before it is fed to the parser.

    :param fname: path of the HTML file to parse.
    :param base_url: URL of the page, used to build absolute links.
    :return: a ``ParseResult``. When a ``var flashvars`` script is found the
        result is of type ``'video'`` with uploader and tag controls; if that
        script holds no usable https URL the result is returned without a
        type. Otherwise, when thumbnails are found, the result is of type
        ``'hrefs'`` with thumbnails and pagination.
    """
    # Pre-existing diagnostic output, kept so console behaviour is unchanged.
    print('VP parsing')

    parser = SiteParser()

    # Thumbnails on an index page.
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'bx'),
                                            ('div', 'class', 'bx lastrow')])
    startpage_rule.add_process_rule_level('a', {'href', 'class'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    # Pagination links (hrefs are used exactly as found in the page).
    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pagerwrap')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_pages_rule)

    # Inline player script that carries the flashvars.* video URLs.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'video_panel')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'var flashvars' in text)
    parser.add_rule(video_rule)

    # Tag links on a video page.
    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'tagas-secondrow')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x + '*')
    parser.add_rule(gallery_href_rule)

    # Uploader link; '/user/' is swapped for '/submitted/' in the href.
    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([('div', 'class', 'info')])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    gallery_user_rule.set_attribute_modifier_function(
        'href',
        lambda x: self.get_href(x.replace('/user/', '/submitted/'), base_url))
    parser.add_rule(gallery_user_rule)

    # The context manager guarantees the file is closed even if the
    # parser raises part-way through the page.
    with open(fname, encoding='utf-8', errors='ignore') as page:
        for line in page:
            parser.feed(line.replace('</b>', '</a>'))

    result = ParseResult()

    if len(video_rule.get_result()) > 0:
        script = video_rule.get_result()[0]['data'].replace(' ', '')

        def get_url_from_script(script='', var=''):
            """Return the https URL assigned to ``flashvars.<var>``, else None."""
            data = script.partition('flashvars.' + var + '="')[2].partition('"')[0]
            if data.startswith('https://'):
                return URL(data)
            return None

        # (label shown to the user, flashvars variable name), in display order.
        # NOTE(review): 'Medium'/'HD' are reused for the "...2" variants while
        # 'Low2' has its own label; labels are kept as they were - confirm
        # whether the duplicates are intended.
        variants = (
            ('Low', 'videoUrlLow'),
            ('Low2', 'videoUrlLow2'),
            ('Medium', 'videoUrlMedium'),
            ('Medium', 'videoUrlMedium2'),
            ('HD', 'videoUrlHD'),
            ('HD', 'videoUrlHD2'),
        )
        found = {var: get_url_from_script(script, var) for _, var in variants}

        # Medium quality is the preferred default, low quality the fallback.
        default_url = found['videoUrlMedium']
        if default_url is None:
            default_url = found['videoUrlLow']
        if default_url is None:
            print('No url found')
            return result

        video = MediaData(default_url)
        for label, var in variants:
            if found[var] is not None:
                video.add_alternate(dict(text=label, url=found[var]))

        result.set_type('video')
        result.set_video(video)

        for user in gallery_user_rule.get_result():
            result.add_control(
                ControlInfo('"' + user['data'] + '"', URL(user['href'])))

        for tag in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))
        return result

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result():
            # An image without an alt attribute gets an empty popup rather
            # than raising KeyError.
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video result or a thumbnail listing.

    Six rules are registered on a ``SiteParser`` and the file is handed to
    ``self.proceed_parcing``. A page with a ``jwplayer`` script yields the
    video plus uploader and tag controls; otherwise a page with thumbnails
    yields thumbnails, pagination links and sub-menu controls.

    :param fname: path of the HTML file, passed on to ``proceed_parcing``.
    :param base_url: URL of the page, used to build absolute links.
    :return: the populated ``ParseResult`` (empty if neither case matched).
    """

    def resolved_with_star(href):
        # Resolve through get_href, then append the trailing '*'.
        return self.get_href(href, base_url) + '*'

    def domain_with_star(href):
        # Prefix the site domain and append the trailing '*'.
        return base_url.domain() + href + '*'

    def mentions_jwplayer(text):
        return 'jwplayer' in text

    parser = SiteParser()

    # Thumbnails on an index page.
    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'content-inner')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src', 'alt'})
    thumbs.set_attribute_modifier_function('href', resolved_with_star)
    parser.add_rule(thumbs)

    # Pagination links.
    pages = ParserRule()
    pages.add_activate_rule_level([('div', 'class', 'pagination_link')])
    pages.add_process_rule_level('a', {'href'})
    pages.set_attribute_modifier_function('href', domain_with_star)
    parser.add_rule(pages)

    # Sub-menu links; both spellings of the class name are accepted.
    menu = ParserRule()
    menu.add_activate_rule_level([
        ('div', 'class', 'sub_menu dark-menu'),
        ('div', 'class', 'sub-menu dark-menu'),
    ])
    menu.add_process_rule_level('a', {'href', 'title'})
    menu.set_attribute_modifier_function('href', resolved_with_star)
    parser.add_rule(menu)

    # Player script on a video page.
    player = ParserRule()
    player.add_activate_rule_level([('div', 'id', 'content')])
    player.add_process_rule_level('script', {})
    player.set_attribute_filter_function('data', mentions_jwplayer)
    parser.add_rule(player)

    # Tag links on a video page.
    tags = ParserRule()
    tags.add_activate_rule_level([('div', 'id', 'media-tags-container')])
    tags.add_process_rule_level('a', {'href'})
    tags.set_attribute_modifier_function('href', domain_with_star)
    parser.add_rule(tags)

    # Uploader link on a video page (href left unmodified).
    uploader = ParserRule()
    uploader.add_activate_rule_level([('div', 'class', 'thumb-member-username')])
    uploader.add_process_rule_level('a', {'href'})
    parser.add_rule(uploader)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if player.is_result():
        sources = UrlList()
        for entry in player.get_result():
            compact = entry['data'].replace(' ', '')
            address = self.quotes(compact, '"file":"', '"')
            sources.add('default', URL(address + '*'))
        result.set_video(sources.get_media_data())

        if uploader.is_result():
            # The user name is the last path segment of the profile link.
            profile_href = uploader.get_result()[0]['href']
            user = profile_href.rpartition('/')[2]
            uploads_url = URL('http://motherless.com/u/' + user + '*')
            result.add_control(
                ControlInfo('"' + user + ' uploads"', uploads_url))
            galleries_url = URL(
                'http://motherless.com/galleries/member/' + user + '*')
            result.add_control(
                ControlInfo('"' + user + ' gals"', galleries_url))

        for tag in tags.get_result(['data', 'href']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))
        return result

    if not thumbs.is_result():
        return result

    for thumb in thumbs.get_result(['href']):
        result.add_thumb(
            ThumbInfo(thumb_url=URL(thumb['src']),
                      href=URL(thumb['href']),
                      popup=thumb.get('alt', '')))

    for page in pages.get_result(['href', 'data']):
        result.add_page(ControlInfo(page['data'], URL(page['href'])))

    for link in menu.get_result(['href', 'data']):
        # Prefer the title attribute, fall back to the link text.
        fallback = link.get('data', '')
        caption = link.get('title', fallback)
        result.add_control(ControlInfo(caption, URL(link['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video result or a thumbnail listing.

    Five rules are registered on a ``SiteParser`` and the file is handed to
    ``self.proceed_parcing``. A page whose player script contains
    ``function playStart()`` yields a ``'video'`` result with info-link
    controls; otherwise a page with thumbnails yields an ``'hrefs'`` result
    with thumbnails, pagination links and category controls.

    :param fname: path of the HTML file, passed on to ``proceed_parcing``.
    :param base_url: URL of the page; its domain prefixes pagination hrefs.
    :return: the populated ``ParseResult`` (empty if neither case matched).
    """

    def with_domain(href):
        return base_url.domain() + href

    def has_play_start(text):
        return 'function playStart()' in text

    parser = SiteParser()

    # Thumbnails on an index page (hrefs used as found).
    thumbs = ParserRule()
    thumbs.add_activate_rule_level([('div', 'class', 'block-video')])
    thumbs.add_process_rule_level('a', {'href'})
    thumbs.add_process_rule_level('img', {'src'})
    parser.add_rule(thumbs)

    # Pagination links.
    pages = ParserRule()
    pages.add_activate_rule_level([('ul', 'class', 'pagination')])
    pages.add_process_rule_level('a', {'title', 'href'})
    pages.set_attribute_modifier_function('href', with_domain)
    parser.add_rule(pages)

    # Category links (hrefs used as found).
    categories = ParserRule()
    categories.add_activate_rule_level([('li', 'id', 'categories')])
    categories.add_process_rule_level('a', {'href'})
    parser.add_rule(categories)

    # Player script on a video page.
    player = ParserRule()
    player.add_activate_rule_level([('div', 'class', 'player_body')])
    player.add_process_rule_level('script', {})
    player.set_attribute_filter_function('data', has_play_start)
    parser.add_rule(player)

    # Category / tag / cast links on a video page.
    info_links = ParserRule()
    info_links.add_activate_rule_level([
        ('p', 'class', 'info_category'),
        ('p', 'class', 'info_tags'),
        ('p', 'class', 'info_cast'),
    ])
    info_links.add_process_rule_level('a', {'href'})
    parser.add_rule(info_links)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if len(player.get_result()) > 0:
        # Only the first matching script is used.
        script_text = player.get_result()[0]['data']
        video_address = self.get_attr_from_script(script_text)
        result.set_video(MediaData(URL(video_address)))
        result.set_type('video')

        for link in info_links.get_result():
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if not len(thumbs.get_result()) > 0:
        return result

    result.set_type('hrefs')

    for thumb in thumbs.get_result():
        result.add_thumb(
            ThumbInfo(thumb_url=URL(thumb['src']), href=URL(thumb['href'])))

    for page in pages.get_result(['href', 'data']):
        target = page['href']
        # Links carrying a "?from=" query get a trailing '*'.
        if '?from=' in target:
            page_url = URL(target + '*')
        else:
            page_url = URL(target)
        result.add_page(ControlInfo(page['data'], page_url))

    for category in categories.get_result(['href', 'data']):
        result.add_control(
            ControlInfo(category['data'], URL(category['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video result or a thumbnail listing.

    Six rules are registered on a ``SiteParser`` and the file is handed to
    ``self.proceed_parcing``. A page whose player script contains
    ``sources:`` yields the video (one URL per labelled source) plus link
    controls; otherwise a page with teaser articles yields thumbnails,
    pagination links and category controls.

    :param fname: path of the HTML file, passed on to ``proceed_parcing``.
    :param base_url: URL of the page, used by ``get_href`` to resolve links.
    :return: the populated ``ParseResult`` (empty if neither case matched).
    """

    def resolve(address):
        # Shared attribute modifier: resolve an address against base_url.
        return self.get_href(address, base_url)

    parser = SiteParser()

    # Teaser thumbnails on an index page.
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('article', 'class', 'teaser singleLink hasButtonRow'),
        ('article', 'class', 'activity video hasButtonFooter'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'data-lazysrc', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', resolve)
    startpage_rule.set_attribute_modifier_function('src', resolve)
    parser.add_rule(startpage_rule)

    # Pagination links; the target may be in href or in data-href.
    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([
        ('nav', 'class', 'clearfix pagination bottom'),
        ('nav', 'class', 'range rangeCount-2 clearfix'),
    ])
    startpage_pages_rule.add_process_rule_level('a', {'href', 'data-href'})
    startpage_pages_rule.set_attribute_modifier_function('href', resolve)
    startpage_pages_rule.set_attribute_modifier_function('data-href', resolve)
    parser.add_rule(startpage_pages_rule)

    # Category chooser: each <option> value is a link.
    startpage_categories_rule = ParserRule()
    startpage_categories_rule.add_activate_rule_level([
        ('select', 'id', 'input_selectCategories'),
    ])
    startpage_categories_rule.add_process_rule_level('option', {'value'})
    startpage_categories_rule.set_attribute_modifier_function('value', resolve)
    parser.add_rule(startpage_categories_rule)

    # Player script on a video page.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'playerWrapper')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'sources:' in text)
    parser.add_rule(video_rule)

    # Links listed in the video details.
    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('dl', 'class', 'group')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', resolve)
    parser.add_rule(gallery_href_rule)

    # Uploader "#videos" link. Its results are not consumed below; the rule
    # is still registered so the parser setup stays as it was.
    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([
        ('nav', 'class', 'profileNav clearfix buttonRow'),
    ])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    gallery_user_rule.set_attribute_modifier_function('href', resolve)
    gallery_user_rule.set_attribute_filter_function(
        'href', lambda x: '#videos' in x)
    parser.add_rule(gallery_user_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            # Strip spaces and backslashes, then split the text between
            # 'sources:{"' and '"},' into '<label>":"<address>' pairs.
            script = item['data'].replace(' ', '').replace('\\', '')
            sources = self.quotes(script, 'sources:{"', '"},').split('","')
            for source in sources:
                label, _, address = source.partition('":"')
                urls.add(label, URL(self.get_href(address, base_url)))
        result.set_video(urls.get_media_data())

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            # Prefer the lazy-load address; look up 'src' only when it is
            # really needed, so an image that has just data-lazysrc does not
            # raise KeyError.
            thumb_src = item.get('data-lazysrc')
            if thumb_src is None:
                thumb_src = item['src']
            # A missing alt attribute gives an empty popup. The original
            # `item.get('alt' '')` lacked a comma, so it returned None.
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb_src),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            href = item.get('data-href', item['href'])
            # The page caption is the last path segment without '*'.
            result.add_page(
                ControlInfo(href.rpartition('/')[2].strip('*'), URL(href)))

        for item in startpage_categories_rule.get_result():
            result.add_control(ControlInfo(item['data'], URL(item['value'])))

    return result