def get_start_button_menu_text_url_dict(self):
    """Return a dict with the single entry ``HD`` mapped to its URL object."""
    menu = dict()
    menu['HD'] = URL('http://www.pornhd4k.com/kategori/porno/hd*')
    return menu
def get_href(self, txt='', base_url=URL()):
    """Turn a link taken from a page into an absolute address string.

    * ``txt`` starting with ``http://`` or ``https://`` is returned unchanged.
    * ``txt`` starting with ``/`` is appended to ``base_url.domain()``.
    * Anything else is appended to ``base_url.get()`` after cutting that
      address at its first ``?``.

    :param txt: link text as found in the page.
    :param base_url: URL object of the page the link came from.
    :return: the resulting address as a string.
    """
    # A tuple lets startswith() accept both schemes; previously an
    # ``https://`` link fell through and was glued onto the base address.
    if txt.startswith(('http://', 'https://')):
        return txt
    if txt.startswith('/'):
        return base_url.domain() + txt
    return base_url.get().partition('?')[0] + txt
def get_start_button_menu_text_url_dict(self):
    """Return a dict mapping menu labels to motherless.com listing URLs.

    The keys are the label identifiers; each value is a URL object.
    The key ``Videos_Most_Favorited`` was previously misspelled
    ``Videos_Most_Favoritede``.
    """
    return dict(
        Galleries_Recently_Updated=URL(
            'http://motherless.com/galleries/updated*'),
        Galleries_Most_Viewed=URL(
            'http://motherless.com/galleries/viewed*'),
        Galleries_Most_Favorited=URL(
            'http://motherless.com/galleries/favorited*'),
        Videos_Recent=URL('http://motherless.com/videos/recent*'),
        Videos_Most_Viewed=URL('http://motherless.com/videos/viewed*'),
        Videos_Most_Favorited=URL(
            'http://motherless.com/videos/favorited*'),
        Videos_Popular=URL('http://motherless.com/videos/popular*'),
        Videos_Live=URL('http://motherless.com/live/videos*'),
        Videos_All_Time_Most_Viewed=URL(
            'http://motherless.com/videos/all/viewed*'),
        Videos_All_Time_Most_Favorited=URL(
            'http://motherless.com/videos/all/favorited*'),
        Videos_Archived=URL('http://motherless.com/videos/archives*'))
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and describe it as a ParseResult.

    Six parser rules are registered (thumbnails, pager spans, channel list,
    channel category spans, player script, tag links) and the file is run
    through ``self.proceed_parcing``.  The first rule family with results
    decides what is returned:

    * player script found: type ``'video'`` with the media built from the
      ``'sources'`` entries of the script, plus one control per tag link;
      if no usable source is found the empty result is returned;
    * channel list found: type ``'hrefs'`` with one thumb per channel and
      pages built from the channel category spans;
    * thumbnails found: type ``'hrefs'`` with one thumb per item and pages
      built from the pager spans.

    :param fname: path of the page file handed to ``proceed_parcing``.
    :param base_url: URL object of the page; used to resolve links and to
        build page addresses.
    :return: the ParseResult (left empty when no rule matched).
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('ul', 'class', 'thumbs'),
                                            ('li', 'class', 'category')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt', 'data-original'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pager paging')])
    startpage_pages_rule.add_process_rule_level(
        'span', {'data-query-key', 'data-query-value'})
    parser.add_rule(startpage_pages_rule)

    channels_rule = ParserRule()
    channels_rule.add_activate_rule_level([('ul', 'class', 'tag-150-list')])
    channels_rule.add_process_rule_level('a', {'href'})
    channels_rule.add_process_rule_level('img', {'src'})
    channels_rule.set_attribute_filter_function(
        'href', lambda x: '/channel/' in x or '/prime/' in x)
    channels_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(channels_rule)

    channel_categories_rule = ParserRule()
    channel_categories_rule.add_activate_rule_level([
        ('ul', 'class', 'link-tag-list long-col')
    ])
    channel_categories_rule.add_process_rule_level(
        'span', {'data-query-key', 'data-query-value'})
    parser.add_rule(channel_categories_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'player-container')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'players.push' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('ul', 'class', 'video-tag-list')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    def add_key(old, key, value):
        """Return address ``old`` with query parameter ``key`` set to ``value``."""
        addr, _, query = old.partition('?')
        pairs = []
        found = False
        for pair in query.split('&'):
            # Compare the whole parameter name: a prefix test would also
            # overwrite e.g. ``pagesize`` when asked to set ``page``.
            if pair.partition('=')[0] == key:
                pairs.append(key + '=' + value)
                found = True
            else:
                pairs.append(pair)
        if not found:
            pairs.append(key + '=' + value)
        # An address without a query yields one empty pair; strip the
        # separator it leaves behind.
        return addr + '?' + '&'.join(pairs).strip('&')

    def add_pages_info_to_result(rule, description_key='data-query-value'):
        """Add one page control to ``result`` per span collected by ``rule``."""
        for item in rule.get_result(['data-query-key', 'data-query-value']):
            key = item['data-query-key']
            val = item['data-query-value']
            description = item[description_key].strip('\t')
            addr = add_key(base_url.get(), key, val)
            result.add_page(ControlInfo(description, URL(addr + '*')))

    if video_rule.is_result():
        # Drop spaces and backslashes so the quoted source entries can be
        # cut out with plain string partitioning.
        script = video_rule.get_result()[0]['data'].replace(' ', '').replace(
            '\\', '')
        sources = script.partition("'sources':{")[2].partition(
            '}')[0].split(',')
        urls = list()
        for item in sources:
            part = item.strip("\n\t'").partition("':'")
            if part[2].startswith(('http://', 'https://')):
                data = dict(text=part[0], url=URL(part[2].strip("'") + '*'))
                urls.append(data)

        if len(urls) == 1:
            video = MediaData(urls[0]['url'])
        elif len(urls) > 1:
            # The last listed source becomes the default one.
            video = MediaData(urls[-1]['url'])
            for item in urls:
                video.add_alternate(item)
        else:
            return result

        result.set_type('video')
        result.set_video(video)

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(f['data'].strip(), URL(f['href'])))
        return result

    if channels_rule.is_result():
        result.set_type('hrefs')
        for item in channels_rule.get_result():
            info = item['href'].rpartition('/')[2].strip('*')
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=info))
        add_pages_info_to_result(channel_categories_rule,
                                 description_key='data')
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href']):
            t_url = item.get('data-original', item['src'])
            result.add_thumb(
                ThumbInfo(thumb_url=URL(t_url),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))
        add_pages_info_to_result(startpage_pages_rule)

    return result
def can_accept_index_file(self, base_url=URL()):
    """Return ``base_url.contain('babesandstars.com/')``."""
    site_marker = 'babesandstars.com/'
    return base_url.contain(site_marker)
def can_accept_index_file(self, base_url=URL()):
    """Return ``base_url.contain('collectionofbestporn.com/')``."""
    site_marker = 'collectionofbestporn.com/'
    return base_url.contain(site_marker)
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://www.pornhd.com/?order=newest*"
    return URL(start_address)
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://fineartteens.com/"
    return URL(start_address)
def can_accept_index_file(self, base_url=URL()):
    """Return True when ``base_url`` contains any entry of ``self.accepted_sites``."""
    return any(base_url.contain(site) for site in self.accepted_sites)
def can_accept_index_file(self, base_url=URL()):
    """Return ``base_url.contain('sexix.net/')``."""
    site_marker = 'sexix.net/'
    return base_url.contain(site_marker)
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and describe it as a ParseResult.

    Rules are registered for thumbnails, the page navigator, the tag cloud,
    the video iframe and the extra links; the file is then run through
    ``self.proceed_parcing``.

    * If the video rule matched, each iframe address is loaded, the playlist
      address is taken from its ``jwplayer().load('...')`` call and loaded
      too, and every ``<jwplayer:source file="...">`` entry of the playlist
      item is collected.  A ``LoaderError`` is printed and does not abort
      parsing.  One control is added per extra link.
    * Otherwise, if thumbnails were found, thumbs, pages and tag controls
      are added.

    :param fname: path of the page file handed to ``proceed_parcing``.
    :param base_url: URL object of the page, used to resolve links.
    :return: the ParseResult (left empty when no rule matched).
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_pages_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'tagcloud')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
    parser.add_rule(tags_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'videoContainer')])
    video_rule.add_process_rule_level('iframe', {'src'})
    video_rule.set_attribute_modifier_function('src', lambda x: self.get_href(x, base_url))
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for item in video_rule.get_result():
            try:
                r = load(URL(item['src']))
                r = load(URL(self.quotes(r.text, "jwplayer().load('", "'") + '*'))
                source = self.quotes(r.text, '<item>', '</item>').strip()
                for chunk in source.split('<jwplayer:source file="'):
                    # Skip empty pieces of the split.  This is an equality
                    # test on purpose: ``is ''`` compared object identity,
                    # which is not guaranteed for equal strings.
                    if not chunk:
                        continue
                    url = chunk.partition('"')[0]
                    label = self.quotes(chunk, 'label="', '"')
                    urls.add(label, URL(url + '*'))
            except LoaderError as err:
                print(err)

        result.set_video(urls.get_media_data())

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'].strip(), URL(f['href'])))
        return result

    if startpage_rule.is_result():
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://sexix.net/?orderby=date*"
    return URL(start_address)
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and describe it as a ParseResult.

    Rules are set up for thumbnails, the page navigator, the link list, the
    player script and the category links, then the file is run through
    ``self.proceed_parcing``.  A matching video rule yields a video result
    (one ``'default'`` source per script, plus category controls);
    otherwise matching thumbnails yield an ``'hrefs'`` result with thumbs,
    pages and link controls.

    :param fname: path of the page file handed to ``proceed_parcing``.
    :param base_url: URL object of the page, used to resolve links.
    :return: the ParseResult (left empty when no rule matched).
    """
    parser = SiteParser()

    def star_get_url(txt=''):
        # Not called anywhere inside this method.
        after_open = txt.partition('(')[2]
        return after_open.partition(')')[0]

    def absolute(link):
        # Shared href modifier for every rule below.
        return self.get_href(link, base_url)

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'videos_form')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'data-lazy-src'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'videos_page')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function('data', lambda text: 'jwplayer(' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'Categories')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for script in video_rule.get_result():
            compact = script['data'].replace(' ', '')
            video_file = self.quotes(compact, "file:'", "'")
            urls.add('default', URL(video_file + '*'))
        result.set_video(urls.get_media_data())

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for thumb in startpage_rule.get_result(['href']):
            info = ThumbInfo(thumb_url=URL(thumb['data-lazy-src']),
                             href=URL(thumb['href']),
                             popup=thumb.get('alt', ''))
            result.add_thumb(info)

        for page in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for entry in startpage_hrefs_rule.get_result(['href']):
            href = entry['href']
            # The label is the second-to-last '/'-separated piece of the link.
            label = href.split('/')[-2]
            result.add_control(ControlInfo(label, URL(href)))

    return result
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://www.pornhd4k.com/"
    return URL(start_address)
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and describe it as a ParseResult.

    Rules are set up for start-page thumbnails, page links, sub-thumbnail
    links, full pictures and picture menu links.  The file is read as UTF-8
    and fed to the parser line by line.  Every rule family that produced
    results contributes to the one returned ParseResult (thumbs and pages,
    thumbs from the sub list, full pictures and controls).

    :param fname: path of the page file to read.
    :param base_url: URL object of the page; not used by this method.
    :return: the ParseResult (left empty when no rule matched).
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs'),
                                            ('div', 'class', 'movie_thumbs')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', get_href)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'head')])
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pages')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: 'http://www.tomorrowporn.com' + x)
    parser.add_rule(startpage_pages_rule)

    href_rule = ParserRule()
    href_rule.add_activate_rule_level([('ul', 'class', 'sub_thumb_list')])
    href_rule.add_process_rule_level('a', {'href'})
    href_rule.add_process_rule_level('img', {'src', 'alt'})
    href_rule.set_attribute_modifier_function('href', get_href)
    parser.add_rule(href_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([
        ('div', 'class', 'thumb_box'),
        ('div', 'class', 'thumb_box bottom_corners'),
        ('div', 'class', 'thumb_box top_corners')
    ])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    # NOTE(review): this removes every 't' from the src value, not just a
    # thumbnail marker -- confirm that is intended for this site's names.
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('t', ''))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'menus')])
    picture_href_rule.add_process_rule_level('h2', set())
    picture_href_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(picture_href_rule)

    # The context manager closes the file even if the parser raises.
    with open(fname, encoding='utf-8') as page:
        for s in page:
            parser.feed(s)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        for item in startpage_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    if len(href_rule.get_result()) > 0:
        for item in href_rule.get_result():
            # Links without an image cannot be shown as thumbs.
            if 'src' in item:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

    if len(picture_rule.get_result()) > 0:
        for f in picture_rule.get_result():
            result.add_full(FullPictureInfo(rel_name=f['src']))

        for f in picture_href_rule.get_result():
            result.add_control(ControlInfo(f['title'], URL(f['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and describe it as a ParseResult.

    Rules are set up for post thumbnails, pager links, gallery pictures and
    gallery links; the file is fed to the parser line by line.  Thumbnails
    give an ``'hrefs'`` result with thumbs and pages.  Pictures give a
    ``'pictures'`` result whose items are named ``001.jpg``, ``002.jpg``,
    ... in order, plus one control for every gallery link whose href
    starts with ``/``.

    :param fname: path of the page file to read.
    :param base_url: URL object of the page; its domain is prepended to
        pager links and gallery links.
    :return: the ParseResult (left empty when no rule matched).
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'post'),
                                            ('div', 'class', 'post300')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', lambda x: get_href(x))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'id', 'pager')])
    startpage_pages_rule.add_activate_rule_level([('div', 'id', 'pc')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'id', 'cc')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src', 'title'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: _del_thumb(text))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'id', 'cc')])
    picture_href_rule.add_activate_rule_level([('div', 'class', 'shorttext')])
    picture_href_rule.add_process_rule_level('a', {'href', 'alt'})
    parser.add_rule(picture_href_rule)

    # The context manager closes the file even if the parser raises.
    with open(fname) as page:
        for s in page:
            parser.feed(s)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href', 'src']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href'] + '*'),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(
                ControlInfo(item['data'], URL(item['href'] + '*')))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        # Pictures are numbered from 1 in the order the rule collected them.
        for i, f in enumerate(picture_rule.get_result(['src', 'title']), 1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name='%03d.jpg' % i))

        for f in picture_href_rule.get_result():
            if f['href'].startswith('/'):
                result.add_control(
                    ControlInfo(text=f['alt'],
                                url=URL(base_url.domain() + f['href'])))

    return result
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://collectionofbestporn.com/most-recent*"
    return URL(start_address)
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://www.pornstar.hu/galleries"
    return URL(start_address)
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and describe it as a ParseResult.

    Rules are set up for video thumbnails, pagination links, ``<source>``
    tags inside ``<video>`` and tag links, then the file is run through
    ``self.proceed_parcing``.  Video sources give a video result plus tag
    controls; otherwise thumbnails give thumbs and pages, and captions are
    switched on as soon as a thumb link contains ``/category/``.

    :param fname: path of the page file handed to ``proceed_parcing``.
    :param base_url: URL object of the page, used to resolve links.
    :return: the ParseResult (left empty when no rule matched).
    """
    parser = SiteParser()

    def absolute(link):
        # Shared href modifier for the thumbnail and pagination rules.
        return self.get_href(link, base_url)

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'video-thumb')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('video', '', '')])
    video_rule.add_process_rule_level('source', {'src', 'label', 'res'})
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'tags-container')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for source in video_rule.get_result(['src', 'res']):
            urls.add(source['res'], URL(source['src']))
        result.set_video(urls.get_media_data(-1))

        for tag in gallery_href_rule.get_result(['data', 'href']):
            tag_url = URL(tag['href'] + '*')
            result.add_control(ControlInfo(tag['data'], tag_url))
        return result

    if startpage_rule.is_result():
        for thumb in startpage_rule.get_result(['href', 'src']):
            href = thumb['href']
            if '/category/' in href:
                result.set_caption_visible(True)
            info = ThumbInfo(thumb_url=URL(thumb['src']),
                             href=URL(href),
                             popup=thumb.get('alt', ''))
            result.add_thumb(info)

        for page in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

    return result
def get_start_button_menu_text_url_dict(self):
    """Return a dict mapping three menu labels to toseeporn.com URL objects."""
    return {
        'Pornstars': URL('http://toseeporn.com/Actor*'),
        'Home': URL('http://toseeporn.com/*'),
        'Search_Example': URL(
            'http://toseeporn.com/Search=asian%20sex%20diary*'),
    }
def can_accept_index_file(self, base_url=URL()):
    """Return ``base_url.contain('pornhd.com/')``."""
    site_marker = 'pornhd.com/'
    return base_url.contain(site_marker)
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://toseeporn.com/Category/West%20Porn*"
    return URL(start_address)
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://www.babesandstars.com/galleries/"
    return URL(start_address)
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and describe it as a ParseResult.

    Rules are set up for thumbnails (image address taken from the ``style``
    attribute), pagination, footer tags, navbar categories, the page script
    and tag-area links; the file is then run through
    ``self.proceed_parcing``.

    * Script found: the address after ``host:'`` in the script is loaded
      into ``Setting.base_dir + 'tsp_video.json'`` and its
      ``mediaSources`` entries (duplicates by ``source`` skipped) become
      the video sources.  A ``LoaderError`` is printed; tag-area controls
      are added afterwards in either case.  With no sources the result is
      returned at once.
    * Otherwise thumbnails give an ``'hrefs'`` result with thumbs,
      non-blank pages, category controls and tag controls.

    :param fname: path of the page file handed to ``proceed_parcing``.
    :param base_url: URL object of the page, used to resolve links.
    :return: the ParseResult (left empty when no rule matched).
    """
    parser = SiteParser()

    def absolute(link):
        # Shared href modifier for every rule below.
        return self.get_href(link, base_url)

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'fixed-content')])
    startpage_rule.add_process_rule_level('a', {'href', 'class'})
    startpage_rule.add_process_rule_level('div', {'style'})
    startpage_rule.set_attribute_filter_function(
        'class', lambda x: x == 'thumbnail')
    startpage_rule.set_attribute_modifier_function('href', absolute)
    # Keep only what sits between url(' and ') in the style value.
    startpage_rule.set_attribute_modifier_function(
        'style', lambda x: x.partition("url('")[2].partition("')")[0])
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([
        ('div', 'class', 'col-xs-12 content-pagination')
    ])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('section', 'id', 'footer-tag')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(tags_rule)

    categories_rule = ParserRule()
    categories_rule.add_activate_rule_level([('ul', 'class', 'nav navbar-nav')])
    categories_rule.add_process_rule_level('a', {'href'})
    categories_rule.set_attribute_filter_function(
        'href', lambda x: '/Category/' in x and "#" not in x)
    categories_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(categories_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('body', '', '')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'angular.' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'row tag-area')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        script = video_rule.get_result()[0]['data'].replace(' ', '')
        host = self.quotes(script, "host:'", "'")
        json_file_url = self.get_href(host, base_url)

        from requests_loader import load, LoaderError

        json_file = Setting.base_dir + 'tsp_video.json'
        urls = list()
        result.set_type('video')

        try:
            response = load(URL(json_file_url), json_file)
            seen = set()
            for source in response.json()['mediaSources']:
                link = source['source']
                if link in seen:
                    continue
                urls.append(dict(text=source['quality'], url=URL(link + '*')))
                seen.add(link)

            if not urls:
                return result

            # The first source is the default; with several sources all of
            # them (the first included) are registered as alternates.
            video = MediaData(urls[0]['url'])
            if len(urls) > 1:
                for alternate in urls:
                    video.add_alternate(alternate)

            result.set_video(video)
        except LoaderError as err:
            print(err)

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for thumb in startpage_rule.get_result(['href']):
            info = ThumbInfo(thumb_url=URL(thumb['style']),
                             href=URL(thumb['href']),
                             popup=thumb.get('alt', ''))
            result.add_thumb(info)

        for page in startpage_pages_rule.get_result(['href', 'data']):
            label = page['data'].replace(' ', '')
            if len(label) > 0:
                result.add_page(ControlInfo(label, URL(page['href'])))

        if categories_rule.is_result(['href']):
            for category in categories_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(category['data'], URL(category['href'])))

        if tags_rule.is_result(['href']):
            for tag in tags_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(tag['data'], URL(tag['href'])))

    return result
def get_start_button_menu_text_url_dict(self):
    """Return a dict mapping three menu labels to babesandstars.com URL objects."""
    return {
        'Videos': URL('http://www.babesandstars.com/videos/'),
        'Photos': URL('http://www.babesandstars.com/galleries/'),
        'Top100models': URL('http://www.babesandstars.com/top-models/'),
    }
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://www.tomorrowporn.com/"
    return URL(start_address)
def parse_index_file(self, fname, base_url=URL()):
    """Parse the saved page ``fname`` and describe it as a ParseResult.

    Rules are set up for thumbnail links, pagination, the letter index,
    gallery pictures, the video ``<source>`` tag and the model links; the
    file is fed to the parser line by line.  The first matching family
    decides the result:

    * video source found: type ``'video'`` using the first source, plus one
      control per model link;
    * pictures found: type ``'pictures'`` with one full picture per link
      (named after the last path piece of its href), plus model controls;
    * thumbnail links found: type ``'hrefs'`` with thumbs, letter controls
      and pages.

    :param fname: path of the page file to read.
    :param base_url: URL object of the page, used to resolve links.
    :return: the ParseResult (left empty when no rule matched).
    """
    parser = SiteParser()

    # Start page and model page thumbnails.
    href_rule = ParserRule()
    href_rule.add_activate_rule_level([('div', 'class', 'galleries'),
                                       ('div', 'class', 'models'),
                                       ('div', 'class', 'videos')])
    href_rule.add_activate_rule_level([('div', 'class', 'items')])
    href_rule.add_process_rule_level('a', {'href'})
    href_rule.add_process_rule_level('img', {'src', 'alt'})
    href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    href_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url))
    parser.add_rule(href_rule)

    # Page numbers.
    href_page_rule = ParserRule()
    href_page_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    href_page_rule.add_process_rule_level('a', {'href'})
    href_page_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(href_page_rule)

    model_litera_rule = ParserRule()
    model_litera_rule.add_activate_rule_level([('span', 'class', 'chars')])
    model_litera_rule.add_process_rule_level('a', {'href'})
    model_litera_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(model_litera_rule)

    # Gallery pictures.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'picture')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'alt'})
    picture_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(picture_rule)

    # Video source.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'video')])
    video_rule.add_process_rule_level('source', {'src'})
    parser.add_rule(video_rule)

    # Links shown next to a gallery or video.
    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'model')])
    picture_href_rule.add_activate_rule_level([('div', 'class', 'links')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    picture_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(picture_href_rule)

    # The context manager closes the file even if the parser raises.
    with open(fname) as page:
        for s in page:
            parser.feed(s)

    result = ParseResult()

    if len(video_rule.get_result()) > 0:
        result.set_video(
            MediaData(URL(video_rule.get_result()[0]['src'] + '*')))
        result.set_type('video')
        for f in picture_href_rule.get_result(['href', 'data']):
            result.add_control(
                ControlInfo(text=f['data'], url=URL(f['href'])))
        return result

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        for f in picture_rule.get_result():
            x = FullPictureInfo(abs_href=URL(f['href']),
                                rel_name=f['href'].rpartition('/')[2])
            result.add_full(x)

        for f in picture_href_rule.get_result(['href', 'data']):
            result.add_control(
                ControlInfo(text=f['data'], url=URL(f['href'])))
        return result

    if len(href_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in href_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in model_litera_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

        for item in href_page_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def can_accept_index_file(self, base_url=URL()):
    """Return ``base_url.contain('tomorrowporn.com/')``."""
    site_marker = 'tomorrowporn.com/'
    return base_url.contain(site_marker)
def startpage(self):
    """Return the URL object built from this parser's start address."""
    start_address = "http://motherless.com/videos/recent?page=1*"
    return URL(start_address)
def get_start_button_menu_text_url_dict(self):
    """Return a dict mapping two menu labels to bravonude.com URL objects."""
    return {
        'Pictures': URL('http://www.bravonude.com/'),
        'Movies': URL('http://www.bravonude.com/erotica-videos/'),
    }