from pyquery import PyQuery as pq

# CWebParserSingleUrl, CWebSpiderUtils, CWebParserSiteCommon and CWebDataDbUtis
# are project-internal classes imported from the spider framework's own modules.


class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        '''parse_page
        @author: chenzf
        '''
        try:
            if not url:
                yield None
                return  # without this, the code below would try to fetch None
            html = self.utils.get_page(url, headers={"Accept-Encoding": ""})
            if html:
                a = pq(html)
                items = a('a.list_model')
                for item in items.items():
                    modelurl = item.attr('href')
                    name = item('b').text()
                    board = item('img').attr('src')
                    if self.dbUtils.get_db_url(modelurl):
                        continue  # model page already processed
                    html = self.utils.get_page(modelurl)
                    if html:
                        b = pq(html)
                        products = b('a.list_model2')
                        try:
                            for product in products.items():
                                data_p = self.common.parse_item(product)
                                data_t = {
                                    'name': self.utils.format_name(name),
                                    'url': modelurl,
                                    'board': board,
                                    'refurl': modelurl,
                                }
                                data = dict(data_t, **data_p)
                                yield data
                        except Exception:
                            continue  # skip this model; do not mark it done
                        self.dbUtils.put_db_url(modelurl)
                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
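
# A minimal sketch of how a parse_page generator is consumed. The trailing
# "yield None" above suggests the framework treats None as an end-of-stream
# sentinel instead of relying on StopIteration; the driver loop below is
# illustrative only, not the framework's actual code.
def consume(parse_page, url):
    for data in parse_page(url):
        if data is None:
            break  # sentinel: no more items for this url
        print('item:', data.get('name'), '->', data.get('url'))

def _fake_parse_page(url):
    # stand-in for CWebParserSite.parse_page
    yield {'name': 'demo', 'url': url}
    yield None

consume(_fake_parse_page, 'https://example.com/model/1')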
# Variant __init__ used by sites whose TLS certificates fail verification:
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.utils.verify = False  # skip TLS certificate verification for this site
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))
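
# "self.utils.verify = False" implies CWebSpiderUtils wraps an HTTP client and
# can skip TLS certificate verification. Assuming a requests-based backend
# (an assumption; get_page below is a hypothetical stand-in, not the project's
# actual CWebSpiderUtils code), the equivalent call would be:
import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def get_page(url, verify=False):
    # verify=False disables certificate checks; only for sites with broken chains
    resp = requests.get(url, verify=verify, timeout=30)
    return resp.text if resp.status_code == 200 else None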
from urllib.parse import urljoin

from pyquery import PyQuery as pq


class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        '''parse_page
        @author: chenzf
        '''
        try:
            if url is None:
                yield None
                return
            while True:
                html = self.utils.get_page(url)
                if html:
                    a = pq(html)  # parse before the db check so the next-page link is always available
                    if not self.dbUtils.get_db_url(url):
                        items = a('ul.set.babes_main li')
                        parse_succeed = True
                        for item in items.items():
                            try:
                                name = item('b a').text()
                                board = 'https:' + item('a img').attr('lsrc') + '.jpg'
                                model_url = urljoin('https://www.hqbabes.com/',
                                                    item('b a').attr('href'))
                                html2 = self.utils.get_page(model_url)
                                if html2:
                                    b = pq(html2)
                                    modelitems = b('ul.set.babe li')
                                    for modelitem in modelitems.items():
                                        try:
                                            data_p = self.common.parse_item(modelitem)
                                            data_t = {
                                                'name': self.utils.format_name(name),
                                                'url': model_url,
                                                'board': board,
                                                'refurl': url,
                                            }
                                            data = dict(data_t, **data_p)
                                            yield data
                                        except Exception:
                                            self.log('parse error 1 %s_%s' % (url, modelitem))
                                            parse_succeed = False
                                            continue
                            except Exception:
                                self.log('parse error 2 %s_%s' % (url, item))
                                parse_succeed = False
                                continue
                        if parse_succeed:
                            self.log('parsed url %s' % url)
                            self.dbUtils.put_db_url(url)
                    next_url = a('#pages li a[count="Next Page"]')
                    if next_url:
                        url = urljoin('https://www.hqbabes.com/', next_url.attr('href'))
                        self.log('request %s' % url)
                    else:
                        break
                else:
                    self.log('request %s error' % url)
                    continue  # retry the same page
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
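
# The loop above follows the '#pages li a[count="Next Page"]' link until none
# is left. A standalone sketch of the same pattern against canned markup
# (URLs and markup here are illustrative only):
from urllib.parse import urljoin
from pyquery import PyQuery as pq

pages = {
    'https://example.com/': '<ul id="pages"><li><a count="Next Page" href="/p2">next</a></li></ul>',
    'https://example.com/p2': '<ul id="pages"></ul>',  # no next link: loop ends
}
url = 'https://example.com/'
while url:
    print('visiting', url)
    nxt = pq(pages[url])('#pages li a[count="Next Page"]')
    url = urljoin('https://example.com/', nxt.attr('href')) if nxt else None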
from urllib.parse import urljoin

from pyquery import PyQuery as pq


class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.utils.verify = False  # site served with a certificate that fails verification
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        '''parse_page
        @author: chenzf
        '''
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return  # listing page already processed
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                items = a('div.thumbs div.thumb > a')
                processNum = 0
                parse_succeed = True
                for item in items.items():
                    try:
                        name = item.text()
                        model_url = urljoin('https://www.erosberry.com/', item.attr('href'))
                        html2 = self.utils.get_page(model_url)
                        if html2:
                            b = pq(html2)
                            board = urljoin('https://www.erosberry.com/',
                                            b('div.info > img').attr('src'))
                            modelitems = b('div.girl_thumbs div.container > a')
                            for modelitem in modelitems.items():
                                try:
                                    data_p = self.common.parse_item(modelitem)
                                    data_t = {
                                        'name': name,
                                        'url': model_url,
                                        'board': board,
                                        'refurl': url,
                                    }
                                    data = dict(data_t, **data_p)
                                    yield data
                                    processNum += 1
                                except Exception:
                                    parse_succeed = False
                                    continue
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed and processNum > 0:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def urls_genarator(self):
        # framework hook; spelling kept as the base class expects it
        for i in range(self.start, self.end + 1):
            yield self.url.format(page=i * 44)
        yield None
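
# urls_genarator pages by item offset rather than page number: the listing
# shows 44 thumbs per page, so page i maps to offset i * 44. With start=0,
# end=2 and a template like 'https://www.erosberry.com/latest/{page}/'
# (template assumed for illustration), the generator yields:
for i in range(0, 3):
    print('https://www.erosberry.com/latest/{page}/'.format(page=i * 44))
# -> .../latest/0/  .../latest/44/  .../latest/88/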
from urllib.parse import urljoin

from pyquery import PyQuery as pq


class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        '''parse_page
        @author: chenzf
        '''
        try:
            if not url:
                yield None
                return
            url_origin = url
            # resume support: scan forward to the first page not yet in the db
            index = 1
            while True:
                search_url = "%s?page=%s" % (url_origin, index)
                if index == 1:
                    if self.dbUtils.get_db_url(url_origin):
                        index += 1
                        continue
                elif self.dbUtils.get_db_url(search_url):
                    index += 1
                    continue
                break
            if index > 2:
                # back up one page so a possibly half-finished page is re-fetched
                index -= 1
                search_url = "%s?page=%s" % (url_origin, index)
            else:
                search_url = url_origin
            while True:
                self.log('request %s' % search_url)
                html2 = self.utils.get_page(search_url)
                if not html2:
                    break
                if not self.dbUtils.get_db_url(search_url):
                    a = pq(html2)
                    items = a('div.js_video_row div.video-box a.video-box-image')
                    parse_succeeded = True
                    for item in items.items():
                        try:
                            data_p = self.common.parse_item(item)
                            if not data_p:
                                parse_succeeded = False
                                continue
                            elif self.common.parse_detail_fr_brief_duplicate(data_p):
                                continue  # already captured via another listing
                            data_t = {
                                'name': 'Categories',
                                'url': data_p.get('brief').get('url'),
                                # 'refurl': search_url
                            }
                            data = dict(data_t, **data_p)
                            yield data
                        except Exception:
                            parse_succeeded = False
                            continue
                    if parse_succeeded:
                        self.log('parsed url %s' % search_url)
                        self.dbUtils.put_db_url(search_url)
                    else:
                        self.log('request %s error' % search_url)
                next_url = pq(html2)('#next .prev-next a').attr("data-page-number")
                if next_url:
                    search_url = "%s?page=%s" % (url_origin, next_url)
                else:
                    break
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def urls_genarator(self):
        # framework hook; spelling kept as the base class expects it
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            categorys = a('#categoryList a.categoryBox')
            for category in categorys.items():
                yield urljoin("https://www.youporn.com", category.attr('href'))
        yield None
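
# The index scan at the top of parse_page resumes a crawl: walk ?page=1,
# ?page=2, ... until a page is missing from the db, then back up one page so a
# possibly half-finished page is re-fetched. The same logic as a pure function,
# with a set standing in for CWebDataDbUtis (illustrative sketch):
def resume_url(url_origin, seen):
    index = 1
    while True:
        search_url = "%s?page=%s" % (url_origin, index)
        key = url_origin if index == 1 else search_url
        if key in seen:
            index += 1
            continue
        break
    if index > 2:
        return "%s?page=%s" % (url_origin, index - 1)
    return url_origin  # first or second page: start from the category root

seen = {'https://example.com/c', 'https://example.com/c?page=2'}
print(resume_url('https://example.com/c', seen))  # -> https://example.com/c?page=2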
from pyquery import PyQuery as pq


class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    def parse_page(self, url):
        '''parse_page
        @author: chenzf
        '''
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if not html:
                self.log('request %s error' % url)
            elif not self.dbUtils.get_db_url(url):
                a = pq(html)
                items = a('#popular_pornstars_wrapper div.pornstar a')
                for item in items.items():
                    name = item('img').attr('alt')
                    board = item('img').attr('src')
                    model_url_origin = item.attr('href')
                    # resume support: scan forward to the first page not yet in the db
                    index = 1
                    while True:
                        model_url = "%spage/%s/" % (model_url_origin, index)
                        if index == 1:
                            if self.dbUtils.get_db_url(model_url_origin):
                                index += 1
                                continue
                        elif self.dbUtils.get_db_url(model_url):
                            index += 1
                            continue
                        break
                    if index > 2:
                        # back up one page so a possibly half-finished page is re-fetched
                        index -= 1
                        model_url = "%spage/%s/" % (model_url_origin, index)
                    else:
                        model_url = model_url_origin
                    while True:
                        self.log('request %s' % model_url)
                        html2 = self.utils.get_page(model_url)
                        if not html2:
                            break
                        if not self.dbUtils.get_db_url(model_url):
                            data_ps, parse_res = self.parse_sub_page(html2)
                            for data_p in data_ps:
                                data_t = {
                                    'name': name,
                                    'url': model_url,
                                    'board': board,
                                    'refurl': url,
                                }
                                data = dict(data_t, **data_p)
                                yield data
                            if parse_res:
                                self.log('parsed url %s' % model_url)
                                self.dbUtils.put_db_url(model_url)
                        next_url = pq(html2)('#pagination_next')
                        if next_url:
                            model_url = next_url.attr('href')
                        else:
                            break
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def parse_sub_page(self, html):
        b = pq(html)
        items = b('a.video-thumb-link')
        sub_datas = []
        # tri-state flag: None = no items on the page, True = all parsed, False = any failed
        parse_succeeded = None
        for item in items.items():
            if parse_succeeded is None:
                parse_succeeded = True
            try:
                data_p = self.common.parse_item(item)
                sub_datas.append(data_p)
            except Exception:
                parse_succeeded = False  # one failure keeps the page unmarked
        return sub_datas, parse_succeeded
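
# parse_sub_page returns a tri-state flag: None when the page had no thumbs,
# True when every thumb parsed, False when any one failed; only a truthy
# result lets the caller record the page as done. The same reduction as a
# standalone function (illustrative, using int() as a parser that can fail):
def all_parsed(items, parse):
    ok = None  # None -> nothing to parse on this page
    results = []
    for it in items:
        if ok is None:
            ok = True
        try:
            results.append(parse(it))
        except Exception:
            ok = False  # one failure poisons the whole page
    return results, ok

print(all_parsed([], int))          # ([], None)
print(all_parsed(['1', '2'], int))  # ([1, 2], True)
print(all_parsed(['1', 'x'], int))  # ([1], False)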