# -*- coding: utf-8 -*-
# Shared imports for the per-site parsers below (each class lives in its own
# module in the original repo). The stdlib/pyquery imports are required as
# written; the project-internal module paths are assumptions -- adjust them
# to wherever the base classes live in the actual repo layout.
import re
from urllib.parse import urljoin
from pyquery import PyQuery as pq
from web_parser import CWebParserSingleUrl, CWebParserMultiUrl  # assumed path
from web_utils import CWebSpiderUtils, CWebDataDbUtis           # assumed path
from web_common import CWebParserSiteCommon                     # assumed path


class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                # URL already crawled; signal the caller and stop.
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                items = a(
                    'div.gallery-section '
                    'div.thumb-list.gallery.thumb-list--sidebar '
                    'div.thumb-list__item.gallery-thumb a'
                )
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': "Galleries",
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserHunterSingleUrl(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('#content h2').prev_all('ul.gallery-a li')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('.products .contain .grid .col-sm-12')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': data_p.get('brief').get('name'),
                            'url': data_p.get('brief').get('url'),
                            # 'board': data_p.get('brief').get('board'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.dbUtils.put_db_url(url)
            else:
                self.log('html none in parse url %s' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                soup = pq(html)
                items = soup('.listProfiles li')
                for item in items.items():
                    try:
                        # parse_item is itself a generator here; it signals the
                        # end of an item's records by yielding a falsy value.
                        for data in self.common.parse_item(item):
                            if not data:
                                break
                            yield data
                    except Exception:
                        self.log('error in item in url %s' % url)
                        continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('ul.gallery-d li')
                for item in items.items():
                    data_p = self.common.parse_item(item)
                    data_t = {
                        'name': data_p.get('brief').get('name'),
                        'url': data_p.get('brief').get('url'),
                        'board': data_p.get('brief').get('board'),
                        'refurl': url
                    }
                    data = dict(data_t, **data_p)
                    yield data
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('li.g1-collection-item')
                for item in items.items():
                    data = self.common.parse_item(item)
                    yield data
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            url_origin = url
            # Resume support: probe ?page=N until the first page not yet in
            # the database, then step back one page so the new scan overlaps
            # the last page already saved.
            index = 1
            while True:
                search_url = "%s?page=%s" % (url_origin, index)
                if index == 1:
                    if self.dbUtils.get_db_url(url_origin):
                        index = index + 1
                        continue
                elif self.dbUtils.get_db_url(search_url):
                    index = index + 1
                    continue
                break
            if index > 2:
                index = index - 1
                search_url = "%s?page=%s" % (url_origin, index)
            else:
                search_url = url_origin
            while True:
                self.log('request %s' % search_url)
                html2 = self.utils.get_page(search_url)
                if html2:
                    if not self.dbUtils.get_db_url(search_url):
                        a = pq(html2)
                        items = a('div.js_video_row div.video-box a.video-box-image')
                        parse_succeeded = True
                        for item in items.items():
                            try:
                                data_p = self.common.parse_item(item)
                                if not data_p:
                                    parse_succeeded = False
                                    continue
                                elif self.common.parse_detail_fr_brief_duplicate(data_p):
                                    continue
                                data_t = {
                                    'name': 'Categories',
                                    'url': data_p.get('brief').get('url'),
                                    # 'refurl': search_url
                                }
                                data = dict(data_t, **data_p)
                                yield data
                            except Exception:
                                parse_succeeded = False
                                continue
                        if parse_succeeded:
                            self.log('parsed url %s' % search_url)
                            self.dbUtils.put_db_url(search_url)
                        else:
                            self.log('request %s error' % search_url)
                    next_url = pq(html2)('#next .prev-next a').attr("data-page-number")
                    if next_url:
                        search_url = "%s?page=%s" % (url_origin, next_url)
                    else:
                        break
                else:
                    break
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def urls_genarator(self):
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            categorys = a('#categoryList a.categoryBox')
            for category in categorys.items():
                yield urljoin("https://www.youporn.com", category.attr('href'))
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                if not self.dbUtils.get_db_url(url):
                    a = pq(html)
                    # items
                    items = a('#content > ul li.pornstars a')
                    for item in items.items():
                        name = item('a img').attr('alt')
                        board = item('a img').attr('src')
                        model_url_origin = urljoin(
                            'https://www.thumbzilla.com/', item.attr('href'))
                        # resume pagination: probe ?page=N until the first
                        # unvisited page, then step back one page
                        index = 1
                        while True:
                            model_url = "%s?page=%s" % (model_url_origin, index)
                            if index == 1:
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index = index + 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index = index + 1
                                continue
                            break
                        if index > 2:
                            index = index - 1
                            model_url = "%s?page=%s" % (model_url_origin, index)
                        else:
                            model_url = model_url_origin
                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if html2:
                                if not self.dbUtils.get_db_url(model_url):
                                    data_ps, parse_res = self.parse_sub_page(html2)
                                    for data_p in data_ps:
                                        data_t = {
                                            'name': name,
                                            'url': model_url,
                                            'board': board,
                                            'refurl': url
                                        }
                                        data = dict(data_t, **data_p)
                                        yield data
                                    if parse_res:
                                        self.log('parsed url %s' % model_url)
                                        self.dbUtils.put_db_url(model_url)
                                next_url = pq(html2)('li.page_next a')
                                if next_url:
                                    model_url = urljoin(
                                        'https://www.thumbzilla.com',
                                        next_url.attr('href'))
                                else:
                                    break
                            else:
                                break
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def parse_sub_page(self, html):
        b = pq(html)
        items = b('#content ul li:gt(4) a')
        sub_datas = []
        # None means "no items found"; once any item fails, the flag stays
        # False so a later success cannot mask the failure
        parse_succeeded = None
        for item in items.items():
            try:
                data_p = self.common.parse_item(item)
                sub_datas.append(data_p)
                if parse_succeeded is None:
                    parse_succeeded = True
            except Exception:
                parse_succeeded = False
        return sub_datas, parse_succeeded
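# Several parsers above and below repeat the same "resume pagination" scan
# inline: probe page URLs against the crawl database until the first unvisited
# page, then step back one page so the new scan overlaps the last saved page.
# A minimal standalone sketch of that idiom, assuming only an object with the
# get_db_url() contract of CWebDataDbUtis (the function name and the fmt
# parameter are hypothetical, for illustration):
def find_resume_page(db, url_origin, fmt="%s?page=%s"):
    # probe ?page=1, ?page=2, ... until the first URL not yet in the database
    index = 1
    while True:
        # page 1 is recorded under the bare URL, without the query string
        probe = url_origin if index == 1 else fmt % (url_origin, index)
        if db.get_db_url(probe):
            index += 1
            continue
        break
    # step back one page so the new scan overlaps the last page saved
    if index > 2:
        return fmt % (url_origin, index - 1)
    return url_origin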
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a(
                    'div.ts-responsive-wrap div.tshowcase-inner-box '
                    'div.tshowcase-box-photo > a'
                )
                for item in items.items():
                    modelurl = item.attr('href')
                    modelsearch = modelurl
                    name = item('img').attr('title')
                    board = item('img').attr('src')
                    try:
                        # follow the model's gallery pages via link[rel=next]
                        while modelsearch is not None:
                            html2 = self.utils.get_page(modelsearch)
                            if html2:
                                b = pq(html2)
                                if not self.dbUtils.get_db_url(modelsearch):
                                    products = b('div.home_box > a')
                                    for product in products.items():
                                        data_p = self.common.parse_item(product)
                                        data_t = {
                                            'name': self.utils.format_name(name),
                                            'url': modelurl,
                                            'board': board,
                                            'refurl': modelurl
                                        }
                                        data = dict(data_t, **data_p)
                                        yield data
                                    self.dbUtils.put_db_url(modelsearch)
                                # advance even when the page was already in the
                                # database; retrying the same URL would loop forever
                                nexturl = b('link[rel=next]')
                                if nexturl:
                                    modelsearch = nexturl.attr('href')
                                else:
                                    modelsearch = None
                            else:
                                modelsearch = None
                    except Exception:
                        continue
                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            html = self.utils.get_page(url, headers={"Accept-Encoding": ""})
            if html:
                a = pq(html)
                # items
                items = a('ul.links li.hideli')
                for item in items.items():
                    modelurl = item('a').attr('href')
                    name = item('a').attr('title')
                    if self.dbUtils.get_db_url(modelurl):
                        continue
                    html2 = self.utils.get_page(modelurl,
                                                headers={"Accept-Encoding": ""})
                    if html2:
                        b = pq(html2)
                        products = b('li.box-shadow')
                        try:
                            for product in products.items():
                                data_p = self.common.parse_item(product)
                                data_t = {
                                    'name': name,
                                    'url': modelurl,
                                    'refurl': modelurl
                                }
                                data = dict(data_t, **data_p)
                                yield data
                        except Exception:
                            continue
                        self.dbUtils.put_db_url(modelurl)
                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))
        self.utils.verify = False

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('#models tr td a')
                for item in items.items():
                    name = item.attr('title')
                    model_url = urljoin('https://www.babesmachine.com',
                                        item.attr('href'))
                    if self.dbUtils.get_db_url(model_url):
                        continue  # model page already crawled
                    html2 = self.utils.get_page(model_url)
                    if html2:
                        b = pq(html2)
                        modelitems = b('#posts tr td a')
                        parse_succeed = True
                        for modelitem in modelitems.items():
                            try:
                                data_p = self.common.parse_item(modelitem)
                                data_t = {
                                    'name': name,
                                    'url': model_url,
                                    'refurl': url
                                }
                                data = dict(data_t, **data_p)
                                yield data
                            except Exception:
                                parse_succeed = False
                                continue
                        if parse_succeed:
                            self.log('parsed url %s' % model_url)
                            self.dbUtils.put_db_url(model_url)
                    else:
                        self.log('request %s error' % model_url)
                        continue
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            # position of the last '/' in the URL, shifted left by one
            end_pos = url.rfind('/') - 1
            # first '/' from the right within url[0:end_pos], i.e. the
            # second-to-last '/' of the whole URL
            start_pos = url.rfind('/', 0, end_pos)
            # the segment after the second-to-last '/' is the category name
            name = url[start_pos + 1:]
            data_total = 1
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                data_total = a('button.js-load-more').attr('data-total')
                if not data_total:
                    data_total = 1
                if int(data_total) > 0:
                    for page in range(1, int(data_total) + 1):
                        try:
                            cate_url = ('%s?mode=async&action=get_block'
                                        '&block_id=list_albums_common_albums_list'
                                        '&from=%s') % (url, page)
                            if self.dbUtils.get_db_url(cate_url):
                                continue
                            html = self.utils.get_page(cate_url)
                            if html:
                                b = pq(html)
                                items = b('div.masonry_item >a')
                                for item in items.items():
                                    data_p = self.common.parse_item(item)
                                    data_t = {
                                        'name': name,
                                        'url': data_p.get('brief').get('url'),
                                        'refurl': cate_url
                                    }
                                    data = dict(data_t, **data_p)
                                    yield data
                                self.dbUtils.put_db_url(cate_url)
                        except Exception:
                            continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    '''
    urls_genarator
    @author: chenzf
    '''
    def urls_genarator(self):
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            categorys = a('div.masonry_item a')
            for category in categorys.items():
                yield category.attr('href'), category.attr('title')
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.utils.verify = False
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('div.thumbs div.thumb > a')
                processNum = 0
                parse_succeed = True
                for item in items.items():
                    try:
                        name = item.text()
                        model_url = urljoin('https://www.erosberry.com/',
                                            item.attr('href'))
                        html2 = self.utils.get_page(model_url)
                        if html2:
                            b = pq(html2)
                            board = urljoin('https://www.erosberry.com/',
                                            b('div.info > img').attr('src'))
                            modelitems = b('div.girl_thumbs div.container > a')
                            for modelitem in modelitems.items():
                                try:
                                    data_p = self.common.parse_item(modelitem)
                                    data_t = {
                                        'name': name,
                                        'url': model_url,
                                        'board': board,
                                        'refurl': url
                                    }
                                    data = dict(data_t, **data_p)
                                    yield data
                                    processNum += 1
                                except Exception:
                                    parse_succeed = False
                                    continue
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed and processNum > 0:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def urls_genarator(self):
        # the site paginates by thumb offset, 44 thumbs per page
        for i in range(self.start, self.end + 1):
            yield self.url.format(page=i * 44)
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # items
                items = a('a.artwork')
                for item in items.items():
                    modelurl = urljoin('http://www.hegre.com/',
                                       item.attr('href').strip())
                    board = item('img').attr('src')
                    name = item.attr('title')
                    if self.dbUtils.get_db_url(modelurl):
                        continue
                    parse_succeeded = True
                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        b = pq(html2, parser='html')
                        # the films and massages sections share the same item markup
                        for wrapper in ('#films-wrapper div.item',
                                        '#massages-wrapper div.item'):
                            for item_model in b(wrapper).items():
                                try:
                                    data_p = self.common.parse_item(item_model)
                                    data_t = {
                                        'name': self.utils.format_name(name),
                                        'url': modelurl,
                                        'board': board,
                                        'refurl': modelurl
                                    }
                                    data = dict(data_t, **data_p)
                                    yield data
                                except Exception:
                                    parse_succeeded = False
                                    continue
                        self.log('parsed url %s' % modelurl)
                        if parse_succeeded:
                            self.dbUtils.put_db_url(modelurl)
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url, headers={
                "Host": "godsartnudes.com",
                "Upgrade-Insecure-Requests": "1"
            })
            if html:
                a = pq(html)
                # items
                items = a(
                    'div.row.gan-central div.col-xxs-12.col-xs-6.col-sm-4.col-md-3 '
                    'div.Thumb a:last-of-type'
                )
                for item in items.items():
                    name = item.text()
                    # board = item('a img').attr('lsrc') + '.jpg'
                    model_url = urljoin('http://godsartnudes.com',
                                        item.attr('href'))
                    if self.dbUtils.get_db_url(model_url):
                        continue
                    html2 = self.utils.get_page(model_url)
                    if html2:
                        b = pq(html2)
                        modelitems = b(
                            'div.row.spacetop div.col-xxs-12.col-xs-6.col-sm-4.col-md-3 '
                            'div.thumbImage > a:first-child'
                        )
                        parse_succeed = True
                        processNum = 0
                        for modelitem in modelitems.items():
                            try:
                                data_p = self.common.parse_item(modelitem)
                                data_t = {
                                    'name': name,
                                    'url': model_url,
                                    'refurl': url
                                }
                                data = dict(data_t, **data_p)
                                yield data
                                processNum += 1
                            except Exception:
                                parse_succeed = False
                                continue
                        if parse_succeed and processNum > 0:
                            self.log('parsed url %s' % model_url)
                            self.dbUtils.put_db_url(model_url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def urls_genarator(self):
        # one index page per letter of the alphabet, A through Z
        for url in range(ord("A"), ord("Z") + 1):
            yield self.url.format(page=chr(url))
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # items
                items = a('#block-system-main .node-grid')
                for item in items.items():
                    board = item('div.field-type-image img').attr('src')
                    name = item('.grid-meta a').text()
                    modelurl = urljoin('http://www.hegregirls.com/',
                                       item('.grid-meta a').attr('href'))
                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        b = pq(html2, parser='html')
                        items_model = b('#main-content .content .content .grid-4')
                        for item_model in items_model.items():
                            try:
                                # skip entries whose 'about' attribute is not
                                # a gallery link
                                if not re.search('galleries',
                                                 item_model.attr('about')):
                                    continue
                                data_p = self.common.parse_item(item_model)
                                data_t = {
                                    'name': name,
                                    'url': modelurl,
                                    'board': board,
                                    'refurl': url
                                }
                                data = dict(data_t, **data_p)
                                yield data
                            except Exception:
                                continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                # the second-to-last path segment is the gallery name
                name = url.split('/')[-2]
                a = pq(html)
                # items
                items = a('#content div.wrap.wrap2 div.thumbs a')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': name,
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def urls_genarator(self):
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            models = a('ul.bottomLists2 ul li a')
            for model in models.items():
                yield urljoin('http://xnudegirls.com/', model.attr('href'))
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                if not self.dbUtils.get_db_url(url):
                    a = pq(html)
                    items = a(
                        'body > div.main-wrap > div.best-list-block.hide-on-search '
                        '> div.width-wrap > div.thumb-container '
                        'div.pornstar-thumb-container '
                        'div.pornstar-thumb-container__info '
                        'div.pornstar-thumb-container__info-title a'
                    )
                    for item in items.items():
                        model_url_origin = item.attr('href')
                        name = item.text()
                        # resume pagination: probe /N pages until the first
                        # unvisited one, then step back one page
                        index = 1
                        while True:
                            model_url = "%s/%s" % (model_url_origin, index)
                            if index == 1:
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index = index + 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index = index + 1
                                continue
                            break
                        if index > 2:
                            index = index - 1
                            model_url = "%s/%s" % (model_url_origin, index)
                        else:
                            model_url = model_url_origin
                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if html2:
                                if not self.dbUtils.get_db_url(model_url):
                                    board = pq(html2)('div.pornstar-logo img').attr('src')
                                    data_ps, parse_res = self.parse_sub_page(html2)
                                    for data_p in data_ps:
                                        data_t = {
                                            'name': name,
                                            'url': model_url,
                                            'board': board,
                                            'refurl': url
                                        }
                                        data = dict(data_t, **data_p)
                                        yield data
                                    if parse_res:
                                        self.log('parsed url %s' % model_url)
                                        self.dbUtils.put_db_url(model_url)
                                next_url = pq(html2)('li.next a').attr("href")
                                if next_url:
                                    model_url = next_url
                                else:
                                    break
                            else:
                                break
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def parse_sub_page(self, html):
        b = pq(html)
        items = b(
            'body > div.main-wrap > main > div > article '
            '> div.index-videos.mixed-section '
            '> div.thumb-list.thumb-list--sidebar.thumb-list--recent '
            '> div.thumb-list__item.video-thumb a.video-thumb-info__name'
        )
        sub_datas = []
        # None means "no items found"; once any item fails, the flag stays
        # False so a later success cannot mask the failure
        parse_succeeded = None
        for item in items.items():
            try:
                data_p = self.common.parse_item(item)
                sub_datas.append(data_p)
                if parse_succeeded is None:
                    parse_succeeded = True
            except Exception:
                parse_succeeded = False
        return sub_datas, parse_succeeded
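# The two parse_sub_page methods above return (records, flag) where the flag
# is tri-state: None when the page had no items, True only when every item
# parsed, and False as soon as any item failed. Only a truthy flag lets the
# caller mark the URL as done, so an empty or partially parsed page is never
# recorded as crawled. The same contract as a reusable sketch (the function
# and parameter names are hypothetical):
def parse_all_or_nothing(items, parse_one):
    records, succeeded = [], None
    for item in items:
        try:
            records.append(parse_one(item))
            if succeeded is None:
                succeeded = True   # at least one item parsed so far
        except Exception:
            succeeded = False      # one failure taints the whole page
    return records, succeeded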
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            while True:
                html = self.utils.get_page(url)
                if html:
                    a = pq(html)
                    if not self.dbUtils.get_db_url(url):
                        # items
                        items = a('ul.set.sluts_main li')
                        parse_succeed = True
                        for item in items.items():
                            try:
                                name = item('b a').text()
                                board = item('a img').attr('lsrc') + '.jpg'
                                model_url = urljoin('https://www.hqsluts.com/',
                                                    item('b a').attr('href'))
                                html2 = self.utils.get_page(model_url)
                                if html2:
                                    b = pq(html2)
                                    modelitems = b('ul.set.s**t li')
                                    for modelitem in modelitems.items():
                                        try:
                                            data_p = self.common.parse_item(modelitem)
                                            data_t = {
                                                'name': self.utils.format_name(name),
                                                'url': model_url,
                                                'board': board,
                                                'refurl': url
                                            }
                                            data = dict(data_t, **data_p)
                                            yield data
                                        except Exception:
                                            parse_succeed = False
                                            continue
                            except Exception:
                                parse_succeed = False
                                continue
                        if parse_succeed:
                            self.log('parsed url %s' % url)
                            self.dbUtils.put_db_url(url)
                    # follow the "Next Page" link whether or not this page was new
                    next_url = a('#pages li a[count="Next Page"]')
                    if next_url:
                        url = urljoin('https://www.hqsluts.com/',
                                      next_url.attr('href'))
                        self.log('request %s' % url)
                    else:
                        break
                else:
                    self.log('request %s error' % url)
                    continue
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
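# Every parse_page() above is a generator that yields record dicts and ends
# its stream with a None sentinel, and the urls_genarator() methods terminate
# the same way. A minimal driver sketch consuming those contracts (the crawl
# function and the keys it prints are illustrative, not part of the codebase):
def crawl(parser):
    for url in parser.urls_genarator():
        if url is None:
            break  # sentinel: no more URLs
        for record in parser.parse_page(url):
            if record is None:
                break  # sentinel: this page's stream is finished
            print(record.get('name'), record.get('url'))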