class CWebParserSite(CWebParserSingleUrl):
    """Model-list parser: walks each ``a.model_item`` link on a listing
    page and yields one data dict per product on the model's page.

    Yields ``None`` as an end-of-stream / error sentinel.
    """

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        """Generator: yield one product dict per item parsed from *url*.

        Models already recorded in the url db are skipped; a model url is
        recorded only after all of its products parsed cleanly.
        """
        try:
            if not url:
                # End sentinel. Fix: previously execution fell through and
                # requested the falsy url.
                yield None
                return
            html = self.utils.get_page(url, headers={
                "Accept-Encoding": "",
            })
            if html:
                a = pq(html)
                # items
                items = a('a.model_item')
                for item in items.items():
                    modelurl = item.attr('href')
                    name = item('img').attr('alt')
                    board = item('img').attr('src')
                    if self.dbUtils.get_db_url(modelurl):
                        continue  # already parsed this model
                    html = self.utils.get_page(modelurl)
                    if html:
                        b = pq(html)
                        products = b('div.gallery_box a')
                        try:
                            for product in products.items():
                                data_p = self.common.parse_item(product)
                                data_t = {
                                    'name': self.utils.format_name(name),
                                    'url': modelurl,
                                    'board': board,
                                    'refurl': modelurl
                                }
                                data = dict(data_t, **data_p)
                                yield data
                        except Exception:
                            # Best-effort: skip this model, leave it
                            # un-recorded so it is retried next run.
                            continue
                        self.dbUtils.put_db_url(modelurl)
                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    """pornvidhub.com parser: walks model profile pages and yields their
    video data page by page, then downloads boards, stills and videos."""

    def __init__(self, url, start, end, savePath):
        # NOTE(review): super(CWebParserMultiUrl, self) skips
        # CWebParserMultiUrl.__init__ and calls its parent instead; savePath
        # is then set manually. Looks deliberate — confirm.
        super(CWebParserMultiUrl, self).__init__(url, start, end)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        """Generator over parsed model dicts.

        Per-video partial dicts are yielded as they are scraped; the final
        aggregated record per model is persisted via save_info() instead of
        being re-yielded.
        """
        urlsGen = self.urls_genarator()
        while True:
            try:
                url = next(urlsGen)
                if not url:
                    return None  # generator exhausted
                html = self.utils.get_page(url)
                if html:
                    soup = pq(html)
                    contents = soup('.listProfiles li')
                    for content in contents.items():
                        data = {}
                        try:
                            # 'step' tracks progress for the error message.
                            step = 1
                            name = None
                            modelurl = None
                            board = None
                            if content('a.thumb'):
                                modelurl = urljoin(
                                    'http://www.pornvidhub.com/',
                                    content('a.thumb').attr('href'))
                                name = content('a.title').text()
                                board = content('img').attr('src')
                            data['name'] = name
                            data['url'] = modelurl
                            data['board'] = board
                            step = 2
                            if modelurl:
                                videos_all = self.parse_video(modelurl)
                                step = 3
                                while True:
                                    video_one, isLast = next(videos_all)
                                    if not video_one:
                                        step = 4
                                        break
                                    if isLast:
                                        # Complete list: persist, don't re-yield.
                                        data['videos'] = video_one
                                        self.save_info(data)
                                        step = 5
                                    else:
                                        data_temp = deepcopy(data)
                                        data_temp['videos'] = video_one
                                        yield data_temp
                                        step = 6
                            else:
                                data['videos'] = []
                                yield data
                                step = 7
                        except Exception:
                            errMsg = 'error in parse %s , step %s' % (modelurl, step)
                            print(errMsg)
                            self.log(errMsg)
            except Exception:
                print('error in parse %s' % url)
                yield None

    def parse_video(self, url):
        """Generator: yields ([video_item], False) per video while paging
        through the model's list, then (all_items, True) at the end."""
        videos_dict = []
        browser = CWebSpiderUtils(None)
        browser.init_chrome()
        while True:
            html = self.utils.get_page(url)
            page = pq(html)
            items = page('.listThumbs li')
            for item in items.items():
                url = urljoin('http://www.pornvidhub.com/',
                              item('a.title').attr['href'])
                if url:
                    video, still = self.parse_video_detail(url, browser)
                else:
                    video = None
                    still = []
                video_item = {
                    'name': self.utils.format_name(item('a.title').attr['title']),
                    'url': url,
                    'video': video,
                    'stills': still
                }
                yield [video_item], False
                videos_dict.append(video_item)
            # Advance through pagination until there is no next link.
            next_btn = page('span.numbers').nextAll('a.nav')
            if next_btn:
                url = urljoin('http://www.pornvidhub.com/', next_btn.attr['href'])
            else:
                break
        browser.close_chrome()
        yield videos_dict, True
        yield None, False

    def parse_video_detail(self, url, browser):
        """Return (video_src, stills) for one video page rendered in the
        headless browser; (None, []) when the page failed to load."""
        html = browser.get_chrome(url, '#player #html5')
        if not html:
            return None, []
        soup = BeautifulSoup(html, 'lxml')
        Stills = []
        stills = soup.find(id='tabPhotos').find_all('img')
        for still in stills:
            small_url = still.attrs['src']
            # Thumbnails embed a "WxH/" path segment; stripping it yields
            # the full-size url.
            thumb = re.search('.*?/(\d+x\d+/).*?', small_url, re.S)
            if thumb:
                large_url = small_url.replace(thumb.group(1), "")
            else:
                large_url = None
            Stills.append([large_url, small_url])
        soup = pq(html)
        video = None  # fix: was unbound (NameError) when the player node is absent
        video_text = soup('#player #html5')
        if video_text:
            video = video_text.attr['src']
        return video, Stills

    '''
    process_image
    @author: chenzf
    '''

    # @vthread.pool(8)
    def process_data(self, data):
        """Download the model board image, then the model's videos/stills."""
        # self.save_info(data)
        # print(data.get('videos')[0].get('name'))
        board = data.get('board')
        if board:
            self.utils.download_file(
                board, '%s\\%s' % (data.get('name'), data.get('name')))
        self.process_videos(data)

    def process_videos(self, data):
        """Download stills (first available url per pair) and the video file."""
        videos = data.get('videos')
        modelName = data.get('name')
        for item in videos:
            stills = item.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(
                            subVal, '%s\\%s\\%s' %
                            (modelName, item.get('name'), str(i)))
                        break  # first non-empty url of [large, small]
            video = item.get('video')
            if video:
                self.utils.download_file(
                    video, '%s\\%s\\%s' %
                    (modelName, item.get('name'), item.get('name')),
                    headers={'Referer': data.get('url')})
                # NOTE(review): this break exits the item loop after the
                # first downloadable video — confirm the early exit is
                # intended (kept as in the original).
                break
class CWebParserSite(CWebParserSingleUrl):
    """Showcase-site parser: iterates model pages, following
    ``link[rel=next]`` pagination, and yields one record per product."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        """Generator: yield one product dict per item found on *url* and
        all paginated model sub-pages. Yields None as end/error sentinel."""
        try:
            if not url:
                yield None
                return  # fix: don't fall through with an empty url
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a(
                    'div.ts-responsive-wrap div.tshowcase-inner-box div.tshowcase-box-photo > a'
                )
                for item in items.items():
                    modelurl = item.attr('href')
                    modelsearch = modelurl
                    name = item('img').attr('title')
                    board = item('img').attr('src')
                    try:
                        while modelsearch is not None:
                            html = self.utils.get_page(modelsearch)
                            if html:
                                b = pq(html)
                                # fix: the original did `continue` on a db
                                # hit, which re-fetched the same page forever;
                                # now we always fall through to the
                                # link[rel=next] advance below.
                                if not self.dbUtils.get_db_url(modelsearch):
                                    products = b('div.home_box > a')
                                    for product in products.items():
                                        data_p = self.common.parse_item(
                                            product)
                                        data_t = {
                                            'name': self.utils.format_name(name),
                                            'url': modelurl,
                                            'board': board,
                                            'refurl': modelurl
                                        }
                                        data = dict(data_t, **data_p)
                                        yield data
                                    self.dbUtils.put_db_url(modelsearch)
                                nexturl = b('link[rel=next]')
                                if nexturl:
                                    modelsearch = nexturl.attr('href')
                                else:
                                    modelsearch = None
                            else:
                                modelsearch = None
                    except Exception:
                        continue  # best-effort: move on to the next model
                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    """hegre.com model parser: builds one record per model (profile,
    galleries, films, massages) and downloads the associated assets."""

    def __init__(self, url, savePath):
        # NOTE(review): super(CWebParserSingleUrl, self) bypasses
        # CWebParserSingleUrl.__init__ (calls its parent); savePath is set
        # manually afterwards. Kept as-is — confirm intended.
        super(CWebParserSingleUrl, self).__init__(url)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        """Generator: yield one fully-populated model dict per model url.
        Yields None as the end/error sentinel."""
        try:
            urlsGen = self.urls_genarator()
            while True:
                try:
                    # fix: define step before any call that can raise so the
                    # error message below never hits an unbound local.
                    step = 0
                    url = next(urlsGen)
                    if not url:
                        # fix: end sentinel — previously execution fell
                        # through and fetched a None url.
                        yield None
                        return
                    html = self.utils.get_page(url)
                    if html:
                        data = {}
                        soup = BeautifulSoup(html, 'lxml')
                        step = 1
                        # board_image
                        board_image = soup.find('div', class_="board_image")
                        if board_image:
                            name = board_image.find('img').attrs['alt'].strip()
                            board_image = board_image.find('img').attrs['src']
                        else:
                            name = None
                            board_image = None
                        step = 2
                        data['name'] = name
                        data['url'] = url
                        data['board'] = board_image
                        step = 3
                        # poster_image
                        poster_image = soup.find('div', class_="poster_image")
                        if poster_image:
                            poster_image = poster_image.find('img').attrs['src']
                        else:
                            poster_image = None
                        data['poster'] = poster_image
                        step = 4
                        # profile
                        labels = soup.find('div', class_="labels")
                        rows = labels.find_all('div', class_="row")
                        profile = []
                        for row in rows:
                            profile.append(
                                row.get_text().strip().replace('\n', ''))
                        data['profile'] = profile
                        step = 5
                        # products
                        details = soup.find('div', class_="details")
                        counts = details.find('div', class_="counts")
                        items = counts.find_all('a')
                        products = []
                        for item in items:
                            products.append(item.get_text().strip())
                        data['products'] = products
                        step = 6
                        # detail product
                        data['galleries'] = self.parse_galleries(soup)
                        step = 7
                        data['films'] = self.parse_films(soup)
                        step = 8
                        data['massages'] = self.parse_massages(soup)
                        step = 9
                        yield data
                except Exception:
                    errMsg = 'error in parse %s , step %s' % (url, step)
                    self.log(errMsg)
                    print(errMsg)
        except Exception:
            print('error in parse %s' % url)
            yield None
        yield None

    def parse_galleries(self, soup):
        """Collect gallery records from the model page's galleries block."""
        galleries_dict = []
        galleries = soup.find(id='galleries-wrapper')
        if galleries:
            items = galleries.find_all('div', class_='item')
            for item in items:
                # "Month Day, Year" -> Year-Month-Day
                date_item = item.find('small').string.replace(
                    ' ', '-').replace(',', '-').replace('--', '-').split('-')
                date = date_item[2] + '-' + date_item[0] + '-' + date_item[1]
                mid = item.find(
                    'a', attrs={'data-lightbox': "lightbox--poster_image"})
                if not mid:
                    mid_url = None
                else:
                    mid_url = mid.attrs['href']
                large = item.find(
                    'a', attrs={'data-lightbox': "lightbox--board_image"})
                if not large:
                    large_url = None
                else:
                    large_url = large.attrs['href']
                url = urljoin('http://www.hegre.com/',
                              item.find('a').attrs['href'])
                html = self.utils.get_page(url)
                # fix: parse the detail page into its own soup instead of
                # shadowing the caller's `soup` argument.
                detail_soup = BeautifulSoup(html, 'lxml')
                cover = detail_soup.find(
                    'div',
                    class_='non-members',
                    attrs={"style": re.compile('background-image.*?')})
                if cover:
                    cover = re.search('.*?url\((.*?)\)',
                                      cover.attrs['style']).group(1)
                else:
                    cover = None
                galleries_dict.append({
                    'name':
                    self.utils.format_name(item.find('img').attrs['alt']),
                    'url': url,
                    'board': [cover, large_url],
                    'poster': [mid_url, item.find('img').attrs['src']],
                    'date': date
                })
        return galleries_dict

    def parse_massages(self, soup):
        """Collect massage-video records from the model page."""
        massages_dict = []
        massages = soup.find(id='massages-wrapper')
        if massages:
            items = massages.find_all('div', class_='item')
            for item in items:
                large = item.find(
                    'a', attrs={'data-lightbox': "lightbox--board_image"})
                if not large:
                    large_url = None
                else:
                    large_url = large.attrs['href']
                url = urljoin('http://www.hegre.com/',
                              item.find('a').attrs['href'])
                video, date, cover = self.parse_massages_detail(url)
                massages_dict.append({
                    'name':
                    self.utils.format_name(item.find('img').attrs['alt']),
                    'url': url,
                    'board':
                    [cover, large_url, item.find('img').attrs['src']],
                    'poster': [],  # massages have no poster images
                    'video': video,
                    'date': date
                })
        return massages_dict

    def parse_massages_detail(self, url):
        """Return (data, date, board) for a massage page; data holds the
        'full', 'trailer' and 'stills' url lists."""
        data = {}
        html = self.utils.get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        item = soup.find('div', class_="video-player-wrapper")
        board = None
        if item:
            style_text = item.attrs['style']
            board = re.search("url\(\'(.*?)\'\)", style_text, re.S).group(1)
        Full = []
        items = soup.find_all('div', class_="resolution content ")
        for item in items:
            Full.append(item.find('a').attrs['href'])
        data['full'] = Full
        Trailer = []
        items = soup.find_all('div',
                              class_="resolution trailer top-resolution")
        for item in items:
            Trailer.append(item.find('a').attrs['href'])
        data['trailer'] = Trailer
        item = soup.find('div', class_='video-stills')
        Stills = []
        if item:
            stills = item.find_all('div', class_="img-holder")
            for still in stills:
                small = still.find('img')
                if not small:
                    small_url = None
                else:
                    small_url = small.attrs['src']
                large = still.find('a')
                if not large:
                    large_url = None
                else:
                    large_url = large.attrs['href']
                Stills.append([large_url, small_url])
        data['stills'] = Stills
        date = soup.find('span', class_="date").string
        return data, date, board

    def parse_films(self, soup):
        """Collect film records from the model page's films block."""
        films_dict = []
        films = soup.find(id='films-wrapper')
        if films:
            items = films.find_all('div', class_='item')
            for item in items:
                mid = item.find(
                    'a', attrs={'data-lightbox': "lightbox--poster_image"})
                if not mid:
                    mid_url = None
                else:
                    mid_url = mid.attrs['href']
                large = item.find(
                    'a', attrs={'data-lightbox': "lightbox--board_image"})
                if not large:
                    large_url = None
                else:
                    large_url = large.attrs['href']
                url = urljoin('http://www.hegre.com/',
                              item.find('a').attrs['href'])
                video, date, cover = self.parse_films_detail(url)
                films_dict.append({
                    'name':
                    self.utils.format_name(item.find('img').attrs['alt']),
                    'url': url,
                    'board': [cover, large_url],
                    'poster': [mid_url, item.find('img').attrs['src']],
                    'video': video,
                    'date': date
                })
        return films_dict

    def parse_films_detail(self, url):
        """Return (data, date, board) for a film page; like
        parse_massages_detail but with a non-members fallback board."""
        data = {}
        html = self.utils.get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        board = None
        item = soup.find('div', class_="video-player-wrapper")
        if item:
            style_text = item.attrs['style']
            board = re.search("url\(\'(.*?)\'\)", style_text, re.S).group(1)
        if not board:
            # fallback: non-member overlay background image
            item = soup.find('div', class_="content-overlay-wrapper")
            if item:
                style_text = item.select_one(
                    'div[class="non-members"]').attrs['style']
                board = re.search("url\((.*?)\)", style_text, re.S).group(1)
        Full = []
        items = soup.find_all('div', class_="resolution content ")
        for item in items:
            Full.append(item.find('a').attrs['href'])
        data['full'] = Full
        Trailer = []
        items = soup.find_all('div',
                              class_="resolution trailer top-resolution")
        for item in items:
            Trailer.append(item.find('a').attrs['href'])
        data['trailer'] = Trailer
        item = soup.find('div', class_='video-stills')
        Stills = []
        if item:
            stills = item.find_all('div', class_="img-holder")
            for still in stills:
                small = still.find('img')
                if not small:
                    small_url = None
                else:
                    small_url = small.attrs['src']
                large = still.find('a')
                if not large:
                    large_url = None
                else:
                    large_url = large.attrs['href']
                Stills.append([large_url, small_url])
        data['stills'] = Stills
        date = soup.find('span', class_="date").string
        return data, date, board

    '''
    process_image
    @author: chenzf
    '''

    # @vthread.pool(8)
    def process_data(self, data):
        """Persist info.json, download the board/poster images, then every
        gallery, massage and film asset."""
        # print(data)
        dir_name = self.savePath.format(filePath=data.get('name'))
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        with open(dir_name + '\\info.json', 'w') as f:
            json.dump(data, f)
        board = data.get('board')
        if board:
            self.utils.download_file(
                board, '%s\\%s' % (data.get('name'), 'board'))
        poster = data.get('poster')
        if poster:
            self.utils.download_file(
                poster, '%s\\%s' % (data.get('name'), data.get('name')))
        self.process_galleries(data)
        self.process_massages(data)
        self.process_films(data)

    def process_galleries(self, data):
        """Download the first available board and poster url per gallery."""
        galleries = data.get('galleries')
        modelName = data.get('name')
        for item in galleries:
            boards = item.get('board')
            for board in boards:
                if board:
                    self.utils.download_file(
                        board, '%s\\%s\\%s\\%s' %
                        (modelName, 'galleries', item.get('name'), 'board'))
                    break
            posters = item.get('poster')
            for poster in posters:
                if poster:
                    self.utils.download_file(
                        poster, '%s\\%s\\%s\\%s' %
                        (modelName, 'galleries', item.get('name'), 'poster'))
                    break

    def process_massages(self, data):
        """Download boards, posters, stills and the first trailer per massage."""
        massages = data.get('massages')
        modelName = data.get('name')
        for item in massages:
            boards = item.get('board')
            for board in boards:
                if board:
                    self.utils.download_file(
                        board, '%s\\%s\\%s\\%s' %
                        (modelName, 'massages', item.get('name'), 'board'))
                    break
            posters = item.get('poster')
            for poster in posters:
                if poster:
                    self.utils.download_file(
                        poster, '%s\\%s\\%s\\%s' %
                        (modelName, 'massages', item.get('name'), 'poster'))
                    break
            video = item.get('video')
            stills = video.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(
                            subVal, '%s\\%s\\%s\\%s' %
                            (modelName, 'massages', item.get('name'), str(i)))
                        break
            trailers = video.get('trailer')
            for trailer in trailers:
                if trailer:
                    self.utils.download_file(
                        trailer, '%s\\%s\\%s\\%s' %
                        (modelName, 'massages', item.get('name'),
                         item.get('name')))
                    break

    def process_films(self, data):
        """Download boards, posters, stills and the first trailer per film."""
        films = data.get('films')
        modelName = data.get('name')
        for item in films:
            boards = item.get('board')
            for board in boards:
                if board:
                    self.utils.download_file(
                        board, '%s\\%s\\%s\\%s' %
                        (modelName, 'films', item.get('name'), 'board'))
                    break
            posters = item.get('poster')
            for poster in posters:
                if poster:
                    self.utils.download_file(
                        poster, '%s\\%s\\%s\\%s' %
                        (modelName, 'films', item.get('name'), 'poster'))
                    break
            video = item.get('video')
            stills = video.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(
                            subVal, '%s\\%s\\%s\\%s' %
                            (modelName, 'films', item.get('name'), str(i)))
                        break
            trailers = video.get('trailer')
            for trailer in trailers:
                if trailer:
                    self.utils.download_file(
                        trailer, '%s\\%s\\%s\\%s' %
                        (modelName, 'films', item.get('name'),
                         item.get('name')))
                    break

    '''
    urls_genarator
    @author: chenzf
    '''

    def urls_genarator(self):
        """Yield every model url found on the configured listing page,
        then a final None sentinel."""
        html = self.utils.get_page(self.url)
        soup = BeautifulSoup(html, 'lxml')
        item_div = soup.find_all('div', class_="item")
        for item in item_div:
            url = urljoin(
                'http://www.hegre.com/',
                item.find('a', class_='artwork').attrs['href'].strip())
            yield url
        yield None
class CWebParserSite(CWebParserSingleUrl):
    """hegre.com artwork-list parser: yields one record per film and
    massage item on each model page, with url-db based dedup."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        """Generator: yield one item dict per film/massage on every model
        linked from *url*. A model is recorded in the db only when all of
        its items parsed cleanly. Yields None as end/error sentinel."""
        try:
            if url is None:
                yield None
                return  # fix: don't continue with a missing url
            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # items
                items = a('a.artwork')
                for item in items.items():
                    modelurl = urljoin('http://www.hegre.com/',
                                       item.attr('href').strip())
                    board = item('img').attr('src')
                    name = item.attr('title')
                    if self.dbUtils.get_db_url(modelurl):
                        continue  # already parsed
                    bFarseSucceed = True
                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        b = pq(html2, parser='html')
                        # Films and massages share identical handling; the
                        # original duplicated this loop verbatim.
                        for selector in ('#films-wrapper div.item',
                                         '#massages-wrapper div.item'):
                            for item_model in b(selector).items():
                                try:
                                    data_p = self.common.parse_item(item_model)
                                    data_t = {
                                        'name': self.utils.format_name(name),
                                        'url': modelurl,
                                        'board': board,
                                        'refurl': modelurl
                                    }
                                    data = dict(data_t, **data_p)
                                    yield data
                                except Exception:
                                    bFarseSucceed = False
                                    continue
                        self.log('parsed url %s' % modelurl)
                        if bFarseSucceed:
                            self.dbUtils.put_db_url(modelurl)
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    """redtube.com pornstar-list parser: pages through each star's video
    list (resuming from the last un-parsed page) and yields one record
    per video."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        """Generator: yield one video dict per entry on every star linked
        from *url*. Yields None as the end/error sentinel."""
        try:
            if url is None:
                yield None
                return  # fix: stop instead of requesting a None url
            html = self.utils.get_page(url)
            if html:
                if self.dbUtils.get_db_url(url):
                    pass  # listing page fully processed earlier
                else:
                    a = pq(html)
                    items = a('#pornstars_list li.ps_info a')
                    for item in items.items():
                        model_url_origin = urljoin('https://www.redtube.com/',
                                                   item.attr('href'))
                        name = item('img').attr('alt')
                        board = item('img').attr('src')
                        # Skip ahead past pages already recorded in the db.
                        index = 1
                        while True:
                            model_url = "%s?page=%s" % (model_url_origin, index)
                            if index == 1:
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index = index + 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index = index + 1
                                continue
                            break
                        if index > 2:
                            # Resume from the last parsed page (its next
                            # link leads to the first un-parsed one).
                            index = index - 1
                            model_url = "%s?page=%s" % (model_url_origin, index)
                        else:
                            model_url = model_url_origin
                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if html2:
                                if self.dbUtils.get_db_url(model_url):
                                    pass  # page done; still follow next link
                                else:
                                    data_ps, parse_res = self.parse_sub_page(html2)
                                    for data_p in data_ps:
                                        data_t = {
                                            'name': self.utils.format_name(name),
                                            'url': model_url,
                                            'board': board,
                                            'refurl': url
                                        }
                                        data = dict(data_t, **data_p)
                                        yield data
                                    if parse_res:
                                        self.log('parsed url %s' % model_url)
                                        self.dbUtils.put_db_url(model_url)
                                next_url = pq(html2)('#wp_navNext').attr("href")
                                if next_url:
                                    model_url = urljoin(
                                        'https://www.redtube.com/', next_url)
                                else:
                                    break
                            else:
                                break
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def parse_sub_page(self, html):
        """Parse one video-list page.

        Returns (records, all_succeeded); all_succeeded stays None when the
        page contained no items at all (callers treat that as "not parsed").
        """
        b = pq(html)
        items = b('#pornstar_profile_block > .videoblock_list > '
                  '.video_block_wrapper a.video_link')
        sub_datas = []
        parse_successed = None
        for item in items.items():
            try:
                data_p = self.common.parse_item(item)
                sub_datas.append(data_p)
                # fix: only flip None -> True. The original reset a False
                # (earlier failure) back to True on the next success, so a
                # partially-failed page got recorded as fully parsed.
                if parse_successed is None:
                    parse_successed = True
            except Exception:
                parse_successed = False
        return sub_datas, parse_successed
class CWebParserSite(CWebParserMultiUrl):
    """hegre.com gallery-list parser: yields one record per gallery item
    (cover/poster urls plus the detail page's zip download links)."""

    def __init__(self, url, start, end, savePath):
        # NOTE(review): super(CWebParserMultiUrl, self) bypasses
        # CWebParserMultiUrl.__init__ (calls its parent); savePath is set
        # manually. Kept as-is — confirm intended.
        super(CWebParserMultiUrl, self).__init__(url, start, end)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        """Generator: yield one dict per gallery item across all urls from
        urls_genarator(). Yields None as the end/error sentinel."""
        try:
            urlsGen = self.urls_genarator()
            while True:
                url = next(urlsGen)
                if not url:
                    yield None
                    return  # fix: end sentinel — don't fetch a None url
                html = self.utils.get_page(url)
                if html:
                    soup = BeautifulSoup(html, 'lxml')
                    items = soup.find_all('div', class_="item")
                    for item in items:
                        posterImg = item.find(
                            'div', class_='img-holder').find('img').attrs['src']
                        mid = item.find('div', class_='cover-links').find(
                            'a', attrs={'data-lightbox': "lightbox--posterImg"})
                        if not mid:
                            midUrl = None
                        else:
                            midUrl = mid.attrs['href']
                        large = item.find('div', class_='cover-links').find(
                            'a',
                            attrs={'data-lightbox': "lightbox--board_image"})
                        if not large:
                            largeUrl = None
                        else:
                            largeUrl = large.attrs['href']
                        name = item.find(
                            'a', class_='open-in-content-overlay'
                        ).attrs['title'].strip()
                        url = urljoin(
                            'http://www.hegre.com/',
                            item.find('a', class_='open-in-content-overlay').
                            attrs['href'])
                        data = {
                            'name': self.utils.format_name(name),
                            'small': posterImg,
                            'mid': midUrl,
                            'large': largeUrl,
                            'url': url,
                            'detail': self.process_data_detail(url)
                        }
                        yield data
        except Exception:
            print('error in parse %s' % url)
            yield None
        yield None

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page_detail(self, html):
        """Extract board url, zip download links and date from a gallery
        detail page."""
        data = {}
        soup = BeautifulSoup(html, 'lxml')
        board = None
        item = soup.find('div', class_="content-overlay-wrapper")
        if item:
            style_text = item.select_one(
                'div[class="non-members"]').attrs['style']
            board = re.search("url\((.*?)\)", style_text, re.S).group(1)
        data['board'] = board
        DownLoad = []
        items = soup.find_all('div', class_="gallery-zips")
        for item in items:
            DownLoad.append(item.find('a').attrs['href'])
        data['download'] = DownLoad
        data['date'] = soup.find('span', class_="date").string
        return data

    '''
    process_data_detail
    @author: chenzf
    '''

    def process_data_detail(self, url):
        """Fetch *url* and parse its detail page; None when the request fails."""
        detail = None
        html = self.utils.get_page(url)
        if html:
            detail = self.parse_page_detail(html)
        return detail

    '''
    process_data
    @author: chenzf
    '''

    # @vthread.pool(8)
    def process_data(self, data):
        """Persist info.json and download the best available cover images."""
        dir_name = self.savePath.format(filePath=data.get('name'))
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        with open(dir_name + '\\info.json', 'w') as f:
            json.dump(data, f)
        # Largest available variant wins.
        for subkeys in ['large', 'mid', 'small']:
            url = data.get(subkeys)
            if url:
                self.utils.download_file(
                    url, '%s\\%s' % (data.get('name'), data.get('name')))
                break
        detail = data.get('detail')
        # fix: detail is None when the detail request failed — the original
        # crashed here with AttributeError on detail.get('board').
        board = detail.get('board') if detail else None
        if board:
            self.utils.download_file(
                board, '%s\\%s' % (data.get('name'), 'cover'))
        elif data.get('mid'):
            board = data.get('mid')
            self.utils.download_file(
                board, '%s\\%s' % (data.get('name'), 'cover'))
class CWebParserSite(CWebParserSingleUrl):
    """hqsluts.com parser: pages through the listing and yields one record
    per item on each model's page."""

    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        """Generator: yield one item dict per model entry, following the
        "Next Page" pagination. Yields None as the end/error sentinel."""
        try:
            if url is None:
                yield None
                return  # fix: stop on the end sentinel
            while True:
                html = self.utils.get_page(url)
                if html:
                    # fix: parse before the db check — the original only
                    # assigned `a` in the not-in-db branch, so the
                    # pagination lookup below hit an unbound (or stale) `a`
                    # when the page was already recorded.
                    a = pq(html)
                    if self.dbUtils.get_db_url(url):
                        pass  # page already parsed; still follow pagination
                    else:
                        # items
                        items = a('ul.set.sluts_main li')
                        parse_succeed = True
                        for item in items.items():
                            try:
                                name = item('b a').text()
                                board = item('a img').attr('lsrc') + '.jpg'
                                model_url = urljoin(
                                    'https://www.hqsluts.com/',
                                    item('b a').attr('href'))
                                html2 = self.utils.get_page(model_url)
                                if html2:
                                    b = pq(html2)
                                    modelitems = b('ul.set.s**t li')
                                    for modelitem in modelitems.items():
                                        try:
                                            data_p = self.common.parse_item(
                                                modelitem)
                                            data_t = {
                                                'name':
                                                self.utils.format_name(name),
                                                'url': model_url,
                                                'board': board,
                                                'refurl': url
                                            }
                                            data = dict(data_t, **data_p)
                                            yield data
                                        except Exception:
                                            parse_succeed = False
                                            continue
                            except Exception:
                                parse_succeed = False
                                continue
                        if parse_succeed:
                            self.log('parsed url %s' % url)
                            self.dbUtils.put_db_url(url)
                    next_url = a('#pages li a[count="Next Page"]')
                    if next_url:
                        url = urljoin('https://www.hqsluts.com/',
                                      next_url.attr('href'))
                        self.log('request %s' % url)
                    else:
                        break
                else:
                    self.log('request %s error' % url)
                    # NOTE(review): retries the same url indefinitely on a
                    # persistent request failure — confirm intended.
                    continue
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserHunterDb(CWebParser):
    """Re-parses records stored in the Hunter Mongo collection: fetches
    each stored url and yields an enriched data dict."""

    def __init__(self, url, savePath, start, len, parseOnly):
        # super(CWebParserSingleUrl, self).__init__(url)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)
        self.common = CWebParserHunterCommon(savePath, parseOnly)
        self.parseOnly = parseOnly
        self.start = start
        self.len = len

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        """Generator: yield one enriched dict per db record in the
        configured [start, start+len) slice. Yields None at the end."""
        db_count = 0
        url = None  # fix: defined for the error log even before first item
        while True:
            try:
                for item in self.common.get_db_item(self.start, self.len):
                    try:
                        url = item.get('url')
                        board = item.get('board')
                        discrib = item.get('discrib')
                        # pq(url) fetches and parses the remote page.
                        b = pq(url)
                        art_site_info = b('#breadcrumbs li')
                        info_string = []
                        for it in art_site_info.items():
                            info_string.append(it.text())
                        # If fewer than 3 crumbs, site/model/name stay
                        # unbound and the NameError below skips the item
                        # (kept from the original behavior).
                        if len(info_string) >= 3:
                            site, model, name = (info_string[0],
                                                 info_string[1],
                                                 info_string[2])
                        video = None
                        stills = []  # fix: was unbound on the video branch,
                                     # silently skipping every video record
                        video_item = b('video')
                        if video_item:
                            src = []
                            for src_item in video_item('source').items():
                                src.append(src_item.attr('src'))
                            video = {
                                'src': src,
                                'poster': video_item.attr('poster')
                            }
                        else:
                            previews = b('ul.gallery-b li')
                            for preview in previews.items():
                                stills.append([preview('a').attr('href'),
                                               preview('img').attr('src')])
                        data = {
                            'site': site,
                            'name': self.utils.format_name(name),
                            'model': self.utils.format_name(model),
                            'discrib': self.utils.format_name(discrib),
                            'board': board,
                            'url': url,
                            'stills': stills,
                            'video': video
                        }
                        db_count += 1
                        print('current db index %s' % db_count)
                        yield data
                    except Exception:
                        self.log('error in parse item %s' % url)
                        continue
                # fix: one clean pass over the slice ends the loop; the
                # original while-True re-ran the same query forever and the
                # final sentinel below was unreachable.
                break
            except Exception:
                self.log('error in parse url %s' % url)
                yield None
        yield None

    '''
    process_image
    @author: chenzf
    '''

    def process_data(self, data):
        """Delegate asset downloading to the shared Hunter helper."""
        self.common.process_data(data)
class CWebParserHunterCommon(object):
    """Shared helpers for the Hunter parsers: item parsing, Mongo-backed
    dedup storage, and asset downloading."""

    def __init__(self, savePath, parseOnly):
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)
        self.dbclient = pymongo.MongoClient("mongodb://localhost:27017/")
        self.dbname = self.dbclient["HegreHunter"]
        self.dbcol = self.dbname["datas"]
        self.parseOnly = parseOnly

    def parse_item(self, item):
        """Parse one gallery/video item.

        When parseOnly == 1, build a light record and upsert it into Mongo;
        otherwise fetch the item's page and build the full record.
        """
        url = item('a').attr('href')
        discrib = item('a').attr('title')
        if not discrib:
            discrib = item('img').attr('alt')
        # Urls out of the img srcset attribute.
        # fix: the char class was [a-zA-z], which also matches [\]^_`
        result = re.findall('[a-zA-Z]+://[^\s]*',
                            str(item('img').attr('srcset')))
        if self.parseOnly == 1:
            data = None
            try:
                data = {
                    'discrib': self.utils.format_name(discrib),
                    'board': [
                        result[0] if result and len(result) >= 2 else None,
                        item('img').attr('src'),
                        result[1] if result and len(result) >= 2 else None
                    ],
                    'url': url,
                }
                datatmp = deepcopy(data)
                if self.dbcol.find_one(datatmp):
                    print('a db record already exist!')
                else:
                    print('insert a db record!')
                    self.dbcol.insert_one(datatmp)
            except Exception as e:
                print('database error')
                print(e)
            return data
        else:
            # pq(url) fetches and parses the remote page.
            b = pq(url)
            art_site_info = b('#breadcrumbs li')
            info_string = []
            for it in art_site_info.items():
                info_string.append(it.text())
            if len(info_string) >= 3:
                site, model, name = (info_string[0], info_string[1],
                                     info_string[2])
            video = None
            stills = []  # fix: was unbound when the page has a <video>
            video_item = b('video')
            if video_item:
                src = []
                for src_item in video_item('source').items():
                    src.append(src_item.attr('src'))
                video = {
                    'src': src,
                    'poster': video_item.attr('poster')
                }
            else:
                previews = b('ul.gallery-b li')
                for preview in previews.items():
                    stills.append([preview('a').attr('href'),
                                   preview('img').attr('src')])
            data = {
                'site': site,
                'name': self.utils.format_name(name),
                'model': self.utils.format_name(model),
                'discrib': self.utils.format_name(discrib),
                'board': [
                    result[0] if result and len(result) >= 2 else None,
                    item('img').attr('src'),
                    result[1] if result and len(result) >= 2 else None
                ],
                'url': url,
                'stills': stills,
                'video': video
            }
            return data

    def process_data(self, data):
        """Persist info.json next to the asset dir and download the board,
        then either the video + poster or the stills."""
        # print(data)
        if data.get('video'):
            sub_dir_name = "%s\\%s\\films\\%s %s" % (
                data.get('site'), data.get('model'),
                data.get('model'), data.get('name'))
        else:
            sub_dir_name = "%s\\%s\\galleries\\%s %s" % (
                data.get('site'), data.get('model'),
                data.get('model'), data.get('name'))
        dir_name = self.savePath.format(filePath=sub_dir_name)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        with open(dir_name + '\\..\\info.json', 'w') as f:
            json.dump(data, f)
        boards = data.get('board')
        for board in boards:
            if board:
                self.utils.download_file(
                    board,
                    '%s\\%s' % (sub_dir_name, data.get('name')),
                    headers={'Referer': data.get('url')})
                break  # first available variant only
        if data.get('video'):
            for src in data.get('video').get('src'):
                self.utils.download_file(
                    src,
                    '%s\\%s' % (sub_dir_name, data.get('name')),
                    headers={'Referer': data.get('url')})
                break  # first source only
            self.utils.download_file(
                data.get('video').get('poster'),
                '%s\\%s' % (sub_dir_name, data.get('name')),
                headers={'Referer': data.get('url')})
        else:
            stills = data.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(
                            subVal,
                            '%s\\%s' % (sub_dir_name, str(i)),
                            headers={'Referer': data.get('url')})
                        break  # first non-empty url of [large, small]

    def get_db_item(self, start, len):
        """Return a cursor over the db slice [start, start + len)."""
        return self.dbcol.find()[int(start):(int(start) + int(len))]
class CWebParserSite(CWebParserMultiUrl):
    """Scraper for hegregirls.com model pages.

    Walks a generated URL range, parses each model index entry plus its
    detail page (board/poster/profile), expands galleries, films and
    massages sub-sections, and downloads the referenced assets.
    """

    def __init__(self, url, start, end, savePath):
        # NOTE(review): super(CWebParserMultiUrl, self) skips
        # CWebParserMultiUrl.__init__ and calls its parent instead —
        # confirm this is intended (usually super(CWebParserSite, self)).
        super(CWebParserMultiUrl, self).__init__(url, start, end)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)

    '''
    parse_page
    @author: chenzf
    '''
    def parse_page(self):
        """Generator yielding one data dict per model found on each URL.

        Each dict carries name/url/nick/products plus board, poster,
        profile, galleries, films and massages from the detail page.
        Yields None on error or exhausted URLs.
        """
        urlsGen = self.urls_genarator()
        while True:
            try:
                url = next(urlsGen)
                if not url:
                    # NOTE(review): execution falls through after this yield,
                    # so get_page(None) is still attempted — confirm intended.
                    yield None
                html = self.utils.get_page(url)
                if html:
                    soup = BeautifulSoup(html, 'lxml')
                    contents = soup.select('#block-system-main .node-grid .grid-meta')
                    for content in contents:
                        data = {}
                        try:
                            # 'step' tracks progress for the error message below.
                            step = 1
                            name = None
                            modelurl = None
                            if content.find('a'):
                                name = content.find('a').text
                                modelurl = urljoin('http://www.hegregirls.com/', content.find('a').attrs['href'])
                            nick = None
                            if content.find('div', class_='nick'):
                                nick = content.find('div', class_='nick').text
                            products = None
                            if content.find('span', class_='stats'):
                                products = content.find('span', class_='stats').text
                            data['name'] = name
                            data['url'] = modelurl
                            data['nick'] = nick
                            data['products'] = products
                            step = 2
                            htmlModel = self.utils.get_page(modelurl)
                            if htmlModel:
                                soupModel = BeautifulSoup(htmlModel, 'lxml')
                                step = 3
                                board_image = soupModel.find('div', class_="field-name-model-board")
                                if board_image:
                                    board_image = board_image.find('img').attrs['src']
                                    data['board'] = board_image
                                step = 4
                                #poster_image
                                poster_image = soupModel.find('div', class_="box border")
                                if poster_image:
                                    poster_image = poster_image.find('img').attrs['src']
                                    data['poster'] = poster_image
                                step = 5
                                #profile
                                # NOTE(review): same 'box border' div as the
                                # poster; raises if absent (caught below).
                                labels = soupModel.find('div', class_="box border")
                                rows = labels.find_all('li')
                                profile = []
                                for row in rows:
                                    profile.append(row.get_text().strip().replace('\n', ''))
                                data['profile'] = profile
                                step = 6
                                #detail product
                                data['galleries'] = self.parse_galleries(soupModel)
                                step = 7
                                data['films'] = self.parse_films(soupModel)
                                step = 8
                                data['massages'] = self.parse_massages(soupModel)
                                step = 9
                                yield data
                        except:
                            # Per-model recovery: log which step failed and
                            # continue with the next model.
                            errMsg = 'error in parse %s , step %s' % (modelurl, step)
                            print(errMsg)
                            self.log(errMsg)
            except:
                # Per-URL recovery (also catches StopIteration from next()).
                print('error in parse %s' % url)
                yield None
        # Unreachable: the while loop above never exits normally.
        yield None

    def parse_galleries(self, soup):
        """Collect gallery entries ('galleries' in the about attr) from a
        model page, expanding each free-preview link into stills info."""
        galleries_dict = []
        items = soup.select('#main-content .content .content .grid-4')
        for item in items:
            if re.search('galleries', item.attrs['about']):
                # Release date is split across several text nodes; join them.
                date = ""
                for s in item.find(class_="release-date").strings:
                    date += s
                mid = item.find('img')
                if not mid:
                    mid_url = None;
                else:
                    mid_url = mid.attrs['src']
                if mid:
                    # Large image URL lives in the rel attribute of the
                    # thumbnail's parent anchor.
                    large = mid.parent
                    large_url = large.attrs['rel'][0]
                else:
                    large_url = None;
                url = urljoin('http://www.hegregirls.com/', item.attrs['about'])
                detailurl = item.select('.preview-link a')
                if detailurl:
                    detailurl = detailurl[0]
                    freeurl = urljoin('http://www.hegregirls.com/', detailurl.attrs['href'])
                    info, poster= self.parse_galleries_detail(freeurl)
                else:
                    info = None
                    poster = []
                galleries_dict.append({
                    'name' : self.utils.format_name(item.find('img').attrs['title']),
                    'url' : url,
                    'board' : [large_url, mid_url],
                    'poster': poster,
                    'info' : info,
                    'date' : date
                })
        return galleries_dict

    def parse_galleries_detail(self, url):
        """Parse a gallery preview page.

        Returns (data, poster) where data['stills'] is a list of
        [large_url, small_url] pairs and poster is [board, board_small].
        """
        data = {}
        html = self.utils.get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        board = soup.select('#preview-board img')
        if board:
            board = board[0].attrs['src']
        # NOTE(review): if the selector matched nothing, 'board' stays an
        # empty list (not None) inside 'poster' — confirm downstream handling.
        board_small = soup.find('div', class_='grid-12 alpha')
        if board_small:
            board_small = board_small.find('img')
            if board_small:
                board_small = board_small.attrs['src']
        poster = [board, board_small]
        Stills = []
        stills = soup.find_all('div', class_="grid-4")
        for still in stills:
            small = still.find('img')
            if not small:
                small_url = None;
            else:
                small_url = small.attrs['src']
            large = still.find('a')
            if not large:
                large_url = None;
            else:
                large_url = large.attrs['href']
            Stills.append([ large_url, small_url ])
        data['stills'] = Stills
        return data, poster

    def parse_massages(self, soup):
        """Collect massage entries ('massages' in the about attr); detail
        pages are parsed with the same logic as films."""
        massages_dict=[]
        items = soup.select('#main-content .content .content .grid-4')
        for item in items:
            if re.search('massages', item.attrs['about']):
                mid = item.find('img')
                if not mid:
                    mid_url = None;
                else:
                    mid_url = mid.attrs['src']
                large = item.find(class_='hegre-poster-zoom')
                if large:
                    large_url = large.attrs['href']
                else:
                    large_url = None;
                url = urljoin('http://www.hegregirls.com/', item.attrs['about'])
                video, poster= self.parse_massages_detail(url)
                massages_dict.append({
                    'name' : self.utils.format_name(item.find('img').attrs['alt']),
                    'url' : url,
                    'board' : [large_url, mid_url],
                    'poster' : poster,
                    'video' : video,
                })
        return massages_dict

    def parse_massages_detail(self, url):
        # Massage detail pages share the film detail page structure.
        return self.parse_films_detail(url)

    def parse_films(self, soup):
        """Collect film entries ('films' in the about attr) with their
        trailer/poster details."""
        films_dict = []
        items = soup.select('#main-content .content .content .grid-4')
        for item in items:
            if re.search('films', item.attrs['about']):
                mid = item.find('img')
                if not mid:
                    mid_url = None;
                else:
                    mid_url = mid.attrs['src']
                large = item.find(class_='hegre-poster-zoom')
                if large:
                    large_url = large.attrs['href']
                else:
                    large_url = None;
                url = urljoin('http://www.hegregirls.com/', item.attrs['about'])
                video, poster= self.parse_films_detail(url)
                films_dict.append({
                    # Film title comes from a <span content="..."> attribute.
                    'name' : self.utils.format_name(item.find('span').attrs['content']),
                    'url' : url,
                    'board' : [large_url, mid_url],
                    'poster' : poster,
                    'video' : video,
                })
        return films_dict

    def parse_films_detail(self, url):
        """Parse a film/massage detail page.

        Returns (data, poster): data has 'full' (always empty), 'trailer'
        (first <source> src if present) and 'stills' (always empty);
        poster lists the video poster and/or feature image URLs found.
        """
        data = {}
        html = self.utils.get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        poster = []
        item = soup.find('video', class_="hegre-video")
        if item:
            poster.append(item.attrs['poster'])
        item = soup.find('div',class_='video-feature')
        if item:
            img = item.find('img')
            if img:
                poster.append(img.attrs['src'])
        # NOTE(review): 'full' and 'stills' are always empty lists here —
        # placeholders for data this parser does not extract.
        Full = []
        data['full'] = Full
        Trailer = []
        video = soup.find('source')
        if video:
            Trailer.append(video.attrs['src'])
        data['trailer'] = Trailer
        Stills = []
        data['stills'] = Stills
        return data, poster

    '''
    process_image
    @author: chenzf
    '''
    # @vthread.pool(8)
    def process_data(self, data):
        """Persist one model's data: write info.json into the model
        directory, download board/poster images, then delegate each
        product type to its dedicated download helper."""
        # print(data)
        dir_name = self.savePath.format(filePath=data.get('name'))
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # Windows-style path separators throughout this class.
        with open(dir_name + '\\info.json', 'w') as f:
            json.dump(data, f)
        board = data.get('board')
        if board:
            self.utils.download_file(board, '%s\\%s' % (data.get('name'), 'board') )
        poster = data.get('poster')
        if poster:
            self.utils.download_file(poster, '%s\\%s' % (data.get('name'), data.get('name')) )
        self.process_galleries(data)
        self.process_massages(data)
        self.process_films(data)

    def process_galleries(self, data):
        """Download board, every poster and the stills of each gallery
        under <model>\\galleries\\<gallery name>\\."""
        galleries = data.get('galleries')
        modelName = data.get('name')
        for item in galleries:
            boards = item.get('board')
            for board in boards:
                if board:
                    # First non-empty of [large_url, mid_url] only.
                    self.utils.download_file(board, '%s\\%s\\%s\\%s' % (modelName, 'galleries', item.get('name'), 'board') )
                    break
            posters = item.get('poster')
            # Unlike boards, ALL posters are downloaded, numbered from 1.
            for i, val in enumerate(posters, start=1):
                # for poster in posters:
                if val:
                    self.utils.download_file(val, '%s\\%s\\%s\\%s_%s' % (modelName, 'galleries', item.get('name'), 'poster', str(i)) )
            info = item.get('info')
            if info:
                stills = info.get('stills')
                for i, val in enumerate(stills, start=1):
                    for subVal in val:
                        if subVal:
                            # First non-empty URL of each [large, small] pair.
                            self.utils.download_file(subVal, '%s\\%s\\%s\\%s' % (modelName, 'galleries', item.get('name'), str(i)) )
                            break

    def process_massages(self, data):
        """Download board, first poster, stills and first trailer of each
        massage under <model>\\massages\\<name>\\."""
        massages = data.get('massages')
        modelName = data.get('name')
        for item in massages:
            boards = item.get('board')
            for board in boards:
                if board:
                    self.utils.download_file(board, '%s\\%s\\%s\\%s' % (modelName, 'massages', item.get('name'), 'board') )
                    break
            posters = item.get('poster')
            for poster in posters:
                if poster:
                    self.utils.download_file(poster, '%s\\%s\\%s\\%s' % (modelName, 'massages', item.get('name'), 'poster') )
                    break
            video = item.get('video')
            # NOTE(review): no None-check on video before .get() — parse
            # always supplies a dict, but a missing key would raise here.
            stills = video.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(subVal, '%s\\%s\\%s\\%s' % (modelName, 'massages', item.get('name'), str(i)) )
                        break
            trailers = video.get('trailer')
            for trailer in trailers:
                if trailer:
                    self.utils.download_file(trailer, '%s\\%s\\%s\\%s' % (modelName, 'massages', item.get('name'), item.get('name')) )
                    break

    def process_films(self, data):
        """Download board, first poster, stills and first trailer of each
        film under <model>\\films\\<name>\\ (mirrors process_massages)."""
        films = data.get('films')
        modelName = data.get('name')
        for item in films:
            boards = item.get('board')
            for board in boards:
                if board:
                    self.utils.download_file(board, '%s\\%s\\%s\\%s' % (modelName, 'films', item.get('name'), 'board') )
                    break
            posters = item.get('poster')
            for poster in posters:
                if poster:
                    self.utils.download_file(poster, '%s\\%s\\%s\\%s' % (modelName, 'films', item.get('name'), 'poster') )
                    break
            video = item.get('video')
            stills = video.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(subVal, '%s\\%s\\%s\\%s' % (modelName, 'films', item.get('name'), str(i)) )
                        break
            trailers = video.get('trailer')
            for trailer in trailers:
                if trailer:
                    self.utils.download_file(trailer, '%s\\%s\\%s\\%s' % (modelName, 'films', item.get('name'), item.get('name')) )
                    break