def video_page(self, response: HtmlResponse):
    """Parse a single video page.

    Extracts the title and channel, evaluates the inline player script
    with js2py to recover the quality/url list, and yields a
    ``PornhubItem`` for videos of at least 720p.  Optionally records the
    video in the database when the ``ENABLE_SQL`` setting is on.
    """
    video_title = response.css('h1.title').css('span::text').get()
    video_channel = response.css('div.video-actions-container').css(
        'div.usernameWrap.clearfix').css('a::text').get()
    player = response.css('div.video-wrapper').css('#player')
    js = player.css('script').get()
    data_video_id = player.css('::attr(data-video-id)').get()
    # The inline <script> defines a qualityItems_<id> variable; strip the
    # surrounding tag and the trailing loader call, then evaluate what is
    # left and read that variable back.
    prepare_js = js.split('<script type="text/javascript">')[1].split(
        'loadScriptUniqueId')[0]
    exec_js = '{0}\nqualityItems_{1};'.format(prepare_js, data_video_id)
    js_result = js2py.eval_js(exec_js)  # type: js2py.base.JsObjectWrapper
    quality_items = js_result.to_list()  # type: list
    # Entries are ordered low -> high quality; the last one is the best.
    quality = quality_items[-1]['text'].split('p')[0]
    if int(quality) < 720:
        # BUG FIX: previously video_url was only bound in the >=720 branch
        # while the final yield referenced it unconditionally, raising a
        # NameError for low-quality videos.  Skip them explicitly instead.
        self.logger.info('skip low quality (%sp) video: %s',
                         quality, video_title)
        return
    video_url = quality_items[-1]['url']
    self.logger.info('parse [%s] success, url: %s', video_title, video_url)
    if self.settings.get('ENABLE_SQL'):
        result = self.data_base.select_all_by_title_my_follow(video_title)
        if len(result) != 0:
            for line in result:
                self.logger.error('has duplicate record: %s', line)
        else:
            self.data_base.save_my_follow(video_title, video_channel,
                                          video_url, response.url)
    yield PornhubItem(file_urls=video_url,
                      file_name=video_title,
                      file_channel=video_channel)
def content(self, response):
    """Yield one ``PornhubItem`` per playable media definition.

    Locates the ``var flashvars...;`` JSON blob in the page, parses it,
    and emits an item for every mediaDefinition with a non-empty url.
    """
    # BUG FIX: the original reused a single PornhubItem instance across
    # iterations, mutating an already-yielded item; each yield now gets a
    # fresh instance.  The leftover debug print(response) was removed.
    info = re.search('var flashvars(.*)=(.*?);\n',
                     Selector(response).extract()).group()
    result = json.loads(re.findall(r'(\{.*?\});', info)[0])
    media_definitions = result.get('mediaDefinitions')
    for definition in media_definitions:
        video_url = definition['videoUrl']
        if video_url != '':  # skip placeholder entries with an empty url
            item = PornhubItem()
            item['file_urls'] = [video_url]
            item['name'] = result.get('video_title')
            yield item
def parse_ph_info(self, response):
    """Parse the flashvars JSON embedded in a video page.

    Yields a ``PornhubItem`` populated with duration, title, image url,
    link url and the 480p stream url.
    """
    phItem = PornhubItem()
    # BUG FIX: response.body is bytes in Scrapy; matching it against a str
    # pattern raises TypeError on Python 3 — use the decoded response.text.
    _ph_info = re.findall(r'var flashvars =(.*?),\n', response.text)
    _ph_info_json = json.loads(_ph_info[0])
    # Copy each field of interest straight from the parsed JSON; missing
    # keys simply become None via .get().
    for field in ('video_duration', 'video_title', 'image_url',
                  'link_url', 'quality_480p'):
        phItem[field] = _ph_info_json.get(field)
    yield phItem
def parse(self, response):
    """Parse a category listing page.

    Yields one ``PornhubItem`` per video card, then follows the "next
    page" link for at most 20 pages (tracked via ``self.count``).
    """
    for wrap in response.css("#videoCategory .wrap"):
        item = PornhubItem()
        item["imageUrl"] = wrap.css("img::attr(data-thumb_url)").extract()[0]
        item["linkUrl"] = ("https://www.pornhub.com" +
                           wrap.css(".title a::attr(href)").extract()[0])
        item["name"] = wrap.css(".title a::text").extract()[0]
        item["playNum"] = wrap.css(
            ".videoDetailsBlock var::text").extract()[0]
        item["recommendation"] = wrap.css(
            ".videoDetailsBlock .value::text").extract()[0]
        item["time"] = wrap.css(".duration::text").extract()[0]
        yield item
    # BUG FIX: extract()[0] raised IndexError on the last page, where no
    # next-page link exists; guard the pagination lookup instead.
    next_hrefs = response.css(".page_next a::attr(href)").extract()
    if next_hrefs and self.count < 20:  # stop after 20 pages
        self.count += 1
        yield scrapy.Request("https://www.pornhub.com" + next_hrefs[0],
                             callback=self.parse)
def parse_ph_info(self, response):
    """Parse the flashvars JSON of a video page and yield a ``PornhubItem``.

    Extracts duration, title, image url, link url and the 480p stream url,
    logging the parsed values along the way.
    """
    phItem = PornhubItem()
    selector = Selector(response)
    # logging.info(selector)
    _ph_info = re.findall(r'var flashvars =(.*?),\n', selector.extract())
    logging.debug('PH信息的JSON:')
    logging.debug(_ph_info)
    _ph_info_json = json.loads(_ph_info[0])
    duration = _ph_info_json.get('video_duration')
    phItem['video_duration'] = duration
    title = _ph_info_json.get('video_title')
    phItem['video_title'] = title
    image_url = _ph_info_json.get('image_url')
    phItem['image_url'] = image_url
    link_url = _ph_info_json.get('link_url')
    phItem['link_url'] = link_url
    quality_480p = _ph_info_json.get('quality_480p')
    phItem['quality_480p'] = quality_480p
    # BUG FIX: string concatenation raised TypeError when a field was not
    # a str (e.g. a numeric duration); lazy %s formatting handles any type.
    logging.info('duration:%s title:%s image_url:%s link_url:%s',
                 duration, title, image_url, link_url)
    yield phItem
def video_page(self, response: HtmlResponse):
    """Parse a single video page.

    If a "Watch Full Video" button is present, follow it (unless it is a
    paid "Buy Full Video" teaser).  Otherwise evaluate the inline player
    script with js2py and yield a ``PornhubItem`` for the best available
    quality, skipping 240p/480p streams.
    """
    # some video has "Watch Full Video" button
    full_video_button = response.css("#trailerFullLengthDownload")
    video_title = response.css('h1.title').css('span::text').get()
    video_channel = response.css('div.video-actions-container').css(
        'div.usernameWrap.clearfix').css('a::text').get()
    if full_video_button:
        button_title = full_video_button.css('::attr(data-title)').get()
        if button_title != 'Buy Full Video':
            full_url = full_video_button.css('::attr(href)').get()
            self.logger.info('%s detected full video, original name: %s',
                             video_channel, video_title)
            # Re-enter this callback on the full-length video page.
            yield scrapy.Request(full_url,
                                 callback=self.video_page,
                                 priority=100)
        else:
            self.logger.info('%s detected buy video, drop', video_channel)
    else:
        self.logger.info('get model: %s, title: %s',
                         video_channel, video_title)
        player_id_element = response.css('#player')
        js = player_id_element.css('script').get()
        data_video_id = player_id_element.css(
            '::attr(data-video-id)').get()
        # Strip the <script> wrapper and trailing loader call, then read
        # back the qualityItems_<id> variable the script defines.
        prepare_js = js.split('<script type="text/javascript">')[1].split(
            'loadScriptUniqueId')[0]
        exec_js = '{0}\nqualityItems_{1};'.format(prepare_js, data_video_id)
        js_result = js2py.eval_js(
            exec_js)  # type: js2py.base.JsObjectWrapper
        quality_items = js_result.to_list()  # type: list
        quality = quality_items[-1]['text']
        # BUG FIX: the original condition `quality != '240p' or quality !=
        # '"480p"'` was always True (an `or` of two inequalities, one of
        # them against a string containing literal quote characters).  The
        # intent is to drop low-quality 240p/480p videos.
        if quality not in ('240p', '480p'):
            video_url = quality_items[-1]['url']
            yield PornhubItem(file_urls=video_url,
                              file_name=video_title,
                              file_channel=video_channel,
                              parent_url=response.url)
def video_content(self, response):
    """Parse one video detail page and yield a fully populated item.

    Reads (tag, duration) from the request meta, scrapes title / view
    count / tags / rating / thumbnail / screenshot urls, and extracts the
    best available stream url (720p, then 480p, then 240p).  Failed url
    extractions are appended to ``erorr_request.txt``.
    """
    tag, duration = response.meta.get('item')
    item = PornhubItem()
    link_url = response.url

    def _xpath_first(query):
        # .get() already returns None when nothing matches; the guard only
        # protects against unexpected selector errors (original used bare
        # try/except around each lookup).
        try:
            return response.xpath(query).get()
        except Exception:
            return None

    title = _xpath_first('//span[@class="inlineFree"]/text()')
    count = _xpath_first('//span[@class="count"]/text()')
    try:
        video_tags = ','.join(
            response.xpath(
                '//div[@class="categoriesWrapper"]/a//text()').getall())
    except Exception:
        video_tags = None
    percent = _xpath_first('//span[@class="percent"]/text()')
    img_url = _xpath_first('//meta[@property="og:image"]/@content')
    # Screenshot sprite: "urlPattern" embeds S{N}; split at 'S{' and expand
    # into the N-1 numbered jpg urls.  May legitimately be absent.
    try:
        video_screenshot_img = re.findall(
            '"urlPattern":"(.*?)","thumbHeig', response.text)[0]
        num = int(re.findall(r'S{(\d+)}', video_screenshot_img)[0])
        start_video_img = video_screenshot_img.split('S{')[0]
        video_screenshot_imgs = [
            start_video_img + 'S{}.jpg'.format(i) for i in range(1, num)
        ]
    except Exception:
        video_screenshot_imgs = None
    # 1080p only appears when logged in (cookies middleware required), and
    # downloading the stream needs proper request headers or it 403s.
    # if '"quality":"1080"' in response.text:
    #     video_url = re.findall('"quality":"1080","videoUrl":"(.*?)"},', response.text, re.S | re.I)[0]
    # BUG FIX: video_url was undefined both when no quality matched and on
    # the except path (its None assignment was commented out), raising a
    # NameError at item['video_url'] below.
    video_url = None
    try:
        if '"quality":"720"' in response.text:
            video_url = re.findall('"quality":"720","videoUrl":"(.*?)"},',
                                   response.text, re.S | re.I)[0]
        elif '"quality":"480"' in response.text:
            video_url = re.findall('"quality":"480","videoUrl":"(.*?)"},',
                                   response.text, re.S | re.I)[0]
        elif '"quality":"240"' in response.text:
            video_url = re.findall('"quality":"240","videoUrl":"(.*?)"},',
                                   response.text, re.S | re.I)[0]
    except Exception:
        # BUG FIX: writing `title + ','` crashed with TypeError when the
        # title lookup had failed; fall back to an empty string.
        with open('erorr_request.txt', 'a') as f:
            f.write((title or '') + ',' + link_url)
            f.write('\n')
    item['tag'] = tag
    item['duration'] = duration
    item['title'] = title
    item['link_url'] = link_url
    item['count'] = count
    item['video_tags'] = video_tags
    item['percent'] = percent
    item['img_url'] = img_url
    # BUG FIX: this assignment was broken across a line boundary in the
    # source, leaving a dangling `=` and a bare expression; rejoined here.
    item['video_screenshot_imgs'] = video_screenshot_imgs
    item['video_url'] = video_url
    yield item