def parse_ph_info(self, response):
    """Build a ``PornVideoItem`` from the ``flashvars`` JSON embedded in a page.

    The page source is searched for a ``var flashvars = {...},`` assignment;
    the captured text is parsed as JSON and selected keys are copied onto the
    item, together with the local crawl timestamp.

    Args:
        response: The response handed to this callback; it is wrapped in a
            ``Selector`` to obtain the page source.

    Yields:
        The populated ``PornVideoItem``. Nothing is yielded when the page
        contains no ``flashvars`` assignment.
    """
    ph_item = PornVideoItem()
    page_source = Selector(response).extract()
    matches = re.findall(r'var flashvars =(.*?),\n', page_source)
    logging.debug('PH信息的JSON:')
    logging.debug(matches)
    if not matches:
        # Indexing an empty match list would raise IndexError; skip the page.
        logging.warning('flashvars JSON not found in %s', response.url)
        return

    info = json.loads(matches[0])
    for field in ('video_duration', 'video_title', 'image_url',
                  'link_url', 'quality_480p'):
        ph_item[field] = info.get(field)

    # strftime() without a time tuple formats the current local time.
    crawl_time = time.strftime('%Y-%m-%d %H:%M:%S')
    # 'crwal_time' (sic) is the field name used by the item; keep it as is.
    ph_item['crwal_time'] = crawl_time

    # %-style arguments tolerate missing (None) or non-string values, which
    # string concatenation would turn into a TypeError.
    logging.info(
        'duration:%s title:%s image_url:%s link_url:%s crwal_time:%s',
        ph_item['video_duration'], ph_item['video_title'],
        ph_item['image_url'], ph_item['link_url'], crawl_time)
    yield ph_item
def parse_ph_info(self, response):
    """Build a ``PornVideoItem`` from the ``flashvars`` JSON embedded in a page.

    Selected keys of the parsed JSON are copied onto the item, and the
    bookkeeping fields (``issave``, ``createtime``, ``updatetime``,
    ``local_mp4_url``) are set to their initial values.

    Args:
        response: The response handed to this callback; it is wrapped in a
            ``Selector`` to obtain the page source.

    Yields:
        The populated ``PornVideoItem``. Nothing is yielded when the page
        contains no ``flashvars`` assignment.
    """
    ph_item = PornVideoItem()
    page_source = Selector(response).extract()
    matches = re.findall(r'var flashvars =(.*?),\n', page_source)
    if not matches:
        # Indexing an empty match list would raise IndexError; skip the page.
        logging.warning('flashvars JSON not found in %s', response.url)
        return

    info = json.loads(matches[0])
    for field in ('video_duration', 'video_title', 'image_url',
                  'link_url', 'quality_480p'):
        ph_item[field] = info.get(field)

    ph_item['issave'] = 0  # whether the video has been saved locally
    ph_item['createtime'] = int(time.time())
    ph_item['updatetime'] = 0
    ph_item['local_mp4_url'] = ''

    # Parenthesised single-argument form prints the same text whether print
    # is a statement or a function.
    print('成功抓取一条')
    yield ph_item
def parse_ph_info(self, response):
    """Build a ``PornVideoItem`` with the video source URL and page metadata.

    The video URL is read from the ``<source>`` element of the player markup
    and stored in ``file_urls``; the remaining fields come from the
    ``flashvars`` JSON embedded in the page source.

    Args:
        response: The response handed to this callback; it is queried with
            XPath and wrapped in a ``Selector`` to obtain the page source.

    Yields:
        The populated ``PornVideoItem``. Nothing is yielded when the page
        contains no ``flashvars`` assignment.
    """
    video_url = response.xpath(
        '//*[@id="player"]/div[21]/video/source/@src').extract_first()
    ph_item = PornVideoItem()
    # extract_first() yields None when the XPath matches nothing; store an
    # empty list rather than [None] in that case.
    ph_item['file_urls'] = [video_url] if video_url else []

    page_source = Selector(response).extract()
    matches = re.findall(r'var flashvars =(.*?),\n', page_source)
    logging.debug('PH信息的JSON:')
    logging.debug(matches)
    if not matches:
        # Indexing an empty match list would raise IndexError; skip the page.
        logging.warning('flashvars JSON not found in %s', response.url)
        return

    info = json.loads(matches[0])
    for field in ('video_duration', 'video_title', 'image_url',
                  'link_url', 'quality_480p'):
        ph_item[field] = info.get(field)

    # %-style arguments tolerate missing (None) or non-string values, which
    # string concatenation would turn into a TypeError.
    logging.info(
        'duration:%s title:%s image_url:%s link_url:%s',
        ph_item['video_duration'], ph_item['video_title'],
        ph_item['image_url'], ph_item['link_url'])
    yield ph_item
def parse_ph_info(self, response):
    """Build a ``PornVideoItem`` and download its thumbnail and 480p video.

    Metadata is read from the ``flashvars`` JSON embedded in the page source.
    Target file names are derived from the SHA-1 hex digest of the
    ``quality_480p`` URL and placed under ``self.file_dir``. When the ``.mp4``
    file already exists the item is yielded with ``exists`` set to ``True``
    and nothing is downloaded; otherwise the image and the video are fetched
    with pycurl (blocking) before the item is yielded.

    Args:
        response: The response handed to this callback; its ``url`` is sent
            as the referer of both downloads.

    Yields:
        The populated ``PornVideoItem``. Nothing is yielded when the page has
        no ``flashvars`` assignment or no ``quality_480p`` entry.

    Raises:
        pycurl.error: If a download fails; the partially written file is
            removed first.
    """
    def _download(url, destination):
        """Fetch *url* into *destination*, sending the page URL as referer."""
        try:
            # The context manager guarantees the file handle is closed, and
            # the inner finally guarantees the curl handle is released.
            with open(destination, 'wb') as out_file:
                curl = pycurl.Curl()
                try:
                    curl.setopt(pycurl.URL, url)
                    curl.setopt(pycurl.REFERER, response.url)
                    curl.setopt(pycurl.SSL_VERIFYPEER, 1)
                    curl.setopt(pycurl.SSL_VERIFYHOST, 2)
                    curl.setopt(pycurl.WRITEDATA, out_file)
                    curl.perform()
                finally:
                    curl.close()
        except pycurl.error:
            # A leftover partial file would later pass the os.path.exists
            # check below and be treated as an already-downloaded video.
            if os.path.exists(destination):
                os.remove(destination)
            raise

    ph_item = PornVideoItem()
    page_source = Selector(response).extract()
    matches = re.findall(r'var flashvars =(.*?),\n', page_source)
    logging.debug('PH信息的JSON:')
    logging.debug(matches)
    if not matches:
        # Indexing an empty match list would raise IndexError; skip the page.
        logging.warning('flashvars JSON not found in %s', response.url)
        return

    info = json.loads(matches[0])
    image_url = info.get('image_url')
    quality_480p = info.get('quality_480p')
    ph_item['video_duration'] = info.get('video_duration')
    ph_item['video_title'] = info.get('video_title')
    ph_item['image_url'] = image_url
    ph_item['link_url'] = info.get('link_url')
    ph_item['quality_480p'] = quality_480p

    if not quality_480p:
        # Without the video URL there is nothing to hash or download.
        logging.warning('quality_480p missing in flashvars of %s',
                        response.url)
        return

    # Hash bytes, not text: UTF-8 encoding gives the same digest as before
    # for ASCII URLs and also works for non-ASCII ones.
    url_bytes = quality_480p
    if not isinstance(url_bytes, bytes):
        url_bytes = url_bytes.encode('utf-8')
    digest = sha1()
    digest.update(url_bytes)
    file_sha1 = digest.hexdigest()

    # Check whether this file has already been downloaded.
    image_file_name = os.path.join(self.file_dir, file_sha1 + '.jpg')
    mp4_file_name = os.path.join(self.file_dir, file_sha1 + '.mp4')
    if os.path.exists(mp4_file_name):
        ph_item['exists'] = True
        yield ph_item
        return

    ph_item['exists'] = False
    ph_item['video_file_path'] = mp4_file_name
    ph_item['image_file_path'] = image_file_name
    _download(image_url, image_file_name)
    _download(quality_480p, mp4_file_name)
    yield ph_item
def parse_ph_info(self, response):
    """Build a ``PornVideoItem`` from the ``flashvars`` JSON embedded in a page.

    The page source is searched for a ``var flashvars = {...}`` assignment
    terminated by ``,`` or ``;`` at the end of a line; the captured text is
    parsed as JSON and selected keys are copied onto the item.

    Args:
        response: The response handed to this callback; it is wrapped in a
            ``Selector`` to obtain the page source.

    Yields:
        The populated ``PornVideoItem``. Nothing is yielded when the page
        contains no ``flashvars`` assignment.
    """
    ph_item = PornVideoItem()
    page_source = Selector(response).extract()
    # Inside a character class '|' is a literal pipe, not alternation, so the
    # terminator set is written as [,;].
    matches = re.findall(r'var flashvars =(.*?)[,;]\n', page_source)
    logging.debug('PH信息的JSON:')
    logging.debug(matches)
    if not matches:
        # Indexing an empty match list would raise IndexError; skip the page.
        logging.warning('flashvars JSON not found in %s', response.url)
        return

    info = json.loads(matches[0])
    for field in ('video_duration', 'video_title', 'image_url',
                  'link_url', 'quality_480p'):
        ph_item[field] = info.get(field)

    # %-style arguments tolerate missing (None) or non-string values, which
    # string concatenation would turn into a TypeError.
    logging.info(
        'duration:%s title:%s image_url:%s link_url:%s',
        ph_item['video_duration'], ph_item['video_title'],
        ph_item['image_url'], ph_item['link_url'])
    yield ph_item