Ejemplo n.º 1
0
    def parse_ph_info(self, response):
        """Build a ``PornVideoItem`` from the ``flashvars`` JSON of a page.

        The serialized page is searched for ``var flashvars = <json>,``
        followed by a newline.  The first match is decoded as JSON and a
        handful of its keys are copied onto the item, together with the
        local time at which the page was parsed.

        Args:
            response: the page response; it is wrapped in a ``Selector``
                and its ``url`` is used in warning messages.

        Yields:
            One populated ``PornVideoItem``.  Nothing is yielded when the
            ``flashvars`` assignment is missing or is not valid JSON.
        """
        page_source = Selector(response).extract()
        matches = re.findall('var flashvars =(.*?),\n', page_source)
        logging.debug('PH信息的JSON:')
        logging.debug(matches)
        if not matches:
            # Indexing an empty match list would raise IndexError.
            logging.warning('no flashvars found in %s', response.url)
            return
        try:
            flashvars = json.loads(matches[0])
        except ValueError:
            logging.warning('flashvars of %s is not valid JSON', response.url)
            return

        duration = flashvars.get('video_duration')
        title = flashvars.get('video_title')
        image_url = flashvars.get('image_url')
        link_url = flashvars.get('link_url')
        # strftime() without a time tuple formats the current local time.
        crwal_time = time.strftime('%Y-%m-%d %H:%M:%S')

        phItem = PornVideoItem()
        phItem['video_duration'] = duration
        phItem['video_title'] = title
        phItem['image_url'] = image_url
        phItem['link_url'] = link_url
        phItem['quality_480p'] = flashvars.get('quality_480p')
        # 'crwal_time' (sic) is the key this item is populated with, so the
        # spelling is kept.
        phItem['crwal_time'] = crwal_time

        # %-style arguments instead of "+": concatenation raises TypeError
        # as soon as one of the values is None or not a string, which would
        # abort the callback before the item is yielded.
        logging.info(
            'duration:%s title:%s image_url:%s link_url:%s crwal_time:%s',
            duration, title, image_url, link_url, crwal_time)

        yield phItem
Ejemplo n.º 2
0
 def parse_ph_info(self, response):
     """Build a ``PornVideoItem`` from the ``flashvars`` JSON of a page.

     The serialized page is searched for ``var flashvars = <json>,``
     followed by a newline.  The first match is decoded as JSON, a handful
     of its keys are copied onto the item, and the bookkeeping fields
     (``issave``, ``createtime``, ``updatetime``, ``local_mp4_url``) are
     initialised.

     Args:
         response: the page response; it is wrapped in a ``Selector`` and
             its ``url`` is used in warning messages.

     Yields:
         One populated ``PornVideoItem``.  Nothing is yielded when the
         ``flashvars`` assignment is missing or is not valid JSON.
     """
     # Imported locally: the rest of this method does not otherwise need
     # the logging module.
     import logging

     page_source = Selector(response).extract()
     matches = re.findall('var flashvars =(.*?),\n', page_source)
     if not matches:
         # Indexing an empty match list would raise IndexError.
         logging.warning('no flashvars found in %s', response.url)
         return
     try:
         flashvars = json.loads(matches[0])
     except ValueError:
         logging.warning('flashvars of %s is not valid JSON', response.url)
         return

     phItem = PornVideoItem()
     phItem['video_duration'] = flashvars.get('video_duration')
     phItem['video_title'] = flashvars.get('video_title')
     phItem['image_url'] = flashvars.get('image_url')
     phItem['link_url'] = flashvars.get('link_url')
     phItem['quality_480p'] = flashvars.get('quality_480p')
     phItem['issave'] = 0  # whether the video has been saved locally
     phItem['createtime'] = int(time.time())
     phItem['updatetime'] = 0
     phItem['local_mp4_url'] = ''
     # Single-argument call form: prints the same text whether ``print`` is
     # the Python 2 statement or the print function.
     print('成功抓取一条')
     yield phItem
Ejemplo n.º 3
0
    def parse_ph_info(self, response):
        """Build a ``PornVideoItem`` with a download URL and page metadata.

        ``file_urls`` receives the ``src`` of the ``<source>`` element found
        by the player XPath.  The remaining fields come from the JSON object
        assigned to ``var flashvars`` in the page source.

        Args:
            response: the page response; it is queried with XPath, wrapped
                in a ``Selector`` and its ``url`` is used in warnings.

        Yields:
            One populated ``PornVideoItem``.  Nothing is yielded when the
            ``flashvars`` assignment is missing or is not valid JSON.
        """
        video_url = response.xpath(
            '//*[@id="player"]/div[21]/video/source/@src').extract_first()

        phItem = PornVideoItem()
        # extract_first() returns None when the XPath matches nothing; an
        # empty list is stored then, because [None] is not a usable URL
        # list.  NOTE(review): presumably consumed by a files pipeline --
        # confirm against the project settings.
        phItem['file_urls'] = [video_url] if video_url else []

        page_source = Selector(response).extract()
        matches = re.findall('var flashvars =(.*?),\n', page_source)
        logging.debug('PH信息的JSON:')
        logging.debug(matches)
        if not matches:
            # Indexing an empty match list would raise IndexError.
            logging.warning('no flashvars found in %s', response.url)
            return
        try:
            flashvars = json.loads(matches[0])
        except ValueError:
            logging.warning('flashvars of %s is not valid JSON', response.url)
            return

        duration = flashvars.get('video_duration')
        title = flashvars.get('video_title')
        image_url = flashvars.get('image_url')
        link_url = flashvars.get('link_url')

        phItem['video_duration'] = duration
        phItem['video_title'] = title
        phItem['image_url'] = image_url
        phItem['link_url'] = link_url
        phItem['quality_480p'] = flashvars.get('quality_480p')

        # %-style arguments instead of "+": concatenation raises TypeError
        # as soon as one of the values is None or not a string, which would
        # abort the callback before the item is yielded.
        logging.info('duration:%s title:%s image_url:%s link_url:%s',
                     duration, title, image_url, link_url)
        yield phItem
Ejemplo n.º 4
0
    def parse_ph_info(self, response):
        """Build a ``PornVideoItem`` and download its thumbnail and video.

        Metadata comes from the JSON object assigned to ``var flashvars`` in
        the page source.  The SHA-1 of the ``quality_480p`` URL names the
        local files (``<sha1>.jpg`` and ``<sha1>.mp4`` in ``self.file_dir``).
        When the ``.mp4`` already exists the item is flagged ``exists`` and
        nothing is downloaded; otherwise both files are fetched with pycurl,
        sending the page URL as referer.

        NOTE(review): ``curl.perform()`` is synchronous, so this callback
        does not return until both downloads have finished.

        Args:
            response: the page response; it is wrapped in a ``Selector`` and
                its ``url`` is used as the HTTP referer.

        Yields:
            One populated ``PornVideoItem``.  Nothing is yielded when the
            ``flashvars`` assignment is missing, is not valid JSON, or has
            no ``quality_480p`` entry.

        Raises:
            pycurl.error: a download failed; the partially written file is
                removed before the error propagates.
        """

        def download(url, target_path):
            """Fetch *url* into *target_path* with the page as referer."""
            curl = pycurl.Curl()
            try:
                curl.setopt(pycurl.URL, url)
                curl.setopt(pycurl.REFERER, response.url)
                curl.setopt(pycurl.SSL_VERIFYPEER, 1)
                curl.setopt(pycurl.SSL_VERIFYHOST, 2)
                # The context manager closes (and flushes) the file even
                # when the transfer fails.
                with open(target_path, 'wb') as target:
                    curl.setopt(pycurl.WRITEDATA, target)
                    curl.perform()
            except Exception:
                # A partial file must not survive: a leftover .mp4 would
                # pass the os.path.exists() check on the next crawl and be
                # treated as already downloaded.
                if os.path.exists(target_path):
                    os.remove(target_path)
                raise
            finally:
                curl.close()

        page_source = Selector(response).extract()
        matches = re.findall('var flashvars =(.*?),\n', page_source)
        logging.debug('PH信息的JSON:')
        logging.debug(matches)
        if not matches:
            # Indexing an empty match list would raise IndexError.
            logging.warning('no flashvars found in %s', response.url)
            return
        try:
            flashvars = json.loads(matches[0])
        except ValueError:
            logging.warning('flashvars of %s is not valid JSON', response.url)
            return

        image_url = flashvars.get('image_url')
        quality_480p = flashvars.get('quality_480p')
        if not quality_480p:
            # Without the video URL there is nothing to hash or download.
            logging.warning('no quality_480p URL in %s', response.url)
            return

        ph_item = PornVideoItem()
        ph_item['video_duration'] = flashvars.get('video_duration')
        ph_item['video_title'] = flashvars.get('video_title')
        ph_item['image_url'] = image_url
        ph_item['link_url'] = flashvars.get('link_url')
        ph_item['quality_480p'] = quality_480p

        # Hash explicit bytes: hashing text directly only works while the
        # implicit ASCII conversion succeeds.  For ASCII URLs the digest is
        # the same either way.
        url_bytes = quality_480p
        if not isinstance(url_bytes, bytes):
            url_bytes = url_bytes.encode('utf-8')
        file_sha1 = sha1(url_bytes).hexdigest()

        image_file_name = os.path.join(self.file_dir, file_sha1 + '.jpg')
        mp4_file_name = os.path.join(self.file_dir, file_sha1 + '.mp4')

        # Check whether this video has already been downloaded.
        if os.path.exists(mp4_file_name):
            ph_item['exists'] = True
        else:
            ph_item['exists'] = False
            ph_item['video_file_path'] = mp4_file_name
            ph_item['image_file_path'] = image_file_name
            download(image_url, image_file_name)
            download(quality_480p, mp4_file_name)

        yield ph_item
Ejemplo n.º 5
0
 def parse_ph_info(self, response):
     """Build a ``PornVideoItem`` from the ``flashvars`` JSON of a page.

     The serialized page is searched for ``var flashvars = <json>``
     terminated by ``,`` or ``;`` and a newline.  The first match is
     decoded as JSON and a handful of its keys are copied onto the item.

     Args:
         response: the page response; it is wrapped in a ``Selector`` and
             its ``url`` is used in warning messages.

     Yields:
         One populated ``PornVideoItem``.  Nothing is yielded when the
         ``flashvars`` assignment is missing or is not valid JSON.
     """
     page_source = Selector(response).extract()
     # Inside a character class "|" is a literal pipe, not alternation, so
     # the terminator class lists only the two intended characters.
     matches = re.findall('var flashvars =(.*?)[,;]\n', page_source)
     logging.debug('PH信息的JSON:')
     logging.debug(matches)
     if not matches:
         # Indexing an empty match list would raise IndexError.
         logging.warning('no flashvars found in %s', response.url)
         return
     try:
         flashvars = json.loads(matches[0])
     except ValueError:
         logging.warning('flashvars of %s is not valid JSON', response.url)
         return

     duration = flashvars.get('video_duration')
     title = flashvars.get('video_title')
     image_url = flashvars.get('image_url')
     link_url = flashvars.get('link_url')

     phItem = PornVideoItem()
     phItem['video_duration'] = duration
     phItem['video_title'] = title
     phItem['image_url'] = image_url
     phItem['link_url'] = link_url
     phItem['quality_480p'] = flashvars.get('quality_480p')

     # %-style arguments instead of "+": concatenation raises TypeError as
     # soon as one of the values is None or not a string, which would abort
     # the callback before the item is yielded.
     logging.info('duration:%s title:%s image_url:%s link_url:%s',
                  duration, title, image_url, link_url)
     yield phItem