コード例 #1
0
ファイル: vipfree.py プロジェクト: zhjih7988/movie_spider
    def item_parse(self, response):
        """Build a VipfreeItem from a film's introduction/playback page.

        The HTML of the list-page anchor is read from
        ``response.meta['detail']`` and supplies the title and cover image.
        The response itself supplies the description and the number of
        playback sources.  Exactly one item is yielded.
        """
        anchor = BeautifulSoup(response.meta['detail'], "lxml").find('a')

        # Width and height are recorded as 0 at this stage.
        cover_image = {"img": anchor['src'], "width": 0, "height": 0}
        play_sources = response.xpath('//*[@id="playlist1"]/ul/li').extract()

        item = VipfreeItem()

        # Site-local identifier: the second-to-last dot-separated piece of
        # the final URL path segment.
        page_name = response.url.split('/')[-1]
        item['item_id'] = page_name.split('.')[-2]

        # Film title
        item['title'] = anchor['title']

        # Cover image info
        # assumes VipfreeItem pre-populates 'cover_images' with a list
        # -- TODO confirm
        item['cover_images'].append(cover_image)

        # Playback page link
        item['content_url'] = response.url

        # Description: the last text node found under #list3
        description_nodes = response.xpath(
            '//*[@id="list3"]/div/div/text()').extract()
        item['description'] = description_nodes[-1]

        # Number of playback sources
        item['video_src_cnt'] = len(play_sources)

        # Number of cover images
        item['cover_img_cnt'] = 1

        for logged_field in ('title', 'description'):
            logger.info(item[logged_field])

        yield item
コード例 #2
0
ファイル: xicidaili.py プロジェクト: zhjih7988/movie_spider
    def parse(self, response):
        """Extract proxy entries from the ``ip_list`` table of the response.

        The first ``<tr>`` of the table is skipped; for every remaining row
        one ``CollectipsIpItem`` is filled from fixed column positions.

        Returns:
            list: the populated items, or an empty list when the page has no
            element with id ``ip_list``.

        Raises:
            IndexError: if a data row lacks one of the expected cells, or if
                its speed title does not match the expected number format.
        """
        logger.info('开始爬取数据.')
        tables = response.xpath('//*[@id="ip_list"]')
        if not tables:
            # Nothing to extract; returning early avoids an IndexError on
            # tables[0] when the table is missing from the page.
            return []

        items = []

        # The first row is skipped.
        for row in tables[0].xpath('tr')[1:]:
            pre_item = CollectipsIpItem()

            pre_item['IP'] = row.xpath('td[3]/text()')[0].extract()

            pre_item['PORT'] = row.xpath('td[4]/text()')[0].extract()

            # string(...) flattens nested markup inside the cell to text.
            pre_item['POSITION'] = row.xpath(
                'string(td[5])')[0].extract().strip()

            pre_item['TYPE'] = row.xpath('td[7]/text()')[0].extract()

            # Raw string: the pattern contains backslash escapes that are
            # meant for the regex engine, not for the Python literal.
            pre_item['SPEED'] = row.xpath(
                'td[8]/div[@class="bar"]/@title').re(r'\d{0,2}\.\d{0,}')[0]

            pre_item['LAST_CHECK_TIME'] = row.xpath(
                'td[10]/text()')[0].extract()

            items.append(pre_item)

        return items
コード例 #3
0
    def parse(self, response):
        """Parse a film list page.

        Schedules a detail-page request (handled by ``item_parse``) for the
        first film entry on the page, then follows every pagination anchor
        whose text is exactly ``>`` back into this method.
        """
        logger.info(u'解析电影列表页:%s' % response.url)

        # Detail links and the full anchor markup of every film entry.
        hrefs = response.xpath(
            '//div[@class="index-area clearfix"]/ul/li/a/@href').extract()
        anchors_html = response.xpath(
            '//div[@class="index-area clearfix"]/ul/li/a').extract()

        # NOTE(review): only the first entry is requested (the original loop
        # ends with an unconditional break) -- confirm this is intended and
        # not a debugging leftover.
        first_entry = next(iter(zip(hrefs, anchors_html)), None)
        if first_entry is not None:
            href, anchor_html = first_entry
            yield scrapy.Request(url=self.base_domain + href,
                                 headers=headers,
                                 meta={'detail': anchor_html},
                                 callback=self.item_parse)

        # Pagination anchors
        pager_html_list = response.xpath(
            '//div[@class="page mb clearfix"]/a').extract()

        for pager_html in pager_html_list:
            for anchor in BeautifulSoup(pager_html, "lxml").findAll('a'):
                if anchor.text != u'>':
                    continue
                next_url = self.base_domain + anchor['href']
                logger.info(u"下一页链接: %s" % next_url)
                yield scrapy.Request(url=next_url, callback=self.parse)
コード例 #4
0
ファイル: xiaoma.py プロジェクト: zhjih7988/movie_spider
 def __init__(self, *args, **kwargs):
     """Initialise the spider and generate the list-page start URLs.

     Positional and keyword arguments are forwarded unchanged to the base
     class, so the spider can still be created without arguments and can
     also receive the arguments Scrapy passes to spiders.

     One URL is appended to ``self.start_urls`` for each page number from
     1 to ``MAX_PAGE_INDEX`` inclusive.
     """
     super(XiaomaSpider, self).__init__(*args, **kwargs)

     # Work on a per-instance copy so that a list shared at class level is
     # not grown again every time a spider object is created.
     self.start_urls = list(self.start_urls)

     # Template for the movie list pages; {pageno} is the 1-based page.
     movie_root = u'http://efx6.cn/movie.php?m=http://www.360kan.com/dianying/list.php?cat=all%26pageno={pageno}'
     for page_no in range(1, MAX_PAGE_INDEX + 1):
         url = movie_root.format(pageno=page_no)
         logger.info(u'初始化网页链接 %s' % url)
         self.start_urls.append(url)
コード例 #5
0
ファイル: quanmin.py プロジェクト: zhjih7988/movie_spider
    def item_parse(self, response):
        """Build a QuanminItem from a film's introduction/playback page.

        ``response.meta['detail']`` holds the list-page HTML fragment for the
        film (title, cover image, year, starring); the response supplies the
        playback sources, the category and the synopsis.  Exactly one item is
        yielded.
        """
        listing = BeautifulSoup(response.meta['detail'], "lxml")
        title = listing.find('a')['title']                            # film title
        poster_url = listing.find('img')['src']                       # cover image URL
        release_year = listing.find('span', {'class': 'hint'}).text   # year
        starring = listing.find('p', {'class': 'star'}).text          # starring

        # Number of playback sources
        source_count = len(response.xpath('//p[@class="vspy"]/a').extract())

        # Film category: markup stripped and spaces removed
        category_html = response.xpath(
            '/html/body/div[1]/section/div[1]/div/div[5]/div/h3[1]/span').extract()[0]
        category = BeautifulSoup(category_html, "lxml").text.replace(' ', '')

        # Synopsis: markup stripped and newlines removed
        synopsis_html = response.xpath(
            '//p[@class="item-desc js-close-wrap"]').extract()[0]
        synopsis = BeautifulSoup(synopsis_html, "lxml").text.replace('\n', '')

        # NOTE(review): the key is spelled "heigh" (not "height"); kept as-is
        # because stored data or other code may depend on it -- confirm.
        cover_image = {"img": poster_url, "width": 0, "heigh": 0}

        item = QuanminItem()

        # Site-local identifier: the second-to-last dot-separated piece of
        # the final URL path segment.
        page_name = response.url.split('/')[-1]
        item['item_id'] = page_name.split('.')[-2]
        # Film title
        item['title'] = title
        # Cover image info
        # assumes QuanminItem pre-populates 'cover_images' with a list
        # -- TODO confirm
        item['cover_images'].append(cover_image)
        # Playback page link
        item['content_url'] = response.url
        # Synopsis
        item['description'] = synopsis
        # Number of playback sources
        item['video_src_cnt'] = source_count
        # Number of cover images
        item['cover_img_cnt'] = len(item['cover_images'])
        # Starring: '/' and ' ' separators are both turned into commas
        comma_separated = starring.replace('/', ',')
        item['actor_list'] = comma_separated.replace(' ', ',')
        # Release year
        item['show_year'] = release_year
        # Film category
        item['item_catagory'] = category

        logger.info(u"成功解析: %s url: %s" % (item['title'], item['content_url']))

        yield item
コード例 #6
0
ファイル: pipelines.py プロジェクト: zhjih7988/movie_spider
    def process_item(self, item, spider):
        """Insert the item into ``self.collection`` and pass it on.

        Args:
            item: the scraped item; it is converted with ``dict(item)``
                before insertion.
            spider: the spider that produced the item (unused here).

        Returns:
            The same ``item`` that was passed in.

        Raises:
            DropItem: if iterating over ``item`` yields a falsy entry.
        """
        # NOTE(review): if `item` is a dict-like Scrapy item, iterating it
        # yields field names rather than field values, so this check would
        # not detect empty values -- confirm the intent.
        for entry in item:
            if entry:
                continue
            raise DropItem('Missing{0}!'.format(entry))

        # Reached only when no entry was rejected above.
        self.collection.insert(dict(item))

        logger.info(u'存入数据成功')
        return item
コード例 #7
0
ファイル: xiaoma.py プロジェクト: zhjih7988/movie_spider
    def parse(self, response):
        """Parse a film list page and schedule one request per film entry.

        Each request targets the film's playback page, carries the entry's
        anchor markup in ``meta['detail']`` and is handled by ``item_parse``.
        """
        logger.info(u'解析电影列表页面: %s' % response.url)

        # Detail links of every film entry on the page ...
        hrefs = response.xpath(
            '//div[@class="s-tab-main"]/ul/li/a/@href').extract()
        # ... and the full anchor markup of the same entries.
        anchors_html = response.xpath(
            '//div[@class="s-tab-main"]/ul/li/a').extract()

        for href, anchor_html in zip(hrefs, anchors_html):
            # The first character of the href is dropped before it is
            # appended to the base domain.
            play_page_url = self.base_domain + href[1:]
            logger.info(u'影片播放页 传参 %s' % play_page_url)
            request_meta = {'detail': anchor_html}
            yield scrapy.Request(url=play_page_url,
                                 headers=headers,
                                 meta=request_meta,
                                 callback=self.item_parse)
コード例 #8
0
ファイル: xiaoma.py プロジェクト: zhjih7988/movie_spider
    def item_parse(self, response):
        """Build a XiaomaItem from a film detail page.

        ``response.meta['detail']`` holds the list-page HTML fragment for the
        film (title, cover image, score, starring); the response supplies
        the synopsis.  Exactly one item is yielded.
        """
        logger.info(u'解析电影详情页面: %s' % response.url)

        listing = BeautifulSoup(response.meta['detail'], "lxml")
        title = listing.find('a')['title']                      # film title
        poster_url = listing.find('img')['src']                 # cover image URL
        rating = listing.find('span', {'class': 's2'}).text     # score
        starring = listing.find('p', {'class': 'star'}).text    # starring

        # Synopsis: markup stripped and newlines removed
        synopsis_html = response.xpath(
            '//p[@class="item-desc js-close-wrap"]').extract()[0]
        synopsis = BeautifulSoup(synopsis_html, "lxml").text.replace('\n', '')

        # NOTE(review): the key is spelled "heigh" (not "height"); kept as-is
        # because stored data or other code may depend on it -- confirm.
        cover_image = {"img": poster_url, "width": 0, "heigh": 0}

        item = XiaomaItem()

        # Site-local identifier: the second-to-last dot-separated piece of
        # the final URL path segment.
        page_name = response.url.split('/')[-1]
        item['item_id'] = page_name.split('.')[-2]
        # Film title
        item['title'] = title
        # Cover image info
        # assumes XiaomaItem pre-populates 'cover_images' with a list
        # -- TODO confirm
        item['cover_images'].append(cover_image)
        # Playback page link
        item['content_url'] = response.url
        # Synopsis
        item['description'] = synopsis
        # Number of playback sources (fixed at 1 for this site)
        item['video_src_cnt'] = 1
        # Number of cover images
        item['cover_img_cnt'] = len(item['cover_images'])
        # Starring: '/' and ' ' separators are both turned into commas
        comma_separated = starring.replace('/', ',')
        item['actor_list'] = comma_separated.replace(' ', ',')
        # Score
        item['score'] = rating

        logger.info(u"成功解析: %s url: %s" % (item['title'], item['content_url']))

        yield item
コード例 #9
0
ファイル: quanmin.py プロジェクト: zhjih7988/movie_spider
    def parse(self, response):
        """Parse a list page: request every film, then follow "next page".

        One request per film entry is scheduled (handled by ``item_parse``
        with the entry's anchor markup in ``meta['detail']``).  Pagination
        is followed only while ``self.page_count`` has not exceeded
        ``MAX_PAGE_INDEX``; the counter is incremented on every page that
        goes on to look for a next-page link.
        """
        logger.info(u'解析页面地址url:%s' % response.url)

        # Detail links of every film entry on the page ...
        hrefs = response.xpath(
            '//html/body/section/div[3]/div/div[1]/div/div/ul[2]/li/a/@href').extract()
        # ... and the full anchor markup of the same entries.
        anchors_html = response.xpath(
            '//html/body/section/div[3]/div/div[1]/div/div/ul[2]/li/a').extract()

        for href, anchor_html in zip(hrefs, anchors_html):
            # The href is used as the request URL without any prefix.
            yield scrapy.Request(url=href,
                                 meta={'detail': anchor_html},
                                 headers=headers,
                                 callback=self.item_parse)

        # Stop paginating once the page budget is used up.
        if self.page_count > MAX_PAGE_INDEX:
            return
        self.page_count += 1

        # Look for the next-page button inside the paging container(s).
        for paging_html in response.xpath('//div[@class="paging"]').extract():
            for anchor in BeautifulSoup(paging_html, "lxml").findAll('a'):
                if u'下一页' not in anchor.text:
                    continue
                next_url = self.base_domain + anchor['href']
                logger.info(u"下一页链接: %s" % next_url)
                yield scrapy.Request(url=next_url,
                                     headers=headers,
                                     callback=self.parse)
コード例 #10
0
ファイル: vipfree.py プロジェクト: zhjih7988/movie_spider
    def sub_parse(self, response):
        """Handle one list page: request every film, then follow "next page".

        One request per film entry is scheduled (handled by ``item_parse``
        with the entry's anchor markup in ``meta['detail']``).  Every
        pagination anchor whose markup contains the next-page label is then
        followed back into this method.
        """
        # Detail links of every film entry on the page ...
        hrefs = response.xpath('//div[@class="item"]/ul/div/a/@href').extract()
        # ... and the full anchor markup of the same entries.
        anchors_html = response.xpath('//div[@class="item"]/ul/div/a').extract()

        for href, anchor_html in zip(hrefs, anchors_html):
            # The first character of the href is dropped before it is
            # appended to the base domain.
            film_url = self.base_domain + href[1:]
            yield scrapy.Request(url=film_url,
                                 meta={'detail': anchor_html},
                                 headers=headers,
                                 callback=self.item_parse)

        # Candidate pagination anchors (as raw HTML strings)
        pager_html_list = response.xpath(
            '/html/body/div[2]/div/div[3]/div[3]/ul/li/a').extract()

        for pager_html in pager_html_list:
            # The label is searched for in the raw markup of the anchor.
            if u'下一页' not in pager_html:
                continue
            logger.info(u'------------正在翻页------------')
            relative_link = BeautifulSoup(pager_html, "lxml").find('a')['href']
            next_url = self.base_domain + '/' + relative_link
            logger.info(next_url)
            yield scrapy.Request(url=next_url,
                                 headers=headers,
                                 callback=self.sub_parse)
コード例 #11
0
    def item_parse(self, response):
        """Yield one Liliyy123Item per playback link on a film intro page.

        The links and their titles come from the first
        ``div.videourl.clearfix`` block.  Every yielded item carries the
        same film-level fields and differs only in ``play_url``.

        Raises:
            IndexError: if the film name is missing, or if there are fewer
                links than link titles.
        """
        link_list = response.xpath(
            "(.//div[@class='videourl clearfix'])[1]/ul/li/a/@href").extract()
        title_list = response.xpath(
            "(.//div[@class='videourl clearfix'])[1]/ul/li/a/@title").extract()
        title = response.xpath(".//dt[@class='name']/text()").extract()[0]

        # NOTE(review): all eight fields below are read with the same XPath,
        # which looks like an unfinished placeholder -- confirm the intended
        # selector for each field.
        shared_xpath = ".//div[@class='ct-c']/dt[1]/text()"
        protagonist = response.xpath(shared_xpath).extract()
        mtype = response.xpath(shared_xpath).extract()
        director = response.xpath(shared_xpath).extract()
        description = response.xpath(shared_xpath).extract()
        show_year = response.xpath(shared_xpath).extract()
        region = response.xpath(shared_xpath).extract()
        lang = response.xpath(shared_xpath).extract()
        cover_url = response.xpath(shared_xpath).extract()

        logger.info(protagonist)
        logger.info("titile %s, link %s " % (len(title_list), len(link_list)))

        for index, link_title in enumerate(title_list):
            logger.info("title index %d" % (index,))
            logger.info("title value %s" % (link_title))

            item = Liliyy123Item()
            item['title'] = title
            item['protagonist'] = protagonist
            item['type'] = mtype
            item['director'] = director
            item['description'] = description
            item['show_year'] = show_year
            item['region'] = region
            item['lang'] = lang
            item['cover_url'] = cover_url
            # Links are paired with titles by position.
            item['play_url'] = self.base_domain + link_list[index]

            yield item
コード例 #12
0
    def item_parse(self, response):
        """Build a ZxkkItem from a film's playback/detail page.

        ``response.meta['detail']`` holds the list-page HTML fragment for the
        film (name, cover image and three ``p.actor`` lines); the response
        supplies director, language, playback sources and the synopsis.
        Exactly one item is yielded.

        Raises:
            IndexError: if fewer than three ``p.actor`` lines are present,
                the third one contains no '/', or the synopsis block is
                missing.
        """
        logger.info(u'解析影片播放详情页: %s' % response.url)
        logger.info(u'解析影片介绍页面: %s' % response.url)

        listing = BeautifulSoup(response.meta['detail'], "lxml")
        title = listing.find('p', {'class': 'name'}).text     # film title
        poster_url = listing.find('img')['data-original']     # cover image URL

        # The three p.actor lines hold, in order: the cast, the film type
        # and "year/region".
        actor_lines = listing.findAll('p', {'class': 'actor'})
        actor_list = actor_lines[0].text
        item_catagory = actor_lines[1].text
        year_and_region = actor_lines[2].text.split('/')
        show_year = year_and_region[0]
        region = year_and_region[1]

        # Director and language fall back to '' when the page has none.
        director_hits = response.xpath(
            '//div[@class="ct-c"]/dl/dd[1]/a/text()').extract()
        director = director_hits[0] if director_hits else ''

        lang_hits = response.xpath(
            '//div[@class="ct-c"]/dl/dd[4]/text()').extract()
        lang = lang_hits[0] if lang_hits else ''

        # Number of playback sources
        source_count = len(response.xpath(
            '//div[@class="playfrom tab8 clearfix"]/ul/li').extract())

        # Synopsis: markup stripped and newlines removed
        synopsis_html = response.xpath('////div[@name="ee"]').extract()[0]
        synopsis = BeautifulSoup(synopsis_html, "lxml").text.replace('\n', '')

        # NOTE(review): the key is spelled "heigh" (not "height"); kept as-is
        # because stored data or other code may depend on it -- confirm.
        cover_image = {"img": poster_url, "width": 0, "heigh": 0}

        item = ZxkkItem()

        # Site-local identifier: the second-to-last dot-separated piece of
        # whatever follows the last '?' in the URL.
        query_part = response.url.split('?')[-1]
        item['item_id'] = query_part.split('.')[-2]
        # Film title
        item['title'] = title
        # Cover image info
        # assumes ZxkkItem pre-populates 'cover_images' with a list
        # -- TODO confirm
        item['cover_images'].append(cover_image)
        # Playback page link
        item['content_url'] = response.url
        # Synopsis
        item['description'] = synopsis
        # Number of playback sources
        item['video_src_cnt'] = source_count
        # Number of cover images
        item['cover_img_cnt'] = len(item['cover_images'])
        # Director
        item['director'] = director
        # Cast
        item['actor_list'] = actor_list
        # Release year
        item['show_year'] = show_year
        # Film type
        item['item_catagory'] = item_catagory
        # Sub-channel (same value as the film type)
        item['sub_channel'] = item_catagory
        # Region
        item['region'] = region

        logger.info(u"成功解析: %s url: %s" % (item['title'], item['content_url']))

        yield item
コード例 #13
0
ファイル: main.py プロジェクト: zhjih7988/movie_spider
# -*- coding: utf-8 -*-
import os
import sys

base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, base_dir)

from scrapy.cmdline import execute
from movie_spider.common import logger
from rest_spider import images_loader

# Crawl launcher: un-comment the execute(...) line of the spider to run.

# Lili Film & TV
# execute(['scrapy', 'crawl', 'lili'])

# Weiaipi movie site
# execute(['scrapy', 'crawl', 'vipfree'])

# Quanmin Cinema
# execute(['scrapy', 'crawl', 'quanmin'])

# Xiaoma Cinema
# execute(['scrapy', 'crawl', 'xiaoma'])

# Zaixian Kankan
# scrapy.cmdline.execute() finishes by calling sys.exit(), which would stop
# the script before the image post-processing below. A clean exit is
# therefore swallowed here; a failing exit code is still propagated.
try:
    execute(['scrapy', 'crawl', 'zxkk'])
except SystemExit as exit_signal:
    if exit_signal.code not in (None, 0):
        raise

# Download the images and update the relative links
images_loader.master_main()

logger.info(u'全部任务执行完毕')
コード例 #14
0
ファイル: vipfree.py プロジェクト: zhjih7988/movie_spider
 def parse(self, response):
     """Issue a request for every start URL, handled by ``sub_parse``.

     The response passed in is not inspected; the method only logs a start
     message and yields one request per entry of ``self.start_urls``.
     """
     logger.info(u'开始解析')
     pending_requests = (
         scrapy.Request(start_url, headers=headers, callback=self.sub_parse)
         for start_url in self.start_urls
     )
     for request in pending_requests:
         yield request
コード例 #15
0
ファイル: douban.py プロジェクト: zhjih7988/movie_spider
 def parse(self, response):
     """Log the response body; no items or requests are produced."""
     body_text = str(response.body)
     logger.info(body_text)