コード例 #1
0
ファイル: mv.py プロジェクト: RunningPdt/movie
 def parse(self, response):
     """Parse a movie list page: yield a populated MovieItem per entry,
     then follow the "next page" link back into this callback."""
     for sel in response.xpath('//div[@class="info"]'):
         # BUG FIX: build a fresh item per entry; the original reused one
         # item object across all yields, sharing mutable state.
         item = MovieItem()
         # The title may be split over several <span> nodes; join them.
         full_title = ''.join(
             sel.xpath('div[@class="hd"]/a/span/text()').extract())
         movie_info = sel.xpath('div[@class="bd"]/p/text()').extract()
         star = sel.xpath(
             'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract()[0]
         quote = sel.xpath('div[@class="bd"]/p/span/text()').extract()
         quote = quote[0] if quote else ''
         item['title'] = full_title
         print(item['title'].encode('GBK', 'ignore').decode('GBK'))
         item['movie_info'] = ';'.join(movie_info).replace(' ', '').replace(
             '\n', '')
         print(item['movie_info'].encode('GBK', 'ignore').decode('GBK'))
         # BUG FIX: `star` is already the full rating string (e.g. "9.7");
         # the original `star[0]` kept only its first character.
         item['star'] = star
         print(item['star'])
         item['quote'] = quote
         print(item['quote'].encode('GBK', 'ignore').decode('GBK'))
         yield item
     next_page = response.xpath(
         '//span[@class="next"]/link/@href').extract()
     if next_page:
         print(self.start_urls[0] + str(next_page[0]))
         yield scrapy.Request(self.start_urls[0] + str(next_page[0]),
                              callback=self.parse)
コード例 #2
0
 def parse(self, response):
     """Parse a list page: yield a detail-page Request per movie (with the
     partially-filled item in meta) and follow the "next page" link."""
     selector = Selector(response)
     for movie in selector.xpath('//div[@class="info"]'):
         item = MovieItem()
         # Title may span several <span> nodes; join them.
         fullTitle = ''.join(
             movie.xpath('div[@class="hd"]/a/span/text()').extract())
         movieInfo = movie.xpath('div[@class="bd"]/p/text()').extract()
         star = movie.xpath(
             'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract()[0]
         quote = movie.xpath('div[@class="bd"]/p/span/text()').extract()
         url1 = movie.xpath('div[@class="hd"]/a/@href').extract()[0]
         quote = quote[0] if quote else ''
         item['title'] = fullTitle
         item['movieInfo'] = ';'.join(movieInfo).replace(' ', '').replace(
             '\n', '')
         # BUG FIX: `star` is already the rating string ("9.7"); the
         # original `star[0]` kept only its first character.
         item['star'] = star
         item['quote'] = quote
         item['url'] = url1
         print(url1)
         yield Request(url1,
                       callback=self.parseContent,
                       meta={'item': item})
     nextPage = selector.xpath('//span[@class="next"]/link/@href').extract()
     if nextPage:
         print(self.url + str(nextPage[0]))
         yield Request(self.url + str(nextPage[0]), callback=self.parse)
コード例 #3
0
    def parse(self, response):
        """Yield a MovieItem (name, status, tv, update_time) per list entry."""
        movies = response.xpath('//ul[@class="top-list  fn-clear"]/li')

        for each_movie in movies:
            item = MovieItem()
            item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]

            # Status text usually sits directly in the span; when missing or
            # blank it is inside a nested <font> element instead.
            sta_lst = each_movie.xpath(
                './span[@class="state1 new100state1"]/text()').extract()
            sta = sta_lst[0].strip() if sta_lst else None
            if not sta:
                sta = each_movie.xpath(
                    './span[@class="state1 new100state1"]/font/text()'
                ).extract()[0]
            # FIX: dropped the redundant `else: sta = sta_lst[0].strip()`
            # branch -- it re-assigned the value sta already held.
            item['status'] = sta

            tv = each_movie.xpath('./span[@class="mjtv"]/text()').extract()
            item['tv'] = tv[0] if tv else None

            # Same dual-location pattern for the "last updated" timestamp.
            lst_update_tm = each_movie.xpath(
                './div[@class="lasted-time new100time fn-right"]/text()'
            ).extract()
            lst_up_time = lst_update_tm[0] if lst_update_tm else None
            if not lst_up_time:
                lst_up_time = each_movie.xpath(
                    './div[@class="lasted-time new100time fn-right"]/font/text()'
                ).extract()[0]
            item['update_time'] = lst_up_time

            yield item
コード例 #4
0
    def parse(self, response):
        """Parse a JSON search response and yield a MovieItem for every hit
        that is still pending in ``self.movieMap``."""
        searchResults = json.loads(response.body)['data']['searchResults']

        for result in searchResults:
            movie = result["movie"]
            # Guard clause instead of an empty `pass` branch; membership is
            # tested on the dict directly (no `.keys()` needed).
            if movie["movieId"] not in self.movieMap:
                continue

            item = MovieItem()
            item['imageUrls'] = [self.imageUrlPrefix + movie["posterPath"]]
            item['movieId'] = movie["movieId"]
            item['movieName'] = movie['title']
            item['directors'] = movie['directors']
            item['actors'] = movie['actors']
            item['posterPath'] = movie['posterPath']
            item['plotSummary'] = movie['plotSummary']
            item['avgRating'] = movie['avgRating']
            item['numRatings'] = movie['numRatings']

            # Mark this movie as done and count it.
            self.movieMap.pop(movie["movieId"])
            self.totalNum += 1
            yield item
コード例 #5
0
    def parse(self, response):
        """Yield a MovieItem carrying the title of every movie in the list."""
        for entry in response.xpath('//div[@class="list_2"]/ul/li'):
            item = MovieItem()
            item['name'] = entry.xpath('./a/@title').extract()[0]
            yield item
コード例 #6
0
    def parse2(self, response):
        """Yield one MovieItem per rated short comment on a Douban page.

        Comments without a star rating carry a timestamp in the rating slot
        (e.g. "2020-05-17 17:27:09") and are skipped.
        """
        star_to_num = {
            '力荐': 5,
            '推荐': 4,
            '还行': 3,
            '较差': 2,
            '很差': 1,
        }
        # NOTE: the trailing space in class "comment-item " is significant.
        comments = Selector(response=response).xpath(
            '//*[@id="comments"]/div[@class="comment-item "]')
        for comment in comments:
            star = comment.xpath('./div[2]/h3/span[2]/span[2]/@title').get()
            # Unrated comments put the full timestamp here instead of a
            # rating label -- skip them.
            if star not in star_to_num:
                continue
            record_time = comment.xpath(
                './div[2]/h3/span[2]/span[3]/text()').get().strip()
            short = comment.xpath('./div[2]/p/span/text()').get()

            # BUG FIX: build a fresh item per comment; the original created a
            # single item outside the loop, so every yielded item shared (and
            # overwrote) the same mutable state.
            item = MovieItem()
            item['short'] = short
            item['star'] = star_to_num[star]
            item['record_time'] = record_time

            yield item
コード例 #7
0
ファイル: mov.py プロジェクト: Minokun/movie
 def parse_detail(self, response):
     """Extract the 240p MP4 URL from the page body, append it to mov.txt,
     and yield a MovieItem populated through an item loader."""
     import re
     import codecs
     # Match the CDN video URL up to and including the closing quote.
     p = re.compile(r'(http:\/\/cdn.*?_240.mp4.*?\")')
     res = p.findall(str(response.body))
     # Strip quotes, then rebuild the URL from the last ':'-separated part
     # (drops any scheme/prefix noise captured before the real URL).
     parts = re.sub('"', '', res[0]).split(':')
     # BUG FIX: the value here is a single URL string. The original
     # `'\r\n'.join(res)` joined its *characters* with CRLF (a leftover
     # from when `res` was a list -- see the commented-out loop below).
     video_url = 'http:' + parts[-1]
     # for index, val in enumerate(res):
     #     if re.match('http://.*?\.m$', val):
     #         del res[index]
     with codecs.open('mov.txt', 'a', 'utf-8') as f:
         f.write(video_url + '\r\n')
     item_loader = MovItem(item=MovieItem(), response=response)
     item_loader.add_value("url", response.url)
     item_loader.add_value("thumb", response.meta.get("thumb"))
     item_loader.add_value("title", response.meta.get("title"))
     item_loader.add_value("duration", response.meta.get("duration"))
     item_loader.add_value("post_date", response.meta.get("post_date"))
     item_loader.add_value("video_url", video_url)
     item_loader.add_xpath(
         "views_num",
         "//div[contains(@id,'tabInfo')]/div[contains(@class,'col3')]/p[1]/text()"
     )
     item_loader.add_xpath(
         "channel",
         "//div[contains(@id,'tabInfo')]/div[contains(@class,'col3')]/p[2]/a/text()"
     )
     movie_item = item_loader.load_item()
     yield movie_item
コード例 #8
0
 def parse(self, response):
     """Extract every run of digits from the <p class="f4"> text nodes and
     yield them on a single MovieItem as ``level``.

     ``response`` is the downloaded page; it is re-parsed with lxml rather
     than Scrapy's own selector.
     """
     selector2 = lxml.etree.HTML(response.text)
     item = MovieItem()
     num = selector2.xpath('//p[@class="f4"]/text()')
     # FIX: raw string avoids the invalid "\d" escape warning. findall runs
     # over the repr of the extracted text list, picking up all digit runs.
     level = re.findall(r"\d+", str(num))
     item['level'] = level
     yield item
コード例 #9
0
 def parse(self, response):
     """Parse one page of Douban short comments (JSON with an 'html'
     payload), yield a MovieItem per comment, then request the next page
     until the offset limit is reached."""
     r = json.loads(response.text)
     soup = BeautifulSoup(r['html'], 'lxml')
     for comment in soup.find_all(class_='comment'):
         item = MovieItem()
         item['Commentator'] = comment.find('a', class_='').text
         item['time'] = comment.find('span',
                                     class_='comment-time').text.strip()
         item['votes'] = comment.find('span', class_='votes').text
         item['short'] = comment.find('span', class_='short').text
         try:
             allstar = comment.find('span', {
                 'class': re.compile(r'allstar\d.*')
             }).attrs['class']
             # e.g. class "allstar40" -> '4'
             item['allstar'] = allstar[0][-2]
         except AttributeError:
             # FIX: narrowed from a bare `except:`. find() returns None
             # when the rating span is absent; only that case defaults to 0.
             item['allstar'] = 0
         yield item
     self.offset += 20
     if self.offset > 1001:
         return None
     data = {
         'start': self.offset,
         'limit': '20',
         'sort': 'new_score',
         'status': 'P',
         'comments_only': '1'
     }
     url = 'https://movie.douban.com/subject/26985127/comments?' + urlencode(
         data)
     yield scrapy.Request(url, callback=self.parse)
コード例 #10
0
    def parse_item(self, response):
        """Scrape name/director/writer/description/roles from a Douban movie
        page and yield a MovieItem with each field sanitised for storage."""

        def _clean(text):
            # Normalise separators and escape quotes so downstream storage
            # (comma/colon-delimited) is not broken by field content.
            return (text.strip().replace(',', ';').replace('\'', '\\\'')
                    .replace('\"', '\\\"').replace(':', ';'))

        hxs = Selector(response)
        movie_name = hxs.xpath(
            '//*[@id="content"]/h1/span[1]/text()').extract()
        movie_director = hxs.xpath(
            '//*[@id="info"]/span[1]/span[2]/a/text()').extract()
        movie_writer = hxs.xpath(
            '/html/body/div[3]/div[1]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[2]/span[2]/a[1]/text()'
        ).extract()
        movie_description = hxs.xpath(
            '//*[@id="info"]/span[2]/span[2]/a/text()').extract()

        # Only the last matching span's starring list survives -- this
        # mirrors the original loop, which overwrote movie_roles each pass.
        movie_roles = []
        for roles_path in hxs.xpath('//*[@id="info"]/span[3]/span[2]'):
            movie_roles = roles_path.xpath(
                './/*[@rel="v:starring"]/text()').extract()

        item = MovieItem()
        item['movie_name'] = _clean(''.join(movie_name))
        item['movie_director'] = (_clean(movie_director[0])
                                  if len(movie_director) > 0 else '')
        item['movie_description'] = (_clean(movie_description[0])
                                     if len(movie_description) > 0 else '')
        item['movie_writer'] = _clean(';'.join(movie_writer))
        item['movie_roles'] = _clean(';'.join(movie_roles))
        yield item
コード例 #11
0
 def parse(self, response):
     """Yield one MovieItem (name only) per entry of the top list, echoing
     each name to stdout."""
     for row in response.xpath('//ul[@class="top-list  fn-clear"]/li'):
         item = MovieItem()
         item['name'] = row.xpath('./h5/a/@title').extract()[0]
         print(item['name'])
         yield item
コード例 #12
0
 def parse(self, response):
     """Extract each movie title with Scrapy's built-in selector and yield
     it wrapped in a MovieItem."""
     entries = response.xpath('//ul[@class="top-list  fn-clear"]/li')
     for entry in entries:
         item = MovieItem()
         item['name'] = entry.xpath('./h5/a/@title').extract()[0]
         yield item
コード例 #13
0
ファイル: meiju.py プロジェクト: zzqfsy/scrapy-movie-demo
    def parse(self, response):
        """Yield MovieItems carrying name, airing state, genre and update
        date for each entry in the top list."""
        for li in response.xpath('//ul[@class="top-list  fn-clear"]/li'):
            item = MovieItem()
            item['name'] = li.xpath('./h5/a/@title').extract()[0]

            # State text lives either inside a <font> child or directly in
            # the span; prefer the <font> variant when present.
            state = li.xpath(
                './span[@class="state1 new100state1"]/font/text()').extract()
            if not state:
                state = li.xpath(
                    './span[@class="state1 new100state1"]/text()').extract()
            item['state'] = state[0]

            item['type'] = li.xpath(
                './span[@class="mjjq"]/text()').extract()[0]

            # Same dual-location pattern for the update date.
            update = li.xpath('./div/font/text()').extract()
            if not update:
                update = li.xpath('./div/text()').extract()
            item['updateDate'] = update[0]

            yield item
コード例 #14
0
 def parse(self, response):
     # Log the response type (Python 2 print statement), then yield one
     # MovieItem per list entry carrying its title attribute.
     print "crazylog------------------response =", type(response)
     movies = response.xpath('//ul[@class="top-list  fn-clear"]/li')
     for each_movie in movies:
         item = MovieItem()
         item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]
         yield item
コード例 #15
0
    def parse(self, response):
        """Build and return a list of MovieItems (title, image URL, intro,
        '#'-prefixed actor names) for every <li> in the picture list."""
        items = []

        for li in response.xpath(
                '/html/body//ul[@class="picList clearfix"]/li'):
            item = MovieItem()

            item['title'] = li.xpath(
                './div[@class="txt"]/p[@class="pTit"]/span[@class="sTit"]//text()'
            ).extract()[0]
            item['img'] = 'http:' + li.xpath(
                './div[@class="pic"]/img/@src').extract()[0]

            # The intro is rendered in either the "hidden" or the "shown"
            # paragraph. FIX: query the hidden variant once (the original
            # evaluated the identical xpath twice) and fall back.
            intro = li.xpath(
                './div[@class="txt"]/p[@class="pTxt pIntroHide"]//text()'
            ).extract()
            if not intro:
                intro = li.xpath(
                    './div[@class="txt"]/p[@class="pTxt pIntroShow"]//text()'
                ).extract()
            item['intro'] = intro[0]

            # '#' before each actor, matching the original accumulator,
            # but built with join instead of repeated string concatenation.
            actors = li.xpath(
                './div[@class="txt"]/p[@class="pActor"]/a//text()').extract()
            item['names'] = ''.join('#' + actor for actor in actors)

            items.append(item)
        return items
コード例 #16
0
 def parse(self, response):
     print response
     movies = response.xpath('//ul[@class="top-list fn-clear"]/li')
     print 'movies %r', movies
     for each_movie in movies:
         item = MovieItem()
         item['name'] = each_movie.xpath('./h5/a@title').extract()[0]
         yield item
コード例 #17
0
 def parse_detail(self, response):
     """Parse a movie detail page: extract the title and description and
     yield the populated MovieItem so it reaches the item pipelines."""
     item = MovieItem()
     item['name'] = response.xpath(
         '//div[@class="stui-content__detail"]/h1/text()').get()
     desc = response.xpath('//span[@class="detail-content"]/text()').get()
     # FIX: .get() returns None when the node is absent and the original
     # ''.join(desc) would raise TypeError; default to an empty string.
     item['desc'] = desc if desc is not None else ''
     # BUG FIX: the original built the item but never yielded it, so the
     # intended persistence (per the source comment) never happened.
     yield item
コード例 #18
0
ファイル: meiju.py プロジェクト: zjarlin/FengSpider
 def parse(self, response):
     """Yield a MovieItem for every series in the first "top-list" <ul>,
     parsed with BeautifulSoup."""
     soup = BeautifulSoup(response.body, "lxml")
     top_list = soup.find_all('ul', class_="top-list")[0]
     for entry in top_list.find_all("li"):
         item = MovieItem()
         item['name'] = entry.find('a').get_text()
         yield item
コード例 #19
0
ファイル: meiju.py プロジェクト: zhench/learnpython
 def parse(self, response):
     """Log progress, then yield one MovieItem (name) per top-list entry."""
     print('start crawl')
     movies = response.xpath('//ul[@class="top-list  fn-clear"]/li')
     print(len(movies))
     for movie in movies:
         item = MovieItem()
         item['name'] = movie.xpath('./h5/a/@title').extract()[0]
         yield item
コード例 #20
0
 def parse(self, response):
     """Parse the top list and yield MovieItems carrying name,
     classification and airing state; items flow on to the pipelines."""
     rows = response.xpath(
         '//ul[contains(@class,"top-list") and contains(@class,"fn-clear")]/li')
     for row in rows:
         item = MovieItem()
         item['name'] = row.xpath('./h5/a/@title').extract()[0]
         item["classification"] = row.xpath('./span[2]/text()').extract_first()
         item["state"] = row.xpath('./span[1]/font[1]/text()').extract_first()
         yield item
コード例 #21
0
    def parse(self, response):
        """Yield a MovieItem (name only) for every row of the top list."""
        rows = response.xpath(
            '//ul[contains(@class,"top-list") and contains(@class,"fn-clear")]/li'
        )
        for row in rows:
            item = MovieItem()
            item['name'] = row.xpath('./h5/a/@title').extract()[0]
            yield item
コード例 #22
0
ファイル: spider.py プロジェクト: 395299296/wechat
 def parse_item(self, response):
     """Yield a MovieItem (title + download link) only when both the title
     heading and the download anchor are present on the page."""
     sel = Selector(response)
     titles = sel.xpath('//div[@class="title_all"]/h1/font/text()')
     links = sel.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
     if len(titles) > 0 and len(links) > 0:
         item = MovieItem()
         item['movie_title'] = titles[0].extract()
         item['movie_link'] = links[0].extract()
         yield item
コード例 #23
0
ファイル: meiju.py プロジェクト: wantwantwant/TV
    def parse(self, response):
        """Yield a MovieItem per series in the top list with its name."""
        # Each <li> is one series entry.
        movie_list = response.xpath(
            '//ul[@class="top-list  fn-clear"]/li')
        for movie in movie_list:
            # extract_first() returns the first matching text node or None.
            name = movie.xpath('./h5/text()').extract_first()

            item = MovieItem()
            # BUG FIX: scrapy Items only support dict-style field access;
            # the original `item.name = name` raises AttributeError.
            item['name'] = name
            yield item
コード例 #24
0
    def parse(self, response):
        """Yield a MovieItem (name) for each entry of the weekly-hot list."""
        hot_list = response.xpath(
            '//div[@class="l week-hot layout-box"]/ul/li')
        for entry in hot_list:
            item = MovieItem()
            # The title hangs off the <li>'s anchor directly or under a <p>.
            item['name'] = entry.xpath(
                './a/@title | ./p/a/@title').extract()[0]
            yield item
コード例 #25
0
ファイル: dianying.py プロジェクト: xbuding/crawler
 def parse_detail_page(self, response):
     """Yield a MovieItem scraped from a detail page via PyQuery: title,
     source URL, cover image and the non-empty download URLs."""
     doc = pq(response.text)
     item = MovieItem()
     item['movie_name'] = doc(
         '#header > div > div.bd2 > div.bd3 > div.co_area2 > div.title_all > h1'
     ).text().strip()
     item['raw_url'] = response.url
     item['cover_image'] = doc('#Zoom > p:nth-child(1) > img').attr('src')
     # Drop empty/None entries from the candidate download URLs.
     item['download_url'] = [u for u in self.get_download_url(doc) if u]
     yield item
コード例 #26
0
 def parse_movie_item(self, response):
     """Scrape name, summary and average score from a Douban movie page
     (RDFa `property` attributes) and yield the MovieItem."""
     item = MovieItem()
     item['url'] = response.url
     name_xp = '//span[@property="v:itemreviewed"]/text()'
     summary_xp = '//span[@property="v:summary"]/text()'
     score_xp = '//strong[@property="v:average"]/text()'
     item['name'] = response.xpath(name_xp).extract_first()
     item['summary'] = response.xpath(summary_xp).extract_first()
     item['score'] = response.xpath(score_xp).extract_first()
     print('------------------', item)
     yield item
コード例 #27
0
 def parse(self, response):
     """Yield MovieItems carrying the series name and its genre (subType)."""
     for entry in response.xpath('//ul[@class="top-list  fn-clear"]/li'):
         item = MovieItem()
         item['name'] = entry.xpath('./h5/a/@title').extract()[0]
         item['subType'] = entry.xpath(
             './span[@class="mjjq"]/text()').extract()[0]
         yield item
コード例 #28
0
 def parse_item(self, response):
     """For each movie link in the listing table, create a MovieItem with
     its title and request the detail page, passing the item along in meta
     so parse_detail can finish populating it."""
     for anchor in response.xpath('//div[@class="co_content8"]//table//a'):
         title = anchor.xpath('./text()').extract_first()
         href = anchor.xpath('./@href').extract_first()
         detail_url = 'http://www.ygdy8.net' + href
         movie = MovieItem(title=title)
         yield scrapy.Request(url=detail_url,
                              callback=self.parse_detail,
                              meta={'movie': movie})
コード例 #29
0
 def parse(self, response):
     """Yield a MovieItem (title, movie_info, star, quote) per entry of a
     Douban top-250 list page."""
     selector = Selector(response)
     for each in selector.xpath('//*[@id="content"]/div/div[1]/ol/li'):
         # BUG FIX: create the item inside the loop; the original reused a
         # single mutable item across all yields (shared state).
         item = MovieItem()
         item['title'] = each.xpath(
             'div/div[2]/div[1]/a/span[1]/text()').extract()[0]
         print(item['title'])
         # The info paragraph spans two text nodes; concatenate both.
         content = (each.xpath('div/div[2]/div[2]/p[1]/text()').extract()[0]
                    + each.xpath(
                        'div/div[2]/div[2]/p[1]/text()[2]').extract()[0])
         item['movie_info'] = content.replace('\n', '').replace(' ', '')
         item['star'] = each.xpath(
             'div/div[2]/div[2]/div/span[2]/text()').extract()[0]
         item['quote'] = each.xpath(
             'div/div[2]/div[2]/p[2]/span/text()').extract()[0]
         yield item
コード例 #30
0
ファイル: meiju.py プロジェクト: gdlearn/scrapy-the-blog
 def parse(self, response):
     # Python 2 spider. Collect the main-nav category links; for each one
     # that looks like an article category, request its page with
     # parse_cat, carrying the item in meta.
     # NOTE(review): the body is indented one extra level under the
     # commented-out URL check below -- presumably a leftover guard.
     # if(re.match("http://www.dawnfly.cn/article-0-\d+|\d+-\d+.html",response.url)):
         movies = response.xpath('//nav[@class="main-nav"]/div/ul/li')
         print response.url
         for each_movie in movies:
             item = MovieItem()
             item['cat_url']=each_movie.xpath('./a/@href').extract()[0]
             if(item['cat_url'].startswith("/article-")):
                 # category URL
                 # print item['cat_url']
             # if(re.match("/article-",item['cat_url']):
                 # yield item
                 yield Request("http://www.dawnfly.cn"+item['cat_url'],callback=self.parse_cat,meta={'cat_url':item})