Exemple #1
0
    def parse(self, response):
        """Parse one Douban Top250 listing page.

        Yields one DoubanItem per movie, then follows the "next page"
        link (absent on the last page).
        """
        selector = Selector(response)
        movie_list = selector.xpath('//ol[@class="grid_view"]/li')
        for movie in movie_list:
            # Fresh item per movie; the original also created a dead item
            # before the loop that was immediately shadowed.
            item = DoubanItem()
            title = movie.xpath('div/div[@class="info"]/div[1]/a/span/text()').extract()
            item['movieInfo'] = movie.xpath('div/div[@class="info"]/div[2]/p[1]/text()').extract()[0]
            item['star'] = movie.xpath('div/div[@class="info"]/div[2]/div/span[@class="rating_num"]/text()').extract()[0]
            quote = movie.xpath('div/div[@class="info"]/div[2]/p[2]/span/text()').extract()
            # Some movies have no quote line; default to an empty string.
            item['quote'] = quote[0] if quote else ''
            # A title may be split over several <span>s (CN + EN names).
            item['title'] = ''.join(title)
            yield item
        # Pagination: the last page has no "next" link.
        next_page = selector.xpath('//span[@class="next"]/link/@href').extract()
        if next_page:
            yield Request(self.url + next_page[0], callback=self.parse)
    def parse_rev(getid, ty, response):
        """Parse a Douban review page.

        NOTE(review): the first parameter is ``getid`` rather than
        ``self`` — callers appear to pass the review id positionally;
        signature kept unchanged for compatibility.

        When ``ty == "rev"`` yields one profile item for the review
        itself; always yields one item per comment under the review.
        """
        # Parse the HTML once instead of re-parsing for every XPath query.
        page = response.body.decode('UTF-8')
        tree = etree.HTML(page)

        if ty == "rev":
            item = DoubanItem()
            item['_i'] = 'review_profile'
            item['review_id'] = getid
            item['review_name'] = tree.xpath(
                '//div[@class="article"]/h1/span/text()')[0]
            # The spoiler-warning paragraph is optional.
            tmp = tree.xpath(
                '//div[@class="main-bd"]/p[@class="main-title-tip"]/text()')
            item['review_spoil'] = tmp[0] if tmp else None
            rating_cls = tree.xpath(
                '//header[@class="main-hd"]/span/@class')[0]
            # The rating is the first digit embedded in the span's class name.
            item['review_rating'] = re.findall(r"\d+", rating_cls)[0][0]
            item['review_time'] = tree.xpath(
                '//header[@class="main-hd"]/span[@class="main-meta"]/text()'
            )[0]
            # Review body: prefer <p> children, fall back to bare <div> text.
            paragraphs = tree.xpath('//div[@id="link-report"]//p/text()')
            if not paragraphs:
                paragraphs = tree.xpath('//div[@id="link-report"]/div/text()')
            item['review'] = "".join(paragraphs)
            yield item

        for node in tree.xpath('//div[@class="comment-item"]'):
            # BUG FIX: fresh item per comment; the original mutated and
            # re-yielded one shared instance for all comments.
            item = DoubanItem()
            item['_i'] = 'review_comments'
            item['review_id'] = getid
            item['c_id'] = node.attrib['data-cid']
            # The user id is a fixed path segment of the commenter's URL.
            item['commenter_id'] = re.split(r'[/?]\s*',
                                            node.attrib['data-user_url'])[4]
            item['ref_cid'] = node.attrib['data-ref_cid']
            item['c_time'] = node.find('div//div[@class="header"]/span').text
            texts = node.findall('div//p[@class="comment-text"]')
            item['comment'] = "".join(n.text for n in texts)
            yield item
Exemple #3
0
 def errback_httpbin(self, failure):
     """Errback: on an HttpError (non-200 response), re-fetch the page
     with Selenium and scrape the movie detail fields from the rendered
     HTML. Returns the populated item, or None for other failures."""
     if failure.check(HttpError):
         # these exceptions come from HttpError spider middleware
         # you can get the non-200 response
         response = failure.value.response
         self.logger.error('HttpError on %s', response.url)
         self.driver.get(response.url)
         temp_html = BeautifulSoup(self.driver.page_source, 'lxml')
         item = DoubanItem()
         # Rank text looks like "No.12" — keep the part after the dot.
         item['rank'] = temp_html.find('span', class_='top250-no').get_text().split('.')[1]
         info = temp_html.find(id='info')
         item['name'] = temp_html.find('h1').find('span').get_text().split()[0]
         item['year'] = re.findall(r'\d+', temp_html.find('span', class_='year').get_text())[0]
         item['director'] = ''
         item['script'] = ''
         item['actor'] = ''
         # Map the Chinese labels 导演/编剧/主演 to item fields.
         role_dict = {u'\u5bfc\u6f14': 'director', u'\u7f16\u5267': 'script', u'\u4e3b\u6f14': 'actor'}
         for label in info.find_all('span', class_='pl'):
             role = role_dict.get(label.get_text())
             if role is not None:
                 item[role] = label.find_next('span').get_text()
         item['classification'] = '/'.join(x.get_text() for x in info.find_all('span', property='v:genre'))
         item['score'] = temp_html.find(id='interest_sectl').find('strong', class_='ll rating_num').get_text()
         # NOTE(review): '(/n)' matches the literal two chars "/n" —
         # probably a typo for r'\n', but r'\s' already removes newlines,
         # so behavior is unaffected; kept as-is.
         item['story'] = re.sub(r'(\s)|(/n)', '', temp_html.find('span', property='v:summary').get_text())
         return item
Exemple #4
0
    def parse(self, response):
        """Parse a Douban chart page: pair up movie names, links and
        scores and yield one item per movie.

        BUG FIX: the original used Python 2 ``print`` statements, which
        are a SyntaxError under Python 3 (the dialect the surrounding
        code uses); debug prints and dead commented code are removed.
        """
        sel = Selector(response)
        movie_name = sel.xpath("//div[@class='pl2']/a/text()").extract()
        # Every other text node is a movie name; the rest are aliases.
        names = [name.strip().replace('\\', "") for name in movie_name[::2]]
        movie_url = sel.xpath("//div[@class='pl2']/a/@href").extract()
        movie_score = sel.xpath(
            "//div[@class='pl2']/div/span[@class='rating_nums']/text()"
        ).extract()
        for name, url, score in zip(names, movie_url, movie_score):
            item = DoubanItem()
            item['movie_name'] = name
            item['movie_score'] = score
            item['movie_url'] = url
            yield item
Exemple #5
0
 def parse(self, response):
     """Scrape one Top250 page with BeautifulSoup, yielding name, score
     and quote per movie, then follow the "后页" (next page) link."""
     soup = BeautifulSoup(response.body.decode('utf-8', 'ignore'), 'lxml')
     grid = soup.find('ol', attrs={'class': 'grid_view'})
     for entry in grid.findAll('li'):
         fields = []
         title_parts = []
         for span in entry.findAll('span'):
             if not span.has_attr('class'):
                 continue
             css = span.attrs['class'][0]
             if css == 'title':
                 title_parts.append(span.string.strip().replace(',', ','))
             elif css == 'rating_num':
                 fields.append(span.string.strip().replace(',', ','))
             elif css == 'inq':
                 fields.append(span.string.strip().replace(',', ','))
         fields.insert(0, title_parts[0])
         # Pad/trim to exactly three columns: name, score, quote.
         while len(fields) < 3:
             fields.append("-")
         fields = fields[:3]
         item = DoubanItem()
         item['name'], item['fen'], item['words'] = fields
         yield item
     nxt = soup.find('a', text=re.compile("^后页"))
     if nxt:
         yield scrapy.Request("http://movie.douban.com/top250" +
                              nxt.attrs['href'],
                              callback=self.parse)
Exemple #6
0
 def parse(self, response):
     """Extract the ranked movie entries from one Top250 page and
     paginate via the "next" anchor."""
     entries = response.xpath(
         '//div[@class="article"]//ol[@class="grid_view"]/li')
     for entry in entries:
         douban_item = DoubanItem()
         douban_item['serial_number'] = entry.xpath(
             ".//div[@class='item']//em/text()").extract_first()
         douban_item['movie_name'] = entry.xpath(
             './/div[@class="hd"]//a/span[1]/text()').extract_first()
         # Collapse whitespace per text node; each iteration overwrites
         # 'introduce', so the last line of the blurb wins.
         for line in entry.xpath('.//div[@class="bd"]//p[1]/text()').extract():
             douban_item['introduce'] = "".join(line.split())
         douban_item['star'] = entry.xpath(
             './/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract_first()
         douban_item['evaluate'] = entry.xpath(
             './/div[@class="star"]/span[4]/text()').extract_first()
         douban_item['describe'] = entry.xpath(
             './/p[@class="quote"]/span/text()').extract_first()
         print(douban_item)
         yield douban_item
     next_link = response.xpath('//span[@class="next"]/a/@href').extract()
     if next_link:
         yield scrapy.Request("https://movie.douban.com/top250" + next_link[0],
                              callback=self.parse)
Exemple #7
0
    def parse(self, response):
        """Yield one item per movie info block, then request the next
        offset page until offset reaches 225."""
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            # BUG FIX: the original reused a single item across iterations,
            # so a movie without a quote silently inherited the previous
            # movie's quote.
            item = DoubanItem()
            # title
            item['title'] = each.xpath(
                ".//span[@class='title'][1]/text()").extract()[0]
            # description paragraph
            item['bd'] = each.xpath(
                ".//div[@class='bd']/p/text()").extract()[0]
            # rating
            item['star'] = each.xpath(
                ".//div[@class='star']/span[@class='rating_num']/text()"
            ).extract()[0]
            # quote (optional — default to an empty string)
            quote = each.xpath(".//p[@class='quote']/span/text()").extract()
            item['quote'] = quote[0] if quote else ''
            yield item

        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse)
Exemple #8
0
    def get_items(self, response):
        """Dispatch a JSON API response to the right handler:

        - ``subjects == []``  -> the payload is a movie quote (MySQL)
        - ``subjects is None`` -> the payload lists people (Mongo)
        - otherwise            -> the payload lists movie subjects (yield)
        """
        result = json.loads(response.text)
        subjects = result.get('res').get('subjects')
        if subjects == []:  # movie quote payload
            text = result.get('res').get('payload').get('text')
            title = result.get('res').get('subject').get('title')
            s = Sql()
            s.save_to_mysql(title, text)
        elif subjects is None:  # people (actors/directors) payload
            itemslist = result.get('res').get('people')
            item = DoubanProfession()
            item['category'] = result.get('res').get('payload').get('title')
            # BUG FIX: the original did `s = Sql` (the class, not an
            # instance), so save_to_mongo2 received the item as `self`.
            s = Sql()
            for entry in itemslist:
                for field in item.fields:
                    if field in entry:
                        item[field] = entry.get(field)
                s.save_to_mongo2(item)
        else:  # non-empty list: movie subjects payload
            item = DoubanItem()
            item['category'] = result.get('res').get('payload').get('title')
            for entry in subjects:
                for field in item.fields:
                    if field in entry:
                        item[field] = entry.get(field)
                yield item
Exemple #9
0
    def parse(self, response):
        """Parse one page of top-level reviews (20 per page).

        For each review, yields a request for its full JSON body and a
        request for its reply page.
        """
        titles = response.css(".main-bd > h2 > a::text").extract()
        commentIds = response.css("header > a.name::text").extract()
        data_cid = response.css("div::attr(data-cid)").extract()
        dates = response.css(".main-meta::attr(content)").extract()
        for idx, comment_id in enumerate(commentIds):
            cid = data_cid[idx]
            item = DoubanItem()
            item['parent'] = -1
            item['commentId'] = comment_id
            item['commentDate'] = dates[idx]
            item['data_cid'] = cid
            item['title'] = '悲惨世界'
            item['type'] = '图书'
            item['level'] = 1

            # Full review text comes from a separate JSON endpoint.
            rev_json = 'https://book.douban.com/j/review/%s/full' % cid
            yield scrapy.Request(rev_json,
                                 meta={'item': item, 'title': titles[idx]},
                                 callback=self.get_json_commentFull)

            # The review page itself holds the second-level replies.
            sub_comment_url = 'https://book.douban.com/review/%s/' % cid
            yield scrapy.Request(sub_comment_url,
                                 meta={'sub_comment_url': sub_comment_url,
                                       'data_cid': cid},
                                 callback=self.parse_sub_review)
Exemple #10
0
    def parse2(self, response):
        """Parse the movie list items on a Top250 page and follow the
        "next" pagination link."""
        entries = response.xpath(
            "//div[@class='article']//ol[@class='grid_view']/li")
        for entry in entries:
            # Populate the item structure declared in items.py.
            douban_item = DoubanItem()
            douban_item['serial_number'] = entry.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            douban_item['movie_name'] = entry.xpath(
                ".//div[@class='info']/div[@class='hd']/a/span[1]/text()"
            ).extract_first()
            # Hand the item off to the pipelines.
            yield douban_item
        print('*' * 20, response.request.headers['User-Agent'])
        next_link = response.xpath(
            "//span[@class='next']/link/@href").extract()
        if next_link:
            # NOTE(review): callback is self.parse, not self.parse2 —
            # kept as-is; confirm against the spider's other callbacks.
            yield scrapy.Request('http://movie.douban.com/top250' + next_link[0],
                                 callback=self.parse)
Exemple #11
0
 def parse_item(self, response):
     """Parse one category page of a Douban book list.

     Yields one item per book: category (lb), name (sm), authors (zz),
     translators (yz), score (pf) and blurb (jj).
     """
     x = Selector(response)
     # Category heading shared by every book on the page.
     lb = x.xpath('//div[@class="hd"]/h1/text()').extract()[0]
     # Raw author/translator HTML snippets, one per book.
     zz_yz = x.re('作者</span.*?</a></span></span></p>')
     x_k = x.xpath('//div[@class="info"]')
     sm = x_k.xpath('./div[@class="title"]/a/text()').extract()
     jj = x_k.xpath('./div[@class="article-desc-brief"]/text()').extract()
     for i in range(len(sm)):
         # BUG FIX: fresh item per book (the original mutated and
         # re-yielded one shared instance), and the dangling scaffold
         # `return i` after the loop has been removed.
         items = DoubanItem()
         items['lb'] = lb
         items['sm'] = sm[i]
         zz_k = re.findall('作者</span.*?</a></span></span>', zz_yz[i])
         # Drop the leading "作者" label match, keep the name(s).
         items['zz'] = re.findall('([〕〔\u4e00-\u9fa5·\s]{2,})', zz_k[0])[1:]
         yz_k = re.findall('译者</span.*?</a></span></span>', zz_yz[i])
         if not yz_k:
             items['yz'] = None
         else:
             items['yz'] = re.findall('([〕〔\u4e00-\u9fa5·]+)', yz_k[0])[1:]
         pf = x_k[i].xpath(
             './div/span[@class="rating-average"]/text()').extract()
         items['pf'] = pf[0] if pf else None
         items['jj'] = jj[i]
         yield items
 def parse(self, response):
     """Parse a JSON chart response filtered by the `type` query param.

     Roughly the first 25% of entries are flagged as test-corpus rows
     (property='1'); the rest get property='0'.
     """
     url = response.url
     args = urllib.parse.urlparse(url)
     params = urllib.parse.parse_qs(args.query, True)
     type_num = params.get('type')
     # BUG FIX: params.get() returns None when the param is absent; the
     # original `len(type_num) == 0` raised TypeError in that case.
     if not type_num:
         return
     movie_type = TYPE_SETTINGS.get(type_num[0])
     element_list = json.loads(response.body)
     # First 25% of the corpus is reserved as test data.
     test_num = len(element_list) * 0.25
     for element in element_list:
         item = DoubanItem()
         item['title'] = element.get('title')
         item['type_num'] = type_num[0]
         item['movie_type'] = movie_type
         types = element.get('types')
         item['types'] = '|'.join(types) if isinstance(types, list) else types
         regions = element.get('regions')
         item['regions'] = '|'.join(regions) if isinstance(regions, list) else regions
         item['url'] = element.get('url')
         item['score'] = element.get('score')
         item['vote_count'] = str(
             element.get('vote_count')) if element.get('vote_count') else ''
         item['mid'] = element.get('id')
         item['release_date'] = element.get('release_date')
         item['rank'] = str(
             element.get('rank')) if element.get('rank') else ''
         # BUG FIX: use > 0 so exactly the first 25% are flagged; the
         # original `>= 0` marked one extra element as test data.
         item['property'] = '1' if test_num > 0 else '0'
         test_num -= 1
         yield item
Exemple #13
0
    def parse(self, response, **kwargs):
        """Parse one page of the Douban movie-search JSON API and queue
        the next page until the API returns an empty "data" list."""
        movie_dict = json.loads(response.text)
        # Empty "data" means we ran off the end of the results.
        if len(movie_dict["data"]) == 0:
            return
        for one_movie in movie_dict["data"]:
            # BUG FIX: fresh item per movie — the original mutated and
            # re-yielded a single shared instance for every movie.
            item = DoubanItem()
            item["title"] = one_movie["title"]
            item["directors"] = one_movie["directors"]
            item["casts"] = one_movie["casts"]
            item["rate"] = one_movie["rate"]
            item["cover"] = one_movie["cover"]
            yield item

        # Queue the next page (20 results per page).
        url_next = 'https://movie.douban.com/j/new_search_subjects?tags=电影&start=%d&countries=中国大陆' % (self.currentPage * 20)
        self.currentPage += 1
        yield Request(url_next, headers=self.headers)
Exemple #14
0
    def parse(self, response):
        """Parse one page (25 rows) of the Douban book Top250 table and
        paginate by bumping self.index in steps of 25."""
        for i in range(1, 26):
            info = response.xpath(f"//div[@class='article']/div[@class='indent']/table[{i}]")
            # BUG FIX: fresh item per book instead of mutating one shared
            # instance across the loop.
            item = DoubanItem()
            item['book'] = info.xpath(".//div[@class='pl2']/a/@title")[0].extract()
            # Publication line looks like "author / ... / date / price";
            # split once instead of three times.
            meta = info.xpath(".//p[@class='pl']/text()")[0].extract().split(' / ')
            item['author'] = meta[0]
            item['time'] = meta[-2]
            price = meta[-1]
            if len(price) == 1:
                price = str(float(price))
            item['price'] = float(re.findall(r"\d+.*?\d+", price)[0])
            item['star'] = float(info.xpath(".//div[@class='star clearfix']/span[@class='rating_nums']/text()").extract()[0])
            tmp = info.xpath(".//div[@class='star clearfix']/span[@class='pl']/text()")[0].extract()
            item['mark_num'] = int(re.findall(r"\d+", tmp)[0])
            yield item

        if self.index < 225:
            self.index += 25
            self.url = f'https://book.douban.com/top250?start={self.index}'
            yield Request(self.url, callback=self.parse)
Exemple #15
0
 def parse_item(self, response):
     """Extract a movie's name, score and page URL into a DoubanItem.

     BUG FIX: dropped the dead `item = {}` that was immediately
     overwritten by the DoubanItem below.
     """
     item = DoubanItem()
     item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
     item['score'] = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
     item['link'] = response.url
     yield item
Exemple #16
0
 def parse_item(self, response):
     """Scrape a movie detail page: ranking, name, score, summary text,
     director and poster image URL."""
     item = DoubanItem()
     rank_text = response.xpath(
         '//div[@class="top250"]/span[@class="top250-no"]/text()').extract(
         )[0]
     # Keep only the digits of e.g. "No.17".
     item['ranking'] = int(re.sub(r'\D', '', rank_text))
     item['movie_name'] = response.xpath(
         '//div[@id="content"]/h1/span/text()').extract()[0]
     item['score'] = response.xpath(
         '//div[@class="rating_self clearfix"]/strong/text()').extract()[0]
     summary_parts = response.xpath(
         '//div[@id="link-report"]/span/text()').extract()
     item['info'] = "".join(summary_parts)
     item['doctor'] = response.xpath(
         '//span[@class="attrs"]/a/text()').extract()[0]
     item['img'] = response.xpath(
         '//div[@id="mainpic"]/a/img/@src').extract()[0]
     yield item
Exemple #17
0
    def parse(self, response):
        """Yield one item per movie info block; movies without a quote
        get an empty string. Pages through offsets up to 225."""
        movies = response.xpath('//div[@class="info"]')
        for each in movies:
            # BUG FIX: the original reused one item for every movie, so a
            # missing quote silently kept the previous movie's quote.
            item = DoubanItem()
            item['title'] = each.xpath(
                './/a/span[@class="title"][1]/text()').extract()[0]
            bd = each.xpath('.//div[@class="bd"]/p/text()').extract()[0]
            item['bd'] = bd.replace('\n', "").replace('    ', '')
            quote = each.xpath('.//p[@class="quote"]/span/text()').extract()
            item['quote'] = quote[0] if quote else ''
            item['star'] = each.xpath(
                './/div[@class="star"]//span[2]/text()').extract()[0]
            yield item

        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse)
Exemple #18
0
    def parse_sub_review(self, response):
        """Extract the embedded `'comments': [...]` JSON from a review
        page's inline script; yield a level-2 item per reply plus a
        request for each reply's own sub-replies."""
        parent_id = response.meta['data_cid']
        script = response.text
        d = re.search(r"'comments': (.*)", script)
        if d:
            # BUG FIX: use the capture group instead of
            # .strip("'comments':"), which strips *characters* from that
            # set (not the prefix string) and could eat JSON characters.
            json_text = d.group(1).rstrip(",")
            if not json_text:
                logging.info(response.meta['sub_comment_url'])
            else:
                info = json.loads(json_text)

                for review in info:
                    item = DoubanItem()
                    item['parent'] = parent_id
                    item['commentDate'] = review['create_time']
                    item['commentId'] = review['author']['name']
                    item['commentContent'] = review['text']
                    item['data_cid'] = review['id']
                    item['title'] = '悲惨世界'
                    item['type'] = '图书'
                    item['level'] = 2
                    yield item

                    sub_sub_url = f"https://book.douban.com/j/review/comment/{review['id']}/replies?start=0&count=500"
                    yield scrapy.Request(sub_sub_url,
                                         meta={'parent_id': review['id']},
                                         callback=self.parse_sub_sub_review)
    def parse(self, response):
        """Walk the movie <li> entries of a Top250 page and follow the
        next-page link when one exists."""
        for entry in response.xpath('//*[@id="content"]/div/div[1]/ol/li'):
            douban_item = DoubanItem()
            douban_item['number'] = entry.xpath(
                './div/div[1]/em/text()').extract_first()
            douban_item['name'] = entry.xpath(
                './div/div[2]/div[1]/a/span[1]/text()').extract_first()
            # Second text node of the blurb paragraph, whitespace collapsed.
            blurb = entry.xpath(
                './div/div[2]/div[2]/p[1]/text()').extract()[1].strip()
            douban_item['introduce'] = "".join(blurb.split())
            douban_item['star'] = entry.xpath(
                './div/div[2]/div[2]/div/span[2]/text()').extract_first()
            douban_item['evaluate'] = entry.xpath(
                './div/div[2]/div[2]/div/span[4]/text()').extract_first()
            douban_item['describe'] = entry.xpath(
                './div/div[2]/div[2]/p[2]/span/text()').extract_first()
            yield douban_item

        next_link = response.xpath(
            '//*[@id="content"]/div/div[1]/div[2]/span[3]/a/@href').extract()
        if next_link:
            print(next_link)
            yield scrapy.Request('https://movie.douban.com/top250' + next_link[0],
                                 callback=self.parse)
Exemple #20
0
class DouBanSpider(Spider):
    """Crawl the comment stream of one Douban movie, 20 comments a page."""

    name = 'dou_ban'
    allowed_domains = ['movie.douban.com']
    # Template URL; {} is the paging offset.
    start_urls = 'https://movie.douban.com/subject/27113517/comments?start={}&limit=20&sort=new_score&status=P'
    num = 0  # paging offset, advanced by 20 per page
    # Kept for backward compatibility with any external references;
    # parse_response now builds a fresh item per comment instead of
    # mutating this shared instance.
    item = DoubanItem()

    def start_requests(self):
        yield Request(self.start_urls.format(self.num), self.parse_response)

    def parse_response(self, response):
        """Extract user name, time and text for every comment on the
        page, then queue the next page."""
        soup = BeautifulSoup(response.text, 'lxml')
        selects = soup.find('div', id='comments').find_all('div')
        # The last <div> is pagination chrome, not a comment.
        for user in selects[:-1]:
            # BUG FIX: one item per comment; the original re-yielded a
            # single class-level item mutated in place, so every yielded
            # reference pointed at the last comment's data.
            item = DoubanItem()
            item['user_name'] = user.find(
                'span', class_='comment-info').a.get_text()
            span_time = user.find('span', class_='comment-time')
            item['time'] = span_time['title']
            item['comment'] = user.p.span.get_text()
            yield item
        self.num += 20
        yield Request(self.start_urls.format(self.num), self.parse_response)
Exemple #21
0
 def parse(self, response):
     """Parse every movie entry on a Top250 listing page, then schedule
     the next page if a "next" link exists."""
     for entry in response.xpath("//div[@class='article']//ol[@class='grid_view']/li"):
         # Item structure is defined in items.py.
         douban_item = DoubanItem()
         douban_item['serial_number'] = entry.xpath(".//div[@class='item']//em/text()").extract_first()
         douban_item['movie_name'] = entry.xpath(".//div[@class='info']//div[@class='hd']/a/span[1]/text()").extract_first()
         # Collapse internal whitespace per text node; the last one wins.
         for raw in entry.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract():
             douban_item['introduce'] = "".join(raw.split())
         douban_item['star'] = entry.xpath(".//span[@class='rating_num']/text()").extract_first()
         douban_item['evaluate'] = entry.xpath(".//div[@class='star']//span[4]/text()").extract_first()
         douban_item['describe'] = entry.xpath(".//p[@class='quote']/span/text()").extract_first()
         # Hand off to the item pipeline for cleaning/storage.
         yield douban_item
     next_link = response.xpath("//span[@class='next']/link/@href").extract()
     if next_link:
         # Re-enter this callback for the following page.
         yield scrapy.Request("https://movie.douban.com/top250" + next_link[0], callback=self.parse)
    def parse(self, response):
        """Yield rank/title/info/score/quote for each movie card, then
        page forward by 25 until offset 225."""
        movies = response.xpath("//div[@class='item']")
        for each in movies:
            # BUG FIX: create the item inside the loop so every yielded
            # item is an independent object rather than one shared,
            # repeatedly-mutated instance.
            item = DoubanItem()
            # rank
            item['rank'] = each.xpath("./div[@class='pic']/em/text()").extract()[0]
            info = each.xpath("./div[@class='info']")
            # title
            item['title'] = info.xpath(".//span[@class='title'][1]/text()").extract()[0]
            # first <p> line: cast; second line: year/country/genre
            item['actor_info'] = info.xpath("./div[@class='bd']/p/text()").extract()[0].replace('\n', '').replace(' ', '')
            item['movie_info'] = info.xpath("./div[@class='bd']/p/text()").extract()[1].replace('\n', '').replace(' ', '')
            # rating
            item['star'] = \
                info.xpath("./div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()").extract()[0]
            # quote (optional — default to empty string)
            quote = info.xpath("./div[@class='bd']/p[@class='quote']/span/text()").extract()
            item['quote'] = quote[0] if quote else ''

            yield item

        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Exemple #23
0
 def parse(self, response):
     """Parse the info blocks of one Top250 page and follow pagination."""
     selector = Selector(response)
     movies = selector.xpath('//div[@class="info"]')
     for each_movie in movies:
         # BUG FIX: fresh item per movie; the original mutated and
         # re-yielded one shared instance for the whole page.
         item = DoubanItem()
         # A title may be split over several spans (e.g. CN + EN names).
         title_parts = each_movie.xpath('div[@class="hd"]/a/span/text()').extract()
         movie_info = each_movie.xpath(
             'div[@class="bd"]/p[@class=""]/text()').extract()
         star = each_movie.xpath(
             'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract()[0]
         quote = each_movie.xpath(
             'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
         item['title'] = ''.join(title_parts)
         item['movieInfo'] = ';'.join(movie_info)
         item['star'] = star
         # The quote may be missing; default to an empty string.
         item['quote'] = quote[0] if quote else ''
         yield item
     next_link = selector.xpath('//span[@class="next"]/link/@href').extract()
     # The tenth (last) page has no next-page link.
     if next_link:
         yield Request(self.url + next_link[0], callback=self.parse)
Exemple #24
0
 def parse(self, response):
     """Iterate the movie list of one Top250 page, yielding an item per
     movie, then queue the next page when available."""
     entries = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
     for entry in entries:
         douban_item = DoubanItem()
         # extract_first() -> first match; extract() -> all matches.
         douban_item['number'] = entry.xpath(".//div[@class='item']//em/text()").extract_first()
         douban_item['name'] = entry.xpath(".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
         douban_item['star'] = entry.xpath(".//div[@class='info']/div[@class='bd']//span[@class='rating_num']/text()").extract_first()
         douban_item['evaluate'] = entry.xpath(".//div[@class='info']//div[@class='star']/span[4]/text()").extract_first()
         douban_item['describe'] = entry.xpath(".//div[@class='info']/div[@class='bd']//span[@class='inq']/text()").extract_first()
         # Whitespace-collapse each blurb text node; the last one wins.
         for raw in entry.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract():
             douban_item['introduce'] = "".join(raw.split())
         # Hand off to the pipelines.
         yield douban_item
     next_link = response.xpath("//span[@class='next']/link/@href").extract()
     if next_link:
         # Recurse into the next page via the same callback.
         yield scrapy.Request("http://movie.douban.com/top250" + next_link[0], callback=self.parse)
Exemple #25
0
 def content_parse(self, response):
     """Collect each user comment: name, rating, text, date and the link
     to the commenter's profile page."""
     for block in response.xpath('//div[@class="comment"]'):
         item = DoubanItem()
         name = block.xpath(
             './h3/span[@class="comment-info"]/a/text()').extract()[0]
         score = block.xpath(
             './h3/span[@class="comment-info"]/span[2]').attrib.get('title')
         # Newlines inside the span break plain-text export, so keep the
         # comment with its tags and strip them in post-processing.
         comment = block.xpath('./p/span[@class="short"]').extract()[0]
         date = block.xpath(
             './h3/span[@class="comment-info"]/span[@class="comment-time "]/text()'
         ).extract()[0].strip()
         # Profile link, used elsewhere to fetch the user's home city.
         href = block.xpath(
             './h3/span[@class="comment-info"]/a/@href').extract()[0]
         item['name'] = name
         # An unrated comment puts the timestamp in span[2]; a rating
         # title is short, so use the length to tell them apart.
         item['score'] = score if len(score) < 5 else '--'
         item['comment'] = comment
         item['date'] = date
         item['href'] = href
         yield item
    def parse(self, response):
        """Parse a Top250 page with lxml.etree: yield rank, title, cover
        image URL and comment count per movie, then recurse into the
        next page."""
        html = etree.HTML(response.text)
        li_list = html.xpath("//ol[@class='grid_view']/li")
        for li in li_list:
            item = DoubanItem()
            # xpath() always returns a list, hence the [0] indexing.
            item['em'] = li.xpath(".//em/text()")[0]
            # BUG FIX: the original swapped these two assignments, putting
            # the image URL in 'title' and the title text in 'img'.
            item['title'] = li.xpath(".//span[@class='title']/text()")[0]
            item['img'] = li.xpath(".//img/@src")[0]
            item['comment'] = li.xpath(".//div[@class='star']/span/text()")[-1]
            yield item

        # The last page has no next-page href, which raises IndexError.
        try:
            next_page = html.xpath("//span[@class='next']/a/@href")[0]
            print("一页循环完毕,进入下一页" + "-" * 100)
            # callback=self.parse passes the function object — no call.
            yield scrapy.Request(url='https://movie.douban.com/top250' +
                                 next_page,
                                 callback=self.parse)
        except IndexError:  # narrowed from a bare except
            print("下载完毕!")
Exemple #27
0
    def parse(self, response):
        """Parse the Douban JSON list API.

        For each entry in the 'data' array, build a partially-filled
        DoubanItem and schedule a detail-page request that carries the
        item in request.meta for parse_video to complete.
        """
        json_obj = json.loads(response.text)
        # BUGFIX: the original tested `'data' not in response.text` (a raw
        # substring check) and then accessed json_obj['data'] anyway,
        # crashing with KeyError after logging. Test the parsed object and
        # bail out early instead.
        if 'data' not in json_obj:
            logging.error('No data in json api:' + response.url)
            return
        for result in json_obj['data']:
            # BUGFIX: predefine url so the except handler below can never
            # hit a NameError when result['url'] itself raises.
            url = ''
            try:
                item = DoubanItem()
                url = utils.get_real_url(result['url'])
                item['url'] = url
                item['dType'] = 'META_VIDEO_S1'
                item['type'] = response.meta['type']
                item['vTitle'] = result['title']
                item['vScore'] = result['rate']
                item['vCoverUrl'] = result['cover']
                # first director when the list is non-empty, else ''
                item['vDirector'] = result['directors'][0] if result['directors'] else ''
                item['vStars'] = '/'.join(result['casts'])

                # Detail request: keep the session cookie jar and attach
                # the partially-filled item for the detail callback.
                request = scrapy.Request(
                    response.urljoin(url),
                    meta={'cookiejar': response.meta['cookiejar']},
                    callback=self.parse_video,
                    errback=self.err_back)
                request.meta['item'] = item
                yield request
            except Exception:
                # Best-effort per entry: log the failing result and move on.
                logging.error('Item in json api error:' + response.url +
                              ' item url:' + url)
                continue
    def parse(self, response):
        """Parse one Top250 page and schedule the next page.

        Yields one DoubanItem per movie, then requests the next page by
        offset until offset 225 (250 entries, 25 per page).
        """
        # 找出每部电影总的div  并进行遍历  再取详细信息
        for each in response.xpath("//div[@class='info']"):
            # BUGFIX: create a fresh item per movie. The original built a
            # single DoubanItem before the loop and mutated/re-yielded it,
            # so all yielded items aliased the same object.
            item = DoubanItem()
            title = each.xpath(
                'div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()
            info = each.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title[0]
            item['content'] = content[0]
            item['score'] = score[0]
            item['info'] = info  # NOTE: kept as a list, matching the original

            yield item

        # BUGFIX: pagination moved out of the per-movie loop — the original
        # bumped self.start and requested the next page once per movie
        # (25 times per page). 排行榜共250条 每25条一页
        if self.start <= 225:
            self.start += 25
            url = self.url + str(self.start) + self.end
            yield scrapy.Request(url, callback=self.parse)
Exemple #29
0
 def parse(self, response):
     """Parse one Top250 page.

     Yields one DoubanItem per movie with rank, name, condensed
     description, rating, vote count and quote, then follows the
     next-page link when present.
     """
     movie_list = response.xpath(
         "//div[@class='article']//ol[@class='grid_view']/li")
     for i_item in movie_list:
         douban_item = DoubanItem()
         douban_item['serial_number'] = i_item.xpath(
             ".//div[@class='item']//em/text()").extract_first()
         douban_item['movie_name'] = i_item.xpath(
             ".//div[@class='info']/div[@class='hd']/a/span[1]/text()"
         ).extract_first()
         content = i_item.xpath(
             ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract(
             )
         # BUGFIX: the original loop overwrote 'introduce' on every
         # iteration, keeping only the LAST description line. Strip the
         # whitespace from each fragment and join them all instead.
         douban_item['introduce'] = "".join(
             "".join(fragment.split()) for fragment in content)
         douban_item['star'] = i_item.xpath(
             ".//span[@class='rating_num']/text()").extract_first()
         douban_item['evaluate'] = i_item.xpath(
             ".//div[@class='star']//span[4]/text()").extract_first()
         douban_item['describe'] = i_item.xpath(
             ".//p[@class='quote']/span/text()").extract_first()
         yield douban_item
     # The last page has no next-page href, so extract() returns [].
     next_link = response.xpath(
         "//span[@class='next']/link/@href").extract()
     if next_link:
         yield scrapy.Request("https://movie.douban.com/top250" +
                              next_link[0],
                              callback=self.parse)
Exemple #30
0
    def parse(self, response):
        """Parse one Top250 page and follow the next-page link.

        Yields one DoubanItem per movie (rank, name, condensed intro,
        rating, vote count, quote), then schedules the next page once.
        """
        # 循环电影条目
        movie_list = response.xpath('//div[@class="article"]//ol[@class="grid_view"]/li')
        for i_item in movie_list:
            # item 文件导进来
            douban_item = DoubanItem()
            # 写详细的xpath,进行数据的解析
            douban_item['seria_number'] = i_item.xpath('.//div[@class="pic"]/em/text()').extract_first()
            douban_item['movie_name'] = i_item.xpath('.//div[@class="hd"]/a/span[1]/text()').extract_first()
            content = i_item.xpath('.//div[@class="bd"]/p[1]/text()').extract()
            # BUGFIX: the original ran the same xpath twice (unused
            # 'contents') and overwrote 'introduce' per line, keeping only
            # the last one — strip and join every description line instead.
            douban_item['introduce'] = "".join("".join(line.split()) for line in content)
            douban_item['star'] = i_item.xpath('.//div[@class="bd"]/div[@class="star"]/span[2]/text()').extract_first()
            douban_item['evaluate'] = i_item.xpath('.//div[@class="bd"]/div[@class="star"]/span[4]/text()').extract_first()
            douban_item['discribtion'] = i_item.xpath('.//div[@class="bd"]/p[@class="quote"]/span[1]/text()').extract_first()
            # extract()是提取选择器里面的内容,不加则只是一个选择器Selector
            # 最后将数据yield到pipelines里面去
            yield douban_item

        # BUGFIX: pagination moved out of the per-movie loop — the original
        # re-extracted and requested the next page once per movie.
        next_link = response.xpath('//span[@class="next"]/link/@href').extract()
        if next_link:
            yield scrapy.Request('https://movie.douban.com/top250' + next_link[0], callback=self.parse)