Example #1
    def parse2(self, response):
        # open_in_browser(response)
        meta = response.meta
        comments = response.xpath('//div[@class="comment"]')
        if comments:
            for comment in comments:
                # Create a fresh item per comment instead of reusing one instance
                item = DoubanspiderItem()
                item["name"] = meta["keyword"]
                item["content"] = comment.xpath(
                    './p/text()').extract()[0].strip()
                # item["time"] = comment.xpath('./descendant::span[@class="comment-time "][1]/text()').extract()[0].strip()

                yield item

            next_page = response.xpath(
                '//div[@id="paginator"]/a[last()]/@href').extract()
            if len(next_page) != 0:
                next_page = next_page[0]

                yield scrapy.Request(meta['prefix'] + next_page,
                                     callback=self.parse2,
                                     meta={
                                         "keyword": meta['keyword'],
                                         'prefix': meta['prefix']
                                     })
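
`parse2` expects `keyword` and `prefix` to arrive in `response.meta`. A minimal sketch of how the initial request might be issued, assuming a hypothetical start URL and keyword (neither is shown in the original):

import scrapy


class CommentSpider(scrapy.Spider):
    name = 'douban_comments'

    def start_requests(self):
        # Placeholder values; the real spider defines these elsewhere.
        prefix = 'https://movie.douban.com'
        keyword = 'example'
        yield scrapy.Request(prefix + '/subject/1292052/comments',
                             callback=self.parse2,
                             meta={'keyword': keyword, 'prefix': prefix})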
Example #2
 def parse(self, response):
     if not self.start_urls or self.max_page is None or not self.search_str_list:
         print("Invalid parameters!!!")
         return
     for each in response.xpath('//*/table[@class="olt"]/tr'):
         url = each.xpath('td[1]/a/@href').extract_first()
         title = each.xpath('td[1]/a/@title').extract_first()
         time = each.xpath('td[4][@class="time"]/text()').extract_first()
         if url is None or title is None or time is None or not any(
                 keyword in title for keyword in self.search_str_list):
             continue
         item = DoubanspiderItem()
         item['url'] = url
         item['title'] = title
         item['time'] = time
         self.count += 1
         yield item
     nexturl = response.xpath(
         '//*/span[@class="next"]/a/@href').extract_first()
     cur_page = response.xpath(
         '//*/span[@class="thispage"]/text()').extract_first()
     if nexturl and cur_page and int(cur_page) < int(self.max_page):
         yield scrapy.Request(url=nexturl, callback=self.parse)
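
This `parse` relies on several spider attributes that the snippet never defines. A plausible sketch of the missing pieces, with placeholder values:

import scrapy


class GroupSpider(scrapy.Spider):
    # Hypothetical attributes matching what parse() expects above.
    name = 'douban_group'
    start_urls = ['https://www.douban.com/group/example/discussion?start=0']
    max_page = 10                  # stop following "next" links after this page
    search_str_list = ['keyword']  # keep only posts whose title contains one of these
    count = 0                      # running total of matched posts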
Example #3
    def parse(self, response):

        movies = response.xpath('//div[@class="info"]')

        for each in movies:
            # Create a fresh item per movie instead of reusing one instance
            item = DoubanspiderItem()
            title = each.xpath(
                'div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()
            info = each.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title[0]
            # Join every element of the content list into one string, separated by ';'
            item['content'] = ';'.join(content)
            item['score'] = score[0]
            # Some entries have no quote; guard against an empty list
            item['info'] = info[0] if info else ''

            # Yield the item

            yield item

        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end,
                                 callback=self.parse)
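
The pagination above depends on `self.start`, `self.url`, and `self.end`, which are defined elsewhere on the spider. A sketch of plausible values, assuming the Top 250 list URL (placeholders, not the original definitions):

import scrapy


class MovieSpider(scrapy.Spider):
    # Hypothetical attributes matching the pagination logic above.
    name = 'douban_movie'
    url = 'https://movie.douban.com/top250?start='
    start = 0
    end = '&filter='
    start_urls = [url + str(start) + end]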
Example #4
    def parse(self, response):
        selector = Selector(response)
        movies = selector.xpath('//div[@class="info"]')
        for eachmovie in movies:
            # Create a fresh item per movie instead of reusing one instance
            item = DoubanspiderItem()
            # Use relative paths (leading '.'); a bare '//' searches the
            # whole document on every iteration instead of this node
            title = eachmovie.xpath(
                './/div[@class="hd"]/a/span/text()').extract()
            fulltitle = ''.join(title)

            moviesinfo = eachmovie.xpath(
                './/div[@class="bd"]/p/text()').extract()
            # star = eachmovie.xpath('.//div[@class="bd"]/div[@class="star"]/span/em/text()').extract()[0]
            quote = eachmovie.xpath(
                './/div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            if quote:
                quote = quote[0]
            else:
                quote = ''
            item['title'] = fulltitle
            item['movieInfo'] = ';'.join(moviesinfo)
            # item['star'] = star
            item['quote'] = quote
            yield item
        nextlinks = selector.xpath(
            '//span[@class="next"]/link/@href').extract()
        if nextlinks:
            nextlinks = nextlinks[0]
            print(nextlinks)
            yield Request(self.url + nextlinks, callback=self.parse)
Example #5
    def parse(self, response):

        for each in response.xpath(
                '//div[@id="content"]//div[@class="article"]//ol//div[@class="item"]'
        ):
            item = DoubanspiderItem()

            item['title'] = each.xpath(
                './/div[@class="info"]//div[@class="hd"]//span[1]/text()'
            ).extract()[0]

            item['posterLink'] = each.xpath(
                './/div[@class="pic"]//img/@src').extract()[0]

            # item['bd'] = each.xpath('.//div[@class="info"]//div[@class="bd"]/p/text()').extract()[0]

            # item['star'] = each.xpath('.//div[@class="info"]//div[@class="bd"]//div[@class="star"]//span[@class="rating_num"]/text()').extract()[0]

            # quote = each.xpath('.//div[@class="info"]//div[@class="bd"]//p[@class="quote"]//span/text()').extract()

            # if len(quote) != 0:
            #     item['quote'] = quote[0]

            yield item

        if self.offset < 25:
            self.offset += 25

            yield scrapy.Request(self.urls + str(self.offset),
                                 callback=self.parse)
Example #6
 def parse(self, response):
     # The response body is JSON, not HTML (requires `import json`)
     result = json.loads(response.text)
     for data in result["subjects"]:
         item = DoubanspiderItem()
         item['id'] = data['id']
         item['title'] = data['title']
         item['url'] = data['cover']
         item['rate'] = data['rate']
         yield item
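
This callback assumes a JSON endpoint whose payload contains a `subjects` array with `id`, `title`, `cover`, and `rate` keys. A sketch of a start URL that fits that shape; the exact endpoint and query parameters are an assumption, not shown in the original:

import scrapy


class DoubanJsonSpider(scrapy.Spider):
    name = 'douban_json'
    # Assumed listing endpoint returning {"subjects": [...]}
    start_urls = [
        'https://movie.douban.com/j/search_subjects'
        '?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=20&page_start=0'
    ]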
Example #7
 def parse(self, response):
     # print(response.body)
     item = DoubanspiderItem()
     url = "https://www.douban.com/accounts/login"
     data = {
         'redir': 'https://www.douban.com',
         'form_email': '',
         'form_password': ''
     }
     header = {
         'Host': 'accounts.douban.com',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:49.0) Gecko/20100101 Firefox/49.0',
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
         'Accept-Encoding': 'gzip, deflate, br',
         'Referer': 'https://www.douban.com/accounts/login',
         'Content-Type': 'application/x-www-form-urlencoded',
         'Content-Length': '126'
     }
     data['form_email'] = "15711057804"
     data['form_password'] = "******"
     s = requests.Session()
     text = s.get(url).text
     if '请输入上图中的单词' in text:  # a captcha is present
         page = etree.HTML(text)
         img = page.xpath('//img[@id="captcha_image"]/@src')  # captcha image URL
         captcha_id = page.xpath(
             '//div[@class="captcha_block"]/input[@type="hidden"]/@value'
         )  # hidden captcha id required for login
         pic = requests.get(img[0])
         with open(
                 '/home/hadoop/文档/HiData/青软实训/python/PythonProject/Spider/doubanSpider/checkMa.jpg',
                 'wb') as f:
             for chunk in pic.iter_content(1024):
                 if chunk:
                     f.write(chunk)
         captcha = input('Enter the captcha shown in the image: ')
         print(captcha)
         data['captcha-solution'] = captcha
         data['captcha-id'] = captcha_id[0]
     p = s.post(url, headers=header, data=data)
     if '的帐号' in p.text:
         print('Login succeeded')
         print("s.cookies:", s.cookies)
         next_url = "https://www.douban.com/mine/"
         cont = s.get(next_url).text
         print("cont:", cont)
         item = self.parse2(cont, s, item)
         return item
         # yield Request(url=next_url, meta={'item': item}, cookies=cookie, headers=user_agent, callback=self.parse2)
     else:
         print('Login failed')
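
Driving the login with a blocking `requests.Session` inside a Scrapy callback bypasses Scrapy's scheduler and cookie handling. A sketch of the more idiomatic route using `scrapy.FormRequest.from_response`; the form field names follow the snippet above and should be treated as assumptions about the login form:

import scrapy


class LoginSpider(scrapy.Spider):
    name = 'douban_login'
    start_urls = ['https://www.douban.com/accounts/login']

    def parse(self, response):
        # Submit the login form through Scrapy so the session cookies
        # stay on the crawler instead of a separate requests.Session.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'form_email': 'user@example.com',  # placeholder
                      'form_password': 'password'},      # placeholder
            callback=self.after_login)

    def after_login(self, response):
        if '的帐号' in response.text:
            self.logger.info('Login succeeded')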
Example #8
 def parse_Douban(self, response):
     # return
     print(response.url)
     for i in range(0, 25):  # indexes 25 parallel whole-page extracts; you could also iterate the parent nodes and take child nodes directly
         item = DoubanspiderItem()
         item["title"] = response.xpath(
             "//div[@class='item']//a/span[1]/text()").extract()[i]
         item["bd"] = response.xpath(
             "//div[@class='info']/div[@class='bd']/p[1]/text()").extract(
             )[i]
         item["star"] = response.xpath(
             "//span[@class='rating_num']/text()").extract()[i]
         item["quote"] = response.xpath("//p[@class='quote']").extract()[i]
         yield item
Example #9
 def parse(self, response):
     for i in range(0, 25):
         item = DoubanspiderItem()
         item["title"] = response.xpath(
             "//div[@class='item']//a/span[1]/text()").extract()[i]
         item["bd"] = response.xpath(
             "//div[@class='info']/div[@class='bd']/p[1]/text()").extract(
             )[i]
         item["star"] = response.xpath(
             "//span[@class='rating_num']/text()").extract()[i]
         item["quote"] = response.xpath("//p[@class='quote']").extract()[i]
         yield item
     # Note: no stop condition here, so pagination continues until requests
     # fail; the step of 10 also differs from the 25 items read per page.
     self.page += 10
     url = self.base_url + str(self.page)
     yield scrapy.Request(url, callback=self.parse)
Example #10
    def parse3(self, response):
        meta = response.meta
        contents = response.xpath('//div[@class="short-content"]')
        if contents:
            for content in contents:
                # Create a fresh item per review instead of reusing one instance
                item = DoubanspiderItem()
                item['name'] = meta['keyword']
                item['content'] = content.xpath(
                    './text()').extract()[0].strip()
                yield item

            next_page = response.xpath('//link[@rel="next"]/@href').extract()
            if len(next_page) != 0:
                next_page = next_page[0]

                yield scrapy.Request(url=meta['prefix'] + next_page,
                                     callback=self.parse3,
                                     meta=meta)
Example #11
    def parse(self, response):

        movies = response.xpath('//div[@class="info"]')

        for movie in movies:
            # Create a fresh item per movie instead of reusing one instance
            item = DoubanspiderItem()

            name = movie.xpath('div[@class="hd"]/a/span/text()').extract()

            message = movie.xpath('div[@class="bd"]/p/text()').extract()

            star = movie.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()

            number = movie.xpath(
                'div[@class="bd"]/div[@class="star"]/span/text()').extract()

            quote = movie.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            if quote:
                quote = quote[0]
            else:
                quote = ''

            item['title'] = ''.join(name)
            item['info'] = quote
            item['score'] = star[0]
            item['content'] = ';'.join(message).replace(' ',
                                                        '').replace('\n', '')
            item['number'] = number[1].split('人')[0]

            # Yield the item
            yield item

        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end,
                                 callback=self.parse)
Example #12
    def parse(self, response):
        selectors = response.xpath("//*[@id='content']/div/div[1]/ol/li")
        for selector in selectors:

            name = selector.xpath(".//span[@class='title']/text()").extract()
            name = name[0]
            xiangqing = selector.xpath(
                ".//div[@class='bd']/p/text()[2]").extract()
            xiangqing = xiangqing[0].strip()
            index2 = xiangqing.find('/', xiangqing.find('/', 2) + 1)
            juqing = xiangqing[index2 + 1:]
            year = re.findall('(.*?)/', xiangqing)[0].strip()
            city = re.findall('(.*?)/', xiangqing)[1].strip()
            zhiyuan = selector.xpath(
                ".//div[@class='bd']/p/text()[1]").extract()
            zhiyuan = zhiyuan[0].strip()
            index1 = zhiyuan.find(':', 1)
            index2 = zhiyuan.find('主', 1)
            index3 = zhiyuan.find(':', index1 + 1)
            daoyan = zhiyuan[index1 + 1:index2]
            zhuyan = zhiyuan[index3 + 1:]
            pinglun = selector.xpath(
                './/div[@class="star"]/span[4]/text()').extract()
            link = selector.xpath(".//div[@class ='hd']/a/@href").extract()
            pingfen = selector.xpath(
                './/div[@class="star"]/span[2]/text()').extract()
            # All parsed fields (only name and link are yielded below)
            items = {
                'movie name': name,
                'movie link': link,
                'release year': year,
                'release region': city,
                'plot': juqing,
                'director': daoyan,
                'cast': zhuyan,
                'douban rating': pingfen
            }
            item = DoubanspiderItem()
            item['name'] = name
            item['link'] = link
            yield item
Example #13
    def parse(self, response):
        movieInfo = response.xpath('//div[@class="info"]')
        for movie in movieInfo:
            # Create a fresh item per movie instead of reusing one instance
            item = DoubanspiderItem()
            item['title'] = movie.xpath(
                './div[@class="hd"]/a/span[@class="title"]/text()')[0].extract(
                )
            item['score'] = movie.xpath(
                './div[@class="bd"]/div[@class="star"]/span/text()'
            )[0].extract()
            content = movie.xpath('./div[@class="bd"]/p/text()')
            item['content'] = content[0].extract().strip(
            ) if len(content) > 0 else 'NULL'
            info = movie.xpath(
                './div[@class="bd"]/p[@class="quote"]/span/text()')
            item['info'] = info[0].extract() if len(info) > 0 else 'NULL'

            yield item

        if self.offset <= 225:
            self.offset += 25
            url = self.url + str(self.offset) + self.end
            yield scrapy.Request(url, callback=self.parse)
Example #14
    def parse(self, response):
        print(response.url)
        movies = response.xpath('//div[@class="info"]')

        for movie in movies:
            movie_name = movie.xpath(
                './div[@class="hd"]/a/span[1]/text()').extract()[0]
            movie_info = movie.xpath('./div[@class="bd"]/p[1]/text()').extract(
            )[0].strip() + ' / ' + movie.xpath(
                './div[@class="bd"]/p[1]/text()').extract()[1].strip()
            movie_rating = movie.xpath(
                './div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            movie_quote = movie.xpath(
                './div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            if movie_quote:
                movie_quote = movie_quote[0]
            else:
                movie_quote = ''

            item = DoubanspiderItem()

            item['movie_name'] = movie_name
            item['movie_info'] = movie_info
            item['movie_rating'] = movie_rating
            item['movie_quote'] = movie_quote

            yield item
        # Run the pagination regex once and reuse the match
        match = re.search(r'top250\?(\D*)(\d+)', response.url)
        page = int(match.group(2))
        s = match.group(1)
        if s == '':
            s = 'start='

        if page <= 225:
            url = re.sub(r'top250\?(\D*)(\d+)', 'top250?' + s + str(page + 25),
                         response.url)
            yield scrapy.Request(url, callback=self.parse)
Example #15
    def parse(self, response):
        item = DoubanspiderItem()

        item['posterLink'] = 'https://img3.doubanio.com/view/photo/m/public/p480747492.webp'
        yield item
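
Every example above fills a DoubanspiderItem, but the item class itself never appears. A minimal sketch of what items.py might declare, inferred from the field names used across the snippets (the real definition may differ):

import scrapy


class DoubanspiderItem(scrapy.Item):
    # Fields inferred from the examples above; this is an assumption,
    # not the project's actual items.py.
    name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    score = scrapy.Field()
    info = scrapy.Field()
    url = scrapy.Field()
    time = scrapy.Field()
    quote = scrapy.Field()
    posterLink = scrapy.Field()
    # ...plus rate, star, bd, link, number, movieInfo, movie_name, etc.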