Example 1
    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
        item['url'] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()

        # The hit and comment counts need regex matching
        pat_hits = r'click(.*?)<'
        pat_comment = r'comment(.*?)<'
        filter_first1 = re.compile(pat_hits).findall(str(response.body))
        filter_first2 = re.compile(pat_comment).findall(str(response.body))
        pat_hits = r'>(\d*?)<'
        pat_comment = r'>(\d*?)<'
        item['hits'] = re.compile(pat_hits).findall(str(filter_first1))
        item['comment'] = re.compile(pat_comment).findall(str(filter_first2))

        yield item

        # Get the total page count
        pat = r'blog.hexun.com/p(\d*?)/'
        data = re.compile(pat).findall(str(response.body))  # -> returns a list
        # if len(data) >= 2:
        #     totalurl = data[-2]
        # else:
        #     totalurl = 1
        # print('Total ' + str(totalurl) + ' pages')

        for i in range(2, 10):  # the page count is hard-coded; see the sketch below
            # Build the next URL to crawl
            nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            yield Request(nexturl, callback=self.parse, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'
            })
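
The loop above hard-codes ten pages; the commented-out lines sketch the intended fix, which most of the later examples implement: the page-number regex returns a list whose second-to-last element is the total page count. A minimal helper along those lines (the name total_pages is illustrative):

import re

def total_pages(body):
    # Pagination links look like http://<uid>.blog.hexun.com/p<N>/;
    # per the later examples, the second-to-last match is the page total.
    matches = re.findall(r'blog\.hexun\.com/p(\d+)/', str(body))
    return int(matches[-2]) if len(matches) >= 2 else 1

With it, the loop becomes for i in range(2, total_pages(response.body) + 1).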
Example 2
 def parse(self, response):
     articles = response.xpath("//div[@class='Article']")
     for article in articles:
         # A fresh item per article; the posted snippet reused one item,
         # queried `articles` instead of `article`, and never extracted
         item = HexunpjtItem()
         item['name'] = article.xpath("./span[@class='ArticleTitleText']/a/text()").extract_first()
         print(item['name'])
         yield item
Example 3
 def parse(self, response):
     item = HexunpjtItem()
     item['name'] = response.xpath(".//span[@class='ArticleTitleText']/a/text()").extract()
     item['url'] = response.xpath(".//span[@class='ArticleTitleText']/a/@href").extract()
     # Get all the class='ArticleInfo' nodes
     clicks = response.xpath(".//div[@class='ArticleInfo']")
     # Empty lists to collect the hit and comment counts
     hit = []
     comment = []
     for click in clicks:
         # Get the article's click id, e.g. 'click116597112'
         click_id = click.xpath("./span/@id").extract()
         # Drop the 'click' prefix to keep just the numeric id
         click_id = click_id[0][len('click'):]
         hcurl = "http://click.tool.hexun.com/linkclick.aspx?blogid=19020056&articleids=" + click_id
         # print(hcurl)
         r = requests.get(hcurl, headers=headers).text
         # print(r)
         # par2 matches the hit count, par3 the comment count
         par2 = r"click\d*?','(\d*?)'"
         par3 = r"comment\d*?','(\d*?)'"
         hit.append(re.compile(par2).findall(str(r)))
         comment.append(re.compile(par3).findall(str(r)))
     item['hit'] = hit
     item['comment'] = comment
     # page is the total number of pages for this blog
     page = response.xpath("//div[@class='PageSkip_1']/a[5]/text()").extract()
     for i in range(2,int(page[0])+1):
         next_url = "http://fjrs168.blog.hexun.com/p"+str(i)+"/default.html"
         print(next_url)
         yield Request(next_url, callback=self.parse, headers=headers)
     yield item
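
Example 3 calls requests.get() once per article inside parse(), which blocks Scrapy's event loop. A non-blocking sketch of the same flow: let Scrapy fetch the counts URL itself and carry the half-built item in the request meta. parse_counts and the 'item' meta key are names chosen here, HexunpjtItem is the project's item class from the examples, and both methods would live on the same spider class.

import re
from scrapy import Request

def parse(self, response):
    item = HexunpjtItem()
    item['name'] = response.xpath(".//span[@class='ArticleTitleText']/a/text()").extract()
    span_id = response.xpath(".//div[@class='ArticleInfo']/span/@id").extract_first()
    click_id = span_id[len('click'):]
    hcurl = "http://click.tool.hexun.com/linkclick.aspx?blogid=19020056&articleids=" + click_id
    # Hand the counts URL to Scrapy instead of blocking on requests.get()
    yield Request(hcurl, callback=self.parse_counts, meta={'item': item})

def parse_counts(self, response):
    item = response.meta['item']
    item['hit'] = re.findall(r"click\d*?','(\d*?)'", response.text)
    item['comment'] = re.findall(r"comment\d*?','(\d*?)'", response.text)
    yield item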
Example 4
 def parse(self, response):
     item = HexunpjtItem()
     item['name'] = response.xpath(
         "//span[@class='ArticleTitleText']/a/text()").extract()
     item["url"] = response.xpath(
         "//span[@class='ArticleTitleText']/a/@href").extract()
     # Next, use urllib and re to get each post's comment and hit counts
     # First, the regex that extracts the URL storing those counts
     pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
     # hcurl is the URL that serves the comment and hit counts
     hcurl = re.compile(pat1).findall(str(response.body))[0]
     # Pose as a browser
     headers2 = (
         "User-Agent",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
     )
     opener = urllib.request.build_opener()
     opener.addheaders = [headers2]
     # Install the opener globally
     urllib.request.install_opener(opener)
     # data holds the hit and comment counts for every post on this list page
     data = urllib.request.urlopen(hcurl).read()
     # pat2 extracts a post's hit count
     pat2 = r"click\d*?','(\d*?)'"
     # pat3 extracts a post's comment count
     pat3 = r"comment\d*?','(\d*?)'"
     # Extract the counts and assign them to the item's hits and comment fields
     item["hits"] = re.compile(pat2).findall(str(data))
     item["comment"] = re.compile(pat3).findall(str(data))
     yield item
     # Extract the total number of post-list pages
     pat4 = "blog.hexun.com/p(.*?)/"
     # The regex returns a list; the second-to-last element is the total page count
     data2 = re.compile(pat4).findall(str(response.body))
     if len(data2) >= 2:
         totalurl = data2[-2]
     else:
         totalurl = 1
     # The next print is handy while debugging; leave it commented out in normal runs
     # print("Total " + str(totalurl) + " pages")
     # Loop over the remaining post-list pages
     for i in range(2, int(totalurl) + 1):
         # Build the next URL and crawl the following post-list page
         nexturl = "http://" + str(
             self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
         # Crawl the next page, again posing as a browser
         yield Request(
             nexturl,
             callback=self.parse,
             headers={
                 'User-Agent':
                 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
             })
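
Example 4 and most of the examples that follow call urllib.request.install_opener(), which replaces the default opener for the whole process. The opener can also be used directly, avoiding the global side effect. A minimal sketch (fetch_counts is an illustrative name):

import re
import urllib.request

def fetch_counts(hcurl, user_agent):
    # Same fetch as Example 4, but the opener is called directly instead
    # of being installed globally with install_opener()
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", user_agent)]
    data = opener.open(hcurl).read().decode("utf-8", errors="ignore")
    hits = re.findall(r"click\d*?','(\d*?)'", data)
    comments = re.findall(r"comment\d*?','(\d*?)'", data)
    return hits, comments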
Example 5
    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath(
            '//span[@class="ArticleTitleText"]/a/text()').extract()
        item['url'] = response.xpath(
            '//span[@class="ArticleTitleText"]/a/@href').extract()
        # content = response.xpath('//div[@class="ArticleSubstanceText"]/text()')
        # Next, use urllib and re to get each post's comment and hit counts
        # First, the regex that extracts the URL storing those counts
        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
        # hcur1 is the URL that serves the comment and hit counts
        hcur1 = re.compile(pat1).findall(str(response.body))[0]
        # Pose as a browser (the Gecko token in the posted snippet was garbled;
        # this matches the UA used further down in this example)
        headers2 = (
            'User-Agent',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'
        )
        opener = urllib.request.build_opener()
        opener.addheaders = [headers2]
        # Install the opener globally
        urllib.request.install_opener(opener)
        # data holds the hit and comment counts for every post on this list page
        data = urllib.request.urlopen(hcur1).read()
        # pat2 extracts a post's hit count
        pat2 = r"click\d*?','(\d*?)'"
        # pat3 extracts a post's comment count
        pat3 = r"comment\d*?','(\d*?)'"
        # Extract the counts and assign them to the item
        item['hits'] = re.compile(pat2).findall(str(data))
        item['comment'] = re.compile(pat3).findall(str(data))
        yield item
        # Extract the total number of post-list pages
        pat4 = 'blog.hexun.com/p(.*?)/'
        # The regex returns a list; the second-to-last element is the total page count
        data2 = re.compile(pat4).findall(str(response.body))
        if len(data2) >= 2:
            totalurl = int(data2[-2])
        else:
            totalurl = 1

        for i in range(2, totalurl + 1):
            # Build the next URL to crawl
            nexturl = 'http://' + str(
                self.uid) + '.blog.hexun.com/p%s/default.html' % i
            # Crawl the next page, again posing as a browser
            yield Request(
                nexturl,
                callback=self.parse,
                headers={
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'
                })
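
A note on str(response.body), used throughout these examples: on Python 3 it returns the repr of the bytes object (a string starting with b'...'), and the regexes only keep working because everything they match is ASCII. Scrapy's response.text already decodes the body with the declared encoding and is the safer input. A small check (the byte string is an illustrative stand-in, not captured from the site):

import re

body = b"<a href='http://blog.hexun.com/p5/'>5</a>"  # stand-in for response.body

assert str(body).startswith("b")  # the repr, not the document text
assert re.findall(r"blog\.hexun\.com/p(\d+)/", body.decode("utf-8")) == ["5"]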
Example 6
    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
        item['url'] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
        # Use urllib and re to get each post's comment and hit counts
        # First, the regex that extracts the URL storing those counts
        # (the click & comment URL)
        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
        # hcurl is the URL that serves the comment and hit counts
        hcurl = re.compile(pat1).findall(str(response.body))[0]
        # Pose as a browser
        headers2 = ("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
        opener = urllib.request.build_opener()
        opener.addheaders = [headers2]
        # Install the opener globally
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(hcurl).read()
        # pat2 extracts a post's hit count
        pat2 = r"click\d*?','(\d*?)'"
        # pat3 extracts a post's comment count
        pat3 = r"comment\d*?','(\d*?)'"
        # Assign the counts to the item
        item['hits'] = re.compile(pat2).findall(str(data))
        item['comment'] = re.compile(pat3).findall(str(data))

        yield item

        # Extract the total number of post-list pages
        pat4 = 'blog.hexun.com/p(.*?)/'
        # The regex returns a list; the second-to-last element is the total page count
        data2 = re.compile(pat4).findall(str(response.body))
        if len(data2) >= 2:
            totalurl = data2[-2]
        else:
            totalurl = 1
        # print('totalurl' + str(totalurl))
        print(str(response.url))
        # Decide whether the pagination links still need to be generated

        # Loop over the remaining post-list pages
        global CHECK_NEXT_URL
        if CHECK_NEXT_URL == 1:
            for i in range(2, int(totalurl) + 1):
                CHECK_NEXT_URL = 0
                # Build the next URL: the following post-list page
                nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
                # Crawl the next page, posing as a browser (the posted header value
                # had a duplicated 'User-Agent:' prefix, removed here)
                yield Request(nexturl, callback=self.parse, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})
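
Example 6's CHECK_NEXT_URL module flag keeps every callback from re-emitting the pagination loop. Scrapy's scheduler already drops duplicate requests through its dupefilter, so the flag is largely redundant; a flag-free alternative is to fan out pagination only from the start page. A sketch, reusing the assumed total_pages helper from the note after Example 1:

from scrapy import Request

def parse(self, response):
    # ... item extraction as in Example 6 ...
    # Only the first page fans out pagination; Scrapy's dupefilter would
    # also drop any repeated nexturl requests on its own
    if response.url in self.start_urls:
        for i in range(2, total_pages(response.body) + 1):
            nexturl = "http://%s.blog.hexun.com/p%d/default.html" % (self.uid, i)
            yield Request(nexturl, callback=self.parse)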
Example 7
    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath(
            "//span[@class='ArticleTitleText']/a/text()").extract()
        item['url'] = response.xpath(
            "//span[@class='ArticleTitleText']/a/@href").extract()
        # Regex for the URL that serves the comment and hit counts
        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
        hcurl = re.compile(pat1).findall(str(response.body))[0]
        #print(hcurl)

        header2 = (
            "User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0"
        )
        opener = urllib.request.build_opener()
        opener.addheaders = [header2]
        # Install the opener globally
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(hcurl).read()
        # Regexes for the hit and comment counts
        pat2 = r"click\d*?','(\d*?)'"
        pat3 = r"comment\d*?','(\d*?)'"
        # Extract the counts and assign them
        item["hits"] = re.compile(pat2).findall(str(data))
        item["comment"] = re.compile(pat3).findall(str(data))
        yield item
        # Extract the total page count
        pat4 = "blog.hexun.com/p(.*?)/"
        data2 = re.compile(pat4).findall(str(response.body))
        if len(data2) >= 2:
            totalurl = data2[-2]
        else:
            totalurl = 1
        print("Total " + str(totalurl) + " pages")
        for i in range(2, int(totalurl) + 1):
            nexturl = "http://" + str(
                self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"

            yield Request(
                nexturl,
                callback=self.parse,
                headers={
                    'User-Agent':
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
                })
            print("当前爬取到:" + str(i) + "页")
Example 8
    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath(
            '//span[@class="ArticleTitleText"]/a/text()').extract()
        # print(item['name'])
        item['url'] = response.xpath(
            '//span[@class="ArticleTitleText"]/a/@href').extract()
        # print (item['url'])
        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"'  # regex for the hit/comment counts URL
        hcurl = re.compile(pat1).findall(str(response.body))[0]
        # hcurl = h.split('&')[0]
        # print(hcurl)
        header = (
            "User-agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
        )
        opener = urllib.request.build_opener()
        opener.addheaders = [header]
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(hcurl).read()
        # print(data)
        pat2 = "click\d*?','(\d*?)'"  #获取点击量
        pat3 = "comment\d*?','(\d*?)'"  #获取评论数
        item['hits'] = re.compile(pat2).findall(str(data))
        # print(item['hits'])
        item['comment'] = re.compile(pat3).findall(str(data))
        # print (item['comment'])
        yield item

        pat4 = "blog.hexun.com/p(.*?)/"
        data2 = re.compile(pat4).findall(str(response.body))
        if len(data2) >= 2:
            totalurl = data2[-2]
        else:
            totalurl = 1
        for i in range(2, int(totalurl) + 1):
            nexturl = "http://" + str(
                self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            yield Request(
                nexturl,
                callback=self.parse,
                headers={
                    "User-agent":
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
                })
Example 9
    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
        item['url'] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
        # Next, use urllib and re to get the comment and hit counts
        # First, the regex that extracts the URL storing those counts
        print('~~~~~~~~~~~~~~~~~')
        print(item['name'])
        print(item['url'])
        
        pat_click = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
        hintcurl = re.findall(pat_click, str(response.body))[0]

        headers2 = ("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
        opener = request.build_opener()
        opener.addheaders = [headers2]
        request.install_opener(opener)
        data = request.urlopen(hintcurl).read()

        pat_hits = "click\d*?','(\d*?)'"

        pat_comnum = "comment\d*?','(\d*?)'"

        item['hits'] = re.findall(pat_hits, str(data))
        item['comment'] = re.findall(pat_comnum, str(data))

        yield item

        pat_page_number = "blog.hexun.com/p(.*?)/"
        data2 = re.findall(pat_page_number, str(response.body))

        if len(data2) >= 2:
            totalurl = data2[-2]
        else:
            totalurl = 1
        print(str(totalurl))

        for i in range(2, int(totalurl) + 1):
            nexturl = "http://"+ str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            yield Request(nexturl, callback=self.parse, headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"})
Example 10
    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath(
            "//span[@class='ArticleTitleText']/a/text()").extract()
        item['url'] = response.xpath(
            "//span[@class='ArticleTitleText']/a/@href").extract()
        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"></script>'
        hcurl = re.compile(pat1).findall(str(response.body))[0]

        headers2 = (
            'User-Agent',
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
        )
        opener = urllib2.build_opener()
        opener.addheaders = [headers2]
        urllib2.install_opener(opener)
        data = urllib2.urlopen(hcurl).read()
        pat2 = "'click\d*?','(\d*?)'"
        pat3 = "'comment\d*?','(\d*?)'"
        item['hit'] = re.compile(pat2).findall(str(data))
        item['comment'] = re.compile(pat3).findall(str(data))
        yield item
        pat4 = "blog.hexun.com/p(.*?)/"
        data2 = re.compile(pat4).findall(str(response.body))
        if len(data2) >= 2:
            totalurl = data2[-2]
        else:
            totalurl = 1

        print u"一共" + str(totalurl) + u"页"
        for i in range(2, int(totalurl) + 1):
            nexturl = "http://" + str(
                self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            yield Request(
                nexturl,
                callback=self.parse,
                headers={
                    'User-Agent':
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
                })
Example 11
    def parse(self, response):
        item = HexunpjtItem()
        item["name"] = response.xpath(
            "//span[@class='ArticleTitleText']/a/text()").extract()
        item["url"] = response.xpath(
            "//span[@class='ArticleTitleText']/a/@href").extract()
        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'  # regex for the URL that stores the counts
        hcurl = re.compile(pat1).findall(str(response.body))[0]
        header2 = (
            "User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
        )  # a (header, value) tuple
        opener = urllib.request.build_opener()
        opener.addheaders = [header2]
        urllib.request.install_opener(opener)

        data = urllib.request.urlopen(hcurl).read()  # the counts payload
        pat2 = r"click\d*?','(\d*?)'"
        pat3 = r"comment\d*?','(\d*?)'"
        item["hits"] = re.compile(pat2).findall(str(data))
        item["comment"] = re.compile(pat3).findall(str(data))
        yield item

        pat4 = "blog.hexun.com/p(.*?)/"
        data2 = re.compile(pat4).findall(str(response.body))  # returns a list
        if len(data2) >= 2:
            totalurl = data2[-2]
        else:
            totalurl = 1  # fallback; data2[0] would raise IndexError on an empty list
        for i in range(2, int(totalurl) + 1):
            nexturl = "http://" + str(
                self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            yield Request(
                nexturl,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
                })
Example 12
 def parse(self, response):
     item = HexunpjtItem()
     item["name"] = response.xpath(
         "//span[@class='ArticleTitleText']/a/text()").extract()
     item["url"] = response.xpath(
         "//span[@class='ArticleTitleText']/a/@href").extract()
     pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*)">'
     hcurl = re.compile(pat1).findall(str(response.body))[0]
     headers2 = (
         "User-Agent",
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
     )
     opener = urllib2.build_opener()
     opener.addheaders = [headers2]
     urllib2.install_opener(opener)
     data = urllib2.urlopen(hcurl).read()
     pat2 = "click\d*? ', '(\d*? )'"
     pat3 = "comment\d*? ', '(\d*? )'"
     item["hits"] = re.compile(pat2).findall(str(data))
     item["comments"] = re.compile(pat3).findall(str(data))
     yield item
     pat4 = "blog.hexun.com/p(.*? )/"
     data2 = re.compile(pat4).findall(str(response.body))
     if len(data2) >= 2:
         totalurl = data2[-2]
     else:
         totalurl = 1
     for i in range(2, int(totalurl) + 1):
         nexturl = "http://" + str(
             self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
         yield Request(
             nexturl,
             callback=self.parse,
             headers={
                 'User-Agent':
                 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"
             })
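
As posted, Example 12's patterns contained stray spaces that could never match the payload (fixed above). A quick sanity check is to run the patterns against a fragment shaped the way the regexes across these examples imply; the sample string below is illustrative, not captured from the live site:

import re

sample = "('click116597112','123'),('comment116597112','4')"  # illustrative

assert re.findall(r"click\d*?','(\d*?)'", sample) == ['123']
assert re.findall(r"comment\d*?','(\d*?)'", sample) == ['4']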
Example 13
    def parse(self, response):
        # with open('14756002.html', 'wb') as htmlfile:
        #     htmlfile.write(response.text.encode('utf-8'))
        pages = response.css('.Article')
        js_url = response.css(
            '#DefaultContainer1_ArticleList_Panel1 script::attr(src)'
        ).extract_first()
        hitsandcomment = self.get_commnetandhits(js_url, response.url)
        for page in pages:
            # A fresh item per article, so earlier yields aren't mutated
            item = HexunpjtItem()
            item['title'] = page.css(
                '.ArticleTitleText a::text').extract_first()
            item['link'] = page.css(
                '.ArticleTitleText a::attr(href)').extract_first()
            item['word_count'] = page.css(
                '.ArticleWordCount::text').extract_first()
            hits_comment_id = page.css('.ArticleInfo span::attr(id)').extract()
            # e.g. ['click116597112', 'comment116597112']
            item['hits'] = hitsandcomment[hits_comment_id[0]]
            item['comment'] = hitsandcomment[hits_comment_id[1]]
            print(item['hits'], item['comment'])
            yield item

        next_links = response.css(
            '.PageSkip .PageSkip_1 a::attr(href)').extract()
        # extract()[-1] would raise IndexError on the last page; guard it
        next_page = next_links[-1] if next_links else None
        if next_page:
            print(next_page)
            #print(response.url)
            yield Request(
                next_page,
                callback=self.parse,
                headers={
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
                    'Referer': response.url
                })
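
Example 13 depends on a get_commnetandhits helper that is not shown. A minimal sketch of what it plausibly does, inferred from how its result is indexed by span ids and from the count payloads the earlier examples parse; the Referer header and the regex are assumptions:

import re
import urllib.request

def get_commnetandhits(js_url, referer):
    # Fetch the counts script and return a dict mapping span ids
    # ('click<id>' / 'comment<id>') to their counts
    req = urllib.request.Request(js_url, headers={
        'Referer': referer,
        'User-Agent': 'Mozilla/5.0',
    })
    data = urllib.request.urlopen(req).read().decode('utf-8', errors='ignore')
    return dict(re.findall(r"'((?:click|comment)\d+)','(\d+)'", data))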