def parse(self, response):
    item = HexunpjtItem()
    item['name'] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
    item['url'] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
    # The hit and comment counts have to be matched with regular expressions
    pat_hits = r'click(.*?)<'
    pat_comment = r'comment(.*?)<'
    filter_first1 = re.compile(pat_hits).findall(str(response.body))
    filter_first2 = re.compile(pat_comment).findall(str(response.body))
    pat_hits = r'>(\d*?)<'
    pat_comment = r'>(\d*?)<'
    item['hits'] = re.compile(pat_hits).findall(str(filter_first1))
    item['comment'] = re.compile(pat_comment).findall(str(filter_first2))
    yield item
    # Get the total number of pages
    pat = r'blog.hexun.com/p(\d*?)/'
    data = re.compile(pat).findall(str(response.body))  # ---> list
    # if len(data) >= 2:
    #     totalurl = data[-2]
    # else:
    #     totalurl = 1
    # print('Total ' + str(totalurl) + ' pages')
    # Hard-coded to the first 10 pages; the commented-out block above computes the real total
    for i in range(2, 10):
        # Build the next url to crawl
        nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
        yield Request(
            nexturl,
            callback=self.parse,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64;x64,rv:61.0) Gecko/20100101 Firefox/61.0'})
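All of these parse() variants instantiate HexunpjtItem without showing its definition. A minimal sketch of the items.py they appear to assume — the field names are collected from the snippets themselves, and no single variant uses them all, so trim it to match whichever parse() you adopt:

# Sketch of the assumed items.py; keep only the fields your parse() assigns.
import scrapy

class HexunpjtItem(scrapy.Item):
    name = scrapy.Field()        # post title(s)
    url = scrapy.Field()         # post link(s)
    hits = scrapy.Field()        # view counts (some variants use 'hit')
    hit = scrapy.Field()
    comment = scrapy.Field()     # comment counts (one variant uses 'comments')
    comments = scrapy.Field()
    # Fields used only by the CSS-selector variant at the end:
    title = scrapy.Field()
    link = scrapy.Field()
    word_count = scrapy.Field()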
def parse(self, response):
    articles = response.xpath("//div[@class='Article']")
    for article in articles:
        # Create a fresh item per article and select relative to it
        item = HexunpjtItem()
        item['name'] = article.xpath(".//span[@class='ArticleTitleText']/a/text()").extract()
        print(item['name'])
        yield item
def parse(self, response): item = HexunpjtItem() item['name'] = response.xpath(".//span[@class='ArticleTitleText']/a/text()").extract() item['url'] = response.xpath(".//span[@class='ArticleTitleText']/a/@href").extract() # 获取class='ArticleInfo'的所有子节点 clicks = response.xpath(".//div[@class='ArticleInfo']") # 设置空的点击数和评论数列表 hit = [] comment = [] for click in clicks: # 获取文章的点击Id click_id = click.xpath("./span/@id").extract() # 用replace去掉不需要click字符,直接获取id click_id = click_id[0].strip('click') hcurl = "http://click.tool.hexun.com/linkclick.aspx?blogid=19020056&articleids=" + click_id # print(hcurl) r = requests.get(hcurl,headers=headers).text # print(r) # par2是点击数的正则 par3是评论数的正则 par2 = r"click\d*?','(\d*?)'" par3 = r"comment\d*?','(\d*?)'" hit.append(re.compile(par2).findall(str(r))) comment.append(re.compile(par3).findall(str(r))) item['hit'] = hit item['comment'] = comment # page为该个人微博的总页数 page = response.xpath("//div[@class='PageSkip_1']/a[5]/text()").extract() for i in range(2,int(page[0])+1): next_url = "http://fjrs168.blog.hexun.com/p"+str(i)+"/default.html" print(next_url) yield Request(next_url, callback=self.parse, headers=headers) yield item
def parse(self, response): item = HexunpjtItem() item['name'] = response.xpath( "//span[@class='ArticleTitleText']/a/text()").extract() item["url"] = response.xpath( "//span[@class='ArticleTitleText']/a/@href").extract() #接下来需要使用urllib和re模块获取博文的评论数和阅读数 #首先提取存储评论数和点击数网址的正则表达式 pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">' #hcurl为存储评论数和点击数的网址 hcurl = re.compile(pat1).findall(str(response.body))[0] # 模拟成浏览器 headers2 = ( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0" ) opener = urllib.request.build_opener() opener.addheaders = [headers2] # 将opener安装为全局 urllib.request.install_opener(opener) #data为对应博客列表页的所有博文的点击数与评论数数据 data = urllib.request.urlopen(hcurl).read() #pat2为提取文章阅读数的正则表达式 pat2 = "click\d*?','(\d*?)'" #pat3为提取文章评论数的正则表达式 pat3 = "comment\d*?','(\d*?)'" #提取阅读数和评论数数据并分别赋值给item下的hits和comment item["hits"] = re.compile(pat2).findall(str(data)) item["comment"] = re.compile(pat3).findall(str(data)) yield item #提取博文列表页的总页数 pat4 = "blog.hexun.com/p(.*?)/" #通过正则表达式获取到的数据为一个列表,倒数第二个元素为总页数 data2 = re.compile(pat4).findall(str(response.body)) if (len(data2) >= 2): totalurl = data2[-2] else: totalurl = 1 #在实际运行中,下一行print的代码可以注释掉,在调试过程中,可以开启下一行print的代码 #print("一共"+str(totalurl)+"页") #进入for循环,依次爬取各博文列表页的博文数据 for i in range(2, int(totalurl) + 1): #构造下一次要爬取的url,爬取一下页博文列表页中的数据 nexturl = "http://" + str( self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html" #进行下一次爬取,下一次爬取仍然模拟成浏览器进行 yield Request( nexturl, callback=self.parse, headers={ 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0" })
def parse(self, response):
    item = HexunpjtItem()
    item['name'] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
    item['url'] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
    # content = response.xpath('//div[@class="ArticleSubstanceText"]/text()')
    # Next, use the urllib and re modules to get each post's comment and view counts
    # First, the regex that extracts the url storing the comment and click counts
    pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
    # hcur1 is the url that stores the comment and click counts
    hcur1 = re.compile(pat1).findall(str(response.body))[0]
    # Pose as a browser
    headers2 = ('User-Agent',
                'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers2]
    # Install the opener globally
    urllib.request.install_opener(opener)
    # data holds the click and comment counts for every post on this list page
    data = urllib.request.urlopen(hcur1).read()
    # pat2 is the regex that extracts a post's view count
    pat2 = r"click\d*?','(\d*?)'"
    # pat3 is the regex that extracts a post's comment count
    pat3 = r"comment\d*?','(\d*?)'"
    # Extract the view and comment counts and assign them
    item['hits'] = re.compile(pat2).findall(str(data))
    item['comment'] = re.compile(pat3).findall(str(data))
    yield item
    # Extract the total number of post list pages
    pat4 = 'blog.hexun.com/p(.*?)/'
    # The regex returns a list; the second-to-last element is the total page count
    data2 = re.compile(pat4).findall(str(response.body))
    if len(data2) >= 2:
        totalurl = int(data2[-2])
    else:
        totalurl = 1
    for i in range(2, totalurl + 1):
        # Build the next url to crawl
        nexturl = 'http://' + str(self.uid) + '.blog.hexun.com/p%s/default.html' % i
        # Perform the next crawl, again posing as a browser
        yield Request(
            nexturl,
            callback=self.parse,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'})
def parse(self, response):
    item = HexunpjtItem()
    item['name'] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
    item['url'] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
    # Use the urllib and re modules to get each post's comment and view counts
    # First, the regex that extracts the url storing the comment and click counts
    # url for click & comment
    pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
    # hcurl is the url that stores the comment and click counts
    hcurl = re.compile(pat1).findall(str(response.body))[0]
    # Pose as a browser
    headers2 = ("User-Agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers2]
    # Install the opener globally
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(hcurl).read()
    # pat2 is the regex for the view count
    pat2 = r"click\d*?','(\d*?)'"
    # pat3 is the regex for the comment count
    pat3 = r"comment\d*?','(\d*?)'"
    # Assign to the item
    item['hits'] = re.compile(pat2).findall(str(data))
    item['comment'] = re.compile(pat3).findall(str(data))
    yield item
    # Extract the total number of post list pages
    pat4 = 'blog.hexun.com/p(.*?)/'
    # The regex returns a list; the second-to-last element is the total page count
    data2 = re.compile(pat4).findall(str(response.body))
    if len(data2) >= 2:
        totalurl = data2[-2]
    else:
        totalurl = 1
    # print('totalurl' + str(totalurl))
    print(str(response.url))
    # Decide whether the follow-up links still need to be generated;
    # the for loop crawls the post data of each list page in turn
    global CHECK_NEXT_URL
    if CHECK_NEXT_URL == 1:
        for i in range(2, int(totalurl) + 1):
            CHECK_NEXT_URL = 0
            # Build the url of the next post list page to crawl
            nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            # Perform the next crawl, posing as a browser
            yield Request(
                nexturl,
                callback=self.parse,
                headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})
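The variant above gates pagination on a module-level CHECK_NEXT_URL flag that it never declares. A plausible sketch of the missing declaration, assuming the flag only exists to ensure the pagination requests are generated once:

# Module-level flag assumed by the variant above: 1 until the first list
# page has been parsed; the pagination loop sets it to 0 so follow-up
# pages do not re-generate the same requests.
CHECK_NEXT_URL = 1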
def parse(self, response): item = HexunpjtItem() item['name'] = response.xpath( "//span[@class='ArticleTitleText']/a/text()").extract() item['url'] = response.xpath( "//span[@class='ArticleTitleText']/a/@href").extract() #评论数、点击数网址正则表达式 pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">' hcurl = re.compile(pat1).findall(str(response.body))[0] #print(hcurl) header2 = ( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0" ) opener = urllib.request.build_opener() opener.addheaders = [header2] #opener安装为全局 urllib.request.install_opener(opener) data = urllib.request.urlopen(hcurl).read() #阅读数与评论数正则表达式 pat2 = "click\d*?','(\d*?)'" pat3 = "comment\d*?','(\d*?)'" #提取阅读数、评论数并赋值 item["hits"] = re.compile(pat2).findall(str(data)) item["comment"] = re.compile(pat3).findall(str(data)) yield item #提取总页数 pat4 = "blog.hexun.com/p(.*?)/" data2 = re.compile(pat4).findall(str(response.body)) if (len(data2) >= 2): totalurl = data2[-2] else: totalurl = 1 print("一共" + str(totalurl) + "页") for i in range(2, int(totalurl) + 1): nexturl = "http://" + str( self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html" yield Request( nexturl, callback=self.parse, headers={ 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36" }) print("当前爬取到:" + str(i) + "页")
def parse(self, response):
    item = HexunpjtItem()
    item['name'] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
    # print(item['name'])
    item['url'] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
    # print(item['url'])
    # Grab the url that carries the click and comment counts
    pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"'
    hcurl = re.compile(pat1).findall(str(response.body))[0]
    # hcurl = h.split('&')[0]
    # print(hcurl)
    header = ("User-agent",
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [header]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(hcurl).read()
    # print(data)
    pat2 = r"click\d*?','(\d*?)'"      # get the click count
    pat3 = r"comment\d*?','(\d*?)'"    # get the comment count
    item['hits'] = re.compile(pat2).findall(str(data))
    # print(item['hits'])
    item['comment'] = re.compile(pat3).findall(str(data))
    # print(item['comment'])
    yield item
    pat4 = r"blog.hexun.com/p(.*?)/"
    data2 = re.compile(pat4).findall(str(response.body))
    if len(data2) >= 2:
        totalurl = data2[-2]
    else:
        totalurl = 1
    for i in range(2, int(totalurl) + 1):
        nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
        yield Request(
            nexturl,
            callback=self.parse,
            headers={"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"})
def parse(self, response):
    item = HexunpjtItem()
    item['name'] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
    item['url'] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
    # Next, use urllib and re to get the comment and view counts
    # First, the regex that extracts the url storing the comment and click counts
    print('~~~~~~~~~~~~~~~~~')
    print(item['name'])
    print(item['url'])
    pat_click = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
    hintcurl = re.findall(pat_click, str(response.body))[0]
    headers2 = ("User-Agent",
                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36")
    # `request` here is urllib.request (imported as `from urllib import request`)
    opener = request.build_opener()
    opener.addheaders = [headers2]
    request.install_opener(opener)
    data = request.urlopen(hintcurl).read()
    pat_hits = r"click\d*?','(\d*?)'"
    pat_comnum = r"comment\d*?','(\d*?)'"
    item['hits'] = re.findall(pat_hits, str(data))
    item['comment'] = re.findall(pat_comnum, str(data))
    yield item
    pat_page_number = r"blog.hexun.com/p(.*?)/"
    data2 = re.findall(pat_page_number, str(response.body))
    if len(data2) >= 2:
        totalurl = data2[-2]
    else:
        totalurl = 1
    print(str(totalurl))
    for i in range(2, int(totalurl) + 1):
        nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
        yield Request(
            nexturl,
            callback=self.parse,
            headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"})
def parse(self, response): item = HexunpjtItem() item['name'] = response.xpath( "//span[@class='ArticleTitleText']/a/text()").extract() item['url'] = response.xpath( "//span[@class='ArticleTitleText']/a/@href").extract() pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"></script>' hcurl = re.compile(pat1).findall(str(response.body))[0] headers2 = ( 'User-Agent', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36" ) opener = urllib2.build_opener() opener.addheaders = [headers2] urllib2.install_opener(opener) data = urllib2.urlopen(hcurl).read() pat2 = "'click\d*?','(\d*?)'" pat3 = "'comment\d*?','(\d*?)'" item['hit'] = re.compile(pat2).findall(str(data)) item['comment'] = re.compile(pat3).findall(str(data)) yield item pat4 = "blog.hexun.com/p(.*?)/" data2 = re.compile(pat4).findall(str(response.body)) if len(data2) >= 2: totalurl = data2[-2] else: totalurl = 1 print u"一共" + str(totalurl) + u"页" for i in range(2, int(totalurl) + 1): nexturl = "http://" + str( self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html" yield Request( nexturl, callback=self.parse, headers={ 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36" })
def parse(self, response): item = HexunpjtItem() item["name"] = response.xpath( "//span[@class='ArticleTitleText']/a/text()").extract() item["url"] = response.xpath( "//span[@class='ArticleTitleText']/a/@href").extract() pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">' #提取存储参数的网址 hcurl = re.compile(pat1).findall(str(response.body))[0] header2 = ( "User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36" ) #元组 opener = urllib.request.build_opener() opener.addheaders = [header2] urllib.request.install_opener(opener) data = urllib.request.urlopen(hcurl).read() #数据 pat2 = "click\d*?','(\d*?)'" pat3 = "comment\d*?','(\d*?)'" item["hits"] = re.compile(pat2).findall(str(data)) item["comment"] = re.compile(pat3).findall(str(data)) yield item pat4 = "blog.hexun.com/p(.*?)/" data2 = re.compile(pat4).findall(str(response.body)) #获得一个列表 if (len(data2) >= 2): totalurl = data2[-2] else: totalurl = data2[0] for i in range(2, int(totalurl) + 1): nexturl = "http://" + str( self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html" yield Request( nexturl, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36" })
def parse(self, response): item = HexunpjtItem() item["name"] = response.xpath( "//span[@class='ArticleTitleText']/a/text()").extract() item["url"] = response.xpath( "//span[@class='ArticleTitleText']/a/@href").extract() pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*)">' hcurl = re.compile(pat1).findall(str(response.body))[0] headers2 = ( "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0" ) opener = urllib2.build_opener() opener.addheaders = [headers2] urllib2.install_opener(opener) data = urllib2.urlopen(hcurl).read() pat2 = "click\d*? ', '(\d*? )'" pat3 = "comment\d*? ', '(\d*? )'" item["hits"] = re.compile(pat2).findall(str(data)) item["comments"] = re.compile(pat3).findall(str(data)) yield item pat4 = "blog.hexun.com/p(.*? )/" data2 = re.compile(pat4).findall(str(response.body)) if (len(data2) >= 2): totalurl = data2[-2] else: totalurl = 1 for i in range(2, int(totalurl) + 1): nexturl = "http://" + str( self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html" yield Request( nexturl, callback=self.parse, headers={ 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0" })
def parse(self, response):
    # with open('14756002.html', 'wb') as htmlfile:
    #     htmlfile.write(response.text.encode('utf-8'))
    pages = response.css('.Article')
    js_url = response.css(
        '#DefaultContainer1_ArticleList_Panel1 script::attr(src)').extract_first()
    hitsandcomment = self.get_commnetandhits(js_url, response.url)
    for page in pages:
        # Create a fresh item per article rather than mutating a shared one
        item = HexunpjtItem()
        item['title'] = page.css('.ArticleTitleText a::text').extract_first()
        item['link'] = page.css('.ArticleTitleText a::attr(href)').extract_first()
        item['word_count'] = page.css('.ArticleWordCount::text').extract_first()
        # Result looks like: ['click116597112', 'comment116597112']
        hits_comment_id = page.css('.ArticleInfo span::attr(id)').extract()
        item['hits'] = hitsandcomment[hits_comment_id[0]]
        item['comment'] = hitsandcomment[hits_comment_id[1]]
        print(item['hits'], item['comment'])
        yield item
    # Guard against an empty selector result before taking the last link
    next_pages = response.css('.PageSkip .PageSkip_1 a::attr(href)').extract()
    next_page = next_pages[-1] if next_pages else None
    if next_page:
        print(next_page)
        # print(response.url)
        yield Request(
            next_page,
            callback=self.parse,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
                'Referer': response.url
            })
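The CSS-selector variant above calls a self.get_commnetandhits helper that is not shown. A minimal sketch of what it plausibly does, assuming the counter script body carries 'click<id>','<n>' / 'comment<id>','<n>' pairs like the regexes in the other variants expect; the requests dependency and header values are assumptions:

# Hypothetical reconstruction of the missing helper: fetch the counter
# script and build a dict mapping 'click<id>'/'comment<id>' -> count string.
import re
import requests

def get_commnetandhits(self, js_url, referer):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 '
                      '(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Referer': referer,
    }
    body = requests.get(js_url, headers=headers).text
    # The script body carries pairs like ...'click116597112','888'...
    return dict(re.findall(r"'((?:click|comment)\d+)','(\d+)'", body))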