class QbSpiderSpider(scrapy.Spider):
    name = 'qb_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        duanzidivs = response.xpath("//div[@class='col1 old-style-col1']/div")
        for duanzidiv in duanzidivs:
            # Create a fresh item per post; a shared class-level item would be
            # overwritten by concurrent requests before it is exported.
            item = QiushibaikeItem()
            item['author'] = duanzidiv.xpath(".//h2/text()").get().strip()
            # Handle posts whose text is truncated on the list page: grab the
            # link to the full post, request it, and let parse_all_content
            # finish the item.
            if duanzidiv.xpath(".//div[@class='content']/span[2]"):
                all_link = duanzidiv.xpath("./a/@href").get()
                content_url = "https://www.qiushibaike.com" + all_link
                yield scrapy.Request(url=content_url,
                                     callback=self.parse_all_content,
                                     meta={'item': item})
            else:
                content = "".join(
                    duanzidiv.xpath(
                        ".//div[@class='content']/span//text()").getall()).strip()
                item['content'] = content
                yield item
        # Crawl pages 2-20; alternatively, check whether the page has a next
        # link and follow it only if it exists.
        for i in range(2, 21):
            next_url = "https://www.qiushibaike.com/text/page/" + str(i) + "/"
            yield scrapy.Request(url=next_url, callback=self.parse)

    def parse_all_content(self, response):
        item = response.meta['item']
        all_content = response.xpath("//div[@class='content']//text()").getall()
        item['content'] = ''.join(all_content).strip()
        yield item
def get_page(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    # Reuse one parsed tree instead of parsing response.text twice.
    articles = soup.find_all('div', class_='article block untagged mb15')
    print('Now on page ' + str(response.meta['page']))
    for article in articles:
        item = QiushibaikeItem()
        # The div id looks like "qiushi_tag_123456"; slice off the prefix.
        id = article.get('id')[11:]
        name = article.find('div', class_='author clearfix').find('h2').get_text()
        content = article.find('div', class_='content').find('span').get_text()
        agreed_number = article.find('span', class_='stats-vote').find('i').get_text()
        item['page_number'] = str(response.meta['page'])
        item['id'] = id
        item['name'] = name
        item['content'] = content
        item['agreed_number'] = agreed_number
        yield item
def parse(self, response):
    print('\nstart {} ......\n'.format(response.url))
    content_left_div = response.xpath('//*[@id="content-left"]')
    content_list_div = content_left_div.xpath('./div')
    for content_div in content_list_div:
        try:
            item = QiushibaikeItem()
            try:
                item['author'] = content_div.xpath(
                    './div/a[2]/h2/text()').get().strip()
            except AttributeError:
                # Anonymous posts put the name under a span instead of a link.
                item['author'] = content_div.xpath(
                    './div/span[2]/h2/text()').get().strip()
            item['content'] = "".join(
                content_div.xpath(
                    './a[contains(@href, "article")]/div[@class="content"]/span/text()'
                ).getall()).strip().replace('\n', '')
            item['_id'] = content_div.attrib['id']
            yield item
        except Exception as e:
            print(response.url)
            print("item error", e.args)

    next_page = response.xpath(
        '//*[@id="content-left"]/ul/li[last()]/a/@href').get()
    if next_page:
        next_page = 'https://www.qiushibaike.com' + next_page
        yield scrapy.Request(url=next_page, callback=self.parse)
def parse(self, response):
    info_list = response.xpath('//*[@id="content-left"]/div')
    for info in info_list:
        item = QiushibaikeItem()
        username = info.xpath('.//div[1]/a[2]/h2/text()').extract_first()
        if username:
            item['username'] = username.replace('\n', '')
            item['avatar'] = response.urljoin(
                info.xpath('.//div[1]/a[1]/img/@src').extract_first())
            item['content'] = info.xpath(
                './/a[1]/div/span[1]/text()').extract_first().replace(
                    ' ', '').replace('\n', '')
            contentImage = info.xpath('.//div[2]/a/img/@src').extract_first()
            item['contentImage'] = response.urljoin(contentImage) if contentImage else ''
            item['nextPage'] = response.urljoin(
                response.xpath(
                    "//ul[@class='pagination']/li[last()]/a/@href").extract_first())
            yield item

    nextPage = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").extract_first()
    if nextPage:
        yield scrapy.Request(response.urljoin(nextPage), self.parse)
def parse(self, response):
    div_lists = response.xpath('//*[@id="content"]/div/div[2]/div')
    for div in div_lists:
        # extract_first() takes the first Selector and returns its string;
        # anonymous authors live under a span instead of a link, hence the
        # XPath union.
        author = div.xpath(
            './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()').extract_first()
        # extract() returns a list of strings; join them into one string.
        page_text = div.xpath('./a[1]/div/span/text()').extract()
        page_text = ''.join(page_text)
        # Pipeline-based persistence: build an item and hand it to the pipeline.
        item = QiushibaikeItem()
        item['author'] = author
        item['page_text'] = page_text
        yield item
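# Several of the spiders here end with "yield item" and hand the item to a
# pipeline. A minimal pipeline sketch for reference -- the class name and the
# 'duanzi.jl' output path are assumptions, not taken from any project above;
# enable it via ITEM_PIPELINES in settings.py.
import json

class QiushibaikePipeline:
    def open_spider(self, spider):
        # Assumed output path; JSON-lines keeps one item per line.
        self.file = open('duanzi.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item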
def parse_item(self, response):
    for sel in response.xpath('//div[@class="article block untagged mb15"]'):
        item = QiushibaikeItem()
        item['author'] = sel.xpath('.//h2/text()')[0].extract()
        item['duanzi'] = sel.xpath('div[@class="content"]/text()').extract()
        yield item
def parse_item(self, response):
    for sel in response.xpath('//div[@class="article block untagged mb15"]'):
        item = QiushibaikeItem()
        item['author'] = sel.xpath('.//h2/text()')[0].extract()
        item['duanzi'] = sel.xpath('div[@class="content"]/text()').extract()
        yield item
def parse_item(self, response):
    i = QiushibaikeItem()
    i['content'] = response.xpath('//div[@class="content"]/text()').extract()
    i['link'] = response.xpath('//link[@rel="canonical"]/@href').extract()
    print(i['content'])
    print(i['link'])
    print('')
    return i
def parse(self, response):
    # Locate the container node that holds the list of posts.
    content_left_node = response.xpath("//div[@id='content-left']")
    div_node_list = content_left_node.xpath("./div")
    crawl_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    for div_node in div_node_list:
        item = QiushibaikeItem()
        l = QItemLoader(item, selector=div_node)
        l.add_xpath(
            'name',
            ".//div[@class='author clearfix']/a[contains(@onclick,'web-list-author-text')]/h2/text()",
        )
        l.add_xpath('info', ".//div[@class='content']/span[1]//text()")
        l.add_value('crawl_date', crawl_date)
        data = l.load_item()
        print(data)
        yield data
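# The loader-based spider above assumes a QItemLoader defined elsewhere. A
# hypothetical sketch of such a loader -- the TakeFirst/MapCompose/Join
# processor choices are assumptions, not taken from the source project.
# (On older Scrapy versions these processors live in scrapy.loader.processors.)
from scrapy.loader import ItemLoader
from itemloaders.processors import Join, MapCompose, TakeFirst

class QItemLoader(ItemLoader):
    default_output_processor = TakeFirst()  # collapse single-value lists
    name_in = MapCompose(str.strip)         # strip whitespace from the author name
    info_in = MapCompose(str.strip)
    info_out = Join()                       # merge the content text nodes into one string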
def parse(self, response):
    content_left_div = response.xpath('//*[@id="content"]/div/div[2]')
    content_list_div = content_left_div.xpath('./div')
    for content_div in content_list_div:
        item = QiushibaikeItem()
        # No trailing commas here: they would wrap each value in a tuple.
        item['author'] = content_div.xpath('./div[1]/a[2]/h2/text()').get()
        item['content'] = content_div.xpath('./a[1]/div/span/text()').getall()
        item['_id'] = content_div.attrib['id']
        yield item

    # Pass the method itself, not self.parse(): calling it here would invoke
    # parse immediately instead of registering it as a callback.
    next_page = response.xpath(
        '//*[@id="content"]/div/div[2]/ul/li[8]/a/@href').get()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)
def parse(self, response):
    content_left_div = response.xpath('//*[@class="col1 old-style-col1"]')
    content_list_div = content_left_div.xpath('./div')
    for content_div in content_list_div:
        item = QiushibaikeItem()
        item['author'] = content_div.xpath('./div/a[2]/h2/text()').get()
        item['content'] = content_div.xpath('./a/div/span/text()').getall()
        item['_id'] = content_div.attrib['id']
        yield item

    try:
        next_page = response.xpath(
            '//*[@class="col1 old-style-col1"]/ul/li[last()]/a').attrib['href']
        yield response.follow(next_page, callback=self.parse)
    except KeyError:
        print("key error")
def parse(self, response):
    for qiushi in response.xpath(
            '//div[@class="content-block clearfix"]/div[@id="content-left"]/div'):
        # Create a fresh item per post instead of reusing one instance.
        item = QiushibaikeItem()
        a = [x.strip() for x in qiushi.xpath(
            './/div[@class="author clearfix"]/a[2]/h2/text()').extract()]
        b = [x.strip() for x in qiushi.xpath(
            './/a/div[@class="content"]/span/text()').extract()]
        c = [x.strip() for x in qiushi.xpath(
            './/div[@class="stats"]/span[1]//text()').getall()]
        d = [x.strip() for x in qiushi.xpath(
            './/div[@class="stats"]/span[2]/a//text()').getall()]
        # Compare with == (not "is") and fall back to an anonymous-user
        # label when no author node exists; don't shadow the str builtin.
        item['author'] = ''.join(a) if len(a) != 0 else 'Anonymous user'
        item['content'] = ''.join(b)
        item['fullName'] = ''.join(c)
        item['sum_comment'] = ''.join(d)
        yield item

    next_page = response.xpath(
        '//ul[@class="pagination"]/li[last()]/a/@href').extract()
    if next_page:
        next_page = 'https://www.qiushibaike.com' + next_page[0]
        yield Request(next_page, headers=self.headers, callback=self.parse,
                      dont_filter=True)
def parse(self, response):
    for item in response.xpath(
            '//div[@id="content-left"]/div[@class="article block untagged mb15"]'):
        qiubai = QiushibaikeItem()
        icon = item.xpath('./div[@class="author clearfix"]/a[1]/img/@src').extract()
        if icon:
            qiubai['userIcon'] = icon[0]
        userName = item.xpath(
            './div[@class="author clearfix"]/a[2]/h2/text()').extract()
        if userName:
            qiubai['userName'] = userName[0]
        content = item.xpath(
            './a[@class="contentHerf"]/div[@class="content"]/span/descendant::text()'
        ).extract()
        if content:
            qiubai['content'] = ''.join(content)
        like = item.xpath(
            './div[@class="stats"]/span[@class="stats-vote"]/i/text()').extract()
        if like:
            qiubai['like'] = like[0]
        comment = item.xpath(
            './div[@class="stats"]/span[@class="stats-comments"]/a/i/text()').extract()
        if comment:
            qiubai['comment'] = comment[0]
        yield qiubai
def parse(self, response):
    # Parse the author name and post content.
    div_list = response.xpath('//*[@id="content"]/div/div[2]/div')
    for div in div_list:
        # xpath returns a list whose elements are Selector objects;
        # extract_first() pulls the string stored in the first Selector.
        author = div.xpath('.//h2/text()').extract_first().strip()
        # Indexing the SelectorList and calling extract() gives the first
        # span's text directly, so no extra join is needed.
        content = div.xpath('./a/div/span/text()')[0].extract().strip()
        item = QiushibaikeItem()
        item['author'] = author
        item['content'] = content
        # Submit the item to the pipeline.
        yield item
def parse(self, response):
    content_left_div = response.xpath("//*[@id='content']/div/div[2]")
    content_list_div = content_left_div.xpath('./div')
    for content_div in content_list_div:
        item = QiushibaikeItem()
        # No trailing comma after get(): it would turn the author into a tuple.
        item['author'] = content_div.xpath('./div/a[2]/h2/text()').get()
        item['content'] = content_div.xpath('./a/div/span/text()').getall()
        item['_id'] = content_div.attrib['id']
        yield item

    next_page = response.xpath(
        '/html/body/div[1]/div/div[2]/ul/li[8]/a').attrib['href']
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)

# items.py: defines the fields we want to store.
# middlewares.py: middleware; hooks for things you want to do during the
#     crawl, e.g. acting on responses as they arrive.
# pipelines.py: defines how scraped items are persisted, e.g. connecting to
#     MySQL or MongoDB.
# settings.py: project configuration, e.g. default request headers.
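# Every spider in this collection assumes a QiushibaikeItem declared in
# items.py. A hypothetical sketch -- the field list below is inferred from
# the fields the spiders above assign and varies per project; declare only
# the fields your spider actually sets.
import scrapy

class QiushibaikeItem(scrapy.Item):
    _id = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()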
def parse(self, response):
    # SelectorList of post divs
    divs = response.xpath('//div[@class="col1 old-style-col1"]/div')
    for div in divs:
        author = div.xpath('.//h2/text()').get().strip()
        content = div.xpath('.//div[@class="content"]//text()').getall()
        content = ",".join(content).strip()
        item = QiushibaikeItem(author=author, content=content)
        yield item

    next_url = response.xpath(
        "//ul[@class='pagination']/li[last()]/a/@href").get()
    if not next_url:
        return
    yield scrapy.Request(self.base_domain + next_url, callback=self.parse)
def parse(self, response):
    content_left_div = response.xpath('//*[@class="col1 old-style-col1"]')
    content_list_div = content_left_div.xpath('./div')
    for content_div in content_list_div:
        item = QiushibaikeItem()
        item['author'] = content_div.xpath('./div/a[2]/h2/text()').get()
        item['content'] = content_div.xpath('./a/div/span/text()').getall()
        item['_id'] = content_div.attrib['id']
        yield item

    # Match the full class value; @class="old-style-col1" alone would miss
    # the "col1 old-style-col1" container selected above.
    next_page = response.xpath(
        '//*[@class="col1 old-style-col1"]/ul/li[last()]/a').attrib['href']
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)
def parse(self, response):
    # Use relative XPaths inside each post region; absolute paths would
    # re-query the whole page and repeat results.
    for info in response.xpath(
            '//div[contains(@class, "article block untagged mb15")]'):
        item = QiushibaikeItem()
        # Join the extracted text lists instead of str()-ing them and
        # regex-stripping the literal "\n" sequences out of the repr.
        content = ''.join(info.xpath('./a/div/span/text()').extract())
        author = ''.join(info.xpath('./div/a[2]/h2/text()').extract())
        item['content'] = content.replace('\n', '').strip()
        item['author'] = author.replace('\n', '').strip()
        item['fancy'] = info.xpath('./div/span/i/text()').extract()
        item['comment'] = info.xpath('./div/span/a/i/text()').extract()
        yield item

    next_page = response.xpath(
        '//span[@class="current"]/../following-sibling::li[1]/a/@href'
    ).extract_first()
    if next_page is not None:
        yield response.follow(next_page, headers=self.headers, callback=self.parse)
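# Two of the spiders above pass headers=self.headers on their requests. A
# hypothetical sketch of defining that attribute on the spider class -- the
# spider name and the User-Agent string are assumed placeholders.
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    # Sent with every request the parse methods above schedule.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Referer': 'https://www.qiushibaike.com/',
    }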