def parse(self, response):
    """Parse the listing page: yield one item per joke with author + content.

    :param response: scrapy Response for the listing page
    :return: yields QiubaiproItem objects consumed by the item pipeline
    """
    # Each joke sits in its own child <div> of the content-left container.
    div_list = response.xpath('//div[@id="content-left"]/div')
    for div in div_list:
        # xpath() returns Selector objects; extract_first() pulls the stored
        # string, returning None (instead of raising IndexError like
        # extract()[0]) when the node is missing.
        author = div.xpath('./div/a[2]/h2/text()').extract_first()
        content = div.xpath(
            './/div[@class="content"]/span/text()').extract_first()
        # Store the parsed fields on an item and submit it to the pipeline.
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        yield item
def parse(self, response):
    """Extract the author name and joke text from every entry on the page."""
    for entry in response.xpath('//div[@class="col1 old-style-col1"]/div'):
        author = entry.xpath('./div[1]/a[2]/h2/text()')[0].extract()
        # The joke may span several text nodes; glue them into one string.
        content = ''.join(entry.xpath('./a[1]/div/span//text()').extract())
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        # Hand the item off to the pipeline.
        yield item
def parse(self, response):
    """Extract author and content from each <li> entry in the listing."""
    entries = response.xpath('//*[@id="content"]/div/div[2]/div/ul/li')
    for entry in entries:
        item = QiubaiproItem()
        # extract_first() returns the first matched text (or None if absent).
        item['author'] = entry.xpath('./div/div/a/span/text()').extract_first()
        item['content'] = entry.xpath('./div/a/text()').extract_first()
        yield item
def parse(self, response):
    """Parse each entry: author (from the avatar's alt text) + joke content.

    Fix: dropped the unused ``all_data`` accumulator.
    """
    div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
    for div in div_list:
        # The author name is carried in the avatar image's alt attribute.
        author = div.xpath('./div[1]/a[1]/img/@alt').extract()[0]
        # Joke text may be split across text nodes; join into one string.
        content = ''.join(div.xpath('./a[1]/div/span//text()').extract())
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        # Submit the item to the pipeline.
        yield item
def parse(self, response):
    """Yield one item per joke with its author and full text."""
    for sel in response.xpath('//*[@id="content"]/div/div[2]/div'):
        # xpath() returns a list of Selector objects; [0].extract() pulls
        # the string stored in the first one's data attribute.
        author = sel.xpath('./div[1]/a[2]/h2/text()')[0].extract()
        # Calling extract() on the whole list yields every text node's string.
        fragments = sel.xpath('./a/div/span//text()').extract()
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = ''.join(fragments)
        # Pass the item on to the pipeline.
        yield item
def parse(self, response):
    """Parse author + content; anonymous posts keep the name under a <span>.

    Fix: extract_first() returns None when neither xpath branch matches,
    which previously crashed on the chained .strip(); default to '' instead.
    """
    div_list = response.xpath('//*[@id="content-left"]/div')
    for div in div_list:
        # Logged-in authors live under a[2]/h2, anonymous ones under span/h2.
        author = div.xpath(
            './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
        ).extract_first(default='').strip()
        # Join all text nodes of the joke, then trim surrounding whitespace.
        content = ''.join(
            div.xpath('./a/div/span[1]//text()').extract()).strip()
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        yield item
def parse(self, response):
    """Scrape author/content pairs and push them through the item pipeline."""
    # Every joke sits in its own child <div> of the content-left container.
    for block in response.xpath('//div[@id="content-left"]/div'):
        item = QiubaiproItem()
        # Step 1: stash the parsed fields on the item.
        item['author'] = block.xpath('./div/a[2]/h2/text()').extract_first()
        item['content'] = block.xpath(
            './/div[@class="content"]/span/text()').extract_first()
        # Step 2: hand the item to the pipeline.
        yield item
def parse(self, response):
    """Parse the author name + joke content of every entry on the page.

    Fix: dropped the unused ``all_data`` accumulator.
    """
    div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
    for div in div_list:
        # Two layouts: named authors under a[2]/h2, anonymous under
        # span[2]/h2; extract_first() returns the matching string (or None).
        author = div.xpath(
            './div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()'
        ).extract_first()
        # extract() on the list returns every matched text node's string.
        content = ''.join(div.xpath('./a[1]/div/span//text()').extract())
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        # Submit the item to the pipeline.
        yield item
def parse(self, response):
    """Parse the author name + joke content of every entry on the page.

    Fix: dropped the unused ``all_data`` accumulator.
    """
    div_list = response.xpath('//div[@id="content-left"]/div')
    for div in div_list:
        # xpath() returns Selector objects; extract_first() yields the stored
        # string. The two branches cover named vs anonymous authors.
        author = div.xpath(
            './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
        ).extract_first()
        content = ''.join(div.xpath('./a[1]/div/span//text()').extract())
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        # Submit the item to the pipeline.
        yield item
def parse(self, response):
    """Parse the author name + joke content of every entry on the page.

    Fix: dropped the unused ``all_data`` accumulator.
    """
    div_list = response.xpath('//*[@id="content"]/div/div[2]/div')
    for div in div_list:
        # extract_first() returns the first match's stored string (or None).
        author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
        # extract() on the list yields every matched text node's string.
        content = ''.join(div.xpath('./a[1]/div/span//text()').extract())
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        # Submit the item to the pipeline.
        yield item
def parse(self, response):
    """Collect every joke's author and full text from the page."""
    for entry in response.xpath('//div[@class="col1 old-style-col1"]/div'):
        # Named and anonymous posts keep the author in different nodes;
        # extract_first() returns whichever branch matched (or None).
        author = entry.xpath(
            './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
        ).extract_first()
        # Merge the individual text nodes of the joke into one string.
        text = "".join(entry.xpath('./a[1]/div/span//text()').extract())
        # Wrap the fields in an item and forward it to the pipeline.
        item = QiubaiproItem()
        item['author'] = author
        item['text'] = text
        yield item
def parse(self, response):
    """Parse author name + joke content and persist them via the pipeline."""
    entries = response.xpath('//*[@id="content"]/div/div[2]/div')
    for entry in entries:
        # extract_first() returns the stored string of the first match.
        author = entry.xpath('./div[1]/a[2]/h2/text()').extract_first()
        # Note: only direct span text nodes here (no descendant //text()).
        pieces = entry.xpath('./a[1]/div/span/text()').extract()
        joined = ''.join(pieces)
        # Package the parsed data into an item-typed object.
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = joined
        # Submit the item to the pipeline for persistent storage.
        yield item
def parse(self, response):
    """Parse one listing page, then manually schedule the next page.

    Fixes: compare against None with ``is`` (not ``==``) and drop the
    unused ``all_data`` accumulator.
    """
    div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
    for div in div_list:
        # extract_first() returns None when the author node is absent
        # (anonymous posts), so substitute a placeholder name.
        author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
        if author is None:
            author = "匿名用户"
        content = ''.join(
            div.xpath('./a[1]/div/span[1]//text()').extract())
        # Item fields behave like dict entries.
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        yield item
    # Manual pagination: issue the next request ourselves and reuse this
    # parser as the callback until page 11 has been fetched.
    if self.page_num <= 11:
        new_url = self.url + str(self.page_num)
        self.page_num += 1
        yield scrapy.Request(url=new_url, callback=self.parse)
def parse(self, response):
    """Yield author/content items; whitespace is trimmed from both fields."""
    container = response.xpath(
        '//div[contains(@class,"col1 old-style-col1")]/div')
    for entry in container:
        # Two possible author layouts — take whichever branch matched first.
        author_sel = entry.xpath(
            './div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')[0]
        # //text() also picks up text nested inside child tags.
        content_sel = entry.xpath('./a[1]/div/span//text()')
        item = QiubaiproItem()
        item['author'] = author_sel.extract().strip()
        item['content'] = "".join(
            content_sel.extract()).replace(" ", "").strip()
        yield item
def parse(self, response):
    """Incremental crawl: only yield records whose fingerprint is new."""
    for entry in response.xpath('//div[@id="content-left"]/div'):
        author = entry.xpath(
            './div/a[2]/h2/text() | ./div/span[2]/h2/text()').extract_first()
        content = ''.join(entry.xpath('./a/div/span//text()').extract())
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        # Data fingerprint: a unique id derived from the record itself.
        fingerprint = hashlib.sha256((author + content).encode()).hexdigest()
        # sadd returns 1 only when the fingerprint was not already in the
        # set, i.e. this record has not been crawled before.
        if self.conn.sadd('hash_keys', fingerprint) == 1:
            print('有新数据更新......')
            yield item
        else:
            print('无数据更新!')
def parse(self, response):
    """Walk down to the joke list in stages, then parse each entry."""
    # Narrow the selection step by step instead of one long xpath.
    page_body = response.xpath('//div[@id="content"]')
    listing = page_body.xpath('./div[1]/div[2]')
    for entry in listing.xpath('./div'):
        # Author sits under a[2]/h2 for named users, span/h2 for anonymous;
        # [0].extract() pulls the string stored in the first match.
        author = entry.xpath(
            './div[1]/a[2]/h2/text() | ./div[1]/span/h2/text()'
        )[0].extract()
        # Pull every text node of the joke and merge into one string.
        content = ''.join(entry.xpath('./a[1]/div/span//text()').extract())
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        yield item
def parse(self, response):
    """Parse author names and joke contents, then feed the pipeline."""
    rows = response.xpath('//*[@class="content-block clearfix"]/div[2]/div')
    for row in rows:
        # extract() returns a list of strings; join also handles the
        # no-match (empty list) case gracefully.
        author = ''.join(row.xpath('./div[1]/a[2]/h2/text()').extract())
        content = ''.join(row.xpath(
            './a[1]/div/span//text()|./div[1]/span/h2/text()').extract())
        item = QiubaiproItem()
        item['author'] = author
        item['content'] = content
        # Forward the item to the pipeline.
        yield item