def parse_university(self, response):
    """Parse one university wiki page into a UniversityItem and yield it."""
    # `response` attributes are read-only; rebuild it with a cleaned body
    # (the project-level `filter` helper strips unwanted characters).
    # NOTE(review): a one-argument call to the *builtin* filter would raise
    # TypeError — this relies on a project-local `filter` being in scope.
    response = response.replace(body=filter(response.body))

    # Anchor on the content container once so the later XPaths stay short.
    content = response.xpath('//div[@id="wikiContent"]')[0]
    item = UniversityItem(
        name=content.xpath('./h1[@class="wikiTitle"]/text()').get(),
        rank=response.meta['rank'])

    # Infobox table: td[1] holds the labels, td[2] the value cells.
    labels = content.xpath(
        './div[@class="infobox"]/table//tr/td[1]/p/text()').extract()
    texts = []
    for cell in content.xpath('./div[@class="infobox"]/table//tr/td[2]'):
        # A value cell may contain several <p> tags; join all of its text.
        texts.append(','.join(cell.xpath('.//text()').extract()))
    table = dict(zip(labels, texts))

    # Copy the scraped label/value pairs onto the item, defaulting to ''.
    for field, label in (('country', u'国家'),
                         ('state', u'州省'),
                         ('city', u'城市'),
                         ('undergraduate_num', u'本科生人数'),
                         ('postgraduate_num', u'研究生人数'),
                         ('website', u'网址')):
        item[field] = table.get(label, '')

    self.logger.info(u'%s scraped' % item['name'])
    yield item
def parse_university(self, response):
    """Parse a university detail page and yield a populated UniversityItem.

    Expects ``response.meta['rank']`` to carry the ranking passed on by the
    listing-page callback.
    """
    # The response body is read-only; rebuild the response with tabs removed
    # so the extracted text is clean.
    response = response.replace(body=response.text.replace("\t", ""))
    self.logger.info(response.url)
    item = UniversityItem(
        # BUG FIX: extract() returned a one-element *list*; extract_first()
        # yields the name string itself (or None if the node is missing).
        name=response.xpath("//*[@id='wikiContent']/h1/text()").extract_first(),
        rank=response.meta["rank"],
    )
    infobox = response.xpath("//div[@class='infobox']")[0]
    # Left column: field labels; right column: the value cells.
    keys = infobox.xpath("./table//tr/td[1]/p/text()").extract()
    cols = infobox.xpath("./table//tr/td[2]")
    # BUG FIX: extract_first() kept only the first text node of each cell and
    # crashed ("".join(None)) on empty cells; join *all* text nodes so cells
    # holding multiple <p> tags are captured.
    values = ["".join(col.xpath(".//text()").extract()) for col in cols]
    data = dict(zip(keys, values))
    item["country"] = data.get("国家", "")
    item["state"] = data.get("州省", "")
    item["city"] = data.get("城市", "")
    item["undergraduate_num"] = data.get("本科生人数", "")
    item["postgraduate_num"] = data.get("研究生人数", "")
    item["website"] = data.get("网址", "")
    # Typo fix in the log message ("scrapy" -> "scraped").
    self.logger.info("item %s scraped" % item["name"])
    yield item
def parse_university(self, response):
    """Parse a university detail page and yield a populated UniversityItem."""
    # The response body is read-only; rebuild it with tab and CRLF noise
    # stripped before extraction.
    response = response.replace(
        body=response.text.replace('\t', '').replace('\r\n', ''))
    item = UniversityItem()
    data = {}
    # BUG FIX: indexing the SelectorList stored a Selector object in the
    # item; extract_first() returns the actual title string (or None).
    item['name'] = response.xpath(
        '//div[@id="wikiContent"]/h1/text()').extract_first()
    table = response.xpath(
        '//div[@id="wikiContent"]/div[@class="infobox"]/table')
    if table:
        table = table[0]
        keys = table.xpath('.//td[1]/p/text()').extract()
        cols = table.xpath('.//td[2]')
        # BUG FIX: ' '.join(col...extract_first()) joined the *characters*
        # of the first text node (producing "a b c ..."); join every text
        # node of the cell so multi-<p> cells are handled correctly.
        values = [
            ' '.join(col.xpath('.//text()').extract()) for col in cols
        ]
        # Only trust the pairing when both columns line up row-for-row.
        if len(keys) == len(values):
            data.update(zip(keys, values))
    # The yielded item is handed to the framework for the next processing
    # step; with no pipeline configured it is simply printed to the console.
    item['rank'] = data.get('排名')
    item['country'] = data.get('国家')
    # BUG FIX: key was misspelled '周省'; the page label is '州省', so the
    # state lookup could never match.
    item['state'] = data.get('州省')
    item['city'] = data.get('城市')
    item['undergraduate_num'] = data.get('本科生人数')
    item['postgraduate_num'] = data.get('研究生人数')
    item['website'] = data.get('网址')
    yield item
def parse_university(self, response):
    """Extract one university's details and yield them as a UniversityItem."""
    # Response objects are immutable; replace() builds a new one whose body
    # has been passed through the project's `filter` cleanup helper.
    # NOTE(review): relies on a project-local `filter`; the builtin would
    # raise TypeError when called with a single argument.
    response = response.replace(body=filter(response.body))

    wiki_content = response.xpath('//div[@id="wikiContent"]')[0]
    item = UniversityItem(
        name=wiki_content.xpath('./h1[@class="wikiTitle"]/text()').get(),
        rank=response.meta['rank'])

    # Narrow to the infobox table, then read labels and value cells from it.
    infobox = wiki_content.xpath('./div[@class="infobox"]/table')
    labels = infobox.xpath('.//tr/td[1]/p/text()').extract()
    value_cells = infobox.xpath('.//tr/td[2]')
    # Each value cell may span several <p> tags; merge all of its text nodes.
    mapping = dict(zip(
        labels,
        (','.join(cell.xpath('.//text()').extract()) for cell in value_cells)))

    item['country'] = mapping.get(u'国家', '')
    item['state'] = mapping.get(u'州省', '')
    item['city'] = mapping.get(u'城市', '')
    item['undergraduate_num'] = mapping.get(u'本科生人数', '')
    item['postgraduate_num'] = mapping.get(u'研究生人数', '')
    item['website'] = mapping.get(u'网址', '')

    self.logger.info(u'%s scraped' % item['name'])
    yield item
def parse_university(self, response):
    """Parse a university detail page and yield a populated UniversityItem."""
    # The response body is read-only; rebuild the response with tabs removed.
    response = response.replace(body=response.text.replace('\t', ''))
    self.logger.info(response.url)
    item = UniversityItem(
        name=response.xpath(
            '//*[@id="wikiContent"]/h1/text()').extract_first(),
        # NOTE(review): every sibling parser reads meta['rank']; confirm the
        # originating Request really stores the ranking under 'class'.
        rank=response.meta['class'])
    wiki_content = response.xpath('//div[@class="infobox"]')[0]
    # BUG FIX: the keys XPath went through /table/tbody/tr while the cols
    # XPath used /table//tr. <tbody> is typically inserted by browsers and
    # absent from the raw HTML Scrapy receives, so keys came back empty and
    # every field stayed ''. Use the same tbody-agnostic form for both.
    keys = wiki_content.xpath('./table//tr/td[1]/p/text()').extract()
    cols = wiki_content.xpath('./table//tr/td[2]')
    # A value cell may hold several <p> tags; join all of its text nodes.
    values = [''.join(col.xpath('.//text()').extract()) for col in cols]
    data = dict(zip(keys, values))
    item['country'] = data.get('国家', '')
    item['state'] = data.get('州省', '')
    item['city'] = data.get('城市', '')
    item['undergraduate_num'] = data.get('本科生人数', '')
    item['postgraduate_num'] = data.get('研究生人数', '')
    item['website'] = data.get('网址', '')
    self.logger.info('item %s scraped' % item["name"])
    yield item
def parse_university(self, response):
    """Scrape one university page into a UniversityItem and yield it."""
    # replace() builds a fresh response (bodies are read-only) with the tab
    # characters stripped out of the HTML.
    response = response.replace(body=response.text.replace('\t', ''))
    self.logger.info(response.url)

    item = UniversityItem(
        name=response.xpath(
            "//*[@id='wikiContent']/h1/text()").extract_first(),
        rank=response.meta['rank'],
    )

    infobox = response.xpath("//div[@class='infobox']")[0]
    # Pair each label (left column) with the merged text of its value cell
    # (right column); a cell can contain more than one <p> tag.
    field_labels = infobox.xpath("./table//tr/td[1]/p/text()").extract()
    cell_texts = [
        ''.join(cell.xpath('.//text()').extract())
        for cell in infobox.xpath("./table//tr/td[2]")
    ]
    lookup = dict(zip(field_labels, cell_texts))

    for key, label in [('country', '国家'),
                       ('state', '州省'),
                       ('city', '城市'),
                       ('undergraduate_num', '本科生人数'),
                       ('postgraduate_num', '研究生人数'),
                       ('website', '网址')]:
        item[key] = lookup.get(label, '')

    self.logger.info('item %s scraped' % item['name'])
    yield item
def parse_university(self, response):
    """Parse a university detail page and yield a populated UniversityItem."""
    # The response body is read-only; rebuild it with a cleaned body.
    # NOTE(review): a one-argument call to the *builtin* filter raises
    # TypeError — this relies on a project-local `filter` helper in scope.
    response = response.replace(body=filter(response.text))
    self.logger.info(response.url)
    wiki = response.xpath('//div[@id="wikiContent"]')[0]
    item = UniversityItem(
        rank=response.meta['rank'],
        name=response.xpath(
            '//div[@id="wikiContent"]/h1/text()').extract_first())
    # BUG FIX: both XPaths stepped through an explicit /tbody/. <tbody> is
    # usually a browser-inserted element that is absent from the raw HTML
    # Scrapy receives, which makes the selectors match nothing; use the
    # descendant axis (//tr) instead, which works either way.
    keys = wiki.xpath(
        './div[@class="infobox"]/table//tr/td[1]/p/text()').extract()
    cols = wiki.xpath('./div[@class="infobox"]/table//tr/td[2]')
    # Join every text node of a cell: a cell can hold several <p> tags.
    values = [','.join(col.xpath('.//text()').extract()) for col in cols]
    data = dict(zip(keys, values))
    item['country'] = data.get('国家', '')
    item['state'] = data.get('州省', '')
    item['city'] = data.get('城市', '')
    item['undergraduate_num'] = data.get('本科生人数', '')
    item['postgraduate_num'] = data.get('研究生人数', '')
    item['website'] = data.get('网址', '')
    # Log before yielding — a statement after `yield` only runs when the
    # generator is resumed. The leftover debug print(info) was removed.
    self.logger.info('item %s scraped' % item['name'])
    yield item