Code Example #1
 def parse_university(self, response):
     # filter() here is a project-defined helper that strips special characters
     # from the page. Response attributes are read-only, so replace() is used
     # to build a new Response object with the cleaned body.
     response = response.replace(body=filter(response.body))
     # Select the parent node once so the XPath expressions below stay short.
     wiki_content = response.xpath('//div[@id="wikiContent"]')[0]
     # Build the Item with its name and rank fields set.
     item = UniversityItem(
         name=wiki_content.xpath('./h1[@class="wikiTitle"]/text()').get(),
         rank=response.meta['rank'])
     # Extract the text of the left-hand column of the table.
     keys = wiki_content.xpath(
         './div[@class="infobox"]/table//tr/td[1]/p/text()').extract()
     # Extract the right-hand column cells as a list of nodes.
     cols = wiki_content.xpath('./div[@class="infobox"]/table//tr/td[2]')
     # Join all text inside each right-hand cell; this handles cells that
     # contain more than one <p> tag.
     values = [','.join(col.xpath('.//text()').extract()) for col in cols]
     # Zip the left and right columns into a dict.
     data = dict(zip(keys, values))
     # Copy the values from the dict into the corresponding item fields.
     item['country'] = data.get(u'国家', '')
     item['state'] = data.get(u'州省', '')
     item['city'] = data.get(u'城市', '')
     item['undergraduate_num'] = data.get(u'本科生人数', '')
     item['postgraduate_num'] = data.get(u'研究生人数', '')
     item['website'] = data.get(u'网址', '')
     # Log through the spider's built-in logger.
     self.logger.info(u'%s scraped' % item['name'])
     yield item
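
Both Example #1 and Example #6 call filter() with a single argument, so it is not the Python builtin; it is evidently a project-defined helper that strips special characters from the page before the body is re-parsed. A minimal sketch of such a helper, assuming (based on the inline cleanup done in the other examples) that it removes tabs and CR/LF pairs:

    def filter(body):
        # Project-style helper that shadows the builtin filter(), as called
        # in the examples above. Accepts bytes (response.body) or str
        # (response.text) and strips characters that break the markup.
        if isinstance(body, bytes):
            body = body.decode('utf-8')
        return body.replace('\t', '').replace('\r\n', '')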
Code Example #2
    def parse_university(self, response):

        response = response.replace(body=response.text.replace("\t", ""))

        self.logger.info(response.url)
        item = UniversityItem(
            name=response.xpath("//*[@id='wikiContent']/h1/text()").extract_first(),
            rank=response.meta["rank"],
        )
        infobox = response.xpath("//div[@class='infobox']")[0]
        keys = infobox.xpath("./table//tr/td[1]/p/text()").extract()
        cols = infobox.xpath("./table//tr/td[2]")
        values = ["".join(col.xpath(".//text()").extract_first()) for col in cols]
        data = dict(zip(keys, values))
        # print(data)
        item["country"] = data.get("国家", "")
        item["state"] = data.get("州省", "")
        item["city"] = data.get("城市", "")
        item["undergraduate_num"] = data.get("本科生人数", "")
        item["postgraduate_num"] = data.get("研究生人数", "")
        item["website"] = data.get("网址", "")

        self.logger.info("item %s scrapy" % item["name"])

        yield item
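
All of the examples fill the same eight fields, so the UniversityItem they share presumably looks something like this sketch (inferred from the field names used above, not taken from any of the projects):

    import scrapy

    class UniversityItem(scrapy.Item):
        # One Field per assignment made in the parse_university examples.
        name = scrapy.Field()
        rank = scrapy.Field()
        country = scrapy.Field()
        state = scrapy.Field()
        city = scrapy.Field()
        undergraduate_num = scrapy.Field()
        postgraduate_num = scrapy.Field()
        website = scrapy.Field()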
Code Example #3
File: usnews.py Project: Rulesbreaker/py-practice
 def parse_university(self, response):
     # Strip tabs and CR/LF pairs, then rebuild the Response with the
     # cleaned body (Response attributes are read-only).
     response = response.replace(
         body=response.text.replace('\t', '').replace('\r\n', ''))
     item = UniversityItem()
     data = {}
     item['name'] = response.xpath(
         '//div[@id="wikiContent"]/h1/text()').extract_first()
     print(item['name'])
     table = response.xpath(
         '//div[@id="wikiContent"]/div[@class="infobox"]/table')
     if table:
         table = table[0]
         keys = table.xpath('.//td[1]/p/text()').extract()
         cols = table.xpath('.//td[2]')
         # Join every text node in each right-hand cell. extract() returns a
         # list of strings; extract_first() returns a single string, which
         # ' '.join() would split into individual characters.
         values = [
             ' '.join(col.xpath('.//text()').extract())
             for col in cols
         ]
         if len(keys) == len(values):
             data.update(zip(keys, values))
     # Items yielded here are handed to the framework for further processing;
     # with no pipeline configured they are simply printed to the console.
     item['rank'] = data.get('排名')
     item['country'] = data.get('国家')
     item['state'] = data.get('州省')
     item['city'] = data.get('城市')
     item['undergraduate_num'] = data.get('本科生人数')
     item['postgraduate_num'] = data.get('研究生人数')
     item['website'] = data.get('网址')
     yield item
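
The extract()/extract_first() distinction fixed above is easy to get wrong: extract() returns a list of all matching text nodes, while extract_first() returns a single string, and ' '.join() over a single string inserts a space between every character. A standalone Selector makes the difference visible:

    from scrapy.selector import Selector

    sel = Selector(text='<td><p>New</p><p>York</p></td>')
    texts = sel.xpath('//td//text()')
    print(' '.join(texts.extract()))        # 'New York' -- joins a list of strings
    print(' '.join(texts.extract_first()))  # 'N e w'    -- splits one string into characters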
Code Example #4
File: u2.py Project: colorfulComeMonochrome/qianmu
 def parse_university(self, response):
     response = response.replace(body=response.text.replace('\t', ''))
     self.logger.info(response.url)
     item = UniversityItem(
         name=response.xpath(
             '//*[@id="wikiContent"]/h1/text()').extract_first(),
         rank=response.meta['class'])
     wiki_content = response.xpath('//div[@class="infobox"]')[0]
     # Both XPaths skip the tbody step: browsers insert <tbody> when
     # rendering, but the raw HTML the crawler downloads usually lacks it,
     # so the original './table/tbody/tr/...' for keys matched nothing while
     # the cols XPath below did match, leaving the zip() empty-handed.
     keys = wiki_content.xpath('./table//tr/td[1]/p/text()').extract()
     cols = wiki_content.xpath('./table//tr/td[2]')
     values = [''.join(col.xpath('.//text()').extract()) for col in cols]
     data = dict(zip(keys, values))
     item['country'] = data.get('国家', '')
     item['state'] = data.get('州省', '')
     item['city'] = data.get('城市', '')
     item['undergraduate_num'] = data.get('本科生人数', '')
     item['postgraduate_num'] = data.get('研究生人数', '')
     item['website'] = data.get('网址', '')
     self.logger.info('item %s scraped' % item["name"])
     yield item
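
The tbody pitfall noted in the comment above can be reproduced with a standalone Selector: browsers insert <tbody> when rendering a table, but a server's raw HTML often omits it, so an XPath copied from devtools silently matches nothing (hypothetical markup):

    from scrapy.selector import Selector

    sel = Selector(text='<table><tr><td><p>国家</p></td><td><p>美国</p></td></tr></table>')
    print(sel.xpath('//table/tbody/tr/td[1]/p/text()').extract())  # [] -- no tbody in the source
    print(sel.xpath('//table//tr/td[1]/p/text()').extract())       # ['国家']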
Code Example #5
 def parse_university(self, response):
     response = response.replace(body=response.text.replace('\t', ''))
     self.logger.info(response.url)
     item = UniversityItem(
         name=response.xpath("//*[@id='wikiContent']/h1/text()").extract_first(),
         rank=response.meta['rank'],
     )
     wiki_content = response.xpath("//div[@class='infobox']")[0]
     keys = wiki_content.xpath("./table//tr/td[1]/p/text()").extract()
     cols = wiki_content.xpath("./table//tr/td[2]")
     values = [''.join(col.xpath('.//text()').extract()) for col in cols]
     data = dict(zip(keys, values))
     item['country'] = data.get('国家', '')
     item['state'] = data.get('州省', '')
     item['city'] = data.get('城市', '')
     item['undergraduate_num'] = data.get('本科生人数', '')
     item['postgraduate_num'] = data.get('研究生人数', '')
     item['website'] = data.get('网址', '')
     self.logger.info('item %s scraped' % item['name'])
     yield item
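
Every example reads a rank out of response.meta, which only works because the callback that discovered the link stored it on the Request. A minimal sketch of that requesting side; the listing-page structure and selectors here are assumptions:

    def parse(self, response):
        # Hypothetical ranking list: each row links to a university page and
        # carries its rank, which is forwarded to the detail callback.
        for row in response.xpath('//table[@id="rank"]//tr'):
            link = row.xpath('.//a/@href').extract_first()
            rank = row.xpath('./td[1]/text()').extract_first()
            if link:
                yield response.follow(link, callback=self.parse_university,
                                      meta={'rank': rank})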
Code Example #6
File: university.py Project: Bestlzh/learngit
 def parse_university(self, response):
     # filter() is a project-defined helper (not the builtin) that strips
     # special characters from the page text before re-parsing.
     response = response.replace(body=filter(response.text))
     self.logger.info(response.url)
     wiki = response.xpath('//div[@id="wikiContent"]')[0]
     item = UniversityItem(
         rank=response.meta['rank'],
         name=response.xpath(
             '//div[@id="wikiContent"]/h1/text()').extract_first())
     # The tbody step is dropped from these XPaths: the raw HTML typically
     # has no <tbody> element, so './table/tbody/tr/...' would match nothing.
     keys = wiki.xpath(
         './div[@class="infobox"]/table//tr/td[1]/p/text()').extract()
     cols = wiki.xpath('./div[@class="infobox"]/table//tr/td[2]')
     values = [','.join(col.xpath('.//text()').extract()) for col in cols]
     data = dict(zip(keys, values))
     item['country'] = data.get('国家', '')
     item['state'] = data.get('州省', '')
     item['city'] = data.get('城市', '')
     item['undergraduate_num'] = data.get('本科生人数', '')
     item['postgraduate_num'] = data.get('研究生人数', '')
     item['website'] = data.get('网址', '')
     self.logger.info('item %s scraped' % item['name'])
     yield item
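
As the comment in Example #3 notes, yielded items are handed to the framework; to actually process them, an item pipeline has to be enabled. A minimal sketch of one (the class and module names are assumptions):

    class UniversityPipeline:
        # Receives every UniversityItem the spider yields.
        def process_item(self, item, spider):
            spider.logger.info('pipeline received %s', item['name'])
            return item  # pass the item on to the next enabled pipeline

    # Enabled in settings.py, e.g.:
    # ITEM_PIPELINES = {'myproject.pipelines.UniversityPipeline': 300}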