Example #1
0
    def parse_data(self, response):
        """Yield one AqiItem per data row of the table on the page.

        The first ``<tr>`` is discarded (treated as the table header).
        Every field holds the list returned by ``extract()``, which is
        empty when the cell has no matching text node.

        :param response: response whose body contains the data table
        :return: generator of AqiItem objects
        """
        # (item field, XPath relative to the row), in column order.
        columns = (
            ('date', './td[1]/text()'),
            ('aqi', './td[2]/text()'),
            ('level', './td[3]/span/text()'),
            ('PM2_5', './td[4]/text()'),
            ('PM10', './td[5]/text()'),
            ('SO2', './td[6]/text()'),
            ('CO', './td[7]/text()'),
            ('NO2', './td[8]/text()'),
            ('O3', './td[9]/text()'),
        )

        # All table rows; drop the first one (header).
        tr_list = response.xpath('//tr')
        tr_list.pop(0)

        for tr in tr_list:
            # Build a fresh item per row: re-using one instance would make
            # every yielded item alias the same, later-overwritten object.
            item = AqiItem()
            for field, path in columns:
                # The PM10 column previously called the misspelled
                # `.extractt()`, which raised AttributeError on the first row.
                item[field] = tr.xpath(path).extract()
            yield item
Example #2
0
    def parse_day(self, response):
        """Yield one AqiItem per data row of a daily-data page.

        The city name is the slice ``[8:-11]`` of the text of the element
        with id ``title``. The first ``<tr>`` is discarded (table header).

        :param response: response for the daily-data page
        :return: generator of AqiItem objects
        """
        title = response.xpath('//*[@id="title"]/text()').extract_first()
        city_name = title[8:-11]

        # (item field, XPath relative to the row), in column order.
        columns = (
            ('date', './td[1]/text()'),
            ('aqi', './td[2]/text()'),
            ('level', './td[3]//text()'),
            ('pm2_5', './td[4]/text()'),
            ('pm10', './td[5]/text()'),
            ('so_2', './td[6]/text()'),
            ('co', './td[7]/text()'),
            ('no_2', './td[8]/text()'),
            ('o_3', './td[9]/text()'),
        )

        # All rows, minus the header row.
        tr_list = response.xpath('//tr')
        tr_list.pop(0)

        for tr in tr_list:
            # A new item per row; previously one shared instance was mutated
            # and re-yielded, so anything holding on to an earlier item (a
            # deferred pipeline, a collecting exporter) saw later rows' data.
            item = AqiItem()
            item['city_name'] = city_name
            for field, path in columns:
                item[field] = tr.xpath(path).extract_first()

            # Hand the item over to the engine / pipelines.
            yield item
Example #3
0
    def parse_day(self, response):
        """Yield one AqiItem for every ``<tr>`` found in the response.

        No row is skipped. Each item carries the city, the response URL and
        the current timestamp, followed by the text of the nine cells.
        """
        rows = response.xpath('//tr')

        # The city is whatever precedes the first '2' in the panel heading
        # (presumably the start of the year -- confirm against the page).
        heading = response.xpath(
            '//div[@class="panel-heading"]/h3/text()').extract_first()
        city = heading.split('2')[0]

        # (item field, XPath relative to the row), in column order.
        cell_paths = (
            ('date', './td[1]/text()'),
            ('AQI', './td[2]/text()'),
            ('LEVEL', './td[3]/span/text()'),
            ('PM2_5', './td[4]/text()'),
            ('PM10', './td[5]/text()'),
            ('SO2', './td[6]/text()'),
            ('CO', './td[7]/text()'),
            ('NO2', './td[8]/text()'),
            ('O3', './td[9]/text()'),
        )

        for row in rows:
            # Fresh container per row, fixed fields first.
            item = AqiItem()
            item['city'] = city
            item['url'] = response.url
            item['timestamp'] = time.time()

            # Then the per-cell data.
            for field, path in cell_paths:
                item[field] = row.xpath(path).extract_first()

            # Return the item to the engine.
            yield item
Example #4
0
    def parse_day(self, response):
        """Parse the per-day data rows of a page into AqiItem objects.

        The city name is the slice ``[8:-11]`` of the text of the element
        with id ``title``. The first ``<tr>`` is discarded before iterating.

        :param response: response for the daily-data page
        :return: generator of AqiItem objects, one per remaining row
        """
        title = response.xpath('//*[@id="title"]/text()').extract_first()
        city_name = title[8:-11]

        # (item field, XPath relative to the row), in column order.
        columns = (
            ('date', './td[1]/text()'),
            ('aqi', './td[2]/text()'),
            ('level', './td[3]//text()'),
            ('pm2_5', './td[4]/text()'),
            ('pm_10', './td[5]/text()'),
            ('so_2', './td[6]/text()'),
            ('co', './td[7]/text()'),
            ('no_2', './td[8]/text()'),
            ('o_3', './td[9]/text()'),
        )

        tr_list = response.xpath('//tr')
        tr_list.pop(0)
        for tr in tr_list:
            # Create the item inside the loop: the single shared instance
            # used before was overwritten on every iteration, so all yielded
            # items were the same object.
            item = AqiItem()
            item['city_name'] = city_name
            for field, path in columns:
                item[field] = tr.xpath(path).extract_first()

            yield item
Example #5
0
    def parse_day(self, response):
        """Yield an AqiItem for each row of a city's per-day table.

        The city is read from the response URL: the text between the first
        "=" and the last "&", URL-unquoted and then decoded as UTF-8.
        """
        page_url = response.url
        start = page_url.find("=") + 1
        end = page_url.rfind("&")
        # Turn the URL-encoded string back into a UTF-8 byte string.
        city = urllib.unquote(page_url[start:end])

        rows = response.xpath("//div[@class='row']//tr")
        # Discard the first row (presumably the table header).
        rows.pop(0)

        # (item field, XPath relative to the row), in column order.
        cell_paths = (
            ('date', "./td[1]/text()"),
            ('aqi', "./td[2]/text()"),
            ('level', "./td[3]/span/text()"),
            ('pm2_5', "./td[4]/text()"),
            ('pm10', "./td[5]/text()"),
            ('so2', "./td[6]/text()"),
            ('co', "./td[7]/text()"),
            ('no2', "./td[8]/text()"),
            ('o3', "./td[9]/text()"),
        )

        for row in rows:
            item = AqiItem()
            item['city'] = city.decode("utf-8")
            for field, path in cell_paths:
                item[field] = row.xpath(path).extract_first()

            yield item
Example #6
0
    def parse(self, response):
        """Schedule a ``parse_month`` request for the selected city link.

        Only the slice ``[2:3]`` of the extracted names and links is used,
        so at most one request is produced. The partially filled item is
        handed to the callback through ``meta['aqi']``.
        """
        names = response.xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()'
        ).extract()
        links = response.xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/@href'
        ).extract()

        for name, link in zip(names[2:3], links[2:3]):
            item = AqiItem()
            item['city_name'] = name

            # Absolute URL of the city's month listing, requested next.
            yield scrapy.Request(
                self.base_url + link,
                callback=self.parse_month,
                meta={'aqi': item},
            )
Example #7
0
 def parse(self, response):
     """Request the month-data page for the first three city links.

     The city stored on the item is the part of the built URL from
     character 55 onwards; the item travels to ``parse_monthdata`` through
     ``meta['item']``.
     """
     # hrefs of the city list, limited to the first three.
     hrefs = response.xpath('//div[@class="all"]//ul//a/@href')[:3]
     for href in hrefs:
         url = 'https://www.aqistudy.cn/historydata/' + href.extract()
         item = AqiItem()
         item['city'] = url[55:]
         yield scrapy.Request(
             url,
             callback=self.parse_monthdata,
             meta={'item': item},
         )
Example #8
0
 def parse(self, response):
     """Schedule one ``month_parse`` request per city link on the page.

     Each request carries its own AqiItem, pre-filled with ``city_name``,
     in ``meta['aqi_item']``.

     :param response: response for the city index page
     :return: generator of scrapy.Request objects
     """
     city_name_list = response.xpath(
         '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()'
     ).extract()
     city_link_list = response.xpath(
         '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/@href'
     ).extract()
     for city_name, city_link in zip(city_name_list, city_link_list):
         # One item per request. Requests are handled asynchronously, so a
         # single item shared by all of them would already hold the last
         # city's name by the time any callback ran.
         item = AqiItem()
         item["city_name"] = city_name
         month_url = self.base_url + city_link
         yield scrapy.Request(url=month_url,
                              meta={"aqi_item": item},
                              callback=self.month_parse)
Example #9
0
 def parse(self, response):
     """Yield an AqiItem holding the name and absolute URL of each city.

     Names and links are read from the anchors under the ``bottom`` block
     and paired positionally.
     """
     names = response.xpath(
         '//div[@class="bottom"]/ul/div[2]/li/a/text()').extract()
     links = response.xpath(
         '//div[@class="bottom"]/ul/div[2]/li/a/@href').extract()

     for name, link in zip(names, links):
         item = AqiItem()
         item['city_name'] = name
         item['city_url'] = self.base_url + link
         yield item
Example #10
0
    def parse(self, response):
        """Schedule a ``parse_month`` request for the selected city link.

        Only the slice ``[36:37]`` of the extracted names and links is used,
        so at most one request is produced. The item is passed to the
        callback through ``meta['citykey']``.
        """
        names = response.xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()'
        ).extract()
        links = response.xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/@href'
        ).extract()

        for name, link in zip(names[36:37], links[36:37]):
            item = AqiItem()
            item['city_name'] = name
            yield scrapy.Request(
                self.base_url + link,
                meta={'citykey': item},
                callback=self.parse_month,
            )
 def parse_day(self, response):
     """Yield one AqiItem per ``<tbody>`` row after the first.

     The city comes from ``response.meta['city']``; the other fields are
     the first text node of the corresponding table cell.
     """
     city_name = response.meta['city']

     rows = response.xpath('//tbody/tr')
     # Discard the first row (presumably the table header).
     rows.pop(0)

     # (item field, XPath relative to the row), in column order.
     cell_paths = (
         ('date', './td[1]/text()'),
         ('aqi', './td[2]/text()'),
         ('level', './td[3]/span/text()'),
         ('pm2_5', './td[4]/text()'),
         ('pm10', './td[5]/text()'),
         ('so2', './td[6]/text()'),
         ('co', './td[7]/text()'),
         ('no2', './td[8]/text()'),
         ('o3', './td[9]/text()'),
     )

     for row in rows:
         item = AqiItem()
         item['city'] = city_name
         for field, path in cell_paths:
             item[field] = row.xpath(path).extract_first()
         yield item
Example #12
0
File: aqi.py Project: KenZQ/python
    def parse_day(self, response):
        """Yield one AqiItem per data row of a daily-data page.

        The first ``<tr>`` is discarded. The city is the slice ``[8:-11]``
        of the ``h2#title`` text. Every other field is the first text node
        of its cell, or None when the cell has none.

        :param response: response for the daily-data page
        :return: generator of AqiItem objects
        """
        node_list = response.xpath("//tr")
        node_list.pop(0)
        title = response.xpath("//h2[@id='title']/text()").extract()[0]
        # Loop-invariant, so slice the title once.
        city = title[8:-11]

        # (item field, XPath relative to the row), in column order.
        columns = (
            ('date', "./td[1]/text()"),
            ('aqi', "./td[2]/text()"),
            ('level', "./td[3]/span/text()"),
            ('pm2_5', "./td[4]/text()"),
            ('pm10', "./td[5]/text()"),
            ('so2', "./td[6]/text()"),
            ('co', "./td[7]/text()"),
            ('no2', "./td[8]/text()"),
            ('o3', "./td[9]/text()"),
        )

        for node in node_list:
            item = AqiItem()
            item['city'] = city
            for field, path in columns:
                # Store the extracted text. Previously the raw SelectorList
                # returned by xpath() was stored, which is not the cell value
                # and cannot be serialised by the usual exporters.
                item[field] = node.xpath(path).extract_first()

            yield item
Example #13
0
    def parse_day(self, response):
        """Yield one AqiItem per ``<tbody>`` row after the first.

        The city name is the part of the ``h2#title`` text that precedes
        the substring u'空气'.
        """
        heading = response.xpath("//h2[@id='title']/text()").extract_first()
        cut = heading.find(u'空气')
        city_name = heading[:cut]

        rows = response.xpath('//tbody/tr')
        # Discard the first row (presumably the table header).
        rows.pop(0)

        # (item field, XPath relative to the row), in column order.
        cell_paths = (
            ('date', './td[1]/text()'),
            ('aqi', './td[2]/text()'),
            ('level', './td[3]/span/text()'),
            ('pm2_5', './td[4]/text()'),
            ('pm10', './td[5]/text()'),
            ('so2', './td[6]/text()'),
            ('co', './td[7]/text()'),
            ('no2', './td[8]/text()'),
            ('o3', './td[9]/text()'),
        )

        for row in rows:
            item = AqiItem()
            item['city'] = city_name
            for field, path in cell_paths:
                item[field] = row.xpath(path).extract_first()
            yield item
Example #14
0
    def parse_day(self, response):
        """Yield one AqiItem per ``<tr>`` after the first.

        The city comes from ``response.meta["city_name"]``. Each cell value
        is taken as element 0 of ``extract()``, so a cell without a matching
        text node raises IndexError.
        """
        rows = response.xpath("//tr")
        rows.pop(0)

        # extract_first() returns the first element itself (not a list);
        # extract() returns the whole list.

        # (item field, XPath relative to the row), in column order.
        cell_paths = (
            ('date', "./td[1]/text()"),
            ('aqi', "./td[2]/text()"),
            ('level', "./td[3]/span/text()"),
            ('pm2_5', "./td[4]/text()"),
            ('pm10', "./td[5]/text()"),
            ('so2', "./td[6]/text()"),
            ('co', "./td[7]/text()"),
            ('no2', "./td[8]/text()"),
            ('o3', "./td[9]/text()"),
        )

        for row in rows:
            item = AqiItem()
            item['city'] = response.meta["city_name"]
            for field, path in cell_paths:
                item[field] = row.xpath(path).extract()[0]

            yield item
Example #15
0
    def parse_day(self, response):
        """Yield one AqiItem per table-body row after the first.

        Returns without yielding when the page has no matching rows. The
        city comes from ``response.meta['city']``; each other field is the
        first text node found anywhere inside its cell.
        """
        rows = response.xpath("//div[@class='row']//tbody/tr")

        # Nothing to parse on a page without rows.
        if len(rows) == 0:
            return

        rows.pop(0)

        # (item field, XPath relative to the row), in column order.
        cell_paths = (
            ("date", "./td[1]//text()"),
            ("aqi", "./td[2]//text()"),
            ("level", "./td[3]//text()"),
            ("pm2_5", "./td[4]//text()"),
            ("pm10", "./td[5]//text()"),
            ("so2", "./td[6]//text()"),
            ("co", "./td[7]//text()"),
            ("no2", "./td[8]//text()"),
            ("o3", "./td[9]//text()"),
        )

        for row in rows:
            item = AqiItem()
            item['city'] = response.meta['city']
            for field, path in cell_paths:
                item[field] = row.xpath(path).extract_first()
            yield item
Example #16
0
    def parse_day(self, response):
        """Yield one AqiItem for every ``<tr>`` inside the ``row`` div.

        No row is removed, so the first row is processed like the others.
        The city comes from ``response.meta["name"]``.
        """
        rows = response.xpath("//div[@class='row']//tr")

        # (item field, XPath relative to the row), in column order.
        cell_paths = (
            ('date', "./td[1]/text()"),
            ('aqi', "./td[2]/text()"),
            ('level', "./td[3]/span/text()"),
            ('pm2_5', "./td[4]/text()"),
            ('pm10', "./td[5]/text()"),
            ('so2', "./td[6]/text()"),
            ('co', "./td[7]/text()"),
            ('no2', "./td[8]/text()"),
            ('o3', "./td[9]/text()"),
        )

        for row in rows:
            item = AqiItem()
            item['city'] = response.meta["name"]
            for field, path in cell_paths:
                item[field] = row.xpath(path).extract_first()

            yield item
Example #17
0
    def parse_day(self, response):
        """Yield one AqiItem per ``<tbody>`` row after the first.

        Prints the response body length up front and each item's city just
        before yielding it. The city comes from ``response.meta['city']``.
        """
        print(len(response.body))

        city_name = response.meta['city']
        rows = response.xpath("//tbody/tr")
        # Discard the first row (presumably the table header).
        rows.pop(0)

        # (item field, XPath relative to the row), in column order.
        cell_paths = (
            ('date', "./td[1]/text()"),
            ('aqi', "./td[2]/text()"),
            ('level', "./td[3]/span/text()"),
            ('pm2_5', "./td[4]/text()"),
            ('pm10', "./td[5]/text()"),
            ('so2', "./td[6]/text()"),
            ('co', "./td[7]/text()"),
            ('no2', "./td[8]/text()"),
            ('o3', "./td[9]/text()"),
        )

        for row in rows:
            item = AqiItem()
            item['city'] = city_name
            for field, path in cell_paths:
                item[field] = row.xpath(path).extract_first()
            print(item["city"])

            yield item