Example 1
    def parse_city_day(self, response):
        """
        Parse the daily air-quality records for one city.
        :param response: response for a city's history page
        :return: yields one AirQualityItem per table row
        """
        # Extract the city name from the URL query string
        url = response.url
        city_url_name = url[url.find('=') + 1:url.find('&')]

        # city_url_name is percent-encoded; decode it back to the
        # Chinese city name (requires `from urllib import parse`)
        city_name = parse.unquote(city_url_name)

        # Select every row of the daily-record table
        day_record_list = response.xpath(
            '//table[@class="table table-condensed '
            'table-bordered table-striped table-hover '
            'table-responsive"]//tr')
        for i, day_record in enumerate(day_record_list):
            if i == 0:
                # Skip the header row
                continue
            td_list = day_record.xpath('.//td')

            # Create a fresh item per row; reusing one instance would let
            # later rows overwrite items already passed to the pipelines
            item = AirQualityItem()
            item['city_name'] = city_name
            item['record_date'] = td_list[0].xpath(
                'text()').extract_first()  # monitoring date
            item['aqi_val'] = td_list[1].xpath('text()').extract_first()  # AQI
            item['range_val'] = td_list[2].xpath(
                'text()').extract_first()  # AQI range
            item['quality_level'] = td_list[3].xpath(
                './/div/text()').extract_first()  # quality level
            item['pm2_5_val'] = td_list[4].xpath(
                'text()').extract_first()  # PM2.5
            item['pm10_val'] = td_list[5].xpath(
                'text()').extract_first()  # PM10
            item['so2_val'] = td_list[6].xpath('text()').extract_first()  # SO2
            item['co_val'] = td_list[7].xpath('text()').extract_first()  # CO
            item['no2_val'] = td_list[8].xpath('text()').extract_first()  # NO2
            item['o3_val'] = td_list[9].xpath('text()').extract_first()  # O3
            item['rank'] = td_list[10].xpath('text()').extract_first()  # rank

            yield item
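
Both examples build an AirQualityItem imported from the project's items module, which the snippets do not show. Below is a minimal sketch of a definition matching the fields Example 1 assigns (the field declarations are inferred, not the original file); Example 1 also assumes `from urllib import parse` at the top of the spider module for the unquote call:

    # items.py -- hypothetical sketch; field set inferred from Example 1
    import scrapy

    class AirQualityItem(scrapy.Item):
        city_name = scrapy.Field()      # decoded city name
        record_date = scrapy.Field()    # monitoring date
        aqi_val = scrapy.Field()        # AQI
        range_val = scrapy.Field()      # AQI range
        quality_level = scrapy.Field()  # quality level
        pm2_5_val = scrapy.Field()      # PM2.5
        pm10_val = scrapy.Field()       # PM10
        so2_val = scrapy.Field()        # SO2
        co_val = scrapy.Field()         # CO
        no2_val = scrapy.Field()        # NO2
        o3_val = scrapy.Field()         # O3
        rank = scrapy.Field()           # daily rank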
Example 2
    def parse(self, response):
        # XPath variants for scraping the city lists from the page,
        # kept for reference:
        # hcity_list = response.xpath('//div[@class="hot"]//div[@class="bottom"]//a//text()').extract()  # hot cities
        # ncity_list = response.xpath('//div[@class="all"]//div[@class="bottom"]//a//text()').extract()  # all cities
        ncity_list = [
            '石家庄', '济南', '兰州', '西宁', '呼和浩特', '青岛', '银川', '哈尔滨', '合肥', '武汉',
            '成都', '长春', '南昌', '南京', '乌鲁木齐', '郑州', '西安', '天津', '太原', '沈阳', '绍兴',
            '衢州', '长沙', '宁波', '南宁', '昆明', '贵阳', '东莞', '中山', '广州', '江门', '肇庆',
            '台州', '上海', '温州', '佛山', '北京', '湖州', '金华', '重庆', '嘉兴', '杭州', '海口',
            '福州', '深圳', '珠海', '丽水', '拉萨', '舟山', '厦门', '惠州'
        ]
        # Raw string avoids backslash-escape issues in the Windows path
        driver = webdriver.Chrome(r'D:\guge\chromedriver.exe')

        # Collects the month values for one city; used only by the
        # commented-out per-month variant further down
        def get_month_set(city):
            month_set = list()
            month_url = base_url_month + city
            driver.get(month_url)
            time.sleep(5)  # crude wait for the JavaScript-rendered table
            dfs = pd.read_html(driver.page_source, header=0)[0]
            for j in range(0, len(dfs)):
                month_set.append(dfs.iloc[j, 0])
            return month_set

        def get_city_set(hcity_list):
            # Copy the incoming city list into a working set
            city_set = list()
            for line in hcity_list:
                city_set.append(line)
            # city_set.append('北京')
            return city_set

        city_set = get_city_set(ncity_list)

        for city in city_set:
            # Old per-month CSV variant, kept for reference:
            # month_set = get_month_set(city)
            # file_name = city + '.csv'
            # fp = open(file_name, 'w')  # plus a header row
            # for i in range(len(month_set)):
            #     str_month = month_set[i]
            #     dateurl = base_url + city + '&month=' + str_month
            dateurl = base_url_month + city
            driver.get(dateurl)
            time.sleep(5)  # crude wait for the JavaScript-rendered table
            dfs = pd.read_html(driver.page_source, header=0)[0]

            for j in range(0, len(dfs)):
                # Fresh item per row; reusing a single instance would let
                # later rows overwrite items already passed to the pipelines
                item = AirQualityItem()
                item['city_name'] = city
                item['date'] = dfs.iloc[j, 0]  # monitoring date
                item['aqi'] = dfs.iloc[j, 1]  # AQI
                item['grade'] = dfs.iloc[j, 2]  # quality grade
                item['pm25'] = dfs.iloc[j, 3]  # PM2.5
                item['pm10'] = dfs.iloc[j, 4]  # PM10
                item['so2'] = dfs.iloc[j, 5]  # SO2
                item['co'] = dfs.iloc[j, 6]  # CO
                item['no2'] = dfs.iloc[j, 7]  # NO2
                item['o3'] = dfs.iloc[j, 8]  # O3
                yield item
                # print('%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (item['date'], item['aqi'], item['grade'], item['pm25'],
                # item['pm10'], item['so2'], item['co'], item['no2'], item['o3']))
                # fp.write(('%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (date, aqi, grade, pm25, pm10, so2, co, no2, o3)))
            # print('%d---%s,%s---DONE' % (city_set.index(city), city, str_month))
        # fp.close()
        driver.quit()
        print('Crawl finished! Data generated!')
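
Example 2 also leans on names defined outside the method. Below is a sketch of the module-level scaffolding it assumes: the imports are implied by the calls in the code, while the value of base_url_month and the import path for AirQualityItem are hypothetical placeholders (neither appears in the snippet):

    # Hypothetical scaffolding for Example 2 -- a sketch, not the original
    # module. Only the imports are implied directly by the code above.
    import time

    import pandas as pd
    from selenium import webdriver

    from air_quality.items import AirQualityItem  # hypothetical path

    # Hypothetical placeholder: the real query URL is never shown; the
    # code only concatenates it with a Chinese city name.
    base_url_month = 'https://example.invalid/historydata/daydata.php?city='

Note that Example 2 assigns a different field set (date, aqi, grade, pm25, pm10, so2, co, no2, o3) than Example 1, so its AirQualityItem would declare those names instead.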