def parse_city_day(self, response):
    """Parse one city's daily air-quality records from its data page.

    :param response: Scrapy response for a city's data page; the
        URL-encoded city name sits in the query string between the
        first '=' and the first '&'.
    :return: yields one ``AirQualityItem`` per table data row.
    """
    url = response.url
    # Extract the URL-encoded city name from the query string and
    # decode it back to its Chinese form.
    city_url_name = url[url.find('=') + 1:url.find('&')]
    city_name = parse.unquote(city_url_name)

    # All rows of the daily-records table (first row is the header).
    day_record_list = response.xpath(
        '//table[@class="table table-condensed '
        'table-bordered table-striped table-hover '
        'table-responsive"]//tr')

    # Skip the header row; build a FRESH item per data row.  The
    # original reused a single item instance across all yields, which
    # can corrupt previously-yielded items because Scrapy pipelines
    # and exporters may hold references to them asynchronously.
    for day_record in day_record_list[1:]:
        td_list = day_record.xpath('.//td')
        item = AirQualityItem()
        item['city_name'] = city_name
        item['record_date'] = td_list[0].xpath('text()').extract_first()  # observation date
        item['aqi_val'] = td_list[1].xpath('text()').extract_first()  # AQI
        item['range_val'] = td_list[2].xpath('text()').extract_first()  # AQI range
        item['quality_level'] = td_list[3].xpath(
            './/div/text()').extract_first()  # quality grade
        item['pm2_5_val'] = td_list[4].xpath('text()').extract_first()  # PM2.5
        item['pm10_val'] = td_list[5].xpath('text()').extract_first()  # PM10
        item['so2_val'] = td_list[6].xpath('text()').extract_first()  # SO2
        item['co_val'] = td_list[7].xpath('text()').extract_first()  # CO
        item['no2_val'] = td_list[8].xpath('text()').extract_first()  # NO2
        item['o3_val'] = td_list[9].xpath('text()').extract_first()  # O3
        item['rank'] = td_list[10].xpath('text()').extract_first()  # city rank
        yield item
def parse(self, response):
    """Crawl the monthly AQI table for each city in a fixed list.

    Uses Selenium to render the JavaScript-built page, then pandas
    ``read_html`` to lift the first table into a DataFrame, yielding
    one ``AirQualityItem`` per (city, day) row.

    :param response: Scrapy response for the start URL (only used to
        trigger the crawl; the data itself is fetched via Selenium).
    :return: yields ``AirQualityItem`` instances.
    """
    # Fixed list of cities to crawl.  (The original scraped this from
    # the index page with:
    #   //div[@class="hot"]//div[@class="bottom"]//a//text()   hot cities
    #   //div[@class="all"]//div[@class="bottom"]//a//text()   all cities)
    ncity_list = [
        '石家庄', '济南', '兰州', '西宁', '呼和浩特', '青岛', '银川',
        '哈尔滨', '合肥', '武汉', '成都', '长春', '南昌', '南京',
        '乌鲁木齐', '郑州', '西安', '天津', '太原', '沈阳', '绍兴',
        '衢州', '长沙', '宁波', '南宁', '昆明', '贵阳', '东莞', '中山',
        '广州', '江门', '肇庆', '台州', '上海', '温州', '佛山', '北京',
        '湖州', '金华', '重庆', '嘉兴', '杭州', '海口', '福州', '深圳',
        '珠海', '丽水', '拉萨', '舟山', '厦门', '惠州'
    ]

    # NOTE(review): hard-coded driver path — consider moving this into
    # spider settings.  Raw string avoids accidental escape sequences
    # in the Windows path.
    driver = webdriver.Chrome(r'D:\guge\chromedriver.exe')
    try:
        for city in ncity_list:
            dateurl = base_url_month + city
            driver.get(dateurl)
            time.sleep(5)  # wait for the JS-rendered table to appear
            # First table on the page, first row used as the header.
            dfs = pd.read_html(driver.page_source, header=0)[0]
            for j in range(len(dfs)):
                # Fresh item per row: reusing a single instance across
                # yields can corrupt earlier items when Scrapy pipelines
                # process them asynchronously.
                item = AirQualityItem()
                item['city_name'] = city
                item['date'] = dfs.iloc[j, 0]   # observation date
                item['aqi'] = dfs.iloc[j, 1]    # AQI
                item['grade'] = dfs.iloc[j, 2]  # quality grade
                item['pm25'] = dfs.iloc[j, 3]   # PM2.5
                item['pm10'] = dfs.iloc[j, 4]   # PM10
                item['so2'] = dfs.iloc[j, 5]    # SO2
                item['co'] = dfs.iloc[j, 6]     # CO
                item['no2'] = dfs.iloc[j, 7]    # NO2
                item['o3'] = dfs.iloc[j, 8]     # O3
                yield item
    finally:
        # Always release the browser process, even if a page fails
        # mid-crawl (the original leaked Chrome on any exception).
        driver.quit()
    print('爬虫已经爬完!数据已经生成!')