class XingZhengQuHua(WebSpyder):
    '''Crawler for the administrative-division pages of www.xzqh.org.

    Pages are fetched through ``self.get_data`` (expected to be provided by
    ``WebSpyder``) and parsed with BeautifulSoup using the ``lxml`` parser.
    '''

    # Prefix joined with the relative ``href`` values found on the pages.
    BASE_URL = 'http://www.xzqh.org/html/'

    # Entry names that are skipped while parsing ("overview map" and
    # "historical evolution").
    EXCLUDED_NAMES = ('概况地图', '历史沿革')

    def __init__(self):
        super(XingZhengQuHua, self).__init__()
        self.top_url = self.BASE_URL
        self.encoding = 'gbk'
        self.LOG_FILE = 'XingZhengQuHua.log'
        self.location = Location()
        self.logger = LOG('XingZhengQuHua', self.LOG_FILE)

    def get_location_by_name(self, name):
        '''Return ``(latitude, longitude)`` for the place called *name*.

        The lookup is delegated to ``self.location.geocode``.  ``None`` is
        returned when the geocoder gives back ``None`` instead of a result
        object (previously this raised ``AttributeError``).
        '''
        locate = self.location.geocode(name)
        if locate is None:
            return None
        return (locate.latitude, locate.longitude)

    def get_xiangzheng_jiedao(self, url):
        '''Collect the township / sub-district links listed on *url*.

        Returns a dict holding the page's own address under ``'link'`` plus
        one ``{'link': <absolute url>}`` entry per list item, keyed by the
        text of the item's first anchor.  Items whose first ``<span>`` text
        is in ``EXCLUDED_NAMES`` are skipped.

        If the page has no ``ul.text_list.text_list_f14`` list, only
        ``{'link': url}`` is returned; list items lacking a ``<span>`` or an
        ``<a>`` are ignored rather than raising ``IndexError``.
        '''
        data = self.get_data(url)
        soup = BeautifulSoup(data, 'lxml')
        r_dict = {'link': url}

        lists = soup.findAll('ul', attrs={'class': 'text_list text_list_f14'})
        if not lists:
            return r_dict

        for li in lists[0].findAll('li'):
            spans = li.findAll('span')
            anchors = li.findAll('a')
            if not spans or not anchors:
                continue
            if spans[0].get_text() in self.EXCLUDED_NAMES:
                continue
            anchor = anchors[0]
            r_dict[anchor.get_text()] = {
                'link': self.BASE_URL + anchor['href'],
            }
        return r_dict

    def _parse_group(self, ul):
        '''Return ``(text, absolute link)`` for every ``<li>`` of *ul* that
        contains at least one anchor.'''
        entries = []
        for li in ul.findAll('li'):
            anchors = li.findAll('a')
            if not anchors:
                continue
            link = self.BASE_URL + anchors[0].attrs['href']
            entries.append((li.get_text(), link))
        return entries

    def get_xianshi(self, url, dump_path='a.txt'):
        '''Build the nested county / city dictionary for the page at *url*.

        Every ``<ul>`` inside the first ``div.cate`` is treated as one group
        whose first item is the group heading and whose remaining items are
        its members.  The heading of the first group becomes the single
        top-level key; every later group is stored beneath it as::

            {heading: {'link': heading_link,
                       member: <get_xiangzheng_jiedao(member_link)>, ...}}

        Empty groups and groups whose heading is in ``EXCLUDED_NAMES`` are
        dropped.  (The original comparison tested a ``(name, link)`` tuple
        against the names, so it never matched and crashed on empty groups.)

        Args:
            url: Address of the page to parse.
            dump_path: File that receives the result as JSON.  Defaults to
                ``'a.txt'``; pass ``None`` to skip writing.

        Returns:
            The nested dict described above, or ``{}`` when the page holds
            no usable group (nothing is written in that case).
        '''
        import json

        data = self.get_data(url)
        soup = BeautifulSoup(data, 'lxml')
        containers = soup.findAll('div', attrs={'class': 'cate'})
        if not containers:
            return {}

        # Built eagerly as a list: the groups are indexed and sliced below,
        # which lazy map/filter objects do not support on Python 3.
        groups = []
        for ul in containers[0].findAll('ul'):
            entries = self._parse_group(ul)
            if entries and entries[0][0] not in self.EXCLUDED_NAMES:
                groups.append(entries)
        if not groups:
            return {}

        top_name = groups[0][0][0]
        children = {}
        for group in groups[1:]:
            heading_name, heading_link = group[0]
            node = {'link': heading_link}
            for member_name, member_link in group[1:]:
                node[member_name] = self.get_xiangzheng_jiedao(member_link)
            children[heading_name] = node
        result_dict = {top_name: children}

        if dump_path is not None:
            # json.dumps escapes non-ASCII by default, so the text is plain
            # ASCII and needs no explicit encode step before writing.
            with open(dump_path, 'w') as f:
                f.write(json.dumps(result_dict))
        return result_dict