def __init__(self):
    """Create the database helpers and collect every code version.

    ``self.version`` ends up holding, in period order, all entries that
    ``self.admin_db.version(y)`` yields for each ``y`` in
    ``self.admin_db.period``.
    """
    database = AdminCodeDatabase()
    self.admin_db = database
    self.admin_data = AdminData()
    self.period = database.period
    self.Province = self.admin_data.Province
    # Flatten the per-period version lists into a single list.
    self.version = [ver for y in self.period for ver in database.version(y)]
def __init__(self):
    """Initialise the base ``Database`` and bind to the city statistics data.

    Connects to collection ``CityStatistics`` of database ``regionDB`` and
    creates the ``AdminData`` helper stored on ``self.ad``.
    """
    Database.__init__(self)
    database_name, collection_name = 'regionDB', 'CityStatistics'
    self.connect(database_name, collection_name)
    self.ad = AdminData()
class CityStatDatabase(Database):
    """Access layer for the city statistics collection.

    Connects to collection ``CityStatistics`` of database ``regionDB`` and
    keeps an ``AdminData`` instance on ``self.ad`` to resolve region selectors
    into administrative codes.
    """

    def __init__(self):
        Database.__init__(self)
        self.connect('regionDB', 'CityStatistics')
        self.ad = AdminData()

    def find(self, conds, is_to_standard_form=True):
        """Query regional data from the city statistical yearbook.

        :param dict conds: query conditions. The optional keys ``projection``,
            ``sorts`` and ``year`` are consumed here; keys starting with
            ``region`` are not sent to the database directly (``region`` is
            resolved to ``acode`` values through ``self.ad``); every other key
            is used as a filter, list values being turned into ``$in`` clauses.
            The caller's dict is left untouched.
        :param bool is_to_standard_form: when true, the rows are passed
            through ``RegionFormat(...).transform()``.
        :return: the value of ``RegionFormat(result).transform()`` when
            ``is_to_standard_form`` is true, otherwise the de-duplicated
            ``pandas.DataFrame`` of matching rows.
        """
        # Work on a shallow copy: the control keys are popped below and the
        # caller should be able to reuse its dict afterwards.
        conds = dict(conds)

        projection = conds.pop('projection', None)
        if projection is None:
            projection = {'region': 1, 'year': 1, 'value': 1, 'acode': 1,
                          '_id': 0, 'variable': 1, 'scale': 1}

        # Sort order
        sorts = conds.pop('sorts', None)
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        # Time span: default to every year available for the variables.
        period = conds.pop('year', None)
        if period is None:
            # Only hit the database for the full variable list when the
            # caller did not name any variable.
            variables = conds['variable'] if 'variable' in conds else self.variables
            period = self.period(variables)

        conditions = {}
        for key, value in conds.items():
            if key.startswith('region'):
                continue
            conditions[key] = {'$in': value} if isinstance(value, list) else value

        # Regions have to be resolved year by year.
        if 'region' in conds:
            # A single year (int or str) must not be iterated element-wise.
            years = [period] if isinstance(period, (int, str)) else period
            rows = []
            for year in years:
                conditions['year'] = year
                self.ad.set_year(year)
                conditions['acode'] = {'$in': [region['acode']
                                               for item in conds['region']
                                               for region in self.ad[tuple(item)]]}
                rows.extend(self.collection.find(conditions, projection).sort(sorts))
            result_found = pd.DataFrame(rows)
        else:
            if isinstance(period, list):
                conditions['year'] = {'$in': period}
            else:
                conditions['year'] = period
            result_found = pd.DataFrame(
                list(self.collection.find(conditions, projection).sort(sorts)))

        # ``keep='last'`` is the replacement for the removed ``take_last=True``.
        result = result_found.drop_duplicates(keep='last')

        if is_to_standard_form:
            return RegionFormat(result).transform()
        return result

    @property
    def variables(self):
        """Distinct values of the ``variable`` field in the collection."""
        return self.collection.find().distinct('variable')

    def period(self, variable):
        """Return the sorted years for which the variable(s) have data.

        :param str,list variable: one variable name or an iterable of names
        :return: sorted distinct ``year`` values
        :rtype: list
        """
        names = [variable] if isinstance(variable, str) else variable
        years = set()
        for name in names:
            years.update(self.collection.find({'variable': name}).distinct('year'))
        return sorted(years)
class AdminCodeCheck:
    """Compare administrative divisions across code versions.

    Each checker builds frames indexed by ``acode`` with one column per
    version, holding the region name recorded in that version.
    """

    def __init__(self):
        self.admin_db = AdminCodeDatabase()
        self.admin_data = AdminData()
        self.period = self.admin_db.period
        self.Province = self.admin_data.Province
        self.version = list()
        for y in self.period:
            self.version.extend(self.admin_db.version(y))

    @staticmethod
    def _column_frame(pairs, ver):
        """Turn ``[acode, region]`` pairs into a frame indexed by acode with one column ``ver``."""
        return pd.DataFrame(pairs, columns=['acode', ver]).set_index('acode')

    @staticmethod
    def _outer_join(accumulated, frame):
        """Outer-join ``frame`` onto ``accumulated`` by index; the first frame starts the result."""
        if accumulated is None:
            return frame
        return pd.merge(accumulated, frame, left_index=True, right_index=True, how='outer')

    # Check divisions of one level across all versions
    def admin_checker(self, level='s'):
        """Tabulate the divisions selected by ``level`` for every version.

        :param level: key passed to ``self.admin_data[...]`` (default ``'s'``)
        :return: frame indexed by ``acode`` with one column per version, or
            ``None`` when ``self.version`` is empty
        """
        result = None
        for ver in sorted(self.version):
            self.admin_data.set_version(ver)
            self.admin_division = self.admin_data[level]
            pairs = [[p['acode'], p['region']] for p in self.admin_division]
            result = self._outer_join(result, self._column_frame(pairs, ver))
        return result

    # Check from another angle: start from one province
    def admin_division_checker(self, province=None):
        """Tabulate the prefectures and counties of one province per version.

        Versions in which ``self.admin_data[province]`` is empty are skipped.

        :param province: key passed to ``self.admin_data[...]``
        :return: dict with ``prefectures`` (frame), ``counties_with_prefecture``
            (dict mapping prefecture acode to a frame) and
            ``counties_without_prefecture`` (frame); the frames are ``None``
            when no version contained the province
        """
        result_prefectures = None
        result_counties = dict()
        result_counties_alone = None
        for ver in sorted(self.version):
            self.admin_data.set_version(ver)
            # Province level
            province_record = self.admin_data[province]
            if len(province_record) < 1:
                continue
            province_name = province_record[0]['region']

            # Prefecture level
            prefecture_records = self.admin_data[tuple([province_name, 'f'])]
            prefectures = [[p['acode'], p['region']] for p in prefecture_records]
            result_prefectures = self._outer_join(
                result_prefectures, self._column_frame(prefectures, ver))

            # County level, grouped by prefecture
            for prefecture_code, prefecture_name in prefectures:
                county_records = self.admin_data[tuple([province_name, prefecture_name, 'f'])]
                counties = [[p['acode'], p['region']] for p in county_records]
                result_counties[prefecture_code] = self._outer_join(
                    result_counties.get(prefecture_code), self._column_frame(counties, ver))

            # Counties that hang directly off the province
            county_alone_records = self.admin_data.get_county_children(
                province=province_name, without_prefecture=True)
            counties_alone = [[p['acode'], p['region']] for p in county_alone_records]
            # The first version must seed the result with the stand-alone
            # counties themselves, not with a prefecture's county frame.
            result_counties_alone = self._outer_join(
                result_counties_alone, self._column_frame(counties_alone, ver))

        return {'prefectures': result_prefectures,
                'counties_with_prefecture': result_counties,
                'counties_without_prefecture': result_counties_alone}
def __init__(self, data):
    """Initialise the base ``Format`` with ``data`` and derive the data dimensions.

    :param data: regional data handed on to ``Format.__init__``
    """
    Format.__init__(self, data)
    admin = AdminData()
    self.ad = admin
    # Populate year / acode / variable / ndim (and scale when present).
    self.to_set_type()
class RegionFormat(Format):
    """Convert regional data between layouts.

    The methods read ``self._data`` as a stacked (long) frame with the columns
    ``year``, ``acode``, ``region``, ``variable``, ``value`` and, optionally,
    ``scale``.
    NOTE(review): ``self._data`` is presumably set by ``Format.__init__`` from
    ``data`` -- confirm against the ``Format`` base class.

    :param data: regional data
    """

    def __init__(self, data):
        Format.__init__(self, data)
        self.ad = AdminData()
        self.to_set_type()

    # Format conversion entry point
    def transform(self, sourcetype='stack', targettype='normal', connect='outer'):
        """Convert the data from ``sourcetype`` to ``targettype``.

        Only the stack -> normal conversion is implemented; any other
        combination returns ``None``.

        :param str sourcetype: source layout, must start with ``'stack'``
        :param str targettype: target layout, must start with ``'normal'``
        :param str connect: join type used for cross-section data
        :return: result of :meth:`stack_to_normal`, or ``None``
        """
        if sourcetype.startswith('stack') and targettype.startswith('normal'):
            return self.stack_to_normal(connect=connect)

    def _first_group(self, keys):
        """Return the rows of the first group when ``self._data`` is grouped by ``keys``."""
        return next(iter(self._data.groupby(keys, sort=True)))[1]

    @staticmethod
    def _wide_frame(rows, keys, how):
        """Pivot stacked ``rows`` into one column per ``keys`` group, indexed by acode.

        Multi-key groups are labelled by joining the key parts with ``_``.
        A leading ``region`` column is filled from the rows' own acode/region pairs.
        """
        region_by_code = dict(zip(rows['acode'], rows['region']))
        wide = None
        for name, group in rows.groupby(keys, sort=True):
            label = '_'.join(name) if isinstance(name, tuple) else name
            column = pd.DataFrame({label: group['value'].values},
                                  index=group['acode'].values)
            if wide is None:
                wide = column
            else:
                wide = pd.merge(wide, column, left_index=True, right_index=True, how=how)
        wide.insert(0, 'region', [region_by_code[code] for code in wide.index])
        return wide

    @staticmethod
    def _flatten_panel(panel):
        """Reorder the panel axes and flatten it to a frame without dropping incomplete rows."""
        swapped = panel.swapaxes('items', 'minor').swapaxes('major', 'minor')
        return swapped.to_frame(False)

    def _panel_result(self, keys):
        """Build the per-year panel plus its full and balanced flattened frames."""
        frames = {}
        for year, rows in self._data.groupby('year', sort=True):
            frames[str(year)] = self._wide_frame(rows, keys, 'outer')
        # NOTE(review): pd.Panel was removed in pandas 0.25; it is kept here
        # because the Panel object is part of the returned structure.
        panel = pd.Panel(frames)
        return {'data': self._flatten_panel(panel),
                'pdata': panel,
                'balanceddata': self._flatten_panel(panel.dropna(axis=1))}

    def stack_to_normal(self, connect):
        """Convert regional data from the stacked layout to the normal layout.

        :param str connect: join type for cross-section data, e.g. ``'inner'``
            or ``'outer'``
        :return: dict with a ``data`` entry and, depending on the shape of the
            data, ``tags``, ``pdata`` and ``balanceddata``
        :rtype: dict
        """
        scale = self.ndim.get('scale')

        # More than one scale present: columns are labelled variable_scale.
        if (scale is not None) and scale > 1:
            keys = ['variable', 'scale']
            # Cross-section (region | variable)
            if self.ndim['year'] == 1:
                mdata = self._wide_frame(self._first_group('year'), keys, connect)
                # tags must be a dict, like in every other branch.
                return {'tags': {'year': self.year[0]}, 'data': mdata}
            # Panel data
            return self._panel_result(keys)

        # No scale column, or a single scale.
        # Time series: one variable for one region.
        if (self.ndim['variable'] == 1) and (self.ndim['region'] == 1):
            tags = {'variable': self.variable[0],
                    'region': self.ad.get_by_acode(self.acode[0])[0]['region']}
            values = self._first_group(['acode', 'variable'])['value']
            result = pd.Series(dict(zip(self.year, values)))
            if scale is not None:
                tags['scale'] = self.scale[0]
            return {'tags': tags, 'data': result}

        # Cross-section (region | variable)
        if self.ndim['year'] == 1:
            mdata = self._wide_frame(self._first_group('year'), 'variable', connect)
            tags = {'year': self.year[0]}
            if scale is not None:
                tags['scale'] = self.scale[0]
            return {'tags': tags, 'data': mdata}

        # Panel data
        output = self._panel_result('variable')
        if scale is not None:
            tagged = {'tags': {'scale': self.scale}}
            tagged.update(output)
            return tagged
        return output

    def _sorted_values(self, column):
        """Return the sorted distinct group keys of ``column`` in ``self._data``."""
        # A scalar key keeps the group names scalar on every pandas version.
        return [name for name, _ in self._data.groupby(column, sort=True)]

    # Helper that records the structure of the data
    def to_set_type(self):
        """Derive ``year``, ``acode``, ``variable``, ``ndim`` (and ``scale``) from the data.

        :return: None
        """
        # Years, as strings
        self.year = [str(name) for name in self._sorted_values('year')]
        # Administrative codes
        self.acode = self._sorted_values('acode')
        self.number_of_region = len(self.acode)
        # Variables
        self.variable = self._sorted_values('variable')
        # Dimensions
        self.ndim = {'year': len(self.year),
                     'variable': len(self.variable),
                     'region': self.number_of_region}
        # Scale, only when the data carries a ``scale`` column
        if 'scale' in self._data.columns:
            self.scale = self._sorted_values('scale')
            self.ndim['scale'] = len(self.scale)
class Layout:
    """Convert stacked regional data into other layouts.

    Attributes:
        self.ad: an ``AdminData`` instance
        self._data: the data that was passed in
        self.year: the years in the data, a list of str
        self.acode: the administrative codes in the data, a list
        self.region: the region names for ``self.acode``, a list
        self.variable: the variables in the data, a list
        self.ndim: number of years / variables / regions, a dict

    Methods:
        __init__(self, data=None): store the data and derive its structure.
        _type(self) -> dict: helper that records and returns the structure.
        stackToNormal(self): convert from the stacked to the normal layout.

    Demo (from the original documentation):
        ad = AdminCode()
        rdata = RegionalData()
        # query one variable for the prefectures of one province in 2012
        mdata = rdata.query(region=ad[u'浙江', u'f'], variable=[u'财政支出'], year=2012)
        lout = Layout(mdata)
        print(lout.stackToNormal())

        The result is a frame indexed by acode with a ``region`` column
        followed by one column per variable, e.g.
                  region   财政支出
        330100   杭州市    78628
        330200   宁波市    82844
    """

    # Constructor
    def __init__(self, data=None):
        self.ad = AdminData()
        self._data = data
        self.tags = self._type()
        self.ndim = {'year': len(self.tags['year']),
                     'variable': len(self.tags['variable']),
                     'region': len(self.tags['region'])}

    def _wide_frame(self, rows, key):
        """Pivot stacked ``rows`` into one column per ``key`` value, indexed by acode.

        A leading ``region`` column is looked up through ``self.ad.get_by_acode``.
        """
        wide = None
        # A scalar key keeps the column labels scalar on every pandas version.
        for label, group in rows.groupby(key, sort=True):
            column = pd.DataFrame({label: group['value'].values},
                                  index=group['acode'].values)
            if wide is None:
                wide = column
            else:
                wide = pd.merge(wide, column, left_index=True, right_index=True, how='outer')
        wide.insert(0, 'region',
                    [self.ad.get_by_acode(code)[0]['region'] for code in wide.index])
        return wide

    # Layout conversion
    def stackToNormal(self):
        """Convert the stacked data to the layout that matches its dimensions.

        :return: a ``pandas.Series`` keyed by year for one variable and one
            region; a frame (region | variable) for a single year; a frame
            (region | year) for a single variable; otherwise a ``pd.Panel``
            with one (region | variable) frame per year
        """
        # Time series: one variable for one region
        if (self.ndim['variable'] == 1) and (self.ndim['region'] == 1):
            first = next(iter(self._data.groupby(['acode', 'variable'], sort=True)))[1]
            return pd.Series(dict(zip(self.year, first['value'])))

        # Cross-section (region | variable)
        if self.ndim['year'] == 1:
            single_year = next(iter(self._data.groupby('year', sort=True)))[1]
            return self._wide_frame(single_year, 'variable')

        # Cross-section (region | year)
        if self.ndim['variable'] == 1:
            return self._wide_frame(self._data, 'year')

        # Panel data
        frames = {}
        for year, rows in self._data.groupby('year', sort=True):
            frames[str(year)] = self._wide_frame(rows, 'variable')
        # NOTE(review): pd.Panel was removed in pandas 0.25; it is kept because
        # the Panel object is the documented return value of this branch.
        return pd.Panel(frames)

    # Helper that records the structure of the data
    def _type(self) -> dict:
        """Record the years, codes, region names and variables found in the data.

        :return: dict with the keys ``year``, ``region`` and ``variable``
        """
        self.year = [str(name) for name, _ in self._data.groupby('year', sort=True)]
        self.acode = [name for name, _ in self._data.groupby('acode', sort=True)]
        self.region = [self.ad.get_by_acode(item)[0]['region'] for item in self.acode]
        self.variable = [name for name, _ in self._data.groupby('variable', sort=True)]
        return {'year': self.year, 'region': self.region, 'variable': self.variable}
def __init__(self, data=None):
    """Store ``data`` and derive its structure.

    ``self.tags`` is the dict returned by ``self._type()``; ``self.ndim``
    holds the length of its ``year``, ``variable`` and ``region`` entries.

    :param data: the stacked data to convert
    """
    self.ad = AdminData()
    self._data = data
    structure = self._type()
    self.tags = structure
    self.ndim = {key: len(structure[key]) for key in ('year', 'variable', 'region')}
from lib.database.class_Database import Database
from werkzeug.datastructures import ImmutableMultiDict
import json

# NOTE(review): AdminData is used below but is not imported in the visible
# part of this script -- confirm where it is brought into scope.

year = list(range(1990, 2015))
print(year)

# Demo built on the CEIC data
# 1. Load the CEIC collection and list its distinct administrative codes
db = Database()
con = db.connect("regionDB", "CEIC")
distinct_codes = con.find().distinct("acode")
ceic_region_code = sorted(distinct_codes)
print(len(ceic_region_code))

# 2. Look every code up in the administrative division database
admin_data = AdminData()
regions = []
for acode in ceic_region_code:
    regions.append(admin_data.get_by_acode(acode=acode)[0])

# 3. Build the region rows: [acode, parent label, region name]
region_list = []
for region in regions:
    # The first element of each row is the administrative code
    if region["adminlevel"] >= 3:
        parent = admin_data.database.collection.find_one({"_id": region["parent"]})
        parent_suffix = "".join([parent["region"], "属下"])
        parent = "/".join(["中国", parent_suffix])
    else:
        parent = u"中国"
    row = [region["acode"], parent, region["region"]]
    region_list.append(row)
print(region_list)
# Disabled export of the result:
# json.dump(region_list,fp=open('e:/gitwork/application/testweb/region_ceic.txt', 'w'))