Ejemplo n.º 1
0
 def __init__(self, data=None):
     """Initialise the formatter and index the supplied data.

     Args:
         data: Forwarded unchanged to ``Format.__init__``; defaults to
             ``None``.
     """
     # Explicit base-class call so ``Format`` handles ``data`` first.
     Format.__init__(self, data)
     # Project helper kept on the instance (presumably an
     # administrative-division lookup -- confirm against AdminData).
     self.ad = AdminData()
     # ``_type`` is not part of this snippet; it runs last, after the base
     # initialiser and the helper are in place.
     self._type()
Ejemplo n.º 2
0
class RegionFormat(Format):
    """Convert stacked (long-format) regional data into wide layouts.

    The frame read from ``self._data`` must provide the columns ``year``,
    ``acode``, ``region``, ``variable`` and ``value``; a ``scale`` column is
    optional.

    NOTE(review): ``self._data`` is never assigned in this class; it is
    presumably set by ``Format.__init__`` -- confirm against the base class.
    """

    def __init__(self, data=None):
        """Hand ``data`` to ``Format`` and index its dimensions.

        Args:
            data: Forwarded unchanged to ``Format.__init__``.
        """
        Format.__init__(self, data)
        self.ad = AdminData()
        self._type()

    def transform(self, sourcetype='stack', targettype='normal', connect='outer'):
        """Convert the data from ``sourcetype`` layout to ``targettype`` layout.

        Only the stack -> normal conversion is implemented.  Both names are
        matched by prefix (``re.match``), so e.g. ``'stacked'`` is accepted.

        Args:
            sourcetype: Name of the current layout.
            targettype: Name of the requested layout.
            connect: ``how`` argument handed to ``pd.merge`` when the
                per-variable columns are joined.

        Returns:
            The dict produced by ``_stackToNormal``, or ``None`` when the
            requested conversion is not supported.
        """
        if (re.match('stack', sourcetype) is not None) and (re.match('normal', targettype) is not None):
            return self._stackToNormal(connect=connect)
        return None

    def _stackToNormal(self, connect):
        """Build the wide representation that fits the shape of the data.

        Args:
            connect: ``how`` argument for every ``pd.merge`` that joins the
                per-variable columns.  It is now honoured in all branches;
                previously three of the four merge sites hard-coded
                ``'outer'`` (the default value, so default calls behave the
                same).

        Returns:
            A dict whose keys depend on the branch taken:

            * time series (one variable, one region, at most one scale):
              ``tags``, ``data`` (one-column DataFrame), ``sdata`` (Series);
            * cross-section (a single year): ``tags``, ``data``;
            * panel (several years): ``data``, ``pdata``, ``balanceddata``
              and, when a ``scale`` column exists with a single value,
              ``tags``.
        """
        scale = self.ndim.get('scale')

        # Several scales are present (the original comment describes this as
        # whole-city versus city-district figures): columns are then named
        # "<variable>_<scale>".
        if (scale is not None) and scale > 1:
            keys = ['variable', 'scale']
            if self.ndim['year'] == 1:
                frame = [group for _, group in self._data.groupby('year', sort=True)][0]
                # Fixed: this used to be the set literal {'year', <year>}.
                tags = {'year': self.year[0]}
                return {'tags': tags, 'data': self._pivotByRegion(frame, keys, connect)}
            wide, panel, balanced = self._buildPanel(keys, connect)
            return {'data': wide, 'pdata': panel, 'balanceddata': balanced}

        # No scale column, or exactly one scale value.
        # Time series: a single variable observed for a single region.
        if (self.ndim['variable'] == 1) and (self.ndim['region'] == 1):
            tags = {
                'variable': self.variable[0],
                'region': self.ad.getByAcode(self.acode[0])[0]['region'],
            }
            groups = [group for _, group in self._data.groupby(['acode', 'variable'], sort=True)]
            # Values are paired with ``self.year`` by position; this relies on
            # the rows being ordered by year with one row per year (not
            # checked here).
            series = pd.Series(dict(zip(self.year, groups[0]['value'])))
            table = pd.DataFrame({tags['variable']: series})
            if scale is not None:
                tags['scale'] = self.scale[0]
            return {'tags': tags, 'data': table, 'sdata': series}

        # Cross-section: a single year, one column per variable.
        if self.ndim['year'] == 1:
            frame = [group for _, group in self._data.groupby('year', sort=True)][0]
            wide = self._pivotByRegion(frame, 'variable', connect)
            tags = {'year': self.year[0]}
            if scale is not None:
                tags['scale'] = self.scale[0]
            return {'tags': tags, 'data': wide}

        # Panel: several years.
        wide, panel, balanced = self._buildPanel('variable', connect)
        if scale is not None:
            # Unlike the branches above, the whole list of scales is reported.
            tags = {'scale': self.scale}
            return {'tags': tags, 'data': wide, 'pdata': panel, 'balanceddata': balanced}
        return {'data': wide, 'pdata': panel, 'balanceddata': balanced}

    def _pivotByRegion(self, frame, keys, connect):
        """Pivot the stacked rows of one year into a region-by-variable table.

        Args:
            frame: Stacked rows with ``acode``, ``region``, ``value`` and the
                column(s) named in ``keys``.
            keys: ``'variable'`` or ``['variable', 'scale']``.  With a list,
                the column label is the group key joined with ``'_'``.
            connect: ``how`` argument for ``pd.merge``.

        Returns:
            A DataFrame indexed by ``acode`` whose first column is ``region``
            followed by one column per group.

        Raises:
            ValueError: If ``frame`` contains no group for ``keys``.
        """
        regionpair = dict(zip(frame['acode'], frame['region']))
        wide = None
        for name, part in frame.groupby(keys, sort=True):
            label = '_'.join(name) if isinstance(keys, list) else name
            column = pd.DataFrame({label: part['value'].values}, index=part['acode'].values)
            if wide is None:
                wide = column
            else:
                wide = pd.merge(wide, column, left_index=True, right_index=True, how=connect)
        if wide is None:
            raise ValueError('no %r groups found in the data' % (keys,))
        wide.insert(0, 'region', [regionpair[code] for code in wide.index])
        return wide

    def _buildPanel(self, keys, connect):
        """Build the per-year wide tables and their panel representations.

        NOTE: ``pd.Panel`` was removed from pandas in 0.25, so this path only
        runs on older pandas.  Replacing it would change the type of the
        ``pdata`` value handed to callers, so it is left as is.

        Returns:
            A tuple ``(frame, panel, balanced_frame)`` where ``balanced_frame``
            is derived from ``panel.dropna(axis=1)``.
        """
        tables = {}
        for year, frame in self._data.groupby('year', sort=True):
            tables[str(year)] = self._pivotByRegion(frame, keys, connect)
        panel = pd.Panel(tables)
        return panel, panel, panel  if False else (
            self._panelToFrame(panel),
            panel,
            self._panelToFrame(panel.dropna(axis=1)),
        )

    @staticmethod
    def _panelToFrame(panel):
        """Flatten a panel into a frame after two axis swaps."""
        swapped = panel.swapaxes('items', 'minor')
        swapped = swapped.swapaxes('major', 'minor')
        return swapped.to_frame(False)

    def _type(self) -> None:
        """Record the distinct years, regions, variables and scales.

        Sets ``self.year`` (sorted, as strings), ``self.acode``,
        ``self.numOfRegion``, ``self.variable`` and ``self.ndim`` (counts per
        dimension).  ``self.scale`` and ``self.ndim['scale']`` are only set
        when the data has a ``scale`` column.

        Grouping uses scalar keys rather than one-element lists: recent
        pandas yields tuple group names for list groupers, which would turn
        the years into strings such as ``"(2010,)"``.
        """
        self.year = [str(name) for name, _ in self._data.groupby('year', sort=True)]

        by_acode = self._data.groupby('acode', sort=True)
        self.acode = [name for name, _ in by_acode]
        self.numOfRegion = len(by_acode.groups)

        self.variable = [name for name, _ in self._data.groupby('variable', sort=True)]

        self.ndim = {'year': len(self.year), 'variable': len(self.variable), 'region': self.numOfRegion}

        if 'scale' in self._data.columns:
            self.scale = [name for name, _ in self._data.groupby('scale', sort=True)]
            self.ndim['scale'] = len(self.scale)
Ejemplo n.º 3
0
 def __init__(self):
     """Set up the database wrapper and connect it to its data source."""
     Database.__init__(self)
     # Connects with the names 'regionDB' / 'CEIC' (presumably database and
     # collection -- confirm against Database._connect).  An earlier
     # revision passed 'CityStatistics' as the second name.
     self._connect('regionDB', 'CEIC')
     # Project helper kept on the instance as ``ad``.
     self.ad = AdminData()
Ejemplo n.º 4
0
class CityStatDatabase(Database):
    """Query wrapper for the city-statistics data.

    The connection is opened with the names ``'regionDB'`` / ``'CEIC'``
    (presumably database and collection -- confirm against
    ``Database._connect``).
    """

    def __init__(self):
        """Connect to the data source and create the ``AdminData`` helper."""
        Database.__init__(self)
        self._connect('regionDB', 'CEIC')
        self.ad = AdminData()

    def find(self, conds, toStandardForm=True):
        """Run a query described by ``conds`` and return the matching rows.

        Args:
            conds: Query description.  Recognised keys:

                * ``projection`` -- projection passed to ``collection.find``;
                  a default selecting region/year/value/acode/variable/scale
                  is used when missing.
                * ``sorts`` -- sort specification; defaults to year, then
                  acode, both ascending.
                * ``year`` -- a year or an iterable of years; when missing,
                  every year available for the requested variables is used.
                * ``region`` -- list of regions.  If the first item is a
                  six-digit string the items are used as ``acode`` values,
                  otherwise each item is resolved through ``self.ad``.
                * any other key not starting with ``region`` -- a filter;
                  list values become ``$in`` conditions.

                The dict is not modified (earlier versions popped keys from
                the caller's dict).
            toStandardForm: When true, the rows are passed through
                ``RegionFormat(...).transform()``.

        Returns:
            The transformed result when ``toStandardForm`` is true, else a
            DataFrame with duplicate rows dropped (last occurrence kept).
            ``None`` is returned when nothing matched and
            ``toStandardForm`` is true.
        """
        # Shallow copy: the special keys are removed below and the caller's
        # dict must stay intact.
        conds = dict(conds)

        projection = conds.pop('projection', None)
        if projection is None:
            projection = {'region': 1, 'year': 1, 'value': 1, 'acode': 1, '_id': 0, 'variable': 1, 'scale': 1}

        sorts = conds.pop('sorts', None)
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        period = conds.pop('year', None)
        if period is None:
            # ``self.variables`` queries the database, so only evaluate it
            # when the caller did not name any variable.
            variables = conds['variable'] if 'variable' in conds else self.variables
            period = self.period(variables)

        conditions = {}
        for key, value in conds.items():
            # Region keys are translated into ``acode`` conditions below.
            if re.match('region', key) is not None:
                continue
            conditions[key] = {'$in': value} if isinstance(value, list) else value

        if 'region' in conds:
            regions = conds['region']
            # Accept a single year as well as an iterable of years.
            if isinstance(period, str) or not hasattr(period, '__iter__'):
                years = [period]
            else:
                years = period
            # Decide once whether the regions are given as six-digit codes;
            # an empty list or non-string items fall through to name lookup.
            by_code = (
                bool(regions)
                and isinstance(regions[0], str)
                and re.match('^[0-9]{6}$', regions[0]) is not None
            )
            records = []
            for year in years:
                conditions['year'] = year
                # Codes are resolved per year through the AdminData helper.
                self.ad.setYear(year)
                if by_code:
                    conditions['acode'] = {'$in': regions}
                else:
                    conditions['acode'] = {
                        '$in': [region['acode'] for item in regions for region in self.ad[tuple(item)]]
                    }
                records.extend(self.collection.find(conditions, projection).sort(sorts))
            mresult = pd.DataFrame(records)
        else:
            if isinstance(period, list):
                conditions['year'] = {'$in': period}
            else:
                conditions['year'] = period
            mresult = pd.DataFrame(list(self.collection.find(conditions, projection).sort(sorts)))

        # ``pd.DataFrame`` never returns None, so the former ``is None`` test
        # could not fire; an empty frame is what "no result" looks like.
        if mresult.empty:
            return None if toStandardForm else mresult

        mresult = mresult.drop_duplicates(keep='last')

        if toStandardForm:
            rformat = RegionFormat(mresult)
            return rformat.transform()
        return mresult

    @property
    def variables(self):
        """Distinct values of the ``variable`` field in the collection."""
        return self.collection.find().distinct('variable')

    def period(self, variable):
        """Return the sorted years available for the given variable(s).

        Args:
            variable: A variable name, or an iterable of variable names.

        Returns:
            A sorted list of the distinct ``year`` values found.
        """
        if isinstance(variable, str):
            return sorted(self.collection.find({'variable': variable}).distinct('year'))
        years = set()
        for var in variable:
            years.update(self.collection.find({'variable': var}).distinct('year'))
        return sorted(years)
Ejemplo n.º 5
0
 def __init__(self):
     """Set up the database wrapper and connect it to its data source."""
     Database.__init__(self)
     # Connects with the names "regionDB" / "CityStatistics" (presumably
     # database and collection -- confirm against Database._connect).
     self._connect("regionDB", "CityStatistics")
     # Project helper kept on the instance as ``ad``.
     self.ad = AdminData()
Ejemplo n.º 6
0
class CityStatDatabase(Database):
    """Query wrapper for the city-statistics data.

    The connection is opened with the names ``"regionDB"`` /
    ``"CityStatistics"`` (presumably database and collection -- confirm
    against ``Database._connect``).
    """

    def __init__(self):
        """Connect to the data source and create the ``AdminData`` helper."""
        Database.__init__(self)
        self._connect("regionDB", "CityStatistics")
        self.ad = AdminData()

    def find(self, conds, toStandardForm=True):
        """Run a query described by ``conds`` and return the matching rows.

        Args:
            conds: Query description.  Recognised keys:

                * ``projection`` -- projection passed to ``collection.find``;
                  a default selecting region/year/value/acode/variable/scale
                  is used when missing.
                * ``sorts`` -- sort specification; defaults to year, then
                  acode, both ascending.
                * ``year`` -- a year or an iterable of years; when missing,
                  every year available for the requested variables is used.
                * ``region`` -- list of regions, each resolved to ``acode``
                  values through ``self.ad``.
                * any other key not starting with ``region`` -- a filter;
                  list values become ``$in`` conditions.

                The dict is not modified (earlier versions popped keys from
                the caller's dict).
            toStandardForm: When true, the rows are passed through
                ``RegionFormat(...).transform()``.

        Returns:
            The transformed result when ``toStandardForm`` is true, else a
            DataFrame with duplicate rows dropped (last occurrence kept).
        """
        # Shallow copy: the special keys are removed below and the caller's
        # dict must stay intact.
        conds = dict(conds)

        projection = conds.pop("projection", None)
        if projection is None:
            projection = {"region": 1, "year": 1, "value": 1, "acode": 1, "_id": 0, "variable": 1, "scale": 1}

        sorts = conds.pop("sorts", None)
        if sorts is None:
            sorts = [("year", ASCENDING), ("acode", ASCENDING)]

        period = conds.pop("year", None)
        if period is None:
            # ``self.variables`` queries the database, so only evaluate it
            # when the caller did not name any variable.
            variables = conds["variable"] if "variable" in conds else self.variables
            period = self.period(variables)

        conditions = {}
        for key, value in conds.items():
            # Region keys are translated into ``acode`` conditions below.
            if re.match("region", key) is not None:
                continue
            conditions[key] = {"$in": value} if isinstance(value, list) else value

        if "region" in conds:
            regions = conds["region"]
            # Accept a single year as well as an iterable of years.
            if isinstance(period, str) or not hasattr(period, "__iter__"):
                years = [period]
            else:
                years = period
            records = []
            for year in years:
                conditions["year"] = year
                # Codes are resolved per year through the AdminData helper.
                self.ad.setYear(year)
                conditions["acode"] = {
                    "$in": [region["acode"] for item in regions for region in self.ad[tuple(item)]]
                }
                records.extend(self.collection.find(conditions, projection).sort(sorts))
            mresult = pd.DataFrame(records)
        else:
            if isinstance(period, list):
                conditions["year"] = {"$in": period}
            else:
                conditions["year"] = period
            mresult = pd.DataFrame(list(self.collection.find(conditions, projection).sort(sorts)))

        # ``take_last=True`` was the pre-0.17 spelling and is no longer
        # accepted by pandas; ``keep="last"`` is the equivalent.
        mresult = mresult.drop_duplicates(keep="last")

        if toStandardForm:
            rformat = RegionFormat(mresult)
            return rformat.transform()
        return mresult

    @property
    def variables(self):
        """Distinct values of the ``variable`` field in the collection."""
        return self.collection.find().distinct("variable")

    def period(self, variable):
        """Return the sorted years available for the given variable(s).

        Args:
            variable: A variable name, or an iterable of variable names.

        Returns:
            A sorted list of the distinct ``year`` values found.
        """
        if isinstance(variable, str):
            return sorted(self.collection.find({"variable": variable}).distinct("year"))
        years = set()
        for var in variable:
            years.update(self.collection.find({"variable": var}).distinct("year"))
        return sorted(years)