def search_from_dbase(variables=None, query_dict=None, match='exact'):
    """Match ``variables`` against the variables stored in the database.

    Documents are read from the ``storedvariable`` collection of the
    ``region`` database and turned into an ``origin -> variable`` mapping.
    The ``origin`` names are handed to ``VariableMatcher`` as the candidate
    source, and every matched origin is then replaced by its stored
    ``variable`` value.

    :param variables: variables to look up; forwarded unchanged to the matcher.
    :param query_dict: filter passed to the collection's ``find``.
    :param match: ``'exact'`` selects ``search_for_same_variable``; any other
        value selects ``search_for_similar_variable``.
    :return: the matcher's result table (indexed with ``.index``/``.loc``,
        presumably a pandas DataFrame) whose ``matched_variable`` column holds
        the stored variable name, or ``None`` when the origin is unknown.
        In non-exact mode the originally matched origin is kept in the
        ``matched_middel_variable`` column.
    """
    region_db = MonDatabase(mongodb=MongoDB(), database_name='region')
    stored = MonCollection(database=region_db,
                           collection_name='storedvariable')
    origin_to_variable = {
        doc['origin']: doc['variable'] for doc in stored.find(query_dict)
    }

    exact = match == 'exact'
    if exact:
        matcher = VariableMatcher.search_for_same_variable
    else:
        matcher = VariableMatcher.search_for_similar_variable
    pd_result = matcher(variables=variables,
                        source=origin_to_variable.keys())

    for row in pd_result.index:
        if not exact:
            # Keep the intermediate (origin) match before it is overwritten.
            pd_result.loc[row, 'matched_middel_variable'] = pd_result.loc[
                row, 'matched_variable']
        pd_result.loc[row, 'matched_variable'] = origin_to_variable.get(
            pd_result.loc[row, 'matched_variable'])
    return pd_result
class TestCalculate(unittest.TestCase):
    """Connectivity check for the MongoDB wrapper classes.

    The fixture opens the ``cities`` collection of the ``region`` database
    on a hard-coded server, so the test needs network access to that host.
    """

    def setUp(self):
        """Open the ``cities`` collection used by the test."""
        client = MongoDB(conn_str='mongodb://*****:*****@139.196.189.191:3717/')
        region_db = MonDatabase(mongodb=client, database_name='region')
        self.mcollection = MonCollection(database=region_db,
                                         collection_name='cities')

    def test_connect_mongodb(self):
        # An unfiltered find() is expected to yield exactly two documents.
        documents = list(self.mcollection.find())
        self.assertEqual(2, len(documents))
class TestCalculate(unittest.TestCase):
    """Connectivity check for the MongoDB wrapper classes.

    The fixture opens the ``cities`` collection of the ``region`` database
    on a hard-coded server, so the test needs network access to that host.
    """

    def setUp(self):
        """Open the ``cities`` collection used by the test."""
        self.mcollection = MonCollection(
            database=MonDatabase(
                mongodb=MongoDB(
                    conn_str='mongodb://*****:*****@139.196.189.191:3717/'),
                database_name='region'),
            collection_name='cities')

    def test_connect_mongodb(self):
        # An unfiltered find() is expected to yield exactly two documents.
        found = list(self.mcollection.find())
        self.assertEqual(2, len(found))
class PopCensusDatabase:
    """Represents the population census database.

    Wraps the ``popcensus`` collection of the ``region`` database.
    """

    def __init__(self):
        """Connect to the ``popcensus`` collection."""
        self.collection = MonCollection(
            database=MonDatabase(mongodb=MongoDB(), database_name='region'),
            collection_name='popcensus')

    @property
    def period(self):
        """Sorted list of the distinct ``year`` values in the collection."""
        years = list(self.collection.find().distinct('year'))
        years.sort()
        return years

    @property
    def variables(self):
        """Sorted list of the distinct ``variable`` values in the collection."""
        names = list(self.collection.find().distinct('variable'))
        names.sort()
        return names
class AdminDatabase():
    """Connects to the administrative-division database.

    Wraps the ``admincode`` collection of the ``region`` database and offers
    keyword-based querying plus a few convenience accessors.
    """

    def __init__(self):
        """Connect to the ``admincode`` collection."""
        mongo = MongoDB()
        mdb = MonDatabase(mongodb=mongo, database_name='region')
        self.collection = MonCollection(database=mdb,
                                        collection_name='admincode')

    def find(self, **conds):
        """Query the collection with keyword conditions.

        :param conds: field conditions. A list value is turned into an
            ``$in`` condition; any other value is matched as given. Two
            keywords are reserved and never become part of the filter:
            ``projection`` (fields to return) and ``sorts`` (list of
            ``(field, direction)`` pairs). Passing either as ``None`` is the
            same as omitting it.
        :return: the sorted cursor returned by the underlying collection.
        """
        # pop() with a default removes the reserved keys even when the caller
        # passes them explicitly as None; otherwise they would stay in
        # ``conds`` and end up in the filter as e.g. {'projection': None}.
        projection = conds.pop('projection', None)
        if projection is None:
            projection = {
                'region': 1,
                'version': 1,
                'adminlevel': 1,
                'acode': 1,
                '_id': 1,
                'parent': 1
            }

        sorts = conds.pop('sorts', None)
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        # Build the filter: lists mean "any of these values".
        condition = {
            key: {'$in': value} if isinstance(value, list) else value
            for key, value in conds.items()
        }

        return self.collection.find(condition, projection).sort(sorts)

    @property
    def period(self):
        """Sorted list of the distinct ``year`` values."""
        return sorted(self.find().distinct('year'))

    def version(self, year=None):
        """Return the sorted distinct ``version`` values.

        :param year: when given, only documents whose ``year`` equals
            ``str(year)`` are considered; otherwise all documents are.
        """
        if year is None:
            return sorted(self.find().distinct('version'))
        return sorted(self.find(year=str(year)).distinct('version'))
def __init__(self, region_query=None):
    """Prepare the data source held in ``self.collection``.

    :param region_query: optional object to use as the data source. When it
        is ``None`` the ``admincode`` collection of the ``region`` database
        is opened and its underlying ``.collection`` attribute is used.
    """
    # Set up the query source
    if region_query is None:
        mongo = MongoDB()
        mdb = MonDatabase(mongodb=mongo, database_name='region')
        collection = MonCollection(database=mdb, collection_name='admincode')
        self.collection = collection.collection
    else:
        self.collection = region_query
    # NOTE(review): this assignment overwrites the value chosen above, so
    # ``self.collection`` ends up ``None``. The original nesting of this line
    # could not be confirmed and it may have been meant for a different
    # attribute -- TODO confirm intent.
    self.collection = None
def search_from_dbase(variables=None, query_dict=None, match='exact'):
    """Match ``variables`` against the variables stored in the database.

    Documents are read from the ``storedvariable`` collection of the
    ``region`` database and turned into an ``origin -> variable`` mapping.
    The ``origin`` names are handed to ``VariableMatcher`` as the candidate
    source, and every matched origin is then replaced by its stored
    ``variable`` value.

    :param variables: variables to look up; forwarded unchanged to the matcher.
    :param query_dict: filter passed to the collection's ``find``.
    :param match: ``'exact'`` selects ``search_for_same_variable``; any other
        value selects ``search_for_similar_variable``.
    :return: the matcher's result table (indexed with ``.index``/``.loc``,
        presumably a pandas DataFrame) whose ``matched_variable`` column holds
        the stored variable name, or ``None`` when the origin is unknown.
        In non-exact mode the originally matched origin is kept in the
        ``matched_middel_variable`` column.
    """
    region_db = MonDatabase(mongodb=MongoDB(), database_name='region')
    stored = MonCollection(database=region_db,
                           collection_name='storedvariable')
    origin_to_variable = {
        doc['origin']: doc['variable'] for doc in stored.find(query_dict)
    }

    exact = match == 'exact'
    if exact:
        matcher = VariableMatcher.search_for_same_variable
    else:
        matcher = VariableMatcher.search_for_similar_variable
    pd_result = matcher(variables=variables,
                        source=origin_to_variable.keys())

    for row in pd_result.index:
        if not exact:
            # Keep the intermediate (origin) match before it is overwritten.
            pd_result.loc[row, 'matched_middel_variable'] = pd_result.loc[
                row, 'matched_variable']
        pd_result.loc[row, 'matched_variable'] = origin_to_variable.get(
            pd_result.loc[row, 'matched_variable'])
    return pd_result
class AdminDatabase():
    """Connects to the administrative-division database.

    Wraps the ``admincode`` collection of the ``region`` database and offers
    keyword-based querying plus a few convenience accessors.
    """

    def __init__(self):
        """Connect to the ``admincode`` collection."""
        mongo = MongoDB()
        mdb = MonDatabase(mongodb=mongo, database_name='region')
        self.collection = MonCollection(database=mdb,
                                        collection_name='admincode')

    def find(self, **conds):
        """Query the collection with keyword conditions.

        :param conds: field conditions. A list value is turned into an
            ``$in`` condition; any other value is matched as given. Two
            keywords are reserved and never become part of the filter:
            ``projection`` (fields to return) and ``sorts`` (list of
            ``(field, direction)`` pairs). Passing either as ``None`` is the
            same as omitting it.
        :return: the sorted cursor returned by the underlying collection.
        """
        # pop() with a default removes the reserved keys even when the caller
        # passes them explicitly as None; otherwise they would stay in
        # ``conds`` and end up in the filter as e.g. {'projection': None}.
        projection = conds.pop('projection', None)
        if projection is None:
            projection = {
                'region': 1,
                'version': 1,
                'adminlevel': 1,
                'acode': 1,
                '_id': 1,
                'parent': 1
            }

        sorts = conds.pop('sorts', None)
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        # Build the filter: lists mean "any of these values".
        condition = {
            key: {'$in': value} if isinstance(value, list) else value
            for key, value in conds.items()
        }

        return self.collection.find(condition, projection).sort(sorts)

    @property
    def period(self):
        """Sorted list of the distinct ``year`` values."""
        return sorted(self.find().distinct('year'))

    def version(self, year=None):
        """Return the sorted distinct ``version`` values.

        :param year: when given, only documents whose ``year`` equals
            ``str(year)`` are considered; otherwise all documents are.
        """
        if year is None:
            return sorted(self.find().distinct('version'))
        return sorted(self.find(year=str(year)).distinct('version'))
def __init__(self):
    """Connect to the ``popcensus`` collection of the ``region`` database."""
    self.collection = MonCollection(
        database=MonDatabase(mongodb=MongoDB(), database_name='region'),
        collection_name='popcensus')
:param values: 详见pandas.pivot_table()函数参数说明 :param index: 详见pandas.pivot_table()函数参数说明 :param columns: 详见pandas.pivot_table()函数参数说明 :param dropna: 详见pandas.pivot_table()函数参数说明 :param fill_value: 详见pandas.pivot_table()函数参数说明 :return: 返回转换后的宽格式表格 :rtype: pandas.DataFrame """ result = pd.pivot_table(data=dataframe, values=values, index=index, columns=columns, dropna=dropna,fill_value=fill_value) return result if __name__ == '__main__': mcollection = MonCollection(database=MonDatabase(mongodb=MongoDB(), database_name='region'), collection_name='provincestat') cursor = mcollection.find({'variable':{'$in':['人均地区生产总值','私人控股企业法人单位数','城镇居民消费','城镇单位就业人员平均工资']}}, projection={'_id':0,'variable':1,'value':1,'province':1,'acode':1,'year':1}) #cursor = mcollection.find({'year':'2010', 'variable':{'$in':['人均地区生产总值','私人控股企业法人单位数','城镇居民消费','城镇单位就业人员平均工资']}}, # projection={'_id':0,'variable':1,'value':1,'province':1,'acode':1}) #cursor = mcollection.find({'variable':'人均地区生产总值','acode':'110000'}, # projection={'_id':0,'variable':1,'value':1,'province':1,'acode':1,'year':1}) mongoconverter = MongoDBToPandasFormat(cursor) # Test first #result = mongoconverter(values='value', index=['year'], columns='variable',dropna=True) result = mongoconverter(values='value', index=['acode','year'], columns='variable',dropna=True) #result = mongoconverter(values='value', index=['acode','year'], columns='variable', # dropna=False, balanced=True) print(result) #result.to_excel('e:/backup/result.xlsx')
def __init__(self):
    """Connect to the ``admindivision`` collection of the ``region`` database."""
    self.collection = MonCollection(
        database=MonDatabase(mongodb=MongoDB(), database_name='region'),
        collection_name='admindivision')
return ref_regions_dict @property def matched_region(self): return self._result if __name__ == '__main__': pop_year = '2010' pop_region_file_2010 = r'E:\data\popcensus\origin\var_temp.xls' raw_region_2010 = Excel(pop_region_file_2010).read() to_be_matched = [re.sub('\s+','',item[0]) for item in raw_region_2010 if re.match('^\s*$',item[0]) is None] pd_to_be_matched = pd.DataFrame(to_be_matched,columns=['region']) pd_to_be_matched['rid'] = range(pd_to_be_matched.shape[0]) collection = MonCollection(database=MonDatabase(mongodb=MongoDB(), database_name='region'), collection_name='admincode') found = collection.collection.find(filter={'year':'2010'}, projection={'acode':True,'region':True,'_id':True}, sort=[('acode',1)]) pd_to_be_compared = pd.DataFrame(list(found)) pd_to_be_compared['cid'] = range(pd_to_be_compared.shape[0]) #pd_to_be_compared['_id'] = pd_to_be_compared['_id'].apply(str) print(pd_to_be_matched,pd_to_be_compared) algo = RegionMatchingOrderAlgorithm(pd_to_be_matched,pd_to_be_compared) # 首先是寻找可靠的匹配作为锚点 algo.find_anchor() # 其次进行顺序的严格匹配 algo.exactly_matching_from_region_set() print(algo.matched_region)
# coding = UTF-8 from libs.imexport.class_mongodb import MongoDB, MonDatabase, MonCollection # 0. 连接数据库 collection_variable = MonCollection(database=MonDatabase( mongodb=MongoDB(), database_name='variable'), collection_name='referencevariable') # 1. 参数设置 CEIC_VARIABLE = False CHINASTAT_VARIABLE = True # 2. 导入CEIC变量 if CEIC_VARIABLE: ceic_collection = MonCollection(database=MonDatabase( mongodb=MongoDB(), database_name='region'), collection_name='ceic') refer_variables = ceic_collection.collection.find().distinct('variable') source = 'CEIC' # 3. 导入中国统计年鉴变量 if CHINASTAT_VARIABLE: Chinastat_collection = MonCollection(database=MonDatabase( mongodb=MongoDB(), database_name='region'), collection_name='provincestat') refer_variables = Chinastat_collection.collection.find().distinct( 'variable') source = '中国统计年鉴' if isinstance(refer_variables, list):
def __init__(self):
    """Connect to the ``admincode`` collection of the ``region`` database."""
    self.collection = MonCollection(
        database=MonDatabase(mongodb=MongoDB(), database_name='region'),
        collection_name='admincode')
def setUp(self):
    """Open the ``cities`` collection of the ``region`` database for the tests."""
    self.mcollection = MonCollection(
        database=MonDatabase(
            mongodb=MongoDB(
                conn_str='mongodb://*****:*****@139.196.189.191:3717/'),
            database_name='region'),
        collection_name='cities')
:return: 返回转换后的宽格式表格 :rtype: pandas.DataFrame """ result = pd.pivot_table(data=dataframe, values=values, index=index, columns=columns, dropna=dropna, fill_value=fill_value) return result if __name__ == '__main__': mcollection = MonCollection(database=MonDatabase(mongodb=MongoDB(), database_name='region'), collection_name='provincestat') cursor = mcollection.find( { 'variable': { '$in': ['人均地区生产总值', '私人控股企业法人单位数', '城镇居民消费', '城镇单位就业人员平均工资'] } }, projection={ '_id': 0, 'variable': 1, 'value': 1, 'province': 1, 'acode': 1, 'year': 1 })
# coding = UTF-8 from libs.datasheet.class_sheetanalyst import SheetAnalyst from libs.datasheet.class_DataSheet import DataSheet from os import path, listdir import pandas as pd from libs.imexport.class_mongodb import MongoDB, MonDatabase, MonCollection file_path = r'E:\data\popcensus\origin' print(listdir(file_path)) mongo = MongoDB() mdb = MonDatabase(mongodb=mongo, database_name='region') collection_variable = MonCollection(database=mdb, collection_name='storedvariable') for i in range(1, 9): variable_file = path.join( file_path, ''.join(['popcensus_2000_variable', str(i), '.xls'])) rdata = pd.read_excel(io=variable_file, type=1, sheet='Sheet1') print(rdata) variable_dict = {} for ind in rdata.index: record = { 'origin': rdata.loc[ind, 'origin_var'], 'variable': rdata.loc[ind, 'matched_var'], 'source': '中国人口普查', 'year': '2000' }
return pd_result @staticmethod def fuzzy_variable_matching(variable, compared, error='auto'): if re.match('^auto$', error) is not None: error = max(1, int(len(variable) * 0.6)) return regex.fullmatch('(?:%s){e<=%s}' % (variable, str(error)), compared) if __name__ == '__main__': mongo = MongoDB() mdb = MonDatabase(mongodb=mongo, database_name='region') collection_province = MonCollection(database=mdb, collection_name='popcensus') collection_variable = MonCollection(database=mdb, collection_name='storedvariable') ''' var = set() for v in sorted(collection_province.find().distinct('variable')): var.add(re.sub('\s+','',v)) print(v,len(v)) print(len(var)) print(len(collection_province.find().distinct('variable')))''' #for v in collection_province.find().distinct('variable'): # collection_variable.collection.insert_one({'variable':v,'source':'中国人口普查'}) #vars = [item['variable'] for item in collection_variable.find()] #x = ['总人口_合计', '总人口_男', '总人口_女', '总人口_性别比', '户籍人口', '少数民族人口比重_', '非农业户口人口比重_', '城乡人口_城镇', '城乡人口_乡村', '家庭户_户数', '家庭户_人口数', '家庭户_户规模', '家庭户_其中:一人户', '家庭户类别_一代户', '家庭户类别_二代户', '家庭户类别_三代户', '家庭户类别_四代以上户']
def setUp(self):
    """Open the ``cities`` collection of the ``region`` database for the tests."""
    client = MongoDB(conn_str='mongodb://*****:*****@139.196.189.191:3717/')
    region_db = MonDatabase(mongodb=client, database_name='region')
    self.mcollection = MonCollection(database=region_db,
                                     collection_name='cities')