def __init__(self, collection: str = 'CEIC'):
    """Connect to one collection of the ``regionDB`` MongoDB database.

    :param collection: name of the collection to open (default ``'CEIC'``).
    """
    self.collectionname = collection
    # Keep both the Database wrapper and the handle returned by connect().
    database = Database()
    self.db = database
    self.conn = database.connect('regionDB', collection)
def __init__(self):
    """Initialise the Database base and connect to ``regionDB``/``CityStatistics``.

    Also creates an ``AdminData`` helper and stores it as ``self.ad``.
    """
    Database.__init__(self)
    database_name, collection_name = 'regionDB', 'CityStatistics'
    self.connect(database_name, collection_name)
    self.ad = AdminData()
def __init__(self):
    """Initialise the Database base and connect to ``regionDB``/``AdminCode``."""
    Database.__init__(self)
    database_name, collection_name = 'regionDB', 'AdminCode'
    self.connect(database_name, collection_name)
class RegionalData:
    """Export regional data from a MongoDB collection as a pandas DataFrame.

    Attributes:
        collectionname: name of the collection being queried.
        db: the ``Database`` wrapper object.
        conn: the collection handle returned by ``Database.connect``.

    Methods:
        __init__(collection='CEIC'): connect to ``collection`` in ``regionDB``.
        variables(): return the distinct values of the ``variable`` field.
        query(region, year, variable, scale, projection, sorts): return the
            matching records as a DataFrame.

    Demo:
        ad = AdminCode()
        rdata = RegionalData()  # connect to the regional data in MongoDB
        mdata = rdata.query(region=ad[u'浙江',u'f'],year=range(2006,2010),variable=[u'财政支出',u'从业人数_在岗职工'])
        mdata = rdata.query(region=ad[u'浙江',u'杭州'],variable=u'财政支出',year=2012)

        Example of the resulting ``mdata``:
             acode region    value         variable  year
        0   330100   杭州市  27548.0           财政支出  2006
        1   330100   杭州市   1162.4   从业人数_在岗职工  2006
        2   330200   宁波市  29270.0           财政支出  2006
        3   330200   宁波市    887.8   从业人数_在岗职工  2006
    """

    def __init__(self, collection: str = 'CEIC'):
        """Connect to ``collection`` in the ``regionDB`` MongoDB database."""
        self.collectionname = collection
        self.db = Database()
        self.conn = self.db.connect('regionDB', collection)

    def variables(self):
        """Return the distinct values of the ``variable`` field in the collection."""
        posts = self.conn.find()
        return posts.distinct('variable')

    def query(self, region: list = None, year: list = None, variable: list = None,
              scale: str = None,
              projection: dict = {'region': 1, 'year': 1, 'value': 1, 'acode': 1,
                                  '_id': 0, 'variable': 1},
              sorts: list = None) -> pd.DataFrame:
        """Query regional data and return it as a DataFrame.

        Every criterion is optional; only the ones that are not ``None`` are
        added to the MongoDB filter. With no criteria at all the filter is
        empty, so every document in the collection is returned.

        :param region: a region dict or a list of region dicts, each holding
            an ``'acode'`` key; matched with ``$in`` on ``acode``.
        :param year: a single year (int or str) or an indexable sequence of
            years. Only the first and last elements are used, as an inclusive
            ``$gte``/``$lte`` range. For the ``cCity`` collection the years
            are converted to strings before querying.
        :param variable: a variable name or a list of names; matched with
            ``$in`` on ``variable``.
        :param scale: if given, matched for equality on ``scale``.
        :param projection: projection passed unchanged to ``find``. The
            default dict is only read, never modified, and is kept as the
            default so that an explicit ``None`` still reaches ``find``.
        :param sorts: list of ``(field, direction)`` pairs passed to
            ``sort``; defaults to ascending ``year`` then ascending ``acode``.
        :return: ``pd.DataFrame`` built from the matching documents.
        :raises IndexError: if ``year`` is an empty sequence.
        """
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        criteria = {}

        if year is not None:
            # Wrap scalars first so that a string year is not iterated
            # character by character by the str() conversion below.
            if isinstance(year, (int, str)):
                year = [year]
            if re.match('^cCity$', self.collectionname) is not None:
                year = [str(y) for y in year]
            criteria['year'] = {'$gte': year[0], '$lte': year[-1]}

        if variable is not None:
            if isinstance(variable, str):
                variable = [variable]
            criteria['variable'] = {'$in': variable}

        if region is not None:
            if isinstance(region, dict):
                region = [region]
            # Regions are matched through their administrative codes.
            criteria['acode'] = {'$in': [item['acode'] for item in region]}

        if scale is not None:
            criteria['scale'] = scale

        cursor = self.conn.find(criteria, projection).sort(sorts)
        return pd.DataFrame(list(cursor))
def __init__(self, year=None):
    """Initialise the Database base, connect to ``microDB``/``Cgss`` and store the year.

    :param year: any value accepted by ``int()``, or ``None`` (the default)
        to leave ``self.year`` unset as ``None``.
    :raises ValueError: if ``year`` is a string that ``int()`` cannot parse.
    """
    Database.__init__(self)
    self.connect('microDB', 'Cgss')
    # int(None) raises TypeError, which made the declared default unusable.
    self.year = None if year is None else int(year)
def __init__(self, website="http://www.tianqihoubao.com"):
    """Create the site scraper and connect to ``internetDB``/``AQI``.

    :param website: base URL handed to ``SiteScraper``.
    """
    self.site_scraper = SiteScraper(website)
    self.data = list()
    # Keep both the Database wrapper and the handle returned by connect().
    database = Database()
    self.db = database
    self.con = database.connect("internetDB", "AQI")
class AQISite:
    """Scrape air-pollution (AQI) tables from tianqihoubao.com pages into MongoDB."""

    # Patterns are raw strings: the original non-raw literals contained
    # invalid escape sequences such as "\d" and "\.".
    _PAGE_RE = re.compile(r"^(/aqi/)[a-zA-Z]+(-)[0-9]*(\.html)")
    _CELL_SPLIT_RE = re.compile(r'<td>|<tdclass="aqi-lv[0-9]{1}">')
    _INT_RE = re.compile(r"^\d+$")
    _FLOAT_RE = re.compile(r"^\d+(\.)\d*$")

    def __init__(self, website="http://www.tianqihoubao.com"):
        """Create the site scraper and connect to ``internetDB``/``AQI``.

        :param website: base URL handed to ``SiteScraper``.
        """
        self.site_scraper = SiteScraper(website)
        self.data = []
        self.db = Database()
        self.con = self.db.connect("internetDB", "AQI")

    def daily_aqi_data(self, url_pages):
        """Fetch daily AQI tables and insert every data row into the database.

        :param url_pages: iterable of page paths. Only paths matching
            ``/aqi/<letters>-<digits>.html`` are fetched; the rest are skipped.
        :return: None. Each row is stored through :meth:`insertDB`.
        """
        from contextlib import closing

        for page in url_pages:
            if self._PAGE_RE.match(page) is None:
                continue

            # Close the HTTP response as soon as the document is parsed.
            with closing(urlopen("http://www.tianqihoubao.com" + page)) as html:
                bsObj = BeautifulSoup(html, "lxml", from_encoding="gb18030")

            city_title = bsObj.find("h4")
            table_list = bsObj.find("table", {"class": "b"})
            # Skip pages that lack the heading or the data table.
            if city_title is None or table_list is None:
                continue

            # The city name is the heading text before the first run of digits.
            city_name = re.split(r"\d+", city_title.get_text())[0]

            var_name = None  # column names, taken from the first non-empty row
            for child in table_list.children:
                child_str = re.sub(r"\s", "", str(child))
                if not child_str:
                    continue

                if var_name is None:
                    # Header row: column titles are wrapped in <b>...</b>.
                    titles = [cell.split("</b>")[0] for cell in child_str.split("<b>")]
                    var_name = titles[1:]
                    continue

                cells = []
                for td_data in self._CELL_SPLIT_RE.split(child_str):
                    if td_data.startswith(">"):
                        cells.append(td_data.split(">")[1].split("<")[0])
                    else:
                        cells.append(td_data.split("<")[0])

                # The first piece is the markup before the first cell; drop it.
                a_data = dict(zip(var_name, cells[1:]))
                a_data["city"] = city_name
                print(a_data)
                self.insertDB(a_data)

    def insertDB(self, record):
        """Normalise one scraped record and insert it into the collection.

        Dots are removed from keys. String values made only of digits become
        ``int``; strings of the form ``digits.digits`` become ``float``.
        Every other value, including non-string values, is stored unchanged.

        :param record: dict mapping column name to scraped value.
        """
        new_record = {}
        for key, value in record.items():
            key = key.replace(".", "")
            if isinstance(value, str):
                if self._INT_RE.match(value) is not None:
                    value = int(value)
                elif self._FLOAT_RE.match(value) is not None:
                    value = float(value)
            new_record[key] = value
        print(new_record)
        self.con.insert_one(new_record)
# coding=UTF-8
from lib.database.class_Database import Database
from flask import Flask, render_template, request, redirect, url_for, jsonify
from application.DataWarehouse.data.class_regiondata import RegionData
import json

app = Flask(__name__)

# Build the initial data
# Load the CEIC data
db = Database()
con = db.connect('regionDB', 'CEIC')
period = range(1990,2015)

# Build the region data
# Use a context manager so the file handle is closed after loading.
with open('e:/gitwork/application/testweb/region_ceic.txt') as region_file:
    region_list = json.load(region_file)
variables = con.find().distinct('variable')
rdata = RegionData()


@app.route("/")
def index():
    """Render the landing page with the project name and footer text."""
    project_name = u'创数据'
    company_date = u'华东理工大学商学院 2015'
    return render_template('index.html',project_name=project_name,company_date=company_date)


@app.route("/query",methods=['GET', 'POST'])
def query():
    """Handle the query page; on POST, read the submitted form data."""
    if request.method == 'POST':
        form_data = request.form
        #period_chosen = request.form['period']
        #variables_chosen = request.form['variable']
# coding=UTF-8
from lib.data.class_AdminData import AdminData
from lib.database.class_Database import Database
from werkzeug.datastructures import ImmutableMultiDict
import json

year = list(range(1990, 2015))
print(year)

# Demo built on the CEIC data.
# Step 1: load the CEIC data and collect its distinct administrative codes.
db = Database()
con = db.connect("regionDB", "CEIC")
ceic_region_code = sorted(con.find().distinct("acode"))
print(len(ceic_region_code))

# Step 2: look each code up in the administrative-division database,
# keeping the first hit for every code.
admin_data = AdminData()
regions = [admin_data.get_by_acode(acode=code)[0] for code in ceic_region_code]

# Step 3: build the region rows as [acode, parent label, region name].
region_list = []
for region in regions:
    is_top_level = region["adminlevel"] < 3
    if is_top_level:
        parent = u"中国"
    else:
        # Lower levels are labelled with the name of their parent record.
        parent = admin_data.database.collection.find_one({"_id": region["parent"]})
        parent = "/".join(("中国", parent["region"] + "属下"))
    region_list.append([region["acode"], parent, region["region"]])