Example #1
0
 def __init__(self,collection:str='CEIC'):
     """Connect to one collection of the 'regionDB' database.

     :param collection: name of the collection to open; also kept on the
         instance as ``collectionname``.
     """
     self.collectionname = collection
     database = Database()
     self.db = database
     # Keep whatever connect() returns as the handle used for later queries.
     self.conn = database.connect('regionDB',collection)
Example #2
0
 def __init__(self):
     """Initialise the Database base part, connect, and create the AdminData helper."""
     Database.__init__(self)
     database_name, collection_name = 'regionDB', 'CityStatistics'
     self.connect(database_name, collection_name)
     admin_data = AdminData()
     self.ad = admin_data
Example #3
0
 def __init__(self):
     """Initialise the Database base part and connect to the 'AdminCode' collection."""
     Database.__init__(self)
     database_name, collection_name = 'regionDB', 'AdminCode'
     self.connect(database_name, collection_name)
Example #4
0
class RegionalData:
    '''
    Export regional data from a MongoDB collection.

    Attributes:
    self.collectionname: name of the collection given to the constructor
    self.db: Database object used to open the connection
    self.conn: handle returned by Database.connect('regionDB', collection);
               it is queried through find()

    Methods:
    __init__(self,collection:str='CEIC'): connect to the given collection.
    variables(self): return the distinct values of the 'variable' field.
    query(self,region=None,year=None,variable=None,scale=None,projection=...,sorts=None)->pd.DataFrame:
        return the records that match the given filters.

    Demo:
    ad = AdminCode()
    rdata = RegionalData()    # connect to the regional data in MongoDB
    mdata = rdata.query(region=ad[u'浙江',u'f'],year=range(2006,2010),variable=[u'财政支出',u'从业人数_在岗职工'])
    mdata = rdata.query(region=ad[u'浙江',u'杭州'],variable=u'财政支出',year=2012)

    Result held by mdata:
             acode region    value   variable  year
    0   330100    杭州市  27548.0       财政支出  2006
    1   330100    杭州市   1162.4  从业人数_在岗职工  2006
    2   330200    宁波市  29270.0       财政支出  2006
    3   330200    宁波市    887.8  从业人数_在岗职工  2006

    '''
    def __init__(self,collection:str='CEIC'):
        """Connect to ``collection`` inside the 'regionDB' database."""
        self.collectionname = collection
        self.db = Database()
        self.conn = self.db.connect('regionDB',collection)

    def variables(self):
        """Return the distinct values stored under the 'variable' field."""
        posts = self.conn.find()
        return posts.distinct('variable')

    def _normalize_year(self,year):
        """Return ``year`` as an indexable sequence suited to this collection.

        A single int or str becomes a one-element list.  On the 'cCity'
        collection every year is converted to ``str``; on other collections
        the values are left as given.
        """
        single = isinstance(year,(int,str))
        if re.match('^cCity$',self.collectionname) is not None:
            # A lone string must not be iterated character by character.
            return [str(year)] if single else [str(y) for y in year]
        return [year] if single else year

    def query(self,region:list=None,year:list=None,variable:list=None,scale:str=None,projection:dict={'region':1,'year':1,'value':1,'acode':1,'_id':0,'variable':1},sorts:list=None)->pd.DataFrame:
        """Query the collection and return the matching records.

        Every argument left as ``None`` adds no condition, so any
        combination of filters (including none at all) is accepted.

        :param region: one region dict or a list of them; each must carry an
            'acode' key, and records are matched on those codes.
        :param year: a single year or an indexable sequence of years; records
            are matched on the inclusive range from its first to its last
            element.
        :param variable: one variable name or a list of names.
        :param scale: exact value required in the 'scale' field.
        :param projection: field projection handed to ``find``.
        :param sorts: sort specification handed to ``sort``; ``None`` means
            ascending by 'year', then by 'acode'.
        :return: the matching records as a ``pd.DataFrame``.
        """
        if sorts is None:
            sorts = [('year',ASCENDING),('acode',ASCENDING)]

        # Build the filter step by step instead of enumerating every
        # combination of supplied arguments.
        conditions = {}
        if year is not None:
            year = self._normalize_year(year)
            conditions['year'] = {'$gte':year[0],'$lte':year[-1]}
        if variable is not None:
            if isinstance(variable,str):
                variable = [variable]
            conditions['variable'] = {'$in':variable}
        if region is not None:
            if isinstance(region,dict):
                region = [region]
            conditions['acode'] = {'$in':[item['acode'] for item in region]}
        if scale is not None:
            conditions['scale'] = scale

        cursor = self.conn.find(conditions,projection).sort(sorts)
        return pd.DataFrame(list(cursor))
Example #5
0
 def __init__(self, year=None):
     """Initialise the Database base part and connect to 'microDB'/'Cgss'.

     :param year: year to remember on the instance; converted with ``int``.
         ``None`` (the default) is stored as ``None``.
     """
     Database.__init__(self)
     self.connect('microDB', 'Cgss')
     # int(None) raises TypeError, which made the default argument unusable.
     self.year = int(year) if year is not None else None
Example #6
0
 def __init__(self, website="http://www.tianqihoubao.com"):
     """Create the site scraper and connect to the 'AQI' collection of 'internetDB'.

     :param website: address handed to ``SiteScraper``.
     """
     self.site_scraper = SiteScraper(website)
     self.data = list()
     database = Database()
     self.db = database
     self.con = database.connect("internetDB", "AQI")
Example #7
0
class AQISite:
    """Scrape daily air-quality (AQI) tables from a website and store them.

    Every parsed table row is inserted as one document through the handle
    returned by ``Database().connect("internetDB", "AQI")``.
    """

    # Raw strings keep "\." / "\d" / "\s" from being treated as (invalid)
    # string escapes; compiling once avoids re-parsing inside the loops.
    _PAGE_PATTERN = re.compile(r"^(/aqi/)[a-zA-Z]+(-)[0-9]*(\.html)")
    _DIGITS_PATTERN = re.compile(r"\d+")
    _WHITESPACE_PATTERN = re.compile(r"\s")
    # Whitespace is stripped before matching, hence "<tdclass" without a space.
    _CELL_PATTERN = re.compile('<td>|<tdclass="aqi-lv[0-9]{1}">')
    _INT_PATTERN = re.compile(r"^\d+$")
    _FLOAT_PATTERN = re.compile(r"^\d+(\.)\d*$")

    def __init__(self, website="http://www.tianqihoubao.com"):
        """Create the site scraper and open the target collection.

        :param str website: base address of the site; it is handed to
            ``SiteScraper`` and used as the prefix of every page URL.
        """
        # Drop a trailing "/" because page paths already start with one.
        self.website = website.rstrip("/")
        self.site_scraper = SiteScraper(website)
        self.data = []
        self.db = Database()
        self.con = self.db.connect("internetDB", "AQI")

    def daily_aqi_data(self, url_pages):
        """Fetch each daily AQI page and store one record per table row.

        Paths that do not look like ``/aqi/<letters>-<digits>.html`` are
        ignored, as are pages that lack the ``<h4>`` title or the data table.

        :param url_pages: iterable of site-relative page paths.
        :return: None
        """
        for page in url_pages:
            if self._PAGE_PATTERN.match(page) is None:
                continue

            html = urlopen(self.website + page)
            try:
                bsObj = BeautifulSoup(html, "lxml", from_encoding="gb18030")
            finally:
                # The response is fully parsed at this point; release it.
                html.close()

            city_title = bsObj.find("h4")
            table_list = bsObj.find("table", {"class": "b"})
            if city_title is None or table_list is None:
                # One page with an unexpected layout must not stop the crawl.
                continue

            # The city name is the title text before its first run of digits.
            city_name = self._DIGITS_PATTERN.split(city_title.get_text())[0]

            # Column names come from the first non-empty child of the table.
            var_name = None
            for child in table_list.children:
                child_str = self._WHITESPACE_PATTERN.sub("", str(child))
                if len(child_str) < 1:
                    continue

                if var_name is None:
                    # Header row: each name is the text between <b> and </b>.
                    header_parts = child_str.split("<b>")
                    var_name = [part.split("</b>")[0] for part in header_parts][1:]
                    continue

                aqi_data = []
                for td_data in self._CELL_PATTERN.split(child_str):
                    if td_data.startswith(">"):
                        # Skip past the leading ">" before reading the text.
                        aqi_single_data = td_data.split(">")[1].split("<")[0]
                    else:
                        aqi_single_data = td_data.split("<")[0]
                    aqi_data.append(aqi_single_data)

                # The first chunk precedes the first cell, so it is dropped.
                a_data = dict(zip(var_name, aqi_data[1:]))
                a_data["city"] = city_name
                print(a_data)
                self.insertDB(a_data)

    def insertDB(self, record):
        """Normalise one scraped record and insert it into the collection.

        Dots are removed from keys.  Values made only of digits become
        ``int``, values of the form ``digits.digits`` become ``float``, and
        values that are already numeric are stored unchanged.

        :param dict record: mapping of column name to scraped value.
        """
        new_record = dict()
        for key, value in record.items():
            key = key.replace(".", "")
            if isinstance(value, (int, float)):
                # Already numeric; the regex checks below need text.
                new_record[key] = value
            elif self._INT_PATTERN.match(value) is not None:
                new_record[key] = int(value)
            elif self._FLOAT_PATTERN.match(value) is not None:
                new_record[key] = float(value)
            else:
                new_record[key] = value
        print(new_record)
        self.con.insert_one(new_record)
Example #8
0
# coding=UTF-8

from lib.database.class_Database import Database
from flask import Flask, render_template, request, redirect, url_for, jsonify
from application.DataWarehouse.data.class_regiondata import RegionData
import json

app = Flask(__name__)

# Build the initial data
# Load the CEIC data
db = Database()
con = db.connect('regionDB', 'CEIC')
period = range(1990,2015)
# Build the region data; the context manager closes the file once it is parsed
with open('e:/gitwork/application/testweb/region_ceic.txt') as _region_file:
    region_list = json.load(_region_file)
# Distinct values of the 'variable' field in the CEIC collection
variables = con.find().distinct('variable')
rdata = RegionData()

@app.route("/")
def index():
    """Render 'index.html' with the project name and the footer text."""
    return render_template(
        'index.html',
        project_name=u'创数据',
        company_date=u'华东理工大学商学院 2015',
    )

@app.route("/query",methods=['GET', 'POST'])
def query():
    # Handler for "/query", registered for both GET and POST requests.
    # NOTE(review): only the start of the POST branch is visible in this
    # excerpt; the rest of the handler is presumably cut off — confirm
    # against the full file.
    if request.method == 'POST':
        # Fields submitted with the form.
        form_data = request.form
        #period_chosen = request.form['period']
        #variables_chosen = request.form['variable']
Example #9
0
# coding=UTF-8

from lib.data.class_AdminData import AdminData
from lib.database.class_Database import Database
from werkzeug.datastructures import ImmutableMultiDict
import json

# Years 1990 through 2014.
year = list(range(1990, 2015))
print(year)

# Demo built on the CEIC data
# 1. Load the CEIC data: connect and collect the distinct 'acode' values, sorted
db = Database()
con = db.connect("regionDB", "CEIC")
ceic_region_code = sorted(con.find().distinct("acode"))
print(len(ceic_region_code))

# 2. Look each code up in the administrative-division data, keeping the first match
admin_data = AdminData()
regions = [admin_data.get_by_acode(acode=acode)[0] for acode in ceic_region_code]

# 3. Build the region rows: [acode, parent label, region name]
region_list = []
for region in regions:
    # The first element of each row is the administrative division code
    if region["adminlevel"] < 3:
        # Admin levels below 3 are listed directly under the country label.
        parent = u"中国"
    else:
        # Otherwise fetch the parent record by _id and build the label from its name.
        parent = admin_data.database.collection.find_one({"_id": region["parent"]})
        parent = "/".join(["中国", "".join([parent["region"], "属下"])])
    region_list.append([region["acode"], parent, region["region"]])