Beispiel #1
0
class CEICInfoUpdater:
    """Publish lookup lists derived from the CEIC collection to Redis.

    Calling an instance reads the distinct years, variables and area codes
    from the Mongo collection, resolves one region name per area code and
    stores the four lists under the ``ceic_*`` keys.
    """

    def __init__(self, mongo=None, redis=None):
        """Store the data source and the cache client.

        :param mongo: collection wrapper to read from; when ``None`` a
            ``MonCollection`` on database ``region`` / collection ``CEIC``
            is created.
        :param redis: client exposing ``set(key, value)``; when ``None`` a
            default ``Redis()`` is created.
        """
        # The defaults are only constructed when nothing was injected.
        self._mongo = mongo if mongo is not None else MonCollection(
            mongodb=MongoDB(),
            database='region',
            collection_name='CEIC')
        self._redis = redis if redis is not None else Redis()

    def __call__(self):
        """Recompute the four lookup lists and write them to Redis."""
        periods = sorted(self._mongo.distinct('year'))
        variables = sorted(self._mongo.distinct('variable'))
        acodes = sorted(self._mongo.distinct('acode'))

        # One lookup per area code; the result stays aligned with `acodes`.
        only_region = {'_id': False, 'region': True}
        regions = []
        for acode in acodes:
            document = self._mongo.collection.find_one({'acode': acode},
                                                       projection=only_region)
            regions.append(document['region'])

        # Write order is kept: acode, period, variable, region.
        payload = (
            ('ceic_acode', acodes),
            ('ceic_period', periods),
            ('ceic_variable', variables),
            ('ceic_region', regions),
        )
        for key, value in payload:
            self._redis.set(key, value)
Beispiel #2
0
class AirQualityUpdater:
    """Publish lookup lists derived from the air-quality collection to Redis.

    Calling an instance stores the city names, the city codes and the
    formatted observation dates under the ``airquality_*`` keys.
    """

    def __init__(self, mongo=None, redis=None):
        """Store the data source and the cache client.

        :param mongo: collection wrapper to read from; when ``None`` a
            ``MonCollection`` on database ``scraperdata`` / collection
            ``airqualityfromMin`` is created.
        :param redis: client exposing ``set(key, value)``; when ``None`` a
            default ``Redis()`` is created.
        """
        self._mongo = mongo if mongo is not None else MonCollection(
            mongodb=MongoDB(),
            database='scraperdata',
            collection_name='airqualityfromMin')
        self._redis = redis if redis is not None else Redis()

    def __call__(self):
        """Recompute the three lookup lists and write them to Redis."""
        city_codes = sorted(self._mongo.distinct('CITYCODE'))

        # One lookup per city code; the names stay aligned with `city_codes`.
        only_city = {'_id': False, 'CITY': True}
        city_names = []
        for code in city_codes:
            document = self._mongo.collection.find_one({'CITYCODE': code},
                                                       projection=only_city)
            city_names.append(document['CITY'])

        # NOTE(review): "%y" yields a two-digit year (e.g. "17-08-20");
        # confirm this is what consumers of `airquality_date` expect.
        operation_dates = sorted(self._mongo.distinct('OPER_DATE'))
        formatted_dates = [day.strftime("%y-%m-%d") for day in operation_dates]

        # Write order is kept: city, citycode, date.
        self._redis.set('airquality_city', city_names)
        self._redis.set('airquality_citycode', city_codes)
        self._redis.set('airquality_date', formatted_dates)
Beispiel #3
0
def backup(source, target):
    """Copy missing documents for every database in ``backup_databases``.

    For each collection of each listed database, the collection is created
    on the target when it does not exist yet, and documents are copied when
    the source holds more documents than the target. Collections created
    during this call are populated in the same call.

    :param source: connection wrapper (exposes ``database_names``) for the
        server that is copied from.
    :param target: connection wrapper for the server that is copied to.
    :raises Exception: if a listed database is missing on either side; the
        message names the database.
    """
    for database in backup_databases:
        if database not in source.database_names:
            message = 'No Database named {} remotely!'.format(database)
            print(message)
            raise Exception(message)

        if database not in target.database_names:
            message = 'No Database name {} locally!'.format(database)
            print(message)
            raise Exception(message)

        source_database = MonDatabase(source, database_name=database)
        target_database = MonDatabase(target, database_name=database)

        for collection in source_database.collection_names:
            if collection not in target_database.collection_names:
                # Fall through after creating, so the new collection is
                # filled right away instead of waiting for the next run.
                target_database.create_collection(collection)

            source_collection = MonCollection(source, source_database,
                                              collection)
            target_collection = MonCollection(target, target_database,
                                              collection)

            source_count = source_collection.collection.count()
            target_count = target_collection.collection.count()
            if source_count <= target_count:
                print('No need to update the collection: {}'.format(
                    collection))
                continue

            print('updating...{}'.format(collection))
            if target_count == 0:
                # Empty target: stream the whole source cursor across.
                target_collection.collection.insert_many(
                    source_collection.collection.find())
                continue

            # Partly filled target: re-inserting documents that already exist
            # would abort on the first duplicate `_id`, so copy only the
            # documents whose `_id` is missing.
            # Assumes `_id` values are hashable (e.g. ObjectId, str, int).
            existing_ids = {
                document['_id']
                for document in target_collection.collection.find(
                    {}, {'_id': True})
            }
            missing = [
                document
                for document in source_collection.collection.find()
                if document['_id'] not in existing_ids
            ]
            if missing:  # insert_many rejects an empty document list
                target_collection.collection.insert_many(missing)
Beispiel #4
0
 def __init__(self,
              mongodb='mongodb://*****:*****@123.207.185.126:27017/',
              database='proxy',
              collection_name='proxys'):
     """Open the collection wrapper that backs this object.

     :param mongodb: value passed to ``MonCollection`` as ``mongodb``.
     :param database: database name.
     :param collection_name: collection name.
     """
     # NOTE(review): the default connection string embeds credentials in
     # source; consider supplying it from configuration instead.
     settings = dict(mongodb=mongodb,
                     database=database,
                     collection_name=collection_name)
     self._conn = MonCollection(**settings)
Beispiel #5
0
    def __init__(self):
        """Initialise the interface to the China city statistics database.

        Stores the ``collection`` attribute of the ``citystatistics``
        collection wrapper (database ``region``) on ``self.conn``.
        """
        client = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        wrapper = MonCollection(client,
                                database='region',
                                collection_name='citystatistics')
        self.conn = wrapper.collection
Beispiel #6
0
    def __init__(self, mongo=None, redis=None):
        """Store the data source and the cache client.

        :param mongo: collection wrapper to use; when ``None`` a
            ``MonCollection`` on ``scraperdata`` / ``airqualityfromMin`` is
            created.
        :param redis: cache client to use; when ``None`` a default
            ``Redis()`` is created.
        """
        # The defaults are only constructed when nothing was injected.
        self._mongo = mongo if mongo is not None else MonCollection(
            mongodb=MongoDB(),
            database='scraperdata',
            collection_name='airqualityfromMin')
        self._redis = redis if redis is not None else Redis()
Beispiel #7
0
    def __init__(self, mongo=None, redis=None):
        """Store the data source and the cache client.

        :param mongo: collection wrapper to use; when ``None`` a
            ``MonCollection`` on ``region`` / ``CEIC`` is created.
        :param redis: cache client to use; when ``None`` a default
            ``Redis()`` is created.
        """
        # The defaults are only constructed when nothing was injected.
        self._mongo = mongo if mongo is not None else MonCollection(
            mongodb=MongoDB(),
            database='region',
            collection_name='CEIC')
        self._redis = redis if redis is not None else Redis()
Beispiel #8
0
def backup(source,target):
    """Copy every collection of the databases in ``backup_databases``.

    Each listed database must exist on both sides. Collections missing on
    the target are created, then every record returned by the source
    collection's ``find()`` is inserted into the target one at a time.

    :param source: connection wrapper for the server that is copied from.
    :param target: connection wrapper for the server that is copied to.
    :raises Exception: if a listed database is missing on either side.
    """
    # NOTE(review): every run inserts all source records again; if records
    # carry their `_id`, a second run presumably fails with a duplicate-key
    # error. Confirm against the wrapper's `find()`.
    for database in backup_databases:
        # The source is checked first, then the target, as before.
        presence_checks = (
            (source, 'No Database named {} remotely!'),
            (target, 'No Database name {} locally!'),
        )
        for server, template in presence_checks:
            if database not in server.database_names:
                print(template.format(database))
                raise Exception

        origin_db = MonDatabase(source, database_name=database)
        destination_db = MonDatabase(target, database_name=database)

        for name in origin_db.collection_names:
            if name not in destination_db.collection_names:
                destination_db.create_collection(name)

            reader = MonCollection(source, origin_db, name)
            writer = MonCollection(target, destination_db, name)

            for record in reader.find():
                writer.collection.insert_one(record)
Beispiel #9
0
class CityStatisticsDatabase:
    """Read access to the ``citystatistics`` collection of database ``region``."""

    def __init__(self):
        """Initialise the interface to the China city statistics database."""
        client = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        wrapper = MonCollection(client,
                                database='region',
                                collection_name='citystatistics')
        self.conn = wrapper.collection

    def find(self, *args, **kwargs):
        """Run a query and return the result as a wide table.

        :param args: positional arguments forwarded to ``self.conn.find``.
        :param kwargs: keyword arguments forwarded to ``self.conn.find``.
        :return: a DataFrame indexed by (acode, year, region) with one
            column per ``variable(unit)`` label, or ``None`` when the query
            matches nothing.
        """
        records = list(self.conn.find(*args, **kwargs))
        if not records:
            return None

        frame = pd.DataFrame(records)
        # Column label: the variable name followed by its unit in parentheses.
        frame['var'] = frame['variable'] + frame['unit'].apply(
            lambda unit: '(' + unit + ')')
        table = pd.pivot_table(frame,
                               values='value',
                               index=['year', 'acode', 'region'],
                               columns=['var'])
        # Put the area code before the year in the row index.
        return table.swaplevel(0, 1, axis=0)

    @property
    def variables(self):
        """Sorted distinct ``variable`` values as a one-column DataFrame."""
        names = sorted(self.conn.find().distinct('variable'))
        return pd.DataFrame(names)

    @property
    def regions(self):
        """Placeholder; currently always ``None``."""
        return None
class AdminDivisionDatabase():
    """Query interface to the ``admindivision`` collection of database ``region``."""

    def __init__(self):
        """Connect to the ``admindivision`` collection."""
        mongo = MongoDB()
        mdb = MonDatabase(mongodb=mongo, database_name='region')
        self.collection = MonCollection(database=mdb,
                                        collection_name='admindivision')

    def find(self, **conds):
        """Query the collection with keyword conditions.

        ``projection`` and ``sorts`` are reserved keywords; every other
        keyword becomes a field condition. A list value is matched with
        ``$in``, any other value by equality.

        :param conds: field conditions plus the optional ``projection``
            (mapping passed to ``find``) and ``sorts`` (list of
            ``(field, direction)`` pairs passed to ``sort``).
        :return: whatever ``self.collection.find(...).sort(...)`` returns.
        """
        # Always remove the reserved keywords, even when they were passed
        # explicitly as None; otherwise they would end up in the filter as
        # field conditions named "projection" / "sorts".
        projection = conds.pop('projection', None)
        if projection is None:
            projection = {
                'region': 1,
                'year': 1,
                'adminlevel': 1,
                'acode': 1,
                '_id': 1,
                'parent': 1,
                'uid': 1
            }

        sorts = conds.pop('sorts', None)
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        # Build the filter: lists mean "any of these values".
        condition = {
            key: {'$in': value} if isinstance(value, list) else value
            for key, value in conds.items()
        }

        return self.collection.find(condition, projection).sort(sorts)

    @property
    def period(self):
        """Sorted distinct ``year`` values of the default query."""
        return sorted(self.find().distinct('year'))
Beispiel #11
0
import pandas as pd
from sheldon.database.class_mongodb import MongoDB, MonDatabase, MonCollection

# 0. Initialization
# Spreadsheet path for the (currently disabled) export at the bottom.
OUTPUT_PATH = r'D:\data\output\mongodb_info.xlsx'

# 1. Connect to the database server
mongo = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')

# 2. Build one [database, collection, count] row per collection; a database
#    without collections contributes a single row with None placeholders.
database_info = []
for database_name in sorted(mongo.database_names):
    database_con = MonDatabase(mongo, database_name=database_name)
    if len(database_con.collection_names) == 0:
        database_info.append([database_name, None, None])
        continue
    for collection_name in sorted(database_con.collection_names):
        conn = MonCollection(mongo,
                             database=database_name,
                             collection_name=collection_name)
        document_count = conn.collection.count()
        database_info.append(
            [database_name, collection_name, document_count])

database_info_pdrame = pd.DataFrame(
    database_info, columns=['database', 'collection', 'count'])
# Export disabled:
#database_info_pdrame.to_excel(OUTPUT_PATH)







 def __init__(self):
     """Connect to the ``admindivision`` collection of database ``region``."""
     region_db = MonDatabase(mongodb=MongoDB(), database_name='region')
     self.collection = MonCollection(database=region_db,
                                     collection_name='admindivision')
Beispiel #13
0
# coding = UTF-8

import os
import numpy as np
import pandas as pd
import re
from sheldon.database.class_mongodb import MongoDB, MonDatabase, MonCollection

# 0. Initialize
# Directory whose files are listed and read with pandas by the loop below.
PATH = r'D:\data\cnki\forimport'

# 1. connect to db
# `con` is the `.collection` attribute of the wrapper around the `cnki`
# collection of the `papers` database.
# NOTE(review): the connection string embeds credentials in source; consider
# loading it from configuration.
mongo = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')
mdb = MonDatabase(mongodb=mongo, database_name='papers')
con = MonCollection(mongo,mdb,collection_name='cnki').collection

for file in os.listdir(PATH):
    file_name = os.path.join(PATH,file)
    mdata = pd.read_excel(file_name)
    records = mdata.to_dict(orient='index')
    for num in records:
        db_record = dict()
        # 标题
        db_record['title'] = records[num]['Title']
        # 作者
        if isinstance(records[num]['Author'],str):
            db_record['author'] = [author for author in  re.split(';',records[num]['Author']) if re.match('^\s*$',author) is None]
        else:
            continue
        # 地址
        if isinstance(records[num]['Author Address'],str):
Beispiel #14
0
import aiohttp
import asyncio
import async_timeout
from bs4 import BeautifulSoup
import json
from sheldon.webscraper.class_proxymanager import ProxyManager
from sheldon.database.class_mongodb import MongoDB, MonDatabase, MonCollection

# Module-level handle: the `.collection` attribute of the wrapper around the
# `airqualityfromMin` collection of the `scraperdata` database.
# NOTE(review): the connection string embeds credentials in source; consider
# loading it from configuration.
conn = MonCollection(
    mongodb='mongodb://*****:*****@123.207.185.126:27017/',
    database='scraperdata',
    collection_name='airqualityfromMin').collection


async def get_web(session, url):
    """Fetch *url* with a GET request and return the response body as text.

    The request and the body read run under a 10-second
    ``async_timeout.timeout`` block.

    :param session: object whose ``get(url)`` is an async context manager
        yielding a response with an awaitable ``text()``.
    :param url: address to request.
    :return: the response text.
    """
    with async_timeout.timeout(10):
        async with session.get(url) as response:
            body = await response.text()
    return body


async def post_web(session, url, pagenum):
    try:
        pdata = {
            'page.pageNo': pagenum,
            'xmlname': '1462259560614',
            'V_DATE': '2014-01-01',
            'E_DATE': '2017-08-20'
        }
        proxy = ProxyManager().random_proxy
        print('Start Proxy... ', proxy)
        async with session.post(url,
Beispiel #15
0
# coding = UTF-8

import pandas as pd
from sheldon.database.class_mongodb import MongoDB, MonDatabase, MonCollection

# 1. Connect to the database
# NOTE(review): the connection string embeds credentials in source; consider
# loading it from configuration.
mongodb = 'mongodb://*****:*****@123.207.185.126:27017/'
database = 'papers'
collection_name = 'csscijournals'
conn = MonCollection(mongodb=mongodb,
                     database=database,
                     collection_name=collection_name)

# 2. Import the journal spreadsheet, one document per row
journals = pd.read_excel('d:/data/cnki/economic_journals.xlsx')
# 'records' is the documented orient name; the abbreviation 'record' is
# rejected by current pandas versions.
for item in journals.to_dict(orient='records'):
    print(item)
    conn.collection.insert_one(item)
Beispiel #16
0
# coding = UTF-8

from sheldon.database.class_mongodb import MongoDB, MonDatabase, MonCollection
from pymongo import MongoClient, DESCENDING, ASCENDING
import pandas as pd
from datetime import datetime, date

# Module-level handle used by query() below: the `.collection` attribute of
# the wrapper around `scraperdata` / `airqualityfromMin`.
mongodb = MongoDB()
database_name = 'scraperdata'
collection_name = 'airqualityfromMin'
moncollection = MonCollection(mongodb=mongodb,
                              database=database_name,
                              collection_name=collection_name).collection


def query(start_date, end_date):
    print(type(start_date), end_date)
    found = list(
        moncollection.find(
            {
                '$and': [{
                    'OPER_DATE': {
                        '$gte': start_date
                    }
                }, {
                    'OPER_DATE': {
                        '$lte': end_date
                    }
                }]
            },
            projection={
Beispiel #17
0
class ProxyManager:
    """Look up proxy servers stored in a MongoDB collection.

    :param mongodb: value passed to ``MonCollection`` as ``mongodb``.
    :param str database: database name, ``proxy`` by default.
    :param str collection_name: collection name, ``proxys`` by default.
    """
    def __init__(self,
                 mongodb='mongodb://*****:*****@123.207.185.126:27017/',
                 database='proxy',
                 collection_name='proxys'):
        # Open the backing collection.
        # NOTE(review): the default connection string embeds credentials in
        # source; consider supplying it from configuration instead.
        settings = dict(mongodb=mongodb,
                        database=database,
                        collection_name=collection_name)
        self._conn = MonCollection(**settings)

    def find(self, type=0, limit=None):
        """Return the addresses of stored proxies, best-ranked first in the query.

        :param type: ``0`` selects records whose ``protocol`` is 0 or 2; any
            other value selects records whose ``protocol`` is 1 or 2. The
            value is also handed to ``Proxy`` when the address is built.
        :param limit: forwarded unchanged to the collection's ``find``.
        :return: a set of ``Proxy(...).address`` values.
        """
        # Only the accepted protocol codes differ between the two cases.
        protocols = [0, 2] if type == 0 else [1, 2]
        found = self._conn.find(filter={'protocol': {
            '$in': protocols
        }},
                                projection={
                                    '_id': False,
                                    'ip': True,
                                    'port': True,
                                    'type': True
                                },
                                sort=[('score', DESCENDING),
                                      ('speed', ASCENDING)],
                                limit=limit)

        addresses = set()
        for item in found:
            proxy = Proxy(ip=item['ip'], port=item['port'], type=type)
            addresses.add(proxy.address)
        return addresses

    @property
    def random_proxy(self):
        """Return one address picked uniformly from ``find(limit=40)``.

        :return: a single proxy address.
        """
        candidates = list(self.find(limit=40))
        return random.choice(candidates)

    @property
    def top_50_proxies(self):
        """Return the addresses from ``find(limit=50)`` as a list.

        :return: list of proxy addresses (built from a set, so unordered).
        """
        addresses = self.find(limit=50)
        return list(addresses)
Beispiel #18
0
                                                'region': True
                                            })['region']
            for acode in ceic_acode
        ]

        #update
        self._redis.set('ceic_acode', ceic_acode)
        self._redis.set('ceic_period', ceic_period)
        self._redis.set('ceic_variable', ceic_variable)
        self._redis.set('ceic_region', ceic_region)


class MongoDBInfoUpdater:
    """Run a selection of the registered info updaters.

    ``updates`` maps an updater name to the class that implements it; each
    selected class is instantiated with no arguments and then called.
    """
    updates = {'ceic': CEICInfoUpdater, 'airquality': AirQualityUpdater}

    def __init__(self, to_be_update=('airquality',)):
        """Remember which updaters to run.

        :param to_be_update: iterable of names from ``updates``; a single
            name may also be given as a plain string.
        """
        # A bare string would otherwise be split into single characters.
        if isinstance(to_be_update, str):
            to_be_update = [to_be_update]
        self._to_be_update = list(to_be_update)

    def __call__(self, *args, **kwargs):
        """Run the selected updaters in order.

        :raises KeyError: when a selected name is not registered in
            ``updates``; updaters listed before it have already run.
        """
        for to_be_update in self._to_be_update:
            try:
                updater_class = self.updates[to_be_update]
            except KeyError:
                raise KeyError(
                    'Unknown updater {!r}; expected one of {}'.format(
                        to_be_update, sorted(self.updates))) from None
            print('update: ', to_be_update)
            updater_class()()


if __name__ == '__main__':
    # NOTE(review): `mcollection` is not used by the visible code below; it
    # looks like a leftover — confirm before removing.
    mcollection = MonCollection(mongodb=MongoDB(),
                                database='region',
                                collection_name='CEIC')

    # Run the updaters selected by MongoDBInfoUpdater's default argument.
    updater = MongoDBInfoUpdater()
    updater()
Beispiel #19
0
# coding = UTF-8

from sheldon.database.class_mongodb import MongoDB, MonDatabase, MonCollection

# 1. Connect to the database
# `con` is the `.collection` attribute of the wrapper around `papers` / `cnki`.
mongo = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')
mdb = MonDatabase(mongodb=mongo, database_name='papers')
con = MonCollection(mongo, mdb, collection_name='cnki').collection

# Distinct journal names found in the collection.
journals = con.find().distinct('journal')

# For every journal print its sorted distinct years; journals with fewer
# than 8 distinct years are additionally printed on a line of their own.
# `i` counts the journals processed (it is only read by the disabled print).
i = 1
for journal in journals:
    period = con.find({'journal': journal}).distinct('year')
    if len(period) < 8:
        print(journal)
        #raise Exception
    print(journal, '->', sorted(period))
    #print(i)
    i += 1
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6

# output to static HTML file
output_file("chart.html")

# 0. Setup
# Working directories; how they are used is not visible in this section.
BASE_PATH = r'D:\data\cnki\pkl'
XLS_PATH = r'D:\data\cnki\xls'

# 1. Database
# `con` is the `.collection` attribute of the wrapper around `papers` / `cnki`.
mongo = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')
mdb = MonDatabase(mongodb=mongo, database_name='papers')
con = MonCollection(mongo, mdb, collection_name='cnki').collection

# Sorted distinct journal names, and the years 2010..2017 as strings.
journals = sorted(con.find().distinct('journal'))
period = [str(year) for year in range(2010, 2018)]

# Boolean switches; presumably each one enables a processing stage further
# down the script — TODO confirm against the code that reads them.
ORIGINAL_AUTHOR_OUTPUT = False
BUILD_AUTHOR_DICT = False
ORGANIZE_AUTHOR = False
RECORD_DATAFRAME = False
CREATE_WEIGHT = False
CREATE_WEIGHT2 = False
DEA = False
DEA2 = True


# 是否同一个地址
Beispiel #21
0
# coding = UTF-8

from sheldon.database.class_mongodb import MongoDB, MonDatabase, MonCollection

# Handle on the `cnki` collection of the `papers` database.
mongodb = MongoDB()
database_name = 'papers'
collection_name = 'cnki'
conn = MonCollection(mongodb=mongodb,
                     database=database_name,
                     collection_name=collection_name).collection

# Author info: {name: {year: address}}
unmatched_records = []
matched_authors = dict()
for record in conn.find():
    if (record.get('author') is not None) and (record.get('address')
                                               is not None):
        if len(record.get('author')) == len(record.get('address')):
            # NOTE(review): the equal-length check suggests that authors and
            # addresses correspond by position; confirm with the data owner.
            for author, address in zip(record.get('author'),
                                       record.get('address')):
                if author in matched_authors.keys():
                    # Keep the first entry seen for an author.
                    pass
                else:
                    # The previous `set(year, address)` raised TypeError
                    # (set() accepts at most one iterable); store the
                    # {year: address} mapping described above instead.
                    matched_authors.update({
                        author: {record.get('year'): address}
                    })
        else:
            pass
Beispiel #22
0
# coding = UTF-8

from sheldon.database.class_mongodb import MongoDB, MonDatabase, MonCollection
from pymongo import MongoClient, DESCENDING, ASCENDING
import re
from datetime import datetime, date

# Handle on the `airqualityfromMin` collection of the `scraperdata` database.
mongodb = MongoDB()
database_name = 'scraperdata'
collection_name = 'airqualityfromMin'
moncollection = MonCollection(
    mongodb=mongodb,
    database=database_name,
    collection_name=collection_name,
).collection

found_all = moncollection.find()

# Print every OPER_DATE that is still stored as a string.
for item in found_all:
    if not isinstance(item['OPER_DATE'], str):
        continue
    id = item['_id']
    print(item['OPER_DATE'])
    # Disabled conversion of the string to a datetime:
    #oper_date = datetime(*[int(i) for i in re.split('-',item['OPER_DATE'])])
    #moncollection.update_one({'_id':id},{'$set':{'OPER_DATE':oper_date}})