Python MonCollectionの例、lib.base.database.class_mongodb.MonCollection Pythonの例

コード例 #1

0

ファイルを表示

ファイル: class_citystatisticsdatabase.py プロジェクト: plutoese/pluto_archive

    def __init__(self):
        """Initialise the interface to the China city statistics database.

        A ``MongoDB`` wrapper is created for ``localhost:27017`` and the
        ``collection`` attribute of the ``regiondata`` / ``citystatistics``
        ``MonCollection`` is kept on ``self.conn``.
        """
        client = MongoDB(conn_str='localhost:27017')
        wrapper = MonCollection(client,
                                database='regiondata',
                                collection_name='citystatistics')
        self.conn = wrapper.collection

コード例 #2

0

ファイルを表示

ファイル: class_proxymanager.py プロジェクト: plutoese/pluto_archive

class ProxyManager:
    """Manage, check and update the proxy-server list kept in MongoDB.

    :param str database: database name handed to ``MonCollection``,
        defaults to ``'proxy'``
    :param str collection_name: collection name handed to ``MonCollection``,
        defaults to ``'proxys'``
    """
    def __init__(self, database='proxy', collection_name='proxys'):
        # Set up the database connection.
        self._conn = MonCollection(mongodb=MongoDB(),
                                   database=database,
                                   collection_name=collection_name)

    def find(self, type=0, limit=None):
        """Return the addresses of the best-ranked proxies.

        :param type: ``0`` selects documents whose ``protocol`` is 0 or 2;
            any other value selects ``protocol`` 1 or 2. The value is also
            forwarded unchanged to ``Proxy``. (The name shadows the builtin
            but is kept because it is part of the public signature.)
        :param limit: forwarded to the collection's ``find`` as ``limit``
        :return: set of ``Proxy(...).address`` values
        """
        # Only the protocol filter differs between the two proxy types.
        protocols = [0, 2] if type == 0 else [1, 2]
        found = self._conn.find(
            filter={'protocol': {'$in': protocols}},
            projection={
                '_id': False,
                'ip': True,
                'port': True,
                'type': True,
            },
            # Highest score first; ties broken by ascending speed value.
            sort=[('score', DESCENDING), ('speed', ASCENDING)],
            limit=limit)
        return {
            Proxy(ip=item['ip'], port=item['port'], type=type).address
            for item in found
        }

    @property
    def random_proxy(self):
        """Return one proxy address picked at random from ``find(limit=100)``.

        The pick is uniform (``random.choice``); no weighting is applied.

        :return: a single proxy address
        :raises IndexError: if ``find`` returns no proxies
        """
        return random.choice(list(self.find(limit=100)))

    @property
    def top_150_proxies(self):
        """Return the proxy addresses produced by ``find(limit=150)``.

        :return: list of proxy addresses (built from a set, so unordered)
        """
        return list(self.find(limit=150))

コード例 #3

0

ファイルを表示

ファイル: class_universitieswebscraper.py プロジェクト: plutoese/pluto_archive

    def __init__(self):
        """Open the two college collections and prepare the request headers.

        Both collections belong to the ``webdata`` database and are reached
        through a ``MongoDB`` wrapper created for ``localhost:27017``.
        """
        client = MongoDB(conn_str='localhost:27017')

        info_wrapper = MonCollection(client,
                                     database='webdata',
                                     collection_name='college_info')
        self._college_info = info_wrapper.collection

        intro_wrapper = MonCollection(client,
                                      database='webdata',
                                      collection_name='college_introduction')
        self._college_intro = intro_wrapper.collection

        # Browser-like User-Agent header value.
        user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.102 Safari/537.36')
        self._headers = {'User-Agent': user_agent}

コード例 #4

0

ファイルを表示

    def store_data_to_db(self, data_collection=None, label_collection=None):
        """Store the data held by the Stata objects in the database.

        For every year in ``self._stata_object`` the data rows are inserted
        one by one into ``data_collection``; the value labels and the
        variable labels are inserted as one document each into
        ``label_collection``.

        :param data_collection: target for data rows; when ``None`` the
            ``cgssdata`` collection of ``surveydata`` is opened
        :param label_collection: target for label documents; when ``None``
            the ``cgsslabel`` collection of ``surveydata`` is opened
        :return: self
        """
        if data_collection is None:
            data_db = MonDatabase(mongodb=MongoDB(),
                                  database_name='surveydata')
            data_collection = MonCollection(
                database=data_db, collection_name='cgssdata').collection

        if label_collection is None:
            label_db = MonDatabase(mongodb=MongoDB(),
                                   database_name='surveydata')
            label_collection = MonCollection(
                database=label_db, collection_name='cgsslabel').collection

        for year in self._stata_object:
            source = self._stata_object[year]

            # One document per data row, tagged with its year.
            for row in source.read().to_dict("records"):
                row["year"] = year
                print(row)
                data_collection.insert_one(row)

            # Inner keys are converted to str before storage.
            raw_value_labels = source.value_labels
            value_doc = {
                name: {
                    str(code): raw_value_labels[name][code]
                    for code in raw_value_labels[name]
                }
                for name in raw_value_labels
            }
            value_doc["year"] = year
            value_doc["type"] = "value labels"
            print(value_doc)
            label_collection.insert_one(value_doc)

            # The variable-label mapping is tagged in place and stored as is.
            variable_doc = source.variable_labels
            variable_doc["year"] = year
            variable_doc["type"] = "variable labels"
            print(variable_doc)
            label_collection.insert_one(variable_doc)

        return self

コード例 #5

0

ファイルを表示

ファイル: class_gaokaowebscraper.py プロジェクト: plutoese/pluto_archive

 def __init__(self):
     """Open the cache and data collections used by the scraper.

     All collections are reached through one ``MongoDB`` wrapper created
     for ``localhost:27017``.
     """
     client = MongoDB(conn_str='localhost:27017')

     def open_collection(database, name):
         # Return the ``collection`` attribute of the MonCollection wrapper.
         return MonCollection(client,
                              database=database,
                              collection_name=name).collection

     self._web_conn = open_collection('cache', 'gaokaoweb')
     self._data_web_conn = open_collection('cache', 'gaokaodataweb')
     self._university_web_conn = open_collection('cache',
                                                 'gaokaouniversityweb')
     self._data_conn = open_collection('webdata', 'gaokao_entrancescore')
     self._copy_data_web_conn = open_collection('webdata',
                                                'gaokaouniversityweb')

コード例 #6

0

ファイルを表示

    def __init__(self, data_collection=None, label_collection=None):
        """Initialise the database connections.

        :param data_collection: collection to use for survey data; when
            ``None`` the ``cgssdata`` collection of ``surveydata`` on
            ``localhost:27017`` is opened
        :param label_collection: collection to use for labels; when ``None``
            the ``cgsslabel`` collection of ``surveydata`` on
            ``localhost:27017`` is opened
        """
        if data_collection is not None:
            self._data_collection = data_collection
        else:
            data_db = MonDatabase(
                mongodb=MongoDB(conn_str='localhost:27017'),
                database_name='surveydata')
            self._data_collection = MonCollection(
                database=data_db, collection_name='cgssdata').collection

        if label_collection is not None:
            self._label_collection = label_collection
        else:
            label_db = MonDatabase(
                mongodb=MongoDB(conn_str='localhost:27017'),
                database_name='surveydata')
            self._label_collection = MonCollection(
                database=label_db, collection_name='cgsslabel').collection

コード例 #7

0

ファイルを表示

    def store_label_to_db(self, label_collection=None):
        """Store the variable-to-value-label links in the database.

        For every year in ``self._stata_label_object`` one document mapping
        each ``name`` entry to its ``vallab`` entry is inserted.

        :param label_collection: target collection; when ``None`` the
            ``cgsslabel`` collection of ``surveydata`` is opened
        :return: self
        """
        if label_collection is None:
            label_db = MonDatabase(mongodb=MongoDB(),
                                   database_name='surveydata')
            label_collection = MonCollection(
                database=label_db, collection_name='cgsslabel').collection

        for year in self._stata_label_object:
            frame = self._stata_label_object[year].read()
            names = frame.loc[:, "name"]
            value_label_names = frame.loc[:, "vallab"]

            document = {
                name: value_label
                for name, value_label in zip(names, value_label_names)
            }
            document["year"] = year
            document["type"] = "variable value lables"
            print(document)
            label_collection.insert_one(document)

        return self

コード例 #8

0

ファイルを表示

ファイル: class_citystatisticsdatabase.py プロジェクト: plutoese/pluto_archive

class CityStatisticsDatabase:
    """Query interface to the China city statistics collection."""

    def __init__(self):
        """Initialise the interface to the China city statistics database.

        ``self.conn`` is the ``collection`` attribute of the ``regiondata`` /
        ``citystatistics`` ``MonCollection``.
        """
        client = MongoDB(conn_str='localhost:27017')
        wrapper = MonCollection(client,
                                database='regiondata',
                                collection_name='citystatistics')
        self.conn = wrapper.collection

    def find(self, *args, **kwargs):
        """Run a query and pivot the result into a wide table.

        :param args: positional arguments forwarded to ``self.conn.find``
        :param kwargs: keyword arguments forwarded to ``self.conn.find``
        :return: DataFrame indexed by (acode, year, region) with one column
            per ``variable(unit)`` label, or ``None`` when nothing matched
        """
        rows = list(self.conn.find(*args, **kwargs))
        if len(rows) == 0:
            return None

        frame = pd.DataFrame(rows)
        # Column label is the variable name followed by its unit in brackets.
        bracketed_unit = frame['unit'].apply(lambda unit: '(' + unit + ')')
        frame['var'] = frame['variable'] + bracketed_unit

        wide = frame.pivot_table(values='value',
                                 index=['year', 'acode', 'region'],
                                 columns=['var'])
        # Swap the first two index levels so acode comes before year.
        return wide.swaplevel(0, 1, axis=0)

    @property
    def variables(self):
        """Return the sorted distinct ``variable`` values as a DataFrame."""
        names = self.conn.find().distinct('variable')
        names = sorted(names)
        return pd.DataFrame(names)

    @property
    def regions(self):
        """Placeholder; always returns ``None``."""
        return None

コード例 #9

0

ファイルを表示

class AdminDivisionDatabase():
    """Connection to the ``admindivision`` collection of the ``region`` database."""

    def __init__(self):
        # Connect to the admindivision collection; ``self.collection`` holds
        # the MonCollection wrapper itself.
        mongo = MongoDB()
        mdb = MonDatabase(mongodb=mongo, database_name='region')
        self.collection = MonCollection(database=mdb,
                                        collection_name='admindivision')

    def find(self, **conds):
        """Query the collection.

        :param conds: field conditions. A list value is turned into an
            ``$in`` condition; any other value is matched as given. Two
            keywords are reserved and never become part of the filter:
            ``projection`` (fields to return) and ``sorts`` (sort
            specification). Passing either as ``None`` selects its default.
        :return: the result of ``self.collection.find(...).sort(...)``
        """
        # pop() with a default removes the reserved keys even when they were
        # passed explicitly as None, so they can never leak into the filter.
        projection = conds.pop('projection', None)
        if projection is None:
            projection = {
                'region': 1,
                'year': 1,
                'adminlevel': 1,
                'acode': 1,
                '_id': 1,
                'parent': 1,
                'uid': 1
            }

        sorts = conds.pop('sorts', None)
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        # Build the query filter from the remaining keyword arguments.
        condition = {
            key: {'$in': value} if isinstance(value, list) else value
            for key, value in conds.items()
        }

        return self.collection.find(condition, projection).sort(sorts)

    @property
    def period(self):
        """Return the sorted distinct ``year`` values."""
        return sorted(self.find().distinct('year'))

コード例 #10

0

ファイルを表示

ファイル: app_missing_date_try.py プロジェクト: plutoese/pluto_archive

# coding = UTF-8

import os
import pandas as pd
from lib.base.database.class_mongodb import MongoDB, MonDatabase, MonCollection

# Remote MongoDB connection; the user/password part of the URI appears as
# asterisks in this listing.
mongo = MongoDB(
    conn_str=
    'mongodb://*****:*****@dds-bp162bb74b8184e41658-pub.mongodb.rds.aliyuncs.com:3717'
)
# 'stock_share_holder' collection of the 'enterprise' database.
mdb = MonDatabase(mongodb=mongo, database_name='enterprise')
mcon = MonCollection(mongo, mdb, 'stock_share_holder')

# Local Windows paths: a directory and the Excel workbook loaded below.
matched_file_path = r'E:\datahouse\projectdata\shareholder\matched'
file_path = r'E:\datahouse\projectdata\shareholder\missing_date_df.xlsx'
share_holder_all_df = pd.read_excel(file_path)

for ind in share_holder_all_df.index:
    codeA = share_holder_all_df.loc[ind, '上市公司代码_ComCd']
    codeB = share_holder_all_df.loc[ind, '股东上市公司代码_SHComCd']
    codeB = codeB[1:]
    found = mcon.collection.find(
        {
            '上市公司代码_ComCd': codeA,
            '所属基金/股票的代码_SecuCd': codeB
        },
        projection={
            '_id': 0,
            '上市公司代码_ComCd': 1,
            '最新公司全称_LComNm': 1,
            '截止日期_EndDt': 1,

コード例 #11

0

ファイルを表示

                    '江西科技师范大学':'江西科技师范学院',
                    '天津外国语大学':'天津工程师范学院',
                    '内蒙古财经大学':'内蒙古财经学院',
                    '贵州师范学院':'贵州师范大学',
                    }


def college_replace(college_name):
    """Return the mapped name for *college_name*, or the name itself.

    The lookup table is the module-level ``college_tranform`` dict; names
    without an entry are returned unchanged.
    """
    return college_tranform.get(college_name, college_name)

# 1. Database connections
mongo = MongoDB(conn_str='localhost:27017')
college_info_con = MonCollection(mongo, database='webdata', collection_name='college_info').collection
entrance_score_con = MonCollection(mongo, database='webdata', collection_name='gaokao_entrancescore').collection

# 2. Universities present in the database (distinct 'university' values)
entrance_colleges = entrance_score_con.find().distinct('university')

# 3. Import the alumni-association university ranking
college_rate_2011_filepath = r'E:\cyberspace\worklot\college\2011年校友会大学排名.xlsx'
college_2011 = pd.read_excel(college_rate_2011_filepath)
# Normalise school names through college_replace before comparing.
college_2011['学校名称'] = college_2011['学校名称'].apply(college_replace)
# Print every ranked school that has no match among the database universities.
for item in college_2011['学校名称']:
    if item not in entrance_colleges:
        print(item)

college_rate_2012_filepath = r'E:\cyberspace\worklot\college\2012年校友会大学排名.xlsx'
college_2012 = pd.read_excel(college_rate_2012_filepath)

コード例 #12

0

ファイルを表示

 def __init__(self):
     """Attach to the ``admindivision`` collection of the ``region`` database.

     ``self.collection`` holds the ``MonCollection`` wrapper itself.
     """
     region_db = MonDatabase(mongodb=MongoDB(), database_name='region')
     self.collection = MonCollection(collection_name='admindivision',
                                     database=region_db)

コード例 #13

0

ファイルを表示

ファイル: class_literaturereport.py プロジェクト: plutoese/pluto_archive

 def __init__(self, database='papers', collection='econpapers'):
     """Start with no literature loaded and open the paper collection.

     :param database: database name handed to ``MonCollection``
     :param collection: collection name handed to ``MonCollection``
     """
     self.literatures = None
     client = MongoDB()
     self._paper_conn = MonCollection(mongodb=client,
                                      collection_name=collection,
                                      database=database)

コード例 #14

0

ファイルを表示

ファイル: app_dataset_construction.py プロジェクト: plutoese/pluto_archive

# coding = UTF-8

import re
import pysal
from pymongo import ASCENDING
import pandas as pd
from lib.base.database.class_mongodb import MongoDB, MonCollection
from application.dataworld.admindivision.class_admindivision import AdminDivision

# 1. Database connections
mongo = MongoDB(conn_str='localhost:27017')
college_info_con = MonCollection(mongo,
                                 database='webdata',
                                 collection_name='college_info').collection
entrance_score_con = MonCollection(
    mongo, database='webdata',
    collection_name='gaokao_entrancescore').collection

# 2. Step switches
# a. Export the gaokao score data for each year
IS_EXPORT_RAW_EXAM_SCORE = False
# b. Export the college information data
IS_EXPORT_RAW_COLLEGE_INFO = False
# c. Build the 2011-2013 panel data
IS_MERGE_INTO_PANEL = False
# d. Merge in the college information data
IS_MERGE_COLLEGE_INFO = False
# e. Merge in the university ranking information
IS_MERGE_COLLEGE_RATE = False
# f. Merge in the province-level economic information
IS_MERGE_PROVINCE_PERGDP = False

コード例 #15

0

ファイルを表示

class CgssDatabase:
    """Read access to CGSS survey data and its label documents in MongoDB."""

    def __init__(self, data_collection=None, label_collection=None):
        """Initialise the database connections.

        :param data_collection: collection holding the survey rows; when
            ``None`` the ``cgssdata`` collection of ``surveydata`` on
            ``localhost:27017`` is opened
        :param label_collection: collection holding the label documents;
            when ``None`` the ``cgsslabel`` collection of ``surveydata`` on
            ``localhost:27017`` is opened
        """
        if data_collection is None:
            self._data_collection = MonCollection(
                database=MonDatabase(
                    mongodb=MongoDB(conn_str='localhost:27017'),
                    database_name='surveydata'),
                collection_name='cgssdata').collection
        else:
            self._data_collection = data_collection

        if label_collection is None:
            self._label_collection = MonCollection(
                database=MonDatabase(
                    mongodb=MongoDB(conn_str='localhost:27017'),
                    database_name='surveydata'),
                collection_name='cgsslabel').collection
        else:
            self._label_collection = label_collection

    def query(self, year, variables=None):
        """Fetch one year of survey data together with its labels.

        :param year: value matched against the ``year`` field
        :param variables: optional iterable of variable names to return;
            ``None`` returns every field except ``_id`` and ``year``
        :return: dict with keys ``dataframe``, ``variable_labels`` and
            ``value_labels``
        """
        if variables is not None:
            projection = {"_id": False}
            projection.update((var, True) for var in variables)
        else:
            projection = {"_id": False, "year": False}

        found = self._data_collection.find({"year": year},
                                           projection=projection)

        # NOTE(review): iterator2dataframes is defined elsewhere; 2000 is
        # presumably a chunk size — confirm against the helper.
        pdataframe = iterator2dataframes(found, 2000)
        pdataframe = pd.DataFrame(pdataframe, columns=variables)
        # Rows are numbered from 1.
        pdataframe.index = range(1, pdataframe.shape[0] + 1)

        return {
            "dataframe":
            pdataframe,
            "variable_labels":
            self.get_variable_label_df(year=year, variables=variables),
            "value_labels":
            self.get_variable_value_label_df(year=year, variables=variables)
        }

    def get_variable_label_df(self, year, variables=None):
        """Return a ``variable`` / ``label`` table for the given year.

        :param year: value matched against the ``year`` field
        :param variables: optional iterable restricting (and ordering) the
            rows; ``None`` lists every variable of the label document
        :return: DataFrame with columns ``variable`` and ``label``, indexed
            from 1
        :raises KeyError: if a requested variable has no label entry
        """
        var_label_dict = self.get_variable_label(year=year)
        names = var_label_dict if variables is None else variables

        # Both paths use the same column names (the former 'lable' spelling
        # in the filtered path was a typo).
        result = pd.DataFrame([(var, var_label_dict[var]) for var in names],
                              columns=['variable', 'label'])
        result.index = range(1, result.shape[0] + 1)
        return result

    def get_variable_value_label_df(self, year, variables=None):
        """Return the value labels of the given variables as one long table.

        :param year: value matched against the ``year`` field
        :param variables: optional iterable of variable names; ``None``
            uses every variable of the year's variable-label document
        :return: DataFrame with columns ``variable``, ``value`` and
            ``label``, or ``None`` when no variable has value labels
        """
        var_value_link = self.get_variable_value_link(year=year)
        value_labels = self.get_value_label(year=year)

        if variables is None:
            variables = list(self.get_variable_label(year=year))

        frames = []
        for var in variables:
            value_label = var_value_link[var]
            # An empty link means the variable has no value labels.
            if not value_label:
                continue
            frame = pd.DataFrame(list(value_labels[value_label].items()),
                                 columns=["value", "label"])
            frame['variable'] = var
            frames.append(frame[["variable", "value", "label"]])

        if not frames:
            return None
        # A single concat keeps each frame's own row index.
        return pd.concat(frames)

    def _find_label_document(self, doc_type, year):
        """Return the first label document of *doc_type* for *year*.

        The ``_id``, ``type`` and ``year`` fields are excluded.
        """
        return self._label_collection.find(
            {
                "type": doc_type,
                "year": year
            },
            projection={
                "_id": False,
                "type": False,
                "year": False
            })[0]

    def get_variable_value_link(self, year):
        """Return the variable-to-value-label link document.

        :param year: value matched against the ``year`` field
        :return: the first matching document
        """
        # The misspelt type string is the value stored in the database.
        return self._find_label_document("variable value lables", year)

    def get_variable_label(self, year):
        """Return the CGSS variable labels of a given year.

        :param year: value matched against the ``year`` field
        :return: the first matching document
        """
        return self._find_label_document("variable labels", year)

    def get_value_label(self, year):
        """Return the CGSS value labels of a given year.

        :param year: value matched against the ``year`` field
        :return: the first matching document
        """
        return self._find_label_document("value labels", year)

    @property
    def year(self):
        """Return the distinct ``year`` values present in the data collection.

        :return: result of ``distinct('year')``
        """
        return self._data_collection.find().distinct('year')

コード例 #16

0

ファイルを表示

ファイル: class_gaokaowebscraper.py プロジェクト: plutoese/pluto_archive

class GaoKaoWebScraper():
    """Collect entrance-score pages from college.gaokao.com into MongoDB."""

    # Browser-like User-Agent used for the direct ``requests`` calls.
    _HEADERS = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    # Keys under which the six cells of a score-table row are stored.
    _SCORE_FIELDS = ('年份', '最低', '最高', '平均', '录取人数', '录取批次')
    # Cell text that is stored as None.
    _MISSING_CELL = '------'

    def __init__(self):
        """Open the cache and data collections on ``localhost:27017``."""
        mongo = MongoDB(conn_str='localhost:27017')
        self._web_conn = MonCollection(mongo,
                                       database='cache',
                                       collection_name='gaokaoweb').collection
        self._data_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaodataweb').collection
        self._university_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaouniversityweb').collection
        self._data_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokao_entrancescore').collection
        self._copy_data_web_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokaouniversityweb').collection

    def init_first_stage(self):
        """Store the URL of every search-result page in ``_web_conn``.

        For each ``a1..a31`` / ``b1..b31`` combination the first page is
        fetched to read the page count, then one ``{'type': 'search'}``
        record per page is inserted.
        """
        web_fmt = "http://college.gaokao.com/schpoint/{}/{}/{}/"

        for i in range(1, 32):
            for j in range(1, 32):
                a_part = ''.join(['a', str(i)])
                b_part = ''.join(['b', str(j)])
                url = web_fmt.format(a_part, b_part, 'p1')
                raw_result = requests.get(url, headers=self._HEADERS).text
                bs_obj = BeautifulSoup(raw_result, "lxml")

                # Only the first string of the '#qx' element is needed. If
                # there is none, skip this combination rather than reuse the
                # page count left over from a previous iteration.
                first_string = next(iter(bs_obj.select('#qx')[0].strings),
                                    None)
                if first_string is None:
                    continue
                total_pages = re.split('页',
                                       re.split('/', first_string)[1])[0]

                if total_pages:
                    for m in range(1, int(total_pages) + 1):
                        web = web_fmt.format(a_part, b_part,
                                             ''.join(['p', str(m)]))
                        record = {'type': 'search', 'url': web}
                        print(record)
                        self._web_conn.insert_one(record)

    def init_second_stage(self):
        """Visit every stored search page and store the result links found.

        Each ``.blue`` element containing a link whose href matches
        ``result`` yields one ``{'type': 'data'}`` record in
        ``_data_web_conn``.
        """
        for aitem in self._web_conn.find({'type': 'search'}):
            raw_result = requests.get(aitem['url'],
                                      headers=self._HEADERS).text
            bs_obj = BeautifulSoup(raw_result, "lxml")
            for obj in bs_obj.select('.blue'):
                found = obj.find_all(href=re.compile("result"))
                if found:
                    record = {'type': 'data', 'url': found[0]['href']}
                    print(record)
                    self._data_web_conn.insert_one(record)

    def init_three_stage(self):
        """Copy the distinct data URLs into ``_university_web_conn``."""
        for url in self._data_web_conn.find().distinct('url'):
            self._university_web_conn.insert_one({'url': url})

    @classmethod
    def _parse_score_row(cls, cells):
        """Convert one six-cell table row into a field dict.

        The first five cells become ``int(float(cell))``; the sixth is kept
        as text; the missing-cell marker becomes ``None``.

        :raises ValueError: if the row does not have exactly six cells
        """
        if len(cells) != len(cls._SCORE_FIELDS):
            raise ValueError(
                'expected {} cells in a score row, got {}: {!r}'.format(
                    len(cls._SCORE_FIELDS), len(cells), cells))

        parsed = {}
        for index, (field, cell) in enumerate(zip(cls._SCORE_FIELDS, cells)):
            if cell == cls._MISSING_CELL:
                parsed[field] = None
            elif index < 5:
                parsed[field] = int(float(cell))
            else:
                parsed[field] = cell
        return parsed

    def scrape(self, using_proxy=False):
        """Scrape the queued university pages in batches of five.

        Every parsed score row that is not yet in ``_data_conn`` is
        inserted; each processed URL is removed from
        ``_copy_data_web_conn``. The loop ends when that queue is empty.

        :param using_proxy: forwarded to ``StaticWebScraper``
        :raises ValueError: if a non-empty table row does not have six cells
        """
        # NOTE(review): Collection.count() was removed in PyMongo 4; if
        # `.collection` is a PyMongo Collection on a newer driver, switch to
        # count_documents({}) — confirm the driver version first.
        nums = self._copy_data_web_conn.count()
        while nums > 0:
            urls = [
                item['url'] for item in self._copy_data_web_conn.find(limit=5)
            ]
            print(urls)
            start = time.time()
            scraper = StaticWebScraper(urls=urls, using_proxy=using_proxy)
            scraper.start()

            # Each result is indexed as (html text, url).
            for html in scraper.result:
                url = html[1]
                bs_obj = BeautifulSoup(html[0], "lxml")
                record = dict(
                    zip(['university', 'region', 'type'], [
                        item.contents[0]
                        for item in bs_obj.select('.btnFsxBox > font')
                    ]))

                htmlparser = HtmlParser(html_content=bs_obj)
                table = htmlparser.table('#pointbyarea > table')
                for item in table:
                    if len(item) == 0:
                        continue
                    copy_record = copy.copy(record)
                    copy_record.update(self._parse_score_row(item))

                    # Insert only rows that are not stored yet.
                    if self._data_conn.find_one(copy_record) is None:
                        print('Insert..', copy_record)
                        self._data_conn.insert_one(copy_record)
                self._copy_data_web_conn.delete_one({'url': url})

            print('Total: {}'.format(time.time() - start))
            nums = self._copy_data_web_conn.count()

コード例 #17

0

ファイルを表示

# coding = UTF-8

import re
import pandas as pd
from pymongo import ASCENDING
from lib.base.database.class_mongodb import MongoDB, MonCollection

# 1. Initialisation
mongo = MongoDB(conn_str='localhost:27017')
province_con = MonCollection(mongo,
                             database='regiondata',
                             collection_name='provinces').collection
provinces = province_con.find(projection={
    '_id': False,
    'name': True,
    'id': True
},
                              sort=[('id', ASCENDING)])
# Province name -> province id.
province_dict = {province['name']: province['id'] for province in provinces}

perGDP_filepath = r'E:\cyberspace\worklot\college\province_perGDP.xlsx'
perGDP = pd.read_excel(perGDP_filepath)

# 2. Match province names
for ind in perGDP.index:
    region = perGDP.loc[ind, 'region']
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    region = re.sub(r'\s+', '', region)
    for province in province_dict:
        # The whitespace-free region text is used as a regex pattern; when
        # several province names match, the last one wins.
        if re.search(region, province) is not None:
            perGDP.loc[ind, 'province'] = province
            perGDP.loc[ind, 'acode'] = province_dict[province]

コード例 #18

0

ファイルを表示

ファイル: class_universitieswebscraper.py プロジェクト: plutoese/pluto_archive

class CollegeInfo():
    """Collect the college list pages of college.gaokao.com into MongoDB."""

    def __init__(self):
        """Open the college collections and prepare the request headers."""
        mongo = MongoDB(conn_str='localhost:27017')
        self._college_info = MonCollection(
            mongo, database='webdata',
            collection_name='college_info').collection
        self._college_intro = MonCollection(
            mongo, database='webdata',
            collection_name='college_introduction').collection

        self._headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

    @staticmethod
    def _tier_flags(contents):
        """Return ``(is_985, is_211)`` from the children of the second ``li``.

        One child means neither flag; two children means the flag named by
        the second child's ``string``; any other count means both flags.

        :raises ValueError: if the second of two children is neither
            ``'211'`` nor ``'985'``
        """
        if len(contents) == 1:
            return False, False
        if len(contents) == 2:
            tag = contents[1].string
            if tag == '211':
                return False, True
            if tag == '985':
                return True, False
            raise ValueError('unexpected college tag: {!r}'.format(tag))
        return True, True

    def _parse_college(self, ul_item):
        """Build the info dict of one college from its ``ul`` element."""
        one_college_info = dict()
        for n, li_item in enumerate(ul_item.select('li')):
            if n == 1:
                (one_college_info['985'],
                 one_college_info['211']) = self._tier_flags(li_item.contents)
            else:
                # Every other entry is "key：value" with a full-width colon.
                key, value = re.split('：', li_item.string)
                if value == '——' or value == '------':
                    value = None
                one_college_info[key] = value
        return one_college_info

    def init_first_stage(self):
        """Store every listed college that is not yet in ``_college_info``.

        For each ``a1..a31`` listing the first page is fetched to read the
        page count, then every page is parsed and each college record that
        has no exact match in the collection is inserted.

        :raises ValueError: if a college carries an unknown tier tag
        """
        web_fmt = "http://college.gaokao.com/schlist/a{}/p{}"

        for i in range(1, 32):
            url = web_fmt.format(str(i), '1')
            raw_result = requests.get(url, headers=self._headers).text
            bs_obj = BeautifulSoup(raw_result, "lxml")

            # Only the first string of the '#qx' element is needed. If there
            # is none, skip this listing rather than reuse the page count
            # left over from the previous one.
            first_string = next(iter(bs_obj.select('#qx')[0].strings), None)
            if first_string is None:
                continue
            total_pages = re.split('页', re.split('/', first_string)[1])[0]

            for j in range(1, int(total_pages) + 1):
                surf_url = web_fmt.format(str(i), str(j))
                print(surf_url)
                surf_result = requests.get(surf_url,
                                           headers=self._headers).text
                surf_obj = BeautifulSoup(surf_result, "lxml")
                surf_content = surf_obj.select('.scores_List')[0]

                colleges = [
                    item.attrs['title']
                    for item in surf_content.select('.blue')
                ]
                college_info = [
                    self._parse_college(ul_item)
                    for ul_item in surf_content.select('ul')
                ]

                # Names and info dicts are paired by position.
                for m, college_name in enumerate(colleges):
                    college_info[m]['学校'] = college_name

                for college in college_info:
                    if self._college_info.find_one(college) is None:
                        print('Insert..', college)
                        self._college_info.insert_one(college)

コード例 #19

0

ファイルを表示

ファイル: app_export_colleges.py プロジェクト: plutoese/pluto_archive

# coding = UTF-8

import pandas as pd
from lib.base.database.class_mongodb import MongoDB, MonCollection

# Open the 'college_info' collection of the local 'webdata' database.
mongo = MongoDB(conn_str='localhost:27017')
college_info_con = MonCollection(mongo, database='webdata', collection_name='college_info').collection

# Select the documents whose '高校性质' field equals '本科', keeping only the
# '高校所在地' and '学校' fields.
found = college_info_con.find({"高校性质" : "本科"}, projection={'_id':False, '高校所在地':True, '学校':True})
college_pd = pd.DataFrame(list(found))

# Write the selection to an Excel workbook.
college_pd.to_excel(r'E:\cyberspace\worklot\college\colleges.xlsx')

コード例 #20

0

ファイルを表示

ファイル: class_proxymanager.py プロジェクト: plutoese/pluto_archive

 def __init__(self, database='proxy', collection_name='proxys'):
     """Set up the database connection.

     :param database: database name handed to ``MonCollection``
     :param collection_name: collection name handed to ``MonCollection``
     """
     target = dict(database=database, collection_name=collection_name)
     self._conn = MonCollection(mongodb=MongoDB(), **target)

コード例 #21

0

ファイルを表示

ファイル: app_add_acode.py プロジェクト: plutoese/pluto_archive

# coding = UTF-8

import re
from pymongo import ASCENDING
from lib.base.database.class_mongodb import MongoDB, MonCollection

# Open the three collections this script works with on the local server.
mongo = MongoDB(conn_str='localhost:27017')
province_con = MonCollection(mongo,
                             database='regiondata',
                             collection_name='provinces').collection
college_info_con = MonCollection(mongo,
                                 database='webdata',
                                 collection_name='college_info').collection
entrance_score_con = MonCollection(
    mongo, database='webdata',
    collection_name='gaokao_entrancescore').collection

# Load province names and ids, ordered by ascending id.
provinces = province_con.find(projection={
    '_id': False,
    'name': True,
    'id': True
},
                              sort=[('id', ASCENDING)])
# Province name -> province id.
province_dict = {province['name']: province['id'] for province in provinces}

# Step switches; COLLEGE_INFO gates the block that follows.
# NOTE(review): ENTRANCE_EXAM presumably gates a later block — confirm.
COLLEGE_INFO = False
ENTRANCE_EXAM = True

if COLLEGE_INFO:
    for item in college_info_con.find(projection={
            '_id': True,

コード例 #22

0

ファイルを表示

ファイル: class_econpapers_crawler.py プロジェクト: plutoese/pluto_archive

    def __init__(self, journals_webs=None):
        """Remember the journal websites and open the two paper collections.

        :param journals_webs: stored unchanged on ``self._journal_websites``
        """
        self._journal_websites = journals_webs

        # Both collections live in the 'papers' database; each gets its own
        # MongoDB() instance.
        targets = (('_pre_conn', 'econpaperwebsites'),
                   ('_paper_conn', 'econpapers'))
        for attribute, name in targets:
            setattr(self, attribute,
                    MonCollection(mongodb=MongoDB(),
                                  database='papers',
                                  collection_name=name))

コード例 #23

0

ファイルを表示

ファイル: app_cross_holding_data_to_db.py プロジェクト: plutoese/pluto_archive

# coding = UTF-8

import os
import pickle
import numpy as np
import pandas as pd
from lib.base.database.class_mongodb import MongoDB, MonDatabase, MonCollection

# Remote MongoDB connection; the user/password part of the URI appears as
# asterisks in this listing.
mongo = MongoDB(
    conn_str=
    'mongodb://*****:*****@dds-bp162bb74b8184e41658-pub.mongodb.rds.aliyuncs.com:3717'
)
# 'cross_holding_data' collection of the 'enterprise' database.
mdb = MonDatabase(mongodb=mongo, database_name='enterprise')
mcon = MonCollection(mongo, mdb, 'cross_holding_data')

PROJECT_DATA_PATH = r'E:\datahouse\projectdata\shareholder'

# First read: only used to discover the column names.
file_path = os.path.join(PROJECT_DATA_PATH, 'cross_holding_main_table.xls')
cross_holding_data_table = pd.read_excel(file_path)

# Second read: load every column as str.
vars = list(cross_holding_data_table.columns)
var_dtype = dict(zip(vars, [str] * len(vars)))
print(var_dtype)
cross_holding_data_table = pd.read_excel(file_path, dtype=var_dtype)
#cross_holding_data_table = cross_holding_data_table.replace('nan',None)

# One dict per table row.
records = cross_holding_data_table.to_dict('records')

コード例 #24

0

ファイルを表示

ファイル: class_literaturereport.py プロジェクト: plutoese/pluto_archive

                                  type=1)
            abstract = item.get('abstract')
            if abstract is not None:
                abstract = escape_latex(abstract)
                doc.document.append(abstract)
        #doc.document.generate_tex(r'E:\github\latexdoc\latexdoc\template\academicjournal\wlscirep\plutopaper.tex')
        doc.document.generate_pdf(
            r'D:\github\pluto\lib\base\pylatex\template\output\{}'.format(
                file_name))


if __name__ == '__main__':
    # Print every journal found in the 'econpapers' collection and build a
    # report for Econometrica covering the years from 2012 onwards.
    report = LiteratureReport()

    conn = MonCollection(mongodb=MongoDB(),
                         database='papers',
                         collection_name='econpapers')
    journals = conn.collection.find().distinct('journal')

    for journal in journals:
        print(journal)
        if journal != 'Econometrica':
            continue

        query = {'journal': journal, 'year': {'$gte': 2012}}
        ordering = [('journal', ASCENDING), ('year', DESCENDING)]
        report.load_record_from_db(query=query, sort=ordering)
        report.to_report(file_name=journal)