def __init__(self):
    """Open the connection to the Chinese city statistics collection."""
    client = MongoDB(conn_str='localhost:27017')
    self.conn = MonCollection(client,
                              database='regiondata',
                              collection_name='citystatistics').collection
class ProxyManager:
    """Manage, validate and refresh a pool of proxy servers.

    :param str proxy_web: proxy source url, defaults to
        http://www.youdaili.net/Daili/guonei/
    :return: no return value
    """

    def __init__(self, database='proxy', collection_name='proxys'):
        # Connect to the backing MongoDB collection.
        self._conn = MonCollection(mongodb=MongoDB(),
                                   database=database,
                                   collection_name=collection_name)

    def find(self, type=0, limit=None):
        """Return a set of proxy addresses of the requested type.

        ``type == 0`` selects protocols {0, 2}; any other value selects
        {1, 2}.  Results are ordered by descending score, then ascending
        speed.  (The two original branches were identical except for the
        protocol list; they are merged here.)

        :param type: protocol type selector (0 or non-zero)
        :param limit: maximum number of documents to fetch, or None
        :return: set of proxy address strings
        """
        protocols = [0, 2] if type == 0 else [1, 2]
        found = self._conn.find(filter={'protocol': {'$in': protocols}},
                                projection={
                                    '_id': False,
                                    'ip': True,
                                    'port': True,
                                    'type': True
                                },
                                sort=[('score', DESCENDING),
                                      ('speed', ASCENDING)],
                                limit=limit)
        return {
            Proxy(ip=item['ip'], port=item['port'], type=type).address
            for item in found
        }

    @property
    def random_proxy(self):
        """Return one proxy chosen uniformly at random from the top 100.

        NOTE(review): the original docstring claimed count-weighted
        selection; the code performs a uniform random choice.
        """
        return random.choice(list(self.find(limit=100)))

    @property
    def top_150_proxies(self):
        """Return a list built from the top 150 proxies."""
        return list(self.find(limit=150))
def __init__(self):
    """Connect to the college info/introduction collections and set headers."""
    client = MongoDB(conn_str='localhost:27017')
    self._college_info = MonCollection(
        client, database='webdata',
        collection_name='college_info').collection
    self._college_intro = MonCollection(
        client, database='webdata',
        collection_name='college_introduction').collection
    # Browser-like User-Agent so the target site serves normal pages.
    self._headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
def store_data_to_db(self, data_collection=None, label_collection=None):
    """Store the wrapped Stata data and its labels into MongoDB.

    :param data_collection: target data collection
        (defaults to surveydata.cgssdata)
    :param label_collection: target label collection
        (defaults to surveydata.cgsslabel)
    :return: self
    """
    if data_collection is None:
        data_collection = MonCollection(
            database=MonDatabase(mongodb=MongoDB(),
                                 database_name='surveydata'),
            collection_name='cgssdata').collection
    if label_collection is None:
        label_collection = MonCollection(
            database=MonDatabase(mongodb=MongoDB(),
                                 database_name='surveydata'),
            collection_name='cgsslabel').collection

    for year in self._stata_object:
        reader = self._stata_object[year]
        # Insert every data row, tagged with its survey year.
        for row in reader.read().to_dict("records"):
            row["year"] = year
            print(row)
            data_collection.insert_one(row)

        # Value labels: stringify inner keys so they are valid BSON keys.
        raw_value_labels = reader.value_labels
        value_label_doc = {
            name: {str(code): mapping[code] for code in mapping}
            for name, mapping in raw_value_labels.items()
        }
        value_label_doc["year"] = year
        value_label_doc["type"] = "value labels"
        print(value_label_doc)
        label_collection.insert_one(value_label_doc)

        # Variable labels document for the same year.
        variable_label_doc = reader.variable_labels
        variable_label_doc["year"] = year
        variable_label_doc["type"] = "variable labels"
        print(variable_label_doc)
        label_collection.insert_one(variable_label_doc)
    return self
def __init__(self):
    """Open the cache and result collections used by the scraper."""
    client = MongoDB(conn_str='localhost:27017')

    def _coll(db, name):
        # Small helper: open one collection on the shared client.
        return MonCollection(client, database=db,
                             collection_name=name).collection

    self._web_conn = _coll('cache', 'gaokaoweb')
    self._data_web_conn = _coll('cache', 'gaokaodataweb')
    self._university_web_conn = _coll('cache', 'gaokaouniversityweb')
    self._data_conn = _coll('webdata', 'gaokao_entrancescore')
    self._copy_data_web_conn = _coll('webdata', 'gaokaouniversityweb')
def __init__(self, data_collection=None, label_collection=None):
    """Initialize the CGSS data and label collection connections.

    :param data_collection: optional pre-built data collection
    :param label_collection: optional pre-built label collection
    """
    def _default(collection_name):
        # Default connection to the local surveydata database.
        return MonCollection(
            database=MonDatabase(mongodb=MongoDB(conn_str='localhost:27017'),
                                 database_name='surveydata'),
            collection_name=collection_name).collection

    self._data_collection = (data_collection
                             if data_collection is not None
                             else _default('cgssdata'))
    self._label_collection = (label_collection
                              if label_collection is not None
                              else _default('cgsslabel'))
def store_label_to_db(self, label_collection=None):
    """Store the variable/value-label links in the database.

    :param label_collection: target collection
        (defaults to surveydata.cgsslabel)
    :return: self
    """
    if label_collection is None:
        label_collection = MonCollection(
            database=MonDatabase(mongodb=MongoDB(),
                                 database_name='surveydata'),
            collection_name='cgsslabel').collection
    for year in self._stata_label_object:
        frame = self._stata_label_object[year].read()
        doc = dict(zip(frame.loc[:, "name"], frame.loc[:, "vallab"]))
        doc["year"] = year
        # NOTE: the "lables" misspelling is the key actually stored in
        # and queried from the database, so it is kept as-is.
        doc["type"] = "variable value lables"
        print(doc)
        label_collection.insert_one(doc)
    return self
class CityStatisticsDatabase:
    """Query interface for the Chinese city statistics database."""

    def __init__(self):
        """Connect to regiondata.citystatistics on the local MongoDB."""
        client = MongoDB(conn_str='localhost:27017')
        self.conn = MonCollection(client,
                                  database='regiondata',
                                  collection_name='citystatistics').collection

    def find(self, *args, **kwargs):
        """Run a query and pivot the result into a (region, year) panel.

        :param args: positional arguments forwarded to pymongo ``find``
        :param kwargs: keyword arguments forwarded to pymongo ``find``
        :return: pivoted DataFrame, or None when nothing matched
        """
        matched = list(self.conn.find(*args, **kwargs))
        if not matched:
            return None
        frame = pd.DataFrame(matched)
        # Combine variable name and unit into one column label: "name(unit)".
        frame['var'] = frame['variable'] + frame['unit'].apply(
            lambda unit: '({})'.format(unit))
        panel = pd.pivot_table(frame,
                               values='value',
                               index=['year', 'acode', 'region'],
                               columns=['var'])
        # Put acode before year in the row index.
        return panel.swaplevel(0, 1, axis=0)

    @property
    def variables(self):
        """All distinct variable names, sorted, as a one-column DataFrame."""
        names = self.conn.find().distinct('variable')
        return pd.DataFrame(sorted(names))

    @property
    def regions(self):
        # Not implemented yet.
        return None
class AdminDivisionDatabase():
    """Wraps queries against the region.admindivision collection."""

    def __init__(self):
        # Connect to the admindivision collection.
        client = MongoDB()
        region_db = MonDatabase(mongodb=client, database_name='region')
        self.collection = MonCollection(database=region_db,
                                        collection_name='admindivision')

    def find(self, **conds):
        """Query administrative divisions.

        The special keys ``projection`` and ``sorts`` override the
        defaults; every other key becomes an equality condition, with
        list values translated into MongoDB ``$in`` conditions.
        """
        projection = conds.get('projection')
        if projection is None:
            projection = {
                'region': 1,
                'year': 1,
                'adminlevel': 1,
                'acode': 1,
                '_id': 1,
                'parent': 1,
                'uid': 1
            }
        else:
            conds.pop('projection')

        sorts = conds.get('sorts')
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]
        else:
            conds.pop('sorts')

        # Remaining keyword arguments form the filter document.
        condition = {
            key: ({'$in': value} if isinstance(value, list) else value)
            for key, value in conds.items()
        }
        return self.collection.find(condition, projection).sort(sorts)

    @property
    def period(self):
        """Sorted list of years present in the collection."""
        return sorted(self.find().distinct('year'))
# coding = UTF-8 import os import pandas as pd from lib.base.database.class_mongodb import MongoDB, MonDatabase, MonCollection mongo = MongoDB( conn_str= 'mongodb://*****:*****@dds-bp162bb74b8184e41658-pub.mongodb.rds.aliyuncs.com:3717' ) mdb = MonDatabase(mongodb=mongo, database_name='enterprise') mcon = MonCollection(mongo, mdb, 'stock_share_holder') matched_file_path = r'E:\datahouse\projectdata\shareholder\matched' file_path = r'E:\datahouse\projectdata\shareholder\missing_date_df.xlsx' share_holder_all_df = pd.read_excel(file_path) for ind in share_holder_all_df.index: codeA = share_holder_all_df.loc[ind, '上市公司代码_ComCd'] codeB = share_holder_all_df.loc[ind, '股东上市公司代码_SHComCd'] codeB = codeB[1:] found = mcon.collection.find( { '上市公司代码_ComCd': codeA, '所属基金/股票的代码_SecuCd': codeB }, projection={ '_id': 0, '上市公司代码_ComCd': 1, '最新公司全称_LComNm': 1, '截止日期_EndDt': 1,
'江西科技师范大学':'江西科技师范学院', '天津外国语大学':'天津工程师范学院', '内蒙古财经大学':'内蒙古财经学院', '贵州师范学院':'贵州师范大学', } def college_replace(college_name): if college_name in college_tranform: return college_tranform[college_name] else: return college_name # 1. 数据库连接 mongo = MongoDB(conn_str='localhost:27017') college_info_con = MonCollection(mongo, database='webdata', collection_name='college_info').collection entrance_score_con = MonCollection(mongo, database='webdata', collection_name='gaokao_entrancescore').collection # 2. 数据库大学集合 entrance_colleges = entrance_score_con.find().distinct('university') # 3. 导入校友会大学 college_rate_2011_filepath = r'E:\cyberspace\worklot\college\2011年校友会大学排名.xlsx' college_2011 = pd.read_excel(college_rate_2011_filepath) college_2011['学校名称'] = college_2011['学校名称'].apply(college_replace) for item in college_2011['学校名称']: if item not in entrance_colleges: print(item) college_rate_2012_filepath = r'E:\cyberspace\worklot\college\2012年校友会大学排名.xlsx' college_2012 = pd.read_excel(college_rate_2012_filepath)
def __init__(self):
    """Connect to the region.admindivision collection."""
    client = MongoDB()
    region_db = MonDatabase(mongodb=client, database_name='region')
    self.collection = MonCollection(database=region_db,
                                    collection_name='admindivision')
def __init__(self, database='papers', collection='econpapers'):
    """Open the paper collection; ``literatures`` starts out unset.

    :param database: database name holding the papers
    :param collection: collection name holding the papers
    """
    self.literatures = None
    client = MongoDB()
    self._paper_conn = MonCollection(mongodb=client,
                                     database=database,
                                     collection_name=collection)
# coding = UTF-8 import re import pysal from pymongo import ASCENDING import pandas as pd from lib.base.database.class_mongodb import MongoDB, MonCollection from application.dataworld.admindivision.class_admindivision import AdminDivision # 1. 数据库连接 mongo = MongoDB(conn_str='localhost:27017') college_info_con = MonCollection(mongo, database='webdata', collection_name='college_info').collection entrance_score_con = MonCollection( mongo, database='webdata', collection_name='gaokao_entrancescore').collection # 2. 步骤参数设置 # a. 导出每年的高考分数数据 IS_EXPORT_RAW_EXAM_SCORE = False # b. 导出高校信息数据 IS_EXPORT_RAW_COLLEGE_INFO = False # c. 2011-2013年面板数据 IS_MERGE_INTO_PANEL = False # d. 合并高校信息数据 IS_MERGE_COLLEGE_INFO = False # e. 合并大学排名信息 IS_MERGE_COLLEGE_RATE = False # f. 合并省级经济信息 IS_MERGE_PROVINCE_PERGDP = False
class CgssDatabase:
    """Query interface for the CGSS survey data stored in MongoDB."""

    def __init__(self, data_collection=None, label_collection=None):
        """Initialize the data and label collection connections.

        :param data_collection: optional pre-built data collection
        :param label_collection: optional pre-built label collection
        """
        if data_collection is None:
            self._data_collection = MonCollection(
                database=MonDatabase(
                    mongodb=MongoDB(conn_str='localhost:27017'),
                    database_name='surveydata'),
                collection_name='cgssdata').collection
        else:
            self._data_collection = data_collection
        if label_collection is None:
            self._label_collection = MonCollection(
                database=MonDatabase(
                    mongodb=MongoDB(conn_str='localhost:27017'),
                    database_name='surveydata'),
                collection_name='cgsslabel').collection
        else:
            self._label_collection = label_collection

    def query(self, year, variables=None):
        """Return one survey year's data plus its variable and value labels.

        :param year: survey year to select
        :param variables: optional list of variable names to keep
        :return: dict with keys 'dataframe', 'variable_labels', 'value_labels'
        """
        if variables is not None:
            projection = {"_id": False}
            for var in variables:
                projection[var] = True
            found = self._data_collection.find({"year": year},
                                               projection=projection)
        else:
            found = self._data_collection.find({"year": year},
                                               projection={
                                                   "_id": False,
                                                   "year": False
                                               })
        # Stream the cursor into a DataFrame in chunks of 2000 documents.
        pdataframe = iterator2dataframes(found, 2000)
        pdataframe = pd.DataFrame(pdataframe, columns=variables)
        pdataframe.index = range(1, pdataframe.shape[0] + 1)
        return {
            "dataframe":
            pdataframe,
            "variable_labels":
            self.get_variable_label_df(year=year, variables=variables),
            "value_labels":
            self.get_variable_value_label_df(year=year, variables=variables)
        }

    def get_variable_label_df(self, year, variables=None):
        """Return a DataFrame of (variable, label) pairs for a year.

        Bug fix: the selected-variables branch used the misspelled column
        name 'lable'; both branches now produce a 'label' column.
        """
        var_label_dict = self.get_variable_label(year=year)
        if variables is not None:
            result = pd.DataFrame([(var, var_label_dict[var])
                                   for var in variables],
                                  columns=['variable', 'label'])
        else:
            result = pd.DataFrame([(var, var_label_dict[var])
                                   for var in var_label_dict],
                                  columns=['variable', 'label'])
        result.index = range(1, result.shape[0] + 1)
        return result

    def get_variable_value_label_df(self, year, variables=None):
        """Return a long DataFrame of (variable, value, label) rows.

        Variables without a value-label link are skipped.  Returns None
        when no variable has any value labels.
        """
        var_value_link = self.get_variable_value_link(year=year)
        value_labels = self.get_value_label(year=year)
        value_label_dataframe = None
        if variables is None:
            variables = [var for var in self.get_variable_label(year=year)]
        for var in variables:
            value_label = var_value_link[var]
            if len(value_label) > 0:
                # Build this variable's (value, label) rows, then prepend
                # the variable column.  (The two original branches were
                # identical; they are merged here.)
                tmp_dataframe = pd.DataFrame(
                    [(key, value_labels[value_label][key])
                     for key in value_labels[value_label]],
                    columns=["value", "label"])
                tmp_dataframe['variable'] = var
                tmp_dataframe = pd.DataFrame(
                    tmp_dataframe, columns=["variable", "value", "label"])
                if value_label_dataframe is None:
                    value_label_dataframe = tmp_dataframe
                else:
                    value_label_dataframe = pd.concat(
                        [value_label_dataframe, tmp_dataframe])
        return value_label_dataframe

    def get_variable_value_link(self, year):
        """Return the mapping from variable name to its value-label name.

        NOTE: the "lables" misspelling matches the key stored in the
        database and must not be corrected here.
        """
        return self._label_collection.find(
            {
                "type": "variable value lables",
                "year": year
            },
            projection={
                "_id": False,
                "type": False,
                "year": False
            })[0]

    def get_variable_label(self, year):
        """Return the variable-label document for a CGSS year."""
        return self._label_collection.find(
            {
                "type": "variable labels",
                "year": year
            },
            projection={
                "_id": False,
                "type": False,
                "year": False
            })[0]

    def get_value_label(self, year):
        """Return the value-label document for a CGSS year."""
        return self._label_collection.find(
            {
                "type": "value labels",
                "year": year
            },
            projection={
                "_id": False,
                "type": False,
                "year": False
            })[0]

    @property
    def year(self):
        """Distinct survey years available in the data collection."""
        return self._data_collection.find().distinct('year')
class GaoKaoWebScraper():
    """Scrapes gaokao entrance-score pages from college.gaokao.com."""

    def __init__(self):
        # One MongoDB client shared by all cache/result collections.
        mongo = MongoDB(conn_str='localhost:27017')
        self._web_conn = MonCollection(mongo, database='cache', collection_name='gaokaoweb').collection
        self._data_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaodataweb').collection
        self._university_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaouniversityweb').collection
        self._data_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokao_entrancescore').collection
        self._copy_data_web_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokaouniversityweb').collection

    def init_first_stage(self):
        """Enumerate all search-result page urls and cache them."""
        web_fmt = "http://college.gaokao.com/schpoint/{}/{}/{}/"
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        # a1..a31 / b1..b31 appear to be province/category codes -- TODO confirm.
        for i in range(1, 32):
            for j in range(1, 32):
                url = web_fmt.format(''.join(['a', str(i)]),
                                     ''.join(['b', str(j)]), 'p1')
                raw_result = requests.get(url, headers=headers).text
                bs_obj = BeautifulSoup(raw_result, "lxml")
                # The '#qx' element holds "<current>/<total>页"; keep the total.
                for string in bs_obj.select('#qx')[0].strings:
                    total_pages = re.split('页', re.split('/', string)[1])[0]
                    break
                if len(total_pages) > 0:
                    for m in range(1, int(total_pages) + 1):
                        web = web_fmt.format(''.join(['a', str(i)]),
                                             ''.join(['b', str(j)]),
                                             ''.join(['p', str(m)]))
                        record = {'type': 'search', 'url': web}
                        print(record)
                        self._web_conn.insert_one(record)

    def init_second_stage(self):
        """Visit each cached search page and cache the data-page urls."""
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        item = self._web_conn.find({'type': 'search'})
        for aitem in item:
            raw_result = requests.get(aitem['url'], headers=headers).text
            bs_obj = BeautifulSoup(raw_result, "lxml")
            for obj in bs_obj.select('.blue'):
                # Keep only links whose href contains "result".
                found = obj.find_all(href=re.compile("result"))
                if len(found) > 0:
                    url = found[0]['href']
                    record = {'type': 'data', 'url': url}
                    print(record)
                    self._data_web_conn.insert_one(record)

    def init_three_stage(self):
        """Copy the distinct data-page urls into the university-web cache."""
        university_urls = self._data_web_conn.find().distinct('url')
        for url in university_urls:
            self._university_web_conn.insert_one({'url': url})

    def scrape(self, using_proxy=False):
        """Scrape score tables in batches of 5 urls until the queue empties.

        :param using_proxy: forwarded to StaticWebScraper
        """
        # Column names (Chinese): year, min, max, mean, admitted count, batch.
        vars = ['年份', '最低', '最高', '平均', '录取人数', '录取批次']
        nums = self._copy_data_web_conn.count()
        while nums > 0:
            urls = [
                item['url'] for item in self._copy_data_web_conn.find(limit=5)
            ]
            print(urls)
            start = time.time()
            scraper = StaticWebScraper(urls=urls, using_proxy=using_proxy)
            scraper.start()
            for html in scraper.result:
                url = html[1]
                bs_obj = BeautifulSoup(html[0], "lxml")
                # Page header carries university, region and exam type.
                record = dict(
                    zip(['university', 'region', 'type'], [
                        item.contents[0]
                        for item in bs_obj.select('.btnFsxBox > font')
                    ]))
                htmlparser = HtmlParser(html_content=bs_obj)
                table = htmlparser.table('#pointbyarea > table')
                if len(table) > 0:
                    for item in table:
                        copy_record = copy.copy(record)
                        if len(item) == 0:
                            continue
                        if len(item) == 6:
                            for i in range(len(item)):
                                if i in [0, 1, 2, 3, 4]:
                                    # Numeric columns; '------' means missing.
                                    if item[i] == '------':
                                        copy_record[vars[i]] = None
                                    else:
                                        copy_record[vars[i]] = int(
                                            float(item[i]))
                                else:
                                    if item[i] == '------':
                                        copy_record[vars[i]] = None
                                    else:
                                        copy_record[vars[i]] = item[i]
                        else:
                            # Unexpected row width: abort rather than store junk.
                            raise Exception
                        # Insert only if an identical record is not stored yet.
                        found = self._data_conn.find_one(copy_record)
                        if found is None:
                            print('Insert..', copy_record)
                            self._data_conn.insert_one(copy_record)
                # Page fully processed: remove its url from the work queue.
                self._copy_data_web_conn.delete_one({'url': url})
            print('Total: {}'.format(time.time() - start))
            nums = self._copy_data_web_conn.count()
# coding = UTF-8 import re import pandas as pd from pymongo import ASCENDING from lib.base.database.class_mongodb import MongoDB, MonCollection # 1. 初始化 mongo = MongoDB(conn_str='localhost:27017') province_con = MonCollection(mongo, database='regiondata', collection_name='provinces').collection provinces = province_con.find(projection={ '_id': False, 'name': True, 'id': True }, sort=[('id', ASCENDING)]) province_dict = {province['name']: province['id'] for province in provinces} perGDP_filepath = r'E:\cyberspace\worklot\college\province_perGDP.xlsx' perGDP = pd.read_excel(perGDP_filepath) # 2.匹配省份名称 for ind in perGDP.index: region = perGDP.loc[ind, 'region'] region = re.sub('\s+', '', region) for province in province_dict: if re.search(region, province) is not None: perGDP.loc[ind, 'province'] = province perGDP.loc[ind, 'acode'] = province_dict[province]
class CollegeInfo():
    """Scrapes college information pages from college.gaokao.com."""

    def __init__(self):
        # Connections to the college info and introduction collections.
        mongo = MongoDB(conn_str='localhost:27017')
        self._college_info = MonCollection(
            mongo, database='webdata',
            collection_name='college_info').collection
        self._college_intro = MonCollection(
            mongo, database='webdata',
            collection_name='college_introduction').collection
        # Browser-like User-Agent so the site serves normal pages.
        self._headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

    def init_first_stage(self):
        """Crawl the paged college lists and store one record per college."""
        web_fmt = "http://college.gaokao.com/schlist/a{}/p{}"
        # a1..a31 appear to be province codes -- TODO confirm.
        for i in range(1, 32):
            url = web_fmt.format(str(i), '1')
            raw_result = requests.get(url, headers=self._headers).text
            bs_obj = BeautifulSoup(raw_result, "lxml")
            # The '#qx' element holds "<current>/<total>页"; keep the total.
            for string in bs_obj.select('#qx')[0].strings:
                total_pages = re.split('页', re.split('/', string)[1])[0]
                break
            for j in range(1, int(total_pages) + 1):
                surf_url = web_fmt.format(str(i), str(j))
                print(surf_url)
                surf_result = requests.get(surf_url,
                                           headers=self._headers).text
                surf_obj = BeautifulSoup(surf_result, "lxml")
                surf_content = surf_obj.select('.scores_List')[0]
                # College names come from the title attribute of .blue links.
                colleges = [
                    item.attrs['title']
                    for item in surf_content.select('.blue')
                ]
                college_info = []
                for ul_item in surf_content.select('ul'):
                    one_college_info = dict()
                    for n in range(len(ul_item.select('li'))):
                        if n == 1:
                            # Second list item flags 985/211 status by the
                            # number of badge nodes it contains.
                            college_type = (ul_item.select('li')[n]).contents
                            if len(college_type) == 1:
                                one_college_info['985'] = False
                                one_college_info['211'] = False
                            elif len(college_type) == 2:
                                if college_type[1].string == '211':
                                    one_college_info['985'] = False
                                    one_college_info['211'] = True
                                elif college_type[1].string == '985':
                                    one_college_info['985'] = True
                                    one_college_info['211'] = False
                                else:
                                    raise Exception
                            else:
                                # Both badges present: 985 and 211.
                                one_college_info['985'] = True
                                one_college_info['211'] = True
                        else:
                            # Other list items are "key:value" strings
                            # (full-width colon); dashes mean missing.
                            key, value = re.split(
                                ':', (ul_item.select('li')[n]).string)
                            if value == '——' or value == '------':
                                value = None
                            one_college_info[key] = value
                    college_info.append(one_college_info)
                # Attach the college name ('学校') to each parsed record.
                for m in range(len(colleges)):
                    college_info[m]['学校'] = colleges[m]
                # Insert only records not already stored.
                for college in college_info:
                    found = self._college_info.find_one(college)
                    if found is None:
                        print('Insert..', college)
                        self._college_info.insert_one(college)
# coding = UTF-8 import pandas as pd from lib.base.database.class_mongodb import MongoDB, MonCollection mongo = MongoDB(conn_str='localhost:27017') college_info_con = MonCollection(mongo, database='webdata', collection_name='college_info').collection found = college_info_con.find({"高校性质" : "本科"}, projection={'_id':False, '高校所在地':True, '学校':True}) college_pd = pd.DataFrame(list(found)) college_pd.to_excel(r'E:\cyberspace\worklot\college\colleges.xlsx')
def __init__(self, database='proxy', collection_name='proxys'):
    """Open the MongoDB collection that stores the proxy list."""
    client = MongoDB()
    self._conn = MonCollection(mongodb=client,
                               database=database,
                               collection_name=collection_name)
# coding = UTF-8 import re from pymongo import ASCENDING from lib.base.database.class_mongodb import MongoDB, MonCollection mongo = MongoDB(conn_str='localhost:27017') province_con = MonCollection(mongo, database='regiondata', collection_name='provinces').collection college_info_con = MonCollection(mongo, database='webdata', collection_name='college_info').collection entrance_score_con = MonCollection( mongo, database='webdata', collection_name='gaokao_entrancescore').collection provinces = province_con.find(projection={ '_id': False, 'name': True, 'id': True }, sort=[('id', ASCENDING)]) province_dict = {province['name']: province['id'] for province in provinces} COLLEGE_INFO = False ENTRANCE_EXAM = True if COLLEGE_INFO: for item in college_info_con.find(projection={ '_id': True,
def __init__(self, journals_webs=None):
    """Set up the journal-website list and the paper database connections.

    :param journals_webs: optional list of journal website entries
    """
    self._journal_websites = journals_webs
    # Reuse one MongoDB client for both collections (the original opened
    # two separate connections to the same server), matching the shared
    # client pattern used elsewhere in this project.
    client = MongoDB()
    self._pre_conn = MonCollection(mongodb=client,
                                   database='papers',
                                   collection_name='econpaperwebsites')
    self._paper_conn = MonCollection(mongodb=client,
                                     database='papers',
                                     collection_name='econpapers')
# coding = UTF-8 import os import pickle import numpy as np import pandas as pd from lib.base.database.class_mongodb import MongoDB, MonDatabase, MonCollection mongo = MongoDB( conn_str= 'mongodb://*****:*****@dds-bp162bb74b8184e41658-pub.mongodb.rds.aliyuncs.com:3717' ) mdb = MonDatabase(mongodb=mongo, database_name='enterprise') mcon = MonCollection(mongo, mdb, 'cross_holding_data') PROJECT_DATA_PATH = r'E:\datahouse\projectdata\shareholder' file_path = os.path.join(PROJECT_DATA_PATH, 'cross_holding_main_table.xls') cross_holding_data_table = pd.read_excel(file_path) vars = list(cross_holding_data_table.columns) var_dtype = dict(zip(vars, [str] * len(vars))) print(var_dtype) cross_holding_data_table = pd.read_excel(file_path, dtype=var_dtype) #cross_holding_data_table = cross_holding_data_table.replace('nan',None) records = cross_holding_data_table.to_dict('records')
type=1) abstract = item.get('abstract') if abstract is not None: abstract = escape_latex(abstract) doc.document.append(abstract) #doc.document.generate_tex(r'E:\github\latexdoc\latexdoc\template\academicjournal\wlscirep\plutopaper.tex') doc.document.generate_pdf( r'D:\github\pluto\lib\base\pylatex\template\output\{}'.format( file_name)) if __name__ == '__main__': report = LiteratureReport() conn = MonCollection(mongodb=MongoDB(), database='papers', collection_name='econpapers') journals = conn.collection.find().distinct('journal') for journal in journals: print(journal) if journal == 'Econometrica': report.load_record_from_db(query={ 'journal': journal, 'year': { '$gte': 2012 } }, sort=[('journal', ASCENDING), ('year', DESCENDING)]) report.to_report(file_name=journal)