class ProxyManager:
    """Manage, validate and refresh the pool of proxy servers stored in MongoDB.

    :param str database: name of the MongoDB database holding the proxies
    :param str collection_name: name of the collection inside that database
    """

    def __init__(self, database='proxy', collection_name='proxys'):
        # Open the proxy collection (MonCollection exposes a find() wrapper).
        self._conn = MonCollection(mongodb=MongoDB(),
                                   database=database,
                                   collection_name=collection_name)

    def find(self, type=0, limit=None):
        """Return a set of proxy addresses ordered by score (desc) and speed (asc).

        :param int type: 0 selects HTTP-capable proxies, any other value
            selects HTTPS-capable ones
        :param int limit: maximum number of records to fetch, or None for all
        :return: set of proxy address strings
        """
        # Protocol 2 marks a proxy that supports both HTTP and HTTPS, so it is
        # acceptable for either requested type.  Computing the protocol list
        # up front removes the two duplicated query branches of the original.
        protocols = [0, 2] if type == 0 else [1, 2]
        found = self._conn.find(
            filter={'protocol': {'$in': protocols}},
            projection={'_id': False, 'ip': True, 'port': True, 'type': True},
            sort=[('score', DESCENDING), ('speed', ASCENDING)],
            limit=limit)
        return {
            Proxy(ip=item['ip'], port=item['port'], type=type).address
            for item in found
        }

    @property
    def random_proxy(self):
        """Return one proxy address picked uniformly from the top 100."""
        return random.choice(list(self.find(limit=100)))

    @property
    def top_150_proxies(self):
        """Return a list of up to 150 of the best-ranked proxy addresses."""
        return list(self.find(limit=150))
class CityStatisticsDatabase:
    """Read-only interface to the China city statistics MongoDB collection."""

    def __init__(self):
        """Connect to the 'citystatistics' collection in the 'regiondata' DB."""
        mongo = MongoDB(conn_str='localhost:27017')
        self.conn = MonCollection(mongo, database='regiondata',
                                  collection_name='citystatistics').collection

    def find(self, *args, **kwargs):
        """Run a raw query and pivot the result.

        :param args: positional arguments forwarded to ``pymongo`` ``find``
        :param kwargs: keyword arguments forwarded to ``pymongo`` ``find``
        :return: DataFrame indexed by (acode, year, region) with one column
            per "variable(unit)" label, or None when nothing matched
        """
        records = list(self.conn.find(*args, **kwargs))
        if not records:
            return None
        frame = pd.DataFrame(records)
        # Column label combines the variable name with its unit, e.g. "GDP(亿元)".
        frame['var'] = frame['variable'] + frame['unit'].apply(
            lambda u: '(' + u + ')')
        pivoted = pd.pivot_table(frame, values='value',
                                 index=['year', 'acode', 'region'],
                                 columns=['var'])
        # Put the administrative code ahead of the year in the row index.
        return pivoted.swaplevel(0, 1, axis=0)

    @property
    def variables(self):
        """DataFrame of all distinct variable names, sorted alphabetically."""
        names = self.conn.find().distinct('variable')
        return pd.DataFrame(sorted(names))

    @property
    def regions(self):
        """Placeholder — region listing is not implemented yet."""
        return None
class AdminDivisionDatabase():
    """Accessor for the 'admindivision' collection of the 'region' database."""

    def __init__(self):
        # Wire up the admindivision collection.
        mongo = MongoDB()
        region_db = MonDatabase(mongodb=mongo, database_name='region')
        self.collection = MonCollection(database=region_db,
                                        collection_name='admindivision')

    def find(self, **conds):
        """Query administrative divisions.

        Special keyword arguments ``projection`` and ``sorts`` override the
        defaults; every remaining keyword becomes a query condition, with
        list values translated into ``$in`` clauses.

        :return: sorted cursor over the matching documents
        """
        # Pull out the projection override, falling back to the standard fields.
        projection = conds.get('projection')
        if projection is not None:
            del conds['projection']
        else:
            projection = {'region': 1, 'year': 1, 'adminlevel': 1,
                          'acode': 1, '_id': 1, 'parent': 1, 'uid': 1}

        # Pull out the sort override, defaulting to year then code ascending.
        sorts = conds.get('sorts')
        if sorts is not None:
            del conds['sorts']
        else:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        # Remaining keywords form the filter; lists become $in membership tests.
        condition = {
            key: ({'$in': value} if isinstance(value, list) else value)
            for key, value in conds.items()
        }
        return self.collection.find(condition, projection).sort(sorts)

    @property
    def period(self):
        """Sorted list of all years present in the collection."""
        return sorted(self.find().distinct('year'))
# Feature flags selecting which merge/enrichment steps of the larger script
# run on this invocation; only the per-capita GDP step is enabled here.
TEMP2 = False
IS_MERGE_CITY_STAT = False
# h. merge each university's founding year
IS_MERGE_START_YEAR = False
# i. add dummy variables for local and nearby universities
IS_ADD_LOCAL_VAR = False
IS_ADD_NEARBY_VAR = False
# j. add local real per-capita GDP information
IS_ADD_LOCAL_PERGDP = True

# NOTE(review): IS_EXPORT_RAW_EXAM_SCORE, entrance_score_con, ASCENDING and
# pd are defined earlier in the file, outside this chunk.
if IS_EXPORT_RAW_EXAM_SCORE:
    # Export one cross-section workbook per year (2010-2017) of liberal-arts
    # ('文科') first-batch ('第一批') admission scores, sorted by region code
    # and university name.
    for year in range(2010, 2018):
        found = entrance_score_con.find(
            {
                '年份': year,
                'type': '文科',
                "录取批次": "第一批"
            },
            sort=[('regioncode', ASCENDING), ('university', ASCENDING)])
        raw_dataframe = pd.DataFrame(list(found))
        raw_dataframe.to_excel(
            r'E:\cyberspace\worklot\college\dataset\raw\{}年高考文科第一批录取分数横截面数据.xlsx'
            .format(str(year)))
        # Second-batch ('第二批') query; the processing of this cursor
        # continues beyond this chunk.
        found = entrance_score_con.find(
            {
                '年份': year,
                'type': '文科',
                "录取批次": "第二批"
            },
            sort=[('regioncode', ASCENDING), ('university', ASCENDING)])
# coding = UTF-8
"""Dump the name and location of all undergraduate colleges to an Excel file."""
import pandas as pd
from lib.base.database.class_mongodb import MongoDB, MonCollection

# Connect to the webdata.college_info collection.
mongo = MongoDB(conn_str='localhost:27017')
college_info_con = MonCollection(mongo, database='webdata',
                                 collection_name='college_info').collection

# Select undergraduate ('本科') institutions, keeping only the location
# ('高校所在地') and school name ('学校') fields, and export them.
records = college_info_con.find(
    {"高校性质": "本科"},
    projection={'_id': False, '高校所在地': True, '学校': True})
pd.DataFrame(list(records)).to_excel(
    r'E:\cyberspace\worklot\college\colleges.xlsx')
# coding = UTF-8
"""Attach full province names and administrative codes to per-capita GDP rows."""
import re
import pandas as pd
from pymongo import ASCENDING
from lib.base.database.class_mongodb import MongoDB, MonCollection

# 1. Load the province name -> administrative id mapping from MongoDB,
#    ordered by id for reproducibility.
mongo = MongoDB(conn_str='localhost:27017')
province_con = MonCollection(mongo, database='regiondata',
                             collection_name='provinces').collection
provinces = province_con.find(projection={
    '_id': False,
    'name': True,
    'id': True
}, sort=[('id', ASCENDING)])
province_dict = {province['name']: province['id'] for province in provinces}

perGDP_filepath = r'E:\cyberspace\worklot\college\province_perGDP.xlsx'
perGDP = pd.read_excel(perGDP_filepath)

# 2. Match each record's (possibly abbreviated) region name against the full
#    province names and record the canonical name and code.
for ind in perGDP.index:
    region = perGDP.loc[ind, 'region']
    # Strip all whitespace.  Raw string fixes the original's '\s' escape,
    # which is an invalid string escape (SyntaxWarning on modern Python).
    region = re.sub(r'\s+', '', region)
    for province in province_dict:
        # Plain substring containment replaces re.search(region, province):
        # the original treated spreadsheet data as a regex pattern, which
        # would misbehave if a name contained metacharacters.
        if region in province:
            perGDP.loc[ind, 'province'] = province
            perGDP.loc[ind, 'acode'] = province_dict[province]
# NOTE(review): this closing brace ends the `college_tranform` mapping whose
# opening lines sit above this chunk; it appears to map alternate college
# names to the canonical names used by the entrance-score collection —
# confirm against the full file.  (`tranform` is a typo for 'transform'
# kept as-is for compatibility.)
}


def college_replace(college_name):
    """Return the canonical form of *college_name*.

    Falls back to the input unchanged when no mapping entry exists.
    """
    if college_name in college_tranform:
        return college_tranform[college_name]
    else:
        return college_name


# 1. Database connections
mongo = MongoDB(conn_str='localhost:27017')
college_info_con = MonCollection(mongo, database='webdata',
                                 collection_name='college_info').collection
entrance_score_con = MonCollection(
    mongo, database='webdata',
    collection_name='gaokao_entrancescore').collection

# 2. Distinct university names present in the entrance-score collection
entrance_colleges = entrance_score_con.find().distinct('university')

# 3. Load the 2011 alumni-association ranking and print every college whose
#    canonicalized name is missing from the entrance-score data.
college_rate_2011_filepath = r'E:\cyberspace\worklot\college\2011年校友会大学排名.xlsx'
college_2011 = pd.read_excel(college_rate_2011_filepath)
college_2011['学校名称'] = college_2011['学校名称'].apply(college_replace)
for item in college_2011['学校名称']:
    if item not in entrance_colleges:
        print(item)

# Same consistency check for the 2012 ranking.
college_rate_2012_filepath = r'E:\cyberspace\worklot\college\2012年校友会大学排名.xlsx'
college_2012 = pd.read_excel(college_rate_2012_filepath)
college_2012['学校名称'] = college_2012['学校名称'].apply(college_replace)
for item in college_2012['学校名称']:
    if item not in entrance_colleges:
        print(item)
class CgssDatabase:
    """Accessor for CGSS survey data and its label metadata in MongoDB."""

    def __init__(self, data_collection=None, label_collection=None):
        """Initialize database connections.

        :param data_collection: optional pre-built data collection; defaults
            to surveydata.cgssdata on localhost
        :param label_collection: optional pre-built label collection; defaults
            to surveydata.cgsslabel on localhost
        """
        if data_collection is None:
            self._data_collection = MonCollection(
                database=MonDatabase(
                    mongodb=MongoDB(conn_str='localhost:27017'),
                    database_name='surveydata'),
                collection_name='cgssdata').collection
        else:
            self._data_collection = data_collection
        if label_collection is None:
            self._label_collection = MonCollection(
                database=MonDatabase(
                    mongodb=MongoDB(conn_str='localhost:27017'),
                    database_name='surveydata'),
                collection_name='cgsslabel').collection
        else:
            self._label_collection = label_collection

    def query(self, year, variables=None):
        """Return one survey year's data plus its label metadata.

        :param year: survey year to fetch
        :param variables: optional list of variable names to restrict to
        :return: dict with keys 'dataframe', 'variable_labels', 'value_labels'
        """
        if variables is not None:
            projection = {"_id": False}
            for var in variables:
                projection[var] = True
            found = self._data_collection.find({"year": year},
                                               projection=projection)
        else:
            found = self._data_collection.find({"year": year},
                                               projection={
                                                   "_id": False,
                                                   "year": False
                                               })
        # Stream the cursor into a DataFrame in chunks of 2000 records.
        pdataframe = iterator2dataframes(found, 2000)
        pdataframe = pd.DataFrame(pdataframe, columns=variables)
        # 1-based row index by convention of this class.
        pdataframe.index = range(1, pdataframe.shape[0] + 1)
        return {
            "dataframe": pdataframe,
            "variable_labels": self.get_variable_label_df(year=year,
                                                          variables=variables),
            "value_labels": self.get_variable_value_label_df(
                year=year, variables=variables)
        }

    def get_variable_label_df(self, year, variables=None):
        """Return a DataFrame of (variable, label) pairs with a 1-based index.

        :param year: survey year
        :param variables: optional subset of variable names; defaults to all
        """
        var_label_dict = self.get_variable_label(year=year)
        if variables is None:
            variables = list(var_label_dict)
        # BUG FIX: the original spelled the column 'lable' when an explicit
        # variable list was given, producing an inconsistent schema between
        # the two branches.
        result = pd.DataFrame([(var, var_label_dict[var])
                               for var in variables],
                              columns=['variable', 'label'])
        result.index = range(1, result.shape[0] + 1)
        return result

    def get_variable_value_label_df(self, year, variables=None):
        """Return a long-format DataFrame of (variable, value, label) rows.

        Variables without value labels are skipped; returns None when no
        requested variable has any value labels.
        """
        var_value_link = self.get_variable_value_link(year=year)
        value_labels = self.get_value_label(year=year)
        if variables is None:
            variables = list(self.get_variable_label(year=year))
        frames = []
        for var in variables:
            link = var_value_link[var]
            if len(link) == 0:
                continue  # variable carries no value labels
            frame = pd.DataFrame(
                [(value, value_labels[link][value])
                 for value in value_labels[link]],
                columns=["value", "label"])
            frame['variable'] = var
            frames.append(
                pd.DataFrame(frame, columns=["variable", "value", "label"]))
        if not frames:
            return None
        return pd.concat(frames)

    def get_variable_value_link(self, year):
        """Return the variable -> value-label-set association for *year*."""
        return self._label_collection.find(
            {
                "type": "variable value lables",
                "year": year
            },
            projection={
                "_id": False,
                "type": False,
                "year": False
            })[0]

    def get_variable_label(self, year):
        """Return the CGSS variable-label mapping for *year*."""
        return self._label_collection.find(
            {
                "type": "variable labels",
                "year": year
            },
            projection={
                "_id": False,
                "type": False,
                "year": False
            })[0]

    def get_value_label(self, year):
        """Return the CGSS value-label mapping for *year*."""
        return self._label_collection.find(
            {
                "type": "value labels",
                "year": year
            },
            projection={
                "_id": False,
                "type": False,
                "year": False
            })[0]

    @property
    def year(self):
        """Return the distinct survey years available in the database."""
        return self._data_collection.find().distinct('year')
class GaoKaoWebScraper():
    """Scraper for college entrance-exam admission scores from college.gaokao.com.

    Works in staged passes: collect search-result page URLs, extract per-
    university data URLs from them, then scrape score tables into MongoDB.
    """

    def __init__(self):
        # Cache collections hold URLs discovered at each stage; webdata
        # collections hold the scraped scores and the work queue.
        mongo = MongoDB(conn_str='localhost:27017')
        self._web_conn = MonCollection(mongo, database='cache',
                                       collection_name='gaokaoweb').collection
        self._data_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaodataweb').collection
        self._university_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaouniversityweb').collection
        self._data_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokao_entrancescore').collection
        # Work queue: URLs are deleted from here as scrape() processes them.
        self._copy_data_web_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokaouniversityweb').collection

    def init_first_stage(self):
        """Enumerate all search-result pages and store their URLs.

        Iterates region (a1..a31) x subject-or-batch (b1..b31) combinations,
        reads the total page count from the first result page, then records
        one 'search' URL per page.
        """
        web_fmt = "http://college.gaokao.com/schpoint/{}/{}/{}/"
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        for i in range(1, 32):
            for j in range(1, 32):
                url = web_fmt.format(''.join(['a', str(i)]),
                                     ''.join(['b', str(j)]), 'p1')
                raw_result = requests.get(url, headers=headers).text
                bs_obj = BeautifulSoup(raw_result, "lxml")
                # The '#qx' element contains text like "1/12页"; take the
                # count after '/' and before '页' (pages).
                for string in bs_obj.select('#qx')[0].strings:
                    total_pages = re.split('页', re.split('/', string)[1])[0]
                    break
                if len(total_pages) > 0:
                    for m in range(1, int(total_pages) + 1):
                        web = web_fmt.format(''.join(['a', str(i)]),
                                             ''.join(['b', str(j)]),
                                             ''.join(['p', str(m)]))
                        record = {'type': 'search', 'url': web}
                        print(record)
                        self._web_conn.insert_one(record)

    def init_second_stage(self):
        """Visit every stored search page and record per-university data URLs."""
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        item = self._web_conn.find({'type': 'search'})
        for aitem in item:
            raw_result = requests.get(aitem['url'], headers=headers).text
            bs_obj = BeautifulSoup(raw_result, "lxml")
            # Links whose href contains 'result' point at score-detail pages.
            for obj in bs_obj.select('.blue'):
                found = obj.find_all(href=re.compile("result"))
                if len(found) > 0:
                    url = found[0]['href']
                    record = {'type': 'data', 'url': url}
                    print(record)
                    self._data_web_conn.insert_one(record)

    def init_three_stage(self):
        """Copy the deduplicated data URLs into the university-URL collection."""
        university_urls = self._data_web_conn.find().distinct('url')
        for url in university_urls:
            self._university_web_conn.insert_one({'url': url})

    def scrape(self, using_proxy=False):
        """Scrape admission-score tables for every queued URL.

        Processes the work queue in batches of 5 URLs; each scraped row is
        inserted (if new) into the score collection and the URL is removed
        from the queue, so the loop ends when the queue is empty.

        :param bool using_proxy: forwarded to StaticWebScraper
        """
        # Table columns: year, min, max, mean, admitted count, admission batch.
        vars = ['年份', '最低', '最高', '平均', '录取人数', '录取批次']
        nums = self._copy_data_web_conn.count()
        while nums > 0:
            urls = [
                item['url'] for item in self._copy_data_web_conn.find(limit=5)
            ]
            print(urls)
            start = time.time()
            scraper = StaticWebScraper(urls=urls, using_proxy=using_proxy)
            scraper.start()
            # scraper.result items are (html_text, url) pairs —
            # presumably; confirm against StaticWebScraper.
            for html in scraper.result:
                url = html[1]
                bs_obj = BeautifulSoup(html[0], "lxml")
                # Page header holds university, region and exam type.
                record = dict(
                    zip(['university', 'region', 'type'], [
                        item.contents[0]
                        for item in bs_obj.select('.btnFsxBox > font')
                    ]))
                htmlparser = HtmlParser(html_content=bs_obj)
                table = htmlparser.table('#pointbyarea > table')
                if len(table) > 0:
                    for item in table:
                        copy_record = copy.copy(record)
                        if len(item) == 0:
                            continue
                        if len(item) == 6:
                            for i in range(len(item)):
                                # First five columns are numeric; '------'
                                # marks a missing value.
                                if i in [0, 1, 2, 3, 4]:
                                    if item[i] == '------':
                                        copy_record[vars[i]] = None
                                    else:
                                        copy_record[vars[i]] = int(
                                            float(item[i]))
                                else:
                                    if item[i] == '------':
                                        copy_record[vars[i]] = None
                                    else:
                                        copy_record[vars[i]] = item[i]
                        else:
                            # Unexpected column count: abort rather than
                            # store a malformed row.
                            raise Exception
                        # Insert only if an identical record is not present.
                        found = self._data_conn.find_one(copy_record)
                        if found is None:
                            print('Insert..', copy_record)
                            self._data_conn.insert_one(copy_record)
                # Mark this URL as done regardless of table content.
                self._copy_data_web_conn.delete_one({'url': url})
            print('Total: {}'.format(time.time() - start))
            nums = self._copy_data_web_conn.count()
from lib.base.database.class_mongodb import MongoDB, MonCollection mongo = MongoDB(conn_str='localhost:27017') province_con = MonCollection(mongo, database='regiondata', collection_name='provinces').collection college_info_con = MonCollection(mongo, database='webdata', collection_name='college_info').collection entrance_score_con = MonCollection( mongo, database='webdata', collection_name='gaokao_entrancescore').collection provinces = province_con.find(projection={ '_id': False, 'name': True, 'id': True }, sort=[('id', ASCENDING)]) province_dict = {province['name']: province['id'] for province in provinces} COLLEGE_INFO = False ENTRANCE_EXAM = True if COLLEGE_INFO: for item in college_info_con.find(projection={ '_id': True, '高校所在地': True, '学校': True }): location = item['高校所在地'] for province in province_dict: