import copy
import re
import time

import requests
from bs4 import BeautifulSoup

# MongoDB/MonCollection, StaticWebScraper and HtmlParser are project-local
# helpers; the import paths below are assumed and may need adjusting.
from mongodb import MongoDB, MonCollection
from webscraper import StaticWebScraper, HtmlParser


class CollegeInfo:
    """Scrape basic college information from college.gaokao.com into MongoDB."""

    def __init__(self):
        mongo = MongoDB(conn_str='localhost:27017')
        self._college_info = MonCollection(
            mongo, database='webdata',
            collection_name='college_info').collection
        self._college_intro = MonCollection(
            mongo, database='webdata',
            collection_name='college_introduction').collection
        self._headers = {
            'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/50.0.2661.102 Safari/537.36')
        }

    def init_first_stage(self):
        web_fmt = 'http://college.gaokao.com/schlist/a{}/p{}'
        # a1..a31 are the site's region codes.
        for i in range(1, 32):
            url = web_fmt.format(i, 1)
            raw_result = requests.get(url, headers=self._headers).text
            bs_obj = BeautifulSoup(raw_result, 'lxml')
            # The pager inside '#qx' reads like '1/12页'; take the total
            # page count from its first string.
            total_pages = '0'
            for string in bs_obj.select('#qx')[0].strings:
                total_pages = re.split('页', re.split('/', string)[1])[0]
                break
            for j in range(1, int(total_pages) + 1):
                surf_url = web_fmt.format(i, j)
                print(surf_url)
                surf_result = requests.get(
                    surf_url, headers=self._headers).text
                surf_obj = BeautifulSoup(surf_result, 'lxml')
                surf_content = surf_obj.select('.scores_List')[0]
                # College names are the title attributes of the '.blue' links.
                colleges = [
                    item.attrs['title']
                    for item in surf_content.select('.blue')
                ]
                college_info = []
                for ul_item in surf_content.select('ul'):
                    one_college_info = dict()
                    li_items = ul_item.select('li')
                    for n in range(len(li_items)):
                        if n == 1:
                            # The second <li> carries the 985/211 badges:
                            # one child means neither, two means exactly one
                            # of them, more means both.
                            college_type = li_items[n].contents
                            if len(college_type) == 1:
                                one_college_info['985'] = False
                                one_college_info['211'] = False
                            elif len(college_type) == 2:
                                if college_type[1].string == '211':
                                    one_college_info['985'] = False
                                    one_college_info['211'] = True
                                elif college_type[1].string == '985':
                                    one_college_info['985'] = True
                                    one_college_info['211'] = False
                                else:
                                    raise Exception(
                                        'unexpected college type badge')
                            else:
                                one_college_info['985'] = True
                                one_college_info['211'] = True
                        else:
                            # The other <li> items are 'key:value' pairs;
                            # dashes mean the value is missing.
                            key, value = re.split(':', li_items[n].string)
                            if value in ('——', '------'):
                                value = None
                            one_college_info[key] = value
                    college_info.append(one_college_info)
                for m in range(len(colleges)):
                    college_info[m]['学校'] = colleges[m]
                for college in college_info:
                    # Insert only records that are not already stored.
                    if self._college_info.find_one(college) is None:
                        print('Insert..', college)
                        self._college_info.insert_one(college)
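
# A minimal usage sketch, assuming a MongoDB instance is reachable at
# localhost:27017 and the wrapper classes imported above are available:
#
#     CollegeInfo().init_first_stage()
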
class GaoKaoWebScraper:
    """Collect entrance-score pages from college.gaokao.com and store the
    parsed score tables in MongoDB."""

    def __init__(self):
        mongo = MongoDB(conn_str='localhost:27017')
        self._web_conn = MonCollection(
            mongo, database='cache', collection_name='gaokaoweb').collection
        self._data_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaodataweb').collection
        self._university_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaouniversityweb').collection
        self._data_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokao_entrancescore').collection
        self._copy_data_web_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokaouniversityweb').collection
        self._headers = {
            'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/50.0.2661.102 Safari/537.36')
        }

    def init_first_stage(self):
        """Enumerate every search-result page (region codes a1..a31 crossed
        with b1..b31) and cache the page URLs."""
        web_fmt = 'http://college.gaokao.com/schpoint/{}/{}/{}/'
        for i in range(1, 32):
            for j in range(1, 32):
                url = web_fmt.format('a{}'.format(i), 'b{}'.format(j), 'p1')
                raw_result = requests.get(url, headers=self._headers).text
                bs_obj = BeautifulSoup(raw_result, 'lxml')
                # The pager inside '#qx' reads like '1/12页'.
                total_pages = ''
                for string in bs_obj.select('#qx')[0].strings:
                    total_pages = re.split('页', re.split('/', string)[1])[0]
                    break
                if len(total_pages) > 0:
                    for m in range(1, int(total_pages) + 1):
                        web = web_fmt.format(
                            'a{}'.format(i), 'b{}'.format(j),
                            'p{}'.format(m))
                        record = {'type': 'search', 'url': web}
                        print(record)
                        self._web_conn.insert_one(record)

    def init_second_stage(self):
        """Visit each cached search page and collect links to the
        per-university score pages."""
        for aitem in self._web_conn.find({'type': 'search'}):
            raw_result = requests.get(
                aitem['url'], headers=self._headers).text
            bs_obj = BeautifulSoup(raw_result, 'lxml')
            for obj in bs_obj.select('.blue'):
                found = obj.find_all(href=re.compile('result'))
                if len(found) > 0:
                    record = {'type': 'data', 'url': found[0]['href']}
                    print(record)
                    self._data_web_conn.insert_one(record)

    def init_three_stage(self):
        """Deduplicate the collected score-page URLs."""
        for url in self._data_web_conn.find().distinct('url'):
            self._university_web_conn.insert_one({'url': url})

    def scrape(self, using_proxy=False):
        """Download the queued score pages in batches of five, parse each
        score table and store one record per row; a URL is removed from the
        queue only after its page has been processed."""
        fields = ['年份', '最低', '最高', '平均', '录取人数', '录取批次']
        while self._copy_data_web_conn.count_documents({}) > 0:
            urls = [
                item['url']
                for item in self._copy_data_web_conn.find(limit=5)
            ]
            print(urls)
            start = time.time()
            scraper = StaticWebScraper(urls=urls, using_proxy=using_proxy)
            scraper.start()
            for html in scraper.result:
                url = html[1]
                bs_obj = BeautifulSoup(html[0], 'lxml')
                # The page header carries the university name, region and
                # exam type.
                record = dict(
                    zip(['university', 'region', 'type'], [
                        item.contents[0]
                        for item in bs_obj.select('.btnFsxBox > font')
                    ]))
                htmlparser = HtmlParser(html_content=bs_obj)
                table = htmlparser.table('#pointbyarea > table')
                if len(table) > 0:
                    for item in table:
                        if len(item) == 0:
                            continue
                        if len(item) != 6:
                            raise Exception(
                                'unexpected table row: {}'.format(item))
                        copy_record = copy.copy(record)
                        for i in range(len(item)):
                            if item[i] == '------':
                                copy_record[fields[i]] = None
                            elif i < 5:
                                # The first five columns are numeric.
                                copy_record[fields[i]] = int(float(item[i]))
                            else:
                                copy_record[fields[i]] = item[i]
                        if self._data_conn.find_one(copy_record) is None:
                            print('Insert..', copy_record)
                            self._data_conn.insert_one(copy_record)
                self._copy_data_web_conn.delete_one({'url': url})
            print('Total: {}'.format(time.time() - start))
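
# A minimal driver sketch: run the stages in order to build the URL queues,
# then scrape. It assumes MongoDB at localhost:27017 and that the
# StaticWebScraper/HtmlParser helpers behave as used above. Note that
# scrape() consumes webdata.gaokaouniversityweb while init_three_stage()
# fills cache.gaokaouniversityweb, so the original code apparently relies
# on a copy step between the two databases that is not shown here.
if __name__ == '__main__':
    scraper = GaoKaoWebScraper()
    scraper.init_first_stage()   # cache search-result page URLs
    scraper.init_second_stage()  # collect per-university score-page URLs
    scraper.init_three_stage()   # deduplicate the score-page URLs
    scraper.scrape(using_proxy=False)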