class CEESpiderSina:
    """
    Scrape College Entrance Examination (gaokao) score data from Sina.

    Raw text of every visited result page is accumulated in ``self.result``;
    the ``colleges`` property parses that text into dictionaries on demand.
    """

    # Pager element that leads to the next result page.
    _NEXT_PAGE_SELECTOR = '.pageNumWrap > [node-type="next"]'
    # Element whose text holds the score table of the current page.
    _TABLE_SELECTOR = '#scoreTable2'

    def __init__(self, proxy=None):
        """
        Open the Sina score-ranking page in an automated browser.

        :param proxy: proxy setting passed through unchanged to ``AutoBrowser``
        """
        self.region = ''
        self.result = []
        # Set by do_search(); initialised here so the attribute always exists.
        self.current_url = ''
        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf(
            'http://kaoshi.edu.sina.com.cn/college/collegeAvgScoreRank?syear=2013&provid=1',
            ready_check=(By.CLASS_NAME, 'pageNumWrap'))

    def select_region(self, region):
        """
        Choose the province and remember it for later result rows.

        :param str region: province name as shown in the ``#provSel`` control
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('#provSel', select_text=region)

    def select_subject(self, subject='文科'):
        """
        Choose the subject track in the ``#typeSel`` control.

        :param str subject: subject track text (the default is liberal arts)
        :return: None
        """
        self.browser.interact_one_time('#typeSel', select_text=subject)

    def select_year(self, year='2014'):
        """
        Choose the year in the ``#sYear`` control.

        :param str year: year text
        :return: None
        """
        self.browser.interact_one_time('#sYear', select_text=year)

    def select_batch(self, batch='本科一批'):
        """
        Choose the admission batch in the ``#sBatch`` control.

        :param str batch: admission batch text
        :return: None
        """
        self.browser.interact_one_time('#sBatch', select_text=batch)

    def do_search(self):
        """
        Click the search button and wait for the pager to appear.

        :return: None
        :raises TimeoutError: if the pager is not ready after the click
        """
        self.browser.interact_one_time('#searchBtn', click=True)
        if not self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
            raise TimeoutError
        self.current_url = self.browser.browser.current_url
        time.sleep(5)

    def clear(self):
        """
        Discard all page texts collected so far.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """
        Append the text of the current page and of every following page to
        ``self.result``.

        Paging stops when the "next" pager element can no longer be found.

        :return: None
        :raises TimeoutError: if the pager is not ready after turning a page
        """
        self.result.append(
            self.browser.get_text(location=self._TABLE_SELECTOR, beautiful=False))
        while True:
            try:
                # Probe for the "next" button; a missing button ends the loop.
                # find_element(By, value) replaces the find_element_by_*
                # helpers that newer Selenium releases removed.
                self.browser.browser.find_element(
                    By.CSS_SELECTOR, self._NEXT_PAGE_SELECTOR)
                self.browser.interact_one_time(self._NEXT_PAGE_SELECTOR, click=True)
                time.sleep(2)
                if not self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
                    raise TimeoutError
                self.result.append(
                    self.browser.get_text(location=self._TABLE_SELECTOR, beautiful=False))
            except NoSuchElementException:
                break

    @property
    def colleges(self):
        """
        Parse the collected page texts into one dictionary per table row.

        Each line is split on whitespace; tokens 1..7 are used (token 0 is
        dropped) and the selected region is appended as ``student_region``.
        Lines that do not yield all eight fields are skipped.

        :return: list of row dictionaries (all values are strings)
        :rtype: list
        """
        fields = ['university', 'type', 'university_region', 'average_score',
                  'subject', 'year', 'batch', 'student_region']
        rows = []
        for page_text in self.result:
            for line in page_text.split('\n'):
                values = re.split(r'\s+', line)[1:8]
                values.append(self.region)
                # Header lines and blank lines produce too few tokens.
                if len(values) == len(fields):
                    rows.append(dict(zip(fields, values)))
        return rows

    def close(self):
        """
        Shut down the browser.

        :return: None
        """
        self.browser.quit()
class Cnki:
    """
    Drive the CNKI journal search page and collect exported literature records.

    Every exported page is stored as a parsed document in ``self.soups``;
    the ``export_to_*`` methods turn those documents into files or dicts.
    """

    # NoteExpress tags whose value is split on ';' into a list -> output key.
    _LIST_FIELDS = (
        ('{Author Address}', 'address'),
        ('{Keywords}', 'keyword'),
    )
    # NoteExpress tags stored as a single string -> output key.
    # NOTE(review): 'issure' looks like a typo for 'issue', but it is an
    # output key that consumers may rely on, so it is kept as is.
    _SCALAR_FIELDS = (
        ('{Journal}', 'journal'),
        ('{Year}', 'year'),
        ('{Issue}', 'issure'),
        ('{Pages}', 'pages'),
        ('{Abstract}', 'abstract'),
        ('{ISBN/ISSN}', 'ISBN/ISSN'),
    )

    def __init__(self, proxy=None):
        """
        Open the CNKI result page in an automated browser.

        :param proxy: proxy setting passed through unchanged to ``AutoBrowser``
        """
        self.soups = list()
        self.more = True
        self.browser = AutoBrowser(proxy=proxy)
        self.browser.surf('http://epub.cnki.net/kns/brief/result.aspx?dbprefix=CJFQ',
                          ready_check=(By.CSS_SELECTOR, '#bottom'))
        time.sleep(2)

    def submit(self):
        """
        Click the search button to run the query.

        :return: None
        """
        self.browser.interact_one_time(self.browser.locate(id="btnSearch"), click=True)
        time.sleep(5)

    def sort(self, by='被引'):
        """
        Sort the result list by clicking the link whose text is ``by``.

        :param str by: link text of the sort criterion
        :return: None
        """
        self.browser.switch(iframe='iframeResult')
        self.browser.interact_one_time(location=self.browser.locate(link_text=by), click=True)
        time.sleep(6)

    def select_all_literature(self):
        """
        Clear the current selection, tick the select-all checkbox and click
        the third link of the ``.SavePoint`` bar.

        :return: None
        """
        self.browser.interact_one_time(location=self.browser.locate(link_text='清除'), click=True)
        time.sleep(2)
        self.browser.interact_one_time(location=self.browser.locate(id='selectCheckbox'), click=True)
        time.sleep(2)
        self.browser.interact_one_time(location='.SavePoint > a:nth-child(3)', click=True)

    def get_more(self, limit=4):
        """
        Turn to following result pages and export each of them.

        The loop runs while ``self.more`` is true. It is switched off when
        the iteration counter reaches ``limit`` (that iteration still runs)
        or when the "next page" element cannot be found. ``self.more`` stays
        false afterwards.

        :param int limit: iteration count at which paging is switched off
        :return: None
        """
        i = 1
        while self.more:
            if i >= limit:
                self.more = False
            try:
                self.browser.switch(iframe='iframeResult')
                self.browser.interact_one_time(location=self.browser.locate(id='Page_next'), click=True)
                time.sleep(3)
            except NoSuchElementException:
                # No next-page link: nothing further to export.
                self.more = False
            else:
                time.sleep(5)
                self.select_all_literature()
                self.child_operation()
            i += 1
            time.sleep(2)

    def child_operation(self):
        """
        Work through the export child pages and append the exported record
        text, parsed with BeautifulSoup, to ``self.soups``.

        :return: None
        """
        self.browser.interact_one_time(location='.GTContentTitle > td:nth-child(1) > input:nth-child(1)', click=True)
        self.browser.interact_one_time(location='#file_export > input:nth-child(1)', click=True)
        time.sleep(5)
        self.browser.interact_one_time(location=self.browser.locate(link_text='NoteExpress'), click=True)
        time.sleep(5)
        # find_element(By, value) replaces the find_element_by_* helpers
        # that newer Selenium releases removed.
        exported = self.browser.browser.find_element(By.CSS_SELECTOR, '.mainTable').text
        self.soups.append(BeautifulSoup(exported, "lxml"))
        time.sleep(5)
        # Two child pages were opened above; close both of them.
        self.browser.switch_to_parent(close=True)
        self.browser.switch_to_parent(close=True)
        time.sleep(5)

    def set_query(self, query_str=None):
        """
        Enter an expert-search query string.

        :param str query_str: query string typed into ``#expertvalue``
        :return: None
        """
        self.browser.interact_one_time(self.browser.locate(id='1_4'), click=True)
        time.sleep(2)
        self.browser.interact_one_time('#expertvalue', send_text=query_str)
        time.sleep(1)

    def set_period(self, start_period=None, end_period=None):
        """
        Set the start and/or end of the publication period.

        :param str start_period: option text for ``year_from``; skipped when None
        :param str end_period: option text for ``year_to``; skipped when None
        :return: None
        """
        if start_period is not None:
            self.browser.interact_one_time(location=self.browser.locate(id='year_from'), select_text=start_period)
        if end_period is not None:
            self.browser.interact_one_time(location=self.browser.locate(id='year_to'), select_text=end_period)
        time.sleep(1)

    def set_subject(self, subjects=None):
        """
        Tick the checkboxes of the given subject areas.

        :param list subjects: values of the checkbox ``name`` attributes;
            ``None`` (the default) ticks nothing
        :return: None
        """
        self.browser.interact_one_time(location='input.btn:nth-child(1)', click=True)
        # The default None used to raise TypeError when iterated.
        for subject in subjects or ():
            xpath = ''.join(["//input[@name='", subject, "']"])
            self.browser.interact_one_time(location=self.browser.locate(xpath=xpath), click=True)
            time.sleep(1)

    def export_to_pickle(self, file=r'E:\gitrobot\files\literature\literature_list.pkl'):
        """
        Pickle ``self.soups`` to a file.

        :param str file: path of the output file
        :return: None
        """
        # The context manager closes the handle even if pickling fails.
        with open(file, 'wb') as handle:
            pickle.dump(self.soups, handle)

    @staticmethod
    def _strip_whitespace(text):
        """Return ``text`` with every whitespace character removed."""
        return re.sub(r'\s+', '', text)

    @staticmethod
    def _tag_value(item):
        """Return the part of a ``{Tag}: value`` line after the first '}: '."""
        return re.split(r'\}\: ', item)[1]

    def export_to_dict(self):
        """
        Convert the collected NoteExpress exports into a dictionary.

        The ``<p>`` content of each document in ``self.soups`` is read line
        by line. Recognised ``{Tag}: value`` lines fill one record; a
        ``{Database Provider}`` line closes the record and stores it under
        the most recently seen title. All whitespace is removed from values.

        NOTE(review): a ``{Database Provider}`` line seen before any
        ``{Title}`` line fails with an unbound ``title``; left unchanged.

        :return: mapping of title to record dictionary, in input order
        :rtype: OrderedDict
        """
        strip = self._strip_whitespace
        literature = OrderedDict()
        for soup in self.soups:
            # str() of the find_all result looks like "[<p>...</p>]".
            content = str(soup.find_all('p'))
            content = re.split(r'</p>\]', re.split(r'\[<p>', content)[1])[0]
            record = dict()
            for item in content.split('\n'):
                if '{Title}' in item:
                    title = strip(self._tag_value(item))
                if '{Author}' in item:
                    # Splitting on the tag itself leaves the author text as
                    # the only non-empty piece.
                    record['author'] = [
                        strip(author)
                        for author in re.split(r'\{Author\}\: ', item)
                        if len(author) > 0
                    ]
                for tag, key in self._LIST_FIELDS:
                    if tag in item:
                        record[key] = [
                            strip(part)
                            for part in self._tag_value(item).split(';')
                            if len(part) > 0
                        ]
                for tag, key in self._SCALAR_FIELDS:
                    if tag in item:
                        record[key] = strip(self._tag_value(item))
                if '{Database Provider}' in item:
                    # Last line of a record: store it and start a new one.
                    literature[title] = record
                    record = dict()
        return literature

    def export_to_json(self, file=r'E:\gitrobot\files\literature\literature_list.txt'):
        """
        Write the result of ``export_to_dict`` to a file as JSON.

        :param str file: path of the output file
        :return: None
        """
        # Build the data first so a parsing error does not truncate the file.
        data = self.export_to_dict()
        with open(file, 'w') as handle:
            json.dump(data, fp=handle)

    def close(self):
        """
        Shut down the browser.

        :return: None
        """
        self.browser.quit()
# Exploratory run against the gkcx.eol.cn province-score page. It executes
# at import time: one query is submitted and "next page" is clicked twice.
proxy_checked_list = [
    '58.20.234.243:8000', '58.20.242.85:8000',
    '110.52.232.56:8000', '110.52.232.56:80',
    '58.20.232.239:8000', '58.246.242.154:8080',
    '58.20.232.239:8000', '110.52.232.75:8000',
    '60.13.74.184:81', '110.52.232.60:8000',
    '58.247.30.222:8080', '58.22.86.44:8000',
]

# NOTE(review): the proxy is drawn from ``proxy_list`` (defined outside this
# section), not from ``proxy_checked_list`` above - confirm this is intended.
browser = AutoBrowser(
    proxy=proxy_list[random.randint(0, len(proxy_list) - 1)]
)
browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html')

# Open the region drop-down, switch to its third tab and pick the region.
browser.interact_one_time(
    '.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',
    click=True,
)
browser.interact_one_time('div.tabs_10:nth-child(3)', click=True)
browser.interact_one_time(location=browser.locate(link_text='西藏'), click=True)

# Open the subject drop-down and pick the subject.
browser.interact_one_time(
    '.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',
    click=True,
)
browser.interact_one_time(location=browser.locate(link_text='文科'), click=True)

# Type the college name and submit the form.
browser.interact_one_time('#provinceScoreKEY', send_text='复旦大学')
browser.interact_one_time(
    '#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)',
    click=True,
)
time.sleep(5)
print(browser.browser.find_element_by_css_selector('#queryschoolad').text)

# Compare the URL before and after turning one page.
# NOTE(review): u2 is read immediately after the click, before the wait.
u1 = browser.browser.current_url
browser.interact_one_time(location=browser.locate(link_text='下一页'), click=True)
u2 = browser.browser.current_url
time.sleep(10)
print(u1, u2)
print(u1 == u2)

browser.interact_one_time(location=browser.locate(link_text='下一页'), click=True)
time.sleep(10)
browser.quit()
class CEESpider:
    """
    Scrape College Entrance Examination (gaokao) score data from gkcx.eol.cn.

    Raw text of every visited result page is accumulated in ``self.result``;
    the ``colleges`` property parses that text into dictionaries on demand.
    """

    # Drop-down arrows and form controls of the query page.
    _REGION_DROPDOWN = '.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)'
    _SUBJECT_DROPDOWN = '.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)'
    _SEARCH_BUTTON = '#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)'
    _TABLE_SELECTOR = '#queryschoolad'

    def __init__(self, proxy=None):
        """
        Open the province-score query page in an automated browser.

        :param proxy: proxy setting passed through unchanged to ``AutoBrowser``
        """
        # Regions are spread over three tabs of the region drop-down.
        self.first_region_set = ['安徽','北京','重庆','福建','广东','广西','甘肃','贵州','河北','河南','湖南','湖北','海南','黑龙江']
        self.second_region_set = ['吉林','江苏','江西','辽宁','内蒙古','宁夏','青海']
        self.third_region_set = ['上海','四川','山西','山东','陕西','天津','新疆','西藏','云南','浙江']
        self.college = ''
        self.region = ''
        self.last_url = ''
        self.current_url = ''
        self.result = []
        self.no_result = False
        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html',
                          ready_check=(By.LINK_TEXT, '末页'))

    def select_region(self, region):
        """
        Choose the province: open the drop-down, switch to the tab that
        lists the region, then click the region link.

        :param str region: province name
        :return: None
        """
        self.region = region
        self.browser.interact_one_time(self._REGION_DROPDOWN, click=True)
        tabs = (
            (self.first_region_set, 'div.tabs_10:nth-child(1)'),
            (self.second_region_set, 'div.tabs_10:nth-child(2)'),
            (self.third_region_set, 'div.tabs_10:nth-child(3)'),
        )
        for region_set, tab_selector in tabs:
            if region in region_set:
                self.browser.interact_one_time(tab_selector, click=True)
        self.browser.interact_one_time(location=self.browser.locate(link_text=region), click=True)

    def select_subject(self, subject='文科'):
        """
        Choose the subject track from its drop-down.

        :param str subject: link text of the subject track
        :return: None
        """
        self.browser.interact_one_time(self._SUBJECT_DROPDOWN, click=True)
        self.browser.interact_one_time(location=self.browser.locate(link_text=subject), click=True)

    def set_college(self, college='复旦大学'):
        """
        Type the college name into the search box and remember it.

        :param str college: college name
        :return: None
        """
        self.college = college
        self.browser.interact_one_time('#provinceScoreKEY', send_text=college)

    def do_search(self):
        """
        Submit the query.

        If the ``noResultMessage`` element is empty the pager is awaited and
        the current URL is stored; otherwise ``self.no_result`` is set.

        :return: None
        :raises TimeoutError: if results exist but the pager is not ready
        """
        self.browser.interact_one_time(self._SEARCH_BUTTON, click=True)
        # find_element(By, value) replaces the find_element_by_* helpers
        # that newer Selenium releases removed.
        message = self.browser.browser.find_element(By.ID, 'noResultMessage').text
        if message == '':
            if not self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
                raise TimeoutError
            self.current_url = self.browser.browser.current_url
        else:
            self.no_result = True
        time.sleep(5)

    def clear(self):
        """
        Discard all page texts collected so far.

        :return: None
        """
        self.result = []

    def _collect_page_and_advance(self, settle_seconds=0):
        """
        Store the current page text, click "next page" and update
        ``last_url`` / ``current_url``.

        :param settle_seconds: extra wait before reading the new URL
        """
        self.result.append(self.browser.get_text(location=self._TABLE_SELECTOR, beautiful=False))
        self.last_url = self.current_url
        self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'), click=True)
        if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
            if settle_seconds:
                time.sleep(settle_seconds)
            self.current_url = self.browser.browser.current_url

    def get_result_and_more(self):
        """
        Append the text of the current page and of every following page to
        ``self.result``.

        Paging stops once clicking "next page" no longer changes the URL.
        When the last search reported no result, the flag is reset and
        nothing is collected.

        :return: None
        """
        if self.no_result:
            self.no_result = False
            return None
        # Only the first page turn waits an extra five seconds.
        self._collect_page_and_advance(settle_seconds=5)
        while self.last_url != self.current_url:
            self._collect_page_and_advance()

    @staticmethod
    def _parse_score(text):
        """Return ``None`` for the placeholder '--', else the score as int."""
        if text == '--':
            return None
        return int(float(text))

    @property
    def colleges(self):
        """
        Parse the collected page texts into one dictionary per table row.

        The first line of every page is skipped. Each remaining line is split
        on whitespace and mapped onto the seven field names; both score
        fields become ``int`` or ``None`` (for '--'). Lines with fewer than
        seven fields (e.g. blank lines) are skipped instead of raising
        ``KeyError``.

        :return: list of row dictionaries
        :rtype: list
        """
        fields = ['university', 'student_region', 'subject', 'year', 'batch',
                  'average_score', 'province_control_score']
        rows = []
        for page_text in self.result:
            for line in page_text.split('\n')[1:]:
                values = re.split(r'\s+', line)[0:8]
                if len(values) < len(fields):
                    continue
                row = dict(zip(fields, values))
                for key in ('average_score', 'province_control_score'):
                    row[key] = self._parse_score(row[key])
                rows.append(row)
        return rows

    def close(self):
        """
        Shut down the browser.

        :return: None
        """
        self.browser.quit()