Beispiel #1
0
class CEESpiderSina:
    """Spider for College Entrance Examination (gaokao) data from Sina.

    The spider drives an ``AutoBrowser`` instance: pick the filters with the
    ``select_*`` methods, call :meth:`do_search`, collect every result page
    with :meth:`get_result_and_more` and read the parsed rows from
    :attr:`colleges`.
    """

    # CSS selector of the "next page" control inside the pager.
    _NEXT_PAGE = '.pageNumWrap > [node-type="next"]'

    def __init__(self, proxy=None):
        """Open the browser and load the score-ranking start page.

        :param proxy: proxy setting forwarded unchanged to ``AutoBrowser``
        """
        self.region = ''
        self.result = []
        # Defined up front so the attribute exists before do_search() runs.
        self.current_url = ''

        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf(
            'http://kaoshi.edu.sina.com.cn/college/collegeAvgScoreRank?syear=2013&provid=1',
            ready_check=(By.CLASS_NAME, 'pageNumWrap'))

    def select_region(self, region):
        """Select the province and remember it for the parsed records.

        :param str region: province name
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('#provSel', select_text=region)

    def select_subject(self, subject='文科'):
        """Select the subject track (liberal arts or science).

        :param str subject: option text to select in ``#typeSel``
        :return: None
        """
        self.browser.interact_one_time('#typeSel', select_text=subject)

    def select_year(self, year='2014'):
        """Select the year.

        :param str year: option text to select in ``#sYear``
        :return: None
        """
        self.browser.interact_one_time('#sYear', select_text=year)

    def select_batch(self, batch='本科一批'):
        """Select the admission batch.

        :param str batch: option text to select in ``#sBatch``
        :return: None
        """
        self.browser.interact_one_time('#sBatch', select_text=batch)

    def do_search(self):
        """Click the search button and wait for the pager to appear.

        :return: None
        :raises TimeoutError: if the pager element is not reported ready
        """
        self.browser.interact_one_time('#searchBtn', click=True)

        if not self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
            raise TimeoutError
        self.current_url = self.browser.browser.current_url
        time.sleep(5)

    def clear(self):
        """Discard all collected page texts.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """Append the text of every result page to ``self.result``.

        The current page is stored first; then the "next" control is clicked
        repeatedly until it can no longer be found.

        :return: None
        :raises TimeoutError: if the pager is not ready after a page change
        """
        self.result.append(
            self.browser.get_text(location='#scoreTable2', beautiful=False))

        while True:
            try:
                # Existence probe: raises NoSuchElementException on the last
                # page, which ends the loop.
                # NOTE(review): if browser.browser is a Selenium WebDriver,
                # find_element_by_* was removed in Selenium 4 - confirm the
                # pinned version.
                self.browser.browser.find_element_by_css_selector(self._NEXT_PAGE)
                self.browser.interact_one_time(self._NEXT_PAGE, click=True)
                time.sleep(2)
                if not self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
                    raise TimeoutError
                self.result.append(
                    self.browser.get_text(location='#scoreTable2', beautiful=False))
            except NoSuchElementException:
                break

    @property
    def colleges(self):
        """Return the collected pages parsed into one dict per table row.

        Every line is split on whitespace; the first token is dropped and the
        following seven tokens are kept, then the selected region is appended
        as ``student_region``. Lines that do not yield all eight fields
        (blank lines, short lines) are ignored.

        :return: list of row dictionaries
        :rtype: list
        """
        field_names = ['university', 'type', 'university_region', 'average_score',
                       'subject', 'year', 'batch', 'student_region']
        colleges = []
        for page_text in self.result:
            for line in page_text.split('\n'):
                fields = re.split(r'\s+', line)[1:8]
                fields.append(self.region)
                # Only complete rows carry a value for every field name.
                if len(fields) == len(field_names):
                    colleges.append(dict(zip(field_names, fields)))

        return colleges

    def close(self):
        """Quit the browser.

        :return: None
        """
        self.browser.quit()
Beispiel #2
0
class Cnki:
    """Client that drives the CNKI literature database through a browser.

    Typical flow: set the query/period/subject filters, :meth:`submit`,
    optionally :meth:`sort`, then export the result pages; the exported
    NoteExpress text is collected in ``self.soups`` and can be converted with
    :meth:`export_to_dict` / :meth:`export_to_json`.
    """

    def __init__(self, proxy=None):
        """Open the browser on the CNKI journal search page.

        :param proxy: proxy setting forwarded unchanged to ``AutoBrowser``
        """
        self.soups = list()
        self.more = True
        self.browser = AutoBrowser(proxy=proxy)
        self.browser.surf('http://epub.cnki.net/kns/brief/result.aspx?dbprefix=CJFQ',
                          ready_check=(By.CSS_SELECTOR, '#bottom'))
        time.sleep(2)

    def submit(self):
        """Submit the query by clicking the search button.

        :return: None
        """
        self.browser.interact_one_time(self.browser.locate(id="btnSearch"), click=True)
        time.sleep(5)

    def sort(self, by='被引'):
        """Sort the result list by clicking the link whose text is ``by``.

        :param str by: link text of the sort criterion
        :return: None
        """
        self.browser.switch(iframe='iframeResult')
        self.browser.interact_one_time(location=self.browser.locate(link_text=by), click=True)
        time.sleep(6)

    def select_all_literature(self):
        """Clear the current selection, tick the select-all box and open the export view.

        :return: None
        """
        self.browser.interact_one_time(location=self.browser.locate(link_text='清除'), click=True)
        time.sleep(2)
        self.browser.interact_one_time(location=self.browser.locate(id='selectCheckbox'), click=True)
        time.sleep(2)
        self.browser.interact_one_time(location='.SavePoint > a:nth-child(3)', click=True)

    def get_more(self, limit=4):
        """Advance through result pages and export each one.

        The loop runs while ``self.more`` is true. The flag is cleared once
        the page counter reaches ``limit`` (the page of that iteration is
        still processed, so at least one advance is attempted) or when the
        "next page" element is missing. The flag is not reset afterwards.

        :param int limit: number of page advances after which to stop
        :return: None
        """
        i = 1
        while self.more:
            if i >= limit:
                self.more = False
            try:
                self.browser.switch(iframe='iframeResult')
                self.browser.interact_one_time(location=self.browser.locate(id='Page_next'), click=True)
                time.sleep(3)
            except NoSuchElementException:
                self.more = False
            else:
                time.sleep(5)
                self.select_all_literature()
                self.child_operation()
                i += 1
                time.sleep(2)

    def child_operation(self):
        """Operate the export child pages and append the result to ``self.soups``.

        :return: None
        """
        self.browser.interact_one_time(location='.GTContentTitle > td:nth-child(1) > input:nth-child(1)', click=True)
        self.browser.interact_one_time(location='#file_export > input:nth-child(1)', click=True)
        time.sleep(5)
        self.browser.interact_one_time(location=self.browser.locate(link_text='NoteExpress'), click=True)
        time.sleep(5)
        export_text = self.browser.browser.find_element_by_css_selector('.mainTable').text
        self.soups.append(BeautifulSoup(export_text, "lxml"))
        time.sleep(5)
        # Two child pages were opened above; step back out of both.
        self.browser.switch_to_parent(close=True)
        self.browser.switch_to_parent(close=True)
        time.sleep(5)

    def set_query(self, query_str=None):
        """Switch to the expert-search tab and enter the query string.

        :param str query_str: expert query string
        :return: None
        """
        self.browser.interact_one_time(self.browser.locate(id='1_4'), click=True)
        time.sleep(2)
        self.browser.interact_one_time('#expertvalue', send_text=query_str)
        time.sleep(1)

    def set_period(self, start_period=None, end_period=None):
        """Set the start and/or end year of the search period.

        :param str start_period: start year; left untouched when None
        :param str end_period: end year; left untouched when None
        :return: None
        """
        if start_period is not None:
            self.browser.interact_one_time(location=self.browser.locate(id='year_from'), select_text=start_period)
        if end_period is not None:
            self.browser.interact_one_time(location=self.browser.locate(id='year_to'), select_text=end_period)

        time.sleep(1)

    def set_subject(self, subjects=None):
        """Select subject areas by clicking their checkboxes.

        :param list subjects: ``name`` attribute values of the subject
            inputs; ``None`` is treated as an empty list
        :return: None
        """
        self.browser.interact_one_time(location='input.btn:nth-child(1)', click=True)
        # The default of None used to crash the loop with a TypeError.
        for subject in subjects or ():
            xpath = "//input[@name='{}']".format(subject)
            self.browser.interact_one_time(location=self.browser.locate(xpath=xpath), click=True)
        time.sleep(1)

    def export_to_pickle(self, file=r'E:\gitrobot\files\literature\literature_list.pkl'):
        """Pickle the collected soups (``self.soups``) to a file.

        :param str file: target file name
        :return: None
        """
        # The context manager closes the file even if pickling fails.
        with open(file, 'wb') as handle:
            pickle.dump(self.soups, handle)

    @staticmethod
    def _field_value(item):
        """Return the text after the first ``}: `` of a line, whitespace removed."""
        return re.sub(r'\s+', '', re.split(r'\}: ', item)[1])

    @staticmethod
    def _field_values(item):
        """Return the ``;``-separated values of a line, whitespace removed."""
        raw = re.split(r'\}: ', item)[1]
        return [re.sub(r'\s+', '', part) for part in raw.split(';') if len(part) > 0]

    def export_to_dict(self):
        """Convert the collected NoteExpress exports to a dictionary.

        Each soup is parsed line by line; a ``{Database Provider}`` line ends
        a record. Whitespace is removed from all values, including the title
        used as key. Soups without a ``<p>`` element and records that never
        saw a ``{Title}`` line are skipped.

        :return: mapping of title to a dict of the record's fields
        :rtype: OrderedDict
        """
        literature = OrderedDict()

        for soup in self.soups:
            content = str(soup.find_all('p'))
            opened = re.split(r'\[<p>', content)
            if len(opened) < 2:
                # No paragraph in this soup, nothing to parse.
                continue
            content = re.split(r'</p>\]', opened[1])[0]

            title = None
            one_literature = dict()
            for item in content.split('\n'):
                if '{Title}' in item:
                    title = self._field_value(item)
                if '{Author}' in item:
                    one_literature['author'] = [
                        re.sub(r'\s+', '', author)
                        for author in re.split(r'\{Author\}: ', item)
                        if len(author) > 0]
                if '{Author Address}' in item:
                    one_literature['address'] = self._field_values(item)
                if '{Journal}' in item:
                    one_literature['journal'] = self._field_value(item)
                if '{Year}' in item:
                    one_literature['year'] = self._field_value(item)
                if '{Issue}' in item:
                    # NOTE(review): the key is misspelled ('issure'); kept
                    # because existing consumers may read it.
                    one_literature['issure'] = self._field_value(item)
                if '{Pages}' in item:
                    one_literature['pages'] = self._field_value(item)
                if '{Keywords}' in item:
                    one_literature['keyword'] = self._field_values(item)
                if '{Abstract}' in item:
                    one_literature['abstract'] = self._field_value(item)
                if '{ISBN/ISSN}' in item:
                    one_literature['ISBN/ISSN'] = self._field_value(item)
                if '{Database Provider}' in item:
                    # A record without its own title must not be stored under
                    # the previous record's title (or crash on an unbound name).
                    if title is not None:
                        literature[title] = one_literature
                    title = None
                    one_literature = dict()

        return literature

    def export_to_json(self, file=r'E:\gitrobot\files\literature\literature_list.txt'):
        """Write the result of :meth:`export_to_dict` to a JSON file.

        :param str file: target file name
        :return: None
        """
        # Build the data first so a parsing error does not truncate the file.
        literature = self.export_to_dict()
        with open(file, 'w') as handle:
            json.dump(literature, fp=handle)

    def close(self):
        """Quit the browser.

        :return: None
        """
        self.browser.quit()
Beispiel #3
0
# Proxies (host:port) to pick from for this exploratory session.
proxy_checked_list = ['58.20.234.243:8000','58.20.242.85:8000',
                      '110.52.232.56:8000','110.52.232.56:80',
                      '58.20.232.239:8000','58.246.242.154:8080',
                      '58.20.232.239:8000','110.52.232.75:8000',
                      '60.13.74.184:81','110.52.232.60:8000',
                      '58.247.30.222:8080','58.22.86.44:8000']

# The list defined above is the one to draw from; the original referenced an
# undefined name `proxy_list` here.
browser = AutoBrowser(proxy=random.choice(proxy_checked_list))
try:
    browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html')

    # Open the student-region dropdown, switch to its third tab, pick 西藏.
    browser.interact_one_time('.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',click=True)
    browser.interact_one_time('div.tabs_10:nth-child(3)',click=True)
    browser.interact_one_time(location=browser.locate(link_text='西藏'),click=True)

    # Open the subject dropdown and pick 文科.
    browser.interact_one_time('.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',click=True)
    browser.interact_one_time(location=browser.locate(link_text='文科'),click=True)

    # Enter the college name and submit the search form.
    browser.interact_one_time('#provinceScoreKEY',send_text='复旦大学')
    browser.interact_one_time('#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)',click=True)
    time.sleep(5)

    print(browser.browser.find_element_by_css_selector('#queryschoolad').text)

    # Compare the URL before and after clicking "next page". u2 is read
    # immediately after the click, before the 10 second wait below.
    u1 = browser.browser.current_url
    browser.interact_one_time(location=browser.locate(link_text='下一页'),click=True)
    u2 = browser.browser.current_url
    time.sleep(10)
    print(u1,u2)
    print(u1==u2)
    browser.interact_one_time(location=browser.locate(link_text='下一页'),click=True)
    time.sleep(10)
finally:
    # Always release the browser, even when one of the steps above fails.
    browser.quit()
Beispiel #4
0
class CEESpider:
    """Spider for College Entrance Examination (gaokao) score data.

    Pick the filters with :meth:`select_region`, :meth:`select_subject` and
    :meth:`set_college`, run :meth:`do_search`, gather all pages with
    :meth:`get_result_and_more` and read the parsed rows from
    :attr:`colleges`.
    """

    # Placeholder the result table shows when a score is not available.
    _MISSING_SCORE = '--'

    def __init__(self, proxy=None):
        """Open the browser on the province-score query page.

        :param proxy: proxy setting forwarded unchanged to ``AutoBrowser``
        """
        # The region picker spreads the provinces over three tabs; these
        # lists record which tab has to be opened for a given province.
        self.first_region_set = ['安徽','北京','重庆','福建','广东','广西','甘肃','贵州','河北','河南','湖南','湖北','海南','黑龙江']
        self.second_region_set = ['吉林','江苏','江西','辽宁','内蒙古','宁夏','青海']
        self.third_region_set = ['上海','四川','山西','山东','陕西','天津','新疆','西藏','云南','浙江']
        self.college = ''
        self.region = ''
        self.last_url = ''
        self.current_url = ''
        self.result = []
        self.no_result = False

        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html',
                          ready_check=(By.LINK_TEXT, '末页'))

    def select_region(self, region):
        """Select the student's province.

        :param str region: province name
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)', click=True)

        # Open the tab that lists the requested province (the lists are disjoint).
        if region in self.first_region_set:
            self.browser.interact_one_time('div.tabs_10:nth-child(1)', click=True)
        elif region in self.second_region_set:
            self.browser.interact_one_time('div.tabs_10:nth-child(2)', click=True)
        elif region in self.third_region_set:
            self.browser.interact_one_time('div.tabs_10:nth-child(3)', click=True)

        self.browser.interact_one_time(location=self.browser.locate(link_text=region), click=True)

    def select_subject(self, subject='文科'):
        """Select the subject track (liberal arts or science).

        :param str subject: link text of the subject to pick
        :return: None
        """
        self.browser.interact_one_time('.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)', click=True)
        self.browser.interact_one_time(location=self.browser.locate(link_text=subject), click=True)

    def set_college(self, college='复旦大学'):
        """Enter the college name into the search box.

        :param str college: college name
        :return: None
        """
        self.college = college
        self.browser.interact_one_time('#provinceScoreKEY', send_text=college)

    def do_search(self):
        """Submit the search form and record whether it produced results.

        When the "no result" message is empty the method waits for the
        "next page" link and stores the current URL; otherwise it sets
        ``self.no_result``.

        :return: None
        :raises TimeoutError: if results are expected but the "next page"
            link is not reported ready
        """
        self.browser.interact_one_time('#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)', click=True)
        if self.browser.browser.find_element_by_id('noResultMessage').text != '':
            self.no_result = True
        elif self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
            self.current_url = self.browser.browser.current_url
        else:
            raise TimeoutError

        time.sleep(5)

    def clear(self):
        """Discard all collected page texts.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """Append the text of every result page to ``self.result``.

        Pages are advanced through the "next page" link until clicking it no
        longer changes the URL. If the last search had no result the flag is
        reset and nothing is collected.

        :return: None
        """
        if self.no_result:
            self.no_result = False
            return None

        self.result.append(self.browser.get_text(location='#queryschoolad', beautiful=False))

        self.last_url = self.current_url
        self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'), click=True)
        if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
            time.sleep(5)
            self.current_url = self.browser.browser.current_url

            # An unchanged URL after clicking "next" means the last page was reached.
            while self.last_url != self.current_url:
                self.result.append(self.browser.get_text(location='#queryschoolad', beautiful=False))
                self.last_url = self.current_url
                self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'), click=True)
                if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
                    self.current_url = self.browser.browser.current_url

    @classmethod
    def _parse_score(cls, raw):
        """Convert a score cell to int (truncating), or None for the ``--`` placeholder."""
        if raw == cls._MISSING_SCORE:
            return None
        return int(float(raw))

    @property
    def colleges(self):
        """Return the collected pages parsed into one dict per table row.

        The first line of every page text is skipped. Each remaining line is
        split on whitespace and its first seven tokens are mapped to the
        field names; the two score fields are converted to ``int`` or
        ``None``. Lines with fewer than seven tokens (for example a trailing
        blank line) are ignored instead of raising ``KeyError``.

        :return: list of row dictionaries
        :rtype: list
        """
        field_names = ['university', 'student_region', 'subject', 'year', 'batch',
                       'average_score', 'province_control_score']
        colleges = []
        for page_text in self.result:
            for line in page_text.split('\n')[1:]:
                fields = re.split(r'\s+', line)
                if len(fields) < len(field_names):
                    continue
                record = dict(zip(field_names, fields))
                for key in ('average_score', 'province_control_score'):
                    record[key] = self._parse_score(record[key])
                colleges.append(record)

        return colleges

    def close(self):
        """Quit the browser.

        :return: None
        """
        self.browser.quit()