Ejemplo n.º 1
0
class CEESpiderSina:
    """Scrape college entrance examination (CEE) data from Sina's exam site.

    The spider drives an ``AutoBrowser`` session: the caller picks the query
    filters with the ``select_*`` methods, runs :meth:`do_search`, collects the
    raw text of every result page with :meth:`get_result_and_more`, and reads
    the parsed rows from :attr:`colleges`.
    """

    # Keys assigned, in order, to whitespace-separated fields 1-7 of a result
    # line; the last key receives the region chosen via select_region().
    _FIELDS = ('university', 'type', 'university_region', 'average_score',
               'subject', 'year', 'batch', 'student_region')

    # CSS selector of the "next page" control inside the pager.
    _NEXT_PAGE_SELECTOR = '.pageNumWrap > [node-type="next"]'

    def __init__(self,proxy=None):
        """Open the browser and load the score-ranking page.

        :param proxy: proxy setting forwarded unchanged to ``AutoBrowser``
        """
        self.region = ''
        self.current_url = ''
        self.result = []

        self.browser = AutoBrowser(proxy=proxy,timeout=20)
        self.browser.surf('http://kaoshi.edu.sina.com.cn/college/collegeAvgScoreRank?syear=2013&provid=1',ready_check=(By.CLASS_NAME,'pageNumWrap'))

    def select_region(self,region):
        """Select the province and remember it for the parsed rows.

        :param str region: province name as shown in the ``#provSel`` list
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('#provSel',select_text=region)

    def select_subject(self,subject='文科'):
        """Select the subject track.

        :param str subject: option text to pick in the ``#typeSel`` list
        :return: None
        """
        self.browser.interact_one_time('#typeSel',select_text=subject)

    def select_year(self,year='2014'):
        """Select the year.

        :param str year: option text to pick in the ``#sYear`` list
        :return: None
        """
        self.browser.interact_one_time('#sYear',select_text=year)

    def select_batch(self,batch='本科一批'):
        """Select the admission batch.

        :param str batch: option text to pick in the ``#sBatch`` list
        :return: None
        """
        self.browser.interact_one_time('#sBatch',select_text=batch)

    def do_search(self):
        """Click the search button and wait for the result pager.

        :return: None
        :raises TimeoutError: if the pager does not become ready
        """
        self.browser.interact_one_time('#searchBtn',click=True)

        if not self.browser.is_ready(locator=(By.CLASS_NAME,'pageNumWrap')):
            raise TimeoutError('result pager did not become ready after search')
        self.current_url = self.browser.browser.current_url
        # Fixed pause kept from the original implementation; presumably it
        # gives the result table time to finish rendering.
        time.sleep(5)

    def clear(self):
        """Discard all collected page texts.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """Append the text of the current and all following pages to ``self.result``.

        Paging stops as soon as a ``NoSuchElementException`` is raised, which
        is what the lookup of the "next page" control does on the last page.

        :return: None
        :raises TimeoutError: if the pager does not become ready after paging
        """
        self.result.append(self.browser.get_text(location='#scoreTable2',beautiful=False))

        while True:
            try:
                # Probe for the control first: its absence ends the loop.
                # find_element(By...) is used because the find_element_by_*
                # shortcuts no longer exist in current Selenium releases.
                self.browser.browser.find_element(By.CSS_SELECTOR,self._NEXT_PAGE_SELECTOR)
                self.browser.interact_one_time(self._NEXT_PAGE_SELECTOR,click=True)
                time.sleep(2)
                if not self.browser.is_ready(locator=(By.CLASS_NAME,'pageNumWrap')):
                    raise TimeoutError('result pager did not become ready after paging')
                self.result.append(self.browser.get_text(location='#scoreTable2',beautiful=False))
            except NoSuchElementException:
                break

    @property
    def colleges(self):
        """Parse the collected page texts into row dictionaries.

        Each line is split on whitespace; fields 1-7 (the first field is
        dropped) plus the selected region are mapped onto ``_FIELDS``. Lines
        that do not yield all eight values are skipped. All values are kept
        as strings.

        :return: list of row dictionaries
        :rtype: list
        """
        rows = []
        for page_text in self.result:
            for line in page_text.split('\n'):
                fields = re.split(r'\s+',line)[1:8]
                fields.append(self.region)
                if len(fields) == len(self._FIELDS):
                    rows.append(dict(zip(self._FIELDS,fields)))
        return rows

    def close(self):
        """Close the browser.

        :return: None
        """
        self.browser.quit()
Ejemplo n.º 2
0
class CEESpider:
    """Scrape college entrance examination (CEE) data from gkcx.eol.cn.

    The spider drives an ``AutoBrowser`` session: the caller picks the query
    filters, runs :meth:`do_search`, collects the raw text of every result
    page with :meth:`get_result_and_more`, and reads the parsed rows from
    :attr:`colleges`.
    """

    # Keys assigned, in order, to the whitespace-separated fields of a row.
    _FIELDS = ('university', 'student_region', 'subject', 'year', 'batch',
               'average_score', 'province_control_score')

    def __init__(self,proxy=None):
        """Open the browser and load the province score query page.

        :param proxy: proxy setting forwarded unchanged to ``AutoBrowser``
        """
        # The region picker spreads the provinces over three tabs; these
        # lists say which tab has to be opened for a given province.
        self.first_region_set = ['安徽','北京','重庆','福建','广东','广西','甘肃','贵州','河北','河南','湖南','湖北','海南','黑龙江']
        self.second_region_set = ['吉林','江苏','江西','辽宁','内蒙古','宁夏','青海']
        self.third_region_set = ['上海','四川','山西','山东','陕西','天津','新疆','西藏','云南','浙江']
        self.college = ''
        self.region = ''
        self.last_url = ''
        self.current_url = ''
        self.result = []
        self.no_result = False

        self.browser = AutoBrowser(proxy=proxy,timeout=20)
        self.browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html',ready_check=(By.LINK_TEXT,'末页'))

    def select_region(self,region):
        """Select the province.

        Opens the region picker, switches to the tab that lists ``region``
        (no tab is clicked when the region is in none of the three lists),
        then clicks the link whose text is ``region``.

        :param str region: province name
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',click=True)

        tabs = (
            (self.first_region_set,'div.tabs_10:nth-child(1)'),
            (self.second_region_set,'div.tabs_10:nth-child(2)'),
            (self.third_region_set,'div.tabs_10:nth-child(3)'),
        )
        for regions,tab_selector in tabs:
            if region in regions:
                self.browser.interact_one_time(tab_selector,click=True)

        self.browser.interact_one_time(location=self.browser.locate(link_text=region),click=True)

    def select_subject(self,subject='文科'):
        """Select the subject track.

        :param str subject: link text of the subject to pick
        :return: None
        """
        self.browser.interact_one_time('.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',click=True)
        self.browser.interact_one_time(location=self.browser.locate(link_text=subject),click=True)

    def set_college(self,college='复旦大学'):
        """Type the college name into the search box.

        :param str college: college name
        :return: None
        """
        self.college = college
        self.browser.interact_one_time('#provinceScoreKEY',send_text=college)

    def do_search(self):
        """Submit the search and record whether it produced results.

        ``self.no_result`` is refreshed on every call so that an earlier
        empty search cannot make a later successful one look empty.

        :return: None
        :raises TimeoutError: if results exist but the pager does not become ready
        """
        self.browser.interact_one_time('#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)',click=True)
        # find_element(By...) is used because the find_element_by_* shortcuts
        # no longer exist in current Selenium releases.
        message = self.browser.browser.find_element(By.ID,'noResultMessage').text
        self.no_result = message != ''
        if not self.no_result:
            if not self.browser.is_ready(locator=(By.LINK_TEXT,'下一页')):
                raise TimeoutError('result pager did not become ready after search')
            self.current_url = self.browser.browser.current_url

        # Fixed pause kept from the original implementation; presumably it
        # gives the result table time to finish rendering.
        time.sleep(5)

    def clear(self):
        """Discard all collected page texts.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """Append the text of the current and all following pages to ``self.result``.

        Does nothing except reset the flag when the last search was marked
        as having no result. Paging ends when clicking the "next page" link
        no longer changes the recorded URL.

        :return: None
        """
        if self.no_result:
            self.no_result = False
            return None

        self.result.append(self.browser.get_text(location='#queryschoolad',beautiful=False))

        self.last_url = self.current_url
        self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'),click=True)
        if self.browser.is_ready(locator=(By.LINK_TEXT,'下一页')):
            time.sleep(5)
            self.current_url = self.browser.browser.current_url

            # An unchanged URL after clicking "next" means there was no
            # further page (or the page never became ready), so stop.
            while self.last_url != self.current_url:
                self.result.append(self.browser.get_text(location='#queryschoolad',beautiful=False))
                self.last_url = self.current_url
                self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'),click=True)
                if self.browser.is_ready(locator=(By.LINK_TEXT,'下一页')):
                    self.current_url = self.browser.browser.current_url

    @staticmethod
    def _parse_score(raw):
        """Convert a score cell to ``int``; the ``--`` placeholder becomes ``None``."""
        if raw == '--':
            return None
        return int(float(raw))

    @property
    def colleges(self):
        """Parse the collected page texts into row dictionaries.

        The first line of every page text is skipped. Each remaining line is
        split on whitespace and its first seven fields are mapped onto
        ``_FIELDS``; lines with fewer fields (for example the empty string
        left by a trailing newline) are skipped instead of raising
        ``KeyError``. The two score fields are converted to ``int``, or to
        ``None`` when the cell is ``--``.

        :return: list of row dictionaries
        :rtype: list
        :raises ValueError: if a score cell is neither ``--`` nor numeric
        """
        width = len(self._FIELDS)
        rows = []
        for page_text in self.result:
            for line in page_text.split('\n')[1:]:
                fields = re.split(r'\s+',line)[:width]
                if len(fields) < width:
                    continue
                row = dict(zip(self._FIELDS,fields))
                row['average_score'] = self._parse_score(row['average_score'])
                row['province_control_score'] = self._parse_score(row['province_control_score'])
                rows.append(row)
        return rows

    def close(self):
        """Close the browser.

        :return: None
        """
        self.browser.quit()