class ZhipinCrawler(IterateCrawler):
    '''继承浏览器迭代操作类,完成boss直聘数据采集'''
    def __init__(self, page_count):
        template = {
            'np_selector':'div.page>a.next',
        }
        super().__init__(template, page_count)

    def extract(self):
        '''对采集到的网页进行数据提取'''
        self.db = MySQL('job')
        jobs = self.findall('div.job-list>ul>li')
        for job in jobs:
            title = self.find('.job-title',job).text
            salary = self.find('.red',job).text

            job_link = self.find('.info-primary>h3.name>a',job).get_attribute('href')
            job_id = re.search(r'/job_detail/(.*).html',job_link).group(1)
            job_info = self.find('.info-primary>p',job).get_attribute('innerHTML')

            company_name = self.find(".info-company>div>h3>a",job).text
            company_link = self.find(".info-company>div>h3>a",job).get_attribute('href')
            company_id = re.search(r'/gongsi/(.*).html',company_link).group(1)

            html_str = self.find('.company-text>p',job).get_attribute('innerHTML')
            company_info = html_str.split('''<em class="vline"></em>''')
            company_industry = company_info[0]
            company_size = company_info.pop()
            publisher = self.find('.info-publis>h3',job).get_attribute('innerHTML').split('<em class="vline"></em>').pop()
            date_str = self.find('.info-publis p',job).text
            date_str = date_str.split('发布于')[1]
            try:
                pub_date = datetime.datetime.strptime(date_str,'%m月%d日')
                pub_date = pub_date.replace(year = 2019)
            except ValueError:
                pub_date = datetime.datetime.strptime('11月18日','%m月%d日')
                pub_date = pub_date.replace(year = 2019)
            data = dict(title=title,salary=salary,job_info=job_info.replace('\"','\''),job_id=job_id,company_name=company_name\
                ,company_id=company_id,company_industry=company_industry,company_size=company_size\
                ,publisher=publisher,pub_date=str(pub_date))
            self.write(data)

    def write(self, data):
        '''将提取到的数据写入到数据库'''
        self.db.run(sql.append('zhipin_guangzhou',data))

    def next_page(self):
        '''重写父类方法,实现翻页操作'''
        url = self.instance.current_url
        rst = re.search(r'page=([0-9]*)&',url)
        match = rst.group()
        num = int(rst.group(1))
        self.to(url.replace(match,'page=%d&'%(num+1)))

    def after(self):
        '''实现父类方法,采集完成后处理函数'''
        self.db.exit()
        t.say('job done')