Example 1
class GkChsiSpecialSpider(BaseGkChsiFsxSpider):
    """
    学信网阳光高考省市分数线单用户单线程爬虫,抓取高校录取层次对,高校,高校代码,层次

    seeds title:省市,科类,科类代码,层次,层次代码,年份,院校代码,院校名称
    """
    def __init__(self,
                 threadcnt,
                 account,
                 prefix,
                 proxy=None,
                 sleep=0,
                 captcha_limit=50000000,
                 kldms=None,
                 seeds='spec_seeds',
                 recover=False,
                 sleep_max=5,
                 ua='firefox'):
        super(GkChsiSpecialSpider,
              self).__init__(threadcnt, account, prefix, proxy, sleep,
                             captcha_limit, sleep_max, ua)
        self.special_saver = GkChsiSpecialPaperStore('yggk_spec_' + prefix)
        self.detail_saver = GkChsiDetailPaperStore('yggk_detail_' + prefix)
        self.prefix = prefix
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)

        self.recover = recover
        self.kldms = kldms
        self.parser = HTMLParser.HTMLParser()
        self.curl_share = None
        self.login()
        self.info_saver = LinkSaver(prefix + '_spec_data')
        self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
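        # Fully expanded this looks like (parameter values are illustrative only):
        # http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=15&yxdm=10001&zydm=080901&start=0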

    def dispatch(self):
        with open(self.seeds, 'r') as f:
            for l in f:
                param = l.strip().split(',')
                if len(param) != 8:
                    logging.warn('invalid seeds %s', l)
                    continue
                self.add_main_job({
                    'wclx': 1,
                    'yxdm': param[6],
                    'kldm': param[2],
                    'bkcc': param[4],
                    'start': 0,
                    'years': param[5],
                    'yxmc': param[7].decode('utf-8')
                })
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def handle_job(self, jobid):
        url = 'http://gk.chsi.com.cn/recruit/listSpecBySchool.do'
        # con = self.request_url(url, data={'yxdm': jobid['yxdm']})
        # Send the full query first; its response data is not consumed here
        url1 = 'http://gk.chsi.com.cn/recruit/listSchByYxmc.do'
        con = self.request_url(url1,
                               data={
                                   'wclx': 1,
                                   'yxmc': jobid['yxmc'],
                                   'kldm': jobid['kldm'],
                                   'bkcc': jobid['bkcc'],
                                   'start': jobid['start'],
                                   'years': jobid['years']
                               })
        if not con or not con.text:
            self.on_work_failed(None, jobid, url1)
            return
            # con = self.request_url(url, data=jobid)
        # The server relies on the previous request to carry the query parameters,
        # because this page only accepts two parameters
        con = self.request_url(url,
                               data={
                                   'yxdm': jobid['yxdm'],
                                   'start': jobid['start']
                               })
        if not con or not con.text:
            self.on_work_failed(None, jobid, url)
            return
        jtitle = '%s/%s/%s/%s/%s/%s' % (jobid['yxdm'], jobid['years'],
                                        jobid['kldm'], jobid['bkcc'],
                                        jobid['wclx'], jobid['start'])

        self.special_saver.save(int(time.time()), jtitle, url, con.text)
        if 0 == jobid['start']:
            m = re.search(ur'共 (\d+) 页', con.text)
            if m:
                pages = int(m.group(1))
                logging.info('found %d pages for %s', pages, str(jobid))
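                # Example: if the page reports "共 3 页", follow-up jobs are queued
                # with start = 20 and start = 40 (the listing appears to page in
                # steps of 20 rows; inferred from the offset arithmetic below).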
                for page in range(1, pages):
                    data = copy.deepcopy(jobid)
                    data['start'] = page * 20
                    self.add_job(data)
            else:
                logging.warn('failed to parse pages %s', str(jobid))
        specials = self.extract_special(con.text)
        logging.info('found %d specials for %s', len(specials), str(jobid))
        for zydm, zymc in specials:
            content = self.extract_detail(zydm, zymc, jobid, jtitle, 0)
            if content is None:
                continue
            m = re.search(ur'共 (\d+) 页', content)
            if not m:
                continue
            page_cnt = int(m.group(1))
            if page_cnt <= 1:
                continue
            for p in range(1, page_cnt):
                self.extract_detail(zydm, zymc, jobid, jtitle, p)

    def extract_detail(self, zydm, zymc, jobid, jtitle, page):
        logging.info('parsing special %s,%s', zymc, zydm)
        detail_url = self.detail_url_format % (jobid['years'], jobid['yxdm'],
                                               zydm, page * 10)
        detail_content = self.request_url(detail_url)
        if not detail_content or not detail_content.text:
            logging.error('fail to fetch %s', detail_url)
            self.info_saver.append('detail failed:%s,%s' %
                                   (str(jobid), detail_url))
            return
        self.detail_saver.save(int(time.time()),
                               '%s/%s/%s' % (jtitle, zydm, page), detail_url,
                               detail_content.text)
        return detail_content.text

    def extract_special(self, content):
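        # Returns (zydm, zymc) pairs scraped from the radio-button rows; e.g. a row like
        #   <input type="radio" name="zydm" value="080901" class="radio">计算机科学与技术</td>
        # would yield ('080901', '计算机科学与技术') -- the sample code and name are illustrative.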
        return re.findall(
            r'<input type="radio" name="zydm" value="([\d\w]*)" class="radio">([^<]*)</td>',
            content)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "saved: %d\n" % self.special_saver.saved_count
            spider.util.sendmail(['*****@*****.**'],
                                 '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
Example 2
class GkChsiSchoolSpider(BaseGkChsiFsxSpider):
    """
    学信网阳光高考省市分数线单用户单线程爬虫,抓取高校录取层次对,高校,高校代码,层次
    """

    def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0.0, highscore=750,
                 captcha_limit=50000,
                 kldms=None, seeds=None,
                 recover=False, sleep_max=5, ua='firefox'):
        super(GkChsiSchoolSpider, self).__init__(threadcnt, account, prefix, proxy, sleep, captcha_limit, sleep_max, ua)
        if kldms is None:
            kldms = [1, 5]
        self.pagestore = GkChsiSchoolPaperStore('yggk_sch_' + prefix)
        self.prefix = prefix
        if proxy:
            self.set_proxy(proxy)
        self.highscore = highscore
        self.minscore = {}
        self.recover = recover
        self.kldms = kldms
        self.parser = HTMLParser.HTMLParser()
        self.curl_share = None
        self.login()
        self.info_saver = LinkSaver(prefix + '_data')
        self.seeds = seeds

    def __del__(self):
        self.logout()

    def dispatch(self):
        kldms = self.fetch_kldms()
        self.info_saver.append('kldm:' + str(kldms) + '\n')
        if len(kldms) == 2:
            self.kldms[0] = str(kldms[0][0])
            self.kldms[1] = str(kldms[1][0])
        if self.seeds:
            seeds = []
            with open(self.seeds, 'r') as job_saver:
                for l in job_saver:
                    if not self.recover or not self.pagestore.find_any(self.pagestore.channel + '://' + str(l.strip())):
                        seeds.append(eval(l))
            for seed in seeds:
                self.add_main_job(seed)
            print 'recover %d jobs' % len(seeds)
        else:
            s = self.highscore
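            # Walk the score range downward in 50-point bands; with the default
            # highscore of 750 this yields (750, 700), (700, 650), ..., (50, 0).
            # The trailing if-branch below only fires when highscore is not a
            # multiple of 50 (e.g. 730 would end with a final (30, 0) band).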
            while s >= 50:
                for kldm in self.kldms:
                    for bkcc in [1, 2]:
                        self.add_main_job(
                            {'highscore': s, 'lowscore': s - 50, 'bkcc': bkcc, 'kldm': kldm,
                             'years': 15, 'start': 0})
                s -= 50
            if s > 0:
                for kldm in self.kldms:
                    for bkcc in [1, 2]:
                        self.add_main_job(
                            {'highscore': s, 'lowscore': 0, 'bkcc': bkcc, 'kldm': kldm,
                             'years': 15, 'start': 0})
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def handle_job(self, jobid):
        url = 'http://gk.chsi.com.cn/recruit/listRecruitSchool.do'
        con = self.request_url(url, data=jobid)
        if not con or not con.text:
            self.on_work_failed(None, jobid, url)
            return
        if re.search(u'您输入的数据不符合要求,请按照下面的提示信息进行修改', con.text):
            logging.info('re add job,%s', str(jobid))
            self.re_add_job(jobid)
            print 'query error', str(jobid)
            return
        self.pagestore.save(int(time.time()), str(jobid), url, con.text)
        if 0 == jobid['start']:
            m = re.search(ur'共 (\d+) 页', con.text)
            if m:
                pages = int(m.group(1))
                logging.info('found %d pages for %s', pages, str(jobid))
                for page in range(1, pages):
                    data = copy.deepcopy(jobid)
                    data['start'] = page * 20
                    self.add_job(data)
            else:
                logging.warn('failed to parse pages %s', str(jobid))

    def fetch_kldms(self):
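        # Scrapes the <select name="kldm"> options from the score-query page and
        # returns (value, label) pairs, e.g. [('1', u'文史'), ('5', u'理工')];
        # the concrete codes and labels shown here are illustrative, not guaranteed.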
        if self.login_time < -1:
            raise Exception('failed to login')
        con = self.request_url('http://gk.chsi.com.cn/recruit/queryByScore.do')
        if con and con.text:
            m = re.search(r'<select name="kldm">.*?</select>', con.text, re.S)
            if m:
                return re.findall(r'<option value=["\'](\d+)["\'][^>]*>(.*?)<\/option>', m.group())
        return []

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
Example 3
class GkChsiDetailSpider(BaseGkChsiFsxSpider):
    """
        学信网阳光高考省市分数线单用户单线程爬虫,抓取高校录取层次对,高校,高校代码,层次

        seeds title:省市,科类,科类代码,层次,层次代码,年份,院校代码,院校名称
        """
    def __init__(self,
                 threadcnt,
                 account,
                 prefix,
                 proxy=None,
                 sleep=0.0,
                 captcha_limit=50000000,
                 seeds='detail_seeds',
                 recover=False,
                 sleep_max=5,
                 ua='firefox',
                 year='15',
                 bkccs=None,
                 kldms=None):
        super(GkChsiDetailSpider,
              self).__init__(threadcnt, account, prefix, proxy, sleep,
                             captcha_limit, sleep_max, ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + prefix)
        self.prefix = prefix
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.kldms = kldms
        self.bkccs = bkccs
        self.recover = recover
        self.parser = HTMLParser.HTMLParser()
        self.info_saver = LinkSaver(prefix + '_detail_data')
        self.year = year
        self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'

    def dispatch(self):
        for kldm in self.kldms:
            for bkcc in self.bkccs:
                self.post_kldm_bkcc_for_session(kldm, bkcc)
                seeds = []
                with open(self.seeds, 'r') as f:
                    for l in f:
                        if l[0] == '{':
                            data = eval(l.strip())
                        else:
                            param = l.strip().split(',')
                            if len(param) < 9:
                                logging.warn('invalid seeds %s', l)
                                continue
                            data = {
                                'wclx': 1,
                                'yxdm': param[6],
                                'kldm': param[2],
                                'bkcc': param[4],
                                'start': 0,
                                'years': param[5],
                                'zydm': param[7],
                                'zymc': param[8].decode('utf-8')
                            }
                        if data['kldm'] == kldm and bkcc == data[
                                'bkcc'] and self.year == data['years']:
                            if self.recover and self.pagestore.find_any(
                                    self.pagestore.channel + '://' +
                                    self.get_jobid(data)):
                                continue
                            seeds.append(data)
                for seed in seeds:
                    self.add_main_job(seed)
                print 'add', len(seeds), 'jobs'
                time.sleep(2)
                self.wait_q()
        self.add_job(None)

    def handle_job(self, jobid):
        content = self.extract_detail(jobid)
        if 0 == jobid['start']:
            if content is None:
                return
            m = re.search(ur'共 (\d+) 页', content)
            if not m:
                return
            page_cnt = int(m.group(1))
            if page_cnt <= 1:
                return
            for p in range(1, page_cnt):
                job = copy.deepcopy(jobid)
                job['start'] = p * 10
                self.add_job(job)

    def get_jobid(self, jobid):
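        # Builds the page-store key, e.g. '10001/15/5/1/1/0/080901/0'
        # (yxdm/years/kldm/bkcc/wclx/start/zydm/page index; values illustrative).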
        return '%s/%s/%s/%s/%s/%s/%s/%s' % (
            jobid['yxdm'], jobid['years'], jobid['kldm'],
            jobid['bkcc'], jobid['wclx'], jobid['start'], jobid['zydm'],
            int(jobid['start']) / 10)

    def extract_detail(self, jobid):
        logging.info('parsing special %s,%s', jobid['zymc'], jobid['zydm'])
        detail_url = self.detail_url_format % (jobid['years'], jobid['yxdm'],
                                               jobid['zydm'], jobid['start'])
        detail_content = self.request_url(detail_url)
        if not detail_content or not detail_content.text:
            logging.error('fail to fetch %s', detail_url)
            self.info_saver.append('detail failed:%s,%s' %
                                   (str(jobid), detail_url))
            return
        jtitle = '%s/%s/%s/%s/%s/%s' % (jobid['yxdm'], jobid['years'],
                                        jobid['kldm'], jobid['bkcc'],
                                        jobid['wclx'], jobid['start'])
        self.pagestore.save(
            int(time.time()),
            '%s/%s/%s' % (jtitle, jobid['zydm'], int(jobid['start']) / 10),
            detail_url, detail_content.text)
        return detail_content.text

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'],
                                 '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass