Example #1
def pre_save(self, saver):
    saver.add(GkChsiParser.title)
    # secondary saver for the score-rank table; header columns are
    # province/city, subject category, level, rank, score
    s2 = LinkSaver('res_score_%s' % self.name, 'w')
    s2.add('省市,科类,层次,位次,分数')
    for r in self.score_rank:
        s2.add(r)
    s2.flush()
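
Every example on this page leans on the same small LinkSaver surface: a constructor that takes a file name plus an optional mode or buffer_size, add() to queue one record, flush() to write the queue out, and readlines() to read an existing file back (Example #10 uses it). The class itself is not shown here, so the sketch below is only a minimal stand-in inferred from those call sites, not the real implementation.

# Minimal stand-in for LinkSaver, inferred from how the examples call it.
# The real class in the spider codebase may behave differently.
class LinkSaver(object):
    def __init__(self, fn, mode='a', buffer_size=100):
        self.fn = fn
        self.mode = mode
        self.buffer_size = buffer_size
        self._lines = []

    def add(self, line):
        # queue one record; write out once the buffer fills up
        self._lines.append(str(line))
        if len(self._lines) >= self.buffer_size:
            self.flush()

    def flush(self):
        # write queued records to the target file, one per line
        if not self._lines:
            return
        with open(self.fn, self.mode) as f:
            f.write('\n'.join(self._lines) + '\n')
        self._lines = []
        if self.mode == 'w':
            self.mode = 'a'  # later flushes append instead of truncating

    def readlines(self):
        # read back whatever is already on disk
        try:
            with open(self.fn) as f:
                return f.readlines()
        except IOError:
            return []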
Example #2
class ShanghaiStoreFilter(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'shanghai_court')
        self.pagestore = ShanghaiCourtStore('sh_court_2')
        self.link_saver = LinkSaver('wrong.id.txt')

    def process_child_item(self, item):
        self.pagestore.save(int(item['crawlerUpdateTime'] / 1000),
                            item['indexUrl'][17:], item['realUrl'],
                            item['content'][1])

    def parse_item(self, page):
        # ids whose character at offset 17 is '/' are treated as malformed:
        # they are logged to wrong.id.txt and the page is skipped
        if page['indexUrl'][17] != '/':
            return [page]
        self.link_saver.add(page['indexUrl'][17:])
        return []

    def on_finish(self):
        self.link_saver.flush()

    def on_save(self, items):
        for item in items:
            self.pagestore.save(int(item['crawlerUpdateTime'] / 1000),
                                item['indexUrl'][17:], item['realUrl'],
                                item['content'][1])
Example #3
class FileAbstractParser(CAPParser):
    def __init__(self,
                 channel,
                 name,
                 saver_name=None,
                 db='admin',
                 url='mongodb://*****:*****@localhost/'):
        CAPParser.__init__(self, channel, name, db, url)
        if saver_name is None:
            self._save_name = 'out.csv'
        else:
            self._save_name = saver_name
        self.saver = None

    def init(self):
        self.saver = LinkSaver(self._save_name, 'w')
        self.pre_save(self.saver)
        return CAPParser.init(self)

    def parse(self, page):
        pass

    def pre_save(self, saver):
        pass

    def save(self, saver, page):
        pass

    def on_save(self, items):
        item_list = spider.util.unique_list(items)
        for item in item_list:
            self.save(self.saver, item)
        self.saver.flush()
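
FileAbstractParser is a template: init() opens the LinkSaver and calls pre_save once, and on_save deduplicates the incoming items and hands each one to save. A concrete parser only fills in the hooks, which is exactly what Example #1's pre_save does. As a rough illustration only, a hypothetical subclass could look like the following; the indexUrl/realUrl field names are borrowed from Example #2 and are an assumption, not part of this base class.

# Hypothetical subclass of FileAbstractParser, for illustration only.
class UrlCsvParser(FileAbstractParser):
    def pre_save(self, saver):
        # write the CSV header once, before any items arrive
        saver.add('index_url,real_url')

    def save(self, saver, page):
        # one CSV row per deduplicated item
        saver.add('%s,%s' % (page['indexUrl'], page['realUrl']))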
Example #4
class SeedParser(WenshuSpider):
    date_format = '%Y%m%d'

    def __init__(self, thcnt=4, page=15):
        WenshuSpider.__init__(self, thcnt)
        self.source = WenshuSeedDb('ws_seed')
        self.link_saver = LinkSaver('seeds.dat', buffer_size=400)
        self.page = page

    def dispatch(self):
        seeds = self.source.export_seeds()

        print 'load %d seeds' % len(seeds)
        for seed in seeds:
            date = seed['indexUrl'].split('://')[1]
            # the stored content is the repr of a tuple/list; strip the outer
            # quotes, unescape the inner ones, and evaluate it back
            eval_str = seed['content'][1:-1].replace('\\"', '"')
            res = eval(eval_str)
            try:
                if isinstance(res, (tuple, list)) and len(res) > 0:
                    self.add_main_job({
                        'type': 'main',
                        'date': date.encode('utf-8'),
                        'count': int(res[0]['Count'])
                    })
                else:
                    print 'invalid seed', seed
            except KeyError as e:
                Log.error('KeyError %s' % e.message)
                traceback.print_exc()
                print seed
                print eval_str
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def run_job(self, jobid):
        # integer arithmetic: round the result count to the nearest number of pages
        pagecnt = (jobid['count'] + self.page / 2) / self.page
        for index in range(1, pagecnt + 1):
            self.link_saver.add(
                str({
                    'date': jobid['date'],
                    'count': jobid['count'],
                    'index': index,
                    'page': self.page
                }))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.link_saver.flush()
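
Each line that run_job writes to seeds.dat is simply the str() of a job dict, so whatever consumes the file has to evaluate the line back into a dict; Example #9 does exactly that for its own seed file with eval. If the file cannot be fully trusted, ast.literal_eval is the safer drop-in, as in this hypothetical reader sketch:

import ast

# Hypothetical reader for seeds.dat; each non-empty line is the repr of a job dict.
def load_seed_jobs(path='seeds.dat'):
    jobs = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                # literal_eval only accepts Python literals, unlike eval
                jobs.append(ast.literal_eval(line))
    return jobs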
Example #5
def on_finish(self):
    FileAbstractParser.on_finish(self)
    unfetch_saver = LinkSaver('unfetched_seeds_detail_' + self.channel)
    self.unfetch_list = spider.util.unique_list(self.unfetch_list)
    self.fetched_list = spider.util.unique_list(self.fetched_list)
    unfetched = []
    for link in self.unfetch_list:
        if link not in self.fetched_list:
            unfetched.append(link)
    self.unfetch_list = unfetched
    for link in self.unfetch_list:
        unfetch_saver.add(link)
    unfetch_saver.flush()
    fetchsaver = LinkSaver('fetched_seeds_detail_' + self.channel)
    for l in self.fetched_list:
        fetchsaver.add(str(l))
    fetchsaver.flush()
    print 'fetched jobs', len(self.fetched_list)
    print 'unfetched jobs', len(self.unfetch_list)
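
The filtering step above tests every unfetched link against fetched_list, which is a full list scan per link. If the links are hashable (plain strings, say), the same filter runs in linear time with a set; the original keeps the list scan, possibly because the entries are not guaranteed to be hashable. A hedged alternative, written as a small helper under that assumption:

# Assumes the links are hashable (e.g. plain strings); illustration only.
def filter_unfetched(unfetch_list, fetched_list):
    fetched = set(fetched_list)
    return [link for link in unfetch_list if link not in fetched]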
Example #6
class ShanghaiExtractor(CWPParser):
    """解析文书案号"""
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'court')
        self.an_saver = LinkSaver('ah.%s.txt' % self.name)

    def process_child_item(self, item):
        line = '%s|%s' % (item[0], item[1])
        print line
        self.an_saver.add(line)

    def init(self):
        print 'job start at', datetime.datetime.now()
        return CWPParser.init(self)

    def parse_item(self, page):
        # a case number looks like "(2016)...1234号": a four-digit year followed by
        # arbitrary text and a trailing number ending in 号
        m = re.search(r'((\d{4}).*\d+号)', page['content'][1])
        if m:
            return [[m.group(1), page['indexUrl'][17:].encode()]]
        return []

    def on_finish(self):
        self.an_saver.flush()
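
The regex in parse_item grabs a case number: a four-digit year, then anything up to a trailing number ending in 号. A quick standalone check against a made-up case number (the sample value below is purely illustrative, not real data):

# -*- coding: utf-8 -*-
import re

sample = u'（2016）沪0101民初1234号'  # illustrative only
m = re.search(u'((\d{4}).*\d+号)', sample)
assert m.group(1) == u'2016）沪0101民初1234号'  # everything from the year through 号
assert m.group(2) == u'2016'                    # the four-digit year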
Example #7
class ChsiSpider(BaseGkChsiFsxSpider):
    def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, sleep_max=5,
                 ua='firefox', seeds='detail_seeds', recover=False, year='15', bkccs=None, kldms=None,
                 job_tag='', spider_type='detail', post_kldms=True):
        super(ChsiSpider, self).__init__(threadcnt, account, '%s%s' % (tag, job_tag), proxy, sleep, captcha_limit,
                                         sleep_max,
                                         ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.pagestore = self.new_page_store(spider_type, tag)
        self.full_tag = tag
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.kldms = kldms
        self.bkccs = bkccs
        self.recover = recover
        self.info_saver = LinkSaver('info_data_%s_%s%s' % (spider_type, tag, job_tag))
        self.failed_saver = LinkSaver('%s.failed.seeds.%s%s' % (spider_type, tag, job_tag))
        self.invalid_saver = LinkSaver('%s.invalid.seeds.%s%s' % (spider_type, tag, job_tag))
        self.year = year
        self.failed_list = []
        self.invalid_list = []
        self.spider_type = spider_type
        self.post_kldms = post_kldms

    def dispatch(self):
        # read all seeds
        seeds = []
        with open(self.seeds, 'r') as f:
            for l in f:
                data = self.parse_seed(l.strip())
                if not data:
                    continue
                if self.year == str(data['years']):
                    if not self.recover or not self.pagestore.find_any(
                                            self.pagestore.channel + '://' + self.get_job_id(data)):
                        seeds.append(data)
        print 'load ', len(seeds), 'jobs'
        count = 10
        while len(seeds) > 0 and count > 0:
            count -= 1
            logging.info('remain tries %d', count)
            for kldm in self.kldms:
                for bkcc in self.bkccs:
                    seeds = self.request_list(seeds, kldm, bkcc)
                    logging.info('seeds %d,failed %d,kldm=%s,bkcc=%s,tries=%d', len(seeds), len(self.failed_list), kldm,
                                 bkcc, count)
                    time.sleep(2)
                    self.wait_q()
            seeds += self.failed_list
            self.failed_list = []
        self.wait_q()
        self.add_job(None)
        self.failed_list = seeds

    def handle_job(self, jobid):
        pass

    def re_add_failed_job(self, jobid):
        if jobid.has_key('content'):
            jobid.pop('content')
        if jobid.has_key('url'):
            jobid.pop('url')
        cnt = jobid.get('_failed_cnt_', 0) + 1
        jobid['_failed_cnt_'] = cnt
        self.failed_list.append(jobid)

    def save_invalid_job(self, jobid):
        cnt = jobid.get('_invalid_cnt_', 0) + 1
        jobid['_invalid_cnt_'] = cnt
        if cnt < 2:
            self.re_add_failed_job(jobid)
        else:
            if jobid.has_key('content'):
                jobid.pop('content')
            if jobid.has_key('url'):
                jobid.pop('url')
            self.invalid_list.append(jobid)

    def request_list(self, seeds, kldm, bkcc):
        remains = []
        if self.post_kldms:
            self.post_kldm_bkcc_for_session(kldm, bkcc)
            for seed in seeds:
                if seed['kldm'] == kldm and bkcc == seed['bkcc']:
                    self.add_main_job(seed)
                else:
                    remains.append(seed)
        else:
            for seed in seeds:
                self.add_main_job(seed)
        return remains

    def run_job(self, jobid):
        if self.pre_job(jobid):
            return
        if not jobid.has_key('content'):
            self.re_add_failed_job(jobid)
            return
        detail_content = jobid['content']
        if detail_content is None:
            self.re_add_failed_job(jobid)
            return
        try:
            if self._check_result(detail_content.text, jobid, jobid['url']):
                # an exception was found and already handled
                return
        except InvalidQueryError as e:
            logging.info(e.message)
            self.save_invalid_job(jobid)
            return
        except Exception as e:
            logging.info(e.message)
            self.re_add_failed_job(jobid)
            return
        if not jobid.has_key('url'):
            print jobid
            self.re_add_failed_job(jobid)
            return
        jid = self.get_job_id(jobid)
        print 'saving %s==>%s' % (jid, len(detail_content.text))
        self.pagestore.save(int(time.time()), jid, jobid['url'], detail_content.text)

    def get_job_title(self, jobid):
        raise NotImplementedError('Virtual method called')

    def new_page_store(self, spider, tag):
        raise NotImplementedError('Virtual method called')

    def get_job_id(self, jobid):
        raise NotImplementedError('Virtual method called')

    def parse_page(self, jobid, content):
        raise NotImplementedError('Virtual method called')

    def get_url(self, jobid):
        raise NotImplementedError('Virtual method called')

    def report_job(self, jobid):
        raise NotImplementedError('Virtual method called')

    def add_job(self, jobid, mainjob=False):
        if jobid is None:
            super(ChsiSpider, self).add_job(jobid, mainjob)
            return
        url = self.get_url(jobid)
        count = 3
        content = None
        while count > 0 and not content:
            content = self.request_content(jobid, url)
            count -= 1
        if content is None:
            self.re_add_failed_job(jobid)
            return
        jobid['content'] = content
        jobid['url'] = url
        self.report_job(jobid)
        super(ChsiSpider, self).add_job(jobid, mainjob)
        self.parse_page(jobid, content)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += 'seeds: %s\n' % self.seeds
            msg += "saved: %d\n" % self.pagestore.saved_count
            msg += 'captcha times: %s\n' % self._captcha_times
            msg += 'remain seeds: %d\n' % len(self.failed_list)
            msg += 'invalid seeds: %d\n' % len(self.invalid_list)
            for item in self.except_state:
                msg += '%s: %d\n' % (item.name(), item.count())
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
            print 'remain seeds', len(self.failed_list)
            print 'invalid seeds', len(self.invalid_list)
            for seed in self.invalid_list:
                self.invalid_saver.add(str(seed))
            self.invalid_saver.flush()
            for seed in self.failed_list:
                self.failed_saver.add(str(seed))
            self.failed_saver.flush()
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass

    def parse_seed(self, param):
        raise NotImplementedError('Virtual method called')

    def request_content(self, jobid, url):
        raise NotImplementedError('Virtual method called')
Example #8
class BjListSpider(BJSpider):
    def __init__(self,
                 threadcnt,
                 last_page=None,
                 total_page=22305,
                 save_file='seeds.dat',
                 sleep=0.0,
                 proxy_life=180):
        super(BjListSpider, self).__init__(threadcnt,
                                           'BjListSpider',
                                           proxy_life=proxy_life)

        self.test_mode = False
        self.sleep = sleep
        self.zero_link_count = 0
        self.lock = threading.Lock()
        self._shutdown = False
        self.result_saver = LinkSaver(save_file, 'a')
        self.captcha = FoodMakerExtendLock(threadcnt - 1)
        self.last_page = last_page
        self.total_page = total_page

    def dispatch(self):
        if self.last_page is not None and self.last_page <= self.total_page:
            for page in range(self.last_page, self.total_page + 1):
                self.add_main_job({
                    'type': 'list',
                    'url': 'http://www.bjcourt.gov.cn/cpws/index.htm?page=%s' % page
                })
        else:
            self.add_main_job({
                'type': 'main',
                'url': 'http://www.bjcourt.gov.cn/cpws/index.htm'
            })
        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def with_sleep_request_url(self, url, **kwargs):
        time.sleep(self.sleep)
        return self.request_url(url, **kwargs)

    def _dec_worker(self):
        self.captcha.decrease()
        super(BjListSpider, self)._dec_worker()

    def run_job(self, jobid):
        if not isinstance(jobid, dict):
            return
        if self._shutdown:
            return
        jt = jobid['type']
        url = jobid['url']
        time.sleep(2)

        con = self.with_sleep_request_url(url, timeout=10)
        if self.check_exception(con, jobid):
            return
        m = re.search('yzmInput', con.text)
        if m:
            print self.get_tid(), url, ' need captcha'
            con = self.resolve_captcha(url)
            if self.check_exception(con, jobid):
                return
            if re.search(r'yzmInput', con.text):
                self._shutdown = True
                self.link_saver.add('%d,%d,%s' % (2, 0, url))
                return

        if 'main' == jt:
            # "您搜到了 <em>N</em> 条符合条件的文书": N is the total number of matching documents
            m = re.search(ur'您搜到了\s*<em>([0-9]+)</em>\s*条符合条件的文书', con.text, re.S)
            if not m:
                if re.search(r'yzmInput', con.text):
                    self._shutdown = True
                self.link_saver.add('%d,%d,%s' % (2, 0, url))
                return
            papercnt = int(m.group(1))
            if papercnt <= 0:
                print 'no documents found on', url
                with self.lock:
                    self.zero_link_count += 1
                return
            print 'there are %d papers on %s' % (papercnt, url)
            self.link_saver.add('%d,%d,%s' % (1, papercnt, url))
            n_url = url
            if n_url.find('?') < 0:
                n_url += '?'
            elif n_url[-1] != '&':
                n_url += '&'
            # queue the remaining list pages (roughly papercnt/20 of them) from
            # last to second; the first page is the one just fetched
            for page in range((papercnt + 10) / 20 + 1, 1, -1):
                self.add_job({'type': 'list', 'url': n_url + 'page=%s' % page})

        ids = re.findall(r'\/cpws\/paperView.htm\?id=(\d+)', con.text)
        if not ids:
            print 'cannot find any paper on', url
            return
        print 'add %d papers from %s' % (len(ids), url)
        for id in ids:
            self.result_saver.add(id)

    def split_url(self, url):
        urls = CData.split_param(url)
        for u in urls:
            self.add_job({'type': 'main', 'url': u})

    def event_handler(self, evt, msg, **kwargs):
        super(BjListSpider, self).event_handler(evt, msg, **kwargs)
        if evt == 'DONE':
            self.result_saver.flush()
            msg += 'zero count: %d\n' % self.zero_link_count
            msg += 'captcha times: %d\n' % self.captcha_times
            spider.util.sendmail(['*****@*****.**'],
                                 '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
Example #9
class BaseChsiSpider(BaseGkChsiFsxSpider):
    def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, seeds='detail_seeds',
                 recover=False, sleep_max=5, ua='firefox', year='15', bkccs=None, kldms=None, job_tag=''):
        super(BaseChsiSpider, self).__init__(threadcnt, account, tag, proxy, sleep, captcha_limit, sleep_max,
                                             ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + tag)
        self.full_tag = tag
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.kldms = kldms
        self.bkccs = bkccs
        self.recover = recover
        self.parser = HTMLParser.HTMLParser()
        self.info_saver = LinkSaver(tag + '_detail_data')
        self.failed_saver = LinkSaver('detail.failed.seeds.' + tag + job_tag)
        self.year = year
        self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
        self.failed_list = []
        self.last_request_time = time.time()


    def dispatch(self):
        # read all seeds
        seeds = []
        with open(self.seeds, 'r') as f:
            for l in f:
                if l[0] == '{':
                    data = eval(l.strip())
                else:
                    param = l.strip().split(',')
                    # a CSV seed needs nine comma-separated fields (zymc is the ninth)
                    if len(param) < 9:
                        logging.warn('invalid seeds %s', l)
                        continue
                    data = {'wclx': 1, 'yxdm': param[6], 'kldm': param[2], 'bkcc': param[4], 'start': 0,
                            'years': param[5], 'zydm': param[7],
                            'zymc': param[8]}  # already a utf-8 byte string from the file
                if self.year == data['years'] and not self.pagestore.find_any(
                                        self.pagestore.channel + '://' + self.get_jobid(data)):
                    seeds.append(data)
        print 'load ', len(seeds), 'jobs'
        count = 10
        while len(seeds) > 0 and count > 0:
            count -= 1
            logging.info('remain tries %d', count)
            for kldm in self.kldms:
                for bkcc in self.bkccs:
                    seeds = self.request_list(seeds, kldm, bkcc)
                    logging.info('seeds %d,failed %d,kldm=%s,bkcc=%s', len(seeds), len(self.failed_list), kldm, bkcc)
                    seeds += self.failed_list
                    self.failed_list = []
        time.sleep(2)
        self.wait_q()
        self.add_job(None)
        print 'remain seeds', len(seeds)
        for seed in seeds:
            self.failed_saver.add(seed)
        self.failed_saver.flush()
        self.failed_list = seeds

    def handle_job(self, jobid):
        pass


    def request_list(self, seeds, kldm, bkcc):
        self.post_kldm_bkcc_for_session(kldm, bkcc)
        remains = []
        for seed in seeds:
            if seed['kldm'] == kldm and bkcc == seed['bkcc']:
                self.add_main_job(seed)
            else:
                remains.append(seed)
        return remains

    def run_job(self, jobid):
        if not jobid.has_key('content'):
            if jobid not in self.failed_list:
                self.failed_list.append(jobid)
            return
        detail_content = jobid['content']
        jtitle = '%s/%s/%s/%s/%s/%s' % (
            jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'], jobid['wclx'],
            jobid['start'])
        self.pagestore.save(int(time.time()), '%s/%s/%s' % (jtitle, jobid['zydm'], int(jobid['start']) / 10),
                            jobid['url'], detail_content.text)

    def add_job(self, jobid, mainjob=False):
        if jobid is None:
            super(BaseChsiSpider, self).add_job(jobid)
            return
        logging.info('fetching special %s,%s', jobid['zymc'], jobid['zydm'])
        detail_url = self.detail_url_format % (jobid['years'], jobid['yxdm'], jobid['zydm'], jobid['start'])
        content = self.fetch_content(jobid, detail_url)
        if content is None:
            # the exception was already handled
            return
        jobid['content'] = content
        jobid['url'] = detail_url
        super(BaseChsiSpider, self).add_job(jobid, True)
        if 0 == jobid['start']:
            # "共 N 页" on the first list page gives the total page count
            m = re.search(ur'共 (\d+) 页', content.text)
            if not m:
                logging.warn('failed to find page count %s,%s,%s', jobid['kldm'], jobid['bkcc'], detail_url)
                return
            page_cnt = int(m.group(1))
            if page_cnt <= 1:
                return
            for p in range(1, page_cnt):
                job = copy.deepcopy(jobid)
                job['start'] = p * 10
                self.add_main_job(job)

    def get_jobid(self, jobid):
        return '%s/%s/%s/%s/%s/%s/%s/%s' % (
            jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'], jobid['wclx'],
            jobid['start'], jobid['zydm'], int(jobid['start']) / 10)

    def fetch_content(self, jobid, detail_url):
        detail_content = self.request_url(detail_url, allow_redirects=20)
        if detail_content is None:
            self.failed_list.append(jobid)
            return
        try:
            if not self._check_result(detail_content.text, jobid, detail_url):
                self.failed_list.append(jobid)
            else:
                return detail_content
        except Exception as e:
            logging.info(e.message)
            self.failed_list.append(jobid)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += 'seeds: %s\n' % self.seeds
            msg += "saved: %d\n" % self.pagestore.saved_count
            msg += 'captcha times: %s\n' % self._captcha_times
            msg += 'remain seeds: %d\n' % len(self.failed_list)
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
Example #10
class PatentAbstractSpider(ZhuanliBaseSpider, Main):
    """专利摘要爬虫"""

    def __init__(self, thcnt, mode='id', recover=True, seeds='seed.dat'):
        ZhuanliBaseSpider.__init__(self, thcnt, recover, timeout=90)
        Main.__init__(self)
        self.short_tag = 't:m:s:r:o:h:v:'
        self.tags = ['recover=', 'threads=', 'mode=', 'seeds=', 'output=']
        self.seeds = seeds
        self.page_size = 20  # allowed page sizes: 3, 10 or 20
        self.pagestore = PatentAbstractStore('abstract')
        self.failed_saver = FailedJobSaver('failed_job.txt')
        self.seed_saver = LinkSaver('seed.year.txt', 'a+')
        self.job_log = LinkSaver('abstract.%s.log' % mode, 'a+')
        self.mode = mode
        self.__version = '1.0.0'
        self.utils = threading.local()
        self.sp_errors = OrderedDict()
        self.pre_save_count = 0
        self.properties = PropertiesManager()
        self.can_load_seed = True

    def output(self, args):
        print '_patent_spider.py: %s' % args

    def version(self):
        print '_patent_spider.py %s' % self.__version

    def usage(self):
        print '_patent_spider.py usage:'
        print '-h, --help: print help message.'
        print '-v, --version: print script version'
        print '-o, --output: input an output verb'
        print '-t, --threads: thread count '
        print '-m, --mode: mode,if not id then will be abstract mode'
        print '-r, --recover: recover,1 or True for recover mode'
        print '-s, --seeds: seeds file'

    def _set_proxy(self, kwargs, selproxy):
        super(PatentAbstractSpider, self)._set_proxy(kwargs, selproxy)
        setattr(self.utils, 'proxy', selproxy)

    def handle(self, opts):
        for o, a in opts:
            if o in ('-h', '--help'):
                self.usage()
                sys.exit(1)
            elif o in ('-v', '--version'):
                self.version()
                sys.exit(0)
            elif o in ('-o', '--output'):
                self.output(a)
                sys.exit(0)
            elif o in ('-t', '--threads'):
                self.thread_count = int(a)
            elif o in ('-m', '--mode'):
                self.mode = a
            elif o in ('-s', '--seeds'):
                self.seeds = a
            elif o in ('-r', '--recover'):
                self.recover = a in ('1', 'True')
            else:
                print 'unhandled option'
                sys.exit(3)
        if self.mode != 'id':
            self.mode = 'abs'
        if self.mode != 'id' and not os.path.exists(self.seeds):
            print 'seed file %s not exists' % self.seeds
            sys.exit(1)
        count = 3
        while count > 0:
            self.sp_proxies = OrderedDict()
            if self.mode == 'id':
                # self.set_proxy('183.111.169.203:8080', len(job.sp_proxies))
                self.set_proxy('192.168.1.39:3428:ipin:helloipin', len(self.sp_proxies))
            else:
                proxies = KuaidailiProxyManager.load_proxy(100)
                print 'load %d proxies from kuaidaili' % proxies['data']['count']
                if proxies['data']['count'] > 0:
                    self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0)
            # proxies = KuaidailiProxyManager.load_proxy(50)
            # print 'load %d proxies from kuaidaili' % proxies['data']['count']
            # if proxies['data']['count'] > 0:
            #     self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0)
            self.run()
            count -= 1

    def load_proxy(self, fn, index=-1, auto_change=True):
        super(PatentAbstractSpider, self).load_proxy(fn, index, auto_change)
        with self.locker:
            self.sp_errors.clear()
            for proxy in self.sp_proxies.iterkeys():
                self.sp_proxies[proxy] = 0

    def set_proxy(self, prs, index=-1, auto_change=True):
        with self.locker:
            if isinstance(prs, list):
                for p in prs:
                    self.sp_errors[p] = 0
            elif isinstance(prs, str) or isinstance(prs, unicode):
                self.sp_errors[prs] = 0
        super(PatentAbstractSpider, self).set_proxy(prs, index, auto_change)

    @staticmethod
    def gen_list_seed():
        now = datetime.now()
        this_year = int(now.strftime('%Y'))
        this_month = int(now.strftime('%m'))
        types = ['fmgb', 'fmsq', 'xxsq', 'wgsq']
        seeds = []
        for year in range(1985, this_year):
            for month in range(1, 13):
                for t in types:
                    seeds.append(
                        {'type': t, 'index': 1, 'time': '%s%s' % (year, (month if month > 9 else '0%s' % month))})
        for month in range(1, this_month):
            for t in types:
                seeds.append(
                    {'type': t, 'index': 1, 'time': '%s%s' % (this_year, (month if month > 9 else '0%s' % month))})
        return seeds

    def load_abstract_seeds(self, seed_file, limit=1000000):
        seeds = []
        last_position = self.properties.get('position', 0)
        f = open(seed_file, 'r')
        count = 0
        f.seek(last_position)
        while count < limit:
            l = f.readline()
            if not l:
                # end of file: no more seeds to read
                self.can_load_seed = False
                break
            res = l.strip().split(',')
            if len(res) < 3:
                print 'invalid seeds:', l
            else:
                seeds.append({'type': res[1], 'id': res[0], 'code': res[2]})
                count += 1
        last_position = f.tell()
        self.properties.set('position', last_position)
        self.properties.save()
        f.close()
        return seeds

    def get_id_seeds(self):
        raw_seeds = self.gen_list_seed()
        rds = self.job_log.readlines()
        # get done jobs
        done_jobs = {}
        for job in rds:
            if '[' == job[0]:
                continue
            js = job.strip().split('-')
            done_jobs['%s-%s' % (js[0], js[1])] = {}
            done_jobs['%s-%s' % (js[0], js[1])]['pages'] = int(js[2])
            done_jobs['%s-%s' % (js[0], js[1])]['current'] = 1
        # load done seeds
        dss = self.seed_saver.readlines()
        for ds in dss:
            sd = ds.strip().split(',')
            if len(sd) < 4:
                print 'invalid seed', ds
                continue
            js = sd[3].split('-')
            sid = '%s-%s' % (js[0], js[1])
            page = int(js[2])
            if done_jobs.has_key(sid) and done_jobs[sid]['current'] < page:
                done_jobs[sid]['current'] = page
        seeds = []
        for seed in raw_seeds:
            sid = seed['time'] + '-' + seed['type']
            if done_jobs.has_key(sid):
                if done_jobs[sid]['pages'] > done_jobs[sid]['current'] > 1:
                    for page in range(done_jobs[sid]['current'] + 1, done_jobs[sid]['pages'] + 1):
                        s = copy.deepcopy(seed)
                        s['index'] = page
                        seeds.append(s)
            else:
                seeds.append(seed)

        logging.info('load %s list seeds', len(seeds))
        return seeds

    def get_abstract_seeds(self, limit=100000):
        rawseeds = self.load_abstract_seeds(self.seeds, limit)
        seeds = []
        for s in rawseeds:
            if not self.recover or not self.pagestore.find_any(self.pagestore.channel + '://' + s['id']):
                seeds.append(s)
                if len(seeds) >= limit:
                    break
        logging.info('load %d abstract seeds', len(seeds))
        return seeds

    def report(self):
        super(PatentAbstractSpider, self).report()
        self.job_log.flush()
        self.seed_saver.flush()
        count = self.pagestore.saved_count - self.pre_save_count
        self.pre_save_count = self.pagestore.saved_count
        print 'save %d doc in this minute' % count

    def dispatch(self):
        self.failed_saver.tag()
        if self.mode == 'id':
            seeds = self.get_id_seeds()
            for seed in seeds:
                self.add_main_job(seed)
        else:
            count = 10
            ever_loaded = False
            while count > 0 and self.can_load_seed:
                seeds = self.get_abstract_seeds()
                if len(seeds) > 0:
                    ever_loaded = True
                    for seed in seeds:
                        self.add_main_job(seed)
                    time.sleep(2)
                    self.wait_q()
                elif ever_loaded:
                    count -= 1
                    time.sleep(100)

        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def extract_seed_id(pub, app, count):
        return '%s-%s/%s-%s/%s' % (
            pub[0], pub[1], app[0] if (app[0] != '-') else '', app[1] if (app[1] != '-') else '', count)

    @staticmethod
    def parse_seed(seed):
        v = seed.split(',')
        if len(v) != 7:
            print 'invalid seed', seed
            return []
        return [[v[1][1:], v[2][:-1]], [v[3][1:], v[4][:-1]], int(v[6])]

    @staticmethod
    def get_query_word(jobid):
        # SIPO query: publication/announcement date ("公开(公告)日") equals the given YYYYMM
        word = '公开(公告)日=%s' % jobid['time']
        return word

    def _on_shutdown(self, jobid):
        self.failed_saver.save('2,%s' % str(jobid))
        return

    def handle_id_job(self, jobid):
        strword = self.get_query_word(jobid)
        url = self.form_query_url(strword, page=jobid['index'], size=self.page_size, selected=jobid['type'], showtype=0)
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            print 'exception encounter', jobid
            return
        if re.search(u'<title>错误页面</title>', con.text):
            # the site returned its generic error page ("错误页面")
            print 'error page', jobid
            if not self.re_add_job(jobid):
                self.failed_saver.save(str(jobid))
            return
        patents = re.findall(r'<a href="javascript:zl_xm\(\'([\d\w]+)\',\'(\w+)\',\'([\w\d]+)\'\);">[\d\w]+</a>',
                             con.text)
        print '[%d]%s-%s-%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'])
        if 0 == len(patents):
            self.job_log.add('[%d]%s-%s-%s,%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'], con.code))
            self.re_add_job(jobid)
            return
        for p in patents:
            if len(p) != 3:
                logging.warn('invalid pattern matched:%s,%s', str(p), str(jobid))
                self.failed_saver.save('1,%s' % str(jobid))
            else:
                self.seed_saver.add(
                    '%s,%s,%s,%s-%s-%d' % (p[0], p[1], p[2], jobid['time'], jobid['type'], jobid['index']))
        if 1 == jobid['index']:
            m = re.search(r'javascript:if\(event.keyCode == 13\) zl_tz\((\d+)\)', con.text)
            if m:
                pagecnt = int(m.group(1))
                print '[%d][%d]%s-%s-%d' % (len(patents), pagecnt, jobid['time'], jobid['type'], jobid['index'])
                self.job_log.add('%s-%s-%s' % (jobid['time'], jobid['type'], pagecnt))
                for page in range(2, pagecnt + 1):
                    job = copy.deepcopy(jobid)
                    job['_failcnt_'] = 0
                    job['index'] = page
                    self.add_job(job)
            else:
                print 'failed to find count[%d]%s-%s-[%d]' % (len(patents), jobid['time'], jobid['type'], 0)
                logging.warn('failed to find page count:%s-%s-%s', jobid['time'], jobid['type'], jobid['index'])

    def handle_abstract_seed(self, jobid):
        # SIPO query: application number ("申请号") equals the seed id, with the code field set to 1
        qword = quote('申请号=\'%s\' and %s=1' % (jobid['id'], jobid['code']))
        url = 'http://epub.sipo.gov.cn/patentdetail.action?strSources=%s&strWhere=%s&strLicenseCode=&pageSize=6&pageNow=1' % (
            jobid['type'], qword)
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            print 'exception encounter', jobid
            return
        if re.search(u'<title>错误页面</title>', con.text):
            # the site returned its generic error page ("错误页面")
            print 'error page', jobid
            if not self.re_add_job(jobid):
                self.failed_saver.save(str(jobid))
            return
        print 'success:%s-%s-%s' % (jobid['id'], jobid['type'], jobid['code'])
        self.pagestore.save(int(time.time()), jobid['id'], url, con.text)

    def run_job(self, jobid):
        if self.check_shutdown(jobid):
            return
        try:
            if self.mode == 'id':
                self.handle_id_job(jobid)
            else:
                self.handle_abstract_seed(jobid)
        except RuntimeError as e:
            if 'no proxy' in e.message:
                self.re_add_job(jobid)
                self.reload_proxy()
                return
            else:
                raise

    def reload_proxy(self):
        prs = {}
        count = 3
        while count > 0:
            if 'id' == self.mode:
                prs = KuaidailiProxyManager.load_proxy(20)
            else:
                prs = KuaidailiProxyManager.load_proxy(100)
            if prs['data']['count'] > 0:
                break
            count -= 1
        if (count <= 0 or 'data' not in prs or 'count' not in prs['data']
                or prs['data']['count'] <= 0):
            self._shutdown()
            logging.error('cannot load any proxy')
            spider.util.sendmail(['*****@*****.**'], 'Proxy Error',
                                 'Cannot load any proxy:%s,%s' % (self._name, self.mode))
            return
        print 'load %d proxies from kuaidaili' % prs['data']['count']
        self.set_proxy(prs['data']['proxy_list'], 15 if (prs['data']['count'] > 15) else 0)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.job_log.flush()
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)

    def proxy_error(self):
        proxy = getattr(self.utils, 'proxy')
        if proxy is not None:
            with self.locker:
                try:
                    if self.sp_errors[proxy] < 5:
                        self.sp_errors[proxy] += 1
                    else:
                        self.sp_proxies.pop(proxy)
                        if len(self.sp_proxies) == 0:
                            self.reload_proxy()
                except KeyError:
                    pass

    def on_proxy_error(self, con, jobid):
        self.proxy_error()
        self.re_add_job(jobid)
        return True

    def on_other_400_exception(self, con, jobid):
        if con.code == 403:
            self.proxy_error()
        self.re_add_job(jobid)
        return True

    def on_other_500_exception(self, con, jobid):
        if 504 == con.code and re.search('proxy', con.text, re.I):
            self.proxy_error()
            self.re_add_job(jobid)
            return True
        else:
            return super(PatentAbstractSpider, self).on_other_500_exception(con, jobid)