Example #1
    def dispatch(self):
        kldms = self.fetch_kldms()
        if len(kldms) == 2:
            self.kldms = kldms
        # data_tmp = {'wclx': 1, 'score': 0, 'bkcc': 1, 'kldm': 1, 'years': 15, 'type': 'score'}
        for kldm in self.kldms:
            self.minscore[str(kldm)] = -1
        if self.recover:
            job_saver = LinkSaver(self.prefix + '_undo_jobs_old', 'r')
            lines = job_saver.readlines()
            job_saver.close()
            for l in lines:
                self.add_main_job(eval(l))
            print 'recovered %d jobs' % len(lines)
        else:
            bkccs = [1, 2]  # undergraduate, junior college
            for kldm in self.kldms:
                for bkcc in bkccs:
                    for score in range(self.highscore, -1, -1):
                        data = {
                            'wclx': 1,
                            'score': score,
                            'bkcc': bkcc,
                            'kldm': kldm,
                            'years': 15
                        }
                        self.add_main_job({'data': data, 'type': 'score'})

        time.sleep(2)
        self.wait_q()
        self.add_job(None)
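
In recover mode the dispatcher rebuilds jobs by calling eval() on every saved line. Since the saved jobs are plain dict literals, ast.literal_eval is a safer drop-in that parses literals without executing arbitrary code. A minimal standalone sketch, assuming each saved line is the repr() of a job dict; load_saved_jobs is a hypothetical helper, not part of the source:

import ast

def load_saved_jobs(path):
    # Each line is assumed to look like:
    # {'data': {'wclx': 1, 'score': 640, 'bkcc': 1, 'kldm': 1, 'years': 15}, 'type': 'score'}
    jobs = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                # literal_eval accepts only Python literals, so a corrupted or
                # tampered line raises an error instead of executing code.
                jobs.append(ast.literal_eval(line))
    return jobs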
Example #2
    def dispatch(self):
        kldms = self.fetch_kldms()
        if len(kldms) == 2:
            self.kldms[0] = str(kldms[0])
            self.kldms[1] = str(kldms[1])
        # data_tmp = {'wclx': 1, 'score': 0, 'bkcc': 1, 'kldm': 1, 'years': 15, 'type': 'score'}
        for kldm in self.kldms:
            self.min_score_arr[str(kldm)] = -1
        if self.recover:
            job_saver = LinkSaver(self.prefix + '_undo_jobs_old', 'r')
            lines = job_saver.readlines()
            job_saver.close()
            for l in lines:
                self.add_main_job(eval(l.strip()))
            print 'recovered %d jobs' % len(lines)
        else:
            bkccs = [1, 2]  # undergraduate, junior college
            mid_score = self.min_score + (self.highscore -
                                          self.min_score) * 3 / 4
            for score in range(
                    0,
                    max(self.highscore - mid_score,
                        mid_score - self.min_score)):
                up_score = mid_score + score
                down_score = mid_score - score - 1
                for kldm in self.kldms:
                    for bkcc in bkccs:
                        if up_score <= self.highscore:
                            data = {
                                'wclx': 1,
                                'score': up_score,
                                'bkcc': bkcc,
                                'kldm': kldm,
                                'years': 15
                            }
                            self.add_main_job({'data': data, 'type': 'score'})
                        if down_score >= self.min_score and down_score > 0:
                            data = {
                                'wclx': 1,
                                'score': down_score,
                                'bkcc': bkcc,
                                'kldm': kldm,
                                'years': 15
                            }
                            self.add_main_job({'data': data, 'type': 'score'})

        time.sleep(2)
        self.wait_q()
        self.add_job(None)
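
Compared with Example #1's strictly descending scan, this dispatcher walks the score range middle-out: it starts three quarters of the way up and alternates one step above and one step below that point, so the denser middle of the range is crawled first. A standalone sketch of the resulting visiting order, with hypothetical bounds min_score=0 and highscore=8 (// mirrors the Python 2 integer division used above):

min_score, highscore = 0, 8
mid_score = min_score + (highscore - min_score) * 3 // 4  # = 6
order = []
for score in range(0, max(highscore - mid_score, mid_score - min_score)):
    up_score = mid_score + score
    down_score = mid_score - score - 1
    if up_score <= highscore:
        order.append(up_score)
    if down_score >= min_score and down_score > 0:
        order.append(down_score)
print(order)  # [6, 5, 7, 4, 8, 3, 2, 1] -- note that score 0 is never enqueued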
Example #3
# Standard-library imports used by this example; project-specific modules
# (ZhuanliBaseSpider, Main, LinkSaver, FailedJobSaver, PatentAbstractStore,
# PropertiesManager, KuaidailiProxyManager, spider.util) are omitted in the
# original listing and assumed importable from the surrounding project.
import copy
import logging
import os
import re
import sys
import threading
import time
from collections import OrderedDict
from datetime import datetime
from urllib import quote  # Python 2

class PatentAbstractSpider(ZhuanliBaseSpider, Main):
    """专利摘要爬虫"""

    def __init__(self, thcnt, mode='id', recover=True, seeds='seed.dat'):
        ZhuanliBaseSpider.__init__(self, thcnt, recover, timeout=90)
        Main.__init__(self)
        self.short_tag = 'hvt:m:s:r:o:'  # -h and -v take no argument
        self.tags = ['help', 'version', 'recover=', 'threads=', 'mode=', 'seeds=', 'output=']
        self.seeds = seeds
        self.page_size = 20  # allowed values: 3, 10, or 20
        self.pagestore = PatentAbstractStore('abstract')
        self.failed_saver = FailedJobSaver('failed_job.txt')
        self.seed_saver = LinkSaver('seed.year.txt', 'a+')
        self.job_log = LinkSaver('abstract.%s.log' % mode, 'a+')
        self.mode = mode
        self.__version = '1.0.0'
        self.utils = threading.local()
        self.sp_errors = OrderedDict()
        self.pre_save_count = 0
        self.properties = PropertiesManager()
        self.can_load_seed = True

    def output(self, args):
        print '_patent_spider.py: %s' % args

    def version(self):
        print '_patent_spider.py %s' % self.__version

    def usage(self):
        print '_patent_spider.py usage:'
        print '-h, --help: print this help message'
        print '-v, --version: print script version'
        print '-o, --output: echo the given message and exit'
        print '-t, --threads: thread count'
        print '-m, --mode: mode; any value other than "id" selects abstract mode'
        print '-r, --recover: 1 or True enables recover mode'
        print '-s, --seeds: seeds file'

    def _set_proxy(self, kwargs, selproxy):
        super(PatentAbstractSpider, self)._set_proxy(kwargs, selproxy)
        setattr(self.utils, 'proxy', selproxy)

    def handle(self, opts):
        for o, a in opts:
            if o in ('-h', '--help'):
                self.usage()
                sys.exit(1)
            elif o in ('-v', '--version'):
                self.version()
                sys.exit(0)
            elif o in ('-o', '--output'):
                self.output(a)
                sys.exit(0)
            elif o in ('-t', '--threads'):
                self.thread_count = int(a)
            elif o in ('-m', '--mode'):
                self.mode = a
            elif o in ('-s', '--seeds'):
                self.seeds = a
            elif o in ('-r', '--recover'):
                self.recover = a in ('1', 'True')
            else:
                print 'unhandled option'
                sys.exit(3)
        if self.mode != 'id':
            self.mode = 'abs'
        if self.mode != 'id' and not os.path.exists(self.seeds):
            print 'seed file %s not exists' % self.seeds
            sys.exit(1)
        count = 3
        while count > 0:
            self.sp_proxies = OrderedDict()
            if self.mode == 'id':
                # self.set_proxy('183.111.169.203:8080', len(self.sp_proxies))
                self.set_proxy('192.168.1.39:3428:ipin:helloipin', len(self.sp_proxies))
            else:
                proxies = KuaidailiProxyManager.load_proxy(100)
                print 'load %d proxies from kuaidaili' % proxies['data']['count']
                if proxies['data']['count'] > 0:
                    self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0)
            self.run()
            count -= 1

    def load_proxy(self, fn, index=-1, auto_change=True):
        super(PatentAbstractSpider, self).load_proxy(fn, index, auto_change)
        with self.locker:
            self.sp_errors.clear()
            for proxy in self.sp_proxies.iterkeys():
                self.sp_errors[proxy] = 0

    def set_proxy(self, prs, index=-1, auto_change=True):
        with self.locker:
            if isinstance(prs, list):
                for p in prs:
                    self.sp_errors[p] = 0
            elif isinstance(prs, (str, unicode)):
                self.sp_errors[prs] = 0
        super(PatentAbstractSpider, self).set_proxy(prs, index, auto_change)

    @staticmethod
    def gen_list_seed():
        now = datetime.now()
        this_year = int(now.strftime('%Y'))
        this_month = int(now.strftime('%m'))
        types = ['fmgb', 'fmsq', 'xxsq', 'wgsq']
        seeds = []
        for year in range(1985, this_year):
            for month in range(1, 13):
                for t in types:
                    seeds.append({'type': t, 'index': 1, 'time': '%d%02d' % (year, month)})
        for month in range(1, this_month):
            for t in types:
                seeds.append({'type': t, 'index': 1, 'time': '%d%02d' % (this_year, month)})
        return seeds
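    # Illustrative note (dates hypothetical, not from the source): run in March
    # 2016, gen_list_seed would yield one seed per (type, month) from 1985-01
    # through 2016-02, e.g. {'type': 'fmgb', 'index': 1, 'time': '198501'};
    # 'index' is the result-list page and always starts at 1 here.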

    def load_abstract_seeds(self, seed_file, limit=1000000):
        seeds = []
        last_position = self.properties.get('position', 0)
        f = open(seed_file, 'r')
        count = 0
        f.seek(last_position)
        while count < limit:
            l = f.readline()
            if not l:
                # end of file; no more seeds can be read
                self.can_load_seed = False
                break
            res = l.strip().split(',')
            if len(res) < 3:
                print 'invalid seeds:', l
            else:
                seeds.append({'type': res[1], 'id': res[0], 'code': res[2]})
                count += 1
        last_position = f.tell()
        self.properties.set('position', last_position)
        self.properties.save()
        f.close()
        return seeds
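    # The seek/tell bookkeeping above makes seed loading resumable: the byte
    # offset of the last fully consumed line is persisted through
    # PropertiesManager under the 'position' key, so a restarted spider picks
    # up reading the seed file where the previous run stopped.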

    def get_id_seeds(self):
        raw_seeds = self.gen_list_seed()
        rds = self.job_log.readlines()
        # get done jobs
        done_jobs = {}
        for job in rds:
            if '[' == job[0]:
                continue
            js = job.strip().split('-')
            done_jobs['%s-%s' % (js[0], js[1])] = {}
            done_jobs['%s-%s' % (js[0], js[1])]['pages'] = int(js[2])
            done_jobs['%s-%s' % (js[0], js[1])]['current'] = 1
        # load done seeds
        dss = self.seed_saver.readlines()
        for ds in dss:
            sd = ds.strip().split(',')
            if len(sd) < 4:
                print 'invalid seed', ds
                continue
            js = sd[3].split('-')
            sid = '%s-%s' % (js[0], js[1])
            page = int(js[2])
            if sid in done_jobs and done_jobs[sid]['current'] < page:
                done_jobs[sid]['current'] = page
        seeds = []
        for seed in raw_seeds:
            sid = seed['time'] + '-' + seed['type']
            if sid in done_jobs:
                if done_jobs[sid]['pages'] > done_jobs[sid]['current'] > 1:
                    for page in range(done_jobs[sid]['current'] + 1, done_jobs[sid]['pages'] + 1):
                        s = copy.deepcopy(seed)
                        s['index'] = page
                        seeds.append(s)
            else:
                seeds.append(seed)

        logging.info('loaded %d list seeds', len(seeds))
        return seeds

    def get_abstract_seeds(self, limit=100000):
        rawseeds = self.load_abstract_seeds(self.seeds, limit)
        seeds = []
        for s in rawseeds:
            if not self.recover or not self.pagestore.find_any(self.pagestore.channel + '://' + s['id']):
                seeds.append(s)
                if len(seeds) >= limit:
                    break
        logging.info('loaded %d abstract seeds', len(seeds))
        return seeds

    def report(self):
        super(PatentAbstractSpider, self).report()
        self.job_log.flush()
        self.seed_saver.flush()
        count = self.pagestore.saved_count - self.pre_save_count
        self.pre_save_count = self.pagestore.saved_count
        print 'saved %d docs in the last minute' % count

    def dispatch(self):
        self.failed_saver.tag()
        if self.mode == 'id':
            seeds = self.get_id_seeds()
            for seed in seeds:
                self.add_main_job(seed)
        else:
            count = 10
            ever_loaded = False
            while count > 0 and self.can_load_seed:
                seeds = self.get_abstract_seeds()
                if len(seeds) > 0:
                    ever_loaded = True
                    for seed in seeds:
                        self.add_main_job(seed)
                    time.sleep(2)
                    self.wait_q()
                elif ever_loaded:
                    count -= 1
                    time.sleep(100)

        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def extract_seed_id(pub, app, count):
        return '%s-%s/%s-%s/%s' % (
            pub[0], pub[1], app[0] if (app[0] != '-') else '', app[1] if (app[1] != '-') else '', count)

    @staticmethod
    def parse_seed(seed):
        v = seed.split(',')
        if len(v) != 7:
            print 'invalid seed', seed
            return []
        return [[v[1][1:], v[2][:-1]], [v[3][1:], v[4][:-1]], int(v[6])]

    @staticmethod
    def get_query_word(jobid):
        word = '公开(公告)日=%s' % jobid['time']  # 公开(公告)日 = publication (announcement) date
        return word

    def _on_shutdown(self, jobid):
        self.failed_saver.save('2,%s' % str(jobid))
        return

    def handle_id_job(self, jobid):
        strword = self.get_query_word(jobid)
        url = self.form_query_url(strword, page=jobid['index'], size=self.page_size, selected=jobid['type'], showtype=0)
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            print 'exception encounter', jobid
            return
        if re.search(u'<title>错误页面</title>', con.text):
            print 'error page', jobid
            if not self.re_add_job(jobid):
                self.failed_saver.save(str(jobid))
            return
        patents = re.findall(r'<a href="javascript:zl_xm\(\'([\d\w]+)\',\'(\w+)\',\'([\w\d]+)\'\);">[\d\w]+</a>',
                             con.text)
        print '[%d]%s-%s-%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'])
        if 0 == len(patents):
            self.job_log.add('[%d]%s-%s-%s,%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'], con.code))
            self.re_add_job(jobid)
            return
        for p in patents:
            if len(p) != 3:
                logging.warn('invalid pattern matched:%s,%s', str(p), str(jobid))
                self.failed_saver.save('1,%s' % str(jobid))
            else:
                self.seed_saver.add(
                    '%s,%s,%s,%s-%s-%d' % (p[0], p[1], p[2], jobid['time'], jobid['type'], jobid['index']))
        if 1 == jobid['index']:
            m = re.search(r'javascript:if\(event.keyCode == 13\) zl_tz\((\d+)\)', con.text)
            if m:
                pagecnt = int(m.group(1))
                print '[%d][%d]%s-%s-%d' % (len(patents), pagecnt, jobid['time'], jobid['type'], jobid['index'])
                self.job_log.add('%s-%s-%s' % (jobid['time'], jobid['type'], pagecnt))
                for page in range(2, pagecnt + 1):
                    job = copy.deepcopy(jobid)
                    job['_failcnt_'] = 0
                    job['index'] = page
                    self.add_job(job)
            else:
                print 'failed to find count[%d]%s-%s-[%d]' % (len(patents), jobid['time'], jobid['type'], 0)
                logging.warn('failed to find page count:%s-%s-%s', jobid['time'], jobid['type'], jobid['index'])
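    # Pagination note: only page 1 of each (time, type) query parses the total
    # page count (the zl_tz(N) handler in the page script) and fans out copies
    # of the job for pages 2..pagecnt; '_failcnt_' is reset so each copy gets a
    # fresh retry budget.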

    def handle_abstract_seed(self, jobid):
        qword = quote('申请号=\'%s\' and %s=1' % (jobid['id'], jobid['code']))  # 申请号 = application number
        url = 'http://epub.sipo.gov.cn/patentdetail.action?strSources=%s&strWhere=%s&strLicenseCode=&pageSize=6&pageNow=1' % (
            jobid['type'], qword)
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            print 'exception encounter', jobid
            return
        if re.search(u'<title>错误页面</title>', con.text):
            print 'error page', jobid
            if not self.re_add_job(jobid):
                self.failed_saver.save(str(jobid))
            return
        print 'success:%s-%s-%s' % (jobid['id'], jobid['type'], jobid['code'])
        self.pagestore.save(int(time.time()), jobid['id'], url, con.text)

    def run_job(self, jobid):
        if self.check_shutdown(jobid):
            return
        try:
            if self.mode == 'id':
                self.handle_id_job(jobid)
            else:
                self.handle_abstract_seed(jobid)
        except RuntimeError as e:
            if 'no proxy' in e.message:
                self.re_add_job(jobid)
                self.reload_proxy()
                return
            else:
                raise

    def reload_proxy(self):
        prs = {}
        count = 3
        while count > 0:
            if 'id' == self.mode:
                prs = KuaidailiProxyManager.load_proxy(20)
            else:
                prs = KuaidailiProxyManager.load_proxy(100)
            if prs['data']['count'] > 0:
                break
            count -= 1
        if (count <= 0 or 'data' not in prs or 'count' not in prs['data']
                or prs['data']['count'] <= 0):
            self._shutdown()
            logging.error('cannot load any proxy')
            spider.util.sendmail(['*****@*****.**'], 'Proxy Error',
                                 'Cannot load any proxy:%s,%s' % (self._name, self.mode))
            return
        print 'load %d proxies from kuaidaili' % prs['data']['count']
        self.set_proxy(prs['data']['proxy_list'], 15 if (prs['data']['count'] > 15) else 0)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.job_log.flush()
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)

    def proxy_error(self):
        proxy = getattr(self.utils, 'proxy', None)
        if proxy is not None:
            with self.locker:
                try:
                    if self.sp_errors[proxy] < 5:
                        self.sp_errors[proxy] += 1
                    else:
                        self.sp_proxies.pop(proxy)
                        if len(self.sp_proxies) == 0:
                            self.reload_proxy()
                except KeyError:
                    pass

    def on_proxy_error(self, con, jobid):
        self.proxy_error()
        self.re_add_job(jobid)
        return True

    def on_other_400_exception(self, con, jobid):
        if con.code == 403:
            self.proxy_error()
        self.re_add_job(jobid)
        return True

    def on_other_500_exception(self, con, jobid):
        if 504 == con.code and re.search('proxy', con.text, re.I):
            self.proxy_error()
            self.re_add_job(jobid)
            return True
        else:
            return super(PatentAbstractSpider, self).on_other_500_exception(con, jobid)
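
The proxy bookkeeping in Example #3 (set_proxy, proxy_error, reload_proxy) reduces to an evict-after-N-errors pattern: count failures per proxy, drop a proxy once it fails too often, and reload the pool when it empties. A minimal standalone sketch of that pattern; ProxyPool and MAX_ERRORS are hypothetical names, and the threshold mirrors the hard-coded 5 above:

import threading
from collections import OrderedDict

MAX_ERRORS = 5  # evict a proxy after this many failures

class ProxyPool(object):
    def __init__(self, proxies):
        self.lock = threading.Lock()
        self.errors = OrderedDict((p, 0) for p in proxies)

    def report_error(self, proxy):
        # Count one failure against a proxy, evicting it past the threshold.
        # Returns True when the pool is empty and needs a reload.
        with self.lock:
            if proxy in self.errors:
                if self.errors[proxy] < MAX_ERRORS:
                    self.errors[proxy] += 1
                else:
                    del self.errors[proxy]
            return len(self.errors) == 0

pool = ProxyPool(['10.0.0.1:8080'])  # hypothetical proxy address
for _ in range(7):
    if pool.report_error('10.0.0.1:8080'):
        print('pool empty; reload proxies')
        break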