class FullTextSeedGen(CWPParser):
    def __init__(self, channel, save='fulltext.seed.txt', db='zhuanli',
                 dburl='mongodb://localhost/zhuanli'):
        CWPParser.__init__(self, channel, channel, db, dburl)
        self.seed_saver = LinkSaver(save)

    def process_child_item(self, item):
        print item
        self.seed_saver.add(item)

    def parse_item(self, page):
        apc = page['indexUrl'].split('://')[1]
        m = re.search(r"d\.strWhere\.value = \"pnm='([\w\d]+)'\";", page['content'][1])
        if m:
            pnm = m.group(1)
        else:
            print 'cannot find patent number:', page['indexUrl']
            return []
        s = re.search(r'd\.strSources\.value = "(\w+)";', page['content'][1])
        if s:
            pt = s.group(1)
        else:
            print 'cannot find patent type:', page['indexUrl']
            return []
        return ['%s-%s-%s' % (pnm, pt, apc)]

    def on_finish(self):
        print '%d links saved' % self.seed_saver.count
def pre_save(self, saver):
    saver.add(GkChsiParser.title)
    s2 = LinkSaver('res_score_%s' % self.name, 'w')
    s2.add('省市,科类,层次,位次,分数')
    for r in self.score_rank:
        s2.add(r)
    s2.flush()
class ShanghaiStoreFilter(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'shanghai_court')
        self.pagestore = ShanghaiCourtStore('sh_court_2')
        self.link_saver = LinkSaver('wrong.id.txt')

    def process_child_item(self, item):
        # indexUrl looks like 'shanghai_court://<docid>'; [17:] strips the channel prefix
        self.pagestore.save(int(item['crawlerUpdateTime'] / 1000),
                            item['indexUrl'][17:], item['realUrl'],
                            item['content'][1])

    def parse_item(self, page):
        if page['indexUrl'][17] != '/':
            return [page]
        self.link_saver.add(page['indexUrl'][17:])
        return []

    def on_finish(self):
        self.link_saver.flush()

    def on_save(self, items):
        for item in items:
            self.pagestore.save(int(item['crawlerUpdateTime'] / 1000),
                                item['indexUrl'][17:], item['realUrl'],
                                item['content'][1])
class ChannelParser:
    def __init__(self, name='failed.txt', mode='a'):
        self.failed_saver = LinkSaver(name, mode)

    @abc.abstractmethod
    def parse(self, jid, content):
        raise NotImplementedError('virtual function called')

    def on_failed(self, message):
        self.failed_saver.add(message)
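All of these examples lean on the same small LinkSaver helper, whose implementation is not shown here. The class below is only a minimal stand-in inferred from how it is called in the snippets (a file name plus optional mode and buffer_size in the constructor, add(), flush(), readlines(), and a count attribute); the real class in the spider framework may well differ.

class LinkSaver(object):
    """Minimal sketch inferred from usage; not the framework's real implementation."""

    def __init__(self, fn, mode='a', buffer_size=100):
        self.fn = fn
        self.count = 0                 # records passed to add(), used e.g. in on_finish()
        self.buffer_size = buffer_size
        self._buf = []
        self._f = open(fn, mode)

    def add(self, record):
        # callers pass strings, dicts and lists alike, so coerce to str
        self._buf.append(str(record))
        self.count += 1
        if len(self._buf) >= self.buffer_size:
            self.flush()

    def flush(self):
        if self._buf:
            self._f.write('\n'.join(self._buf) + '\n')
            self._f.flush()
            self._buf = []

    def readlines(self):
        # PatentAbstractSpider re-reads previously saved records this way
        self.flush()
        with open(self.fn) as f:
            return f.readlines()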
class Extractor(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'fs_court', 'fs')
        self.saver = LinkSaver('seed.txt')

    def process_child_item(self, item):
        self.saver.add(item)
        print '%s saved' % item

    def parse_item(self, page):
        if '页面不存在' in page['content'][1]:
            return [page['realUrl']]
        return []
class SeedParser(WenshuSpider):
    date_format = '%Y%m%d'

    def __init__(self, thcnt=4, page=15):
        WenshuSpider.__init__(self, thcnt)
        self.source = WenshuSeedDb('ws_seed')
        self.link_saver = LinkSaver('seeds.dat', buffer_size=400)
        self.page = page

    def dispatch(self):
        seeds = self.source.export_seeds()
        print 'load %d seeds' % len(seeds)
        for seed in seeds:
            date = seed['indexUrl'].split('://')[1]
            eval_str = seed['content'][1:-1].replace('\\"', '"')
            res = eval(eval_str)
            try:
                if (isinstance(res, tuple) or isinstance(res, list)) and len(res) > 0:
                    self.add_main_job({
                        'type': 'main',
                        'date': date.encode('utf-8'),
                        'count': int(res[0]['Count'])
                    })
                else:
                    print 'invalid seed', seed
            except KeyError as e:
                Log.error('KeyError %s' % e.message)
                traceback.print_exc()
                print seed
                print eval_str
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def run_job(self, jobid):
        pagecnt = (jobid['count'] + self.page / 2) / self.page
        for index in range(1, pagecnt + 1):
            self.link_saver.add(
                str({
                    'date': jobid['date'],
                    'count': jobid['count'],
                    'index': index,
                    'page': self.page
                }))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.link_saver.flush()
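SeedParser.run_job derives the number of list pages with (count + page / 2) / page, i.e. Python 2 integer division that rounds to the nearest page, so a small final page can be dropped (count=16, page=15 gives 1). If that rounding is not intentional, ceiling division covers the partial last page as well. A small illustration, not part of the original code:

def pages_rounded(count, page_size):
    # what the snippet computes: round-to-nearest page count (integer division)
    return (count + page_size // 2) // page_size

def pages_ceiling(count, page_size):
    # alternative that also covers a partially filled last page
    return (count + page_size - 1) // page_size

assert pages_rounded(16, 15) == 1
assert pages_ceiling(16, 15) == 2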
def on_finish(self):
    FileAbstractParser.on_finish(self)
    unfetch_saver = LinkSaver('unfetched_seeds_detail_' + self.channel)
    self.unfetch_list = spider.util.unique_list(self.unfetch_list)
    self.fetched_list = spider.util.unique_list(self.fetched_list)
    unfetched = []
    for link in self.unfetch_list:
        if link not in self.fetched_list:
            unfetched.append(link)
    self.unfetch_list = unfetched
    for link in self.unfetch_list:
        unfetch_saver.add(link)
    unfetch_saver.flush()
    fetchsaver = LinkSaver('fetched_seeds_detail_' + self.channel)
    for l in self.fetched_list:
        fetchsaver.add(str(l))
    fetchsaver.flush()
    print 'fetched jobs', len(self.fetched_list)
    print 'unfetched jobs', len(self.unfetch_list)
def on_finish(self):
    FileAbstractParser.on_finish(self)
    self.detail_seeds = spider.util.unique_list(self.detail_seeds)
    seed_saver = LinkSaver(self.detail_seeds_file, 'w')
    for seed in self.detail_seeds:
        seed_saver.add(str(seed))
    unfetch_saver = LinkSaver('unfetched_seeds_' + self.channel)
    self.unfetch_list = spider.util.unique_list(self.unfetch_list)
    for link in self.unfetch_list:
        if link not in self.fetched_list:
            unfetch_saver.add(str(link))
    fetch_saver = LinkSaver('fetched_seeds_' + self.channel)
    self.fetched_list = spider.util.unique_list(self.fetched_list)
    for link in self.fetched_list:
        fetch_saver.add(str(link))
    print 'fetched', len(self.fetched_list)
    print 'unfetched', len(self.unfetch_list)
    if self.send_mail:
        fname = self._save_name.encode('utf-8')
        os.system("cp '%s' '/tmp/%s'" % (fname, fname))
        send_attach(['*****@*****.**'],
                    '%s专业数据' % self.name.encode('utf-8'),
                    '%s高考专业数据' % self.name.encode('utf-8'),
                    '/tmp/%s' % fname,
                    '%s.csv' % self.name.encode('utf-8'))
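Both on_finish variants above filter unfetched links with `link not in self.fetched_list`, a linear scan over a plain list for every link. When the lists grow large, the same result can be obtained in roughly linear total time with a set. A standalone sketch of that variant, with hypothetical names:

def truly_unfetched(unfetch_list, fetched_list):
    # same result as the membership loop above, but one set build + O(1) lookups
    fetched = set(fetched_list)
    return [link for link in unfetch_list if link not in fetched]

assert truly_unfetched(['a', 'b', 'c'], ['b']) == ['a', 'c']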
class ShanghaiExtractor(CWPParser):
    """Extract case numbers (案号) from Shanghai court documents."""

    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'court')
        self.an_saver = LinkSaver('ah.%s.txt' % self.name)

    def process_child_item(self, item):
        line = '%s|%s' % (item[0], item[1])
        print line
        self.an_saver.add(line)

    def init(self):
        print 'job start at', datetime.datetime.now()
        return CWPParser.init(self)

    def parse_item(self, page):
        m = re.search('((\d{4}).*\d+号)', page['content'][1])
        if m:
            return [[m.group(1), page['indexUrl'][17:].encode()]]
        return []

    def on_finish(self):
        self.an_saver.flush()
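The case-number pattern in ShanghaiExtractor.parse_item grabs everything from the first four-digit year through the trailing '…号'. A quick check of that regex against a made-up document string (only the pattern comes from the snippet; the sample text is hypothetical):

# -*- coding: utf-8 -*-
import re

sample = u'上海市第一中级人民法院 (2015)沪一中民一(民)终字第123号 民事判决书'
m = re.search(ur'((\d{4}).*\d+号)', sample)
assert m is not None
assert m.group(1) == u'2015)沪一中民一(民)终字第123号'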
class ChsiSpider(BaseGkChsiFsxSpider): def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, sleep_max=5, ua='firefox', seeds='detail_seeds', recover=False, year='15', bkccs=None, kldms=None, job_tag='', spider_type='detail', post_kldms=True): super(ChsiSpider, self).__init__(threadcnt, account, '%s%s' % (tag, job_tag), proxy, sleep, captcha_limit, sleep_max, ua) if kldms is None: kldms = ['5', '1'] if bkccs is None: bkccs = ['1', '2'] self.pagestore = self.new_page_store(spider_type, tag) self.full_tag = tag self.seeds = seeds if proxy: self.set_proxy(proxy) self.kldms = kldms self.bkccs = bkccs self.recover = recover self.info_saver = LinkSaver('info_data_%s_%s%s' % (spider_type, tag, job_tag)) self.failed_saver = LinkSaver('%s.failed.seeds.%s%s' % (spider_type, tag, job_tag)) self.invalid_saver = LinkSaver('%s.invalid.seeds.%s%s' % (spider_type, tag, job_tag)) self.year = year self.failed_list = [] self.invalid_list = [] self.spider_type = spider_type self.post_kldms = post_kldms def dispatch(self): # read all seeds seeds = [] with open(self.seeds, 'r') as f: for l in f: data = self.parse_seed(l.strip()) if not data: continue if self.year == str(data['years']): if not self.recover or not self.pagestore.find_any( self.pagestore.channel + '://' + self.get_job_id(data)): seeds.append(data) print 'load ', len(seeds), 'jobs' count = 10 while len(seeds) > 0 and count > 0: count -= 1 logging.info('remain tries %d', count) for kldm in self.kldms: for bkcc in self.bkccs: seeds = self.request_list(seeds, kldm, bkcc) logging.info('seeds %d,failed %d,kldm=%s,bkcc=%s,tries=%d', len(seeds), len(self.failed_list), kldm, bkcc, count) time.sleep(2) self.wait_q() seeds += self.failed_list self.failed_list = [] self.wait_q() self.add_job(None) self.failed_list = seeds def handle_job(self, jobid): pass def re_add_failed_job(self, jobid): if jobid.has_key('content'): jobid.pop('content') if jobid.has_key('url'): jobid.pop('url') cnt = jobid.get('_failed_cnt_', 0) + 1 jobid['_failed_cnt_'] = cnt self.failed_list.append(jobid) def save_invalid_job(self, jobid): cnt = jobid.get('_invalid_cnt_', 0) + 1 jobid['_invalid_cnt_'] = cnt if cnt < 2: self.re_add_failed_job(jobid) else: if jobid.has_key('content'): jobid.pop('content') if jobid.has_key('url'): jobid.pop('url') self.invalid_list.append(jobid) def request_list(self, seeds, kldm, bkcc): remains = [] if self.post_kldms: self.post_kldm_bkcc_for_session(kldm, bkcc) for seed in seeds: if seed['kldm'] == kldm and bkcc == seed['bkcc']: self.add_main_job(seed) else: remains.append(seed) else: for seed in seeds: self.add_main_job(seed) return remains def run_job(self, jobid): if self.pre_job(jobid): return if not jobid.has_key('content'): self.re_add_failed_job(jobid) return detail_content = jobid['content'] if detail_content is None: self.re_add_failed_job(jobid) return try: if self._check_result(detail_content.text, jobid, jobid['url']): '''exception is found and handled''' return except InvalidQueryError as e: logging.info(e.message) self.save_invalid_job(jobid) return except Exception as e: logging.info(e.message) self.re_add_failed_job(jobid) return if not jobid.has_key('url'): print jobid self.re_add_failed_job(jobid) return jid = self.get_job_id(jobid) print 'saving %s==>%s' % (jid, len(detail_content.text)) self.pagestore.save(int(time.time()), jid, jobid['url'], detail_content.text) def get_job_title(self, jobid): raise NotImplementedError('Virtual method called') def new_page_store(self, spider, tag): raise 
NotImplementedError('Virtual method called') def get_job_id(self, jobid): raise NotImplementedError('Virtual method called') def parse_page(self, jobid, content): raise NotImplementedError('Virtual method called') def get_url(self, jobid): raise NotImplementedError('Virtual method called') def report_job(self, jobid): raise NotImplementedError('Virtual method called') def add_job(self, jobid, mainjob=False): if jobid is None: super(ChsiSpider, self).add_job(jobid, mainjob) return url = self.get_url(jobid) count = 3 content = None while count > 0 and not content: content = self.request_content(jobid, url) count -= 1 if content is None: self.re_add_failed_job(jobid) return jobid['content'] = content jobid['url'] = url self.report_job(jobid) super(ChsiSpider, self).add_job(jobid, mainjob) self.parse_page(jobid, content) def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += 'seeds: %s\n' % self.seeds msg += "saved: %d\n" % self.pagestore.saved_count msg += 'captcha times: %s\n' % self._captcha_times msg += 'remain seeds: %d\n' % len(self.failed_list) msg += 'invalid seeds: %d\n' % len(self.invalid_list) for item in self.except_state: msg += '%s: %d\n' % (item.name(), item.count()) spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg) print 'remain seeds', len(self.failed_list) print 'invalid seeds', len(self.invalid_list) for seed in self.invalid_list: self.invalid_saver.add(str(seed)) self.invalid_saver.flush() for seed in self.failed_list: self.failed_saver.add(str(seed)) self.failed_saver.flush() elif evt == 'STARTED': # spider.misc.stacktracer.trace_start('res.trace.html') pass def parse_seed(self, param): raise NotImplementedError('Virtual method called') def request_content(self, jobid, url): raise NotImplementedError('Virtual method called')
class HZCourtSpider(CourtSpider): """杭州市法院爬虫""" def get_page_store(self): return self.pagestore def __init__(self, threadcnt=10, seed_file=None, mode='links', list_file='links', recover=False, test=False): CourtSpider.__init__(self, threadcnt) self._name = 'HangzhouCourt' self.pagestore = HZCourtStore() self.job_spliter = HZSpliter() self._test_mode = test self.pagestore.testmode = test self.list_data = { 'pageno': '1', 'pagesize': '20', 'ajlb': '', 'cbfy': '1300', 'ah': '', 'jarq1': '19700101', 'jarq2': time.strftime('%Y%m%d', time.localtime()), 'key': '' } self.seed_file = seed_file self.page_size = 50 self.mode = mode self.list_file = list_file self.recover = recover self.today = time.strftime('%Y%m%d', time.localtime()) self.link_saver = LinkSaver(self.list_file) def run_job(self, jobid): jt = jobid['type'] if 'paper' == jt: id = jobid['id'] con = self.request_url( 'http://www.zjsfgkw.cn/document/JudgmentDetail/' + id) if con is None or con.text is None: logging.error('failed to request paper %s', str(id)) raise Exception('Failed to request paper %s' % str(id)) else: context = self.extract_content(con.text) m = None if context is not None: m = re.search(r'src="([^"]*)"', context) context2 = None if m is not None: con = self.request_url("http://www.zjsfgkw.cn" + quote(m.group(1).encode('utf-8'))) if con: context2 = con.text else: logging.error('failed to request source paper %s', str(id)) raise Exception('Failed to request source paper %s' % str(id)) else: logging.warn('failed to find source url %s', str(id)) if context2 is not None: self.pagestore.save( int(time.time()), id, 'http://www.zjsfgkw.cn/document/JudgmentDetail/' + id, context2) print id, '=>', len(context2) logging.info('%s==>%d', str(id), len(context2)) else: logging.info('fail to find content for %s', str(id)) print 'fail to find content for:', id return if 'main' == jt: data = copy.deepcopy(self.list_data) data['cbfy'] = jobid['cbfy'] data['pageno'] = jobid['page'] data['pagesize'] = jobid['pagesize'] con = self.request_url(jobid['url'], data=data) if con is None or con.text is None: logging.error('fail to request %s', jobid['url']) raise Exception('response is None %s' % jobid['url']) elif 'list' == jt: if jobid['pageno'] == 0: self.handle_count_and_split(jobid) return con = self.search(pagesize=self.page_size, pageno=jobid['pageno'], jarq1=jobid['jarq1'], jarq2=jobid['jarq2']) if con is None or con.text is None: logging.error('fail to request %s', str(jobid)) raise Exception('response is None %s' % str(jobid)) else: print 'invalid job', jobid return docs = self.extract_paper_url(con.text) if len(docs) == 0: print 'no papers found on %s' % str(jobid) logging.warn('no papers found on %s', str(jobid)) return docs = spider.util.unique_list(docs) logging.info('add %d links from %s', len(docs), str(jobid)) for doc in docs: self.link_saver.add(doc) self.add_job({'type': 'paper', 'id': doc}) def search(self, **kwargs): pageno = kwargs.get('pageno', 1) pagesize = kwargs.get('pagesize', 10) ajlb = kwargs.get('ajlb', '') cbfy = kwargs.get('cbfy', '') ah = kwargs.get('ah', '') jarq1 = kwargs.get('jarq1', '') jarq2 = kwargs.get('jarq2', self.today) key = kwargs.get('key', '') # url = 'http://www.zjsfgkw.cn/document/JudgmentSearch?ajlb=%s&cbfy=%s&ah=%s&key=%s&jarq1=%s&jarq2=%s&pageno=%s&pagesize=%s' % ( # ajlb, cbfy, ah, key, jarq1, jarq2, pageno, pagesize) # return self.request_url(url) return self.request_url( 'http://www.zjsfgkw.cn/document/JudgmentSearch', data={ 'pageno': pageno, 'pagesize': pagesize, 'ajlb': ajlb, 'cbfy': cbfy, 
'ah': ah, 'jarq1': jarq1, 'jarq2': jarq2, 'key': key }) def dispatch(self): if 'links' == self.mode and self.seed_file: with open(self.seed_file, 'r') as f: for l in f: j = eval(l) pagecnt = int(j['count']) / self.page_size + 1 for page in range(1, pagecnt + 1): self.add_main_job({ 'type': 'main', 'url': 'http://www.zjsfgkw.cn/document/JudgmentSearch', 'page': page, 'pagesize': self.page_size, 'cbfy': j['id'] }) elif 'papers' == self.mode: with open(self.seed_file, 'r') as f: ids = [] for l in f: ids.append(l.strip()) if self.recover: tmp = ids ids = [] for i in tmp: if not self.pagestore.find_any(self.pagestore.channel + '://' + i): ids.append(i) for i in ids: self.add_main_job({'type': 'paper', 'id': i}) logging.info('add %d paper links', len(ids)) elif 'update' == self.mode: config = Properties(self.seed_file) config.load() self.add_main_job({ 'type': 'list', 'jarq1': config.get('jarq1'), 'jarq2': config.get('jarq2', self.today), 'pageno': 0, 'level': 0 }) time.sleep(3) self.wait_q() self.add_job(None, True) def need_split(self, context, url): return False def extract_content(self, context): m = re.search(r'<div class="books_detail_header">.*</IFRAME>', context, re.S) if m: return m.group(0) return None def extract_paper_id(self, url): m = re.findall(r'http://www.zjsfgkw.cn/document/JudgmentDetail/(\d+)', url) if m is not None: return m[0] return None def extract_paper_url(self, content): return re.findall(r'DocumentId":(\d+)', content) def add_list_job(self, url, con): pass def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += "Court Spider:%s\n" % self._name msg += "saved: %d\n" % self.pagestore.saved_count spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg) elif evt == 'STARTED': # spider.misc.stacktracer.trace_start('res.trace.html') pass def get_court_by_id(self, id): data = {'courtId': id} con = self.request_url( 'http://www.zjsfgkw.cn/Judges/GetCountByCountId', data=data) print con.text def get_court_paper_count(self, court_id, start_date, end_date): con = self.search(pageno=1, pagesize=1, cbfy=court_id, jarq1=start_date, jarq2=end_date) if con and con.text: res = re.search(r'"total":(\d+)', con.text) if res: print court_id, res.group(1) return int(res.group(1)) else: return -1 return -1 def handle_count_and_split(self, jobid): cnt = self.get_court_paper_count(jobid.get('court', ''), jobid['jarq1'], jobid['jarq2']) pagecnt = (cnt + self.page_size / 2) / self.page_size if pagecnt > 100: splits = date_split(jobid['jarq1'], jobid['jarq2'], '%Y%m%d') if len(splits) == 1: print 'cannot split any more:', jobid return print '[%s,%s],[%s]->%s,%s' % (jobid['jarq1'], jobid['jarq2'], jobid['level'], str( splits[0]), str(splits[1])) for t in splits: job = copy.deepcopy(jobid) job['jarq1'] = t[0] job['jarq2'] = t[1] job['_failcnt_'] = 0 job['level'] += 1 self.add_job(job) return print '[%s,%s][%d]=>%d,%d' % (jobid['jarq1'], jobid['jarq2'], jobid['level'], cnt, pagecnt) for page in range(1, pagecnt + 1): job = copy.deepcopy(jobid) job['pageno'] = page self.add_job(job)
class SeedGenerator(WenshuSpider): """ Wen shu seed generator generator seed format: { file : court.txt, start: 2015-01-01, end: 2015-06-20 } 因为最大页数是25,超过25的服务器不返回数据,所以需要拆分查询,每次查询文数量不能超过20*25=500 """ date_format = '%Y%m%d' def __init__(self, seed, thcnt=4, recover=False): WenshuSpider.__init__(self, thcnt, recover=recover) self.store = WenshuLinkDb('ws_seed') self.seed = seed self.failed_log = LinkSaver('failed_job.txt') def need_split(self, context, url): pass def get_page_store(self): pass def add_list_job(self, url, con): pass def dispatch(self): if self.recover: seeds = [] raw_seeds = self.gen_seeds() fetched = self.export_fetch() for seed in raw_seeds: fetched_arr = fetched.get(seed['key']) if fetched_arr is None: seeds.append(seed) continue unfetched = self.check_unfetched( [self.date2i(seed['start']), self.date2i(seed['end'])], fetched_arr.origin) for u in unfetched: copy_seed = copy.deepcopy(seed) copy_seed['start'] = u[0] copy_seed['end'] = u[1] seeds.append(copy_seed) else: seeds = self.gen_seeds() print 'load %d seeds' % len(seeds) for seed in seeds: self.add_main_job(seed) time.sleep(2) print 'wait for queue' self.wait_q() self.add_job(None) @staticmethod def to_list_seed_id(seed): return '%s/%s/%s' % (seed['key'], seed['start'], seed['end']) def run_job(self, jobid): param = self.seed2param(jobid) url = 'http://wenshu.court.gov.cn/list/list/?sorttype=0&conditions=searchWord+%s+SLFY++法院名称:%s&conditions=searchWord++CPRQ++裁判日期:%s TO %s' % ( jobid['court'], jobid['court'], jobid['start'], jobid['end'], ) con = self.request_results(param, page=1) if self.check_exception(con, jobid): return try: res = eval(eval(con.text)) except NameError as e: print 'NameError', e.message if not self.re_add_job(jobid): self.failed_log.add('0,' + str(jobid)) return if res and len(res) > 0: count = int(res[0]['Count']) if count > 500: print '[%d] %s [%s,%s]==>%d need split' % ( jobid['level'], jobid['court'], jobid['start'], jobid['end'], count) res = date_split(jobid['start'], jobid['end']) if len(res) == 1: print '[%d] %s [%s,%s]==>%d split failed' % ( jobid['level'], jobid['court'], jobid['start'], jobid['end'], count) self.failed_log.add('1,' + str(jobid)) else: self.add_job({ 'level': jobid['level'] + 1, 'court': jobid['court'], 'start': res[0][0], 'end': res[0][1], 'key': jobid['key'] }) self.add_job({ 'level': jobid['level'] + 1, 'court': jobid['court'], 'start': res[1][0], 'end': res[1][1], 'key': jobid['key'] }) else: print '[%d] %s [%s,%s]==>%d ok' % ( jobid['level'], jobid['court'], jobid['start'], jobid['end'], count) self.store.save(url, self.to_list_seed_id(jobid), '%s,%s' % (jobid['court'], count), int(time.time())) else: print 'fail to get content', jobid self.failed_log.add('2,' + str(jobid)) @staticmethod def to_seed(start, end): return start.strftime(SeedGenerator.date_format) + end.strftime( SeedGenerator.date_format) def gen_seeds(self): seeds = [] with open(self.seed['file'], 'r') as f: for l in f: d = eval(l.strip()) seeds.append({ 'court': d['court'], 'key': d['key'], 'start': self.seed['start'], 'end': self.seed['end'], 'level': 0 }) return seeds @staticmethod def get_date_str(year, month=None, day=None): if day is None: day = 1 if month is None: month = 1 if day < 10: ds = '0' + str(day) else: ds = str(day) if month < 10: ms = '0' + str(month) else: ms = str(month) return '%s%s%s' % (year, ms, ds) @staticmethod def get_end_day(year, month): if month > 31 or month < 1: return 0 if 2 == month: if year % 4 == 0 and year % 400 != 0 or year % 400 == 0: return 29 else: return 28 
elif month in [4, 6, 9, 11]: return 30 else: return 31 def export(self, mode='json'): seeds = self.store.export_seeds() sf = open('seed.dat', 'w') sd = {} for s in seeds: sd[s['id'][(len(self.store.channel) + 3):]] = s['content'].split(',') res = [] for k, v in sd.items(): ks = k.split('/') if mode == 'json': res.append( str({ 'court': v[0], 'count': v[1], 'key': ks[0], 'start': ks[1], 'end': ks[2] })) else: l = '%s,%s,%s,%s,%s' % (v[0], ks[0], ks[1], ks[2], v[1]) res.append(l) res = spider.util.unique_list(res) for r in res: print r sf.write(r + '\n') print '%d seeds saved' % len(res) @staticmethod def date_convert(date_str): return datetime.datetime.strptime(date_str, '%Y-%m-%d') RELATIVE_DATE = datetime.datetime.strptime('1970-01-01', '%Y-%m-%d') @staticmethod def date2i(date_str): return (SeedGenerator.date_convert(date_str) - SeedGenerator.RELATIVE_DATE).days def export_fetch(self): fetched = self.store.export_seeds(lambda item: item['indexUrl'][ (len(self.store.channel) + 3):].split('/')) res = {} for num, start, end in fetched: arr = res.get(num) s = self.date2i(start) e = self.date2i(end) if arr: arr.add([s, e]) else: itv = Intervals() itv.add([s, e]) res[num] = itv for r in res.keys(): res[r].check() return res @staticmethod def i2date(num): return (SeedGenerator.RELATIVE_DATE + datetime.timedelta(days=num)).strftime('%Y-%m-%d') @staticmethod def check_unfetched(main, intervals): interval = Intervals() interval.add(main) for itv in intervals: interval.remove(itv) itvs = [] for itv in interval.origin: itvs.append( [SeedGenerator.i2date(itv[0]), SeedGenerator.i2date(itv[1])]) return itvs
class BjListSpider(BJSpider): def __init__(self, threadcnt, last_page=None, total_page=22305, save_file='seeds.dat', sleep=0.0, proxy_life=180): super(BjListSpider, self).__init__(threadcnt, 'BjListSpider', proxy_life=proxy_life) self.test_mode = False self.sleep = sleep self.zero_link_count = 0 self.lock = threading.Lock() self._shutdown = False self.result_saver = LinkSaver(save_file, 'a') self.captcha = FoodMakerExtendLock(threadcnt - 1) self.last_page = last_page self.total_page = total_page def dispatch(self): if self.last_page is not None and self.last_page <= self.total_page: for page in range(self.last_page, self.total_page + 1): self.add_main_job({ 'type': 'list', 'url': 'http://www.bjcourt.gov.cn/cpws/index.htm?page=%s' % page }) else: self.add_main_job({ 'type': 'main', 'url': 'http://www.bjcourt.gov.cn/cpws/index.htm' }) time.sleep(3) self.wait_q() self.add_job(None, True) def with_sleep_request_url(self, url, **kwargs): time.sleep(self.sleep) return self.request_url(url, **kwargs) def _dec_worker(self): self.captcha.decrease() super(BjListSpider, self)._dec_worker() def run_job(self, jobid): if not isinstance(jobid, dict): return if self._shutdown: return jt = jobid['type'] url = jobid['url'] time.sleep(2) con = self.with_sleep_request_url(url, timeout=10) if self.check_exception(con, jobid): return m = re.search('yzmInput', con.text) if m: print self.get_tid(), url, ' need captcha' con = self.resolve_captcha(url) if self.check_exception(con, jobid): return if re.search(r'yzmInput', con.text): self._shutdown = True self.link_saver.add('%d,%d,%s' % (2, 0, url)) return if 'main' == jt: m = re.search(ur'您搜到了\s*<em>([0-9]+)</em>\s*条符合条件的文书', con.text, re.S) if not m: if re.search(r'yzmInput', con.text): self._shutdown = True self.link_saver.add('%d,%d,%s' % (2, 0, url)) return papercnt = int(m.group(1)) if papercnt <= 0: print '哎呀,这里没用文书', url with self.lock: self.zero_link_count += 1 return print 'there are %d papers on %s' % (papercnt, url) self.link_saver.add('%d,%d,%s' % (1, papercnt, url)) n_url = url if n_url.find('?') < 0: n_url += '?' elif n_url[-1] != '&': n_url += '&' for page in range((papercnt + 10) / 20 + 1, 1, -1): self.add_job({'type': 'list', 'url': n_url + 'page=%s' % page}) ids = re.findall(r'\/cpws\/paperView.htm\?id=(\d+)', con.text) if not ids or len(ids) == 0: print 'cannot find any paper on', url return print 'add %d papers from %s' % (len(ids), url) for id in ids: self.result_saver.add(id) def split_url(self, url): urls = CData.split_param(url) for u in urls: self.add_job({'type': 'main', 'url': u}) def event_handler(self, evt, msg, **kwargs): super(BjListSpider, self).event_handler(evt, msg, **kwargs) if evt == 'DONE': self.result_saver.flush() msg += 'zero count: %d\n' % self.zero_link_count msg += 'captcha times: %d\n' % self.captcha_times spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg) elif evt == 'STARTED': # spider.misc.stacktracer.trace_start('res.trace.html') pass
class BaseChsiSpider(BaseGkChsiFsxSpider): def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, seeds='detail_seeds', recover=False, sleep_max=5, ua='firefox', year='15', bkccs=None, kldms=None, job_tag=''): super(BaseChsiSpider, self).__init__(threadcnt, account, tag, proxy, sleep, captcha_limit, sleep_max, ua) if kldms is None: kldms = ['5', '1'] if bkccs is None: bkccs = ['1', '2'] self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + tag) self.full_tag = tag self.seeds = seeds if proxy: self.set_proxy(proxy) self.kldms = kldms self.bkccs = bkccs self.recover = recover self.parser = HTMLParser.HTMLParser() self.info_saver = LinkSaver(tag + '_detail_data') self.failed_saver = LinkSaver('detail.failed.seeds.' + tag + job_tag) self.year = year self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s' self.failed_list = [] self.last_request_time = time.time() def dispatch(self): # read all seeds seeds = [] with open(self.seeds, 'r') as f: for l in f: if l[0] == '{': data = eval(l.strip()) else: param = l.strip().split(',') if len(param) != 8: logging.warn('invalid seeds %s', l) continue data = {'wclx': 1, 'yxdm': param[6], 'kldm': param[2], 'bkcc': param[4], 'start': 0, 'years': param[5], 'zydm': param[7], 'zymc': param[8].encode('utf-8')} if self.year == data['years'] and not self.pagestore.find_any( self.pagestore.channel + '://' + self.get_jobid(data)): seeds.append(data) print 'load ', len(seeds), 'jobs' count = 10 while len(seeds) > 0 and count > 0: count += 1 logging.info('remain tries %d', count) for kldm in self.kldms: for bkcc in self.bkccs: seeds = self.request_list(seeds, kldm, bkcc) logging.info('seeds %d,failed %d,kldm=%s,bkcc=%s', len(seeds), len(self.failed_list), kldm, bkcc) seeds += self.failed_list self.failed_list = [] time.sleep(2) self.wait_q() self.add_job(None) print 'remain seeds', len(seeds) for seed in seeds: self.failed_saver.add(seed) self.failed_saver.flush() self.failed_list = seeds def handle_job(self, jobid): pass def request_list(self, seeds, kldm, bkcc): self.post_kldm_bkcc_for_session(kldm, bkcc) remains = [] for seed in seeds: if seed['kldm'] == kldm and bkcc == seed['bkcc']: self.add_main_job(seed) else: remains.append(seed) return remains def run_job(self, jobid): if not jobid.has_key('content'): if jobid not in self.failed_list: self.failed_list.append(jobid) return detail_content = jobid['content'] jtitle = '%s/%s/%s/%s/%s/%s' % ( jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'], jobid['wclx'], jobid['start']) self.pagestore.save(int(time.time()), '%s/%s/%s' % (jtitle, jobid['zydm'], int(jobid['start']) / 10), jobid['url'], detail_content.text) def add_job(self, jobid, mainjob=False): if jobid is None: super(BaseChsiSpider, self).add_job(jobid) return logging.info('fetching special %s,%s', jobid['zymc'], jobid['zydm']) detail_url = self.detail_url_format % (jobid['years'], jobid['yxdm'], jobid['zydm'], jobid['start']) content = self.fetch_content(jobid, detail_url) if content is None: # exception is handle return jobid['content'] = content jobid['url'] = detail_url super(BaseChsiSpider, self).add_job(jobid, True) if 0 == jobid['start']: m = re.search(ur'共 (\d+) 页', content.text) if not m: logging.warn('failed to find page count %s,%s,%s', jobid['kldm'], jobid['bkcc'], detail_url) return page_cnt = int(m.group(1)) if page_cnt <= 1: return for p in range(1, page_cnt): job = copy.deepcopy(jobid) job['start'] = p * 10 self.add_main_job(job) def 
get_jobid(self, jobid): return '%s/%s/%s/%s/%s/%s/%s/%s' % ( jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'], jobid['wclx'], jobid['start'], jobid['zydm'], int(jobid['start']) / 10) def fetch_content(self, jobid, detail_url): detail_content = self.request_url(detail_url, allow_redirects=20) if detail_content is None: self.failed_list.append(jobid) return try: if not self._check_result(detail_content.text, jobid, detail_url): self.failed_list.append(jobid) else: return detail_content except Exception as e: logging.info(e.message) self.failed_list.append(jobid) def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += 'seeds: %s\n' % self.seeds msg += "saved: %d\n" % self.pagestore.saved_count msg += 'captcha times: %s' % self._captcha_times msg += 'remain seeds: %d\n' % len(self.failed_list) spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg) elif evt == 'STARTED': # spider.misc.stacktracer.trace_start('res.trace.html') pass
class PatentFullTextSpider(ZhuanliBaseSpider):
    """Patent full-text spider."""

    def __init__(self, thcnt, recover=False, seeds='seed.dat'):
        ZhuanliBaseSpider.__init__(self, thcnt, recover)
        self.seeds = seeds
        self.pagestore = PatentFullTextStore()
        self.failed_saver = LinkSaver('failed.fulltext.txt')

    def dispatch(self):
        seeds = []
        with open(self.seeds, 'r') as f:
            for s in f:
                v = s.rstrip().split('-')
                if len(v) < 3:
                    print 'invalid seed:', s
                    continue  # skip malformed lines; v[2] below would otherwise raise IndexError
                if not self.recover or not self.pagestore.find_any(
                        self.pagestore.channel + '://%s-%s' % (v[0], v[2])):
                    seeds.append({'type': v[1], 'pnm': v[0], 'apply': v[2]})
        # seeds = spider.util.unique_list(seeds)
        print 'load %s seeds' % len(seeds)
        for seed in seeds:
            self.add_main_job(seed)
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def extract_seed_id(pnm, apply_code):
        return '%s-%s' % (pnm, apply_code)

    def run_job(self, jobid):
        url = self.form_download_url(jobid['pnm'], jobid['type'])
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            return
        if u'<input type="text" name="vct" />' in con.text:
            # a captcha must be entered before the file can be downloaded
            m = re.search(r'\?path=([^&\s]*)', con.headers)
            if m:
                path = m.group(1)
            else:
                l_p = re.search('Location:http://egaz.sipo.gov.cn/FileWeb/.*', con.headers)
                if l_p:
                    location = l_p.group()
                else:
                    l_p = re.search('Location:.*', con.headers)
                    location = 'None' if not l_p else l_p.group()
                print 'wrong redirect page:', url, 'location:', location
                if not self.re_add_job(jobid):
                    self.failed_saver.add(
                        '1,%s-%s-%s' % (jobid['pnm'], jobid['type'], jobid['apply']))
                return
            img = self.request_url('http://egaz.sipo.gov.cn/FileWeb/vci.jpg')
            fn = jobid['pnm'] + '.jpg'
            save_file(img.content, fn)
            vci = Captcha.resolve(fn, jobid['pnm'])
            con = self.request_url(
                'http://egaz.sipo.gov.cn/FileWeb/pfs?path=%s&vct=%s' % (path, vci))
            remove_file(fn)
            if self.check_exception(con, jobid):
                return
            if u'您要下载的文件不存在' in con.text:
                self.failed_saver.add(
                    '2,%s-%s-%s' % (jobid['pnm'], jobid['type'], jobid['apply']))
                return
            if u'<input type="text" name="vct" />' in con.text:
                if not self.re_add_job(jobid):
                    self.failed_saver.add(
                        '3,%s-%s-%s' % (jobid['pnm'], jobid['type'], jobid['apply']))
                return
        self.pagestore.save(int(time.time()),
                            self.extract_seed_id(jobid['pnm'], jobid['apply']),
                            url, con.text)
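PatentFullTextSpider.dispatch reads the seed file written by FullTextSeedGen in the first example: one 'pnm-type-apply' record per line, built with '%s-%s-%s' and split back on '-'. A tiny round-trip check with made-up field values (the concrete values are hypothetical; note the scheme breaks if any field itself contains a '-'):

pnm, pt, apc = 'CN104805000A', 'fmgb', '2015100000000'
line = '%s-%s-%s' % (pnm, pt, apc)      # what FullTextSeedGen.parse_item emits
v = line.rstrip().split('-')            # what PatentFullTextSpider.dispatch reads back
assert v == [pnm, pt, apc]
seed = {'type': v[1], 'pnm': v[0], 'apply': v[2]}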
class ShenzhenCourtSpider(ETOSSessionCourtSpider): "深圳法院诉讼服务平台爬虫" def __init__(self, thread_count=1, list_only=False, save_link=False, from_link=False, recover=False, seeds='seeds'): super(ShenzhenCourtSpider, self).__init__(thread_count) self._name = 'ShenzhenCourt' self.pagestore = ShenzhenCourtStore() self.job_spliter = ShenzhenSpliter() self._captcha_times = range(0, thread_count) self.test_mode = False self.pagesize = 50 self.list_only = list_only self.save_link = save_link self.link_saver = None self.seeds = seeds if self.save_link: self.link_saver = LinkSaver('saved.links', 'a+b') self.from_link = from_link self.recover = recover def dispatch(self): if self.from_link: links = [] with open(self.seeds, 'r') as f: for l in f: if len(l) > 0: if l[:4] == 'http': links.append(l.strip()) else: links.append(l.strip().split(',')[-1]) if self.recover: tmp = links links = [] for l in tmp: if not self.pagestore.find_any(self.pagestore.channel + '://' + self.extract_paper_id(l)): links.append(l) for l in links: self.add_job({'type': 'paper', 'url': l}) print 'add %d paper links' % len(links) logging.info('add %d paper links', len(links)) else: self.add_main_job({ 'type': 'main', 'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440300&page=1&pageLimit=%d&caseNo=' % self.pagesize }) self.add_main_job({ 'type': 'main', 'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440301&page=1&pageLimit=%d&caseNo=' % self.pagesize }) self.add_main_job({ 'type': 'main', 'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440302&page=1&pageLimit=%d&caseNo=' % self.pagesize }) self.add_main_job({ 'type': 'main', 'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440303&page=1&pageLimit=%d&caseNo=' % self.pagesize }) self.add_main_job({ 'type': 'main', 'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440304&page=1&pageLimit=%d&caseNo=' % self.pagesize }) self.add_main_job({ 'type': 'main', 'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440305&page=1&pageLimit=%d&caseNo=' % self.pagesize }) self.add_main_job({ 'type': 'main', 'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440306&page=1&pageLimit=%d&caseNo=' % self.pagesize }) self.add_main_job({ 'type': 'main', 'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440307&page=1&pageLimit=%d&caseNo=' % self.pagesize }) time.sleep(3) self.wait_q() self.add_job(None, True) def thread_init(self, tid): self._captcha_times[tid] = 0 def check_captcha(self, con, url): m = re.search(u'.*需要验证,请输入验证码.*', con.text) if m: print m.group() tid = getattr(self._tls, 'tid', -1) if tid < 0: sys.stderr.write('invalid thread id in run_job') raise RuntimeError('Invalid tid') c = 0 while c < 10: img = self.get_captcha(tid) self._captcha_times[tid] += 1 c += 1 if not img: continue code = self.resolve_captcha(img) if not code: continue success = self.post_captcha(code, None) logging.info('captcha times:%d', self._captcha_times[tid]) if self.test_mode: print "captcha times: ", self._captcha_times[tid] if success == 'true': if re.split(r'\/anjiangongkai\/JudgeDocument', url): u = url + '?code=' + code else: u = url + '&code=' + code con = self.request_url(u) return con else: if self.test_mode: print 'do not need resolve captcha', url logging.warn('do not need resolve captcha %s', url) return con def 
request_url(self, url, **kwargs): con = super(ShenzhenCourtSpider, self).request_url(url, **kwargs) if con and con.text: return self.check_captcha(con, url) return con def run_job(self, jobid): jt = jobid['type'] url = jobid['url'] if 'paper' == jt: if self.list_only: return con = self.request_url(url) '''check exception''' if self.check_exceptions(con, jobid): return # con = self.check_captcha(con.text, url) u = re.search(r'src=\'(\/temp\/ws\-[\d\w]+\-[\d\w]+.html)\'', con.text) if u is None: logging.warn('cannot find source page url in %s', url) return con = self.request_url('http://ssfw.szcourt.gov.cn/' + u.group(1)) if con is None or con.text is None: logging.warn('source page is None %s', u.group(1)) return context = self.extract_content(con.text) if context is not None: jid = self.extract_paper_id(url) if jid is not None: self.pagestore.save(int(time.time()), jid, url, context) else: logging.warn('failed to find paper id,page nodt save,%s', url) print 'failed to find paper id, paper not save', url print url, '=>', len(context) logging.info('%s==>%d', url, len(context)) else: print 'fail to find content for:', url logging.info('cannot find content %s', url) return con = self.request_url(url) if con is None: logging.error('failed to fetch list page %s', url) return if 'main' == jt: if self.need_split(con.text, url): self.split_url(url) logging.info('job is split %s', url) return self.add_list_job(url, con.text) urls = self.extract_paper_url(con.text) urls = spider.util.unique_list(urls) logging.info('%s add %d papers', url, len(urls)) print 'add ', len(urls), 'paper urls', url if not self.list_only: for u in urls: self.add_job({'type': 'paper', 'url': u}) if self.save_link: for u in urls: self.link_saver.add(u) def need_split(self, context, url): return False def extract_content(self, context): return context def extract_paper_id(self, url): m = re.findall( r'anjiangongkai\/JudgeDocument\/(\d+)\/information\/([\d\w]+)\/([\d\w]+)\/', url) if len(m) > 0: return '-'.join(m[0]) return None def extract_paper_url(self, content): m = re.findall( r'<a href="(\/frontend\/anjiangongkai\/JudgeDocument\/\d+\/information\/[^"]*)">', content) if m is not None: urls = [] for u in m: urls.append('http://ssfw.szcourt.gov.cn' + u) return urls return None def add_list_job(self, url, con): divs = re.findall(ur'\(\d+条记录,每页\d+条记录,共(\d+)页\)', con) if divs: pagecnt = int(divs[0]) print 'add ', pagecnt, 'list url,', url logging.info('add %d list url from %s', pagecnt, url) for page in range(2, pagecnt + 1): self.add_job({ 'type': 'list', 'url': re.sub(r'page=\d+?', 'page=%d' % page, url) }) else: print url, 'has no more page' logging.info('no list page for %s', url) def post_captcha(self, code, session): # url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode?code=%s' % (code) if session is None: url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode?code=%s' % code else: url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode;jsessionid=%s?code=%s' % ( session, code) con = self.request_url(url, data={}) if con: if self.test_mode: print "post captcha cookies:", con.cookies # print "post captcha headers:", con.headers print 'captcha resolve result', con.text res = json.loads(con.text) return res['success'] else: print 'None response' return None def resolve_captcha(self, img): server = LianzhongCaptcha() points = server.point_check() if points <= 0: print 'there are no more points' return print 'There are %d points remaining' % points captcha = server.resolve(img) if self.test_mode: print 
'resolved captcha', captcha return captcha def get_captcha(self, tid): con = self.request_url('http://ssfw.szcourt.gov.cn/yzm.jsp') if con is None: print "get none captcha response" return context = copy.deepcopy(con.content) print '====get_captcha====' # print 'headers:', con.headers print 'cookies:', con.cookies return context def check_exceptions(self, con, jobid): if con is None or con.text is None: logging.error('failed to fetch paper page %s', jobid['url']) print 'failed to fetch page %s' % jobid['url'] self.re_add_job(jobid) return True m404 = re.search('\/temp\/judgedocument404\.jsp', con.text) if m404: logging.info('page %s is missing from the server', jobid['url']) print 'page %s is missing from the server' % jobid['url'] return True return False @staticmethod def get_session_id(con): if isinstance(con, str) or isinstance(con, unicode): m = re.search(r'JSESSIONID=([\w\d]+)', con) if m is None: m = re.search(r'jsessionid=([\w\d]+)', con) else: m = re.search(r'jsessionid=([\w\d]+)', con.text) if m: return m.group(1) else: return None def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += "Court Spider:%s\n" % self._name msg += "saved: %d\n" % self.pagestore.saved_count spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg) elif evt == 'STARTED': # spider.misc.stacktracer.trace_start('res.trace.html') pass
class FsLinkSpider(CourtSpider): def __init__(self, threadcnt): CourtSpider.__init__(self, threadcnt) self._name = 'FoshanLinkSpider' self._test_mode = True self.page_size = 20 self.link_saver = LinkSaver("links") def run_job(self, jobid): if not isinstance( jobid, dict) or not jobid.has_key('type') or not jobid.has_key('url'): raise ValueError('invalid jobid') jt = jobid['type'] url = jobid['url'] if 'main' != jt or not jobid.has_key('page'): raise ValueError('Invalid main job id') page = jobid['page'] urls = self.post_page(page, url) if len(urls) == 0: print 'no page url found at', page return elif self._test_mode: print 'add job', len(urls) urls = spider.util.unique_list(urls) for u in urls: self.link_saver.add(u) def dispatch(self): count = self.fetch_paper_count() for page in range(1, count / self.page_size + 1): self.add_main_job({ 'type': 'main', 'url': 'http://www.fszjfy.gov.cn/CourtProject/index/index-cpws!search.action#', 'page': page }) time.sleep(3) self.wait_q() self.add_job(None, True) def fetch_paper_count(self): con = self.request_url( 'http://www.fszjfy.gov.cn/CourtProject/index/index-cpws!search.action#' ) count = 0 if con: size = re.search(r'<input value="(\d+)" id="pageSize"', con.text) pages = re.search(r'<input value="(\d+)" id="pageTotal"', con.text) if pages: pages = int(pages.group(1)) else: pages = 1 if size: count = int(size.group(1)) * pages return count def need_split(self, context, url): return False def extract_content(self, context): return context def extract_paper_id(self, url): m = re.findall(r'id=(\d+)', url) if m is not None: return m[0] return None def extract_paper_url(self, content): li = re.search(r'<div id="gl3_content_main">.*?<\/div>', content, re.S) m = [] if li: rs = li.group().strip() li_content = re.sub( r'<.*?>|\r|\n|\ |\t', '', re.sub( '</li>', '|', re.sub( r'</dt>', ',', re.sub(r'<a href="', '', re.sub(r'" target[^>]*>', '', rs))))) if li_content: if isinstance(li_content, unicode): li_content = li_content.encode('utf-8') m = li_content.strip().split('|') urls = [] for u in m: urls.append(u.strip()) return urls def add_list_job(self, url, con): pass def post_page(self, page, url): data = { 'pageNo': page, 'pageSize': self.page_size, 'search': '', 'ah': '', 'startTime': '', 'endTime': '', 'ajyear': '', 'ahtxt': '', 'ajfymc': '', 'ajlb': '', 'fymc': '0' } con = self.request_url(url, data=data) time.sleep(1) if con is None: return None return self.extract_paper_url(con.text) def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': msg += "Court Spider:%s\n" % self._name spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg) elif evt == 'STARTED': # spider.misc.stacktracer.trace_start('res.trace.html') pass
class PatentAbstractSpider(ZhuanliBaseSpider, Main): """专利摘要爬虫""" def __init__(self, thcnt, mode='id', recover=True, seeds='seed.dat'): ZhuanliBaseSpider.__init__(self, thcnt, recover, timeout=90) Main.__init__(self) self.short_tag = 't:m:s:r:o:h:v:' self.tags = ['recover=', 'threads=', 'mode=', 'seeds=', 'output='] self.seeds = seeds self.page_size = 20 # 3或者10,20 self.pagestore = PatentAbstractStore('abstract') self.failed_saver = FailedJobSaver('failed_job.txt') self.seed_saver = LinkSaver('seed.year.txt', 'a+') self.job_log = LinkSaver('abstract.%s.log' % mode, 'a+') self.mode = mode self.__version = '1.0.0' self.utils = threading.local() self.sp_errors = OrderedDict() self.pre_save_count = 0 self.properties = PropertiesManager() self.can_load_seed = True def output(self, args): print '_patent_spider.py: %s' % args def version(self): print '_patent_spider.py %s' % self.__version def usage(self): print '_patent_spider.py usage:' print '-h, --help: print help message.' print '-v, --version: print script version' print '-o, --output: input an output verb' print '-t, --threads: thread count ' print '-m, --mode: mode,if not id then will be abstract mode' print '-r, --recover: recover,1 or True for recover mode' print '-s, --seeds: seeds file' def _set_proxy(self, kwargs, selproxy): super(PatentAbstractSpider, self)._set_proxy(kwargs, selproxy) setattr(self.utils, 'proxy', selproxy) def handle(self, opts): for o, a in opts: if o in ('-h', '--help'): self.usage() sys.exit(1) elif o in ('-v', '--version'): self.version() sys.exit(0) elif o in ('-o', '--output'): self.output(a) sys.exit(0) elif o in ('-t', '--threads'): self.thread_count = int(a) elif o in ('-m', '--mode'): self.mode = a elif o in ('-s', '--seeds'): self.seeds = a elif o in ('-r', '--recover'): self.recover = True if (a == '1' or a == 'True') else False else: print 'unhandled option' sys.exit(3) if self.mode != 'id': self.mode = 'abs' if self.mode != 'id' and not os.path.exists(self.seeds): print 'seed file %s not exists' % self.seeds sys.exit(1) count = 3 while count > 0: self.sp_proxies = OrderedDict() if self.mode == 'id': # self.set_proxy('183.111.169.203:8080', len(job.sp_proxies)) self.set_proxy('192.168.1.39:3428:ipin:helloipin', len(job.sp_proxies)) else: proxies = KuaidailiProxyManager.load_proxy(100) print 'load %d proxies from kuaidaili' % proxies['data']['count'] if proxies['data']['count'] > 0: self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0) # proxies = KuaidailiProxyManager.load_proxy(50) # print 'load %d proxies from kuaidaili' % proxies['data']['count'] # if proxies['data']['count'] > 0: # self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0) self.run() count -= 1 def load_proxy(self, fn, index=-1, auto_change=True): super(PatentAbstractSpider, self).load_proxy(fn, index, auto_change) with self.locker: self.sp_errors.clear() for proxy in self.sp_proxies.iterkeys(): self.sp_proxies[proxy] = 0 def set_proxy(self, prs, index=-1, auto_change=True): with self.locker: if isinstance(prs, list): for p in prs: self.sp_errors[p] = 0 elif isinstance(prs, str) or isinstance(prs, unicode): self.sp_errors[prs] = 0 super(PatentAbstractSpider, self).set_proxy(prs, index, auto_change) @staticmethod def gen_list_seed(): now = datetime.now() this_year = int(now.strftime('%Y')) this_month = int(now.strftime('%m')) types = ['fmgb', 'fmsq', 'xxsq', 'wgsq'] seeds = [] for year in range(1985, this_year): for month in range(1, 13): for t in types: 
seeds.append( {'type': t, 'index': 1, 'time': '%s%s' % (year, (month if month > 9 else '0%s' % month))}) for month in range(1, this_month): for t in types: seeds.append( {'type': t, 'index': 1, 'time': '%s%s' % (this_year, (month if month > 9 else '0%s' % month))}) return seeds def load_abstract_seeds(self, seed_file, limit=1000000): seeds = [] last_position = self.properties.get('position', 0) f = open(seed_file, 'r') count = 0 f.seek(last_position) while count < limit: l = f.readline() if not l: # 文件结束,不能再读 self.can_load_seed = False break res = l.strip().split(',') if len(res) < 3: print 'invalid seeds:', l else: seeds.append({'type': res[1], 'id': res[0], 'code': res[2]}) count += 1 last_position = f.tell() self.properties.set('position', last_position) self.properties.save() f.close() return seeds def get_id_seeds(self): raw_seeds = self.gen_list_seed() rds = self.job_log.readlines() '''get done jobs''' done_jobs = {} for job in rds: if '[' == job[0]: continue js = job.strip().split('-') done_jobs['%s-%s' % (js[0], js[1])] = {} done_jobs['%s-%s' % (js[0], js[1])]['pages'] = int(js[2]) done_jobs['%s-%s' % (js[0], js[1])]['current'] = 1 '''load done seeds''' dss = self.seed_saver.readlines() for ds in dss: sd = ds.strip().split(',') if len(sd) < 4: print 'invalid seed', ds continue js = sd[3].split('-') sid = '%s-%s' % (js[0], js[1]) page = int(js[2]) if done_jobs.has_key(sid) and done_jobs[sid]['current'] < page: done_jobs[sid]['current'] = page seeds = [] for seed in raw_seeds: sid = seed['time'] + '-' + seed['type'] if done_jobs.has_key(sid): if done_jobs[sid]['pages'] > done_jobs[sid]['current'] > 1: for page in range(done_jobs[sid]['current'] + 1, done_jobs[sid]['pages'] + 1): s = copy.deepcopy(seed) s['index'] = page seeds.append(s) else: seeds.append(seed) logging.info('load %s list seeds', len(seeds)) return seeds def get_abstract_seeds(self, limit=100000): rawseeds = self.load_abstract_seeds(self.seeds, limit) seeds = [] for s in rawseeds: if not self.recover or not self.pagestore.find_any(self.pagestore.channel + '://' + s['id']): seeds.append(s) if len(seeds) >= limit: break logging.info('load %d abstract seeds', len(seeds)) return seeds def report(self): super(PatentAbstractSpider, self).report() self.job_log.flush() self.seed_saver.flush() count = self.pagestore.saved_count - self.pre_save_count self.pre_save_count = self.pagestore.saved_count print 'save %d doc in this minute' % count def dispatch(self): self.failed_saver.tag() if self.mode == 'id': seeds = self.get_id_seeds() for seed in seeds: self.add_main_job(seed) else: count = 10 ever_loaded = False while count > 0 and self.can_load_seed: seeds = self.get_abstract_seeds() if len(seeds) > 0: ever_loaded = True for seed in seeds: self.add_main_job(seed) time.sleep(2) self.wait_q() elif ever_loaded: count -= 1 time.sleep(100) time.sleep(2) self.wait_q() self.add_job(None) @staticmethod def extract_seed_id(pub, app, count): return '%s-%s/%s-%s/%s' % ( pub[0], pub[1], app[0] if (app[0] != '-') else '', app[1] if (app[1] != '-') else '', count) @staticmethod def parse_seed(seed): v = seed.split(',') if len(v) != 7: print 'invalid seed', seed return [] return [[v[1][1:], v[2][:-1]], [v[3][1:], v[4][:-1]], int(v[6])] @staticmethod def get_query_word(jobid): word = '公开(公告)日=%s' % jobid['time'] return word def _on_shutdown(self, jobid): self.failed_saver.save('2,%s' % str(jobid)) return def handle_id_job(self, jobid): strword = self.get_query_word(jobid) url = self.form_query_url(strword, page=jobid['index'], 
size=self.page_size, selected=jobid['type'], showtype=0) con = self.request_url(url, timeout=self.timeout) if self.check_exception(con, jobid): print 'exception encounter', jobid return if re.search(u'<title>错误页面</title>', con.text): print '错误页面', jobid if not self.re_add_job(jobid): self.failed_saver.save(str(jobid)) return patents = re.findall(r'<a href="javascript:zl_xm\(\'([\d\w]+)\',\'(\w+)\',\'([\w\d]+)\'\);">[\d\w]+</a>', con.text) print '[%d]%s-%s-%s' % (len(patents), jobid['time'], jobid['type'], jobid['index']) if 0 == len(patents): self.job_log.add('[%d]%s-%s-%s,%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'], con.code)) self.re_add_job(jobid) return for p in patents: if len(p) != 3: logging.warn('invalid pattern matched:%s,%s', str(p), str(jobid)) self.failed_saver.save('1,%s' % str(jobid)) else: self.seed_saver.add( '%s,%s,%s,%s-%s-%d' % (p[0], p[1], p[2], jobid['time'], jobid['type'], jobid['index'])) if 1 == jobid['index']: m = re.search(r'javascript:if\(event.keyCode == 13\) zl_tz\((\d+)\)', con.text) if m: pagecnt = int(m.group(1)) print '[%d][%d]%s-%s-%d' % (len(patents), pagecnt, jobid['time'], jobid['type'], jobid['index']) self.job_log.add('%s-%s-%s' % (jobid['time'], jobid['type'], pagecnt)) for page in range(2, pagecnt + 1): job = copy.deepcopy(jobid) job['_failcnt_'] = 0 job['index'] = page self.add_job(job) else: print 'failed to find count[%d]%s-%s-[%d]' % (len(patents), jobid['time'], jobid['type'], 0) logging.warn('failed to find page count:%s-%s-%s', jobid['time'], jobid['type'], jobid['index']) def handle_abstract_seed(self, jobid): qword = quote('申请号=\'%s\' and %s=1' % (jobid['id'], jobid['code'])) url = 'http://epub.sipo.gov.cn/patentdetail.action?strSources=%s&strWhere=%s&strLicenseCode=&pageSize=6&pageNow=1' % ( jobid['type'], qword) con = self.request_url(url, timeout=self.timeout) if self.check_exception(con, jobid): print 'exception encounter', jobid return if re.search(u'<title>错误页面</title>', con.text): print '错误页面', jobid if not self.re_add_job(jobid): self.failed_saver.save(str(jobid)) return print 'success:%s-%s-%s' % (jobid['id'], jobid['type'], jobid['code']) self.pagestore.save(int(time.time()), jobid['id'], url, con.text) def run_job(self, jobid): if self.check_shutdown(jobid): return try: if self.mode == 'id': self.handle_id_job(jobid) else: self.handle_abstract_seed(jobid) except RuntimeError as e: if 'no proxy' in e.message: self.re_add_job(jobid) self.reload_proxy() return else: raise def reload_proxy(self): prs = {} count = 3 while count > 0: if 'id' == self.mode: prs = KuaidailiProxyManager.load_proxy(20) else: prs = KuaidailiProxyManager.load_proxy(100) if prs['data']['count'] > 0: break count -= 1 if count <= 0 or not prs.has_key('data') or not prs['data'].has_key('count') or \ prs['data'][ 'count'] <= 0: self._shutdown() logging.error('cannot load any proxy') spider.util.sendmail(['*****@*****.**'], 'Proxy Error', 'Cannot load any proxy:%s,%s' % (self._name, self.mode)) return print 'load %d proxies from kuaidaili' % prs['data']['count'] self.set_proxy(prs['data']['proxy_list'], 15 if (prs['data']['count'] > 15) else 0) def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': self.job_log.flush() msg += "saved: %d\n" % self.pagestore.saved_count spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg) def proxy_error(self): proxy = getattr(self.utils, 'proxy') if proxy is not None: with self.locker: try: if self.sp_errors[proxy] < 5: self.sp_errors[proxy] += 1 else: self.sp_proxies.pop(proxy) if 
len(self.sp_proxies) == 0: self.reload_proxy() except KeyError: pass def on_proxy_error(self, con, jobid): self.proxy_error() self.re_add_job(jobid) return True def on_other_400_exception(self, con, jobid): if con.code == 403: self.proxy_error() self.re_add_job(jobid) return True def on_other_500_exception(self, con, jobid): if 504 == con.code and re.search('proxy', con.text, re.I): self.proxy_error() self.re_add_job(jobid) return True else: return super(PatentAbstractSpider, self).on_other_500_exception(con, jobid)
class ShenzhenCourtListSpider(ETOSSessionCourtSpider):
    "Spider for the Shenzhen court litigation service platform (ssfw.szcourt.gov.cn)"

    def __init__(self, thread_count=1, full_mode=False, seeds='seeds'):
        super(ShenzhenCourtListSpider, self).__init__(thread_count, 'list.spider.log')
        self._name = 'ShenzhenListSpider'
        self.job_spliter = ShenzhenSpliter()
        self._captcha_times = range(0, thread_count)
        self.test_mode = False
        self.pagesize = 50
        self.full_mode = full_mode
        self.link_saver = LinkSaver(seeds, 'a')

    def dispatch(self):
        # one seed list url per court code (fydm 440300-440307)
        url_format = ('http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25'
                      '?ajlb=2&fydm=%d&page=1&pageLimit=%d&caseNo=')
        for fydm in range(440300, 440308):
            self.add_main_job({'type': 'main', 'url': url_format % (fydm, self.pagesize)})
        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def thread_init(self, tid):
        self._captcha_times[tid] = 0

    def check_captcha(self, con, url):
        m = re.search(u'.*需要验证,请输入验证码.*', con.text)
        if m:
            print m.group()
            tid = getattr(self._tls, 'tid', -1)
            if tid < 0:
                sys.stderr.write('invalid thread id in run_job')
                raise RuntimeError('Invalid tid')
            c = 0
            while c < 10:
                img = self.get_captcha(tid)
                self._captcha_times[tid] += 1
                c += 1
                if not img:
                    continue
                code = self.resolve_captcha(img)
                if not code:
                    continue
                success = self.post_captcha(code, None)
                logging.info('captcha times:%d', self._captcha_times[tid])
                if self.test_mode:
                    print "captcha times: ", self._captcha_times[tid]
                if success == 'true':
                    # the original re.split() test here was always truthy; append the
                    # captcha code with '?' only when the url has no query string yet
                    if '?' not in url:
                        u = url + '?code=' + code
                    else:
                        u = url + '&code=' + code
                    con = self.request_url(u)
                    return con
        else:
            if self.test_mode:
                print 'do not need resolve captcha', url
            logging.warn('do not need resolve captcha %s', url)
        return con

    def request_url(self, url, **kwargs):
        con = super(ShenzhenCourtListSpider, self).request_url(url, **kwargs)
        if con and con.text:
            return self.check_captcha(con, url)
        return con

    def check_exception(self, con, jobid):
        if con is None:
            print '回应是None,你说怎么办吧', jobid['url']
            self.re_add_job(jobid)
            return True
        if con.text is None:
            print 'response text是None', jobid['url']
            self.re_add_job(jobid)
            return True
        m = re.search(
            r'<!DOCTYPE html><html><head><meta charset=utf-8><\/head><\/head><body><script>window.location=\'([^\']*)\'<\/script><\/body><\/html>',
            con.text)
        if m:
            url = 'http://ssfw.szcourt.gov.cn' + m.group(1)
            self.add_job({'type': jobid['type'], 'url': url})
            print 'js 页面跳转,目的地是', url
            return True

    def run_job(self, jobid):
        jt = jobid['type']
        url = jobid['url']
        con = self.request_url(url)
        if self.check_exception(con, jobid):
            return
        if self.need_split(con.text, url):
            self.split_url(url)
            logging.info('job is split %s', url)
            return
        if jt == 'main':
            self.add_list_job(url, con.text)
        urls = self.extract_paper_url(con.text)
        urls = spider.util.unique_list(urls)
        logging.info('%s add %d papers', url, len(urls))
        print 'add', len(urls), 'paper urls', url
        if len(urls) == 0:
            pass
        if self.full_mode:
            m = re.search(
                r'http:\/\/ssfw.szcourt.gov.cn\/frontend\/anjiangongkai\/JudgeDocument\/(\d+)\?ajlb=(\d+)&fydm=(\d+)&page=(\d+)&pageLimit=(\d+)&caseNo=',
                url)
            if m:
                tp = m.group(1)
                ajlb = m.group(2)
                fydm = m.group(3)
                page = m.group(4)
                size = m.group(5)
                for u in urls:
                    self.link_saver.add('%s,%s,%s,%s,%s,%s' % (tp, ajlb, fydm, page, size, u))
        else:
            for u in urls:
                self.link_saver.add(u)

    def need_split(self, context, url):
        return False

    def extract_content(self, context):
        return context

    def extract_paper_id(self, url):
        m = re.findall(r'anjiangongkai\/JudgeDocument\/(\d+)\/information\/([\d\w]+)\/([\d\w]+)\/', url)
        if len(m) > 0:
            return '-'.join(m[0])
        return None

    def extract_paper_url(self, content):
        m = re.findall(r'<a href="(\/frontend\/anjiangongkai\/JudgeDocument\/\d+\/information\/[^"]*)">', content)
        if m is not None:
            urls = []
            for u in m:
                urls.append('http://ssfw.szcourt.gov.cn' + u)
            return urls
        return None

    def add_list_job(self, url, con):
        divs = re.findall(ur'\(\d+条记录,每页\d+条记录,共(\d+)页\)', con)
        if divs:
            pagecnt = int(divs[0])
            print 'add ', pagecnt, 'list url,', url
            logging.info('add %d list url from %s', pagecnt, url)
            for page in range(2, pagecnt + 1):
                self.add_job({'type': 'list', 'url': re.sub(r'page=\d+?', 'page=%d' % page, url)})
        else:
            print url, 'has no more page'
            logging.info('no list page for %s', url)

    def post_captcha(self, code, session):
        # url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode?code=%s' % (code)
        if session is None:
            url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode?code=%s' % code
        else:
            url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode;jsessionid=%s?code=%s' % (session, code)
        con = self.request_url(url, data={})
        if con:
            if self.test_mode:
                print "post captcha cookies:", con.cookies
                # print "post captcha headers:", con.headers
            print 'captcha resolve result', con.text
            res = json.loads(con.text)
            return res['success']
        else:
            print 'None response'
            return None

    def resolve_captcha(self, img):
        server = LianzhongCaptcha()
        points = server.point_check()
        if points <= 0:
            print 'there are no more points'
            return
        print 'There are %d points remaining' % points
        captcha = server.resolve(img)
        if self.test_mode:
            print 'resolved captcha', captcha
        return captcha

    def get_captcha(self, tid):
        con = self.request_url('http://ssfw.szcourt.gov.cn/yzm.jsp')
        if con is None:
            print "get none captcha response"
            return
        context = copy.deepcopy(con.content)
        print '====get_captcha===='
        # print 'headers:', con.headers
        print 'cookies:', con.cookies
        return context

    @staticmethod
    def get_session_id(con):
        if isinstance(con, str) or isinstance(con, unicode):
            m = re.search(r'JSESSIONID=([\w\d]+)', con)
            if m is None:
                m = re.search(r'jsessionid=([\w\d]+)', con)
        else:
            m = re.search(r'jsessionid=([\w\d]+)', con.text)
        if m:
            return m.group(1)
        else:
            return None

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "Court Spider:%s\n" % self._name
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
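# check_captcha has to re-request the original url with an extra "code" query
# parameter, choosing '?' or '&' depending on whether the url already carries a
# query string. A small standalone helper that does this with the standard
# library instead of string tests is sketched below; append_code is a
# hypothetical name, not something the spider framework provides.
try:
    from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse  # Python 3
except ImportError:
    from urlparse import urlparse, parse_qsl, urlunparse                 # Python 2
    from urllib import urlencode


def append_code(url, code):
    # parse the url, append the code parameter, and rebuild it
    parts = urlparse(url)
    query = parse_qsl(parts.query, keep_blank_values=True)
    query.append(('code', code))
    return urlunparse(parts._replace(query=urlencode(query)))


if __name__ == '__main__':
    print(append_code('http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&page=1', '8k3d'))
    print(append_code('http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25/information/a/b/', '8k3d'))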
class ShanghaiCourtSpider(ProxySwapSpider):
    "Spider for the Shanghai High People's Court judgment search system"

    def __init__(self, thread_count=5, seeds=None, start=1, name='ShanghaiCourtSpider',
                 list_only=False, paper_seeds=None, recover=False):
        ProxySwapSpider.__init__(self, thread_count, proxy_life=3600)
        if seeds is None:
            seeds = []
        self._name = name
        self.seeds = seeds
        self.pagestore = ShanghaiCourtStore()
        self.page_size = 20
        self.list_only = list_only
        self.search_url_format = 'http://www.hshfy.sh.cn:8081/flws/content.jsp?wz=&pa=%s&more=1&toPage=%d&totalPage=%d&perPaperLink=%d&perPaperNum=%d'
        if self.list_only:
            self.link_saver = LinkSaver('links', 'a')
        self.paper_seeds = paper_seeds
        self.lock = threading.Lock()
        self.pager_failed_count = 0
        self.recover = recover
        self.start = start

    def dispatch(self):
        # for seed in self.seeds:
        #     self.add_main_job(seed)
        # logging.info('add %d list links' % len(self.seeds))
        # if self.paper_seeds:
        #     links = []
        #     with open(self.paper_seeds, 'r') as f:
        #         for l in f:
        #             links.append(l.strip())
        #     if self.recover:
        #         tmp = links
        #         links = []
        #         for l in tmp:
        #             if not self.pagestore.find_any(self.pagestore.channel + '://' + self.extract_paper_id(l)):
        #                 links.append(l)
        #     logging.info('add %d paper links' % len(links))
        #     for l in links:
        #         self.add_main_job({'type': 'paper', 'url': l})
        seed_id = 'adHlwZT1BbGwmd3o9z'
        total = 1060385
        pagecnt = (total + self.page_size / 2) / self.page_size + 1
        for page in range(self.start, pagecnt):
            self.add_main_job({
                'type': 'list',
                'url': self.search_url_format % (seed_id, page, total, self.page_size, self.page_size)
            })
        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def check_exception(self, con, jobid):
        '''Check whether the response contains an exception: return True if one is found and the
        job cannot continue, False if no exception is found or it was handled and it is ok to continue.'''
        if con is None:
            print 'null response'
            self.re_add_job(jobid)
            return True
        if con.text is None:
            print 'None content type'
            print con.headers
            self.re_add_job(jobid)
            return True
        if con.code >= 400:
            print con.headers
            if 502 == con.code:
                print 'Proxy Error 502', jobid['url']
                logging.error('proxy error 502 %s', jobid['url'])
                self.change_proxy()
                self.re_add_job(jobid)
                return True
            if 404 == con.code:
                print '啊呵,404,服务器上居然找不到这个页面', jobid['url']
                logging.info('page not found on the server %s', jobid['url'])
                return True
            if 410 == con.code:
                print 'resource gone', jobid['url']
                return True
            if 500 > con.code >= 400:
                print 'request error', jobid['url']
                self.re_add_job(jobid)
                return True
            if 600 > con.code >= 500:
                print 'server error', con.code, jobid['url']
                cnt = jobid.get('_failcnt_', 0)
                if cnt < 47:
                    # the original wrote jobid['_failcnt'], which the framework never reads;
                    # '_failcnt_' is the key checked above
                    jobid['_failcnt_'] = 47
                self.re_add_job(jobid)
                return True
            print '600 以上的code,涨见识了!哈哈哈!', jobid['url']
            logging.info('failed with response code %d,%s', con.code, jobid['url'])
            self.re_add_job(jobid)
            return True
        if re.search(u'出错了', con.text):
            print '出错了,他们服务器太弱,慢点抓吧'
            logging.error('server error,%s', jobid['url'])
            self.re_add_job(jobid)
            return True
        if re.search(u'访问本页面,您的浏览器需要支持JavaScript', con.text):
            m = re.search(r"<script>(.*?)</script>", con.text)
            sc = "document = {set cookie(a){console.log(a);}}, window = {innerWidth: 1366, innerHeight: 768, screenX: 200, screenY: 100, screen: {width: 1366, height: 768}}\n"
            sc += m.group(1)
            rv = spider.util.runjs(sc)
            logging.info('nodejs result:%s', rv)
            print rv
        return False

    def run_job(self, jobid):
        jt = jobid['type']
        url = jobid['url']
        if 'main' == jt:
            res = self.post_for_count(url)
            if self.check_exception(res[1], jobid):
                return
            if res[0] <= 0:
                print 'get 0 result from', url
                logging.info('get no paper from %s' % url)
                return
            seed_id = re.search(r'pa=([\w\d\+]+)', url)
            if seed_id:
                seed_id = seed_id.group(1)
                count = int(res[0])
                logging.info('there are %d paper in %s' % (count, seed_id))
                page_count = int((count + self.page_size / 2) / self.page_size)
                for page in range(1, page_count + 1):
                    self.add_job({
                        'type': 'list',
                        'url': self.search_url_format % (seed_id, page, count, self.page_size, self.page_size)
                    })
            else:
                logging.warn('failed to parse seed id from %s', url)
        elif 'list' == jt:
            con = self.post_for_data(jobid['url'], {})
            if self.check_exception(con, jobid):
                return
            urls = self.extract_paper_url(con.text)
            if self.list_only:
                for u in urls:
                    self.link_saver.add(u)
            else:
                for u in urls:
                    self.add_job({'type': 'paper', 'url': u})
            logging.info('add %d from list job %s' % (len(urls), url))
            if len(urls) == 0:
                pass
            print('add %d from list job %s' % (len(urls), url))
        else:
            con = self.request_url(url, timeout=45)
            if self.check_exception(con, jobid):
                return
            content = self.extract_content(con.text)
            jid = self.extract_paper_id(url)
            logging.info('saving %s,%s', jid, url)
            if content and jid:
                self.pagestore.save(int(time.time()), jid, url, content)
            else:
                with self.lock:
                    self.pager_failed_count += 1
                    logging.info('failed count %d,None content or jid for %s,%s' % (self.pager_failed_count, jid, url))

    def extract_paper_id(self, url):
        m = re.search(r'pa=([\w\d\/]+)', url)
        if m:
            return m.group(1)
        return None

    def post_for_data(self, url, data=None):
        if data is None:
            data = {}
        con = self.request_url(url, data=data, timeout=60)
        if con:
            con.text = con.content.decode('gbk')
            con.encoding = 'gbk'
        return con

    def post_for_count(self, url):
        # return [count, response]; the original returned the raw regex string (and, in
        # the first branch, con.text instead of con), which broke the int comparison and
        # the check_exception call in run_job
        con = self.post_for_data(url)
        if con is None:
            return [0, None]
        count = re.search(r'var totalPage = "(\d+)";', con.text)
        if count:
            return [int(count.group(1)), con]
        count = re.search(u'共([\d\s]+)条', con.text)
        if count:
            return [int(re.sub(r'\s', '', count.group(1))), con]
        return [0, con]

    def extract_paper_url(self, content):
        m = re.findall(r'onclick="showone\(\'([^\']+)\'', content)
        urls = []
        for u in m:
            urls.append('http://www.hshfy.sh.cn:8081/flws/text.jsp?pa=' + u)
        return urls

    def extract_content(self, content):
        return content

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "Court Spider:%s\n" % self._name
            msg += "Mode-list_only:%s\n" % self.list_only
            msg += "paper id failed: %d\n" % self.pager_failed_count
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
            logging.info('Job done,failed count %d,saved %d' % (self.pager_failed_count, self.pagestore.saved_count))
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
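# Both dispatch() and run_job() above turn a record count into a number of list
# pages with integer arithmetic like (count + page_size / 2) / page_size, which
# rounds to the nearest page and can drop a final partial page. A ceiling
# division avoids that; page_count_ceil below is a hypothetical helper, not part
# of the spider framework.
def page_count_ceil(total, page_size):
    # number of pages needed to show `total` records, `page_size` per page
    return (total + page_size - 1) // page_size


if __name__ == '__main__':
    print(page_count_ceil(1060385, 20))  # 53020 pages
    print(page_count_ceil(21, 20))       # 2 (nearest-rounding would give 1)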
class PatentAbstractExtractor(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'abs_list', 'abs_list', 'zhuanli')
        self.store = PatentStore('abstract')
        self.failed_link = LinkSaver('abstract.parser.failed.txt')
        self.url_format = 'http://epub.sipo.gov.cn/dxbdl.action?strSources=fmmost&strWhere=%s&recordCursor=0&strLicenseCode=&action=dxbdln'
        self.save_count = 0

    def init(self):
        print 'job start at', datetime.now()

    def on_finish(self):
        print '%d patents saved' % self.save_count

    def process_child_item(self, item):
        self.save_count += 1
        if self.test_mode:
            print item['apply_code']
            print item['pub_code']
            print item['type']
            print item['code']
            # print item['content']
        else:
            jid = item['apply_code'] + '/' + item['pub_code']
            if not self.store.find_any(self.store.channel + '://' + jid):
                self.store.save(
                    int(time.time()), jid,
                    Patent.form_download_url(item['pub_code'], item['type'], item['code']),
                    item['content'])

    def parse_item(self, page):
        patent_contents = re.findall(
            r'<div class="cp_box">.*?<img src="qrcode/\w{2}\d+\w?.png" width="74" height="74" /></a>',
            page['content'][1], re.S)
        patents = []
        for pc in patent_contents:
            m = re.search(r'申请号:(\d+\w?)</li>', pc)
            if m:
                apply_code = m.group(1)
            else:
                self.failed_link.add('1,%s' % page['indexUrl'])
                continue
            u = re.search(r"javascript:dxb3\('(\w+)','([\w\d]+)','(\d)'\);", pc)
            if not u or len(u.groups()) < 3:
                self.failed_link.add('2,%s' % page['indexUrl'])
                continue
            patents.append({
                'apply_code': apply_code,
                'pub_code': u.group(2),
                'content': pc,
                'type': u.group(1),
                'code': u.group(3)
            })
        return patents

    @staticmethod
    def parse_content(pc):
        con = re.sub(' | ', '',
                     re.sub('<[^>]*>', '', pc.replace('</li>', '\n').replace('<ul>', '\n')))
        con = re.sub(r'\n+', '\n', con.replace(' ', '').replace('\t', ''))
        res = []
        for c in con.split('\n'):
            cr = c.strip()
            if cr != '':
                res.append(cr)
        # merge the 10th line into the 9th; drop it by index (the original used
        # res.remove(c9), which deletes the first equal value) and guard against
        # short results
        if len(res) > 9:
            c9 = res[9]
            res[8] = res[8].replace('全部', '') + c9
            del res[9]
        return '\n'.join(res).lstrip()

    def save(self, saver, page):
        pass
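# parse_content strips the patent abstract block down to plain text purely with
# regular expressions: </li> and <ul> become newlines, all remaining tags are
# removed, then whitespace and blank lines are collapsed. Below is a
# self-contained illustration of that pipeline on a made-up snippet; strip_tags
# is a hypothetical helper and the sample HTML is invented for the demo.
import re


def strip_tags(html):
    text = html.replace('</li>', '\n').replace('<ul>', '\n')
    text = re.sub(r'<[^>]*>', '', text)   # drop any remaining markup
    text = text.replace(' ', '').replace('\t', '')
    text = re.sub(r'\n+', '\n', text)     # collapse blank lines
    return '\n'.join(line.strip() for line in text.split('\n') if line.strip())


if __name__ == '__main__':
    sample = '<div class="cp_box"><ul><li>申请号:201510000001</li><li>申请日:2015.01.01</li></ul></div>'
    print(strip_tags(sample))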
class ListSeedGenQueries(ZhuanliBaseSpider):
    def __init__(self, thcnt=4, limit=5000, recover=False):
        super(ListSeedGenQueries, self).__init__(thcnt)
        self.bs2 = FileSaver("failed_urls.2.txt")
        self.limit = limit
        self.test_mode = False
        self.sf = LinkSaver('seed.2.dat', 'a')
        self.failed_jobs = LinkSaver('seed.2.failed.dat', 'w')
        self.count = 0
        self.failed = 0
        self.sleep = 0
        self.recover = recover
        self.timeout = 60
        self.today = datetime.datetime.now().strftime('%Y')
        # the original assigned to random.seed instead of calling it
        random.seed(int(time.time()))
        self.select_user_agent(ua[2])

    def dispatch(self):
        if self.recover:
            with open('old.2.failed.dat', 'r') as f:
                for l in f:
                    d = l.strip().split(',', 1)
                    data = eval(d[1])
                    data['_failcnt_'] = 0
                    self.add_main_job(data)
        else:
            # the original seed used 'type': 'main' with a 'year' key, which
            # get_query_word/run_job never read; they expect a 'pub' range, so that
            # appears to be the intent here
            self.add_main_job({
                'type': 'pub',
                'pub': ['1985', self.today],
                'app': ['-', '-'],
                'level': -1
            })
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def get_query_word(jobid):
        word = '公开(公告)日=BETWEEN[\'' + jobid['pub'][0] + '\',\'' + jobid['pub'][1] + '\']'
        if 'pub' != jobid['type']:
            word += ' AND 申请日=BETWEEN[\'' + jobid['app'][0] + '\',\'' + jobid['app'][1] + '\']'
        return word

    def run_job(self, jobid):
        url = self.form_query_url(self.get_query_word(jobid), size=1)
        datestr = self.get_date_str(jobid)
        try:
            res = self.need_split(datestr, jobid['level'], url)
        except RuntimeError as e:
            if 'no proxy' in e.message:
                count = 3
                self.re_add_job(jobid)
                proxies = {}
                while count > 0:
                    proxies = KuaidailiProxyManager.load_proxy(30)
                    if proxies['data']['count'] > 0:
                        break
                    count -= 1
                if count <= 0 or not proxies.has_key('data') or not proxies['data'].has_key('count') \
                        or proxies['data']['count'] <= 0:
                    self._shutdown()
                    return
                print 'load %d proxies from kuaidaili' % proxies['data']['count']
                self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0)
                return
            else:
                raise
        if res[0] == 0:
            self.re_add_job(jobid)
            return
        elif res[0] == 1:
            with self.locker:
                self.failed += 1
                self.failed_jobs.add('1,' + str(jobid))
            return
        elif res[0] == 3:
            with self.locker:
                self.count += 1
                self.sf.add('1,%s,%d,%d' % (datestr, jobid['level'], res[1]))
            return
        dates = date_split(jobid[jobid['type']][0], jobid[jobid['type']][1])
        if len(dates) <= 0:
            with self.locker:
                self.failed += 1
                self.failed_jobs.add('0,' + str(jobid))
            return
        if len(dates) == 1:
            if 'pub' == jobid['type']:
                self.add_job({
                    'type': 'app',
                    'pub': jobid['pub'],
                    'level': jobid['level'] + 1,
                    'app': ['1985.01.01', '2009.12.31']
                })
                self.add_job({
                    'type': 'app',
                    'pub': jobid['pub'],
                    'level': jobid['level'] + 1,
                    'app': ['2010.01.01', self.today]
                })
            else:
                with self.locker:
                    self.count += 1
                    self.sf.add('2,%s,%d,%d' % (datestr, jobid['level'], res[1]))
                print '(%d)%s ==> %s cannot split any more' % (jobid['level'], datestr, res[1])
            return
        level = jobid['level'] + 1
        for d in dates:
            job = copy.deepcopy(jobid)
            job['_failcnt_'] = 0
            job['level'] = level
            job[job['type']] = d
            self.add_job(job)

    @staticmethod
    def get_date_str(jobid):
        return '[%s,%s],[%s,%s]' % (jobid['pub'][0], jobid['pub'][1], jobid['app'][0], jobid['app'][1])

    def need_split(self, datestr, level, url):
        # self.select_user_agent(ua[random.randint(0, len(ua) - 1)])
        con = self.request_url(url)
        time.sleep(self.sleep)
        if con is None:
            print 'none response %s' % datestr
            return [0, 0]
        if re.search(u'<title>错误页面</title>', con.text):
            print 'no results %s' % datestr
            return [1, 0]
        counts = re.findall(r'num\w{4}\.value = "(\d+)";', con.text)
        if len(counts) <= 0:
            print 'invalid pages', datestr
            return [1, 0]
        if self.test_mode:
            print 'counts:', counts
        self.check_state()
        paper_count = 0
        for c in counts:
            paper_count += int(c)
        with self.locker:
            print "[%d][%d]-%s ==> %s %s" % (level, paper_count, datestr, len(counts),
                                             'failed' if (paper_count > self.limit) else 'ok')
        if paper_count > self.limit:
            return [2, paper_count]
        return [3, paper_count]

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += 'saved: %d\n' % self.count
            msg += 'failed: %d\n' % self.failed
            spider.util.sendmail(['*****@*****.**'], '%s finished' % self._name, msg)
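# ListSeedGenQueries narrows its query windows recursively: it asks the server
# how many patents fall in a publication/application date range and, when the
# count exceeds self.limit, splits the range and retries the halves. date_split
# is not defined in this file; the sketch below shows one plausible
# implementation (splitting a 'YYYY.MM.DD' range at its midpoint) purely as an
# illustration -- the real helper may behave differently.
import datetime


def date_split(start, end, fmt='%Y.%m.%d'):
    s = datetime.datetime.strptime(start, fmt)
    e = datetime.datetime.strptime(end, fmt)
    if e <= s:
        return [[start, end]]          # nothing left to split
    mid = s + (e - s) / 2
    left_end = mid.strftime(fmt)
    right_start = (mid + datetime.timedelta(days=1)).strftime(fmt)
    if left_end == end or right_start > end:
        return [[start, end]]          # one-day range: cannot split further
    return [[start, left_end], [right_start, end]]


if __name__ == '__main__':
    print(date_split('1985.01.01', '2009.12.31'))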