def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000,
             sleep_max=5, ua='firefox', seeds='detail_seeds', recover=False, year='15',
             bkccs=None, kldms=None, job_tag='', spider_type='detail', post_kldms=True):
    super(ChsiSpider, self).__init__(threadcnt, account, '%s%s' % (tag, job_tag), proxy,
                                     sleep, captcha_limit, sleep_max, ua)
    if kldms is None:
        kldms = ['5', '1']
    if bkccs is None:
        bkccs = ['1', '2']
    self.pagestore = self.new_page_store(spider_type, tag)
    self.full_tag = tag
    self.seeds = seeds
    if proxy:
        self.set_proxy(proxy)
    self.kldms = kldms
    self.bkccs = bkccs
    self.recover = recover
    self.info_saver = LinkSaver('info_data_%s_%s%s' % (spider_type, tag, job_tag))
    self.failed_saver = LinkSaver('%s.failed.seeds.%s%s' % (spider_type, tag, job_tag))
    self.invalid_saver = LinkSaver('%s.invalid.seeds.%s%s' % (spider_type, tag, job_tag))
    self.year = year
    self.failed_list = []
    self.invalid_list = []
    self.spider_type = spider_type
    self.post_kldms = post_kldms
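# --- Illustrative sketch (not from the original sources) ----------------------
# Every snippet in this file persists links and jobs through LinkSaver, whose
# definition is not included here. The class below is a minimal reconstruction
# inferred from the observed call sites: a constructor taking a file name, an
# optional mode defaulting to append, and a buffer_size keyword; add(), flush(),
# readlines(), close(), and a .count attribute. The real implementation may
# buffer, encode, or name things differently.
class LinkSaver(object):
    def __init__(self, name, mode='a', buffer_size=100):
        self._file = open(name, mode)
        self._buffer = []
        self._buffer_size = buffer_size
        self.count = 0  # number of items passed to add()

    def add(self, item):
        self._buffer.append(str(item))
        self.count += 1
        if len(self._buffer) >= self._buffer_size:
            self.flush()

    def flush(self):
        if self._buffer:
            self._file.write('\n'.join(self._buffer) + '\n')
            self._file.flush()
            self._buffer = []

    def readlines(self):
        # only meaningful when opened with a readable mode such as 'r'
        return self._file.readlines()

    def close(self):
        self.flush()
        self._file.close()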
def __init__(self, threadcnt=10, seed_file=None, mode='links', list_file='links',
             recover=False, test=False):
    CourtSpider.__init__(self, threadcnt)
    self._name = 'HangzhouCourt'
    self.pagestore = HZCourtStore()
    self.job_spliter = HZSpliter()
    self._test_mode = test
    self.pagestore.testmode = test
    self.list_data = {'pageno': '1', 'pagesize': '20', 'ajlb': '', 'cbfy': '1300',
                      'ah': '', 'jarq1': '19700101',
                      'jarq2': time.strftime('%Y%m%d', time.localtime()), 'key': ''}
    self.seed_file = seed_file
    self.page_size = 50
    self.mode = mode
    self.list_file = list_file
    self.recover = recover
    self.today = time.strftime('%Y%m%d', time.localtime())
    self.link_saver = LinkSaver(self.list_file)
def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0, captcha_limit=50000000,
             kldms=None, seeds='spec_seeds', recover=False, sleep_max=5, ua='firefox'):
    super(GkChsiSpecialSpider, self).__init__(threadcnt, account, prefix, proxy, sleep,
                                              captcha_limit, sleep_max, ua)
    self.special_saver = GkChsiSpecialPaperStore('yggk_spec_' + prefix)
    self.detail_saver = GkChsiDetailPaperStore('yggk_detail_' + prefix)
    self.prefix = prefix
    self.seeds = seeds
    if proxy:
        self.set_proxy(proxy)
    self.recover = recover
    self.kldms = kldms
    self.parser = HTMLParser.HTMLParser()
    self.curl_share = None
    self.login()
    self.info_saver = LinkSaver(prefix + '_spec_data')
    self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0.0, captcha_limit=5000000,
             sleep_max=5, ua='firefox'):
    super(BaseGkChsiFsxSpider, self).__init__(threadcnt, sleep, failed_limit=2)
    self.select_user_agent(ua)
    self.account = account
    self.full_tag = prefix
    self.proxy = proxy
    self.max_sleep = sleep_max
    if proxy:
        self.set_proxy(proxy)
    self.success_count = 0
    self.lock = threading.Lock()
    self.remain_time = 0
    self.login_time = -1
    self._shutdown = False
    self.job_saver = LinkSaver('undo.jobs.%s' % self.full_tag)
    self.failed_saver = LinkSaver('failed.jobs.%s' % self.full_tag)
    self._captcha_times = 0
    self._captcha_resolved_limits = captcha_limit
    self.success_sleep_count = 0
    self.login()
    self.parser = HTMLParser.HTMLParser()
    self.c = self.full_tag + str(random.randint(1, 100))
    self.except_state = [StatisticsItem('request error'), StatisticsItem('speed error'),
                         StatisticsItem('captcha error'), StatisticsItem('login error'),
                         StatisticsItem('server error'), StatisticsItem('remain time error'),
                         StatisticsItem('query error')]
class FileAbstractParser(CAPParser):
    def __init__(self, channel, name, saver_name=None, db='admin',
                 url='mongodb://*****:*****@localhost/'):
        CAPParser.__init__(self, channel, name, db, url)
        if saver_name is None:
            self._save_name = 'out.csv'
        else:
            self._save_name = saver_name
        self.saver = None

    def init(self):
        self.saver = LinkSaver(self._save_name, 'w')
        self.pre_save(self.saver)
        return CAPParser.init(self)

    def parse(self, page):
        pass

    def pre_save(self, saver):
        pass

    def save(self, saver, page):
        pass

    def on_save(self, items):
        item_list = spider.util.unique_list(items)
        for item in item_list:
            self.save(self.saver, item)
        self.saver.flush()
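# --- Hypothetical usage sketch -------------------------------------------------
# FileAbstractParser above is a template: init() opens the saver and calls
# pre_save() for a header row, while on_save() dedupes items and delegates each
# one to save(). A minimal CSV subclass could look like this; DemoCsvParser,
# its column names, and the page keys are illustrative, not from the original
# project:
class DemoCsvParser(FileAbstractParser):
    def pre_save(self, saver):
        saver.add('id,title')  # one header row before any data

    def save(self, saver, page):
        saver.add('%s,%s' % (page.get('id', ''), page.get('title', '')))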
class FullTextSeedGen(CWPParser):
    def __init__(self, channel, save='fulltext.seed.txt', db='zhuanli',
                 dburl='mongodb://localhost/zhuanli'):
        CWPParser.__init__(self, channel, channel, db, dburl)
        self.seed_saver = LinkSaver(save)

    def process_child_item(self, item):
        print item
        self.seed_saver.add(item)

    def parse_item(self, page):
        apc = page['indexUrl'].split('://')[1]
        m = re.search(r"d.strWhere.value = \"pnm='([\w\d]+)'\";", page['content'][1])
        if m:
            pnm = m.group(1)
        else:
            print 'cannot find patent number:', page['indexUrl']
            return []
        s = re.search(r'd\.strSources\.value = "(\w+)";', page['content'][1])
        if s:
            pt = s.group(1)
        else:
            print 'cannot find patent type:', page['indexUrl']
            return []
        return ['%s-%s-%s' % (pnm, pt, apc)]

    def on_finish(self):
        print '%d links saved' % self.seed_saver.count
def dispatch(self):
    kldms = self.fetch_kldms()
    if len(kldms) == 2:
        self.kldms = kldms
    # data_tmp = {'wclx': 1, 'score': 0, 'bkcc': 1, 'kldm': 1, 'years': 15, 'type': 'score'}
    for kldm in self.kldms:
        self.minscore[str(kldm)] = -1
    if self.recover:
        # read back the jobs persisted by an earlier run, one repr'd dict per line
        # (the original wrapped this in a redundant `with open(...)` whose handle
        # was immediately shadowed by the LinkSaver; the LinkSaver alone suffices)
        job_saver = LinkSaver(self.prefix + '_undo_jobs_old', 'r')
        lines = job_saver.readlines()
        job_saver.close()
        for l in lines:
            self.add_main_job(eval(l))
        print 'recover %d jobs' % len(lines)
    else:
        bkccs = [1, 2]  # undergraduate, junior college (本科, 专科)
        for kldm in self.kldms:
            for bkcc in bkccs:
                for score in range(self.highscore, -1, -1):
                    data = {'wclx': 1, 'score': score, 'bkcc': bkcc, 'kldm': kldm,
                            'years': 15}
                    self.add_main_job({'data': data, 'type': 'score'})
    time.sleep(2)
    self.wait_q()
    self.add_job(None)
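# --- Illustrative round trip (assumption) --------------------------------------
# The recover branch above replays jobs that an earlier run wrote out one
# repr'd dict per line; eval(l) turns each line back into a job dict. The
# writer side is not shown in these snippets, but under that assumption it
# would amount to something like (file name here is made up):
saver = LinkSaver('demo_undo_jobs_old', 'w')
saver.add(str({'data': {'wclx': 1, 'score': 500, 'bkcc': 1, 'kldm': 1, 'years': 15},
               'type': 'score'}))
saver.flush()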
class ShanghaiStoreFilter(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'shanghai_court')
        self.pagestore = ShanghaiCourtStore('sh_court_2')
        self.link_saver = LinkSaver('wrong.id.txt')

    def process_child_item(self, item):
        self.pagestore.save(int(item['crawlerUpdateTime'] / 1000), item['indexUrl'][17:],
                            item['realUrl'], item['content'][1])

    def parse_item(self, page):
        if page['indexUrl'][17] != '/':
            return [page]
        self.link_saver.add(page['indexUrl'][17:])
        return []

    def on_finish(self):
        self.link_saver.flush()

    def on_save(self, items):
        for item in items:
            self.pagestore.save(int(item['crawlerUpdateTime'] / 1000), item['indexUrl'][17:],
                                item['realUrl'], item['content'][1])
def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0.0, captcha_limit=50000000,
             seeds='detail_seeds', recover=False, sleep_max=5, ua='firefox', year='15',
             bkccs=None, kldms=None):
    super(GkChsiDetailSpider, self).__init__(threadcnt, account, prefix, proxy, sleep,
                                             captcha_limit, sleep_max, ua)
    if kldms is None:
        kldms = ['5', '1']
    if bkccs is None:
        bkccs = ['1', '2']
    self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + prefix)
    self.prefix = prefix
    self.seeds = seeds
    if proxy:
        self.set_proxy(proxy)
    self.kldms = kldms
    self.bkccs = bkccs
    self.recover = recover
    self.parser = HTMLParser.HTMLParser()
    self.info_saver = LinkSaver(prefix + '_detail_data')
    self.year = year
    self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
def __init__(self, thread_count=1, full_mode=False, seeds='seeds'):
    super(ShenzhenCourtListSpider, self).__init__(thread_count, 'list.spider.log')
    self._name = 'ShenzhenListSpider'
    self.job_spliter = ShenzhenSpliter()
    self._captcha_times = range(0, thread_count)
    self.test_mode = False
    self.pagesize = 50
    self.full_mode = full_mode
    self.link_saver = LinkSaver(seeds, 'a')
class ChannelParser:
    def __init__(self, name='failed.txt', mode='a'):
        self.failed_saver = LinkSaver(name, mode)

    @abc.abstractmethod
    def parse(self, jid, content):
        raise NotImplementedError('virtual function called')

    def on_failed(self, message):
        self.failed_saver.add(message)
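# --- Hypothetical subclass ------------------------------------------------------
# ChannelParser above leaves parse() abstract and provides on_failed() for error
# bookkeeping, so a concrete parser only has to implement parse().
# DemoChannelParser is illustrative, not from the original project:
class DemoChannelParser(ChannelParser):
    def parse(self, jid, content):
        if not content:
            self.on_failed('empty content for job %s' % jid)
            return None
        return content.strip()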
class Extractor(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'fs_court', 'fs')
        self.saver = LinkSaver('seed.txt')

    def process_child_item(self, item):
        self.saver.add(item)
        print '%s saved' % item

    def parse_item(self, page):
        if '页面不存在' in page['content'][1]:  # 'page does not exist'
            return [page['realUrl']]
        return []
class SeedParser(WenshuSpider):
    date_format = '%Y%m%d'

    def __init__(self, thcnt=4, page=15):
        WenshuSpider.__init__(self, thcnt)
        self.source = WenshuSeedDb('ws_seed')
        self.link_saver = LinkSaver('seeds.dat', buffer_size=400)
        self.page = page

    def dispatch(self):
        seeds = self.source.export_seeds()
        print 'load %d seeds' % len(seeds)
        for seed in seeds:
            date = seed['indexUrl'].split('://')[1]
            eval_str = seed['content'][1:-1].replace('\\"', '"')
            res = eval(eval_str)
            try:
                if (isinstance(res, tuple) or isinstance(res, list)) and len(res) > 0:
                    self.add_main_job({'type': 'main', 'date': date.encode('utf-8'),
                                       'count': int(res[0]['Count'])})
                else:
                    print 'invalid seed', seed
            except KeyError as e:
                Log.error('KeyError %s' % e.message)
                traceback.print_exc()
                print seed
                print eval_str
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def run_job(self, jobid):
        pagecnt = (jobid['count'] + self.page / 2) / self.page
        for index in range(1, pagecnt + 1):
            self.link_saver.add(str({'date': jobid['date'], 'count': jobid['count'],
                                     'index': index, 'page': self.page}))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.link_saver.flush()
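# --- Worked example: page-count rounding in run_job() above --------------------
# With Python 2 integer division, (count + page / 2) / page rounds to the
# nearest whole page instead of always rounding up or down:
#   count=100, page=15 -> (100 + 7) / 15 = 7   (100/15 = 6.67, rounded up)
#   count=110, page=15 -> (110 + 7) / 15 = 7   (110/15 = 7.33, rounded down)
#   count=113, page=15 -> (113 + 7) / 15 = 8   (113/15 = 7.53, rounded up)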
def pre_save(self, saver):
    saver.add(GkChsiParser.title)
    s2 = LinkSaver('res_score_%s' % self.name, 'w')
    s2.add('省市,科类,层次,位次,分数')  # header: province, subject category, level, rank, score
    for r in self.score_rank:
        s2.add(r)
    s2.flush()
def dispatch(self):
    kldms = self.fetch_kldms()
    if len(kldms) == 2:
        self.kldms[0] = str(kldms[0])
        self.kldms[1] = str(kldms[1])
    # data_tmp = {'wclx': 1, 'score': 0, 'bkcc': 1, 'kldm': 1, 'years': 15, 'type': 'score'}
    for kldm in self.kldms:
        self.min_score_arr[str(kldm)] = -1
    if self.recover:
        job_saver = LinkSaver(self.prefix + '_undo_jobs_old', 'r')
        lines = job_saver.readlines()
        job_saver.close()
        for l in lines:
            self.add_main_job(eval(l.strip()))
        print 'recover %d jobs' % len(lines)
    else:
        bkccs = [1, 2]  # undergraduate, junior college (本科, 专科)
        mid_score = self.min_score + (self.highscore - self.min_score) * 3 / 4
        for score in range(0, max(self.highscore - mid_score, mid_score - self.min_score)):
            up_score = mid_score + score
            down_score = mid_score - score - 1
            for kldm in self.kldms:
                for bkcc in bkccs:
                    if up_score <= self.highscore:
                        data = {'wclx': 1, 'score': up_score, 'bkcc': bkcc,
                                'kldm': kldm, 'years': 15}
                        self.add_main_job({'data': data, 'type': 'score'})
                    if down_score >= self.min_score and down_score > 0:
                        data = {'wclx': 1, 'score': down_score, 'bkcc': bkcc,
                                'kldm': kldm, 'years': 15}
                        self.add_main_job({'data': data, 'type': 'score'})
    time.sleep(2)
    self.wait_q()
    self.add_job(None)
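# --- Worked example: the outward score sweep in dispatch() above ----------------
# mid_score sits three quarters of the way up the score range, and jobs are
# queued fanning outwards from it, so the band just below the top (where
# admitted scores cluster) is crawled first. With min_score=0, highscore=750:
#   mid_score = 0 + 750 * 3 / 4 = 562
#   score=0 -> up_score=562, down_score=561
#   score=1 -> up_score=563, down_score=560
#   ...continuing until both ends of [min_score, highscore] are covered.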
def __init__(self, thcnt=4, limit=5000, recover=False):
    super(ListSeedGenQueries, self).__init__(thcnt)
    self.bs2 = FileSaver("failed_urls.2.txt")
    self.limit = limit
    self.test_mode = False
    self.sf = LinkSaver('seed.2.dat', 'a')
    self.failed_jobs = LinkSaver('seed.2.failed.dat', 'w')
    self.count = 0
    self.failed = 0
    self.sleep = 0
    self.recover = recover
    self.timeout = 60
    self.today = datetime.datetime.now().strftime('%Y')  # note: stores the current year only
    # original wrote `random.seed = int(time.time())`, which rebinds the
    # function instead of seeding the generator
    random.seed(int(time.time()))
    self.select_user_agent(ua[2])
def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0, captcha_limit=50000000,
             kldms=None, seeds='spec_seeds', year='15', bkccs=None, recover=False,
             recover_seeds=None, sleep_max=5, ua='firefox'):
    super(GkChsiSpecialSpider2, self).__init__(threadcnt, account, prefix, proxy, sleep,
                                               captcha_limit, sleep_max, ua)
    if kldms is None:
        kldms = ['5', '1']
    if bkccs is None:
        bkccs = ['1', '2']
    self.special_saver = GkChsiSpecialPaperStore('yggk_spec_' + prefix)
    self.prefix = prefix
    self.seeds = seeds
    if proxy:
        self.set_proxy(proxy)
    self.recover = recover
    self.kldms = kldms
    self.bkccs = bkccs
    self.parser = HTMLParser.HTMLParser()
    self.year = year
    self.info_saver = LinkSaver(prefix + '_spec_data')
    self.recover_seeds = recover_seeds
def __init__(self, thcnt, mode='id', recover=True, seeds='seed.dat'):
    ZhuanliBaseSpider.__init__(self, thcnt, recover, timeout=90)
    Main.__init__(self)
    self.short_tag = 't:m:s:r:o:h:v:'
    self.tags = ['recover=', 'threads=', 'mode=', 'seeds=', 'output=']
    self.seeds = seeds
    self.page_size = 20  # 3, 10 or 20
    self.pagestore = PatentAbstractStore('abstract')
    self.failed_saver = FailedJobSaver('failed_job.txt')
    self.seed_saver = LinkSaver('seed.year.txt', 'a+')
    self.job_log = LinkSaver('abstract.%s.log' % mode, 'a+')
    self.mode = mode
    self.__version = '1.0.0'
    self.utils = threading.local()
    self.sp_errors = OrderedDict()
    self.pre_save_count = 0
    self.properties = PropertiesManager()
    self.can_load_seed = True
def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0.0, highscore=750,
             captcha_limit=50000, kldms=None, seeds=None, recover=False, sleep_max=5,
             ua='firefox'):
    super(GkChsiSchoolSpider, self).__init__(threadcnt, account, prefix, proxy, sleep,
                                             captcha_limit, sleep_max, ua)
    if kldms is None:
        kldms = [1, 5]
    self.pagestore = GkChsiSchoolPaperStore('yggk_sch_' + prefix)
    self.prefix = prefix
    if proxy:
        self.set_proxy(proxy)
    self.highscore = highscore
    self.minscore = {}
    self.recover = recover
    self.kldms = kldms
    self.parser = HTMLParser.HTMLParser()
    self.curl_share = None
    self.login()
    self.info_saver = LinkSaver(prefix + '_data')
    self.seeds = seeds
def __init__(self, threadcnt, last_page=None, total_page=22305, save_file='seeds.dat',
             sleep=0.0, proxy_life=180):
    super(BjListSpider, self).__init__(threadcnt, 'BjListSpider', proxy_life=proxy_life)
    self.test_mode = False
    self.sleep = sleep
    self.zero_link_count = 0
    self.lock = threading.Lock()
    self._shutdown = False
    self.result_saver = LinkSaver(save_file, 'a')
    self.captcha = FoodMakerExtendLock(threadcnt - 1)
    self.last_page = last_page
    self.total_page = total_page
def __init__(self, thread_count=5, name='ShanghaiCourtListSpider', log='list.spider.log',
             out='links', recover=False):
    CourtSpider.__init__(self, thread_count, log)
    self._name = name
    self.pagestore = ShanghaiSeedStore()
    self.linkdb = ShanghaiLinkDb('sh_link')
    self.seedb = ShanghaiLinkDb('sh_seed')
    self.link_saver = LinkSaver(out)
    self.lock = threading.Lock()
    self.pager_failed_count = 0
    self.recover = recover
def __init__(self, thread_count=1, list_only=False, save_link=False, from_link=False,
             recover=False, seeds='seeds'):
    super(ShenzhenCourtSpider, self).__init__(thread_count)
    self._name = 'ShenzhenCourt'
    self.pagestore = ShenzhenCourtStore()
    self.job_spliter = ShenzhenSpliter()
    self._captcha_times = range(0, thread_count)
    self.test_mode = False
    self.pagesize = 50
    self.list_only = list_only
    self.save_link = save_link
    self.link_saver = None
    self.seeds = seeds
    if self.save_link:
        self.link_saver = LinkSaver('saved.links', 'a+b')
    self.from_link = from_link
    self.recover = recover
class ShanghaiExtractor(CWPParser):
    """Extract case numbers (案号) from judgment pages."""

    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'court')
        self.an_saver = LinkSaver('ah.%s.txt' % self.name)

    def process_child_item(self, item):
        line = '%s|%s' % (item[0], item[1])
        print line
        self.an_saver.add(line)

    def init(self):
        print 'job start at', datetime.datetime.now()
        return CWPParser.init(self)

    def parse_item(self, page):
        m = re.search(r'((\d{4}).*\d+号)', page['content'][1])
        if m:
            return [[m.group(1), page['indexUrl'][17:].encode()]]
        return []

    def on_finish(self):
        self.an_saver.flush()
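# --- Illustrative input/output for parse_item() above (made-up data) -----------
# The pattern r'((\d{4}).*\d+号)' matches a span starting at a four-digit year
# and running to a number followed by 号, i.e. a Chinese court case number:
#   content:   '...（2015）沪一中民终字第123号...'
#   group(1):  '2015）沪一中民终字第123号'
# parse_item() pairs that match with the page id sliced out of indexUrl.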
def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000,
             seeds='detail_seeds', recover=False, sleep_max=5, ua='firefox', year='15',
             bkccs=None, kldms=None, job_tag=''):
    super(BaseChsiSpider, self).__init__(threadcnt, account, tag, proxy, sleep,
                                         captcha_limit, sleep_max, ua)
    if kldms is None:
        kldms = ['5', '1']
    if bkccs is None:
        bkccs = ['1', '2']
    self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + tag)
    self.full_tag = tag
    self.seeds = seeds
    if proxy:
        self.set_proxy(proxy)
    self.kldms = kldms
    self.bkccs = bkccs
    self.recover = recover
    self.parser = HTMLParser.HTMLParser()
    self.info_saver = LinkSaver(tag + '_detail_data')
    self.failed_saver = LinkSaver('detail.failed.seeds.' + tag + job_tag)
    self.year = year
    self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
    self.failed_list = []
    self.last_request_time = time.time()
def __init__(self, thread_count=5, seeds=None, start=1, name='ShanghaiCourtSpider',
             list_only=False, paper_seeds=None, recover=False):
    ProxySwapSpider.__init__(self, thread_count, proxy_life=3600)
    if seeds is None:
        seeds = []
    self._name = name
    self.seeds = seeds
    self.pagestore = ShanghaiCourtStore()
    self.page_size = 20
    self.list_only = list_only
    self.search_url_format = 'http://www.hshfy.sh.cn:8081/flws/content.jsp?wz=&pa=%s&more=1&toPage=%d&totalPage=%d&perPaperLink=%d&perPaperNum=%d'
    if self.list_only:
        self.link_saver = LinkSaver('links', 'a')
    self.paper_seeds = paper_seeds
    self.lock = threading.Lock()
    self.pager_failed_count = 0
    self.recover = recover
    self.start = start
def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0, highscore=750,
             captcha_limit=50000, kldms=None, recover=False, sleep_max=5, min_score=0,
             ua='firefox'):
    super(GkChsiFsxSpider, self).__init__(threadcnt)
    if kldms is None:
        kldms = [1, 5]
    self.select_user_agent(ua)
    self.pagestore = GkChsiFsxPaperStore('gkchsi_' + prefix)
    self.score_saver = GkChsiFsxScoreStore('gkchsi_score_' + prefix)
    self.account = account
    self.prefix = prefix
    self.proxy = proxy
    self.sleep = sleep
    self.cur_sleep = sleep
    self.max_sleep = sleep_max
    if proxy:
        self.set_proxy(proxy)
    self.highscore = highscore
    self.min_score_arr = {}
    self.success_count = 0
    self.lock = threading.Lock()
    self.remain_time = 0
    self.login_time = -1
    self.__shutdown = False
    self.job_saver = LinkSaver(self.prefix + '_undo_jobs')
    self.__captcha_times = 0
    self.__captcha_resolved_limits = captcha_limit
    self.recover = recover
    self.success_sleep_count = 0
    self.kldms = kldms
    self.parser = HTMLParser.HTMLParser()
    self.curl_share = None
    self.login()
    self.min_score = min_score
def __init__(self, thread_count=5, start=2000, split_limit=3000,
             name='ShanghaiSeedGenerator', recover=False, seeds=None):
    # `seeds` parameter added here: the original assigned an undefined global
    # name `seeds` to self.seeds, which raises NameError at runtime.
    super(ShanghaiListGenerator, self).__init__(thread_count)
    self.select_user_agent('=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36')
    self._name = name
    self.seeds = seeds
    self.linkdb = ShanghaiLinkDb('sh_link')
    self.page_size = 15
    self.link_saver = LinkSaver('links', 'a')
    self.lock = threading.Lock()
    self.pager_failed_count = 0
    self.recover = recover
    self.start = start
    self.split_limit = split_limit
def __init__(self, thcnt, name='BeijingCourtSpider', link_saver='links', saver_mode='a',
             sleep=0.0, proxy_life=180, captcha_limit=100):
    super(BJSpider, self).__init__(thcnt, proxy_life)
    self._name = name
    self.link_saver = LinkSaver(link_saver, saver_mode)
    self.total_content_failed = 0
    self.current_failed = StatisticsItem()
    self.linkstore = SpiderLinkStore('bj_court')
    # test parameters
    self.test_mode = False
    self._shutdown_in_test = False
    self.sleep = sleep
    self.captcha = FoodMakerExtendLock(thcnt - 1)
    self._shutdown = False
    self.captcha_times = 0
    self.captcha_limit = captcha_limit
    self.captcha_lock = threading.Lock()
def __init__(self):
    CWPParser.__init__(self, 'abs_list', 'abs_list', 'zhuanli')
    self.store = PatentStore('abstract')
    self.failed_link = LinkSaver('abstract.parser.failed.txt')
    self.url_format = 'http://epub.sipo.gov.cn/dxbdl.action?strSources=fmmost&strWhere=%s&recordCursor=0&strLicenseCode=&action=dxbdln'
    self.save_count = 0