Example #1
 def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, sleep_max=5,
              ua='firefox', seeds='detail_seeds', recover=False, year='15', bkccs=None, kldms=None,
              job_tag='', spider_type='detail', post_kldms=True):
     super(ChsiSpider, self).__init__(threadcnt, account, '%s%s' % (tag, job_tag), proxy, sleep,
                                      captcha_limit, sleep_max, ua)
     if kldms is None:
         kldms = ['5', '1']
     if bkccs is None:
         bkccs = ['1', '2']
     self.pagestore = self.new_page_store(spider_type, tag)
     self.full_tag = tag
     self.seeds = seeds
     if proxy:
         self.set_proxy(proxy)
     self.kldms = kldms
     self.bkccs = bkccs
     self.recover = recover
     self.info_saver = LinkSaver('info_data_%s_%s%s' % (spider_type, tag, job_tag))
     self.failed_saver = LinkSaver('%s.failed.seeds.%s%s' % (spider_type, tag, job_tag))
     self.invalid_saver = LinkSaver('%s.invalid.seeds.%s%s' % (spider_type, tag, job_tag))
     self.year = year
     self.failed_list = []
     self.invalid_list = []
     self.spider_type = spider_type
     self.post_kldms = post_kldms
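
Every example on this page constructs a LinkSaver to buffer records and append them to a plain text file. A minimal sketch of the interface the examples rely on (a file name plus optional mode and buffer_size, an add/flush pair, readlines/close when a file is reopened for recovery, and a count attribute) could look like the code below. This is an inferred reconstruction for reading the examples, not the framework's actual implementation:

class LinkSaver(object):
    # inferred sketch: buffers lines and writes them to a text file
    def __init__(self, name, mode='a', buffer_size=100):
        self.count = 0                      # records added so far
        self._buffer = []
        self._buffer_size = buffer_size
        self._file = open(name, mode)

    def add(self, line):
        # queue one record; write the batch out once the buffer fills
        self._buffer.append(str(line))
        self.count += 1
        if len(self._buffer) >= self._buffer_size:
            self.flush()

    def flush(self):
        if self._buffer:
            self._file.write('\n'.join(self._buffer) + '\n')
            self._file.flush()
            self._buffer = []

    def readlines(self):
        # only meaningful when the file was opened with mode 'r'
        return self._file.readlines()

    def close(self):
        self.flush()
        self._file.close()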
Example #2
 def __init__(self,
              threadcnt=10,
              seed_file=None,
              mode='links',
              list_file='links',
              recover=False,
              test=False):
     CourtSpider.__init__(self, threadcnt)
     self._name = 'HangzhouCourt'
     self.pagestore = HZCourtStore()
     self.job_spliter = HZSpliter()
     self._test_mode = test
     self.pagestore.testmode = test
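     # query form posted to the court's list endpoint; field meanings are
     # inferred: ajlb = case category, cbfy = handling court, ah = case
     # number, jarq1/jarq2 = closing-date range, key = full-text keyword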
     self.list_data = {
         'pageno': '1',
         'pagesize': '20',
         'ajlb': '',
         'cbfy': '1300',
         'ah': '',
         'jarq1': '19700101',
         'jarq2': time.strftime('%Y%m%d', time.localtime()),
         'key': ''
     }
     self.seed_file = seed_file
     self.page_size = 50
     self.mode = mode
     self.list_file = list_file
     self.recover = recover
     self.today = time.strftime('%Y%m%d', time.localtime())
     self.link_saver = LinkSaver(self.list_file)
Example #3
    def __init__(self,
                 threadcnt,
                 account,
                 prefix,
                 proxy=None,
                 sleep=0,
                 captcha_limit=50000000,
                 kldms=None,
                 seeds='spec_seeds',
                 recover=False,
                 sleep_max=5,
                 ua='firefox'):
        super(GkChsiSpecialSpider,
              self).__init__(threadcnt, account, prefix, proxy, sleep,
                             captcha_limit, sleep_max, ua)
        self.special_saver = GkChsiSpecialPaperStore('yggk_spec_' + prefix)
        self.detail_saver = GkChsiDetailPaperStore('yggk_detail_' + prefix)
        self.prefix = prefix
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)

        self.recover = recover
        self.kldms = kldms
        self.parser = HTMLParser.HTMLParser()
        self.curl_share = None
        self.login()
        self.info_saver = LinkSaver(prefix + '_spec_data')
        self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
Example #4
 def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0.0, captcha_limit=5000000, sleep_max=5,
              ua='firefox'):
     super(BaseGkChsiFsxSpider, self).__init__(threadcnt, sleep, failed_limit=2)
     self.select_user_agent(ua)
     self.account = account
     self.full_tag = prefix
     self.proxy = proxy
     self.max_sleep = sleep_max
     if proxy:
         self.set_proxy(proxy)
     self.success_count = 0
     self.lock = threading.Lock()
     self.remain_time = 0
     self.login_time = -1
     self._shutdown = False
     self.job_saver = LinkSaver('undo.jobs.%s' % self.full_tag)
     self.failed_saver = LinkSaver('failed.jobs.%s' % self.full_tag)
     self._captcha_times = 0
     self._captcha_resolved_limits = captcha_limit
     self.success_sleep_count = 0
     self.login()
     self.parser = HTMLParser.HTMLParser()
     self.c = self.full_tag + str(random.randint(1, 100))
     self.except_state = [StatisticsItem('request error'), StatisticsItem('speed error'),
                          StatisticsItem('captcha error'),
                          StatisticsItem('login error'), StatisticsItem('server error'),
                          StatisticsItem('remain time error'), StatisticsItem('query error')]
Example #5
 def __init__(self,
              channel,
              save='fulltext.seed.txt',
              db='zhuanli',
              dburl='mongodb://localhost/zhuanli'):
     CWPParser.__init__(self, channel, channel, db, dburl)
     self.seed_saver = LinkSaver(save)
Example #6
class FileAbstractParser(CAPParser):
    def __init__(self,
                 channel,
                 name,
                 saver_name=None,
                 db='admin',
                 url='mongodb://*****:*****@localhost/'):
        CAPParser.__init__(self, channel, name, db, url)
        if saver_name is None:
            self._save_name = 'out.csv'
        else:
            self._save_name = saver_name
        self.saver = None

    def init(self):
        self.saver = LinkSaver(self._save_name, 'w')
        self.pre_save(self.saver)
        return CAPParser.init(self)

    def parse(self, page):
        pass

    def pre_save(self, saver):
        pass

    def save(self, saver, page):
        pass

    def on_save(self, items):
        item_list = spider.util.unique_list(items)
        for item in item_list:
            self.save(self.saver, item)
        self.saver.flush()
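
FileAbstractParser is a template: subclasses override pre_save to write a header row and save to emit one line per item, while on_save de-duplicates the batch and flushes. A hypothetical CSV-emitting subclass (the channel name, saver_name, and page fields are made up for illustration) might look like:

class TitleCsvParser(FileAbstractParser):
    def __init__(self, channel):
        FileAbstractParser.__init__(self, channel, 'titles', saver_name='titles.csv')

    def pre_save(self, saver):
        saver.add('id,title')  # CSV header row

    def save(self, saver, page):
        # assumes each item carries 'id' and 'title' keys (illustrative only)
        saver.add('%s,%s' % (page['id'], page['title']))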
Example #7
class FullTextSeedGen(CWPParser):
    def __init__(self,
                 channel,
                 save='fulltext.seed.txt',
                 db='zhuanli',
                 dburl='mongodb://localhost/zhuanli'):
        CWPParser.__init__(self, channel, channel, db, dburl)
        self.seed_saver = LinkSaver(save)

    def process_child_item(self, item):
        print item
        self.seed_saver.add(item)

    def parse_item(self, page):
        apc = page['indexUrl'].split('://')[1]
        m = re.search(r"d.strWhere.value = \"pnm='([\w\d]+)'\";",
                      page['content'][1])
        if m:
            pnm = m.group(1)
        else:
            print 'cannot find patent number:', page['indexUrl']
            return []
        s = re.search(r'd\.strSources\.value = "(\w+)";', page['content'][1])
        if s:
            pt = s.group(1)
        else:
            print 'cannot find patent type:', page['indexUrl']
            return []
        return ['%s-%s-%s' % (pnm, pt, apc)]

    def on_finish(self):
        print '%d link saved' % self.seed_saver.count
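
parse_item recovers the patent number and source type from inline JavaScript embedded in the stored page. On a fabricated content fragment (the values below are made up), the two regexes behave like this:

import re

content = 'd.strWhere.value = "pnm=\'CN1234567\'"; d.strSources.value = "fmmost";'
m = re.search(r"d.strWhere.value = \"pnm='([\w\d]+)'\";", content)
s = re.search(r'd\.strSources\.value = "(\w+)";', content)
print m.group(1), s.group(1)  # CN1234567 fmmost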
Example #8
    def dispatch(self):
        kldms = self.fetch_kldms()
        if len(kldms) == 2:
            self.kldms = kldms
        # data_tmp = {'wclx': 1, 'score': 0, 'bkcc': 1, 'kldm': 1, 'years': 15, 'type': 'score'}
        for kldm in self.kldms:
            self.minscore[str(kldm)] = -1
        if self.recover:
            job_saver = LinkSaver(self.prefix + '_undo_jobs_old', 'r')
            lines = job_saver.readlines()
            job_saver.close()
            for l in lines:
                self.add_main_job(eval(l))
            print 'recover %d jobs' % len(lines)
        else:
            bkccs = [1, 2]  # 1 = undergraduate (本科), 2 = junior college (专科)
            for kldm in self.kldms:
                for bkcc in bkccs:
                    for score in range(self.highscore, -1, -1):
                        data = {
                            'wclx': 1,
                            'score': score,
                            'bkcc': bkcc,
                            'kldm': kldm,
                            'years': 15
                        }
                        self.add_main_job({'data': data, 'type': 'score'})

        time.sleep(2)
        self.wait_q()
        self.add_job(None)
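
The recover branch assumes each line of the <prefix>_undo_jobs_old file is the repr() of a job dict, presumably written by a job saver elsewhere in the spider, so eval(l) restores it directly. A line would look like this (values fabricated):

{'data': {'wclx': 1, 'score': 512, 'bkcc': 1, 'kldm': 1, 'years': 15}, 'type': 'score'}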
Example #9
class ShanghaiStoreFilter(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'shanghai_court')
        self.pagestore = ShanghaiCourtStore('sh_court_2')
        self.link_saver = LinkSaver('wrong.id.txt')

    def process_child_item(self, item):
        self.pagestore.save(int(item['crawlerUpdateTime'] / 1000),
                            item['indexUrl'][17:], item['realUrl'],
                            item['content'][1])

    def parse_item(self, page):
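        # the first 17 characters of indexUrl are assumed to be a fixed
        # scheme/host prefix; everything after them is the document id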
        if page['indexUrl'][17] != '/':
            return [page]
        self.link_saver.add(page['indexUrl'][17:])
        return []

    def on_finish(self):
        self.link_saver.flush()

    def on_save(self, items):
        for item in items:
            self.pagestore.save(int(item['crawlerUpdateTime'] / 1000),
                                item['indexUrl'][17:], item['realUrl'],
                                item['content'][1])
Example #10
 def __init__(self,
              threadcnt,
              account,
              prefix,
              proxy=None,
              sleep=0.0,
              captcha_limit=50000000,
              seeds='detail_seeds',
              recover=False,
              sleep_max=5,
              ua='firefox',
              year='15',
              bkccs=None,
              kldms=None):
     super(GkChsiDetailSpider,
           self).__init__(threadcnt, account, prefix, proxy, sleep,
                          captcha_limit, sleep_max, ua)
     if kldms is None:
         kldms = ['5', '1']
     if bkccs is None:
         bkccs = ['1', '2']
     self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + prefix)
     self.prefix = prefix
     self.seeds = seeds
     if proxy:
         self.set_proxy(proxy)
     self.kldms = kldms
     self.bkccs = bkccs
     self.recover = recover
     self.parser = HTMLParser.HTMLParser()
     self.info_saver = LinkSaver(prefix + '_detail_data')
     self.year = year
     self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
Example #11
 def __init__(self, thread_count=1, full_mode=False, seeds='seeds'):
     super(ShenzhenCourtListSpider, self).__init__(thread_count, 'list.spider.log')
     self._name = 'ShenzhenListSpider'
     self.job_spliter = ShenzhenSpliter()
     self._captcha_times = range(0, thread_count)
     self.test_mode = False
     self.pagesize = 50
     self.full_mode = full_mode
     self.link_saver = LinkSaver(seeds, 'a')
Example #12
class ChannelParser:
    def __init__(self, name='failed.txt', mode='a'):
        self.failed_saver = LinkSaver(name, mode)
        pass

    @abc.abstractmethod
    def parse(self, jid, content):
        raise NotImplementedError('virtual function called')

    def on_failed(self, message):
        self.failed_saver.add(message)
Example #13
class Extractor(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'fs_court', 'fs')
        self.saver = LinkSaver('seed.txt')

    def process_child_item(self, item):
        self.saver.add(item)
        print '%s saved' % item

    def parse_item(self, page):
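        # '页面不存在' means "page does not exist"; the real URLs of such
        # dead pages are re-emitted as seeds for another crawl attempt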
        if '页面不存在' in page['content'][1]:
            return [page['realUrl']]
        return []
Example #14
class SeedParser(WenshuSpider):
    date_format = '%Y%m%d'

    def __init__(self, thcnt=4, page=15):
        WenshuSpider.__init__(self, thcnt)
        self.source = WenshuSeedDb('ws_seed')
        self.link_saver = LinkSaver('seeds.dat', buffer_size=400)
        self.page = page

    def dispatch(self):
        seeds = self.source.export_seeds()

        print 'load %d seeds' % len(seeds)
        for seed in seeds:
            date = seed['indexUrl'].split('://')[1]
            eval_str = seed['content'][1:-1].replace('\\"', '"')
            res = eval(eval_str)
            try:
                if isinstance(res, (tuple, list)) and len(res) > 0:
                    self.add_main_job({
                        'type': 'main',
                        'date': date.encode('utf-8'),
                        'count': int(res[0]['Count'])
                    })
                else:
                    print 'invalid seed', seed
            except KeyError as e:
                Log.error('KeyError %s' % e.message)
                traceback.print_exc()
                print seed
                print eval_str
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def run_job(self, jobid):
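        # integer division rounds to the nearest page count here; a strict
        # ceiling would be (jobid['count'] + self.page - 1) / self.page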
        pagecnt = (jobid['count'] + self.page / 2) / self.page
        for index in range(1, pagecnt + 1):
            self.link_saver.add(
                str({
                    'date': jobid['date'],
                    'count': jobid['count'],
                    'index': index,
                    'page': self.page
                }))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.link_saver.flush()
Example #15
 def pre_save(self, saver):
     saver.add(GkChsiParser.title)
     s2 = LinkSaver('res_score_%s' % self.name, 'w')
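     # header columns: 省市 = province/city, 科类 = subject category,
     # 层次 = level, 位次 = rank, 分数 = score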
     s2.add('省市,科类,层次,位次,分数')
     for r in self.score_rank:
         s2.add(r)
     s2.flush()
Example #16
    def dispatch(self):
        kldms = self.fetch_kldms()
        if len(kldms) == 2:
            self.kldms[0] = str(kldms[0])
            self.kldms[1] = str(kldms[1])
        # data_tmp = {'wclx': 1, 'score': 0, 'bkcc': 1, 'kldm': 1, 'years': 15, 'type': 'score'}
        for kldm in self.kldms:
            self.min_score_arr[str(kldm)] = -1
        if self.recover:
            job_saver = LinkSaver(self.prefix + '_undo_jobs_old', 'r')
            lines = job_saver.readlines()
            job_saver.close()
            for l in lines:
                self.add_main_job(eval(l.strip()))
            print 'recover %d jobs' % len(lines)
        else:
            bkccs = [1, 2]  # 1 = undergraduate (本科), 2 = junior college (专科)
            mid_score = self.min_score + (self.highscore -
                                          self.min_score) * 3 / 4
            for score in range(
                    0,
                    max(self.highscore - mid_score,
                        mid_score - self.min_score)):
                up_score = mid_score + score
                down_score = mid_score - score - 1
                for kldm in self.kldms:
                    for bkcc in bkccs:
                        if up_score <= self.highscore:
                            data = {
                                'wclx': 1,
                                'score': up_score,
                                'bkcc': bkcc,
                                'kldm': kldm,
                                'years': 15
                            }
                            self.add_main_job({'data': data, 'type': 'score'})
                        if down_score >= self.min_score and down_score > 0:
                            data = {
                                'wclx': 1,
                                'score': down_score,
                                'bkcc': bkcc,
                                'kldm': kldm,
                                'years': 15
                            }
                            self.add_main_job({'data': data, 'type': 'score'})

        time.sleep(2)
        self.wait_q()
        self.add_job(None)
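
The loop above fans out from a point three quarters of the way up the score range, so the densest band of scores is queued first. A standalone illustration with a hypothetical tiny range (min_score=0, highscore=8) shows the visiting order; note that score 0 is never emitted because of the down_score > 0 guard:

min_score, highscore = 0, 8
mid_score = min_score + (highscore - min_score) * 3 / 4
order = []
for score in range(0, max(highscore - mid_score, mid_score - min_score)):
    up_score = mid_score + score
    down_score = mid_score - score - 1
    if up_score <= highscore:
        order.append(up_score)
    if down_score >= min_score and down_score > 0:
        order.append(down_score)
print order  # [6, 5, 7, 4, 8, 3, 2, 1]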
Example #17
 def __init__(self, thcnt=4, limit=5000, recover=False):
     super(ListSeedGenQueries, self).__init__(thcnt)
     self.bs2 = FileSaver("failed_urls.2.txt")
     self.limit = limit
     self.test_mode = False
     self.sf = LinkSaver('seed.2.dat', 'a')
     self.failed_jobs = LinkSaver('seed.2.failed.dat', 'w')
     self.count = 0
     self.failed = 0
     self.sleep = 0
     self.recover = recover
     self.timeout = 60
     self.today = datetime.datetime.now().strftime('%Y')
     random.seed(int(time.time()))  # call seed(); assigning to it would overwrite the function
     self.select_user_agent(ua[2])
Example #18
    def __init__(self,
                 threadcnt,
                 account,
                 prefix,
                 proxy=None,
                 sleep=0,
                 captcha_limit=50000000,
                 kldms=None,
                 seeds='spec_seeds',
                 year='15',
                 bkccs=None,
                 recover=False,
                 recover_seeds=None,
                 sleep_max=5,
                 ua='firefox'):
        super(GkChsiSpecialSpider2,
              self).__init__(threadcnt, account, prefix, proxy, sleep,
                             captcha_limit, sleep_max, ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.special_saver = GkChsiSpecialPaperStore('yggk_spec_' + prefix)
        self.prefix = prefix
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)

        self.recover = recover
        self.kldms = kldms
        self.bkccs = bkccs
        self.parser = HTMLParser.HTMLParser()
        self.year = year
        self.info_saver = LinkSaver(prefix + '_spec_data')
        self.recover_seeds = recover_seeds
Example #19
 def __init__(self, thcnt, mode='id', recover=True, seeds='seed.dat'):
     ZhuanliBaseSpider.__init__(self, thcnt, recover, timeout=90)
     Main.__init__(self)
     self.short_tag = 't:m:s:r:o:h:v:'
     self.tags = ['recover=', 'threads=', 'mode=', 'seeds=', 'output=']
     self.seeds = seeds
     self.page_size = 20  # 3, 10, or 20
     self.pagestore = PatentAbstractStore('abstract')
     self.failed_saver = FailedJobSaver('failed_job.txt')
     self.seed_saver = LinkSaver('seed.year.txt', 'a+')
     self.job_log = LinkSaver('abstract.%s.log' % mode, 'a+')
     self.mode = mode
     self.__version = '1.0.0'
     self.utils = threading.local()
     self.sp_errors = OrderedDict()
     self.pre_save_count = 0
     self.properties = PropertiesManager()
     self.can_load_seed = True
Example #20
 def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0.0, highscore=750,
              captcha_limit=50000,
              kldms=None, seeds=None,
              recover=False, sleep_max=5, ua='firefox'):
     super(GkChsiSchoolSpider, self).__init__(threadcnt, account, prefix, proxy, sleep, captcha_limit, sleep_max, ua)
     if kldms is None:
         kldms = [1, 5]
     self.pagestore = GkChsiSchoolPaperStore('yggk_sch_' + prefix)
     self.prefix = prefix
     if proxy:
         self.set_proxy(proxy)
     self.highscore = highscore
     self.minscore = {}
     self.recover = recover
     self.kldms = kldms
     self.parser = HTMLParser.HTMLParser()
     self.curl_share = None
     self.login()
     self.info_saver = LinkSaver(prefix + '_data')
     self.seeds = seeds
Example #21
    def __init__(self,
                 threadcnt,
                 last_page=None,
                 total_page=22305,
                 save_file='seeds.dat',
                 sleep=0.0,
                 proxy_life=180):
        super(BjListSpider, self).__init__(threadcnt,
                                           'BjListSpider',
                                           proxy_life=proxy_life)

        self.test_mode = False
        self.sleep = sleep
        self.zero_link_count = 0
        self.lock = threading.Lock()
        self._shutdown = False
        self.result_saver = LinkSaver(save_file, 'a')
        self.captcha = FoodMakerExtendLock(threadcnt - 1)
        self.last_page = last_page
        self.total_page = total_page
Example #22
    def __init__(self, thread_count=5, name='ShanghaiCourtListSpider', log='list.spider.log',
                 out='links',
                 recover=False):
        CourtSpider.__init__(self, thread_count, log)

        self._name = name
        self.pagestore = ShanghaiSeedStore()
        self.linkdb = ShanghaiLinkDb('sh_link')
        self.seedb = ShanghaiLinkDb('sh_seed')
        self.link_saver = LinkSaver(out)
        self.lock = threading.Lock()
        self.pager_failed_count = 0
        self.recover = recover
Example #23
 def __init__(self,
              thread_count=1,
              list_only=False,
              save_link=False,
              from_link=False,
              recover=False,
              seeds='seeds'):
     super(ShenzhenCourtSpider, self).__init__(thread_count)
     self._name = 'ShenzhenCourt'
     self.pagestore = ShenzhenCourtStore()
     self.job_spliter = ShenzhenSpliter()
     self._captcha_times = range(0, thread_count)
     self.test_mode = False
     self.pagesize = 50
     self.list_only = list_only
     self.save_link = save_link
     self.link_saver = None
     self.seeds = seeds
     if self.save_link:
         self.link_saver = LinkSaver('saved.links', 'a+b')
     self.from_link = from_link
     self.recover = recover
Example #24
class ShanghaiExtractor(CWPParser):
    """解析文书案号"""
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'court')
        self.an_saver = LinkSaver('ah.%s.txt' % self.name)

    def process_child_item(self, item):
        line = '%s|%s' % (item[0], item[1])
        print line
        self.an_saver.add(line)

    def init(self):
        print 'job start at', datetime.datetime.now()
        return CWPParser.init(self)

    def parse_item(self, page):
        m = re.search('((\d{4}).*\d+号)', page['content'][1])
        if m:
            return [[m.group(1), page['indexUrl'][17:].encode()]]
        return []

    def on_finish(self):
        self.an_saver.flush()
Example #25
 def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, seeds='detail_seeds',
              recover=False, sleep_max=5, ua='firefox', year='15', bkccs=None, kldms=None, job_tag=''):
     super(BaseChsiSpider, self).__init__(threadcnt, account, tag, proxy, sleep, captcha_limit,
                                          sleep_max, ua)
     if kldms is None:
         kldms = ['5', '1']
     if bkccs is None:
         bkccs = ['1', '2']
     self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + tag)
     self.full_tag = tag
     self.seeds = seeds
     if proxy:
         self.set_proxy(proxy)
     self.kldms = kldms
     self.bkccs = bkccs
     self.recover = recover
     self.parser = HTMLParser.HTMLParser()
     self.info_saver = LinkSaver(tag + '_detail_data')
     self.failed_saver = LinkSaver('detail.failed.seeds.' + tag + job_tag)
     self.year = year
     self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
     self.failed_list = []
     self.last_request_time = time.time()
Example #26
 def __init__(self,
              thread_count=5,
              seeds=None,
              start=1,
              name='ShanghaiCourtSpider',
              list_only=False,
              paper_seeds=None,
              recover=False):
     ProxySwapSpider.__init__(self, thread_count, proxy_life=3600)
     if seeds is None:
         seeds = []
     self._name = name
     self.seeds = seeds
     self.pagestore = ShanghaiCourtStore()
     self.page_size = 20
     self.list_only = list_only
     self.search_url_format = 'http://www.hshfy.sh.cn:8081/flws/content.jsp?wz=&pa=%s&more=1&toPage=%d&totalPage=%d&perPaperLink=%d&perPaperNum=%d'
     if self.list_only:
         self.link_saver = LinkSaver('links', 'a')
     self.paper_seeds = paper_seeds
     self.lock = threading.Lock()
     self.pager_failed_count = 0
     self.recover = recover
     self.start = start
Example #27
 def __init__(self,
              threadcnt,
              account,
              prefix,
              proxy=None,
              sleep=0,
              highscore=750,
              captcha_limit=50000,
              kldms=None,
              recover=False,
              sleep_max=5,
              min_score=0,
              ua='firefox'):
     super(GkChsiFsxSpider, self).__init__(threadcnt)
     if kldms is None:
         kldms = [1, 5]
     self.select_user_agent(ua)
     self.pagestore = GkChsiFsxPaperStore('gkchsi_' + prefix)
     self.score_saver = GkChsiFsxScoreStore('gkchsi_score_' + prefix)
     self.account = account
     self.prefix = prefix
     self.proxy = proxy
     self.sleep = sleep
     self.cur_sleep = sleep
     self.max_sleep = sleep_max
     if proxy:
         self.set_proxy(proxy)
     self.highscore = highscore
     self.min_score_arr = {}
     self.success_count = 0
     self.lock = threading.Lock()
     self.remain_time = 0
     self.login_time = -1
     self.__shutdown = False
     self.job_saver = LinkSaver(self.prefix + '_undo_jobs')
     self.__captcha_times = 0
     self.__captcha_resolved_limits = captcha_limit
     self.recover = recover
     self.success_sleep_count = 0
     self.kldms = kldms
     self.parser = HTMLParser.HTMLParser()
     self.curl_share = None
     self.login()
     self.min_score = min_score
Example #28
 def __init__(self,
              thread_count=5,
              start=2000,
              split_limit=3000,
              name='ShanghaiSeedGenerator',
              recover=False):
     super(ShanghaiListGenerator, self).__init__(thread_count)
     self.select_user_agent(
         '=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36'
     )
     self._name = name
     self.seeds = seeds  # NOTE: 'seeds' is not a parameter of this __init__; presumably defined at module scope
     self.linkdb = ShanghaiLinkDb('sh_link')
     self.page_size = 15
     self.link_saver = LinkSaver('links', 'a')
     self.lock = threading.Lock()
     self.pager_failed_count = 0
     self.recover = recover
     self.start = start
     self.split_limit = split_limit
Example #29
 def __init__(self,
              thcnt,
              name='BeijingCourtSpider',
              link_saver='links',
              saver_mode='a',
              sleep=0.0,
              proxy_life=180,
              captcha_limit=100):
     super(BJSpider, self).__init__(thcnt, proxy_life)
     self._name = name
     self.link_saver = LinkSaver(link_saver, saver_mode)
     self.total_content_failed = 0
     self.current_failed = StatisticsItem()
     self.linkstore = SpiderLinkStore('bj_court')
     # test parameters
     self.test_mode = False
     self._shutdown_in_test = False
     self.sleep = sleep
     self.captcha = FoodMakerExtendLock(thcnt - 1)
     self._shutdown = False
     self.captcha_times = 0
     self.captcha_limit = captcha_limit
     self.captcha_lock = threading.Lock()
Example #30
 def __init__(self):
     CWPParser.__init__(self, 'abs_list', 'abs_list', 'zhuanli')
     self.store = PatentStore('abstract')
     self.failed_link = LinkSaver('abstract.parser.failed.txt')
     self.url_format = 'http://epub.sipo.gov.cn/dxbdl.action?strSources=fmmost&strWhere=%s&recordCursor=0&strLicenseCode=&action=dxbdln'
     self.save_count = 0