Example #1
class FullTextSeedGen(CWPParser):
    def __init__(self,
                 channel,
                 save='fulltext.seed.txt',
                 db='zhuanli',
                 dburl='mongodb://localhost/zhuanli'):
        CWPParser.__init__(self, channel, channel, db, dburl)
        self.seed_saver = LinkSaver(save)

    def process_child_item(self, item):
        print item
        self.seed_saver.add(item)

    def parse_item(self, page):
        apc = page['indexUrl'].split('://')[1]
        m = re.search(r"d.strWhere.value = \"pnm='([\w\d]+)'\";",
                      page['content'][1])
        if m:
            pnm = m.group(1)
        else:
            print 'cannot find patent number:', page['indexUrl']
            return []
        s = re.search(r'd\.strSources\.value = "(\w+)";', page['content'][1])
        if s:
            pt = s.group(1)
        else:
            print 'cannot find patent type:', page['indexUrl']
            return []
        return ['%s-%s-%s' % (pnm, pt, apc)]

    def on_finish(self):
        print '%d links saved' % self.seed_saver.count
Example #2
 def pre_save(self, saver):
     saver.add(GkChsiParser.title)
     s2 = LinkSaver('res_score_%s' % self.name, 'w')
     s2.add('省市,科类,层次,位次,分数')
     for r in self.score_rank:
         s2.add(r)
     s2.flush()
Example #3
class ShanghaiStoreFilter(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'shanghai_court')
        self.pagestore = ShanghaiCourtStore('sh_court_2')
        self.link_saver = LinkSaver('wrong.id.txt')

    def process_child_item(self, item):
        self.pagestore.save(int(item['crawlerUpdateTime'] / 1000),
                            item['indexUrl'][17:], item['realUrl'],
                            item['content'][1])

    def parse_item(self, page):
        if page['indexUrl'][17] != '/':
            return [page]
        self.link_saver.add(page['indexUrl'][17:])
        return []

    def on_finish(self):
        self.link_saver.flush()

    def on_save(self, items):
        for item in items:
            self.pagestore.save(int(item['crawlerUpdateTime'] / 1000),
                                item['indexUrl'][17:], item['realUrl'],
                                item['content'][1])
Example #4
class ChannelParser:
    def __init__(self, name='failed.txt', mode='a'):
        self.failed_saver = LinkSaver(name, mode)
        pass

    @abc.abstractmethod
    def parse(self, jid, content):
        raise NotImplementedError('virtual function called')

    def on_failed(self, message):
        self.failed_saver.add(message)
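LinkSaver itself is not shown in any of these examples; every snippet relies only on a small surface: a constructor taking a file name plus an optional mode and buffer_size, add(), flush(), and a count attribute. A minimal sketch consistent with that usage (the real class may buffer, encode, or deduplicate differently) could look like this:

class LinkSaver(object):
    """Minimal sketch inferred from the usage above; not the original class."""

    def __init__(self, name, mode='a', buffer_size=None):
        self.count = 0
        self.buffer_size = buffer_size
        self._buffer = []
        self._file = open(name, mode)

    def add(self, link):
        # accept anything stringifiable; some examples pass dicts as well
        self._buffer.append(str(link))
        self.count += 1
        if self.buffer_size is None or len(self._buffer) >= self.buffer_size:
            self.flush()

    def flush(self):
        if self._buffer:
            self._file.write('\n'.join(self._buffer) + '\n')
            self._buffer = []
        self._file.flush()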
Example #5
class Extractor(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'fs_court', 'fs')
        self.saver = LinkSaver('seed.txt')

    def process_child_item(self, item):
        self.saver.add(item)
        print '%s saved' % item

    def parse_item(self, page):
        if '页面不存在' in page['content'][1]:
            return [page['realUrl']]
        return []
Example #6
class SeedParser(WenshuSpider):
    date_format = '%Y%m%d'

    def __init__(self, thcnt=4, page=15):
        WenshuSpider.__init__(self, thcnt)
        self.source = WenshuSeedDb('ws_seed')
        self.link_saver = LinkSaver('seeds.dat', buffer_size=400)
        self.page = page

    def dispatch(self):
        seeds = self.source.export_seeds()

        print 'load %d seeds' % len(seeds)
        for seed in seeds:
            date = seed['indexUrl'].split('://')[1]
            eval_str = seed['content'][1:-1].replace('\\"', '"')
            res = eval(eval_str)
            try:

                if (isinstance(res, tuple)
                        or isinstance(res, list)) and len(res) > 0:
                    self.add_main_job({
                        'type': 'main',
                        'date': date.encode('utf-8'),
                        'count': int(res[0]['Count'])
                    })
                else:
                    print 'invalid seed', seed
            except KeyError as e:
                Log.error('KeyError %s' % e.message)
                traceback.print_exc()
                print seed
                print eval_str
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def run_job(self, jobid):
        pagecnt = (jobid['count'] + self.page / 2) / self.page
        for index in range(1, pagecnt + 1):
            self.link_saver.add(
                str({
                    'date': jobid['date'],
                    'count': jobid['count'],
                    'index': index,
                    'page': self.page
                }))

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.link_saver.flush()
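A note on the page arithmetic in run_job above: under Python 2 integer division, (count + page / 2) / page rounds to the nearest page, so a thin final page can be dropped (for count=31 and page=15 it yields 2, not 3). If strict ceiling behaviour were wanted instead (which is not what the original does), a common idiom is:

def pages_needed(count, page_size):
    # ceiling division: smallest number of pages that covers `count` items
    return -(-count // page_size)

# pages_needed(31, 15) == 3, whereas (31 + 15 / 2) / 15 == 2 under Python 2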
Example #7
 def on_finish(self):
     FileAbstractParser.on_finish(self)
     unfetch_saver = LinkSaver('unfetched_seeds_detail_' + self.channel)
     self.unfetch_list = spider.util.unique_list(self.unfetch_list)
     self.fetched_list = spider.util.unique_list(self.fetched_list)
     unfetched = []
     for link in self.unfetch_list:
         if link not in self.fetched_list:
             unfetched.append(link)
     self.unfetch_list = unfetched
     for link in self.unfetch_list:
         unfetch_saver.add(link)
     unfetch_saver.flush()
     fetchsaver = LinkSaver('fetched_seeds_detail_' + self.channel)
     for l in self.fetched_list:
         fetchsaver.add(str(l))
     fetchsaver.flush()
     print 'fetched jobs', len(self.fetched_list)
     print 'unfetched jobs', len(self.unfetch_list)
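The fetched/unfetched reconciliation above tests membership against a list for every link, which is quadratic in the number of links. An equivalent filtering step using a set, sketched here as a standalone function (assuming the links are hashable strings, as they are in this example):

def filter_unfetched(unfetch_list, fetched_list):
    # same result as the loop above, but with O(1) membership tests
    fetched = set(fetched_list)
    return [link for link in unfetch_list if link not in fetched]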
Example #8
 def on_finish(self):
     FileAbstractParser.on_finish(self)
     self.detail_seeds = spider.util.unique_list(self.detail_seeds)
     seed_saver = LinkSaver(self.detail_seeds_file, 'w')
     for seed in self.detail_seeds:
         seed_saver.add(str(seed))
     unfetch_saver = LinkSaver('unfetched_seeds_' + self.channel)
     self.unfetch_list = spider.util.unique_list(self.unfetch_list)
     for link in self.unfetch_list:
         if link not in self.fetched_list:
             unfetch_saver.add(str(link))
     fetch_saver = LinkSaver('fetched_seeds_' + self.channel)
     self.fetched_list = spider.util.unique_list(self.fetched_list)
     for link in self.fetched_list:
         fetch_saver.add(str(link))
     print 'fetched', len(self.fetched_list)
     print 'unfetched', len(self.unfetch_list)
     if self.send_mail:
         fname = self._save_name.encode('utf-8')
         os.system("cp '%s' '/tmp/%s'" % (fname, fname))
         send_attach(['*****@*****.**'],
                     '%s专业数据' % self.name.encode('utf-8'),
                     '%s高考专业数据' % self.name.encode('utf-8'),
                     '/tmp/%s' % fname,
                     '%s.csv' % self.name.encode('utf-8'))
Example #9
class ShanghaiExtractor(CWPParser):
    """解析文书案号"""
    def __init__(self):
        CWPParser.__init__(self, 'shanghai_court', 'court')
        self.an_saver = LinkSaver('ah.%s.txt' % self.name)

    def process_child_item(self, item):
        line = '%s|%s' % (item[0], item[1])
        print line
        self.an_saver.add(line)

    def init(self):
        print 'job start at', datetime.datetime.now()
        return CWPParser.init(self)

    def parse_item(self, page):
        m = re.search(r'((\d{4}).*\d+号)', page['content'][1])
        if m:
            return [[m.group(1), page['indexUrl'][17:].encode()]]
        return []

    def on_finish(self):
        self.an_saver.flush()
Example #10
class ChsiSpider(BaseGkChsiFsxSpider):
    def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, sleep_max=5,
                 ua='firefox', seeds='detail_seeds', recover=False, year='15', bkccs=None, kldms=None,
                 job_tag='', spider_type='detail', post_kldms=True):
        super(ChsiSpider, self).__init__(threadcnt, account, '%s%s' % (tag, job_tag), proxy, sleep, captcha_limit,
                                         sleep_max,
                                         ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.pagestore = self.new_page_store(spider_type, tag)
        self.full_tag = tag
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.kldms = kldms
        self.bkccs = bkccs
        self.recover = recover
        self.info_saver = LinkSaver('info_data_%s_%s%s' % (spider_type, tag, job_tag))
        self.failed_saver = LinkSaver('%s.failed.seeds.%s%s' % (spider_type, tag, job_tag))
        self.invalid_saver = LinkSaver('%s.invalid.seeds.%s%s' % (spider_type, tag, job_tag))
        self.year = year
        self.failed_list = []
        self.invalid_list = []
        self.spider_type = spider_type
        self.post_kldms = post_kldms

    def dispatch(self):
        # read all seeds
        seeds = []
        with open(self.seeds, 'r') as f:
            for l in f:
                data = self.parse_seed(l.strip())
                if not data:
                    continue
                if self.year == str(data['years']):
                    if not self.recover or not self.pagestore.find_any(
                                            self.pagestore.channel + '://' + self.get_job_id(data)):
                        seeds.append(data)
        print 'load ', len(seeds), 'jobs'
        count = 10
        while len(seeds) > 0 and count > 0:
            count -= 1
            logging.info('remain tries %d', count)
            for kldm in self.kldms:
                for bkcc in self.bkccs:
                    seeds = self.request_list(seeds, kldm, bkcc)
                    logging.info('seeds %d,failed %d,kldm=%s,bkcc=%s,tries=%d', len(seeds), len(self.failed_list), kldm,
                                 bkcc, count)
                    time.sleep(2)
                    self.wait_q()
            seeds += self.failed_list
            self.failed_list = []
        self.wait_q()
        self.add_job(None)
        self.failed_list = seeds

    def handle_job(self, jobid):
        pass

    def re_add_failed_job(self, jobid):
        if jobid.has_key('content'):
            jobid.pop('content')
        if jobid.has_key('url'):
            jobid.pop('url')
        cnt = jobid.get('_failed_cnt_', 0) + 1
        jobid['_failed_cnt_'] = cnt
        self.failed_list.append(jobid)

    def save_invalid_job(self, jobid):
        cnt = jobid.get('_invalid_cnt_', 0) + 1
        jobid['_invalid_cnt_'] = cnt
        if cnt < 2:
            self.re_add_failed_job(jobid)
        else:
            if jobid.has_key('content'):
                jobid.pop('content')
            if jobid.has_key('url'):
                jobid.pop('url')
            self.invalid_list.append(jobid)

    def request_list(self, seeds, kldm, bkcc):
        remains = []
        if self.post_kldms:
            self.post_kldm_bkcc_for_session(kldm, bkcc)
            for seed in seeds:
                if seed['kldm'] == kldm and bkcc == seed['bkcc']:
                    self.add_main_job(seed)
                else:
                    remains.append(seed)
        else:
            for seed in seeds:
                self.add_main_job(seed)
        return remains

    def run_job(self, jobid):
        if self.pre_job(jobid):
            return
        if not jobid.has_key('content'):
            self.re_add_failed_job(jobid)
            return
        detail_content = jobid['content']
        if detail_content is None:
            self.re_add_failed_job(jobid)
            return
        try:
            if self._check_result(detail_content.text, jobid, jobid['url']):
                '''exception is found and handled'''
                return
        except InvalidQueryError as e:
            logging.info(e.message)
            self.save_invalid_job(jobid)
            return
        except Exception as e:
            logging.info(e.message)
            self.re_add_failed_job(jobid)
            return
        if not jobid.has_key('url'):
            print jobid
            self.re_add_failed_job(jobid)
            return
        jid = self.get_job_id(jobid)
        print 'saving %s==>%s' % (jid, len(detail_content.text))
        self.pagestore.save(int(time.time()), jid, jobid['url'], detail_content.text)

    def get_job_title(self, jobid):
        raise NotImplementedError('Virtual method called')

    def new_page_store(self, spider, tag):
        raise NotImplementedError('Virtual method called')

    def get_job_id(self, jobid):
        raise NotImplementedError('Virtual method called')

    def parse_page(self, jobid, content):
        raise NotImplementedError('Virtual method called')

    def get_url(self, jobid):
        raise NotImplementedError('Virtual method called')

    def report_job(self, jobid):
        raise NotImplementedError('Virtual method called')

    def add_job(self, jobid, mainjob=False):
        if jobid is None:
            super(ChsiSpider, self).add_job(jobid, mainjob)
            return
        url = self.get_url(jobid)
        count = 3
        content = None
        while count > 0 and not content:
            content = self.request_content(jobid, url)
            count -= 1
        if content is None:
            self.re_add_failed_job(jobid)
            return
        jobid['content'] = content
        jobid['url'] = url
        self.report_job(jobid)
        super(ChsiSpider, self).add_job(jobid, mainjob)
        self.parse_page(jobid, content)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += 'seeds: %s\n' % self.seeds
            msg += "saved: %d\n" % self.pagestore.saved_count
            msg += 'captcha times: %s\n' % self._captcha_times
            msg += 'remain seeds: %d\n' % len(self.failed_list)
            msg += 'invalid seeds: %d\n' % len(self.invalid_list)
            for item in self.except_state:
                msg += '%s: %d\n' % (item.name(), item.count())
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
            print 'remain seeds', len(self.failed_list)
            print 'invalid seeds', len(self.invalid_list)
            for seed in self.invalid_list:
                self.invalid_saver.add(str(seed))
            self.invalid_saver.flush()
            for seed in self.failed_list:
                self.failed_saver.add(str(seed))
            self.failed_saver.flush()
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass

    def parse_seed(self, param):
        raise NotImplementedError('Virtual method called')

    def request_content(self, jobid, url):
        raise NotImplementedError('Virtual method called')
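re_add_failed_job and save_invalid_job above share one small pattern: strip the bulky 'content'/'url' fields, bump a counter on the job dict, then either requeue the job or park it in invalid_list. A distilled version as a standalone helper (hypothetical, and it collapses the two separate counters into one for brevity):

def requeue_or_park(jobid, failed_list, invalid_list, max_attempts=2):
    # drop heavy fields so the requeued job stays small
    for key in ('content', 'url'):
        jobid.pop(key, None)
    jobid['_attempts_'] = jobid.get('_attempts_', 0) + 1
    if jobid['_attempts_'] < max_attempts:
        failed_list.append(jobid)
    else:
        invalid_list.append(jobid)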
Example #11
class HZCourtSpider(CourtSpider):
    """杭州市法院爬虫"""
    def get_page_store(self):
        return self.pagestore

    def __init__(self,
                 threadcnt=10,
                 seed_file=None,
                 mode='links',
                 list_file='links',
                 recover=False,
                 test=False):
        CourtSpider.__init__(self, threadcnt)
        self._name = 'HangzhouCourt'
        self.pagestore = HZCourtStore()
        self.job_spliter = HZSpliter()
        self._test_mode = test
        self.pagestore.testmode = test
        self.list_data = {
            'pageno': '1',
            'pagesize': '20',
            'ajlb': '',
            'cbfy': '1300',
            'ah': '',
            'jarq1': '19700101',
            'jarq2': time.strftime('%Y%m%d', time.localtime()),
            'key': ''
        }
        self.seed_file = seed_file
        self.page_size = 50
        self.mode = mode
        self.list_file = list_file
        self.recover = recover
        self.today = time.strftime('%Y%m%d', time.localtime())
        self.link_saver = LinkSaver(self.list_file)

    def run_job(self, jobid):
        jt = jobid['type']
        if 'paper' == jt:
            id = jobid['id']
            con = self.request_url(
                'http://www.zjsfgkw.cn/document/JudgmentDetail/' + id)
            if con is None or con.text is None:
                logging.error('failed to request paper %s', str(id))
                raise Exception('Failed to request paper %s' % str(id))
            else:
                context = self.extract_content(con.text)
            m = None
            if context is not None:
                m = re.search(r'src="([^"]*)"', context)
            context2 = None
            if m is not None:
                con = self.request_url("http://www.zjsfgkw.cn" +
                                       quote(m.group(1).encode('utf-8')))
                if con:
                    context2 = con.text
                else:
                    logging.error('failed to request source paper %s', str(id))
                    raise Exception('Failed to request source paper %s' %
                                    str(id))
            else:
                logging.warn('failed to find source url %s', str(id))
            if context2 is not None:
                self.pagestore.save(
                    int(time.time()), id,
                    'http://www.zjsfgkw.cn/document/JudgmentDetail/' + id,
                    context2)
                print id, '=>', len(context2)
                logging.info('%s==>%d', str(id), len(context2))
            else:
                logging.info('fail to find content for %s', str(id))
                print 'fail to find content for:', id
            return

        if 'main' == jt:
            data = copy.deepcopy(self.list_data)
            data['cbfy'] = jobid['cbfy']
            data['pageno'] = jobid['page']
            data['pagesize'] = jobid['pagesize']
            con = self.request_url(jobid['url'], data=data)
            if con is None or con.text is None:
                logging.error('fail to request %s', jobid['url'])
                raise Exception('response is None %s' % jobid['url'])
        elif 'list' == jt:
            if jobid['pageno'] == 0:
                self.handle_count_and_split(jobid)
                return
            con = self.search(pagesize=self.page_size,
                              pageno=jobid['pageno'],
                              jarq1=jobid['jarq1'],
                              jarq2=jobid['jarq2'])
            if con is None or con.text is None:
                logging.error('fail to request %s', str(jobid))
                raise Exception('response is None %s' % str(jobid))
        else:
            print 'invalid job', jobid
            return
        docs = self.extract_paper_url(con.text)
        if len(docs) == 0:
            print 'no papers found on %s' % str(jobid)
            logging.warn('no papers found on %s', str(jobid))
            return
        docs = spider.util.unique_list(docs)
        logging.info('add %d links from %s', len(docs), str(jobid))
        for doc in docs:
            self.link_saver.add(doc)
            self.add_job({'type': 'paper', 'id': doc})

    def search(self, **kwargs):
        pageno = kwargs.get('pageno', 1)
        pagesize = kwargs.get('pagesize', 10)
        ajlb = kwargs.get('ajlb', '')
        cbfy = kwargs.get('cbfy', '')
        ah = kwargs.get('ah', '')
        jarq1 = kwargs.get('jarq1', '')
        jarq2 = kwargs.get('jarq2', self.today)
        key = kwargs.get('key', '')
        # url = 'http://www.zjsfgkw.cn/document/JudgmentSearch?ajlb=%s&cbfy=%s&ah=%s&key=%s&jarq1=%s&jarq2=%s&pageno=%s&pagesize=%s' % (
        #     ajlb, cbfy, ah, key, jarq1, jarq2, pageno, pagesize)
        # return self.request_url(url)

        return self.request_url(
            'http://www.zjsfgkw.cn/document/JudgmentSearch',
            data={
                'pageno': pageno,
                'pagesize': pagesize,
                'ajlb': ajlb,
                'cbfy': cbfy,
                'ah': ah,
                'jarq1': jarq1,
                'jarq2': jarq2,
                'key': key
            })

    def dispatch(self):
        if 'links' == self.mode and self.seed_file:
            with open(self.seed_file, 'r') as f:
                for l in f:
                    j = eval(l)
                    pagecnt = int(j['count']) / self.page_size + 1
                    for page in range(1, pagecnt + 1):
                        self.add_main_job({
                            'type': 'main',
                            'url':
                            'http://www.zjsfgkw.cn/document/JudgmentSearch',
                            'page': page,
                            'pagesize': self.page_size,
                            'cbfy': j['id']
                        })
        elif 'papers' == self.mode:
            with open(self.seed_file, 'r') as f:
                ids = []
                for l in f:
                    ids.append(l.strip())
                if self.recover:
                    tmp = ids
                    ids = []
                    for i in tmp:
                        if not self.pagestore.find_any(self.pagestore.channel +
                                                       '://' + i):
                            ids.append(i)
                for i in ids:
                    self.add_main_job({'type': 'paper', 'id': i})
                logging.info('add %d paper links', len(ids))
        elif 'update' == self.mode:
            config = Properties(self.seed_file)
            config.load()
            self.add_main_job({
                'type': 'list',
                'jarq1': config.get('jarq1'),
                'jarq2': config.get('jarq2', self.today),
                'pageno': 0,
                'level': 0
            })

        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def need_split(self, context, url):
        return False

    def extract_content(self, context):
        m = re.search(r'<div class="books_detail_header">.*</IFRAME>', context,
                      re.S)
        if m:
            return m.group(0)
        return None

    def extract_paper_id(self, url):
        m = re.findall(r'http://www.zjsfgkw.cn/document/JudgmentDetail/(\d+)',
                       url)
        if m is not None:
            return m[0]
        return None

    def extract_paper_url(self, content):
        return re.findall(r'DocumentId":(\d+)', content)

    def add_list_job(self, url, con):
        pass

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "Court Spider:%s\n" % self._name
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'],
                                 '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass

    def get_court_by_id(self, id):
        data = {'courtId': id}

        con = self.request_url(
            'http://www.zjsfgkw.cn/Judges/GetCountByCountId', data=data)
        print con.text

    def get_court_paper_count(self, court_id, start_date, end_date):
        con = self.search(pageno=1,
                          pagesize=1,
                          cbfy=court_id,
                          jarq1=start_date,
                          jarq2=end_date)
        if con and con.text:
            res = re.search(r'"total":(\d+)', con.text)
            if res:
                print court_id, res.group(1)
                return int(res.group(1))
            else:
                return -1
        return -1

    def handle_count_and_split(self, jobid):
        cnt = self.get_court_paper_count(jobid.get('court', ''),
                                         jobid['jarq1'], jobid['jarq2'])
        pagecnt = (cnt + self.page_size / 2) / self.page_size
        if pagecnt > 100:
            splits = date_split(jobid['jarq1'], jobid['jarq2'], '%Y%m%d')
            if len(splits) == 1:
                print 'cannot split any more:', jobid
                return
            print '[%s,%s],[%s]->%s,%s' % (jobid['jarq1'], jobid['jarq2'],
                                           jobid['level'], str(
                                               splits[0]), str(splits[1]))
            for t in splits:
                job = copy.deepcopy(jobid)
                job['jarq1'] = t[0]
                job['jarq2'] = t[1]
                job['_failcnt_'] = 0
                job['level'] += 1
                self.add_job(job)
            return
        print '[%s,%s][%d]=>%d,%d' % (jobid['jarq1'], jobid['jarq2'],
                                      jobid['level'], cnt, pagecnt)
        for page in range(1, pagecnt + 1):
            job = copy.deepcopy(jobid)
            job['pageno'] = page
            self.add_job(job)
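date_split is used in handle_count_and_split above (and again in Example #12) but is not part of the excerpt. A plausible sketch, assuming it bisects an inclusive [start, end] date range in the given format and returns a single-element list when the range can no longer be split (which is how len(splits) == 1 is treated above); the default format is an assumption matching the '2015-01-01' style seeds in Example #12, while this example passes '%Y%m%d' explicitly:

import datetime

def date_split(start, end, fmt='%Y-%m-%d'):
    """Hypothetical helper: bisect [start, end] into two sub-ranges."""
    s = datetime.datetime.strptime(start, fmt)
    e = datetime.datetime.strptime(end, fmt)
    if s >= e:
        return [(start, end)]
    mid = s + datetime.timedelta(days=(e - s).days // 2)
    return [(start, mid.strftime(fmt)),
            ((mid + datetime.timedelta(days=1)).strftime(fmt), end)]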
Example #12
class SeedGenerator(WenshuSpider):
    """
    Wen shu seed generator
    generator seed format:
    {
     file : court.txt,
     start: 2015-01-01,
     end: 2015-06-20
     }
    因为最大页数是25,超过25的服务器不返回数据,所以需要拆分查询,每次查询文数量不能超过20*25=500
    """
    date_format = '%Y%m%d'

    def __init__(self, seed, thcnt=4, recover=False):
        WenshuSpider.__init__(self, thcnt, recover=recover)
        self.store = WenshuLinkDb('ws_seed')
        self.seed = seed
        self.failed_log = LinkSaver('failed_job.txt')

    def need_split(self, context, url):
        pass

    def get_page_store(self):
        pass

    def add_list_job(self, url, con):
        pass

    def dispatch(self):
        if self.recover:
            seeds = []
            raw_seeds = self.gen_seeds()
            fetched = self.export_fetch()
            for seed in raw_seeds:
                fetched_arr = fetched.get(seed['key'])
                if fetched_arr is None:
                    seeds.append(seed)
                    continue
                unfetched = self.check_unfetched(
                    [self.date2i(seed['start']),
                     self.date2i(seed['end'])], fetched_arr.origin)
                for u in unfetched:
                    copy_seed = copy.deepcopy(seed)
                    copy_seed['start'] = u[0]
                    copy_seed['end'] = u[1]
                    seeds.append(copy_seed)
        else:
            seeds = self.gen_seeds()
        print 'load %d seeds' % len(seeds)
        for seed in seeds:
            self.add_main_job(seed)
        time.sleep(2)
        print 'wait for queue'
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def to_list_seed_id(seed):
        return '%s/%s/%s' % (seed['key'], seed['start'], seed['end'])

    def run_job(self, jobid):
        param = self.seed2param(jobid)
        url = 'http://wenshu.court.gov.cn/list/list/?sorttype=0&conditions=searchWord+%s+SLFY++法院名称:%s&conditions=searchWord++CPRQ++裁判日期:%s TO %s' % (
            jobid['court'],
            jobid['court'],
            jobid['start'],
            jobid['end'],
        )
        con = self.request_results(param, page=1)
        if self.check_exception(con, jobid):
            return
        try:
            res = eval(eval(con.text))
        except NameError as e:
            print 'NameError', e.message
            if not self.re_add_job(jobid):
                self.failed_log.add('0,' + str(jobid))
            return
        if res and len(res) > 0:
            count = int(res[0]['Count'])
            if count > 500:
                print '[%d] %s [%s,%s]==>%d need split' % (
                    jobid['level'], jobid['court'], jobid['start'],
                    jobid['end'], count)
                res = date_split(jobid['start'], jobid['end'])
                if len(res) == 1:
                    print '[%d] %s [%s,%s]==>%d split failed' % (
                        jobid['level'], jobid['court'], jobid['start'],
                        jobid['end'], count)
                    self.failed_log.add('1,' + str(jobid))
                else:
                    self.add_job({
                        'level': jobid['level'] + 1,
                        'court': jobid['court'],
                        'start': res[0][0],
                        'end': res[0][1],
                        'key': jobid['key']
                    })
                    self.add_job({
                        'level': jobid['level'] + 1,
                        'court': jobid['court'],
                        'start': res[1][0],
                        'end': res[1][1],
                        'key': jobid['key']
                    })
            else:
                print '[%d] %s [%s,%s]==>%d ok' % (
                    jobid['level'], jobid['court'], jobid['start'],
                    jobid['end'], count)
                self.store.save(url, self.to_list_seed_id(jobid),
                                '%s,%s' % (jobid['court'], count),
                                int(time.time()))
        else:
            print 'fail to get content', jobid
            self.failed_log.add('2,' + str(jobid))

    @staticmethod
    def to_seed(start, end):
        return start.strftime(SeedGenerator.date_format) + end.strftime(
            SeedGenerator.date_format)

    def gen_seeds(self):
        seeds = []
        with open(self.seed['file'], 'r') as f:
            for l in f:
                d = eval(l.strip())
                seeds.append({
                    'court': d['court'],
                    'key': d['key'],
                    'start': self.seed['start'],
                    'end': self.seed['end'],
                    'level': 0
                })
        return seeds

    @staticmethod
    def get_date_str(year, month=None, day=None):
        if day is None:
            day = 1
        if month is None:
            month = 1
        if day < 10:
            ds = '0' + str(day)
        else:
            ds = str(day)
        if month < 10:
            ms = '0' + str(month)
        else:
            ms = str(month)
        return '%s%s%s' % (year, ms, ds)

    @staticmethod
    def get_end_day(year, month):
        if month > 12 or month < 1:
            return 0
        if 2 == month:
            if year % 4 == 0 and year % 100 != 0 or year % 400 == 0:
                return 29
            else:
                return 28
        elif month in [4, 6, 9, 11]:
            return 30
        else:
            return 31

    def export(self, mode='json'):
        seeds = self.store.export_seeds()
        sf = open('seed.dat', 'w')
        sd = {}
        for s in seeds:
            sd[s['id'][(len(self.store.channel) +
                        3):]] = s['content'].split(',')

        res = []
        for k, v in sd.items():
            ks = k.split('/')
            if mode == 'json':
                res.append(
                    str({
                        'court': v[0],
                        'count': v[1],
                        'key': ks[0],
                        'start': ks[1],
                        'end': ks[2]
                    }))
            else:
                l = '%s,%s,%s,%s,%s' % (v[0], ks[0], ks[1], ks[2], v[1])
                res.append(l)

        res = spider.util.unique_list(res)
        for r in res:
            print r
            sf.write(r + '\n')
        print '%d seeds saved' % len(res)

    @staticmethod
    def date_convert(date_str):
        return datetime.datetime.strptime(date_str, '%Y-%m-%d')

    RELATIVE_DATE = datetime.datetime.strptime('1970-01-01', '%Y-%m-%d')

    @staticmethod
    def date2i(date_str):
        return (SeedGenerator.date_convert(date_str) -
                SeedGenerator.RELATIVE_DATE).days

    def export_fetch(self):
        fetched = self.store.export_seeds(lambda item: item['indexUrl'][
            (len(self.store.channel) + 3):].split('/'))
        res = {}
        for num, start, end in fetched:
            arr = res.get(num)
            s = self.date2i(start)
            e = self.date2i(end)
            if arr:
                arr.add([s, e])
            else:
                itv = Intervals()
                itv.add([s, e])
                res[num] = itv
        for r in res.keys():
            res[r].check()
        return res

    @staticmethod
    def i2date(num):
        return (SeedGenerator.RELATIVE_DATE +
                datetime.timedelta(days=num)).strftime('%Y-%m-%d')

    @staticmethod
    def check_unfetched(main, intervals):
        interval = Intervals()
        interval.add(main)
        for itv in intervals:
            interval.remove(itv)
        itvs = []
        for itv in interval.origin:
            itvs.append(
                [SeedGenerator.i2date(itv[0]),
                 SeedGenerator.i2date(itv[1])])
        return itvs
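Intervals, used by export_fetch and check_unfetched above, is also outside the excerpt. From its call sites it needs add(), remove(), check() and an origin list of inclusive integer day ranges; a minimal sketch with that behaviour (the real class may differ) is:

class Intervals(object):
    """Hypothetical minimal version of the Intervals helper used above."""

    def __init__(self):
        self.origin = []          # list of inclusive [start_day, end_day]

    def add(self, itv):
        self.origin.append([itv[0], itv[1]])

    def check(self):
        # normalise: sort, then merge overlapping or adjacent ranges
        self.origin.sort()
        merged = []
        for s, e in self.origin:
            if merged and s <= merged[-1][1] + 1:
                merged[-1][1] = max(merged[-1][1], e)
            else:
                merged.append([s, e])
        self.origin = merged

    def remove(self, itv):
        # subtract the inclusive range itv from every stored range
        s, e = itv
        kept = []
        for a, b in self.origin:
            if e < a or s > b:              # no overlap
                kept.append([a, b])
                continue
            if a < s:                       # left remainder
                kept.append([a, s - 1])
            if b > e:                       # right remainder
                kept.append([e + 1, b])
        self.origin = kept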
Example #13
class BjListSpider(BJSpider):
    def __init__(self,
                 threadcnt,
                 last_page=None,
                 total_page=22305,
                 save_file='seeds.dat',
                 sleep=0.0,
                 proxy_life=180):
        super(BjListSpider, self).__init__(threadcnt,
                                           'BjListSpider',
                                           proxy_life=proxy_life)

        self.test_mode = False
        self.sleep = sleep
        self.zero_link_count = 0
        self.lock = threading.Lock()
        self._shutdown = False
        self.result_saver = LinkSaver(save_file, 'a')
        self.captcha = FoodMakerExtendLock(threadcnt - 1)
        self.last_page = last_page
        self.total_page = total_page

    def dispatch(self):
        if self.last_page is not None and self.last_page <= self.total_page:
            for page in range(self.last_page, self.total_page + 1):
                self.add_main_job({
                    'type':
                    'list',
                    'url':
                    'http://www.bjcourt.gov.cn/cpws/index.htm?page=%s' % page
                })
        else:
            self.add_main_job({
                'type': 'main',
                'url': 'http://www.bjcourt.gov.cn/cpws/index.htm'
            })
        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def with_sleep_request_url(self, url, **kwargs):
        time.sleep(self.sleep)
        return self.request_url(url, **kwargs)

    def _dec_worker(self):
        self.captcha.decrease()
        super(BjListSpider, self)._dec_worker()

    def run_job(self, jobid):
        if not isinstance(jobid, dict):
            return
        if self._shutdown:
            return
        jt = jobid['type']
        url = jobid['url']
        time.sleep(2)

        con = self.with_sleep_request_url(url, timeout=10)
        if self.check_exception(con, jobid):
            return
        m = re.search('yzmInput', con.text)
        if m:
            print self.get_tid(), url, ' need captcha'
            con = self.resolve_captcha(url)
            if self.check_exception(con, jobid):
                return
            if re.search(r'yzmInput', con.text):
                self._shutdown = True
                self.link_saver.add('%d,%d,%s' % (2, 0, url))
                return

        if 'main' == jt:
            m = re.search(ur'您搜到了\s*<em>([0-9]+)</em>\s*条符合条件的文书', con.text,
                          re.S)
            if not m:
                if re.search(r'yzmInput', con.text):
                    self._shutdown = True
                self.link_saver.add('%d,%d,%s' % (2, 0, url))
                return
            papercnt = int(m.group(1))
            if papercnt <= 0:
                print 'no documents found here', url
                with self.lock:
                    self.zero_link_count += 1
                return
            print 'there are %d papers on %s' % (papercnt, url)
            self.link_saver.add('%d,%d,%s' % (1, papercnt, url))
            n_url = url
            if n_url.find('?') < 0:
                n_url += '?'
            elif n_url[-1] != '&':
                n_url += '&'
            for page in range((papercnt + 10) / 20 + 1, 1, -1):
                self.add_job({'type': 'list', 'url': n_url + 'page=%s' % page})

        ids = re.findall(r'\/cpws\/paperView.htm\?id=(\d+)', con.text)
        if not ids or len(ids) == 0:
            print 'cannot find any paper on', url
            return
        print 'add %d papers from %s' % (len(ids), url)
        for id in ids:
            self.result_saver.add(id)

    def split_url(self, url):
        urls = CData.split_param(url)
        for u in urls:
            self.add_job({'type': 'main', 'url': u})

    def event_handler(self, evt, msg, **kwargs):
        super(BjListSpider, self).event_handler(evt, msg, **kwargs)
        if evt == 'DONE':
            self.result_saver.flush()
            msg += 'zero count: %d\n' % self.zero_link_count
            msg += 'captcha times: %d\n' % self.captcha_times
            spider.util.sendmail(['*****@*****.**'],
                                 '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
Example #14
class BaseChsiSpider(BaseGkChsiFsxSpider):
    def __init__(self, threadcnt, account, tag, proxy=None, sleep=0.0, captcha_limit=50000000, seeds='detail_seeds',
                 recover=False, sleep_max=5, ua='firefox', year='15', bkccs=None, kldms=None, job_tag=''):
        super(BaseChsiSpider, self).__init__(threadcnt, account, tag, proxy, sleep, captcha_limit, sleep_max,
                                             ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + tag)
        self.full_tag = tag
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.kldms = kldms
        self.bkccs = bkccs
        self.recover = recover
        self.parser = HTMLParser.HTMLParser()
        self.info_saver = LinkSaver(tag + '_detail_data')
        self.failed_saver = LinkSaver('detail.failed.seeds.' + tag + job_tag)
        self.year = year
        self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'
        self.failed_list = []
        self.last_request_time = time.time()


    def dispatch(self):
        # read all seeds
        seeds = []
        with open(self.seeds, 'r') as f:
            for l in f:
                if l[0] == '{':
                    data = eval(l.strip())
                else:
                    param = l.strip().split(',')
                    if len(param) < 9:
                        logging.warn('invalid seeds %s', l)
                        continue
                    data = {'wclx': 1, 'yxdm': param[6], 'kldm': param[2], 'bkcc': param[4], 'start': 0,
                            'years': param[5], 'zydm': param[7], 'zymc': param[8].encode('utf-8')}
                if self.year == data['years'] and not self.pagestore.find_any(
                                        self.pagestore.channel + '://' + self.get_jobid(data)):
                    seeds.append(data)
        print 'load ', len(seeds), 'jobs'
        count = 10
        while len(seeds) > 0 and count > 0:
            count -= 1
            logging.info('remain tries %d', count)
            for kldm in self.kldms:
                for bkcc in self.bkccs:
                    seeds = self.request_list(seeds, kldm, bkcc)
                    logging.info('seeds %d,failed %d,kldm=%s,bkcc=%s', len(seeds), len(self.failed_list), kldm, bkcc)
                    seeds += self.failed_list
                    self.failed_list = []
        time.sleep(2)
        self.wait_q()
        self.add_job(None)
        print 'remain seeds', len(seeds)
        for seed in seeds:
            self.failed_saver.add(seed)
        self.failed_saver.flush()
        self.failed_list = seeds

    def handle_job(self, jobid):
        pass


    def request_list(self, seeds, kldm, bkcc):
        self.post_kldm_bkcc_for_session(kldm, bkcc)
        remains = []
        for seed in seeds:
            if seed['kldm'] == kldm and bkcc == seed['bkcc']:
                self.add_main_job(seed)
            else:
                remains.append(seed)
        return remains

    def run_job(self, jobid):
        if not jobid.has_key('content'):
            if jobid not in self.failed_list:
                self.failed_list.append(jobid)
            return
        detail_content = jobid['content']
        jtitle = '%s/%s/%s/%s/%s/%s' % (
            jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'], jobid['wclx'],
            jobid['start'])
        self.pagestore.save(int(time.time()), '%s/%s/%s' % (jtitle, jobid['zydm'], int(jobid['start']) / 10),
                            jobid['url'], detail_content.text)

    def add_job(self, jobid, mainjob=False):
        if jobid is None:
            super(BaseChsiSpider, self).add_job(jobid)
            return
        logging.info('fetching special %s,%s', jobid['zymc'], jobid['zydm'])
        detail_url = self.detail_url_format % (jobid['years'], jobid['yxdm'], jobid['zydm'], jobid['start'])
        content = self.fetch_content(jobid, detail_url)
        if content is None:
            # exception is handled
            return
        jobid['content'] = content
        jobid['url'] = detail_url
        super(BaseChsiSpider, self).add_job(jobid, True)
        if 0 == jobid['start']:
            m = re.search(ur'共 (\d+) 页', content.text)
            if not m:
                logging.warn('failed to find page count %s,%s,%s', jobid['kldm'], jobid['bkcc'], detail_url)
                return
            page_cnt = int(m.group(1))
            if page_cnt <= 1:
                return
            for p in range(1, page_cnt):
                job = copy.deepcopy(jobid)
                job['start'] = p * 10
                self.add_main_job(job)

    def get_jobid(self, jobid):
        return '%s/%s/%s/%s/%s/%s/%s/%s' % (
            jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'], jobid['wclx'],
            jobid['start'], jobid['zydm'], int(jobid['start']) / 10)

    def fetch_content(self, jobid, detail_url):
        detail_content = self.request_url(detail_url, allow_redirects=20)
        if detail_content is None:
            self.failed_list.append(jobid)
            return
        try:
            if not self._check_result(detail_content.text, jobid, detail_url):
                self.failed_list.append(jobid)
            else:
                return detail_content
        except Exception as e:
            logging.info(e.message)
            self.failed_list.append(jobid)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += 'seeds: %s\n' % self.seeds
            msg += "saved: %d\n" % self.pagestore.saved_count
            msg += 'captcha times: %s\n' % self._captcha_times
            msg += 'remain seeds: %d\n' % len(self.failed_list)
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
Example #15
class PatentFullTextSpider(ZhuanliBaseSpider):
    """专利全文爬虫"""
    def __init__(self, thcnt, recover=False, seeds='seed.dat'):
        ZhuanliBaseSpider.__init__(self, thcnt, recover)
        self.seeds = seeds
        self.pagestore = PatentFullTextStore()
        self.failed_saver = LinkSaver('failed.fulltext.txt')

    def dispatch(self):
        seeds = []
        with open(self.seeds, 'r') as f:
            for s in f:
                v = s.rstrip().split('-')
                if len(v) < 3:
                    print 'invalid seed:', s
                    continue
                if not self.recover or not self.pagestore.find_any(
                        self.pagestore.channel + '://%s-%s' % (v[0], v[2])):
                    seeds.append({'type': v[1], 'pnm': v[0], 'apply': v[2]})
        # seeds = spider.util.unique_list(seeds)
        print 'load %s seeds' % len(seeds)
        for seed in seeds:
            self.add_main_job(seed)
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def extract_seed_id(pnm, apply_code):
        return '%s-%s' % (pnm, apply_code)

    def run_job(self, jobid):
        url = self.form_download_url(jobid['pnm'], jobid['type'])
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            return
        if u'<input type="text" name="vct" />' in con.text:
            # a captcha has to be entered before downloading
            m = re.search(r'\?path=([^&\s]*)', con.headers)
            if m:
                path = m.group(1)
            else:
                l_p = re.search('Location:http://egaz.sipo.gov.cn/FileWeb/.*',
                                con.headers)
                if l_p:
                    location = l_p.group()
                else:
                    l_p = re.search('Location:.*', con.headers)
                    location = 'None' if not l_p else l_p.group()
                print 'wrong redirect page:', url, 'location:', location
                if not self.re_add_job(jobid):
                    self.failed_saver.add(
                        '1,%s-%s-%s' %
                        (jobid['pnm'], jobid['type'], jobid['apply']))
                return
            img = self.request_url('http://egaz.sipo.gov.cn/FileWeb/vci.jpg')
            fn = jobid['pnm'] + '.jpg'
            save_file(img.content, fn)
            vci = Captcha.resolve(fn, jobid['pnm'])
            con = self.request_url(
                'http://egaz.sipo.gov.cn/FileWeb/pfs?path=%s&vct=%s' %
                (path, vci))
            remove_file(fn)
            if self.check_exception(con, jobid):
                return
            if u'您要下载的文件不存在' in con.text:
                self.failed_saver.add(
                    '2,%s-%s-%s' %
                    (jobid['pnm'], jobid['type'], jobid['apply']))
                return
            if u'<input type="text" name="vct" />' in con.text:
                if not self.re_add_job(jobid):
                    self.failed_saver.add(
                        '3,%s-%s-%s' %
                        (jobid['pnm'], jobid['type'], jobid['apply']))
                return
        self.pagestore.save(int(time.time()),
                            self.extract_seed_id(jobid['pnm'], jobid['apply']),
                            url, con.text)
Example #16
class ShenzhenCourtSpider(ETOSSessionCourtSpider):
    "深圳法院诉讼服务平台爬虫"

    def __init__(self,
                 thread_count=1,
                 list_only=False,
                 save_link=False,
                 from_link=False,
                 recover=False,
                 seeds='seeds'):
        super(ShenzhenCourtSpider, self).__init__(thread_count)
        self._name = 'ShenzhenCourt'
        self.pagestore = ShenzhenCourtStore()
        self.job_spliter = ShenzhenSpliter()
        self._captcha_times = range(0, thread_count)
        self.test_mode = False
        self.pagesize = 50
        self.list_only = list_only
        self.save_link = save_link
        self.link_saver = None
        self.seeds = seeds
        if self.save_link:
            self.link_saver = LinkSaver('saved.links', 'a+b')
        self.from_link = from_link
        self.recover = recover

    def dispatch(self):
        if self.from_link:
            links = []
            with open(self.seeds, 'r') as f:
                for l in f:
                    if len(l) > 0:
                        if l[:4] == 'http':
                            links.append(l.strip())
                        else:
                            links.append(l.strip().split(',')[-1])
            if self.recover:
                tmp = links
                links = []
                for l in tmp:
                    if not self.pagestore.find_any(self.pagestore.channel +
                                                   '://' +
                                                   self.extract_paper_id(l)):
                        links.append(l)
            for l in links:
                self.add_job({'type': 'paper', 'url': l})
            print 'add %d paper links' % len(links)
            logging.info('add %d paper links', len(links))
        else:
            # listing entry points for the city court and the district courts
            # (fydm 440300 through 440307)
            for fydm in range(440300, 440308):
                self.add_main_job({
                    'type': 'main',
                    'url':
                    'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=%d&page=1&pageLimit=%d&caseNo='
                    % (fydm, self.pagesize)
                })

        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def thread_init(self, tid):
        self._captcha_times[tid] = 0

    def check_captcha(self, con, url):
        m = re.search(u'.*需要验证,请输入验证码.*', con.text)
        if m:
            print m.group()
            tid = getattr(self._tls, 'tid', -1)
            if tid < 0:
                sys.stderr.write('invalid thread id in run_job')
                raise RuntimeError('Invalid tid')
            c = 0
            while c < 10:
                img = self.get_captcha(tid)
                self._captcha_times[tid] += 1
                c += 1
                if not img:
                    continue
                code = self.resolve_captcha(img)
                if not code:
                    continue
                success = self.post_captcha(code, None)
                logging.info('captcha times:%d', self._captcha_times[tid])
                if self.test_mode:
                    print "captcha times: ", self._captcha_times[tid]
                if success == 'true':
                    if re.split(r'\/anjiangongkai\/JudgeDocument', url):
                        u = url + '?code=' + code
                    else:
                        u = url + '&code=' + code
                    con = self.request_url(u)
                    return con
        else:
            if self.test_mode:
                print 'do not need resolve captcha', url
                logging.warn('do not need resolve captcha %s', url)
        return con

    def request_url(self, url, **kwargs):
        con = super(ShenzhenCourtSpider, self).request_url(url, **kwargs)
        if con and con.text:
            return self.check_captcha(con, url)
        return con

    def run_job(self, jobid):
        jt = jobid['type']
        url = jobid['url']
        if 'paper' == jt:
            if self.list_only:
                return
            con = self.request_url(url)
            '''check exception'''
            if self.check_exceptions(con, jobid):
                return
                # con = self.check_captcha(con.text, url)
            u = re.search(r'src=\'(\/temp\/ws\-[\d\w]+\-[\d\w]+.html)\'',
                          con.text)
            if u is None:
                logging.warn('cannot find source page url in %s', url)
                return
            con = self.request_url('http://ssfw.szcourt.gov.cn/' + u.group(1))
            if con is None or con.text is None:
                logging.warn('source page is None %s', u.group(1))
                return
            context = self.extract_content(con.text)
            if context is not None:
                jid = self.extract_paper_id(url)
                if jid is not None:
                    self.pagestore.save(int(time.time()), jid, url, context)
                else:
                    logging.warn('failed to find paper id, page not saved, %s',
                                 url)
                    print 'failed to find paper id, paper not saved', url
                print url, '=>', len(context)
                logging.info('%s==>%d', url, len(context))
            else:
                print 'fail to find content for:', url
                logging.info('cannot find content %s', url)
            return

        con = self.request_url(url)
        if con is None:
            logging.error('failed to fetch list page %s', url)
            return

        if 'main' == jt:
            if self.need_split(con.text, url):
                self.split_url(url)
                logging.info('job is split %s', url)
                return
            self.add_list_job(url, con.text)
        urls = self.extract_paper_url(con.text)
        urls = spider.util.unique_list(urls)
        logging.info('%s add %d papers', url, len(urls))
        print 'add ', len(urls), 'paper urls', url
        if not self.list_only:
            for u in urls:
                self.add_job({'type': 'paper', 'url': u})
        if self.save_link:
            for u in urls:
                self.link_saver.add(u)

    def need_split(self, context, url):
        return False

    def extract_content(self, context):
        return context

    def extract_paper_id(self, url):
        m = re.findall(
            r'anjiangongkai\/JudgeDocument\/(\d+)\/information\/([\d\w]+)\/([\d\w]+)\/',
            url)
        if len(m) > 0:
            return '-'.join(m[0])
        return None

    def extract_paper_url(self, content):
        m = re.findall(
            r'<a href="(\/frontend\/anjiangongkai\/JudgeDocument\/\d+\/information\/[^"]*)">',
            content)
        if m is not None:
            urls = []
            for u in m:
                urls.append('http://ssfw.szcourt.gov.cn' + u)
            return urls
        return None

    def add_list_job(self, url, con):
        divs = re.findall(ur'\(\d+条记录,每页\d+条记录,共(\d+)页\)', con)
        if divs:
            pagecnt = int(divs[0])
            print 'add ', pagecnt, 'list url,', url
            logging.info('add %d list url from %s', pagecnt, url)
            for page in range(2, pagecnt + 1):
                self.add_job({
                    'type': 'list',
                    'url': re.sub(r'page=\d+?', 'page=%d' % page, url)
                })
        else:
            print url, 'has no more page'
            logging.info('no list page for %s', url)

    def post_captcha(self, code, session):
        # url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode?code=%s' % (code)
        if session is None:
            url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode?code=%s' % code
        else:
            url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode;jsessionid=%s?code=%s' % (
                session, code)
        con = self.request_url(url, data={})
        if con:
            if self.test_mode:
                print "post captcha cookies:", con.cookies
                # print "post captcha headers:", con.headers
                print 'captcha resolve result', con.text
            res = json.loads(con.text)
            return res['success']
        else:
            print 'None response'
        return None

    def resolve_captcha(self, img):
        server = LianzhongCaptcha()
        points = server.point_check()
        if points <= 0:
            print 'there are no more points'
            return
        print 'There are %d points remaining' % points

        captcha = server.resolve(img)
        if self.test_mode:
            print 'resolved captcha', captcha
        return captcha

    def get_captcha(self, tid):
        con = self.request_url('http://ssfw.szcourt.gov.cn/yzm.jsp')
        if con is None:
            print "get none captcha response"
            return
        context = copy.deepcopy(con.content)
        print '====get_captcha===='
        # print 'headers:', con.headers
        print 'cookies:', con.cookies
        return context

    def check_exceptions(self, con, jobid):
        if con is None or con.text is None:
            logging.error('failed to fetch paper page %s', jobid['url'])
            print 'failed to fetch page %s' % jobid['url']
            self.re_add_job(jobid)
            return True
        m404 = re.search('\/temp\/judgedocument404\.jsp', con.text)
        if m404:
            logging.info('page %s is missing from the server', jobid['url'])
            print 'page %s is missing from the server' % jobid['url']
            return True
        return False

    @staticmethod
    def get_session_id(con):
        if isinstance(con, str) or isinstance(con, unicode):
            m = re.search(r'JSESSIONID=([\w\d]+)', con)
            if m is None:
                m = re.search(r'jsessionid=([\w\d]+)', con)
        else:
            m = re.search(r'jsessionid=([\w\d]+)', con.text)
        if m:
            return m.group(1)
        else:
            return None

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "Court Spider:%s\n" % self._name
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'],
                                 '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
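
# A quick, hedged illustration of the id format that extract_paper_id above produces.
# The sample URL is fabricated for demonstration, but it follows the
# /anjiangongkai/JudgeDocument/<type>/information/<x>/<y>/ pattern the regex expects.
import re

sample = ('http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/'
          '25/information/abc123/def456/')
m = re.findall(
    r'anjiangongkai\/JudgeDocument\/(\d+)\/information\/([\d\w]+)\/([\d\w]+)\/',
    sample)
print '-'.join(m[0])  # prints: 25-abc123-def456
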
Example #17
0
class FsLinkSpider(CourtSpider):
    def __init__(self, threadcnt):
        CourtSpider.__init__(self, threadcnt)
        self._name = 'FoshanLinkSpider'
        self._test_mode = True
        self.page_size = 20
        self.link_saver = LinkSaver("links")

    def run_job(self, jobid):
        if not isinstance(
                jobid,
                dict) or not jobid.has_key('type') or not jobid.has_key('url'):
            raise ValueError('invalid jobid')
        jt = jobid['type']
        url = jobid['url']

        if 'main' != jt or not jobid.has_key('page'):
            raise ValueError('Invalid main job id')

        page = jobid['page']
        urls = self.post_page(page, url)
        if not urls:
            # post_page may return None on a failed request as well as an empty list
            print 'no paper url found at page', page
            return
        elif self._test_mode:
            print 'add job', len(urls)
        urls = spider.util.unique_list(urls)
        for u in urls:
            self.link_saver.add(u)

    def dispatch(self):
        count = self.fetch_paper_count()
        # round up so the final partial page is not skipped
        for page in range(1, (count + self.page_size - 1) / self.page_size + 1):
            self.add_main_job({
                'type': 'main',
                'url':
                'http://www.fszjfy.gov.cn/CourtProject/index/index-cpws!search.action#',
                'page': page
            })
        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def fetch_paper_count(self):
        con = self.request_url(
            'http://www.fszjfy.gov.cn/CourtProject/index/index-cpws!search.action#'
        )
        count = 0
        if con:
            size = re.search(r'<input value="(\d+)" id="pageSize"', con.text)
            pages = re.search(r'<input value="(\d+)" id="pageTotal"', con.text)
            if pages:
                pages = int(pages.group(1))
            else:
                pages = 1
            if size:
                count = int(size.group(1)) * pages
        return count

    def need_split(self, context, url):
        return False

    def extract_content(self, context):
        return context

    def extract_paper_id(self, url):
        m = re.findall(r'id=(\d+)', url)
        if m:
            return m[0]
        return None

    def extract_paper_url(self, content):
        li = re.search(r'<div id="gl3_content_main">.*?<\/div>', content, re.S)
        m = []
        if li:
            rs = li.group().strip()
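            # the nested re.sub calls below first strip the <a href=" ... " target...>
            # anchor markup, then turn </dt> into ',' and </li> into '|', and finally
            # drop any remaining tags and whitespace entities, leaving a '|'-separated
            # list of paper urls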
            li_content = re.sub(
                r'<.*?>|\r|\n|\&nbsp;|\t', '',
                re.sub(
                    '</li>', '|',
                    re.sub(
                        r'</dt>', ',',
                        re.sub(r'<a href="', '',
                               re.sub(r'" target[^>]*>', '', rs)))))

            if li_content:
                if isinstance(li_content, unicode):
                    li_content = li_content.encode('utf-8')
                m = li_content.strip().split('|')
        urls = []
        for u in m:
            urls.append(u.strip())
        return urls

    def add_list_job(self, url, con):
        pass

    def post_page(self, page, url):
        data = {
            'pageNo': page,
            'pageSize': self.page_size,
            'search': '',
            'ah': '',
            'startTime': '',
            'endTime': '',
            'ajyear': '',
            'ahtxt': '',
            'ajfymc': '',
            'ajlb': '',
            'fymc': '0'
        }
        con = self.request_url(url, data=data)
        time.sleep(1)
        if con is None:
            return None
        return self.extract_paper_url(con.text)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "Court Spider:%s\n" % self._name
            spider.util.sendmail(['*****@*****.**'],
                                 '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
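
# A minimal driver sketch for FsLinkSpider. It assumes the CourtSpider base class
# exposes a run() entry point, as the other spiders in this collection appear to do;
# the thread count of 4 is arbitrary.
if __name__ == '__main__':
    link_spider = FsLinkSpider(4)
    link_spider.run()  # assumption: run() drives dispatch() and run_job()
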
Example #18
0
class PatentAbstractSpider(ZhuanliBaseSpider, Main):
    """专利摘要爬虫"""

    def __init__(self, thcnt, mode='id', recover=True, seeds='seed.dat'):
        ZhuanliBaseSpider.__init__(self, thcnt, recover, timeout=90)
        Main.__init__(self)
        self.short_tag = 't:m:s:r:o:h:v:'
        self.tags = ['recover=', 'threads=', 'mode=', 'seeds=', 'output=']
        self.seeds = seeds
        self.page_size = 20  # allowed values: 3, 10 or 20
        self.pagestore = PatentAbstractStore('abstract')
        self.failed_saver = FailedJobSaver('failed_job.txt')
        self.seed_saver = LinkSaver('seed.year.txt', 'a+')
        self.job_log = LinkSaver('abstract.%s.log' % mode, 'a+')
        self.mode = mode
        self.__version = '1.0.0'
        self.utils = threading.local()
        self.sp_errors = OrderedDict()
        self.pre_save_count = 0
        self.properties = PropertiesManager()
        self.can_load_seed = True

    def output(self, args):
        print '_patent_spider.py: %s' % args

    def version(self):
        print '_patent_spider.py %s' % self.__version

    def usage(self):
        print '_patent_spider.py usage:'
        print '-h, --help: print help message.'
        print '-v, --version: print script version'
        print '-o, --output: echo the given string and exit'
        print '-t, --threads: thread count'
        print '-m, --mode: mode; any value other than id selects abstract mode'
        print '-r, --recover: recover; 1 or True enables recover mode'
        print '-s, --seeds: seeds file'

    def _set_proxy(self, kwargs, selproxy):
        super(PatentAbstractSpider, self)._set_proxy(kwargs, selproxy)
        setattr(self.utils, 'proxy', selproxy)

    def handle(self, opts):
        for o, a in opts:
            if o in ('-h', '--help'):
                self.usage()
                sys.exit(1)
            elif o in ('-v', '--version'):
                self.version()
                sys.exit(0)
            elif o in ('-o', '--output'):
                self.output(a)
                sys.exit(0)
            elif o in ('-t', '--threads'):
                self.thread_count = int(a)
            elif o in ('-m', '--mode'):
                self.mode = a
            elif o in ('-s', '--seeds'):
                self.seeds = a
            elif o in ('-r', '--recover'):
                self.recover = True if (a == '1' or a == 'True') else False
            else:
                print 'unhandled option'
                sys.exit(3)
        if self.mode != 'id':
            self.mode = 'abs'
        if self.mode != 'id' and not os.path.exists(self.seeds):
            print 'seed file %s not exists' % self.seeds
            sys.exit(1)
        count = 3
        while count > 0:
            self.sp_proxies = OrderedDict()
            if self.mode == 'id':
                # self.set_proxy('183.111.169.203:8080', len(self.sp_proxies))
                self.set_proxy('192.168.1.39:3428:ipin:helloipin', len(self.sp_proxies))
            else:
                proxies = KuaidailiProxyManager.load_proxy(100)
                print 'load %d proxies from kuaidaili' % proxies['data']['count']
                if proxies['data']['count'] > 0:
                    self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0)
            # proxies = KuaidailiProxyManager.load_proxy(50)
            # print 'load %d proxies from kuaidaili' % proxies['data']['count']
            # if proxies['data']['count'] > 0:
            #     self.set_proxy(proxies['data']['proxy_list'], 15 if (proxies['data']['count'] > 15) else 0)
            self.run()
            count -= 1

    def load_proxy(self, fn, index=-1, auto_change=True):
        super(PatentAbstractSpider, self).load_proxy(fn, index, auto_change)
        with self.locker:
            self.sp_errors.clear()
            for proxy in self.sp_proxies.iterkeys():
                self.sp_proxies[proxy] = 0

    def set_proxy(self, prs, index=-1, auto_change=True):
        with self.locker:
            if isinstance(prs, list):
                for p in prs:
                    self.sp_errors[p] = 0
            elif isinstance(prs, str) or isinstance(prs, unicode):
                self.sp_errors[prs] = 0
        super(PatentAbstractSpider, self).set_proxy(prs, index, auto_change)

    @staticmethod
    def gen_list_seed():
        now = datetime.now()
        this_year = int(now.strftime('%Y'))
        this_month = int(now.strftime('%m'))
        types = ['fmgb', 'fmsq', 'xxsq', 'wgsq']
        seeds = []
        for year in range(1985, this_year):
            for month in range(1, 13):
                for t in types:
                    seeds.append(
                        {'type': t, 'index': 1, 'time': '%s%s' % (year, (month if month > 9 else '0%s' % month))})
        for month in range(1, this_month):
            for t in types:
                seeds.append(
                    {'type': t, 'index': 1, 'time': '%s%s' % (this_year, (month if month > 9 else '0%s' % month))})
        return seeds

    def load_abstract_seeds(self, seed_file, limit=1000000):
        seeds = []
        last_position = self.properties.get('position', 0)
        f = open(seed_file, 'r')
        count = 0
        f.seek(last_position)
        while count < limit:
            l = f.readline()
            if not l:
                # end of file; no more seeds to load
                self.can_load_seed = False
                break
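            # each accepted line is expected to look like 'id,type,code,time-type-index',
            # which matches what handle_id_job below writes via seed_saver;
            # only the first three fields are used here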
            res = l.strip().split(',')
            if len(res) < 3:
                print 'invalid seeds:', l
            else:
                seeds.append({'type': res[1], 'id': res[0], 'code': res[2]})
                count += 1
        last_position = f.tell()
        self.properties.set('position', last_position)
        self.properties.save()
        f.close()
        return seeds

    def get_id_seeds(self):
        raw_seeds = self.gen_list_seed()
        rds = self.job_log.readlines()
        # get done jobs
        done_jobs = {}
        for job in rds:
            if '[' == job[0]:
                continue
            js = job.strip().split('-')
            done_jobs['%s-%s' % (js[0], js[1])] = {}
            done_jobs['%s-%s' % (js[0], js[1])]['pages'] = int(js[2])
            done_jobs['%s-%s' % (js[0], js[1])]['current'] = 1
        # load done seeds
        dss = self.seed_saver.readlines()
        for ds in dss:
            sd = ds.strip().split(',')
            if len(sd) < 4:
                print 'invalid seed', ds
                continue
            js = sd[3].split('-')
            sid = '%s-%s' % (js[0], js[1])
            page = int(js[2])
            if done_jobs.has_key(sid) and done_jobs[sid]['current'] < page:
                done_jobs[sid]['current'] = page
        seeds = []
        for seed in raw_seeds:
            sid = seed['time'] + '-' + seed['type']
            if done_jobs.has_key(sid):
                if done_jobs[sid]['pages'] > done_jobs[sid]['current'] > 1:
                    for page in range(done_jobs[sid]['current'] + 1, done_jobs[sid]['pages'] + 1):
                        s = copy.deepcopy(seed)
                        s['index'] = page
                        seeds.append(s)
            else:
                seeds.append(seed)

        logging.info('load %s list seeds', len(seeds))
        return seeds

    def get_abstract_seeds(self, limit=100000):
        rawseeds = self.load_abstract_seeds(self.seeds, limit)
        seeds = []
        for s in rawseeds:
            if not self.recover or not self.pagestore.find_any(self.pagestore.channel + '://' + s['id']):
                seeds.append(s)
                if len(seeds) >= limit:
                    break
        logging.info('load %d abstract seeds', len(seeds))
        return seeds

    def report(self):
        super(PatentAbstractSpider, self).report()
        self.job_log.flush()
        self.seed_saver.flush()
        count = self.pagestore.saved_count - self.pre_save_count
        self.pre_save_count = self.pagestore.saved_count
        print 'save %d doc in this minute' % count

    def dispatch(self):
        self.failed_saver.tag()
        if self.mode == 'id':
            seeds = self.get_id_seeds()
            for seed in seeds:
                self.add_main_job(seed)
        else:
            count = 10
            ever_loaded = False
            while count > 0 and self.can_load_seed:
                seeds = self.get_abstract_seeds()
                if len(seeds) > 0:
                    ever_loaded = True
                    for seed in seeds:
                        self.add_main_job(seed)
                    time.sleep(2)
                    self.wait_q()
                elif ever_loaded:
                    count -= 1
                    time.sleep(100)

        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def extract_seed_id(pub, app, count):
        return '%s-%s/%s-%s/%s' % (
            pub[0], pub[1], app[0] if (app[0] != '-') else '', app[1] if (app[1] != '-') else '', count)

    @staticmethod
    def parse_seed(seed):
        v = seed.split(',')
        if len(v) != 7:
            print 'invalid seed', seed
            return []
        return [[v[1][1:], v[2][:-1]], [v[3][1:], v[4][:-1]], int(v[6])]

    @staticmethod
    def get_query_word(jobid):
        word = '公开(公告)日=%s' % jobid['time']
        return word

    def _on_shutdown(self, jobid):
        self.failed_saver.save('2,%s' % str(jobid))
        return

    def handle_id_job(self, jobid):
        strword = self.get_query_word(jobid)
        url = self.form_query_url(strword, page=jobid['index'], size=self.page_size, selected=jobid['type'], showtype=0)
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            print 'exception encountered', jobid
            return
        if re.search(u'<title>错误页面</title>', con.text):
            print 'error page returned', jobid
            if not self.re_add_job(jobid):
                self.failed_saver.save(str(jobid))
            return
        patents = re.findall(r'<a href="javascript:zl_xm\(\'([\d\w]+)\',\'(\w+)\',\'([\w\d]+)\'\);">[\d\w]+</a>',
                             con.text)
        print '[%d]%s-%s-%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'])
        if 0 == len(patents):
            self.job_log.add('[%d]%s-%s-%s,%s' % (len(patents), jobid['time'], jobid['type'], jobid['index'], con.code))
            self.re_add_job(jobid)
            return
        for p in patents:
            if len(p) != 3:
                logging.warn('invalid pattern matched:%s,%s', str(p), str(jobid))
                self.failed_saver.save('1,%s' % str(jobid))
            else:
                self.seed_saver.add(
                    '%s,%s,%s,%s-%s-%d' % (p[0], p[1], p[2], jobid['time'], jobid['type'], jobid['index']))
        if 1 == jobid['index']:
            m = re.search(r'javascript:if\(event.keyCode == 13\) zl_tz\((\d+)\)', con.text)
            if m:
                pagecnt = int(m.group(1))
                print '[%d][%d]%s-%s-%d' % (len(patents), pagecnt, jobid['time'], jobid['type'], jobid['index'])
                self.job_log.add('%s-%s-%s' % (jobid['time'], jobid['type'], pagecnt))
                for page in range(2, pagecnt + 1):
                    job = copy.deepcopy(jobid)
                    job['_failcnt_'] = 0
                    job['index'] = page
                    self.add_job(job)
            else:
                print 'failed to find count[%d]%s-%s-[%d]' % (len(patents), jobid['time'], jobid['type'], 0)
                logging.warn('failed to find page count:%s-%s-%s', jobid['time'], jobid['type'], jobid['index'])

    def handle_abstract_seed(self, jobid):
        qword = quote('申请号=\'%s\' and %s=1' % (jobid['id'], jobid['code']))
        url = 'http://epub.sipo.gov.cn/patentdetail.action?strSources=%s&strWhere=%s&strLicenseCode=&pageSize=6&pageNow=1' % (
            jobid['type'], qword)
        con = self.request_url(url, timeout=self.timeout)
        if self.check_exception(con, jobid):
            print 'exception encountered', jobid
            return
        if re.search(u'<title>错误页面</title>', con.text):
            print 'error page returned', jobid
            if not self.re_add_job(jobid):
                self.failed_saver.save(str(jobid))
            return
        print 'success:%s-%s-%s' % (jobid['id'], jobid['type'], jobid['code'])
        self.pagestore.save(int(time.time()), jobid['id'], url, con.text)

    def run_job(self, jobid):
        if self.check_shutdown(jobid):
            return
        try:
            if self.mode == 'id':
                self.handle_id_job(jobid)
            else:
                self.handle_abstract_seed(jobid)
        except RuntimeError as e:
            if 'no proxy' in e.message:
                self.re_add_job(jobid)
                self.reload_proxy()
                return
            else:
                raise

    def reload_proxy(self):
        prs = {}
        count = 3
        while count > 0:
            if 'id' == self.mode:
                prs = KuaidailiProxyManager.load_proxy(20)
            else:
                prs = KuaidailiProxyManager.load_proxy(100)
            if prs['data']['count'] > 0:
                break
            count -= 1
        if count <= 0 or not prs.has_key('data') or \
                not prs['data'].has_key('count') or prs['data']['count'] <= 0:
            self._shutdown()
            logging.error('cannot load any proxy')
            spider.util.sendmail(['*****@*****.**'], 'Proxy Error',
                                 'Cannot load any proxy:%s,%s' % (self._name, self.mode))
            return
        print 'load %d proxies from kuaidaili' % prs['data']['count']
        self.set_proxy(prs['data']['proxy_list'], 15 if (prs['data']['count'] > 15) else 0)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.job_log.flush()
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)

    def proxy_error(self):
        proxy = getattr(self.utils, 'proxy')
        if proxy is not None:
            with self.locker:
                try:
                    if self.sp_errors[proxy] < 5:
                        self.sp_errors[proxy] += 1
                    else:
                        self.sp_proxies.pop(proxy)
                        if len(self.sp_proxies) == 0:
                            self.reload_proxy()
                except KeyError:
                    pass

    def on_proxy_error(self, con, jobid):
        self.proxy_error()
        self.re_add_job(jobid)
        return True

    def on_other_400_exception(self, con, jobid):
        if con.code == 403:
            self.proxy_error()
        self.re_add_job(jobid)
        return True

    def on_other_500_exception(self, con, jobid):
        if 504 == con.code and re.search('proxy', con.text, re.I):
            self.proxy_error()
            self.re_add_job(jobid)
            return True
        else:
            return super(PatentAbstractSpider, self).on_other_500_exception(con, jobid)
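
# A hedged driver sketch for PatentAbstractSpider. It assumes the Main mixin leaves
# getopt parsing to the caller and mirrors the flags documented in usage() above;
# the command-line values shown are examples only.
import getopt
import sys

if __name__ == '__main__':
    job = PatentAbstractSpider(thcnt=8, mode='id')
    # e.g. python _patent_spider.py -t 8 -m id -s seed.dat
    opts, _ = getopt.getopt(sys.argv[1:], job.short_tag, job.tags)
    job.handle(opts)  # handle() applies the options and then calls run()
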
Example #19
0
class ShenzhenCourtListSpider(ETOSSessionCourtSpider):
    "深圳法院诉讼服务平台爬虫"

    def __init__(self, thread_count=1, full_mode=False, seeds='seeds'):
        super(ShenzhenCourtListSpider, self).__init__(thread_count, 'list.spider.log')
        self._name = 'ShenzhenListSpider'
        self.job_spliter = ShenzhenSpliter()
        self._captcha_times = [0] * thread_count  # per-thread captcha counters
        self.test_mode = False
        self.pagesize = 50
        self.full_mode = full_mode
        self.link_saver = LinkSaver(seeds, 'a')

    def dispatch(self):

        self.add_main_job({'type': 'main',
                           'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440300&page=1&pageLimit=%d&caseNo=' % self.pagesize})
        self.add_main_job({'type': 'main',
                           'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440301&page=1&pageLimit=%d&caseNo=' % self.pagesize})
        self.add_main_job({'type': 'main',
                           'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440302&page=1&pageLimit=%d&caseNo=' % self.pagesize})
        self.add_main_job({'type': 'main',
                           'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440303&page=1&pageLimit=%d&caseNo=' % self.pagesize})
        self.add_main_job({'type': 'main',
                           'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440304&page=1&pageLimit=%d&caseNo=' % self.pagesize})
        self.add_main_job({'type': 'main',
                           'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440305&page=1&pageLimit=%d&caseNo=' % self.pagesize})
        self.add_main_job({'type': 'main',
                           'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440306&page=1&pageLimit=%d&caseNo=' % self.pagesize})
        self.add_main_job({'type': 'main',
                           'url': 'http://ssfw.szcourt.gov.cn/frontend/anjiangongkai/JudgeDocument/25?ajlb=2&fydm=440307&page=1&pageLimit=%d&caseNo=' % self.pagesize})

        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def thread_init(self, tid):
        self._captcha_times[tid] = 0

    def check_captcha(self, con, url):
        m = re.search(u'.*需要验证,请输入验证码.*', con.text)
        if m:
            print m.group()
            tid = getattr(self._tls, 'tid', -1)
            if tid < 0:
                sys.stderr.write('invalid thread id in run_job')
                raise RuntimeError('Invalid tid')
            c = 0
            while c < 10:
                img = self.get_captcha(tid)
                self._captcha_times[tid] += 1
                c += 1
                if not img:
                    continue
                code = self.resolve_captcha(img)
                if not code:
                    continue
                success = self.post_captcha(code, None)
                logging.info('captcha times:%d', self._captcha_times[tid])
                if self.test_mode:
                    print "captcha times: ", self._captcha_times[tid]
                if success == 'true':
                    # append the code as a new query string, or extend an existing one
                    if '?' not in url:
                        u = url + '?code=' + code
                    else:
                        u = url + '&code=' + code
                    con = self.request_url(u)
                    return con
        else:
            if self.test_mode:
                print 'no captcha required for', url
                logging.warn('no captcha required for %s', url)
        return con

    def request_url(self, url, **kwargs):
        con = super(ShenzhenCourtListSpider, self).request_url(url, **kwargs)
        if con and con.text:
            return self.check_captcha(con, url)
        return con

    def check_exception(self, con, jobid):
        if con is None:
            print 'response is None, re-adding job', jobid['url']
            self.re_add_job(jobid)
            return True
        if con.text is None:
            print 'response text is None', jobid['url']
            self.re_add_job(jobid)
            return True
        m = re.search(
            r'<!DOCTYPE html><html><head><meta charset=utf-8><\/head><\/head><body><script>window.location=\'([^\']*)\'<\/script><\/body><\/html>',
            con.text)
        if m:
            url = 'http://ssfw.szcourt.gov.cn' + m.group(1)
            self.add_job({'type': jobid['type'], 'url': url})
            print 'js page redirect, destination:', url
            return True

    def run_job(self, jobid):
        jt = jobid['type']
        url = jobid['url']

        con = self.request_url(url)
        if self.check_exception(con, jobid):
            return

        if self.need_split(con.text, url):
            self.split_url(url)
            logging.info('job is split %s', url)
            return
        if jt == 'main':
            self.add_list_job(url, con.text)
        urls = self.extract_paper_url(con.text)
        urls = spider.util.unique_list(urls)
        logging.info('%s add %d papers', url, len(urls))
        print 'add', len(urls), 'paper urls', url
        if self.full_mode:
            m = re.search(
                r'http:\/\/ssfw.szcourt.gov.cn\/frontend\/anjiangongkai\/JudgeDocument\/(\d+)\?ajlb=(\d+)&fydm=(\d+)&page=(\d+)&pageLimit=(\d+)&caseNo='
                , url)
            if m:
                tp = m.group(1)
                ajlb = m.group(2)
                fydm = m.group(3)
                page = m.group(4)
                size = m.group(5)
                for u in urls:
                    self.link_saver.add('%s,%s,%s,%s,%s,%s' % (tp, ajlb, fydm, page, size, u))
        else:
            for u in urls:
                self.link_saver.add(u)

    def need_split(self, context, url):
        return False

    def extract_content(self, context):
        return context

    def extract_paper_id(self, url):
        m = re.findall(r'anjiangongkai\/JudgeDocument\/(\d+)\/information\/([\d\w]+)\/([\d\w]+)\/', url)
        if len(m) > 0:
            return '-'.join(m[0])
        return None

    def extract_paper_url(self, content):
        m = re.findall(r'<a href="(\/frontend\/anjiangongkai\/JudgeDocument\/\d+\/information\/[^"]*)">',
                       content)
        # re.findall always returns a list, so just build the absolute urls
        urls = []
        for u in m:
            urls.append('http://ssfw.szcourt.gov.cn' + u)
        return urls

    def add_list_job(self, url, con):
        divs = re.findall(ur'\(\d+条记录,每页\d+条记录,共(\d+)页\)', con)
        if divs:
            pagecnt = int(divs[0])
            print 'add', pagecnt, 'list urls,', url
            logging.info('add %d list urls from %s', pagecnt, url)
            for page in range(2, pagecnt + 1):
                self.add_job({'type': 'list', 'url': re.sub(r'page=\d+?', 'page=%d' % page, url)})
        else:
            print url, 'has no more page'
            logging.info('no list page for %s', url)

    def post_captcha(self, code, session):
        # url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode?code=%s' % (code)
        if session is None:
            url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode?code=%s' % code
        else:
            url = 'http://ssfw.szcourt.gov.cn/frontend/validateRandCode;jsessionid=%s?code=%s' % (session, code)
        con = self.request_url(url, data={})
        if con:
            if self.test_mode:
                print "post captcha cookies:", con.cookies
                # print "post captcha headers:", con.headers
                print 'captcha resolve result', con.text
            res = json.loads(con.text)
            return res['success']
        else:
            print 'None response'
        return None

    def resolve_captcha(self, img):
        server = LianzhongCaptcha()
        points = server.point_check()
        if points <= 0:
            print 'there are no more points'
            return
        print 'There are %d points remaining' % points

        captcha = server.resolve(img)
        if self.test_mode:
            print 'resolved captcha', captcha
        return captcha

    def get_captcha(self, tid):
        con = self.request_url('http://ssfw.szcourt.gov.cn/yzm.jsp')
        if con is None:
            print "get none captcha response"
            return
        context = copy.deepcopy(con.content)
        print '====get_captcha===='
        # print 'headers:', con.headers
        print 'cookies:', con.cookies
        return context

    @staticmethod
    def get_session_id(con):
        if isinstance(con, str) or isinstance(con, unicode):
            m = re.search(r'JSESSIONID=([\w\d]+)', con)
            if m is None:
                m = re.search(r'jsessionid=([\w\d]+)', con)
        else:
            m = re.search(r'jsessionid=([\w\d]+)', con.text)
        if m:
            return m.group(1)
        else:
            return None

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "Court Spider:%s\n" % self._name
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
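
# A minimal driver sketch for ShenzhenCourtListSpider. The ETOSSessionCourtSpider base
# class is assumed to provide run(), as the other spiders in this collection appear to do;
# the seeds filename 'sz.seeds' is an example.
if __name__ == '__main__':
    sz_spider = ShenzhenCourtListSpider(thread_count=2, full_mode=True, seeds='sz.seeds')
    sz_spider.run()  # assumption: run() drives dispatch() and run_job()
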
Example #20
0
class ShanghaiCourtSpider(ProxySwapSpider):
    "上海高级人民法院文书检索系统爬虫"

    def __init__(self,
                 thread_count=5,
                 seeds=None,
                 start=1,
                 name='ShanghaiCourtSpider',
                 list_only=False,
                 paper_seeds=None,
                 recover=False):
        ProxySwapSpider.__init__(self, thread_count, proxy_life=3600)
        if seeds is None:
            seeds = []
        self._name = name
        self.seeds = seeds
        self.pagestore = ShanghaiCourtStore()
        self.page_size = 20
        self.list_only = list_only
        self.search_url_format = 'http://www.hshfy.sh.cn:8081/flws/content.jsp?wz=&pa=%s&more=1&toPage=%d&totalPage=%d&perPaperLink=%d&perPaperNum=%d'
        if self.list_only:
            self.link_saver = LinkSaver('links', 'a')
        self.paper_seeds = paper_seeds
        self.lock = threading.Lock()
        self.pager_failed_count = 0
        self.recover = recover
        self.start = start

    def dispatch(self):
        # for seed in self.seeds:
        #     self.add_main_job(seed)
        # logging.info('add %d list links' % len(self.seeds))
        # if self.paper_seeds:
        #     links = []
        #     with open(self.paper_seeds, 'r') as f:
        #         for l in f:
        #             links.append(l.strip())
        #     if self.recover:
        #         tmp = links
        #         links = []
        #         for l in tmp:
        #             if not self.pagestore.find_any(self.pagestore.channel + '://' + self.extract_paper_id(l)):
        #                 links.append(l)
        #     logging.info('add %d paper links' % len(links))
        #     for l in links:
        #         self.add_main_job({'type': 'paper', 'url': l})
        seed_id = 'adHlwZT1BbGwmd3o9z'
        total = 1060385
        pagecnt = (total + self.page_size / 2) / self.page_size + 1
        for page in range(self.start, pagecnt):
            self.add_main_job({
                'type':
                'list',
                'url':
                self.search_url_format %
                (seed_id, page, total, self.page_size, self.page_size)
            })

        time.sleep(3)
        self.wait_q()
        self.add_job(None, True)

    def check_exception(self, con, jobid):
        '''Check the response for exceptions. Return True if an exception was found and the job
        cannot continue; False if no exception was found or it was handled and it is safe to continue.'''
        if con is None:
            print 'null response'
            self.re_add_job(jobid)
            return True
        if con.text is None:
            print 'response text is None, headers:'
            print con.headers
            self.re_add_job(jobid)
            return True
        if con.code >= 400:
            print con.headers
            if 502 == con.code:
                print 'Proxy Error 502', jobid['url']
                logging.error('proxy error 502 %s', jobid['url'])
                self.change_proxy()
                self.re_add_job(jobid)
                return True
            if 404 == con.code:
                print '404, page not found on the server', jobid['url']
                logging.info('page not found on the server %s', jobid['url'])
                return True
            if 410 == con.code:
                print 'resource gone', jobid['url']
                return True
            if 500 > con.code >= 400:
                print 'request error', jobid['url']
                self.re_add_job(jobid)
                return True
            if 600 > con.code >= 500:
                print 'server error', con.code, jobid['url']
                cnt = jobid.get('_failcnt_', 0)
                if cnt < 47:
                    jobid['_failcnt_'] = 47
                self.re_add_job(jobid)
                return True
            print 'response code above 600, that is a new one', jobid['url']
            logging.info('failed with response code %d,%s', con.code,
                         jobid['url'])
            self.re_add_job(jobid)
            return True
        if re.search(u'出错了', con.text):
            print 'server error page returned; their server is weak, crawl more slowly'
            logging.error('server error,%s', jobid['url'])
            self.re_add_job(jobid)
            return True
        if re.search(u'访问本页面,您的浏览器需要支持JavaScript', con.text):
            m = re.search(r"<script>(.*?)</script>", con.text)
            sc = "document = {set cookie(a){console.log(a);}}, window = {innerWidth: 1366, innerHeight: 768, screenX: 200, screenY: 100, screen: {width: 1366, height: 768}}\n"
            sc += m.group(1)
            rv = spider.util.runjs(sc)
            logging.info('nodejs result:%s', rv)
            print rv
        return False

    def run_job(self, jobid):
        jt = jobid['type']
        url = jobid['url']

        if 'main' == jt:
            res = self.post_for_count(url)
            if self.check_exception(res[1], jobid):
                return
            if res[0] <= 0:
                print 'got 0 results from', url
                logging.info('get no paper from %s' % url)
                return
            seed_id = re.search(r'pa=([\w\d\+]+)', url)
            if seed_id:
                seed_id = seed_id.group(1)
                count = int(res[0])
                logging.info('there are %d paper in %s' % (count, seed_id))
                page_count = int((count + self.page_size / 2) / self.page_size)
                for page in range(1, page_count + 1):
                    self.add_job({
                        'type':
                        'list',
                        'url':
                        self.search_url_format %
                        (seed_id, page, count, self.page_size, self.page_size)
                    })
            else:
                logging.warn('failed to parse seed id from %s', url)
        elif 'list' == jt:
            con = self.post_for_data(jobid['url'], {})
            if self.check_exception(con, jobid):
                return
            urls = self.extract_paper_url(con.text)
            if self.list_only:
                for u in urls:
                    self.link_saver.add(u)
            else:
                for u in urls:
                    self.add_job({'type': 'paper', 'url': u})
            logging.info('add %d from list job %s' % (len(urls), url))
            print('add %d from list job %s' % (len(urls), url))

        else:
            con = self.request_url(url, timeout=45)
            if self.check_exception(con, jobid):
                return
            content = self.extract_content(con.text)
            jid = self.extract_paper_id(url)
            logging.info('saving %s,%s', jid, url)
            if content and jid:
                self.pagestore.save(int(time.time()), jid, url, content)
            else:
                with self.lock:
                    self.pager_failed_count += 1
                logging.info('failed count %d,None content or jid for %s,%s' %
                             (self.pager_failed_count, jid, url))

    def extract_paper_id(self, url):
        m = re.search(r'pa=([\w\d\/]+)', url)
        if m:
            return m.group(1)
        return None

    def post_for_data(self, url, data=None):
        if data is None:
            data = {}
        con = self.request_url(url, data=data, timeout=60)
        if con:
            con.text = con.content.decode('gbk')
            con.encoding = 'gbk'
        return con

    def post_for_count(self, url):
        con = self.post_for_data(url)
        if con is None:
            return [0, None]
        count = re.search(r'var totalPage = "(\d+)";', con.text)
        if count:
            # return the response object (not its text) so check_exception can inspect it
            return [count.group(1), con]
        count = re.search(u'共([\d\s]+)条', con.text)
        if count:
            return [count.group(1), con]
        return [0, con]

    def extract_paper_url(self, content):
        m = re.findall(r'onclick="showone\(\'([^\']+)\'', content)
        urls = []
        for u in m:
            urls.append('http://www.hshfy.sh.cn:8081/flws/text.jsp?pa=' + u)
        return urls

    def extract_content(self, content):
        return content

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "Court Spider:%s\n" % self._name
            msg += "Mode-list_only:%s\n" % self.list_only
            msg += "paper id failed: %d\n" % self.pager_failed_count
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'],
                                 '%s DONE' % self._name, msg)
            logging.info('Job done,failed count %d,saved %d' %
                         (self.pager_failed_count, self.pagestore.saved_count))
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
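
# A minimal driver sketch for ShanghaiCourtSpider in list-only mode. ProxySwapSpider is
# assumed to expose run() like the other spider bases here; start=1 is the default start page.
if __name__ == '__main__':
    sh_spider = ShanghaiCourtSpider(thread_count=5, start=1, list_only=True)
    sh_spider.run()  # assumption: run() drives dispatch() and run_job()
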
Example #21
0
class PatentAbstractExtractor(CWPParser):
    def __init__(self):
        CWPParser.__init__(self, 'abs_list', 'abs_list', 'zhuanli')
        self.store = PatentStore('abstract')
        self.failed_link = LinkSaver('abstract.parser.failed.txt')
        self.url_format = 'http://epub.sipo.gov.cn/dxbdl.action?strSources=fmmost&strWhere=%s&recordCursor=0&strLicenseCode=&action=dxbdln'
        self.save_count = 0

    def init(self):
        print 'job start at', datetime.now()

    def on_finish(self):
        print '%d patents saved' % self.save_count

    def process_child_item(self, item):
        self.save_count += 1
        if self.test_mode:
            print item['apply_code']
            print item['pub_code']
            print item['type']
            print item['code']
            # print item['content']
        else:
            jid = item['apply_code'] + '/' + item['pub_code']
            if not self.store.find_any(self.store.channel + '://' + jid):
                self.store.save(
                    int(time.time()), jid,
                    Patent.form_download_url(item['pub_code'], item['type'],
                                             item['code']), item['content'])

    def parse_item(self, page):
        patent_contents = re.findall(
            r'<div class="cp_box">.*?<img src="qrcode/\w{2}\d+\w?.png" width="74" height="74" /></a>',
            page['content'][1], re.S)
        patents = []
        for pc in patent_contents:
            m = re.search(r'申请号:(\d+\w?)</li>', pc)
            if m:
                apply_code = m.group(1)
            else:
                self.failed_link.add('1,%s' % page['indexUrl'])
                continue
            u = re.search(r"javascript:dxb3\('(\w+)','([\w\d]+)','(\d)'\);",
                          pc)
            if not u or len(u.groups()) < 3:
                self.failed_link.add('2,%s' % page['indexUrl'])
                continue
            patents.append({
                'apply_code': apply_code,
                'pub_code': u.group(2),
                'content': pc,
                'type': u.group(1),
                'code': u.group(3)
            })
        return patents

    @staticmethod
    def parse_content(pc):
        con = re.sub(
            '&nbsp;|&ensp;', '',
            re.sub('<[^>]*>', '',
                   pc.replace('</li>', '\n').replace('<ul>', '\n')))
        con = re.sub(r'\n+', '\n', con.replace(' ', '').replace('\t', ''))
        res = []
        for c in con.split('\n'):
            cr = c.strip()
            if cr != '':
                res.append(cr)
        c9 = res[9]
        res[8] = res[8].replace('全部', '') + c9
        res.remove(c9)
        return '\n'.join(res).lstrip()

    def save(self, saver, page):
        pass
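
# A hedged driver sketch for PatentAbstractExtractor. The CWPParser base class's entry
# point is not shown anywhere in these examples, so the run() call below is an assumption
# rather than a documented API.
if __name__ == '__main__':
    extractor = PatentAbstractExtractor()
    extractor.run()  # assumption: CWPParser exposes a run()-style entry point that feeds parse_item()
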
Example #22
0
class ListSeedGenQueries(ZhuanliBaseSpider):
    def __init__(self, thcnt=4, limit=5000, recover=False):
        super(ListSeedGenQueries, self).__init__(thcnt)
        self.bs2 = FileSaver("failed_urls.2.txt")
        self.limit = limit
        self.test_mode = False
        self.sf = LinkSaver('seed.2.dat', 'a')
        self.failed_jobs = LinkSaver('seed.2.failed.dat', 'w')
        self.count = 0
        self.failed = 0
        self.sleep = 0
        self.recover = recover
        self.timeout = 60
        self.today = datetime.datetime.now().strftime('%Y')
        random.seed(int(time.time()))
        self.select_user_agent(ua[2])

    def dispatch(self):
        if self.recover:
            with open('old.2.failed.dat', 'r') as f:
                for l in f:
                    d = l.strip().split(',', 1)
                    data = eval(d[1])
                    data['_failcnt_'] = 0
                    self.add_main_job(data)
        else:
            # get_query_word and get_date_str read jobid['pub'], and run_job splits on
            # jobid[jobid['type']], so the initial job carries the publication range under 'pub'
            self.add_main_job({
                'type': 'pub',
                'pub': ['1985', self.today],
                'app': ['-', '-'],
                'level': -1
            })
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    @staticmethod
    def get_query_word(jobid):
        word = '公开(公告)日=BETWEEN[\'' + jobid['pub'][0] + '\',\'' + jobid['pub'][
            1] + '\']'
        if 'pub' != jobid['type']:
            word += ' AND 申请日=BETWEEN[\'' + jobid['app'][0] + '\',\'' + jobid[
                'app'][1] + '\']'
        return word

    def run_job(self, jobid):
        url = self.form_query_url(self.get_query_word(jobid), size=1)
        datestr = self.get_date_str(jobid)
        try:
            res = self.need_split(datestr, jobid['level'], url)
        except RuntimeError as e:
            if 'no proxy' in e.message:
                count = 3
                self.re_add_job(jobid)
                proxies = {}
                while count > 0:
                    proxies = KuaidailiProxyManager.load_proxy(30)
                    if proxies['data']['count'] > 0:
                        break
                    count -= 1
                if count <= 0 or not proxies.has_key('data') or \
                        not proxies['data'].has_key('count') or \
                        proxies['data']['count'] <= 0:
                    self._shutdown()
                    return
                print 'load %d proxies from kuaidaili' % proxies['data']['count']
                self.set_proxy(proxies['data']['proxy_list'],
                               15 if proxies['data']['count'] > 15 else 0)
                return
            else:
                raise
        if res[0] == 0:
            self.re_add_job(jobid)
            return
        elif res[0] == 1:
            with self.locker:
                self.failed += 1
                self.failed_jobs.add('1,' + str(jobid))
            return
        elif res[0] == 3:
            with self.locker:
                self.count += 1
                self.sf.add('1,%s,%d,%d' % (datestr, jobid['level'], res[1]))
            return
        dates = date_split(jobid[jobid['type']][0], jobid[jobid['type']][1])
        if len(dates) <= 0:
            with self.locker:
                self.failed += 1
                self.failed_jobs.add('0,' + str(jobid))
            return
        if len(dates) == 1:
            if 'pub' == jobid['type']:
                self.add_job({
                    'type': 'app',
                    'pub': jobid['pub'],
                    'level': jobid['level'] + 1,
                    'app': ['1985.01.01', '2009.12.31']
                })
                self.add_job({
                    'type': 'app',
                    'pub': jobid['pub'],
                    'level': jobid['level'] + 1,
                    'app': ['2010.01.01', self.today]
                })
            else:
                with self.locker:
                    self.count += 1
                    self.sf.add('2,%s,%d,%d' %
                                (datestr, jobid['level'], res[1]))
                    print '(%d)%s ==> %s cannot split any more' % (
                        jobid['level'], datestr, res[1])
            return
        level = jobid['level'] + 1
        for d in dates:
            job = copy.deepcopy(jobid)
            job['_failcnt_'] = 0
            job['level'] = level
            job[job['type']] = d
            self.add_job(job)

    @staticmethod
    def get_date_str(jobid):
        return '[%s,%s],[%s,%s]' % (jobid['pub'][0], jobid['pub'][1],
                                    jobid['app'][0], jobid['app'][1])

    def need_split(self, datestr, level, url):
        # self.select_user_agent(ua[random.randint(0, len(ua) - 1)])
        con = self.request_url(url)
        time.sleep(self.sleep)
        if con is None:
            print 'none response %s' % datestr
            return [0, 0]
        if re.search(u'<title>错误页面</title>', con.text):
            print 'no results %s' % datestr
            return [1, 0]
        counts = re.findall(r'num\w{4}\.value = "(\d+)";', con.text)
        if len(counts) <= 0:
            print 'invalid pages', datestr
            return [1, 0]
        if self.test_mode:
            print 'counts:', counts
        self.check_state()
        paper_count = 0
        for c in counts:
            paper_count += int(c)
        with self.locker:
            print "[%d][%d]-%s ==> %s %s" % (
                level, paper_count, datestr, len(counts), 'failed' if
                (paper_count > self.limit) else 'ok')
        if paper_count > self.limit:
            return [2, paper_count]
        return [3, paper_count]

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += 'saved: %d\n' % self.count
            msg += 'failed: %d\n' % self.failed
            spider.util.sendmail(['*****@*****.**'],
                                 '%s finished' % self._name, msg)
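
# A minimal driver sketch for ListSeedGenQueries. It assumes the ZhuanliBaseSpider base
# exposes run() (PatentAbstractSpider above invokes self.run() from handle()); the limit
# of 5000 matches the constructor default.
if __name__ == '__main__':
    gen = ListSeedGenQueries(thcnt=4, limit=5000)
    gen.run()  # accepted query ranges end up in seed.2.dat via self.sf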