def resolve_captcha(self, url):
    """Solve the gk.chsi.com.cn access captcha and resubmit ``url``.

    Up to 100 attempts are made. Each attempt downloads a captcha image,
    runs it through ``Captcha.resolve`` and POSTs the recognised text
    together with ``url`` to ``/checkcode/CheckAccess.do``.

    Args:
        url: value sent as the ``url`` form field of the CheckAccess POST.

    Returns:
        The CheckAccess response as soon as its text no longer contains the
        ``CheckAccessForm`` form tag; ``False`` if the captcha image could
        not be downloaded; ``None`` if ``url`` is ``None`` or every attempt
        failed.
    """
    if url is None:
        # Checked up front: previously this was only detected after a captcha
        # had already been downloaded and solved for nothing.
        logging.error('Invalid host found %s', url)
        return None

    # NOTE(review): the parent implementation is called explicitly, presumably
    # to bypass this class's own request_url override -- confirm.
    request = super(GkChsiFsxSpider, self).request_url

    for _ in range(100):
        time.sleep(1)
        rd = random.random() * 10000
        con = request('http://gk.chsi.com.cn/ValidatorIMG.JPG?ID=%s' % str(rd))
        if not con or not con.content:
            logging.info('failed to fetch captcha')
            return False

        fname = '/tmp/' + str(uuid.uuid4()) + '.jpg'
        save_file(con.content, fname)
        try:
            res = Captcha.resolve(fname, self.prefix)
        finally:
            # Always drop the temp image, even when the resolver raises.
            remove_file(fname)
        if not res:
            logging.error('fail to resolve captcha')
            continue

        time.sleep(1)
        con = request('http://gk.chsi.com.cn/checkcode/CheckAccess.do',
                      data={'CHKNUM': res, 'url': url})
        if con and con.text:
            m = re.search(
                '<form name="CheckAccessForm" method="post" action="/checkcode/CheckAccess.do">',
                con.text)
            if not m:
                # The captcha form is gone: the answer was accepted.
                return con

    # Single-argument parenthesised form prints the same text on Python 2
    # and is also valid on Python 3.
    print('try captcha times 100 %s' % url)
    return None
def resolve_captcha(self):
    """Download one gk.chsi.com.cn captcha image and return its resolved text.

    The image is written to a uniquely named file under ``/tmp`` and handed
    to ``Captcha.resolve`` with the same unique id as second argument.

    Returns:
        ``''`` if the captcha image could not be downloaded, otherwise
        whatever ``Captcha.resolve`` returned.
    """
    us = str(uuid.uuid4())
    rd = random.random() * 10000
    con = self.request_url('http://gk.chsi.com.cn/ValidatorIMG.JPG?ID=%s' % str(rd))
    if not con or not con.content:
        logging.info('failed to fetch captcha')
        return ''

    fname = '/tmp/' + us + '.jpg'
    save_file(con.content, fname)
    try:
        res = Captcha.resolve(fname, us)
    finally:
        # The temp image must not be left behind when the resolver raises.
        remove_file(fname)
    # NOTE(review): '<id>.txt' is presumably a side file written by
    # Captcha.resolve -- confirm. It is removed only on the success path, as
    # before, so a missing file cannot mask a resolver exception.
    remove_file(us + '.txt')
    return res
def run_job(self, jobid):
    """Download the page for one job and store it in ``self.pagestore``.

    If the first response contains the captcha input field, the ``path``
    value is taken from the response headers, a captcha image is fetched and
    resolved, and the download is retried with the captcha answer.

    Failures are written to ``self.failed_saver`` as
    ``'<stage>,<pnm>-<type>-<apply>'`` where stage is:
        1 - no ``?path=`` value found in the headers and the job could not
            be re-added;
        2 - the site answered that the requested file does not exist;
        3 - the captcha step did not succeed and the job could not be
            re-added.

    Args:
        jobid: mapping with at least the keys ``'pnm'``, ``'type'`` and
            ``'apply'``.

    NOTE(review): the original source was collapsed onto one line; the checks
    after the captcha retry are assumed to belong to the captcha branch --
    confirm against the upstream file.
    """
    captcha_marker = u'<input type="text" name="vct" />'

    def mark_failed(stage):
        # Record the job as permanently failed at the given stage.
        self.failed_saver.add(
            '%s,%s-%s-%s' % (stage, jobid['pnm'], jobid['type'], jobid['apply']))

    def retry_or_fail(stage):
        # Try to re-add the job; record a failure only if that is refused.
        if not self.re_add_job(jobid):
            mark_failed(stage)

    url = self.form_download_url(jobid['pnm'], jobid['type'])
    con = self.request_url(url, timeout=self.timeout)
    if self.check_exception(con, jobid):
        return

    if captcha_marker in con.text:
        # The download requires entering a captcha.
        m = re.search(r'\?path=([^&\s]*)', con.headers)
        if not m:
            # Prefer a FileWeb redirect for the diagnostic, else any Location.
            l_p = (re.search('Location:http://egaz.sipo.gov.cn/FileWeb/.*', con.headers)
                   or re.search('Location:.*', con.headers))
            location = l_p.group() if l_p else 'None'
            # Single-argument form: same text as the Python 2 print statement.
            print('wrong redirect page: %s location: %s' % (url, location))
            retry_or_fail(1)
            return
        path = m.group(1)

        img = self.request_url('http://egaz.sipo.gov.cn/FileWeb/vci.jpg')
        if not img or not img.content:
            # Previously a failed image request crashed on ``img.content``.
            # NOTE(review): reported under stage 3 (captcha step failed);
            # introduce a dedicated code if consumers need to tell them apart.
            retry_or_fail(3)
            return

        fn = jobid['pnm'] + '.jpg'
        save_file(img.content, fn)
        try:
            vci = Captcha.resolve(fn, jobid['pnm'])
        finally:
            # Remove the temp image even when the resolver raises.
            remove_file(fn)

        con = self.request_url(
            'http://egaz.sipo.gov.cn/FileWeb/pfs?path=%s&vct=%s' % (path, vci))
        if self.check_exception(con, jobid):
            return
        # Site message: "the file you want to download does not exist".
        if u'您要下载的文件不存在' in con.text:
            mark_failed(2)
            return
        if captcha_marker in con.text:
            # Still asked for a captcha: the answer was rejected.
            retry_or_fail(3)
            return

    self.pagestore.save(int(time.time()),
                        self.extract_seed_id(jobid['pnm'], jobid['apply']),
                        url, con.text)
#!/usr/bin/env python
# -*- coding:utf8 -*-
"""Download captcha images from egaz.sipo.gov.cn and save them under ./vci/."""
import time

from court.util import save_file
from spider.httpreq import BasicRequests

CAPTCHA_URL = 'http://egaz.sipo.gov.cn/FileWeb/vci.jpg'


def collect_captchas(total=100, url=CAPTCHA_URL, delay=1, max_failures=10):
    """Fetch ``total`` captcha images, one request every ``delay`` seconds.

    Each image is saved as ``./vci/100<k>.jpg`` where ``k`` counts down from
    ``total`` to 1; the number of images still to fetch is printed after
    every save.

    NOTE(review): the original source was collapsed onto one line; the
    counter is assumed to be decremented only after a successful download --
    confirm. The loop stops after ``max_failures`` consecutive failed
    requests so it cannot spin forever while the site is unreachable.

    Returns:
        The number of images saved.
    """
    req = BasicRequests()
    remaining = total
    failures = 0
    while remaining > 0:
        time.sleep(delay)
        con = req.request_url(url)
        if not con:
            failures += 1
            if failures >= max_failures:
                print('giving up after %s consecutive failed requests' % failures)
                break
            continue
        failures = 0
        save_file(con.content, './vci/100%s.jpg' % remaining)
        remaining -= 1
        # Single-argument form: identical output on Python 2 and Python 3.
        print(remaining)
    return total - remaining


if __name__ == '__main__':
    collect_captchas()