def resolve_captcha(self, url):
    """Solve the gk.chsi.com.cn access captcha on behalf of ``url``.

    Up to 100 attempts are made.  Each attempt downloads a fresh captcha
    image, runs it through ``Captcha.resolve`` and posts the recognised code
    together with ``url`` to the CheckAccess endpoint.

    :param url: value posted as the ``url`` form field; ``None`` is rejected
        up front, before any request is made.
    :return: the CheckAccess response once it no longer contains the
        CheckAccessForm; ``False`` when the captcha image could not be
        fetched; ``None`` when ``url`` is ``None`` or all attempts failed.
    """
    if url is None:
        # Checked before the loop: there is no point sleeping, downloading
        # and OCR-ing a captcha for an input that is always rejected.
        logging.error('Invalid host found %s', url)
        return None

    for _ in range(100):
        rd = random.random() * 10000
        time.sleep(1)
        con = super(GkChsiFsxSpider, self).request_url(
            'http://gk.chsi.com.cn/ValidatorIMG.JPG?ID=%s' % str(rd))
        if not con or not con.content:
            logging.info('failed to fetch captcha')
            return False

        fname = '/tmp/' + str(uuid.uuid4()) + '.jpg'
        save_file(con.content, fname)
        try:
            res = Captcha.resolve(fname, self.prefix)
        finally:
            # Drop the temporary image even when recognition raises.
            remove_file(fname)
        if not res:
            logging.error('fail to resolve captcha')
            continue

        time.sleep(1)
        con = super(GkChsiFsxSpider, self).request_url(
            'http://gk.chsi.com.cn/checkcode/CheckAccess.do',
            data={'CHKNUM': res, 'url': url})
        if con and con.text:
            m = re.search(
                '<form name="CheckAccessForm" method="post" action="/checkcode/CheckAccess.do">',
                con.text)
            if not m:
                # The captcha form is gone, so the submitted code was accepted.
                return con

    # Single-argument parenthesised form prints the same text on Python 2.
    print('try captcha times 100 %s' % url)
    return None
def gen_queries(self):
    """Rewrite the job file with one paged query line per case type.

    The existing job file is removed first.  For every entry of
    ``self.case_types`` a line ``key|value|page|pagesize`` is appended for
    each page from 1 up to ``count // pagesize + 1``.
    """
    remove_file(self.job_file)
    fs = FileSaver(self.job_file)
    pagesize = self.pagesize
    for ct in self.case_types:
        # Explicit floor division keeps the page count an integer even under
        # true division (``from __future__ import division`` / Python 3),
        # where ``/`` would hand ``range`` a float.
        # NOTE: when count is an exact multiple of pagesize this still emits
        # one page past the data, exactly as before.
        pcnt = ct['count'] // pagesize + 1
        for page in range(1, pcnt + 1):
            fs.append('|'.join(
                [ct['key'], str(ct['value']), str(page), str(pagesize)]))
def extract_swf_file(self, url, con):
    """Extract text from downloaded content via a temporary file.

    ``con`` is written in binary mode to a uniquely named file under
    ``/tmp``, the file is passed to ``swf2text`` and then deleted.

    :param url: not used by this method.
    :param con: raw content to write to the temporary file.
    :return: whatever ``swf2text`` returns for the temporary file.
    """
    fn = '/tmp/%s.pdf' % str(uuid.uuid4())
    # Open before entering the try block so the cleanup below only runs once
    # the file actually exists.
    f = open(fn, 'wb')
    try:
        with f:
            f.write(con)
        return swf2text(fn)
    finally:
        # Remove the temp file even when writing or conversion fails.
        remove_file(fn)
def resolve_captcha(self):
    """Download one captcha image from gk.chsi.com.cn and recognise it.

    :return: the value returned by ``Captcha.resolve`` for the downloaded
        image, or ``''`` when the image could not be fetched.
    """
    us = str(uuid.uuid4())
    rd = random.random() * 10000
    con = self.request_url('http://gk.chsi.com.cn/ValidatorIMG.JPG?ID=%s' % str(rd))
    if not con or not con.content:
        logging.info('failed to fetch captcha')
        return ''

    fname = '/tmp/' + us + '.jpg'
    save_file(con.content, fname)
    try:
        return Captcha.resolve(fname, us)
    finally:
        # Clean up even when recognition raises.  The ``<us>.txt`` file is
        # presumably the OCR output left behind by Captcha.resolve -- verify.
        remove_file(fname)
        remove_file(us + '.txt')
def run_job(self, jobid):
    """Download the document described by ``jobid`` and store the page.

    If the first response asks for a captcha, the download path is taken
    from the response headers, the captcha image is fetched and recognised,
    and the download is requested again with the recognised code.

    Jobs that cannot be completed are re-queued through ``re_add_job``; when
    that returns a falsy value a line ``<code>,<pnm>-<type>-<apply>`` is
    written to ``failed_saver``:

    * ``1`` -- no ``?path=`` value found in the response headers
    * ``2`` -- the site reports that the file does not exist (not re-queued)
    * ``3`` -- the captcha step did not succeed (captcha image unavailable,
      or the captcha form is still shown after submitting the code)

    :param jobid: mapping with the keys ``pnm``, ``type`` and ``apply``.
    """
    captcha_marker = u'<input type="text" name="vct" />'

    def retry_or_fail(code):
        # Re-queue the job; record it as failed when re_add_job declines.
        if not self.re_add_job(jobid):
            self.failed_saver.add(
                '%s,%s-%s-%s' % (code, jobid['pnm'], jobid['type'], jobid['apply']))

    url = self.form_download_url(jobid['pnm'], jobid['type'])
    con = self.request_url(url, timeout=self.timeout)
    if self.check_exception(con, jobid):
        return

    if captcha_marker in con.text:
        # The download requires entering a captcha.
        m = re.search(r'\?path=([^&\s]*)', con.headers)
        if not m:
            l_p = (re.search('Location:http://egaz.sipo.gov.cn/FileWeb/.*', con.headers)
                   or re.search('Location:.*', con.headers))
            location = l_p.group() if l_p else 'None'
            # Single-argument form prints the same text on Python 2.
            print('wrong redirect page: %s location: %s' % (url, location))
            retry_or_fail('1')
            return
        path = m.group(1)

        img = self.request_url('http://egaz.sipo.gov.cn/FileWeb/vci.jpg')
        if not img or not img.content:
            # Without the image there is nothing to recognise; treat it like
            # a captcha that was not accepted instead of crashing on None.
            retry_or_fail('3')
            return

        fn = jobid['pnm'] + '.jpg'
        save_file(img.content, fn)
        try:
            vci = Captcha.resolve(fn, jobid['pnm'])
            con = self.request_url(
                'http://egaz.sipo.gov.cn/FileWeb/pfs?path=%s&vct=%s' % (path, vci))
        finally:
            # Remove the captcha image even when OCR or the request raises.
            remove_file(fn)

        if self.check_exception(con, jobid):
            return
        if u'您要下载的文件不存在' in con.text:
            self.failed_saver.add(
                '2,%s-%s-%s' % (jobid['pnm'], jobid['type'], jobid['apply']))
            return
        if captcha_marker in con.text:
            # Still on the captcha form: the submitted code was rejected.
            retry_or_fail('3')
            return

    self.pagestore.save(int(time.time()),
                        self.extract_seed_id(jobid['pnm'], jobid['apply']),
                        url, con.text)
def resolve(filename, tag):
    """Recognise a captcha image by running tesseract on four 14x14 tiles.

    The image is loaded in grayscale, thresholded at 100 and cropped by
    3 rows top/bottom and 2 columns left/right.  Four consecutive 14-pixel
    wide tiles are then written one at a time to a temporary PNG and passed
    to tesseract (single-character mode, ``digits`` config).

    tesseract writes its result to ``<tag>.txt`` relative to the current
    working directory; that file is read back with ``cat`` and is NOT
    removed here.

    :param filename: path of the captcha image.
    :param tag: output base name handed to tesseract.
    :return: concatenated, stripped tesseract output of the four tiles, or
        ``''`` when the image cannot be read.
    """
    img = cv2.imread(filename, 0)
    if img is None:
        # cv2.imread reports an unreadable or missing image by returning None
        # rather than raising; callers treat an empty result as a failure.
        return ''
    blur = cv2.GaussianBlur(img, (1, 1), 0)
    ret, thresh = cv2.threshold(blur, 100, 255, cv2.THRESH_BINARY)
    binary = thresh[3:-3, 2:-2]

    outfile = '/tmp/' + str(uuid.uuid4()) + '.png'
    # The command does not depend on the tile index, so build it once.
    command = 'tesseract --tessdata-dir /usr/share/tesseract-ocr/tessdata/ %s %s -psm 10 digits 2> /dev/null && cat %s.txt' % (
        outfile, tag, tag)

    pieces = []
    try:
        for i in range(4):
            single = binary[0:14, 14 * i:14 * i + 14]
            cv2.imwrite(outfile, single)
            output = os.popen(command)
            try:
                pieces.append(output.read().strip())
            finally:
                # Close the pipe so the child process is reaped.
                output.close()
    finally:
        remove_file(outfile)
    return ''.join(pieces)
def do_on_finished(self):
    """Dump every job still waiting in the three job queues to disk.

    The remaining-jobs file is removed and recreated, then ``job_queue``,
    ``job_queue2`` and ``job_queue3`` are drained in that order; each job is
    appended to the file as one JSON document (``ensure_ascii=False``).
    """
    remove_file(self._remain_job_file)
    job_file = FileSaver(self._remain_job_file)
    for pending in (self.job_queue, self.job_queue2, self.job_queue3):
        while True:
            try:
                jobid = pending.get_nowait()
            except Queue.Empty:
                break
            # Acknowledge on the queue the job was taken from.  Calling
            # job_queue.task_done() for items of the other two queues can
            # raise ValueError and leaves those queues unacknowledged.
            pending.task_done()
            job_file.append(json.dumps(jobid, ensure_ascii=False))
def save(self):
    """Write every collected result to the saver file, replacing old content.

    An existing file at ``self.saver`` is removed first; each element of
    ``self.res`` is then appended as its ``str()`` form.
    """
    target = self.saver
    if os.path.exists(target):
        remove_file(target)

    writer = FileSaver(target)
    for record in self.res:
        line = str(record)
        writer.append(line)