Esempio n. 1
0
 def resolve_captcha(self, url):
     """Solve the gk.chsi.com.cn captcha gate and return the unlocked response.

     Up to 100 times: download a captcha image, run it through
     ``Captcha.resolve`` and post the answer together with ``url`` to
     ``/checkcode/CheckAccess.do``.

     :param url: value sent in the ``url`` form field of the check request.
     :return: the response of the check request as soon as its text no
         longer contains the ``CheckAccessForm`` form; ``False`` when a
         captcha image cannot be downloaded; ``None`` when ``url`` is
         ``None`` or every attempt failed.
     """
     # Validate the argument before doing any network or OCR work; the
     # original only noticed a missing url after a captcha had been solved.
     if url is None:
         logging.error('Invalid host found %s', url)
         return None

     # The form that is served again whenever the submitted answer is rejected.
     form_pattern = '<form name="CheckAccessForm" method="post" action="/checkcode/CheckAccess.do">'

     for _ in range(100):
         rd = random.random() * 10000
         time.sleep(1)
         con = super(GkChsiFsxSpider, self).request_url(
             'http://gk.chsi.com.cn/ValidatorIMG.JPG?ID=%s' % str(rd))
         if not con or not con.content:
             logging.info('failed to fetch captcha')
             return False

         fname = '/tmp/' + str(uuid.uuid4()) + '.jpg'
         save_file(con.content, fname)
         try:
             res = Captcha.resolve(fname, self.prefix)
         finally:
             # Always drop the temporary image, even if the OCR step raises.
             remove_file(fname)
         if not res:
             logging.error('fail to resolve captcha')
             continue

         data = {'CHKNUM': res, 'url': url}
         time.sleep(1)
         con = super(GkChsiFsxSpider, self).request_url(
             'http://gk.chsi.com.cn/checkcode/CheckAccess.do', data=data)
         # An empty/failed response or a page that still shows the form
         # means this attempt did not pass; try again with a new captcha.
         if con and con.text and not re.search(form_pattern, con.text):
             return con

     # Single-argument call form: prints the same text under Python 2 and 3.
     print('try captcha times 100 %s' % url)
     return None
Esempio n. 2
0
 def gen_queries(self):
     """Rebuild the job file with one query line per page of every case type.

     The existing job file is removed first. For each entry of
     ``self.case_types`` a line ``key|value|page|pagesize`` is appended for
     every page number from 1 up to the computed page count.
     """
     remove_file(self.job_file)
     fs = FileSaver(self.job_file)
     pagesize_text = str(self.pagesize)
     for ct in self.case_types:
         # Explicit floor division: the original '/' only produced an int
         # under Python 2 semantics and would hand range() a float under
         # true division.
         # NOTE(review): when count is an exact multiple of pagesize this
         # still yields one extra trailing page, exactly as before; kept
         # on purpose because dropping it could skip results -- confirm.
         pcnt = ct['count'] // self.pagesize + 1
         for page in range(1, pcnt + 1):
             fs.append('|'.join(
                 [ct['key'], str(ct['value']), str(page), pagesize_text]))
Esempio n. 3
0
 def extract_swf_file(self, url, con):
     """Write ``con`` to a temporary file and return ``swf2text`` of that file.

     :param url: not used by this method; kept for interface compatibility.
     :param con: raw content written to the temporary file in binary mode.
     :return: whatever ``swf2text`` returns for the temporary file.
     """
     # str(uuid.uuid4()) can never be None, so the original guard and its
     # trailing ``return None`` branch were unreachable and are dropped.
     fn = '/tmp/%s.pdf' % str(uuid.uuid4())
     with open(fn, 'wb') as f:
         f.write(con)
     try:
         return swf2text(fn)
     finally:
         # Remove the temporary file even when the conversion raises.
         remove_file(fn)
Esempio n. 4
0
 def resolve_captcha(self):
     """Download one captcha image from gk.chsi.com.cn and return its OCR text.

     :return: the result of ``Captcha.resolve`` for the downloaded image, or
         ``''`` when the image could not be fetched.
     """
     us = str(uuid.uuid4())
     rd = random.random() * 10000
     con = self.request_url('http://gk.chsi.com.cn/ValidatorIMG.JPG?ID=%s' %
                            str(rd))
     if not con or not con.content:
         logging.info('failed to fetch captcha')
         return ''
     fname = '/tmp/' + us + '.jpg'
     save_file(con.content, fname)
     try:
         return Captcha.resolve(fname, us)
     finally:
         # Clean up even when the OCR step raises. ``us`` is passed to
         # Captcha.resolve as its tag; the ``<us>.txt`` file is presumably
         # the OCR output written under that tag -- verify against
         # Captcha.resolve.
         remove_file(fname)
         remove_file(us + '.txt')
Esempio n. 5
0
 def run_job(self, jobid):
     """Download the document for ``jobid`` and store the page text.

     If the first response contains the captcha input field, the ``path``
     query value is taken from the response headers, the captcha image is
     fetched and resolved, and the download is retried with the answer.

     Failures are written to ``self.failed_saver`` as
     ``<code>,<pnm>-<type>-<apply>`` with these codes:

     * ``1`` - no ``?path=`` value found in the headers and the job could
       not be re-added;
     * ``2`` - the site reports that the requested file does not exist;
     * ``3`` - the captcha form is shown again after submitting the answer
       (or the captcha image could not be fetched) and the job could not
       be re-added.

     :param jobid: mapping with at least the keys ``pnm``, ``type`` and
         ``apply``.
     """
     def record_failure(code):
         # Single place for the failure-record format used by every branch.
         self.failed_saver.add(
             '%s,%s-%s-%s' %
             (code, jobid['pnm'], jobid['type'], jobid['apply']))

     url = self.form_download_url(jobid['pnm'], jobid['type'])
     con = self.request_url(url, timeout=self.timeout)
     if self.check_exception(con, jobid):
         return
     if u'<input type="text" name="vct" />' in con.text:
         # The download requires entering a captcha.
         m = re.search(r'\?path=([^&\s]*)', con.headers)
         if not m:
             # No path to download from: report where we were redirected.
             l_p = re.search('Location:http://egaz.sipo.gov.cn/FileWeb/.*',
                             con.headers)
             if not l_p:
                 l_p = re.search('Location:.*', con.headers)
             location = l_p.group() if l_p else 'None'
             # Single-argument call form: same output under Python 2 and 3.
             print('wrong redirect page: %s location: %s' % (url, location))
             if not self.re_add_job(jobid):
                 record_failure(1)
             return
         path = m.group(1)

         img = self.request_url('http://egaz.sipo.gov.cn/FileWeb/vci.jpg')
         if not img or not img.content:
             # The original dereferenced img.content unconditionally and
             # crashed when the image request failed.
             # NOTE(review): handled like a failed captcha attempt
             # (re-queue, else code 3) -- confirm this is the desired code.
             if not self.re_add_job(jobid):
                 record_failure(3)
             return
         fn = jobid['pnm'] + '.jpg'
         save_file(img.content, fn)
         try:
             vci = Captcha.resolve(fn, jobid['pnm'])
         finally:
             # Remove the captcha image even when the OCR step raises.
             remove_file(fn)
         con = self.request_url(
             'http://egaz.sipo.gov.cn/FileWeb/pfs?path=%s&vct=%s' %
             (path, vci))
         if self.check_exception(con, jobid):
             return
         if u'您要下载的文件不存在' in con.text:
             record_failure(2)
             return
         if u'<input type="text" name="vct" />' in con.text:
             # Captcha form shown again: the submitted answer was rejected.
             if not self.re_add_job(jobid):
                 record_failure(3)
             return
     self.pagestore.save(int(time.time()),
                         self.extract_seed_id(jobid['pnm'], jobid['apply']),
                         url, con.text)
Esempio n. 6
0
 def resolve(filename, tag):
     """OCR a four-character captcha image with tesseract, one cell at a time.

     The image is read as grayscale, thresholded at 100, cropped by
     ``[3:-3, 2:-2]`` and cut into four 14x14 cells. Each cell is written to
     a temporary PNG and passed to ``tesseract`` (``-psm 10 digits``) with
     ``tag`` as the output base; the stripped content of ``<tag>.txt`` is
     appended to the result.

     :param filename: path of the captcha image.
     :param tag: output base name handed to tesseract; ``<tag>.txt`` is read
         back after each run and is left in place for the caller.
     :return: concatenated text of the cells that were recognised; ``''``
         when the image cannot be read. Cells whose tesseract run fails
         contribute nothing, as before.
     """
     import subprocess  # local import: this block cannot edit the file header

     img = cv2.imread(filename, 0)
     if img is None:
         # cv2.imread signals an unreadable file by returning None.
         return ''
     blur = cv2.GaussianBlur(img, (1, 1), 0)
     _, thresh = cv2.threshold(blur, 100, 255, cv2.THRESH_BINARY)
     binary = thresh[3:-3, 2:-2]
     outfile = '/tmp/' + str(uuid.uuid4()) + '.png'
     text_file = tag + '.txt'
     pieces = []
     try:
         with open(os.devnull, 'w') as devnull:
             for i in range(4):
                 single = binary[0:14, 14 * i:14 * i + 14]
                 cv2.imwrite(outfile, single)
                 # Argument list instead of a shell string: paths and tags
                 # containing spaces or shell metacharacters are passed
                 # through verbatim. Tesseract's own console output is
                 # discarded; only <tag>.txt is used.
                 try:
                     status = subprocess.call(
                         ['tesseract', '--tessdata-dir',
                          '/usr/share/tesseract-ocr/tessdata/',
                          outfile, tag, '-psm', '10', 'digits'],
                         stdout=devnull, stderr=devnull)
                 except OSError:
                     # tesseract missing/not executable: same outcome as
                     # the failed shell command in the original.
                     continue
                 if status != 0:
                     # Mirrors the original '&&': no read after a failure.
                     continue
                 try:
                     with open(text_file) as fh:
                         pieces.append(fh.read().strip())
                 except IOError:
                     # Output file absent: contributes nothing, like a
                     # failing 'cat' did.
                     continue
     finally:
         # Remove the temporary cell image even if a step above raises.
         remove_file(outfile)
     return ''.join(pieces)
Esempio n. 7
0
 def do_on_finished(self):
     """Dump every job still waiting in the three job queues to the remain file.

     The remain-job file is removed and recreated, then ``job_queue``,
     ``job_queue2`` and ``job_queue3`` are drained in that order; each job is
     appended as one JSON document (``ensure_ascii=False``).
     """
     remove_file(self._remain_job_file)
     job_file = FileSaver(self._remain_job_file)
     for pending in (self.job_queue, self.job_queue2, self.job_queue3):
         while True:
             try:
                 jobid = pending.get_nowait()
             except Queue.Empty:
                 break
             # Acknowledge on the queue the job actually came from. The
             # original always called self.job_queue.task_done(), which
             # over-counts job_queue (task_done() raises ValueError when
             # called more times than items were put) and never marks
             # job_queue2/job_queue3 items as done.
             pending.task_done()
             job_file.append(json.dumps(jobid, ensure_ascii=False))
Esempio n. 8
0
 def save(self):
     """Recreate the file at ``self.saver`` and append every item of ``self.res``.

     An existing file at that path is removed first; each item is passed to
     ``FileSaver.append`` as ``str(item)``.
     """
     target = self.saver
     if os.path.exists(target):
         remove_file(target)
     writer = FileSaver(target)
     for item in self.res:
         text = str(item)
         writer.append(text)