Exemple #1
0
 def resolve_captcha(self, url):
     """Solve the gk.chsi.com.cn captcha and submit it for ``url``.

     Up to 100 attempts are made.  Each attempt downloads a fresh captcha
     image, resolves it with ``Captcha.resolve`` and posts the answer
     together with ``url`` to ``/checkcode/CheckAccess.do``.

     :param url: value posted in the ``url`` form field.
     :return: the response of the ``CheckAccess.do`` request as soon as
         its text no longer contains the ``CheckAccessForm`` form;
         ``False`` when the captcha image cannot be fetched; ``None``
         when ``url`` is ``None`` or every attempt has been used up.
     """
     # Validate the argument before doing any network work: the check does
     # not depend on anything computed inside the retry loop.
     if url is None:
         logging.error('Invalid host found %s', url)
         return None

     # Requests deliberately go through the parent class implementation.
     fetch = super(GkChsiFsxSpider, self).request_url

     for _ in range(100):
         rd = random.random() * 10000
         time.sleep(1)
         con = fetch('http://gk.chsi.com.cn/ValidatorIMG.JPG?ID=%s' % str(rd))
         if not con or not con.content:
             logging.info('failed to fetch captcha')
             return False

         fname = '/tmp/' + str(uuid.uuid4()) + '.jpg'
         save_file(con.content, fname)
         try:
             res = Captcha.resolve(fname, self.prefix)
         finally:
             # Always drop the temporary image, even if resolving raises.
             remove_file(fname)
         if not res:
             logging.error('fail to resolve captcha')
             continue

         data = {'CHKNUM': res, 'url': url}
         time.sleep(1)
         con = fetch('http://gk.chsi.com.cn/checkcode/CheckAccess.do',
                     data=data)
         if con and con.text:
             m = re.search(
                 '<form name="CheckAccessForm" method="post" action="/checkcode/CheckAccess.do">',
                 con.text)
             # The captcha form being absent is treated as success.
             if not m:
                 return con

     # Single-argument call form prints the same text on Python 2 and 3.
     print('try captcha times 100 %s' % url)
     return None
Exemple #2
0
 def resolve_captcha(self):
     """Fetch one captcha image from gk.chsi.com.cn and resolve it.

     :return: whatever ``Captcha.resolve`` returns for the downloaded
         image, or ``''`` when the image cannot be fetched.
     """
     token = str(uuid.uuid4())
     rd = random.random() * 10000
     con = self.request_url(
         'http://gk.chsi.com.cn/ValidatorIMG.JPG?ID=%s' % str(rd))
     if not con or not con.content:
         logging.info('failed to fetch captcha')
         return ''

     fname = '/tmp/' + token + '.jpg'
     save_file(con.content, fname)
     try:
         return Captcha.resolve(fname, token)
     finally:
         # Clean up even when resolving raises.  The '<token>.txt' file is
         # presumably a by-product of Captcha.resolve -- TODO confirm.
         remove_file(fname)
         remove_file(token + '.txt')
Exemple #3
0
 def run_job(self, jobid):
     """Download the page for one job and store it in ``self.pagestore``.

     ``jobid`` is a mapping read through the keys ``'pnm'``, ``'type'``
     and ``'apply'``.  When the first response contains the captcha input
     field, the captcha image is fetched, resolved and submitted together
     with the ``path`` parameter found in the response headers.

     Failures are written to ``self.failed_saver`` as
     ``'<code>,<pnm>-<type>-<apply>'``:

     * ``1`` -- no ``?path=`` parameter in the headers (wrong redirect)
       and the job could not be re-added;
     * ``2`` -- the site reports that the requested file does not exist;
     * ``3`` -- the captcha step failed (image not fetched, or the captcha
       form is shown again) and the job could not be re-added.

     Returns ``None`` in every case.
     """
     captcha_field = u'<input type="text" name="vct" />'

     def record_failure(code):
         # One place for the failure-record format used by every branch.
         self.failed_saver.add(
             '%s,%s-%s-%s' %
             (code, jobid['pnm'], jobid['type'], jobid['apply']))

     def retry_or_fail(code):
         # Re-queue the job; only record a failure if that is refused.
         if not self.re_add_job(jobid):
             record_failure(code)

     url = self.form_download_url(jobid['pnm'], jobid['type'])
     con = self.request_url(url, timeout=self.timeout)
     if self.check_exception(con, jobid):
         return

     if captcha_field in con.text:
         # A captcha has to be entered before the download is served.
         m = re.search(r'\?path=([^&\s]*)', con.headers)
         if not m:
             # Report where we were redirected to, preferring a FileWeb
             # location over any other Location header.
             l_p = (re.search('Location:http://egaz.sipo.gov.cn/FileWeb/.*',
                              con.headers) or
                    re.search('Location:.*', con.headers))
             location = l_p.group() if l_p else 'None'
             # Single formatted string: same output on Python 2 and 3.
             print('wrong redirect page: %s location: %s' % (url, location))
             retry_or_fail(1)
             return
         path = m.group(1)

         img = self.request_url('http://egaz.sipo.gov.cn/FileWeb/vci.jpg')
         if not img or not img.content:
             # A failed image request must not crash on ``img.content``.
             # NOTE(review): recorded under code 3 like a rejected captcha;
             # confirm that suits the consumers of failed_saver.
             retry_or_fail(3)
             return

         fn = jobid['pnm'] + '.jpg'
         save_file(img.content, fn)
         try:
             vci = Captcha.resolve(fn, jobid['pnm'])
             con = self.request_url(
                 'http://egaz.sipo.gov.cn/FileWeb/pfs?path=%s&vct=%s' %
                 (path, vci))
         finally:
             # Remove the captcha image even if solving/submitting raises.
             remove_file(fn)

         if self.check_exception(con, jobid):
             return
         # Literal page text meaning "the file you want to download does
         # not exist".
         if u'您要下载的文件不存在' in con.text:
             record_failure(2)
             return
         if captcha_field in con.text:
             # Captcha form shown again: the submitted answer was rejected.
             retry_or_fail(3)
             return

     self.pagestore.save(int(time.time()),
                         self.extract_seed_id(jobid['pnm'], jobid['apply']),
                         url, con.text)
Exemple #4
0
#!/usr/bin/env python
# -*- coding:utf8 -*-
import time

from court.util import save_file
from spider.httpreq import BasicRequests

def main():
    """Collect 100 sample captcha images from egaz.sipo.gov.cn.

    One request is made per second.  Each successful response is saved as
    ``./vci/100<k>.jpg`` with ``k`` counting down from 100 to 1, and the
    number of images still to fetch is printed after every save.  Failed
    requests are simply retried.
    """
    remaining = 100
    client = BasicRequests()
    while remaining > 0:
        time.sleep(1)
        resp = client.request_url('http://egaz.sipo.gov.cn/FileWeb/vci.jpg')
        if not resp:
            # Nothing usable came back; try again on the next tick.
            continue
        save_file(resp.content, './vci/100%s.jpg' % remaining)
        remaining -= 1
        print(remaining)


if __name__ == '__main__':
    main()