コード例 #1
0
ファイル: onlineocr.py プロジェクト: wolfwhoami/xxxxx
 def resolve(self, getimg, fc=None):
     """Fetch a captcha image via *getimg* and OCR it through the remote
     resolver service, retrying until a usable code is obtained.

     :param getimg: callable returning raw image bytes; receives the
         debug context via ``dbgdata=fc``.
     :param fc: optional dict.  When it contains a ``"type"`` key the
         JSON endpoint is used; on return the image bytes and code are
         written back into it under ``'content'`` / ``'code'``.
     :returns: the recognized captcha text.  NOTE(review): on the JSON
         path, a response without ``'"valid":true'`` falls through and
         returns None without retrying -- confirm this is intended.
     """
     while True:
         imgcon = getimg(dbgdata=fc)
         # Buffer the image for multipart upload; only the legacy
         # (else) branch actually uses this file object.
         f = StringIO.StringIO()
         f.write(imgcon)
         f.seek(0)
         h = BasicRequests()
         imgcode = None
         if fc is not None and isinstance(fc, dict) and "type" in fc:
             # JSON API: expects {"valid": true, "answer": "..."}.
             response = h.request_url(self.server, files={'file': imgcon}, data={"province": self._type})
             result = response.text.strip()
             if '"valid":true' in result:
                 try:
                     result = json.loads(result)
                     imgcode = result["answer"]
                     if imgcode == None or imgcode == "":
                         print "验证码图片识别错误 imgcode==None或''"
                         continue
                 except Exception as err:
                     # Malformed JSON or missing "answer": wait and retry.
                     print "验证码图片识别错误,重新校验...,result:", result, "错误原因:", err
                     time.sleep(1)
                     continue
         else:
             # Legacy endpoint: type in query string, plain-text reply;
             # the literal '<fail>' marks a failed recognition.
             url = "%s?type=%s" % (self.server, self._type)
             response = h.request_url(url, files={'file': f})
             imgcode = response.text.strip()
             if imgcode == '<fail>':
                 print "验证码图片识别错误 imgcode==<fail>"
                 continue
         if isinstance(fc, dict):
             # Hand image + code back to the caller for debugging/replay.
             fc['content'] = imgcon
             fc['code'] = imgcode
         return imgcode
コード例 #2
0
def find_ipin_proxy():
    ff = IpinFactory()
    prs = ff.getProxyList()
    s = BasicRequests()
    #print json.dumps(prs, ensure_ascii=0).encode('utf-8')
    res = {}
    for p in prs:
        print "trying", p
        auth, proxies = ff.genRequestsParam(p)
        con = s.request_url("http://ip.cn/",
                            auth=auth,
                            proxies=proxies,
                            timeout=6)
        if con is None:
            continue
        m = re.search("<code>(.*?)</code>", con.text)
        if m:
            sys.stderr.write("%s %s\n" % (p['host'], m.group(1)))
            res[m.group(1)] = "%s:%s:%s" % (p.get('host'), p.get('port'),
                                            p.get('password'))
        p2, proxies = ff.genTinyProxy(p)
        print proxies
        con = s.request_url("http://ip.cn/", proxies=proxies, timeout=5)
        if con is None:
            continue
        m = re.search("<code>(.*?)</code>", con.text)
        if m:
            sys.stderr.write("%s %s\n" % (p['host'], m.group(1)))
            res[m.group(1)] = p2
    print "\n".join(res.values())
コード例 #3
0
ファイル: proxy.py プロジェクト: wolfwhoami/xxxxx
 def test_proxy_speed(url, proxy, t=60):
     """Hammer *url* through *proxy* for *t* seconds and return how many
     requests completed; exceptions count as failures and are ignored."""
     req = BasicRequests()
     req.set_proxy(proxy, 0, False)
     started = time.time()
     completed = 0
     while time.time() - started <= t:
         try:
             req.request_url(url, timeout=5)
         except Exception:
             continue
         completed += 1
     return completed
コード例 #4
0
def post_for_proxy():
    """Fetch a batch of 50 proxies from the kuaidaili API and return the
    parsed response as a Python object (None when the request fails)."""
    req = BasicRequests()
    con = req.request_url(
        'http://dev.kuaidaili.com/api/getproxy/?orderid=925817981728018&num=50&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=2&an_an=1&an_ha=1&sp1=1&quality=1&sort=1&format=json&sep=1'
    )
    if con:
        # SECURITY: eval() on a network response executes arbitrary code
        # if the endpoint is compromised or spoofed.  The request asks
        # for format=json, so json.loads(con.text) should work instead
        # -- TODO confirm the payload is strict JSON and switch.
        return eval(con.text)
コード例 #5
0
ファイル: tmp.py プロジェクト: wolfwhoami/xxxxx
def init_data():
    br = BasicRequests()
    br.select_user_agent('firefox')
    s = open('af').read(64 * 1024)
    odata = []
    for m in re.finditer('<a (.*?)>(.*?)</a>', s, re.S):
        name = m.group(2)
        attrs = m.group(1)
        url = None
        prov = None
        for m1 in re.finditer('([a-z][a-z0-9]+)="(.*?)"', attrs, re.S):
            n, v = m1.group(1), m1.group(2)
            if n == 'href':
                url = v
            if n == "prov":
                prov = v
        con = br.request_url(url)
        siteurl = 'unknown'
        if con is not None:
            siteurl = con.request.url
        print name, con.request.url
        odata.append({
            'name': name,
            'url': siteurl,
            'imgurl': '',
            'prov': prov
        })
    print json.dumps(odata, ensure_ascii=0, indent=4)
コード例 #6
0
ファイル: _51job.py プロジェクト: wolfwhoami/xxxxx
def test_ps():
    """Fetch one 51job job-detail page and push it through PageStore51
    running in test mode."""
    store = PageStore51()
    store.testmode = True
    fetcher = BasicRequests()
    fetcher.select_user_agent('firefox')
    url = "http://jobs.51job.com/beijing-hdq/70320056.html?s=0"
    page = fetcher.request_url(url)
    store.save(int(time.time()), "jd_51job://", url, page.text)
コード例 #7
0
 def test_login(self):
     req = BasicRequests()
     con = req.request_url(self.main_url + 'login',
                           date={
                               'username': '******',
                               'password': '******'
                           })
     print con.headers
     print con.text
コード例 #8
0
ファイル: _court_hz.py プロジェクト: wolfwhoami/xxxxx
def test_parse_time():
    request = BasicRequests()
    con = request.request_url(
        'http://www.zjsfgkw.cn/attachment/documentbook/2016-04-05/0225-0229/html/671a34a7-b068-4025-af13-d9fe4c28ce6a.html'
    )
    m = re.search(
        ur'[一二三四五六七八九〇零○十]{4}年[一二三四五六七八九〇十○]{1,2}月[一二三四五六七八九〇零○十]{1,3}日',
        con.text)
    if m:
        print date_cs2num(m.group())
コード例 #9
0
def try_proxy(proxy, url='http://gaokao.chsi.com.cn', tag=u'阳光高考'):
    req = BasicRequests()
    req.set_proxy(proxy, 0, False)
    # con = req.request_url('http://gk.chsi.com.cn/recruit/listSpecBySchool.do?yxdm=11055&start=0 ')
    con = req.request_url(url, timeout=5)
    if con:
        m = re.search(r'<title>[^<]*<\/title>', con.text)
        if m:
            print m.group()
        return re.search(tag, con.text)
コード例 #10
0
ファイル: test_set.py プロジェクト: wolfwhoami/xxxxx
def test_search():
    a = BasicRequests()
    while True:
        data = {
            'docids1': "1,2,3,4,5,6,6109234,6110168,11070364",
            "keywords": "武汉"
        }
        con = a.request_url("http://localhost:4096/search?hehe=1", data=data)
        if con is not None:
            print con.code, con.text
        time.sleep(10)
コード例 #11
0
ファイル: _court_hz.py プロジェクト: wolfwhoami/xxxxx
def test_extract_inner_paper_url():
    rq = BasicRequests()
    con = rq.request_url(
        'http://www.zjsfgkw.cn/document/JudgmentDetail/4177773')
    content = re.search(r'<div class="books_detail_header">.*</IFRAME>',
                        con.text, re.S)
    m = re.search(r'src="([^"]+)"', content.group())
    if m:
        print m.group(1)
    else:
        print content
コード例 #12
0
ファイル: proxy.py プロジェクト: wolfwhoami/xxxxx
 def filter_with_speed(proxies, url='http://www.baidu.com', timeout=10):
     """Return the subset of *proxies* through which *url* can actually
     be fetched within *timeout* seconds."""
     req = BasicRequests()
     usable = []
     for candidate in proxies:
         req.set_proxy(candidate, len(req.sp_proxies), False)
         try:
             response = req.request_url(url, timeout=timeout)
         except Exception:
             response = None
         if response:
             usable.append(candidate)
     return usable
コード例 #13
0
 def get_child_court(self, court):
     req = BasicRequests()
     req.set_proxy(self.proxy)
     time.sleep(1)
     print 'fetching child court', court['key']
     con = req.request_url('http://wenshu.court.gov.cn/Index/GetChildAllCourt',
                           data={'keyCodeArrayStr': court['key']})
     if '<' in con.text:
         return
     court = self.parse_results(con)
     for c in court:
         self.child_courts.append(c)
コード例 #14
0
 def get_court(self):
     req = BasicRequests()
     req.set_proxy(self.proxy)
     for p in self.provinces:
         time.sleep(1)
         print 'fetch province', p['name']
         con = req.request_url('http://wenshu.court.gov.cn/Index/GetCourt', data={'province': p['name']})
         if '<' in con.text:
             print 'invalid response'
             continue
         court = self.parse_results(con)
         for c in court:
             self.courts.append(c)
コード例 #15
0
def runjs(url):
    rq = BasicRequests()
    con = rq.request_url(url, data={})
    if con:
        print con.text
        m = re.findall(r'<script[^>]*>(.+?)</script>', con.text, re.S)
        if m:
            for js in m:
                if js == '':
                    continue
                print js
                sc = "document = {set cookie(a){console.log(a);}}, window = {innerWidth: 1024, innerHeight: 768, screenX: 200, screenY: 100, screen: {width: 1024, height: 768}}\n"
                sc += js
                rv = spider.util.runjs(sc)
                print 'my results:'
                print rv
コード例 #16
0
def test_proxy(proxy, url, count=10):
    """Fetch *url* ``count`` times through ``proxy['p']`` and store the
    average latency (seconds) of successful requests into ``proxy['v']``.

    ``proxy['v']`` is left untouched when every attempt fails.

    Fix: the bare ``except:`` also swallowed SystemExit and
    KeyboardInterrupt; narrowed to ``except Exception``.
    """
    remaining = count
    rq = BasicRequests()
    rq.set_proxy(proxy['p'])
    total = 0
    success = 0
    while remaining > 0:
        try:
            started = time.time()
            con = rq.request_url(url)
            elapsed = time.time() - started
        except Exception:
            con = None
            elapsed = 0
        remaining -= 1
        if con:
            success += 1
            total += elapsed
    if success > 0:
        proxy['v'] = total / success
コード例 #17
0
def get_area_code():
    """Scrape the NBS administrative-division code table and build a map
    from region name (short or full) to a 4-digit prefecture code prefix.

    Relies on external helpers whose behavior is not visible here:
    ``sv`` (tag-replacement callback) and ``get_short_name``.
    Returns a dict of name -> 4-digit code string.
    """
    nr = BasicRequests()
    nr.select_user_agent('firefox')
    con = nr.request_url(
        'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html'
    )
    # Extract the table container and strip markup down to plain text.
    xx = spider.util.htmlfind(con.text, '<div class="TRS_PreAppend"', 0)
    shtml = xx.get_node()
    stext = re.sub('<.*?>', lambda m: sv(m), shtml)
    stext = re.sub('&nbsp;', ' ', stext)

    outmap = {}      # short name -> chosen 4-digit prefecture prefix
    cclist = {}      # short name -> {prefix -> [[code, full name], ...]}
    fulloutmap = {}  # full name -> list of 4-digit prefixes seen

    for line in re.split("\n", stext):
        # Each data line is "<code> <name>" separated by whitespace.
        cns = re.split(r'\s+', line)
        if len(cns) <= 1:
            continue
        code, name = cns
        name = name.decode('utf-8').strip()

        # Skip generic/placeholder division names.
        if u'直辖县级行政区划' in name:
            continue
        if name in [u'市辖区', u'区', u'县', u'矿区', u'郊区', u'城区']:
            continue
        if name not in fulloutmap:
            fulloutmap[name] = []
        fulloutmap[name].append(code[0:4])

        # Also index by the shortened name when it differs from the full one.
        name1 = get_short_name(name)
        if name1 is None or name1 == name:
            continue
        if name1 not in cclist:
            cclist[name1] = {}
        if code[0:4] not in cclist[name1]:
            cclist[name1][code[0:4]] = []
        cclist[name1][code[0:4]].append([code, name])

    for key in cclist.keys():
        # Hard-coded disambiguations for short names that are ambiguous
        # (e.g. Jilin the city vs. Jilin the province).
        if key == u'吉林':
            outmap[key] = "2202"
        elif key == u"海南":
            outmap[key] = "4600"
        elif len(cclist[key]) == 1:
            # Only one candidate prefix -> take it.
            thekey = cclist[key].keys()[0]
            outmap[key] = thekey
        else:
            # Several candidates: prefer the entry that is itself a
            # prefecture-level code (last two digits '00').
            preflist = []
            for thekey, v in cclist[key].items():
                for code, name in v:
                    if code[-2:] == '00':
                        preflist.append(code)
            if len(preflist) == 0:
                pass
            elif len(preflist) == 1:
                outmap[key] = preflist[0][0:4]
            else:
                # More than one prefecture-level match is unexpected.
                assert not "nani?"

    # Merge: short-name picks first, then unambiguous full names.
    fout = {}
    for k, v in outmap.items():
        fout[k] = v
    for k, v in fulloutmap.items():
        if len(v) == 1:
            fout[k] = v[0]
    return fout
コード例 #18
0
 def test_find(self):
     req = BasicRequests()
     con = req.request_url(self.main_url + '?key=' + 'mumas')
     print con.text
     con = req.request_url(self.main_url + '?key=' + 'skiloop')
     print con.text
コード例 #19
0
def load_url(url):
    br = BasicRequests()
    con = br.request_url(url)
    print con.text
コード例 #20
0
#!/usr/bin/env python
# -*- coding:utf8 -*-
import time

from court.util import save_file
from spider.httpreq import BasicRequests

if __name__ == '__main__':
    count = 100
    req = BasicRequests()
    while count > 0:
        time.sleep(1)
        con = req.request_url('http://egaz.sipo.gov.cn/FileWeb/vci.jpg')
        if con:
            save_file(con.content, './vci/100%s.jpg' % count)
            count -= 1
            print count
コード例 #21
0
ファイル: _test_network.py プロジェクト: wolfwhoami/xxxxx
#!/usr/bin/env python
# -*- coding:utf8 -*-
from spider.httpreq import BasicRequests

if "__main__" == __name__:
    rq = BasicRequests()
    rq.set_proxy('106.75.134.190:18888:ipin:ipin1234')
    con = rq.request_url('http://www.zjsfgkw.cn/document/JudgmentDetail/4062962')
    if con:
        print con.text
コード例 #22
0
#!/usr/bin/env python
# -*- coding:utf8 -*-
import io
import tesseract_ocr
import pyocr
from PIL import Image

from court.util import Captcha

from spider.httpreq import BasicRequests

if __name__ == '__main__':

    rq = BasicRequests()
    rq.select_user_agent('firefox')
    rq.set_proxy('106.75.134.191:18888:ipin:ipin1234')
    # con = rq.request_url('http://ssfw.szcourt.gov.cn/yzm.jsp')
    con = rq.request_url('http://www.bjcourt.gov.cn/yzm.jpg')
    if not con:
        print 'failed to fetch image'
    else:
        t = tesseract_ocr.Tesseract()
        text = t.text_for_bytes(con.content)

        print text
        with open('a.jpeg', 'wb') as f:
            f.write(con.content)

        print Captcha.resolve('a.jpeg', '1')