Ejemplo n.º 1
0
def init_data():
    br = BasicRequests()
    br.select_user_agent('firefox')
    s = open('af').read(64 * 1024)
    odata = []
    for m in re.finditer('<a (.*?)>(.*?)</a>', s, re.S):
        name = m.group(2)
        attrs = m.group(1)
        url = None
        prov = None
        for m1 in re.finditer('([a-z][a-z0-9]+)="(.*?)"', attrs, re.S):
            n, v = m1.group(1), m1.group(2)
            if n == 'href':
                url = v
            if n == "prov":
                prov = v
        con = br.request_url(url)
        siteurl = 'unknown'
        if con is not None:
            siteurl = con.request.url
        print name, con.request.url
        odata.append({
            'name': name,
            'url': siteurl,
            'imgurl': '',
            'prov': prov
        })
    print json.dumps(odata, ensure_ascii=0, indent=4)
Ejemplo n.º 2
0
def test_ps():
    """Fetch one 51job posting and push it through PageStore51 in test mode."""
    store = PageStore51()
    store.testmode = True
    fetcher = BasicRequests()
    fetcher.select_user_agent('firefox')
    jd_url = "http://jobs.51job.com/beijing-hdq/70320056.html?s=0"
    response = fetcher.request_url(jd_url)
    store.save(int(time.time()), "jd_51job://", jd_url, response.text)
Ejemplo n.º 3
0
#!/usr/bin/env python
# -*- coding:utf8 -*-
import io
import tesseract_ocr
import pyocr
from PIL import Image

from court.util import Captcha

from spider.httpreq import BasicRequests

if __name__ == '__main__':

    rq = BasicRequests()
    rq.select_user_agent('firefox')
    rq.set_proxy('106.75.134.191:18888:ipin:ipin1234')
    # con = rq.request_url('http://ssfw.szcourt.gov.cn/yzm.jsp')
    con = rq.request_url('http://www.bjcourt.gov.cn/yzm.jpg')
    if not con:
        print 'failed to fetch image'
    else:
        t = tesseract_ocr.Tesseract()
        text = t.text_for_bytes(con.content)

        print text
        with open('a.jpeg', 'wb') as f:
            f.write(con.content)

        print Captcha.resolve('a.jpeg', '1')
Ejemplo n.º 4
0
def get_area_code():
    """Build a mapping from Chinese administrative-area names to 4-digit codes.

    Downloads the 2015 division-code table from the National Bureau of
    Statistics, strips the HTML down to "<code> <name>" lines, and returns a
    dict mapping names (both full names and, where unambiguous, shortened
    names) to the first four digits of their division code.
    """
    nr = BasicRequests()
    nr.select_user_agent('firefox')
    con = nr.request_url(
        'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html'
    )
    # Grab the table container, then erase all tags.  `sv` is an external
    # replacement callback -- presumably it maps tags to whitespace/newlines
    # so the split below works; TODO confirm against its definition.
    xx = spider.util.htmlfind(con.text, '<div class="TRS_PreAppend"', 0)
    shtml = xx.get_node()
    stext = re.sub('<.*?>', lambda m: sv(m), shtml)
    stext = re.sub('&nbsp;', ' ', stext)

    outmap = {}      # short name -> disambiguated 4-digit prefix
    cclist = {}      # short name -> {4-digit prefix -> [[full code, full name], ...]}
    fulloutmap = {}  # full name -> list of 4-digit prefixes seen for it

    for line in re.split("\n", stext):
        # Each data line is expected to be "<6-digit code> <name>";
        # a line with more whitespace-separated fields would raise here.
        cns = re.split(r'\s+', line)
        if len(cns) <= 1:
            continue
        code, name = cns
        # Python 2: decode the byte string so the u'' comparisons below work.
        name = name.decode('utf-8').strip()

        # Skip placeholder rows that are not real, nameable localities.
        if u'直辖县级行政区划' in name:
            continue
        if name in [u'市辖区', u'区', u'县', u'矿区', u'郊区', u'城区']:
            continue
        if name not in fulloutmap:
            fulloutmap[name] = []
        fulloutmap[name].append(code[0:4])

        # Group rows by shortened name (external helper); entries that have
        # no distinct short form are only tracked under their full name.
        name1 = get_short_name(name)
        if name1 is None or name1 == name:
            continue
        if name1 not in cclist:
            cclist[name1] = {}
        if code[0:4] not in cclist[name1]:
            cclist[name1][code[0:4]] = []
        cclist[name1][code[0:4]].append([code, name])

    # Resolve each short name to a single 4-digit prefix.
    for key in cclist.keys():
        if key == u'吉林':
            # Hard-coded: "Jilin" is both a province and a city; pick the city.
            outmap[key] = "2202"
        elif key == u"海南":
            # Hard-coded: "Hainan" collides too; pin it to the province code.
            outmap[key] = "4600"
        elif len(cclist[key]) == 1:
            # Only one prefecture uses this short name -- unambiguous.
            # NOTE: dict.keys()[0] is Python 2 only (keys() returns a list).
            thekey = cclist[key].keys()[0]
            outmap[key] = thekey
        else:
            # Ambiguous short name: prefer the entry whose code ends in '00'
            # (the prefecture-level row itself rather than a district).
            preflist = []
            for thekey, v in cclist[key].items():
                for code, name in v:
                    if code[-2:] == '00':
                        preflist.append(code)
            if len(preflist) == 0:
                # Still ambiguous -- silently leave this short name unmapped.
                pass
            elif len(preflist) == 1:
                outmap[key] = preflist[0][0:4]
            else:
                # More than one prefecture-level match: treated as impossible.
                assert not "nani?"

    # Merge: short-name resolutions first, then full names that map to
    # exactly one prefix (ambiguous full names are dropped).
    fout = {}
    for k, v in outmap.items():
        fout[k] = v
    for k, v in fulloutmap.items():
        if len(v) == 1:
            fout[k] = v[0]
    return fout