Exemple #1
0
def init_data():
    br = BasicRequests()
    br.select_user_agent('firefox')
    s = open('af').read(64 * 1024)
    odata = []
    for m in re.finditer('<a (.*?)>(.*?)</a>', s, re.S):
        name = m.group(2)
        attrs = m.group(1)
        url = None
        prov = None
        for m1 in re.finditer('([a-z][a-z0-9]+)="(.*?)"', attrs, re.S):
            n, v = m1.group(1), m1.group(2)
            if n == 'href':
                url = v
            if n == "prov":
                prov = v
        con = br.request_url(url)
        siteurl = 'unknown'
        if con is not None:
            siteurl = con.request.url
        print name, con.request.url
        odata.append({
            'name': name,
            'url': siteurl,
            'imgurl': '',
            'prov': prov
        })
    print json.dumps(odata, ensure_ascii=0, indent=4)
Exemple #2
0
def post_for_proxy():
    """Fetch a proxy list from the kuaidaili API and return the decoded payload.

    Returns the parsed JSON document, or None when the request did not
    produce a truthy response.
    """
    import json  # local import: the file's top-level imports are not visible here

    req = BasicRequests()
    con = req.request_url(
        'http://dev.kuaidaili.com/api/getproxy/?orderid=925817981728018&num=50&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=2&an_an=1&an_ha=1&sp1=1&quality=1&sort=1&format=json&sep=1'
    )
    if not con:
        return None
    # SECURITY: this used to be eval(con.text), which executes whatever the
    # remote server sends back (and fails on JSON true/false/null).  The URL
    # requests format=json, so decode the body as JSON instead.
    return json.loads(con.text)
Exemple #3
0
 def resolve(self, getimg, fc=None):
     while True:
         imgcon = getimg(dbgdata=fc)
         f = StringIO.StringIO()
         f.write(imgcon)
         f.seek(0)
         h = BasicRequests()
         imgcode = None
         if fc is not None and isinstance(fc, dict) and "type" in fc:
             response = h.request_url(self.server, files={'file': imgcon}, data={"province": self._type})
             result = response.text.strip()
             if '"valid":true' in result:
                 try:
                     result = json.loads(result)
                     imgcode = result["answer"]
                     if imgcode == None or imgcode == "":
                         print "验证码图片识别错误 imgcode==None或''"
                         continue
                 except Exception as err:
                     print "验证码图片识别错误,重新校验...,result:", result, "错误原因:", err
                     time.sleep(1)
                     continue
         else:
             url = "%s?type=%s" % (self.server, self._type)
             response = h.request_url(url, files={'file': f})
             imgcode = response.text.strip()
             if imgcode == '<fail>':
                 print "验证码图片识别错误 imgcode==<fail>"
                 continue
         if isinstance(fc, dict):
             fc['content'] = imgcon
             fc['code'] = imgcode
         return imgcode
Exemple #4
0
def find_ipin_proxy():
    ff = IpinFactory()
    prs = ff.getProxyList()
    s = BasicRequests()
    #print json.dumps(prs, ensure_ascii=0).encode('utf-8')
    res = {}
    for p in prs:
        print "trying", p
        auth, proxies = ff.genRequestsParam(p)
        con = s.request_url("http://ip.cn/",
                            auth=auth,
                            proxies=proxies,
                            timeout=6)
        if con is None:
            continue
        m = re.search("<code>(.*?)</code>", con.text)
        if m:
            sys.stderr.write("%s %s\n" % (p['host'], m.group(1)))
            res[m.group(1)] = "%s:%s:%s" % (p.get('host'), p.get('port'),
                                            p.get('password'))
        p2, proxies = ff.genTinyProxy(p)
        print proxies
        con = s.request_url("http://ip.cn/", proxies=proxies, timeout=5)
        if con is None:
            continue
        m = re.search("<code>(.*?)</code>", con.text)
        if m:
            sys.stderr.write("%s %s\n" % (p['host'], m.group(1)))
            res[m.group(1)] = p2
    print "\n".join(res.values())
Exemple #5
0
 def __init__(self, sf='jobs'):
     """Set up the zjsfgkw.cn endpoints, the date window and the save file.

     ``sf`` is stored as ``self.save_file``.  The date window runs from
     19700101 to the current local date, both formatted as ``YYYYMMDD``.
     """
     BasicRequests.__init__(self)
     self.save_file = sf
     # Date window, formatted as %Y%m%d.
     self.start_date = '19700101'
     self.end_date = time.strftime('%Y%m%d')
     # Site endpoints.
     self._main_url = 'http://www.zjsfgkw.cn/Document/JudgmentBook'
     self._court_search_url = 'http://www.zjsfgkw.cn/Judges/GetCountByCountId'
     self._book_search_url = 'http://www.zjsfgkw.cn/document/JudgmentSearch'
Exemple #6
0
 def get_search_url(opts):
     """Build the zhaopin resume-search URL with every option in ``opts`` applied.

     Each ``name``/``value`` pair of ``opts`` is merged into the base URL via
     ``BasicRequests.compose_url``; the resulting URL is returned.
     """
     # SF_1_1_27=0 selects Chinese resumes and SF_1_1_27=1 English ones, but
     # judging from the site's JS, English resumes cannot actually be searched.
     # An earlier variant of the base URL used SF_1_1_7=7,9 instead of 8,9.
     composer = BasicRequests()
     search_url = 'http://rdsearch.zhaopin.com/Home/ResultForCustom?SF_1_1_7=8,9&orderBy=DATE_MODIFIED,1&pageSize=60&SF_1_1_27=0&exclude=1'
     for option_name, option_value in opts.items():
         search_url = composer.compose_url(search_url, option_name, option_value)
     return search_url
Exemple #7
0
def test_ps():
    """Fetch one 51job posting and save it through ``PageStore51`` in test mode."""
    page_url = "http://jobs.51job.com/beijing-hdq/70320056.html?s=0"

    store = PageStore51()
    store.testmode = True

    client = BasicRequests()
    client.select_user_agent('firefox')
    response = client.request_url(page_url)

    fetched_at = int(time.time())
    store.save(fetched_at, "jd_51job://", page_url, response.text)
Exemple #8
0
 def test_login(self):
     req = BasicRequests()
     con = req.request_url(self.main_url + 'login',
                           date={
                               'username': '******',
                               'password': '******'
                           })
     print con.headers
     print con.text
Exemple #9
0
def test_parse_time():
    request = BasicRequests()
    con = request.request_url(
        'http://www.zjsfgkw.cn/attachment/documentbook/2016-04-05/0225-0229/html/671a34a7-b068-4025-af13-d9fe4c28ce6a.html'
    )
    m = re.search(
        ur'[一二三四五六七八九〇零○十]{4}年[一二三四五六七八九〇十○]{1,2}月[一二三四五六七八九〇零○十]{1,3}日',
        con.text)
    if m:
        print date_cs2num(m.group())
Exemple #10
0
 def thread_init(self, tid):
     """Give the calling thread its own ``BasicRequests`` bound to one proxy.

     Takes a proxy from ``self.proxyq`` (blocking), registers it as the only
     entry of a fresh request object with automatic proxy switching turned
     off, stores that object as ``self._tls.req`` and logs the assignment.
     """
     # self.proxyq is thread-safe, so no lock is needed around the get.
     proxy = self.proxyq.get(True)

     worker_req = BasicRequests()
     worker_req.sp_proxies[proxy] = 0
     worker_req._cur_proxy_index = 0
     worker_req._auto_change_proxy = False
     self._tls.req = worker_req

     message = "Thread%d's request prepared..Proxy:%s" % (tid, proxy)
     with self.locker:
         Log.info(message)
Exemple #11
0
def try_proxy(proxy, url='http://gaokao.chsi.com.cn', tag=u'阳光高考'):
    req = BasicRequests()
    req.set_proxy(proxy, 0, False)
    # con = req.request_url('http://gk.chsi.com.cn/recruit/listSpecBySchool.do?yxdm=11055&start=0 ')
    con = req.request_url(url, timeout=5)
    if con:
        m = re.search(r'<title>[^<]*<\/title>', con.text)
        if m:
            print m.group()
        return re.search(tag, con.text)
Exemple #12
0
 def sub_pages(url, con):
     """Yield the URLs of result pages 2..N for the search at ``url``.

     The hit count comes from ``CVZhilianUtil.get_count`` and is capped at
     4000; pages hold 60 entries.  Nothing is yielded when there is only
     one page.
     """
     total = min(CVZhilianUtil.get_count(url, con), 4000)
     # Ceiling division by the page size of 60.
     last_page = (total + 60 - 1) / 60
     if last_page < 2:
         return
     composer = BasicRequests()
     for page_no in range(2, last_page + 1):
         yield composer.compose_url(url, 'pageIndex', page_no)
Exemple #13
0
def test_extract_inner_paper_url():
    rq = BasicRequests()
    con = rq.request_url(
        'http://www.zjsfgkw.cn/document/JudgmentDetail/4177773')
    content = re.search(r'<div class="books_detail_header">.*</IFRAME>',
                        con.text, re.S)
    m = re.search(r'src="([^"]+)"', content.group())
    if m:
        print m.group(1)
    else:
        print content
Exemple #14
0
def test_search():
    a = BasicRequests()
    while True:
        data = {
            'docids1': "1,2,3,4,5,6,6109234,6110168,11070364",
            "keywords": "武汉"
        }
        con = a.request_url("http://localhost:4096/search?hehe=1", data=data)
        if con is not None:
            print con.code, con.text
        time.sleep(10)
Exemple #15
0
 def filter_with_speed(proxies, url='http://www.baidu.com', timeout=10):
     """Return the proxies through which ``url`` could be fetched.

     Each proxy is registered on a shared ``BasicRequests`` object and tried
     once with the given ``timeout``.  A proxy is kept when the request
     neither raises nor returns a falsy response.  Input order is preserved.
     """
     usable = []
     client = BasicRequests()
     for candidate in proxies:
         client.set_proxy(candidate, len(client.sp_proxies), False)
         try:
             response = client.request_url(url, timeout=timeout)
         except Exception:
             # A raising request counts as a failed proxy.
             continue
         if response:
             usable.append(candidate)
     return usable
Exemple #16
0
 def get_child_court(self, court):
     req = BasicRequests()
     req.set_proxy(self.proxy)
     time.sleep(1)
     print 'fetching child court', court['key']
     con = req.request_url('http://wenshu.court.gov.cn/Index/GetChildAllCourt',
                           data={'keyCodeArrayStr': court['key']})
     if '<' in con.text:
         return
     court = self.parse_results(con)
     for c in court:
         self.child_courts.append(c)
Exemple #17
0
 def get_court(self):
     req = BasicRequests()
     req.set_proxy(self.proxy)
     for p in self.provinces:
         time.sleep(1)
         print 'fetch province', p['name']
         con = req.request_url('http://wenshu.court.gov.cn/Index/GetCourt', data={'province': p['name']})
         if '<' in con.text:
             print 'invalid response'
             continue
         court = self.parse_results(con)
         for c in court:
             self.courts.append(c)
Exemple #18
0
def runjs(url):
    rq = BasicRequests()
    con = rq.request_url(url, data={})
    if con:
        print con.text
        m = re.findall(r'<script[^>]*>(.+?)</script>', con.text, re.S)
        if m:
            for js in m:
                if js == '':
                    continue
                print js
                sc = "document = {set cookie(a){console.log(a);}}, window = {innerWidth: 1024, innerHeight: 768, screenX: 200, screenY: 100, screen: {width: 1024, height: 768}}\n"
                sc += js
                rv = spider.util.runjs(sc)
                print 'my results:'
                print rv
Exemple #19
0
 def _do_requests(self, url, **kwargs):
     """Delegate to ``BasicRequests._do_requests`` and record the response cookies.

     When the base call returns a response, its cookies are added to the
     ``CurlCookieJar`` stored as ``self._curltls.cookies`` (created on first
     use).  The base call's return value is passed through unchanged.
     """
     rv = BasicRequests._do_requests(self, url, **kwargs)
     if rv is None:
         return rv
     # TODO: replace SimpleCookie with someone better.
     jar = getattr(self._curltls, 'cookies', None)
     if jar is None:
         jar = CurlCookieJar()
     jar.add_list(rv.cookies)
     self._curltls.cookies = jar
     return rv
Exemple #20
0
def test_proxy(proxy, url, count=10):
    """Measure the average response time of a proxy over ``count`` requests.

    ``proxy`` is a dict whose ``'p'`` entry is passed to ``set_proxy``.  If
    at least one request returns a truthy response, the mean duration in
    seconds of the successful requests is stored in ``proxy['v']``;
    otherwise ``proxy`` is left untouched.  Returns None.
    """
    c = count
    rq = BasicRequests()
    rq.set_proxy(proxy['p'])
    total = 0
    success = 0
    while c > 0:
        try:
            s = time.time()
            con = rq.request_url(url)
            t = time.time() - s
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit and made the loop impossible to interrupt.
            con = None
            t = 0
        c -= 1
        if con:
            success += 1
            total += t
    if success > 0:
        proxy['v'] = total / success
Exemple #21
0
 def test_proxy_speed(url, proxy, t=60):
     """Count how many requests to ``url`` complete through ``proxy`` within ``t`` seconds.

     Requests use a 5 second timeout and are issued back to back.  Every
     request that does not raise is counted, whatever it returns.
     """
     client = BasicRequests()
     client.set_proxy(proxy, 0, False)
     started = time.time()
     completed = 0
     while time.time() - started <= t:
         try:
             client.request_url(url, timeout=5)
         except Exception:
             # A failed request is simply not counted.
             continue
         completed += 1
     return completed
Exemple #22
0
#!/usr/bin/env python
# -*- coding:utf8 -*-
from spider.httpreq import BasicRequests

if "__main__" == __name__:
    rq = BasicRequests()
    rq.set_proxy('106.75.134.190:18888:ipin:ipin1234')
    con = rq.request_url('http://www.zjsfgkw.cn/document/JudgmentDetail/4062962')
    if con:
        print con.text
Exemple #23
0
 def __init__(self):
     """Initialise the base requester and the shenpan.cn listing state."""
     BasicRequests.__init__(self)
     # Counter and parameter dict both start empty.
     self.count = 0
     self.params = {}
     # Listing URL; it ends with an empty ``typeString=`` value.
     self.url = 'http://www.shenpan.cn/cpws/writopenlist.aspx?typeString='
Exemple #24
0
 def __init__(self):
     """Initialise the base requester, the two chsi.com.cn URL templates and the school list."""
     BasicRequests.__init__(self)
     self._schools = []
     # Both templates take one integer, the ``start`` value (``%d``).
     self._url_format = 'http://gaokao.chsi.com.cn/zyk/pub/myd/schAppraisalTop.action?start=%d'
     self._url_format2 = 'http://gaokao.chsi.com.cn/zsgs/zhangcheng/listVerifedZszc--method-index,lb-1,start-%d.dhtml'
Exemple #25
0
 def __init__(self, url):
     """Initialise the base requester and store ``url`` as ``self.url``."""
     BasicRequests.__init__(self)
     target_url = url
     self.url = target_url
Exemple #26
0
        'format':
        'http://gaokao.chsi.com.cn/zsgs/zhangcheng/listVerifedZszc--method-index,lb-1,start-%d.dhtml',
        'url_reg':
        r'<a href="\/zsgs\/zhangcheng\/listZszc\-\-schId\-\d+\.dhtml".*>([^<]*)<\/a>',
        'page_reg': r'<a[^>]*start\-\d+[^>]*>(\d+)<',
        'pagesize': 100
    }]
    url_format = [
        'http://gaokao.chsi.com.cn/zyk/pub/myd/schAppraisalTop.action?start=%d',
        'http://gaokao.chsi.com.cn/zsgs/zhangcheng/listVerifedZszc--method-index,lb-1,start-%d.dhtml'
    ]
    url_pattern = [
        r'<a href="#" onclick="doDialog\(\'\d+\',\'([^\']+)\'\);',
        r'<a href="\/zsgs\/zhangcheng\/listZszc\-\-schId\-\d+\.dhtml".*>([^<]*)<\/a>'
    ]
    page_pattern = [
        r'<a[^>]*start=\d+[^>]*>(\d+)<', r'<a[^>]*start\-\d+[^>]*>(\d+)<'
    ]
    request = BasicRequests()
    schools = []
    for site in sites:
        schools += GetSchoolName.fetch_schools(request, site['format'],
                                               site['pagesize'],
                                               site['url_reg'],
                                               site['page_reg'])
    ss = []
    for c in schools:
        if c not in ss:
            ss.append(c)
    GetSchoolName.save(schools)
Exemple #27
0
 def __init__(self):
     """Initialise the base requester and the ftcourt.gov.cn listing state."""
     BasicRequests.__init__(self)
     # Counter and parameter dict both start empty.
     self.count = 0
     self.params = {}
     self.url = 'http://www.ftcourt.gov.cn/cpwspt/writopenlist.aspx?cls=0'
Exemple #28
0
 def __init__(self):
     """Initialise the base requester and attach a new ``requests.Session``."""
     BasicRequests.__init__(self)
     session = requests.Session()
     self.session = session
Exemple #29
0
 def test_find(self):
     req = BasicRequests()
     con = req.request_url(self.main_url + '?key=' + 'mumas')
     print con.text
     con = req.request_url(self.main_url + '?key=' + 'skiloop')
     print con.text
Exemple #30
0
def load_url(url):
    br = BasicRequests()
    con = br.request_url(url)
    print con.text