Example #1
0
def testProxy(i):
    """Probe one proxy record and persist the outcome in the ``proxy`` table.

    ``i`` is a row-like mapping; its ``ip``, ``port``, ``proxy_type`` and
    ``id`` keys are read.  The check page is fetched through the proxy.  If a
    response object comes back and its filtered text contains ``#ok#``, the
    row is updated with the status, the response time and the values reported
    by the check page.  If no response object comes back, the row is deleted.

    The shared ``curnum`` counter is decremented exactly once per call while
    ``mylock`` is held.  Both the decrement and the lock release now happen
    even when parsing or the database call raises; the exception itself still
    propagates to the caller.
    """
    global curnum
    progress.settext('正在测试代理:%s:%s'%(i['ip'],i['port']))
    ht=kl_http.kl_http()
    ht.setproxy('','','%s:%s'%(i['ip'],i['port']))
    r=ht.geturl('http://proxy.59vip.cn')
    mylock.acquire() #Get the lock
    try:
        if r is not None:
            data=filterhtml(r.read().decode())
            if '#ok#' in data:
                jso=json.loads(data)
                db.table('proxy').where({'id':i['id']}).save({
                    'status':'1',
                    'response_time':ht.responsetime,
                    'niming':jso['niming'],
                    'proxy_ip':jso['proxy_ip'],
                    'zhenshi_ip':jso['ip'],
                    'update_time':int(time.time())
                    })
                print('代理:%s:%s %s it\'s ok! responsetime: %f  S'%(i['ip'],i['port'],i['proxy_type'],ht.responsetime))
        else:
            # No response object at all: drop the proxy row.
            db.table('proxy').where({'id':i['id']}).delete()
    finally:
        # Without this, a failure above would leave the lock held forever and
        # the counter one too high.
        curnum-=1
        mylock.release()  #Release the lock.
Example #2
0
    def caijicon(self):
        """Fetch every pending URL and store the fields extracted from it.

        Repeatedly loads up to 10 rows with ``status == 0`` from
        ``self.url_table`` (ordered by id) and stops once none are left.  Each
        page is requested until one request completes without ``lasterror``
        being set, through a proxy from ``self.get_proxy()`` when
        ``self.isproxy`` is true.  For every match of ``self.mb_con_reg`` a
        row is built from ``self.con_field`` (column name -> 1-based position
        inside the match) and inserted into ``self.content_table`` unless an
        identical row is already counted there.  Finally the URL row's
        ``status`` is set to the response code.

        Errors raised while processing a batch are printed and the batch
        query is run again.
        """
        while 1:
            try:
                dlist = db.table(
                    self.url_table).limit(10).order('id asc').where({
                        'status': 0
                    }).getarr()
                if not dlist:
                    break
                for i in dlist:
                    url = self.formaturl(i['src_url'], i['url'])

                    ht = kl_http.kl_http()
                    ht.autoUserAgent = True
                    r = None
                    content = ''
                    changshi = 1  # attempt number, only used in the log line
                    # Retry until one request completes without an error.
                    while True:
                        try:
                            print("collection content %s  request nums:%d" %
                                  (url, changshi))
                            if self.isproxy:
                                daili = self.get_proxy()
                                print_green("using proxy:%s" % daili)
                                ht.setproxy('', '', daili)
                            r = ht.geturl(url)
                            if ht.lasterror is None:
                                content = r.read().decode(self.charset)
                                break
                        except Exception as e:
                            content = ''
                            print_red(e)
                        changshi += 1
                    if content:
                        # Find the target records in the page.
                        mbcon_list = regex.findall(self.mb_con_reg, content,
                                                   regex.I | regex.S)
                        for m in mbcon_list:
                            # Fresh dict per match so rows never share state.
                            adddata = {
                                o: m[int(p) - 1]
                                for o, p in self.con_field.items()
                            }
                            resu = db.table(
                                self.content_table).where(adddata).count()
                            if resu < 1:
                                db.table(self.content_table).add(adddata)
                                print_green('added %s' % adddata)
                    # Record the response code even when the body was empty.
                    # Otherwise the row keeps status 0 and the same batch is
                    # fetched again forever.
                    db.table(self.url_table).where({
                        'id': i['id']
                    }).save({'status': r.code})
            except Exception as e:
                print_red(e)
Example #3
0
 def shenduurl(self, url, cur_shendu=1):
     """Crawl ``url``, store the target links found and recurse into sub-pages.

     The page is requested until a request finishes without ``lasterror``
     being set, switching to a proxy from ``self.get_proxy()`` on every
     attempt when the global ``isproxy`` is true.  Matches of
     ``self.mb_url_reg`` are de-duplicated and passed to ``self.adddata``
     while ``mylock`` is held.  While ``cur_shendu`` is below ``self.shendu``,
     each distinct link matching the pattern built from ``self.linkreg`` is
     crawled in a new thread, started only once the global ``threadnum`` is
     below ``maxthread``.

     ``threadnum`` is incremented on entry and decremented on exit, now also
     when an exception escapes, so a failed page no longer leaks a slot.
     """
     global threadnum
     global maxthread
     global isproxy
     threadnum += 1
     try:
         print("collection page %s depth:%d" % (url, cur_shendu))
         # NOTE(review): ``ht`` is configured here but every request below
         # goes through the module-level ``http`` client -- confirm which one
         # is intended.
         ht = kl_http.kl_http()
         ht.autoUserAgent = True
         r = None
         # Retry until one request completes without an error.
         while True:
             if isproxy:
                 daili = self.get_proxy()
                 print("using proxy:%s" % daili)
                 http.resetsession()
                 http.setproxy('', '', daili)
             r = http.geturl(url)
             if http.lasterror is None:
                 break
             print(http.lasterror)
         if r is not None:
             content = r.read().decode(self.charset)
             # Find the target URLs.
             mburl_list = regex.findall(self.mb_url_reg, content,
                                        regex.I | regex.S)
             # Remove duplicates.
             mburl_list = list(set(mburl_list))
             mylock.acquire()
             try:
                 self.adddata(mburl_list, url)
             finally:
                 # Release even if adddata raises so other threads can go on.
                 mylock.release()
             # Follow links one level deeper.
             if cur_shendu < self.shendu:
                 cur_shendu += 1
                 xiangsereg = self.linkreg.replace('00_00', self.link_tezheng)
                 sdurl_list = regex.findall(xiangsereg, content,
                                            regex.I | regex.S)
                 sdurl_list = list(set(sdurl_list))
                 for j in sdurl_list:
                     # Wait for a free slot before starting another thread.
                     while True:
                         if threadnum < maxthread:
                             threading.Thread(target=self.shenduurl,
                                              args=(
                                                  self.formaturl(url, j),
                                                  cur_shendu,
                                              )).start()
                             break
                         time.sleep(1)
     finally:
         threadnum -= 1
Example #4
0
    def caijicon(self):
        """Download each pending URL and save the records found in its page.

        Loops over batches of up to 10 rows from ``self.url_table`` whose
        ``status`` is 0 (ordered by id) until a batch comes back empty.  A
        page is requested again and again until a request ends with
        ``lasterror`` unset; when ``self.isproxy`` is true a proxy from
        ``self.get_proxy()`` is set before every attempt.  Each match of
        ``self.mb_con_reg`` becomes one row, built from ``self.con_field``
        (column name -> 1-based position inside the match), which is added to
        ``self.content_table`` only if no identical row is counted there.
        The URL row's ``status`` is then set to the response code.

        Exceptions raised while handling a batch are printed and the batch
        query is repeated.
        """
        while 1:
            try:
                dlist=db.table(self.url_table).limit(10).order('id asc').where({
                    'status':0
                    }).getarr()
                if not dlist:
                    break
                for i in dlist:
                    url=self.formaturl(i['src_url'],i['url'])

                    ht=kl_http.kl_http()
                    ht.autoUserAgent=True
                    r=None
                    content=''
                    changshi=1  # attempt number, only used in the log line
                    # Keep requesting until one attempt completes cleanly.
                    while True:
                        try:
                            print("collection content %s  request nums:%d"%(url,changshi))
                            if self.isproxy:
                                daili=self.get_proxy()
                                print_green("using proxy:%s"%daili)
                                ht.setproxy('','',daili)
                            r=ht.geturl(url)
                            if ht.lasterror is None:
                                content=r.read().decode(self.charset)
                                break
                        except Exception as e:
                            content=''
                            print_red(e)
                        changshi+=1
                    if content:
                        # Find the target records in the page.
                        mbcon_list=regex.findall(self.mb_con_reg,content, regex.I|regex.S)
                        for m in mbcon_list:
                            # Build a new dict for every match.
                            adddata={o:m[int(p)-1] for o,p in self.con_field.items()}
                            resu=db.table(self.content_table).where(adddata).count()
                            if resu<1:
                                db.table(self.content_table).add(adddata)
                                print_green('added %s'%adddata)
                    # Save the response code even for an empty body.  If the
                    # row stayed at status 0, the same batch would be selected
                    # and fetched again without end.
                    db.table(self.url_table).where({'id':i['id']}).save({'status':r.code})
            except Exception as e:
                print_red(e)
Example #5
0
 def shenduurl(self,url,cur_shendu=1):
     """Collect target links from ``url`` and crawl linked pages in threads.

     Requests the page until ``http.lasterror`` is unset after a request;
     when the global ``isproxy`` is true the session is reset and a proxy
     from ``self.get_proxy()`` is set before each attempt.  The distinct
     matches of ``self.mb_url_reg`` are handed to ``self.adddata`` under
     ``mylock``.  If ``cur_shendu`` is still below ``self.shendu``, every
     distinct link matching the pattern derived from ``self.linkreg`` is
     crawled in its own thread, started once the global ``threadnum`` drops
     below ``maxthread``.

     The global ``threadnum`` goes up by one on entry and down by one on
     exit; the decrement is in a ``finally`` so an exception cannot leak a
     slot.
     """
     global  threadnum
     global  maxthread
     global isproxy
     threadnum+=1
     try:
         print("collection page %s depth:%d"%(url,cur_shendu))
         # NOTE(review): ``ht`` is never used for a request; the module-level
         # ``http`` client is used instead -- confirm which one is intended.
         ht=kl_http.kl_http()
         ht.autoUserAgent=True
         r=None
         # Keep requesting until one attempt completes cleanly.
         while True:
             if isproxy:
                 daili=self.get_proxy()
                 print("using proxy:%s"%daili)
                 http.resetsession()
                 http.setproxy('','',daili)
             r=http.geturl(url)
             if http.lasterror is None:
                 break
             print(http.lasterror)
         if r is not None:
             content=r.read().decode(self.charset)
             # Find the target URLs.
             mburl_list=regex.findall(self.mb_url_reg,content, regex.I|regex.S)
             # Remove duplicates.
             mburl_list = list(set(mburl_list))
             mylock.acquire()
             try:
                 self.adddata(mburl_list,url)
             finally:
                 # Never keep the lock when adddata fails.
                 mylock.release()
             # Follow links one level deeper.
             if cur_shendu<self.shendu:
                 cur_shendu+=1
                 xiangsereg=self.linkreg.replace('00_00',self.link_tezheng)
                 sdurl_list=regex.findall(xiangsereg,content, regex.I|regex.S)
                 sdurl_list = list(set(sdurl_list))
                 for j in sdurl_list:
                     # Poll once per second until a thread slot is free.
                     while True:
                         if threadnum<maxthread:
                             threading.Thread(target=self.shenduurl,args=(self.formaturl(url,j),cur_shendu,)).start()
                             break
                         time.sleep(1)
     finally:
         threadnum-=1
Example #6
0
def testProxy(i):
    """Probe one proxy and append it to ``proxy.txt`` when it works.

    ``i`` is a mapping whose ``ip`` and ``port`` keys are read.  The check
    page is fetched through the proxy.  If the response text contains
    ``#ok#``, it is parsed as JSON, ``ip:port`` is appended to ``proxy.txt``
    and a success line is printed.

    The test is best-effort: any exception is swallowed, as before.  The
    shared ``curnum`` counter is now decremented exactly once per call, and
    ``mylock`` is released only after it has actually been acquired.
    """
    global curnum
    r=None
    ht=None
    try:
        progress.settext('正在测试代理:%s:%s'%(i['ip'],i['port']))
        ht=kl_http.kl_http()
        ht.setproxy('','','%s:%s'%(i['ip'],i['port']))
        r=ht.geturl('http://proxy.59vip.cn')
    except Exception:
        # A proxy that cannot be queried is treated like one with no response.
        r=None
    mylock.acquire() #Get the lock
    try:
        if r is not None:
            data=r.read().decode()
            if '#ok#' in data:
                jso=json.loads(data)
                with open('proxy.txt','a') as proxyfile:
                    proxyfile.write('%s:%s\n'%(i['ip'],i['port']))
                print('代理:%s:%s %s it\'s ok! responsetime: %f  S'%(i['ip'],i['port'],jso['niming'],ht.responsetime))
    except Exception:
        # Best effort: a malformed reply must not kill the worker.
        pass
    finally:
        # Previously skipped on errors, leaving the counter too high.
        curnum-=1
        mylock.release()  #Release the lock.
Example #7
0
def testProxy(i):
    """Check a single proxy and record it in ``proxy.txt`` if it is usable.

    Reads ``i['ip']`` and ``i['port']`` and requests the check page through
    that proxy.  A reply containing ``#ok#`` is decoded as JSON; the proxy
    address is then appended to ``proxy.txt`` and a success line is printed.

    Failures are swallowed on purpose (best-effort probing).  The shared
    ``curnum`` counter is decremented once per call under ``mylock``, and the
    lock is only ever released after it has been acquired.
    """
    global curnum
    r = None
    ht = None
    try:
        progress.settext('正在测试代理:%s:%s' % (i['ip'], i['port']))
        ht = kl_http.kl_http()
        ht.setproxy('', '', '%s:%s' % (i['ip'], i['port']))
        r = ht.geturl('http://proxy.59vip.cn')
    except Exception:
        # An unreachable proxy is handled like "no response".
        r = None
    mylock.acquire()  #Get the lock
    try:
        if r is not None:
            data = r.read().decode()
            if '#ok#' in data:
                jso = json.loads(data)
                with open('proxy.txt', 'a') as proxyfile:
                    proxyfile.write('%s:%s\n' % (i['ip'], i['port']))
                print('代理:%s:%s %s it\'s ok! responsetime: %f  S' %
                      (i['ip'], i['port'], jso['niming'], ht.responsetime))
    except Exception:
        # Best effort: ignore replies that cannot be read or parsed.
        pass
    finally:
        # Runs on success and failure alike, so the counter stays accurate
        # and the lock is never left held.
        curnum -= 1
        mylock.release()  #Release the lock.
Example #8
0
import sys, re, random

sys.path.append('../../lib/')
import kl_http, kl_db

# Shared HTTP client; requests are sent via the proxy at 127.0.0.1:8087.
http = kl_http.kl_http()
http.setproxy('', '', '127.0.0.1:8087')
# Shared MySQL handle for the ``douban`` database.
db = kl_db.mysql(
    dict(
        host='localhost',
        user='******',
        passwd='adminrootkl',
        db='douban',
        prefix='kl_',
        charset='utf8',
    ))
http.autoUserAgent = True

try:
    for m in range(1989, 2000, 1):
        for n in range(0, 550, 15):
            reurl = 'http://www.douban.com/tag/%s/movie?start=%s' % (m, n)
            print(reurl)
            r = http.geturl(reurl).read().decode()
            http.resetsession()
            #查找电影列表
            data = re.findall('movie\-list[\s\S]*?paginator', r, re.S | re.I)

            #查找单个电影
            restr = 'dl[ALL]<a.*?href="(.*?)"[ALL]</a>[ALL]<a.*?>([ALL])</a>[ALL]</dl'
            restr = restr.replace('[ALL]', '[\s\S]*?')
            item = re.findall(restr, data[0])
Example #9
0
'''
采集配置型号
'''
import kl_http,kl_db,os,json,kl_log
from postdata import postdata
# Presumably counts added records; nothing in the visible code updates it.
addnum=0
# Shared HTTP client, logger and MySQL handle for the ``qiche`` database.
http=kl_http.kl_http()
log=kl_log.kl_log('brand')
db=kl_db.mysql({
            'host':'localhost',
            'user':'******',
            'passwd':'adminrootkl',
            'db':'qiche',
            'prefix':'kl_',
            'charset':'utf8'
        })
http.autoUserAgent=True
# Headers sent with every request, one header per line.  X-Requested-With and
# Content-Type used to be fused on a single line, which made the
# X-Requested-With value wrong and left Content-Type unset.
http.setheaders('''\
Host:www.epicc.com.cn
Origin:http://www.epicc.com.cn
Referer:http://www.epicc.com.cn/ecar/proposal/normalProposal
User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
X-Requested-With:XMLHttpRequest
Content-Type: application/x-www-form-urlencoded\
''')
try:
    url='http://www.epicc.com.cn/ecar/car/carModel/getCarModelFromJYDB'
    brandlist=db.table('4shoudong').where({'status':0}).order('id asc').getarr()
    for i in brandlist:
        tjdata=postdata['peizhixinghao'].replace('[GROUPID]',i['groupId'])
        tjdata=tjdata.replace('[ENGINEDESC]',i['engineDesc'])
        r=http.posturl(url,tjdata)
Example #10
0
    def shenduurl(self, url, cur_shendu=1):
        """Crawl ``url`` and recursively follow matching links.

        Returns ``True`` without doing anything when the stripped URL is
        already counted in ``self.urled_table`` or in the ``runtimeing``
        table and ``db.lasterror`` is falsy.  Otherwise the URL is inserted
        into ``runtimeing``, the page is requested until one request
        completes without ``lasterror`` being set, the distinct matches of
        ``self.mb_url_reg`` are passed to ``self.adddata``, and (while
        ``cur_shendu`` is below ``self.shendu``, or ``self.shendu`` is 0)
        every link returned by ``self.__filterurl`` for each entry of
        ``self.link_tezheng`` is crawled by a direct recursive call.
        Afterwards the URL is added to ``self.urled_table``.

        The ``runtimeing`` marker is now removed in a ``finally`` block, so a
        failure while processing the page no longer leaves the URL marked as
        in progress.
        """
        url = url.strip()

        result = db.table(self.urled_table).where({'url': url}).count()
        runing = db.table('runtimeing').where({'url': url}).count()
        if (result > 0 or runing > 0) and not db.lasterror:
            return True

        # Register the URL as currently being crawled.
        db.table('runtimeing').add({'url': url})
        try:
            ht = kl_http.kl_http()
            ht.autoUserAgent = True
            r = None
            content = ''
            changshi = 1  # attempt number, only used in the log line
            # Retry until one request completes without an error.
            while True:
                try:
                    print("collection page %s depth:%d  request nums:%d" %
                          (url, cur_shendu, changshi))
                    if self.isproxy:
                        daili = self.get_proxy()
                        print_green("using proxy:%s" % daili)
                        ht.setproxy('', '', daili)
                    r = ht.geturl(url)
                    if ht.lasterror is None:
                        content = r.read().decode(self.charset)
                        break
                    print_red(ht.lasterror)
                except Exception as e:
                    content = ''
                    print_red(e)
                changshi += 1
            if content:
                # Find the target URLs.
                mburl_list = regex.findall(self.mb_url_reg, content,
                                           regex.I | regex.S)
                # Remove duplicates.
                mburl_list = list(set(mburl_list))
                self.adddata(mburl_list, url)
                # Follow links one level deeper.
                if cur_shendu < self.shendu or self.shendu == 0:
                    cur_shendu += 1
                    # Try every link feature pattern.
                    for x in self.link_tezheng:
                        xiangsereg = self.linkreg.replace('00_00', x)
                        sdurl_list = regex.findall(xiangsereg, content,
                                                   regex.I | regex.S)
                        sdurl_list = self.__filterurl(sdurl_list, url)
                        for j in sdurl_list:
                            # The thread fan-out that used to live here was
                            # guarded by ``if False`` and never ran; links
                            # are always crawled in the calling thread.
                            self.shenduurl(self.formaturl(url, j), cur_shendu)

            # Record the URL as completely crawled.
            db.table(self.urled_table).add({'url': url})
        finally:
            # Always clear the in-progress marker, also on failure.
            db.table('runtimeing').where({'url': url}).delete()
        # NOTE(review): decremented after every completed call, including
        # recursive ones, although nothing in this method increments it --
        # confirm against the caller that starts the crawl.
        self.threadnum -= 1
Example #11
0
import sys
import urllib
sys.path.append('./lib/')
import kl_log,kl_db,kl_http


if __name__ == '__main__':
    # Script entry point: post to the site, print the result, log success
    # and wait for the user before exiting.
    try:
        page = kl_http.kl_http()
        reply = page.posturl('http://www.0yuanwang.com')
        print(reply)
        kl_log.write('success')
        input('按任意键继续...')
    except KeyboardInterrupt as interrupt:
        # Ctrl+C: print the exit message followed by the exception.
        print('程序已经退出')
        print(interrupt)
Example #12
0
    def shenduurl(self,url,cur_shendu=1):
        """Collect target links from ``url`` and descend into linked pages.

        If the stripped URL is already counted in ``self.urled_table`` or in
        the ``runtimeing`` table (and ``db.lasterror`` is falsy) the method
        returns ``True`` immediately.  Otherwise it inserts the URL into
        ``runtimeing``, requests the page until a request ends with
        ``lasterror`` unset, passes the distinct ``self.mb_url_reg`` matches
        to ``self.adddata`` and, while ``cur_shendu`` is below
        ``self.shendu`` or ``self.shendu`` is 0, recursively crawls every link
        that ``self.__filterurl`` returns for each ``self.link_tezheng``
        entry.  At the end the URL is added to ``self.urled_table``.

        The ``runtimeing`` marker is removed in ``finally``, so an error
        while handling the page cannot leave the URL marked as in progress.
        """
        url=url.strip()

        result=db.table(self.urled_table).where({'url':url}).count()
        runing=db.table('runtimeing').where({'url':url}).count()
        if (result>0 or runing>0 )and not db.lasterror:
            return True

        # Register the URL as currently being crawled.
        db.table('runtimeing').add({'url':url})
        try:
            ht=kl_http.kl_http()
            ht.autoUserAgent=True
            r=None
            content=''
            changshi=1  # attempt number, only used in the log line
            # Keep requesting until one attempt completes cleanly.
            while True:
                try:
                    print("collection page %s depth:%d  request nums:%d"%(url,cur_shendu,changshi))
                    if self.isproxy:
                        daili=self.get_proxy()
                        print_green("using proxy:%s"%daili)
                        ht.setproxy('','',daili)
                    r=ht.geturl(url)
                    if ht.lasterror is None:
                        content=r.read().decode(self.charset)
                        break
                    print_red(ht.lasterror)
                except Exception as e:
                    content=''
                    print_red(e)
                changshi+=1
            if content:
                # Find the target URLs.
                mburl_list=regex.findall(self.mb_url_reg,content, regex.I|regex.S)
                # Remove duplicates.
                mburl_list = list(set(mburl_list))
                self.adddata(mburl_list,url)
                # Follow links one level deeper.
                if cur_shendu<self.shendu or self.shendu==0:
                    cur_shendu+=1
                    # Try every link feature pattern.
                    for x in self.link_tezheng:
                        xiangsereg=self.linkreg.replace('00_00',x)
                        sdurl_list=regex.findall(xiangsereg,content, regex.I|regex.S)
                        sdurl_list = self.__filterurl(sdurl_list,url)
                        for j in sdurl_list:
                            # The former thread-spawning branch sat behind
                            # ``if False`` and was unreachable; links are
                            # crawled in the calling thread.
                            self.shenduurl(self.formaturl(url,j),cur_shendu)

            # Record the URL as completely crawled.
            db.table(self.urled_table).add({'url':url})
        finally:
            # Clear the in-progress marker on success and on failure.
            db.table('runtimeing').where({'url':url}).delete()
        # NOTE(review): every completed call decrements the counter, including
        # recursive ones, while this method never increments it -- confirm
        # against the caller that starts the crawl.
        self.threadnum-=1