def testProxy(i):
    """Probe one proxy and record the outcome in the ``proxy`` table.

    A request to ``http://proxy.59vip.cn`` is sent through the proxy
    ``i['ip']:i['port']``.  When a response is obtained and its filtered
    body contains the ``#ok#`` marker, the body is parsed as JSON and the
    row ``i['id']`` is updated with status, response time, anonymity and
    IP details.  When the marker is missing the row is deleted.

    Args:
        i: Mapping describing the proxy; the keys ``ip``, ``port``,
            ``proxy_type`` and ``id`` are read.

    Side effects:
        Updates the progress text, writes to the database, prints a line
        for a working proxy and decrements the module-level ``curnum``
        (presumably the number of checks still in flight -- confirm
        against the caller).
    """
    global curnum
    progress.settext('正在测试代理:%s:%s'%(i['ip'],i['port']))
    ht = kl_http.kl_http()
    ht.setproxy('', '', '%s:%s' % (i['ip'], i['port']))
    r = ht.geturl('http://proxy.59vip.cn')

    mylock.acquire()  # serialise database access and the counter update
    try:
        if r is not None:
            data = filterhtml(r.read().decode())
            if data.find('#ok#') != -1:
                jso = json.loads(data)
                db.table('proxy').where({'id': i['id']}).save({
                    'status': '1',
                    'response_time': ht.responsetime,
                    'niming': jso['niming'],
                    'proxy_ip': jso['proxy_ip'],
                    'zhenshi_ip': jso['ip'],
                    'update_time': int(time.time())
                })
                print('代理:%s:%s %s it\'s ok! responsetime: %f S'%(i['ip'],i['port'],i['proxy_type'],ht.responsetime))
            else:
                # NOTE(review): the original layout was lost; this ``else``
                # is assumed to pair with the marker check (nearest ``if``),
                # not with the ``r`` check -- confirm.
                db.table('proxy').where({'id': i['id']}).delete()
    finally:
        # Always undo the bookkeeping: a read/decode/JSON/database error
        # used to leave the lock held forever and the counter stale.
        curnum -= 1
        mylock.release()
def caijicon(self):
    """Download every pending URL and store the fields extracted from it.

    Rows with ``status`` 0 are read from ``self.url_table`` ten at a time
    (ascending ``id``) until none are left.  Each page is requested until a
    request finishes without ``lasterror``; every match of
    ``self.mb_con_reg`` is mapped to columns through ``self.con_field``
    (column name -> 1-based capture group) and inserted into
    ``self.content_table`` unless an identical row already exists.  The URL
    row's ``status`` is then set to the response code.

    Any exception raised while handling a batch is printed with
    ``print_red`` and the outer loop starts over.
    """
    while 1:
        try:
            query = db.table(self.url_table).limit(10).order('id asc')
            pending = query.where({'status': 0}).getarr()
            if not pending:
                break
            for row in pending:
                url = self.formaturl(row['src_url'], row['url'])
                client = kl_http.kl_http()
                client.autoUserAgent = True
                resp = None
                page = ''
                attempt = 1
                # Retry without limit until one request reports no error.
                while True:
                    try:
                        print("collection content %s request nums:%d" % (url, attempt))
                        if self.isproxy:
                            proxy_addr = self.get_proxy()
                            print_green("using proxy:%s" % proxy_addr)
                            client.setproxy('', '', proxy_addr)
                        resp = client.geturl(url)
                        if client.lasterror == None:
                            page = resp.read().decode(self.charset)
                            break
                        attempt += 1
                    except Exception as err:
                        page = ''
                        print_red(err)
                        attempt += 1
                if page:
                    # Locate the target content.
                    matches = regex.findall(self.mb_con_reg, page,
                                            regex.I | regex.S)
                    record = {}
                    for match in matches:
                        for column, group_no in self.con_field.items():
                            record[column] = match[int(group_no) - 1]
                        existing = db.table(self.content_table).where(record).count()
                        if existing < 1:
                            db.table(self.content_table).add(record)
                            print_green('added %s' % record)
                # Mark the URL as processed using the HTTP status code.
                db.table(self.url_table).where({'id': row['id']}).save(
                    {'status': resp.code})
        except Exception as err:
            print_red(err)
def shenduurl(self, url, cur_shendu=1):
    """Crawl ``url``, store the target links found and follow similar links.

    The page is requested until a request finishes without ``lasterror``.
    Matches of ``self.mb_url_reg`` are de-duplicated and handed to
    ``self.adddata`` while ``mylock`` is held.  While ``cur_shendu`` is
    below ``self.shendu``, links matching ``self.linkreg`` (with ``00_00``
    replaced by ``self.link_tezheng``) are crawled in new threads, one
    level deeper, waiting whenever ``threadnum`` has reached ``maxthread``.

    Args:
        url: Address of the page to crawl.
        cur_shendu: Depth of this page; defaults to 1.

    Side effects:
        Increments the module-level ``threadnum`` on entry and decrements
        it on exit, including when an exception propagates.
    """
    global threadnum
    global maxthread
    global isproxy
    threadnum += 1
    try:
        print("collection page %s depth:%d" % (url, cur_shendu))
        # Use a client owned by this call.  The original built ``ht`` but
        # sent every request through a module-level ``http`` object, so
        # concurrent threads overwrote each other's proxy and ``lasterror``.
        ht = kl_http.kl_http()
        ht.autoUserAgent = True
        r = None
        while True:
            if isproxy:
                daili = self.get_proxy()
                print("using proxy:%s" % daili)
                ht.resetsession()
                ht.setproxy('', '', daili)
            r = ht.geturl(url)
            if ht.lasterror is None:
                break
            print(ht.lasterror)

        if r is not None:
            content = r.read().decode(self.charset)
            # Find the target URLs and drop duplicates.
            mburl_list = list(set(
                regex.findall(self.mb_url_reg, content, regex.I | regex.S)))
            mylock.acquire()
            try:
                self.adddata(mburl_list, url)
            finally:
                mylock.release()

            # Follow similar links one level deeper.
            if cur_shendu < self.shendu:
                cur_shendu += 1
                xiangsereg = self.linkreg.replace('00_00', self.link_tezheng)
                sdurl_list = list(set(
                    regex.findall(xiangsereg, content, regex.I | regex.S)))
                for j in sdurl_list:
                    # Wait for a free thread slot before starting a worker.
                    while True:
                        if threadnum < maxthread:
                            threading.Thread(target=self.shenduurl, args=(
                                self.formaturl(url, j),
                                cur_shendu,
                            )).start()
                            break
                        time.sleep(1)
    finally:
        # Release the slot even on failure so waiting workers are not
        # blocked forever by a stale counter.
        threadnum -= 1
def caijicon(self):
    """Fetch each pending URL and save the content fields parsed from it.

    Works through ``self.url_table`` in batches of ten rows whose
    ``status`` is 0, ordered by ``id``, and stops when a batch comes back
    empty.  A page is re-requested until the client reports no
    ``lasterror``.  Each ``self.mb_con_reg`` match becomes a row built via
    ``self.con_field`` (column name -> 1-based capture group); the row is
    added to ``self.content_table`` only when no equal row exists.  The URL
    row finally gets the response code as its ``status``.

    Exceptions raised inside a batch are printed with ``print_red`` and the
    batch loop continues.
    """
    while 1:
        try:
            batch = (db.table(self.url_table)
                     .limit(10)
                     .order('id asc')
                     .where({'status': 0})
                     .getarr())
            if not batch:
                break
            for item in batch:
                url = self.formaturl(item['src_url'], item['url'])
                fetcher = kl_http.kl_http()
                fetcher.autoUserAgent = True
                response = None
                body = ''
                tries = 1
                # Keep requesting until a request completes without error.
                while True:
                    try:
                        print("collection content %s request nums:%d"%(url,tries))
                        if self.isproxy:
                            proxy = self.get_proxy()
                            print_green("using proxy:%s"%proxy)
                            fetcher.setproxy('', '', proxy)
                        response = fetcher.geturl(url)
                        if fetcher.lasterror == None:
                            body = response.read().decode(self.charset)
                            break
                        tries += 1
                    except Exception as exc:
                        body = ''
                        print_red(exc)
                        tries += 1
                if body:
                    # Extract the target content.
                    found = regex.findall(self.mb_con_reg, body,
                                          regex.I | regex.S)
                    fields = {}
                    for groups in found:
                        for name, position in self.con_field.items():
                            fields[name] = groups[int(position) - 1]
                        duplicates = db.table(self.content_table).where(fields).count()
                        if duplicates < 1:
                            db.table(self.content_table).add(fields)
                            print_green('added %s'%fields)
                # Record the HTTP status code on the URL row.
                db.table(self.url_table).where({'id': item['id']}).save(
                    {'status': response.code})
        except Exception as exc:
            print_red(exc)
def shenduurl(self,url,cur_shendu=1):
    """Crawl ``url``, save its target links and spawn crawls of similar links.

    The request is repeated until it completes without ``lasterror``.
    De-duplicated matches of ``self.mb_url_reg`` are passed to
    ``self.adddata`` under ``mylock``.  If ``cur_shendu`` is less than
    ``self.shendu``, every link matching ``self.linkreg`` (placeholder
    ``00_00`` replaced by ``self.link_tezheng``) is crawled in its own
    thread at the next depth, once ``threadnum`` is below ``maxthread``.

    Args:
        url: Address of the page to crawl.
        cur_shendu: Depth of this page; defaults to 1.

    Side effects:
        The module-level ``threadnum`` is incremented on entry and always
        decremented on exit.
    """
    global threadnum
    global maxthread
    global isproxy
    threadnum+=1
    try:
        print("collection page %s depth:%d"%(url,cur_shendu))
        # Requests go through this call's own client.  Previously ``ht``
        # was created but unused and a shared module-level ``http`` object
        # was used, letting threads clobber each other's proxy and
        # ``lasterror`` state.
        ht=kl_http.kl_http()
        ht.autoUserAgent=True
        r=None
        while True:
            if isproxy:
                daili=self.get_proxy()
                print("using proxy:%s"%daili)
                ht.resetsession()
                ht.setproxy('','',daili)
            r=ht.geturl(url)
            if ht.lasterror is None:
                break
            print(ht.lasterror)

        if r is not None:
            content=r.read().decode(self.charset)
            # Find the target URLs, without duplicates.
            mburl_list=list(set(regex.findall(self.mb_url_reg,content,regex.I|regex.S)))
            mylock.acquire()
            try:
                self.adddata(mburl_list,url)
            finally:
                mylock.release()

            # Depth search.
            if cur_shendu<self.shendu:
                cur_shendu+=1
                xiangsereg=self.linkreg.replace('00_00',self.link_tezheng)
                sdurl_list=list(set(regex.findall(xiangsereg,content,regex.I|regex.S)))
                for j in sdurl_list:
                    # Poll once a second until a thread slot is free.
                    while True:
                        if threadnum<maxthread:
                            threading.Thread(target=self.shenduurl,args=(self.formaturl(url,j),cur_shendu,)).start()
                            break
                        time.sleep(1)
    finally:
        # A failure must not leak the slot, or waiting workers stall.
        threadnum-=1
def testProxy(i):
    """Probe one proxy and append it to ``proxy.txt`` when it works.

    A request to ``http://proxy.59vip.cn`` is sent through the proxy
    ``i['ip']:i['port']``.  If a response is obtained and its body contains
    the ``#ok#`` marker, the body is parsed as JSON, ``ip:port`` is appended
    to ``proxy.txt`` and a success line is printed.

    Args:
        i: Mapping describing the proxy; the keys ``ip`` and ``port`` are
            read.

    Side effects:
        Updates the progress text, may append to ``proxy.txt`` and always
        decrements the module-level ``curnum`` (presumably the number of
        checks still in flight -- confirm against the caller).  Errors are
        swallowed on purpose: a failing proxy is simply not recorded.
    """
    global curnum
    r = None
    try:
        progress.settext('正在测试代理:%s:%s'%(i['ip'],i['port']))
        ht = kl_http.kl_http()
        ht.setproxy('', '', '%s:%s' % (i['ip'], i['port']))
        r = ht.geturl('http://proxy.59vip.cn')
    except Exception:
        # Best effort: treat a failed probe like "no response".  The
        # original handler released ``mylock`` here although it had not
        # been acquired yet, which raises on an unheld lock.
        r = None

    mylock.acquire()  # serialise file writes and the counter update
    try:
        if r is not None:
            data = r.read().decode()
            if data.find('#ok#') != -1:
                jso = json.loads(data)
                with open('proxy.txt', 'a') as proxyfile:
                    proxyfile.write('%s:%s\n' % (i['ip'], i['port']))
                print('代理:%s:%s %s it\'s ok! responsetime: %f S'%(i['ip'],i['port'],jso['niming'],ht.responsetime))
    except Exception:
        # Deliberately ignored, as in the original: an unreadable or
        # malformed reply just means the proxy is not recorded.
        pass
    finally:
        # Runs on every path so a failure cannot leave the counter stale
        # or the lock held.
        curnum -= 1
        mylock.release()
def testProxy(i):
    """Check a single proxy and log it to ``proxy.txt`` if it is usable.

    The proxy ``i['ip']:i['port']`` is used to request
    ``http://proxy.59vip.cn``.  When a response arrives whose body contains
    ``#ok#``, the body is decoded as JSON, ``ip:port`` is appended to
    ``proxy.txt`` and a success line is printed.

    Args:
        i: Mapping describing the proxy; the keys ``ip`` and ``port`` are
            read.

    Side effects:
        Sets the progress text, may append to ``proxy.txt`` and always
        decrements the module-level ``curnum`` (presumably the number of
        outstanding checks -- confirm against the caller).  Exceptions are
        intentionally suppressed so one bad proxy cannot stop the run.
    """
    global curnum
    r = None
    try:
        progress.settext('正在测试代理:%s:%s' % (i['ip'], i['port']))
        ht = kl_http.kl_http()
        ht.setproxy('', '', '%s:%s' % (i['ip'], i['port']))
        r = ht.geturl('http://proxy.59vip.cn')
    except Exception:
        # The lock is not held yet, so there is nothing to release here;
        # the original released it anyway, which fails on an unheld lock.
        r = None

    mylock.acquire()  # guards the shared file and counter
    try:
        if r is not None:
            data = r.read().decode()
            if data.find('#ok#') != -1:
                jso = json.loads(data)
                with open('proxy.txt', 'a') as proxyfile:
                    proxyfile.write('%s:%s\n' % (i['ip'], i['port']))
                print('代理:%s:%s %s it\'s ok! responsetime: %f S' % (i['ip'], i['port'], jso['niming'], ht.responsetime))
    except Exception:
        # Best effort, matching the original: a reply that cannot be read
        # or parsed only means this proxy is skipped.
        pass
    finally:
        # Executed on success and failure alike, keeping the counter and
        # the lock consistent.
        curnum -= 1
        mylock.release()
import sys, re, random sys.path.append('../../lib/') import kl_http, kl_db http = kl_http.kl_http() http.setproxy('', '', '127.0.0.1:8087') db = kl_db.mysql({ 'host': 'localhost', 'user': '******', 'passwd': 'adminrootkl', 'db': 'douban', 'prefix': 'kl_', 'charset': 'utf8' }) http.autoUserAgent = True try: for m in range(1989, 2000, 1): for n in range(0, 550, 15): reurl = 'http://www.douban.com/tag/%s/movie?start=%s' % (m, n) print(reurl) r = http.geturl(reurl).read().decode() http.resetsession() #查找电影列表 data = re.findall('movie\-list[\s\S]*?paginator', r, re.S | re.I) #查找单个电影 restr = 'dl[ALL]<a.*?href="(.*?)"[ALL]</a>[ALL]<a.*?>([ALL])</a>[ALL]</dl' restr = restr.replace('[ALL]', '[\s\S]*?') item = re.findall(restr, data[0])
''' 采集配置型号 ''' import kl_http,kl_db,os,json,kl_log from postdata import postdata addnum=0 http=kl_http.kl_http() log=kl_log.kl_log('brand') db=kl_db.mysql({ 'host':'localhost', 'user':'******', 'passwd':'adminrootkl', 'db':'qiche', 'prefix':'kl_', 'charset':'utf8' }) http.autoUserAgent=True http.setheaders('''\ Host:www.epicc.com.cn Origin:http://www.epicc.com.cn Referer:http://www.epicc.com.cn/ecar/proposal/normalProposal User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 X-Requested-With:XMLHttpRequestContent-Type: application/x-www-form-urlencoded\ ''') try: url='http://www.epicc.com.cn/ecar/car/carModel/getCarModelFromJYDB' brandlist=db.table('4shoudong').where({'status':0}).order('id asc').getarr() for i in brandlist: tjdata=postdata['peizhixinghao'].replace('[GROUPID]',i['groupId']) tjdata=tjdata.replace('[ENGINEDESC]',i['engineDesc']) r=http.posturl(url,tjdata)
def shenduurl(self, url, cur_shendu=1):
    """Crawl ``url`` once, store its target links and recurse into similar links.

    The URL is skipped (returning ``True``) when it already appears in
    ``self.urled_table`` or in the ``runtimeing`` table and the database
    reports no error.  Otherwise it is registered in ``runtimeing``,
    downloaded (retrying until a request completes without ``lasterror``),
    and the de-duplicated matches of ``self.mb_url_reg`` are passed to
    ``self.adddata``.  While ``cur_shendu`` is below ``self.shendu`` (or
    ``self.shendu`` is 0), links built from each ``self.link_tezheng``
    pattern are filtered and crawled recursively.  Finally the URL is moved
    from ``runtimeing`` to ``self.urled_table`` and ``self.threadnum`` is
    decremented.

    Args:
        url: Address to crawl; surrounding whitespace is stripped.
        cur_shendu: Depth of this page; defaults to 1.

    Returns:
        ``True`` when the URL was skipped, otherwise ``None``.
    """
    url = url.strip()
    finished = db.table(self.urled_table).where({'url': url}).count()
    in_progress = db.table('runtimeing').where({'url': url}).count()
    known = finished > 0 or in_progress > 0
    if known and not db.lasterror:
        return True

    # Register the address as currently being collected.
    db.table('runtimeing').add({'url': url})

    client = kl_http.kl_http()
    client.autoUserAgent = True
    resp = None
    page = ''
    attempt = 1
    # Retry without limit until one request reports no error.
    while True:
        try:
            print("collection page %s depth:%d request nums:%d" %
                  (url, cur_shendu, attempt))
            if self.isproxy:
                proxy_addr = self.get_proxy()
                print_green("using proxy:%s" % proxy_addr)
                client.setproxy('', '', proxy_addr)
            resp = client.geturl(url)
            if client.lasterror == None:
                page = resp.read().decode(self.charset)
                break
            print_red(client.lasterror)
            attempt += 1
        except Exception as err:
            page = ''
            print_red(err)
            attempt += 1

    if page:
        # Find the target URLs and drop duplicates.
        targets = list(set(
            regex.findall(self.mb_url_reg, page, regex.I | regex.S)))
        self.adddata(targets, url)

        # Depth search.
        if cur_shendu < self.shendu or self.shendu == 0:
            cur_shendu += 1
            # One pass per link feature.
            for feature in self.link_tezheng:
                pattern = self.linkreg.replace('00_00', feature)
                links = regex.findall(pattern, page, regex.I | regex.S)
                links = self.__filterurl(links, url)
                for link in links:
                    # The threaded path is switched off; links are
                    # followed by plain recursion.
                    if False:
                        while True:
                            self.progress.show()
                            # Start a worker only while a slot is free.
                            if self.threadnum < self.maxthread:
                                self.threadnum += 1
                                threading.Thread(
                                    target=self.shenduurl,
                                    args=(self.formaturl(url, link),
                                          cur_shendu,)).start()
                                break
                            time.sleep(1)
                    else:
                        self.shenduurl(self.formaturl(url, link), cur_shendu)

    # Mark the address as fully collected.
    db.table(self.urled_table).add({'url': url})
    db.table('runtimeing').where({'url': url}).delete()
    self.threadnum -= 1
import sys
import urllib

# Make the bundled libraries importable before loading them.
sys.path.append('./lib/')

import kl_log
import kl_db
import kl_http

# Script entry point: send one POST request, print the result, log
# success and wait for the user before exiting.
if __name__ == '__main__':
    try:
        page = kl_http.kl_http()
        reply = page.posturl('http://www.0yuanwang.com')
        print(reply)
        kl_log.write('success')
        input('按任意键继续...')
    except KeyboardInterrupt as e:
        # Ctrl-C: report that the program has exited.
        print('程序已经退出')
        print(e)
def shenduurl(self,url,cur_shendu=1):
    """Collect ``url`` if it is new, then recurse into links that resemble it.

    Returns ``True`` without doing anything when the URL is already listed
    in ``self.urled_table`` or in ``runtimeing`` and ``db.lasterror`` is
    empty.  Otherwise the URL is added to ``runtimeing`` and requested
    until a request completes without ``lasterror``.  Unique matches of
    ``self.mb_url_reg`` go to ``self.adddata``.  If ``cur_shendu`` is less
    than ``self.shendu``, or ``self.shendu`` is 0, the links found for each
    entry of ``self.link_tezheng`` are filtered and crawled recursively one
    level deeper.  On the way out the URL is recorded in
    ``self.urled_table``, removed from ``runtimeing`` and
    ``self.threadnum`` is decremented.

    Args:
        url: Address to crawl; surrounding whitespace is stripped.
        cur_shendu: Depth of this page; defaults to 1.

    Returns:
        ``True`` when the URL was skipped, otherwise ``None``.
    """
    url=url.strip()
    collected=db.table(self.urled_table).where({'url':url}).count()
    running=db.table('runtimeing').where({'url':url}).count()
    seen_before=collected>0 or running>0
    if seen_before and not db.lasterror:
        return True

    # Note the address as being collected right now.
    db.table('runtimeing').add({'url':url})

    fetcher=kl_http.kl_http()
    fetcher.autoUserAgent=True
    response=None
    body=''
    tries=1
    # Keep requesting until a request completes without error.
    while True:
        try:
            print("collection page %s depth:%d request nums:%d"%(url,cur_shendu,tries))
            if self.isproxy:
                proxy=self.get_proxy()
                print_green("using proxy:%s"%proxy)
                fetcher.setproxy('','',proxy)
            response=fetcher.geturl(url)
            if fetcher.lasterror==None:
                body=response.read().decode(self.charset)
                break
            print_red(fetcher.lasterror)
            tries+=1
        except Exception as exc:
            body=''
            print_red(exc)
            tries+=1

    if body:
        # Target URLs, without duplicates.
        found=list(set(regex.findall(self.mb_url_reg,body,regex.I|regex.S)))
        self.adddata(found,url)

        # Depth search.
        if cur_shendu<self.shendu or self.shendu==0:
            cur_shendu+=1
            # Go through the list of link features.
            for trait in self.link_tezheng:
                link_pattern=self.linkreg.replace('00_00',trait)
                candidates=regex.findall(link_pattern,body,regex.I|regex.S)
                candidates=self.__filterurl(candidates,url)
                for candidate in candidates:
                    # Thread spawning is disabled here, so every link is
                    # handled by a direct recursive call.
                    if False:
                        while True:
                            self.progress.show()
                            # A worker may start only when a slot is free.
                            if self.threadnum<self.maxthread:
                                self.threadnum+=1
                                threading.Thread(target=self.shenduurl,args=(self.formaturl(url,candidate),cur_shendu,)).start()
                                break
                            time.sleep(1)
                    else:
                        self.shenduurl(self.formaturl(url,candidate),cur_shendu)

    # Flag the address as completely collected.
    db.table(self.urled_table).add({'url':url})
    db.table('runtimeing').where({'url':url}).delete()
    self.threadnum-=1