def test(): runque = RedisQueueConnection('scan').conn #########runque.flushdb() size = runque.qsize() t = 0 cnt = 0 port = 0 data = [] if size == 0: return tmp = runque.get() runque.put(tmp) port, iph, ipl = tmp.split(' ') print port, size raw_input('confirm:') while runque.qsize() > 0: tmp = runque.get() try: t += len(tmp.split(' ')[-1].split(',')) data.append(tmp) except: pass cnt += 1 f = "china%s_%s_%s.txt" % (port, size, t) fp = open(f, 'w') for item in data: fp.write(item + '\n') fp.close()
def test():
    # Inspect or seed the 'scan' Redis queue (Python 2 debug script).
    runque = RedisQueueConnection('scan').conn
    size = runque.qsize()
    print size
    sleep(1)
    cnt = 0
    if size:
        # Queue is non-empty: rotate every item through once, printing each
        # (get + put back leaves the queue contents unchanged).
        while cnt < size:
            i = runque.get()
            print i
            runque.put(i)
            cnt += 1
        # NOTE(review): flushdb() wipes the whole Redis DB right after the
        # inspection pass, then exit(0) kills the process, so the seeding
        # code below only ever runs when the queue started empty. The
        # original indentation was lost; confirm flushdb/exit really belong
        # inside this branch and not inside the while loop.
        runque.flushdb()
        exit(0)
    # Queue was empty: seed it from a newline-separated URL list on disk.
    f = open('seeds995k.txt')
    urls = f.read().strip().split('\n')
    if size == 0:
        i = 0
        st = time()
        for url in urls:
            runque.put(url)
def extract(): icmd = "insert into orignal (dbname, dbid, url, cms, headers, head) values (%s, %s, %s, %s, %s, %s)" runque = RedisQueueConnection('cms').conn size = runque.qsize() print "total:" , size i = 0 while i < size: item = runque.get() #runque.put(item) data = pickle.loads(item) ndata = [] for item in data: if isinstance(item, unicode): item = item.encode('utf8') ndata.append(item) #print ndata if ndata[3]: print ndata[3], ndata[2] cur.execute(icmd , ndata) i += 1 conn.commit() print "done" print runque.qsize()
def test(): runque = RedisQueueConnection('robots').conn #########runque.flushdb() size = runque.qsize() item = runque.get() runque.put(item) print pickle.loads(item) print size return raw_input('cofrim') s = flist('urlstogetrobots.txt') for url in s: runque.put(url) print runque.qsize()
def show(name): runque = RedisQueueConnection(name).conn cnt = 0 while cnt < runque.qsize(): data = runque.get() runque.put(data) data = pickle.loads(data) seed = data['seed'] data = data['content'].replace('\r', '\n').replace('\n\n','\n').strip() if not data: continue if data.find('<') >= 0: #html page print seed continue robots = data.split('\n') print seed print print "\n".join(robots) print cnt += 1
con = req.content #print url, len(con) req.close() except: pass data = (url, con) cb(data) from time import time def cb(data): seed, con = data #print "\t", seed, len(con) cnt = 0 sst = time() while True: url = runque.get() runque.put(url) st = time() pool.spawn(httpget, url) et = time() cnt += 1 if cnt % 10 == 0: print cnt / (et-sst) ,runque.qsize(), robotsque.qsize()
class Crawler: def __init__(self, done_que): self.showpercounts = 100 self.timeout = 5 self.starttime = time() self.quit = False self.run_que = RedisQueueConnection('running').conn self.done_que = done_que self.tasks = [] self.done = 1 self.errdone = set() self.err = Error() self.https_enable = 0 self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl self.poolsize = 100 self.down_pool = Pool(size=self.poolsize) self.totalnettime = 0 self.totaldownsize = 0 self.ip = getip() #callback function when greenlet of httpget run done def cb_httpget(self, data = None): if not data: return seed, err, headers, content = data if err: self.handle_error(err,seed) return data={'seed':seed,'headers':headers,'content':content} dat = cPickle.dumps(data) #self.done_que.put_nowait(dat) #print "done", seed if self.done % self.showpercounts == 0: self.out(seed) def out(self, seed): spendtime = time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else "" now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 ) print "\n%s\t%s D:%-4d R:%-7d [QPS: %.2f %.2f] %s" % (self.ip, now, (self.done), self.run_que.qsize(), \ self.done/spendtime, self.done/self.totalnettime , str(self.err) ) def run(self): while self.quit == False: try: if self.run_que.qsize() == 0: print "run que empty" sleep(10) continue url = self.run_que.get() self.run_que.put(url) #self.down_pool.apply_cb(self.httpget, (url,), callback=self.cb_httpget) # spawn is more fast? 
#url = 'http://www.sdust.edu.cn' self.down_pool.spawn(self.httpget, url) self.done += 1 except KeyboardInterrupt: print "Crawler recv quit singal" self.quit = True self.down_pool.join() print "Crawler over, quit" def handle_error(self,e,url): self.err.lasterrurl = url # do not record the err url, but record the least err url to show if e.find('DNSError') > 0 : self.err.dns += 1 #self.err.rdns.append(url) elif e.find('reset') > 0 :#Connection reset self.err.reset += 1 #self.err.rreset.append(url) elif e.find('Max retries') > 0 or e.find('Connection aborted'): # self.err.conntimeout += 1 #self.err.rconntimeout.append(url) elif e.find('refused') > 0: #Connection refused self.err.refuse += 1 #self.err.rrefuse.append(url) else: self.err.others +=1 #self.err.rothers.append(url) print "Error", url, e # requests is better than curl in tests def httpget_requests(self, url): #return data data = None st = time() con = "" e = "" res_headers = "" headers = { 'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6', 'Accept-Encoding':'gzip,deflate', 'Connection':'close', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' } res = None done = False try: with gevent.Timeout(3, False) as timeout: #req.max_redirects = 2 res = requests.get(url, headers = headers ) con = res.content res.close() done = True except KeyboardInterrupt: raise except Exception as e: e = str(e) if res: res.close() #as for spawn, no callback , we should call by ourself data = (url, e, None, None) #return url,e,None,None et = time() self.totalnettime += (et-st) #spawn if done: data = (url, e, res.headers, con) self.cb_httpget(data)
con = req.content #print url, len(con) req.close() except: pass data = (url, con) cb(data) from time import time def cb(data): seed, con = data #print "\t", seed, len(con) cnt = 0 sst = time() while True: url = runque.get() runque.put(url) st = time() pool.spawn(httpget, url) et = time() cnt += 1 if cnt % 10 == 0: print cnt / (et - sst), runque.qsize(), robotsque.qsize()
class Crawler: def __init__(self): self.showpercounts = 10 self.timeout = 20 self.poolsize = 100 self.down_pool = Pool(size=self.poolsize) self.run_que = RedisQueueConnection('running').conn self.doneque = RedisQueueConnection('robots').conn self.tempque = Queue() self.done = 1 self.sent = 0 self.quit = False self.err = Error() self.https_enable = 0 self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl self.totalnettime = 0 self.totaldownsize = 0 self.starttime = time() self.ip = getip() self.headers = { 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6', 'Accept-Encoding': 'gzip,deflate', 'Connection': 'close', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' } #callback function when greenlet of httpget run done def cb_httpget(self, data=None): if not data: return seed, err, headers, content = data if err: self.handle_error(err, seed) return if len(content) <= 0: return data = {'seed': seed, 'headers': headers, 'content': content} #content is robots.txt, normally it's pure text dat = cPickle.dumps(data) self.tempque.put(dat) self.done += 1 if self.done % self.showpercounts == 0: self.out(seed) def out(self, seed): spendtime = time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime print "\n%s D:%-4d DT: %4d R:%-7d [QPS: %.2f %.2f] %s" % (self.ip, self.done,self.doneque.qsize(), self.run_que.qsize(), \ self.done/spendtime, self.done/self.totalnettime , str(self.err) ) def run(self): while self.quit == False: try: if self.run_que.qsize() == 0: print "run que empty" sleep(60) url = self.run_que.get() self.down_pool.spawn(self.httpget, url) self.sent += 1 except KeyboardInterrupt: print "Crawler recv quit singal" self.quit = True self.down_pool.join() print "Crawler over, quit" def handle_error(self, e, url): self.err.lasterrurl = url # do not record the err url, but record the least err url to show if e.find('DNSError') > 0: self.err.dns += 1 
#self.err.rdns.append(url) elif e.find('reset') > 0: #Connection reset self.err.reset += 1 #self.err.rreset.append(url) elif e.find('Max retries') > 0 or e.find('Connection aborted'): # self.err.conntimeout += 1 #self.err.rconntimeout.append(url) elif e.find('refused') > 0: #Connection refused self.err.refuse += 1 #self.err.rrefuse.append(url) else: self.err.others += 1 #self.err.rothers.append(url) print "Error", url, e # requests is better than curl in tests def httpget_requests(self, url): #return data data = None st = time() con = "" e = "" res_headers = "" res = None done = False try: with gevent.Timeout(self.timeout, False) as timeout: url = url + '/robots.txt' res = requests.get(url, headers=self.headers) if res.status_code == 200: con = res.content done = True res.close() except KeyboardInterrupt: raise except Exception as e: e = str(e) if res: res.close() data = (url, e, None, None) et = time() self.totalnettime += (et - st) #spawn if done: data = (url, e, res.headers, con) #self.cb_httpget(data) if not data: return seed, err, headers, content = data if err: self.handle_error(err, seed) return if len(content) <= 0: return data = {'seed': seed, 'headers': headers, 'content': content} #content is robots.txt, normally it's pure text dat = cPickle.dumps(data) self.tempque.put(dat) self.done += 1 if self.done % self.showpercounts == 0: #self.out(seed) spendtime = time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime print "\n%s D:%-4dDT:%4d R:%-7d [QPS: %.2f %.2f] %s" % (self.ip, self.done,self.doneque.qsize(), self.run_que.qsize(), \ self.done/spendtime, self.sent/spendtime , str(self.err) )
def getsize(name): runque = RedisQueueConnection(name).conn print runque.qsize() i = runque.get() runque.put(i) print i