class QueueManage(object):
    def __init__(self, name):
        self.q_obj = RedisQueue(name)

    def get_queue_data(self):
        q_re = self.q_obj.get_all()
        return q_re

    def queue_size(self):
        return self.q_obj.qsize()
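All of these examples rely on a RedisQueue helper whose definition is not shown. As a point of reference, a minimal list-backed wrapper exposing the method names the examples use (put, get, get_all, qsize, empty, flushdb) could look like the sketch below; it assumes the standard redis-py client, and the class each project actually ships may differ.

import redis


class RedisQueue(object):
    # Minimal sketch of a Redis-list-backed FIFO queue (assumed, not the projects' real class).

    def __init__(self, name, **redis_kwargs):
        # redis_kwargs may carry host, port, db, password, as in the examples below.
        self._db = redis.Redis(**redis_kwargs)
        self.key = name

    def qsize(self):
        # Length of the backing list, i.e. the number of queued items.
        return self._db.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        # Append to the tail of the list.
        self._db.rpush(self.key, item)

    def get(self, block=True, timeout=0):
        # Pop from the head of the list; blocks until an item arrives by default.
        if block:
            item = self._db.blpop(self.key, timeout=timeout)
            return item[1] if item else None
        return self._db.lpop(self.key)

    def get_all(self):
        return self._db.lrange(self.key, 0, -1)

    def flushdb(self):
        # Clears the entire Redis database this queue lives in (used before a fresh run).
        self._db.flushdb()

With such a wrapper, qsize() is simply LLEN on the backing list, which is what the polling loops in the later examples rely on.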
import pymongo
from time import sleep

# RedisQueue, generateSectors, request_getter and sectorPolygons1954 come from
# the surrounding project.


def main(run_type, store_num):
    q = RedisQueue(store_num)
    if run_type == 'gen':
        # Start one generator thread per sector polygon; each pushes work onto the queue.
        for i in range(len(sectorPolygons1954.sectors)):
            s = generateSectors(sectorPolygons1954.sectors[i], i, q, store_num)
            print('starting thread ' + str(i))
            s.start()
    elif run_type == 'run':
        batch_size = 30
        uri = 'localhost:27017'
        client = pymongo.MongoClient(uri)
        db = client['streets']
        streets = db[store_num]
        streets.create_index([("latitude", pymongo.DESCENDING),
                              ("longitude", pymongo.DESCENDING)])
        # Start a batch of request threads, then poll until the queue drains.
        for i in range(int(batch_size)):
            rg = request_getter(q, store_num)
            print('starting request thread ' + str(i))
            rg.start()
        while q.qsize():
            sleep(10)
            print(q.qsize())
import cPickle
import gzip
import sqlite3
import time

# RedisQueue, geturls and the bfdone "already seen" set come from the surrounding project.


def main():
    done_que = RedisQueue('seed')
    run_que = RedisQueue('run')
    run_que.flushdb()

    conn = sqlite3.connect('site_data.db')
    conn.execute(
        "create table if not exists mainpages "
        "(id integer primary key autoincrement, url TEXT, headers TEXT, content BLOB)")

    spend = 0
    cnt = 0
    size = 0
    while True:
        # Pop a crawled page off the 'seed' queue and extract its outgoing links.
        data = cPickle.loads(done_que.get())
        st = time.time()
        urls = geturls(data['url'], data['content'])
        if len(urls) == 0:
            continue
        for url in urls:
            if url not in bfdone:
                run_que.put(url)

        # Compress the page body and persist it to SQLite.
        gziphtml = sqlite3.Binary(gzip.zlib.compress(data['content']))
        size += len(gziphtml)
        conn.execute(
            "insert into mainpages (url,headers,content) values (?,?,?)",
            (data['url'], str(data['headers']), gziphtml))

        et = time.time()
        spend += (et - st)
        cnt += 1
        if cnt % 10 == 0:
            print "cost:", spend / cnt, cnt, done_que.qsize(), size / 1024 / 1024
            conn.commit()
import base64
import json
import threading
import time

import urllib3

# RedisQueue, redis_conn and the author_login URL come from the surrounding project.

q = RedisQueue('account_login', **redis_conn)
http = urllib3.PoolManager(num_pools=50)


def worker(value):
    params = {}
    params['account_login'] = base64.encodestring(value)
    r = http.request('POST', author_login, params)
    # Server-side failure: push the value back onto the queue.
    if r.status != 200:
        q.put(value)
    # The endpoint is assumed to return JSON; status 10002 means the IP whitelist
    # check failed, so push the value back onto the queue as well.
    data = json.loads(r.data)
    if data['status'] == 10002:
        q.put(value)
    print r.data


while 1:
    # time.sleep(1)
    if q.empty():
        print 'empty queue'
        break
    s = q.qsize()
    for i in range(0, s):
        value = q.get()
        t = threading.Thread(target=worker, args=(value,))
        t.start()
        # Throttle once 500 worker threads are alive.
        if threading.active_count() >= 500:
            time.sleep(1)
# Then the master pops them and checks whether each is already in done_site.bin.
# If not, the host (without "http://") is sent to the DNS server, which reports
# whether that website host is accessible. Once the DNS reply is received, the
# parsed URL (if accessible) is inserted into the run queue in a specific Redis db.
#
# Redis db numbers used to store the queues:
#   db = dbd['runque']
#   db = dbd['extracturls']
dbd = dict()
dbd['runque'] = 1
dbd['extracturls'] = 2

host = "127.0.0.1"
password = '******'

# first insert into the done_site.bin
rq = RedisQueue(name='extracturls', host=host, password=password, db=dbd['extracturls'])
rr = RedisQueue(name='runque', host=host, password=password, db=dbd['runque'])
print rq.qsize()
print rr.qsize()
# exit(0)
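For reference, the master loop described in the comments above could look roughly like the sketch below; done_site and dns_check are hypothetical stand-ins for the project's done_site.bin lookup and DNS probe, and only rq and rr carry over from the snippet above.

from urlparse import urlparse

done_site = set()                 # hypothetical stand-in for done_site.bin


def master_loop(rq, rr, dns_check):
    # Drain the extracted-URL queue, skip hosts already seen, and move
    # DNS-confirmed URLs onto the run queue.
    while rq.qsize():
        url = rq.get()
        site = urlparse(url).netloc          # host without "http://"
        if site in done_site:
            continue
        done_site.add(site)
        if dns_check(site):                  # DNS server reports the host is reachable
            rr.put(url)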
import urllib2

from bs4 import BeautifulSoup

# queue and redis are RedisQueue instances created elsewhere in the script,
# and req_header holds the request headers.


def user_agent(url):
    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    page = urllib2.urlopen(req, None, req_timeout)
    html = page
    return html


def next_page():
    base_url = "http://jandan.net/ooxx/page-1006#comments"
    # Walk three comment pages, yielding each page URL before following the "next" link.
    for i in range(3):
        html = user_agent(base_url).read()
        soup = BeautifulSoup(html)
        next_url = soup.find("a", {"class": "next-comment-page",
                                   "title": "Newer Comments"}).get("href")
        yield base_url
        base_url = next_url


for page in next_page():
    queue.put(page)
print "There are %d pages" % queue.qsize()

while not queue.empty():
    page_url = queue.get()
    html = user_agent(page_url).read()
    soup = BeautifulSoup(html)
    img_urls = soup.find_all(["img"])
    for myimg in img_urls:
        Jpgurl = myimg.get("src")
        redis.put(Jpgurl)
print "There are %d pictures" % redis.qsize()