from random import shuffle
from time import sleep


class Chunker(object):
    def __init__(self, redis_host):
        self.work_queue = RedisQueue(redis_host, "inqueue")

    def run(self):
        chunk_id = 0
        a_range = range(1, 10) + range(10, 256)
        shuffle(a_range)
        for a in a_range:
            b_range = range(1, 255)
            shuffle(b_range)
            for b in b_range:
                # skip private 172.16.0.0/12 and 192.168.0.0/16 space
                if a == 172 and b in xrange(16, 32):
                    continue
                if a == 192 and b == 168:
                    continue
                c_range = range(1, 255)
                shuffle(c_range)
                for c in c_range:
                    ip_range = "{0}.{1}.{2}.0/24".format(a, b, c)
                    print "Sending chunk {0} range: {1}".format(chunk_id, ip_range)
                    task = {"range": ip_range, "id": chunk_id}
                    self.work_queue.put(task)
                    chunk_id += 1
                    sleep(10)

    def run_test(self):
        self.work_queue.put({"range": "129.21.50.0/24", "id": 0})
        self.work_queue.put({"range": "129.21.49.0/24", "id": 1})

class Crawler(object):
    def __init__(self, redis_host, depth=10):
        self.links_queue = RedisQueue(redis_host, "linksqueue")
        self.pages_queue = RedisQueue(redis_host, "pagesqueue")

    def run(self):
        while True:
            link = self.links_queue.get().data
            try:
                page = WebPage(requests.get(link).text, link, 80)
            except:
                print("Exception GETing {0}".format(link))
                continue
            self.pages_queue.put(page.to_dict())

def process_request_origin(self, request, spider):
    redis = RedisQueue('proxy_ip')
    if not redis.empty():
        proxy_ip = redis.get()
    else:
        proxy_ip = get_ip()
    proxy_para = {'ip_port': proxy_ip, 'user_pass': ''}
    request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
    if proxy_para['user_pass'] is not None:
        encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para['ip_port']
    redis.put(proxy_ip)

class Indexer(object):
    def __init__(self, redis_host, es_urls):
        self.pages_queue = RedisQueue(redis_host, "pagesqueue")  # take pages out of this queue
        self.links_queue = RedisQueue(redis_host, "linksqueue")  # put links into this queue
        self.connection = pyelasticsearch.ElasticSearch(es_urls)
        try:
            self.connection.create_index("webpages")
        except:
            pass

    def run(self):
        while True:
            result = self.pages_queue.get().data
            result['tags'] = genTags(result['html'])
            self.connection.index('webpages', 'webpage', result, id=result['ip'])
            print('Indexed {0}'.format(result['ip']))
            for link in result['links']:
                self.links_queue.put(link)

class StudyscrapyPipeline(object):
    def __init__(self):
        self.q = RedisQueue(name='CSDN', host='localhost', port=6379, db=3)
        if redis_db.hlen(redis_data_dict) == 0:
            pass

    def process_item(self, item, spider):
        # fp = open(r'F:\Spider\Spider\studyscrapy\out.txt', 'a+')
        if redis_db.hexists(redis_data_dict, item['title']):
            print('item already stored in the queue <--')
            pass
        else:
            # fp.write(item['title'] + ', ' + item['time'] + '\n')
            self.q.put(item['title'] + ':' + item['time'])
            redis_db.hset(redis_data_dict, item['title'], item['time'])
            print('title: {0}, time: {1} stored in queue'.format(item['title'], item['time']))
        return item

def dispose_ip(self, proxy_ip, redis_label):
    redis_list = []
    for i in range(REDIS_NUM):
        redis_list.append(RedisQueue('proxy_ip_%d' % i))
    redis_invalid_ip = RedisQueue('invalid_ip')
    if redis_label == REDIS_NUM - 1:
        redis_invalid_ip.put(proxy_ip)
        redis_list[0].put(get_ip())
    else:
        redis_list[redis_label].remove(proxy_ip)
        redis_list[redis_label + 1].put(proxy_ip)
    if redis_list[0].empty():
        redis_list[0].put(get_ip())
    new_redis_label = random.choice(range(REDIS_NUM))
    while redis_list[new_redis_label].empty():
        new_redis_label = random.choice(range(REDIS_NUM))
    new_proxy_ip = redis_list[new_redis_label].get()
    redis_list[new_redis_label].put(new_proxy_ip)
    return new_proxy_ip, new_redis_label

def process_exception(self, request, exception, spider):
    request_ip = request.meta['proxy']
    invalid_ip = request_ip.split('//')[1]
    redis = RedisQueue('proxy_ip')
    redis_invalid_ip = RedisQueue('invalid_ip')
    if not redis.empty():
        redis.remove(invalid_ip)
        redis_invalid_ip.put(invalid_ip)
        print '+++++++++++++++++++++++%s' % exception
        print '-----------------------removing ip from redis: %s' % invalid_ip
    new_ip = get_ip()
    proxy_para = {'ip_port': new_ip, 'user_pass': ''}
    request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
    if proxy_para['user_pass'] is not None:
        encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    print ">>>>>>>>>>>>>>>>>>>>>>>>>>> switch %s to ip: %s *****" % (invalid_ip, proxy_para['ip_port'])
    redis.put(new_ip)

def main():
    done_que = RedisQueue('seed')
    run_que = RedisQueue('run')
    run_que.flushdb()
    conn = sqlite3.connect('site_data.db')
    conn.execute(
        "create table if not exists mainpages "
        "(id integer primary key autoincrement, url TEXT, headers TEXT, content BLOB)"
    )
    spend = 0
    cnt = 0
    size = 0
    while True:
        data = cPickle.loads(done_que.get())
        st = time.time()
        urls = geturls(data['url'], data['content'])
        if len(urls) == 0:
            continue
        for url in urls:
            if url not in bfdone:
                run_que.put(url)
        gziphtml = sqlite3.Binary(gzip.zlib.compress(data['content']))
        size += len(gziphtml)
        conn.execute(
            "insert into mainpages (url, headers, content) values (?,?,?)",
            (data['url'], str(data['headers']), gziphtml)
        )
        et = time.time()
        spend += (et - st)
        cnt += 1
        if cnt % 10 == 0:
            print "cost:", spend / cnt, cnt, done_que.qsize(), size / 1024 / 1024
            conn.commit()

class ParserWorker():
    def __init__(self, in_queue_namespace, out_queue_namespace):
        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)
        print "Parser worker loaded"

    def run(self):
        while 1:
            xml_text = self.in_queue.get()
            print "Received XML"
            if xml_text == "None":
                self.out_queue.put("None")
                break
            json_doc = DataParser.parse_get_state_stats_resp(xml_text)
            print "Made JSON"
            self.out_queue.put(json_doc)

class FetcherWorker:
    def __init__(self, in_queue_namespace, out_queue_namespace, apikey):
        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.apikey = apikey
        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)
        print "Fetcher loaded with apikey", self.apikey

    def run(self):
        while 1:
            base_url = self.in_queue.get()
            if base_url == "None":
                # add end-of-queue markers for parsers
                self.out_queue.put("None")
                # ends program
                break
            url = base_url + self.apikey
            t1 = time.time()
            print "fetching try 1", url
            resp = urllib2.urlopen(url)
            if resp.code == 200:
                text = resp.read()
                self.out_queue.put(text)
            else:
                print 'failed once', url
                time.sleep(10)
                print "fetching try 2", url
                resp = urllib2.urlopen(url)
                if resp.code == 200:
                    text = resp.read()
                    self.out_queue.put(text)
            print "done fetching"
            # make sure we don't use the same API key within 2 seconds
            t2 = time.time()
            if t2 - t1 < 2.0:
                time.sleep(2.0 - (t2 - t1))

# coding=utf-8
from RedisQueue import RedisQueue

redis = RedisQueue('0', 'testno1080')
with open("testcn1080.txt") as file:
    for i in file.readlines():
        i = i.replace("\n", "") + ":1080"
        print(i)
        redis.put(i)

if len(sys.argv) > 2:
    time_duration = float(sys.argv[2])

start = time.time()
print("Starting sensor readings at {}".format(start))
previous = start
try:
    while True:
        current = time.time()
        acc = mpu.get_accel_data()
        q.put("{:.4f}, {:.3f}, {:.3f}, {:.3f}\n".format(
            current - start,
            (acc['x'] - X_OFFSET),
            (acc['y'] - Y_OFFSET),
            (acc['z'] - Z_OFFSET)))
        if (current - start) > monitor_time:
            q.put('finished')
            break
        current_sensed = time.time()
        # print("Sensed and written in {}".format(current_sensed - current))
        while (current_sensed - previous) < time_duration:
            current_sensed = time.time()
            continue
        previous = current_sensed
except KeyboardInterrupt:

#!/usr/bin/python
from RedisQueue import RedisQueue
import subprocess
import json
import base64
import sys

q = RedisQueue(sys.argv[1], namespace='ansible',
               host='internal-redis.ovmdvp.0001.use2.cache.amazonaws.com',
               port=6379, db=1)
q.put(json.dumps({'type': sys.argv[2], 'payload': sys.argv[3]}))

from RedisQueue import RedisQueue
import time

q = RedisQueue('test')
for i in xrange(20):
    q.put(i)
    print i, "put into queue"
    time.sleep(0.5)
q.put("None")

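# The script above only produces; a matching consumer is sketched below as an
# assumption (it is not part of the original snippet). It drains the 'test'
# queue until it sees the "None" sentinel the producer pushes, decoding the raw
# value because redis-py returns bytes under Python 3.
from RedisQueue import RedisQueue

q = RedisQueue('test')
while True:
    item = q.get()  # blocks until the producer pushes something
    value = item.decode() if isinstance(item, bytes) else str(item)
    if value == "None":  # sentinel pushed by the producer when it finishes
        break
    print("got", value)
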
    combinations.append(feature)

shuffle(combinations)
print('starting', len(combinations))
input()

# clear the queue
while not q.empty():
    q.get()
print("empty")

for i in range(6):
    x = machine(df)
    print('starting...')
    x.start()

for feature in combinations:
    q.put(feature)
print('all put')

while not q.empty():
    try:
        sleep(1)
    except:
        break
del q

#!/usr/bin/env python
# UniPi Python Control Panel
# stop_server.py
# requires Python 3.5 or later
# Author: Johannes Untiedt
# Version 10.0, dated 26.03.2018
from RedisQueue import RedisQueue
import threading

if __name__ == '__main__':
    print("stop_server.py started")
    lock = threading.Lock()
    q = RedisQueue('ws_2')
    payload = "close"
    with lock:
        q.put(payload)
    print("stop_server.py sent", payload)

from RedisQueue import RedisQueue

q = RedisQueue('test')
q.put("你好")
print(q.get().decode('utf-8'))

from RedisQueue import RedisQueue
import sys
import random
from pymongo import MongoClient

if __name__ == '__main__':
    db = MongoClient()
    exists = db.zhihu.zhihu_answers
    exist_owners = []
    for e in exists.find():
        exist_owners.append(e['owner'])
    print(len(exist_owners))

    all_ids = [line.strip().split('\t')[0] for line in open('./user_followees.data')]
    candidates = list(set(all_ids) - set(exist_owners))

    queue = RedisQueue('answer_queue')
    queue.clear()
    print('Count: %d' % len(candidates))
    for c in candidates[0:]:
        queue.put(c)

from flask import Flask
import celeryTask
from RedisQueue import RedisQueue

flask_app = Flask(__name__)


# Example URL to stop the current celery task
@flask_app.route("/terminate", methods=['GET'])
def rfGetVersions():
    result.revoke(terminate=True)
    return ("Celery task Terminated")


# Create a redis queue, which sits on the redis server.
q = RedisQueue('test')
q.put('Task 1')
q.put('Task 2')
q.put('Task 3')
q.put('Task 4')

print("Celery background tasks starting.....")
# This calls a celery worker and assigns it the task 'basic_celery_task'.
# The .delay() method tells it to execute in the background, so that Flask can keep serving web URIs.
result = celeryTask.basic_celery_task.delay()
print("Started!")

flask_app.run(host="127.0.0.1", port=5001, threaded=True)

    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    page = urllib2.urlopen(req, None, req_timeout)
    html = page
    return html


def next_page():
    base_url = "http://jandan.net/ooxx/page-1006#comments"
    for i in range(3):
        html = user_agent(base_url).read()
        soup = BeautifulSoup(html)
        next_url = soup.find("a", {"class": "next-comment-page", "title": "Newer Comments"}).get("href")
        yield base_url
        base_url = next_url


for page in next_page():
    queue.put(page)
print "There are %d pages" % queue.qsize()

while not queue.empty():
    page_url = queue.get()
    html = user_agent(page_url).read()
    soup = BeautifulSoup(html)
    img_urls = soup.find_all(["img"])
    for myimg in img_urls:
        Jpgurl = myimg.get("src")
        redis.put(Jpgurl)
print "There are %d pictures" % redis.qsize()

def get_all_url(url, biz):
    """
    Parse the JSON returned for the URL and store the article info in a Redis queue.
    :param url:
    :param biz:
    :return:
    """
    if biz is None:
        print("biz is empty")
        return
    q = RedisQueue(biz.strip())
    json_str = get_page_detail(url)
    json_re = parse_page_index(json_str)
    general_msg_list = parse_page_index(json_re['general_msg_list'])
    for list_re in general_msg_list['list']:
        print("current biz:", biz)
        datetime = list_re['comm_msg_info']['datetime']
        try:
            title = list_re['app_msg_ext_info']['title']
            digest = list_re['app_msg_ext_info']['digest']
            content_url = list_re['app_msg_ext_info']['content_url']
            author = list_re['app_msg_ext_info']['author']
            print(url)
            # content = get_content(content_url)
            data1 = {
                'title': title,
                'digest': digest,
                'datetime': datetime,
                'content_url': content_url,
                'author': author
                # 'content': content
            }
            data1 = json.dumps(data1)
            q.put(data1)
            print(data1)
            for multi_app_msg_item_list in list_re['app_msg_ext_info']['multi_app_msg_item_list']:
                title = multi_app_msg_item_list['title']
                digest = multi_app_msg_item_list['digest']
                content_url = multi_app_msg_item_list['content_url']
                print(content_url)
                author = multi_app_msg_item_list['author']
                data2 = {
                    'title': title,
                    'digest': digest,
                    'datetime': datetime,
                    'content_url': content_url,
                    'author': author
                    # 'content': content
                }
                data2 = json.dumps(data2)
                q.put(data2)
                print(data2)
        except KeyError as e:
            print("error", e)
    # get next_offset
    if len(general_msg_list['list']) < 10:
        return None
    return json_re['next_offset']

        company_url = url_queue.get()
        download_data(company_url, 5)
        time.sleep(2)
        url_queue.task_done()


def url_producer(name_queue, url_queue):
    while True:
        company_name = name_queue.get()
        download_url(company_name, 5)


# name_queue = queue.Queue()
i = 0
name_queue = RedisQueue("name" + str(i + 1))
csv_reader = csv.reader(open('./company.csv', encoding="utf8"))
for row in csv_reader:
    name_queue.put(row[0])

url_queue = RedisQueue("url" + str(i + 1))
for n in range(4):
    producer_thread = threading.Thread(target=url_producer, args=(name_queue, url_queue,))
    producer_thread.start()
for n in range(5):
    consumer_thread = threading.Thread(target=url_consumer, args=(url_queue,))
    consumer_thread.start()
# url_queue.join()

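# Every snippet above assumes some variant of a RedisQueue helper class, and the
# constructor signatures clearly differ between projects (positional host, name/
# namespace keywords, extra methods such as remove(), clear() or flushdb()). The
# class below is only a minimal sketch of the shared core (put/get/empty/qsize),
# in the spirit of the well-known redis-py FIFO recipe; it is not the exact
# implementation any one of the snippets was written against.
import redis


class RedisQueue(object):
    """Simple FIFO queue backed by a Redis list."""

    def __init__(self, name, namespace='queue', **redis_kwargs):
        # redis_kwargs are passed straight to redis-py, e.g. host, port, db
        self.__db = redis.Redis(**redis_kwargs)
        self.key = '%s:%s' % (namespace, name)

    def qsize(self):
        # number of items currently in the underlying Redis list
        return self.__db.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        # append the item to the tail of the list
        self.__db.rpush(self.key, item)

    def get(self, block=True, timeout=None):
        # pop from the head of the list; blpop blocks until an item arrives
        if block:
            item = self.__db.blpop(self.key, timeout=timeout)
            if item:
                item = item[1]
        else:
            item = self.__db.lpop(self.key)
        return item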