def jp2_to_jpeg(_threads, _app, _source, _destination, _broken, _jpegs, _verbose):
    testApp(_app)
    t = ThreadPool(_threads)
    for (root, dirs, files) in os.walk(_destination):
        subpath = root.replace(_destination, '').lstrip('/')
        if _broken not in subpath:
            if any(".jp2" in s for s in files):
                print >>emaillog, 'Converting contents of ' + subpath + ' from JP2 to JPEG'
                for (output_file, size) in _jpegs:
                    for file in files:
                        if file.endswith('.jp2'):
                            jp2 = os.path.join(root, file)
                            newfile = os.path.join(root, os.path.splitext(file)[0]) + '_' + output_file
                            command = _app + ' -size ' + size + ' ' + jp2 + ' -resize ' + size + ' ' + newfile
                            if _verbose == True:
                                print 'Creating ' + newfile
                            t.add_task(executeConversion, command, None, jp2, _source, _broken, file, newfile)
    t.await_completion()

def testcase_ThreadPool_init_thread_pool_success(self):
    """Test case 2: the constructor initializes the thread pool."""
    jobs = [str(i) for i in xrange(2)]
    pool = ThreadPool(3, test_function, jobs, 0)
    thread_count = len(pool.threads)
    self.assertEqual(3, thread_count)
    pool.wait_allcomplete()

def tif_to_jp2(_threads, _app, _source, _destination, _broken, _options, _verbose):
    testApp(_app)
    t = ThreadPool(_threads)
    for (root, dirs, files) in os.walk(_source):
        subpath = root.replace(_source, '').lstrip('/')
        if _broken not in subpath:
            jp2Path = os.path.join(_destination, subpath)
            makeDir(jp2Path)
            if any(".tif" in s for s in files):
                print >>emaillog, 'Converting contents of ' + subpath + ' from TIF to JP2'
                for file in files:
                    if file.endswith('.tif'):
                        tiff = os.path.join(root, file)
                        jp2 = os.path.join(_destination, subpath, os.path.splitext(file)[0] + '.jp2')
                        tiffcopy = os.path.join(_destination, subpath, file)
                        command = _app + ' -i ' + tiff + ' -o ' + jp2 + ' ' + _options
                        command_post = 'shutil.move(\'' + tiff + '\',\'' + tiffcopy + '\')'
                        if _verbose == True:
                            print 'Creating ' + jp2
                        t.add_task(executeConversion, command, command_post, tiff, _destination, _broken, file, jp2)
    t.await_completion()

def test():
    print 'start testing'
    wm = ThreadPool(10)
    for i in range(1):
        wm.add_job(test_job, i, i * 0.001)
    wm.wait_for_complete()
    print 'end testing'

def iterate(_source, _ignore, _patron, _patron_zip, _threads):
    print 'Descend into ' + _source
    t = ThreadPool(_threads)
    for (root, dirs, files) in os.walk(_source):
        t.add_task(patron_bundle, _patron, _patron_zip, root)
    t.await_completion()

def articles():
    error = None
    if request.method == 'POST':
        query = request.form['query'].lower()
        query = query.replace(" ", "%20")
        if query in app.cache:
            return Response(json.dumps(app.cache[query]), mimetype='application/json')
        url = GOOGLE_NEWS_RSS + query
        response = requests.get(url).text

        # Get article links from the RSS
        links = []
        utils.find_links(links, response)
        if len(links) > 2:
            links = links[2:]

        # Get article titles from the RSS
        titles = []
        utils.find_titles(titles, response)
        if len(titles) > 2:
            titles = titles[2:]

        num_links = len(links)
        num_titles = len(titles)
        if num_titles < num_links:
            links = links[:num_titles]
        if num_links < num_titles:
            titles = titles[:num_links]

        articles = {}
        pool = ThreadPool(num_links)
        for i in range(num_links):
            pool.add_task(utils.get_article_sentiment, links[i], titles[i], articles)
        pool.wait_completion()

        sentiments = []
        result = {}
        result["articles"] = []
        index = 0
        for key in articles:
            info = {}
            info["id"] = index
            index += 1
            info["title"] = articles[key]["title"]
            info["link"] = key
            sentiments.append(articles[key]["sentiment"])
            info["sentiment"] = articles[key]["sentiment"]
            info["snippet"] = articles[key]["snippet"]
            result["articles"].append(info)
        average_sentiment = utils.average_sentiment(sentiments, len(sentiments))
        result["sentiment"] = average_sentiment
        app.cache[query] = deepcopy(result)
        return Response(json.dumps(result), mimetype='application/json')

def sentiment():
    error = None
    if request.method == 'POST':
        query = request.form['query']
        query = query.replace(" ", "%20")
        if query in app.sentiment_cache:
            return Response(json.dumps(app.sentiment_cache[query]), mimetype='application/json')
        url = GOOGLE_NEWS_RSS + query
        response = requests.get(url).text

        # Get article links from the RSS
        links = []
        utils.find_links(links, response)
        if len(links) > 2:
            links = links[2:]

        # Get article titles from the RSS
        titles = []
        utils.find_titles(titles, response)
        if len(titles) > 2:
            titles = titles[2:]

        num_links = len(links)
        num_titles = len(titles)
        if num_titles > 3:
            num_titles = 3
        if num_titles < num_links:
            links = links[:num_titles]
        if num_links > 3:
            num_links = 3
        if num_links < num_titles:
            titles = titles[:num_links]

        articles = {}
        pool = ThreadPool(num_links)
        for i in range(num_links):
            pool.add_task(utils.get_article_sentiment, links[i], titles[i], articles)
        pool.wait_completion()

        sentiments = []
        result = {}
        for key in articles:
            print query, ':', articles[key]["sentiment"]
            sentiments.append(articles[key]["sentiment"])
        average_sentiment = utils.average_sentiment(sentiments, len(sentiments))
        print average_sentiment
        result["sentiment"] = average_sentiment
        result["query"] = request.form['query']
        app.sentiment_cache[query] = deepcopy(result)
        return Response(json.dumps(result), mimetype='application/json')

def test_results(self):
    def my_add(a, b):
        return a + b

    tp = ThreadPool(5)
    for i in range(5):
        tp.add_task(my_add, i, i)
    d = tp.wait_completion()
    vals = d.values()
    vals.sort()
    assert vals == [0, 2, 4, 6, 8]

def start_tasks():
    stores = load_stores()
    thread_pool = ThreadPool(size=20)
    total = len(stores)
    pos = 0
    for store in stores:
        pos += 1
        task = UnderLoadSlotZeroTask(store=store, total=total, pos=pos)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting for task exit!')
    thread_pool.join()

def main():
    store_list = load_stores()
    thread_pool = ThreadPool(size=20)
    pos = 0
    total = len(store_list)
    for store in store_list:
        pos += 1
        task = SlotStateFetchTask(store, pos=pos, total=total)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting for tasks exit!!!')
    thread_pool.join()

def testcase_ThreadPool_set_work_queue_success(self):
    """Test case 1: the constructor fills the work queue successfully."""
    jobs = [str(i) for i in xrange(2)]
    pool = ThreadPool(2, test_function, jobs, 0)
    while True:
        try:
            func, param = pool.work_queue.get(block=False)
            res = func(param)
            self.assertEqual(str(0), res)
        except Queue.Empty as e:
            self.logging.info(e)
            break
    pool.wait_allcomplete()

def start_tasks():
    thread_pool = ThreadPool(size=20)
    store_list = load_stores()
    total_count = len(store_list)
    count = 0
    for store in store_list:
        count += 1
        task = FetcherTask(store=store, num=count, total=total_count)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting Task Finished......')
    thread_pool.join()

def main():
    store_list = load_stores()
    thread_pool = ThreadPool(size=20)
    index = 0
    total = len(store_list)
    for store in store_list:
        index += 1
        task = CompensationDisableTask(store=store, index=index, total=total)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    print('Starting tasks...')
    thread_pool.start()
    print('Waiting for task exit!')
    thread_pool.join()

def main():
    logging.basicConfig(filename='debug.log', filemode='w')
    with Imap(url) as imap:
        imap.login(address, password)
        logging.info('Logged in')
        folders = imap.get_folders()
        # print('Found folders:', len(folders))
        # all_uids = get_uids_and_count(imap, folders)
        ui = UI()
        # 15 - imap simultaneous connections limit
        tasks = [(process_messages, (folder, ui)) for folder in folders]
        pool = ThreadPool(max=15)
        pool.run(tasks, delay=1)

def start(self):
    if not self.db_oper.is_enabled():
        return

    repo_list = self.db_oper.get_repo_list()
    if repo_list is None:
        self.db_oper.close_db()
        return

    thread_pool = ThreadPool(self.scan_virus, self.settings.threads)
    thread_pool.start()

    for row in repo_list:
        repo_id, head_commit_id, scan_commit_id = row
        if head_commit_id == scan_commit_id:
            logger.debug('No change occur for repo %.8s, skip virus scan.', repo_id)
            continue
        thread_pool.put_task(ScanTask(repo_id, head_commit_id, scan_commit_id))

    thread_pool.join()
    self.db_oper.close_db()

class BaseBot:
    def __init__(self, service):
        self.service = service
        self.error_count = 0
        self.thread_pool = ThreadPool()

    # The docs say this is thread-safe, but it is worth checking whether that is
    # actually guaranteed. Consider switching to a callback pattern.
    def handle_event(self, events):
        for event in events:
            if isinstance(event, ChatInitEvent):
                # print(event.user_index)
                self.handle_entered_room(event.team_index, event.room_index)
            elif isinstance(event, ChatMessageEvent):
                chat = self.service.get_chat_summary(event.room_index, event.msg_index)
                if chat:
                    self.handle_chat(event.team_index, event.room_index, chat)
            elif isinstance(event, UserDropEvent) \
                    and self.service.my_index == event.user_index:
                logger.error("The bot account has been removed.")
                sys.exit()
            elif isinstance(event, UserPasswordChangedEvent) \
                    and self.service.my_index == event.user_index:
                logger.error("The bot account's password has been changed.")
                sys.exit()

    def run(self):
        while True:
            try:
                events = self.service.get_events()
                if events:
                    self.thread_pool.add_task(self.handle_event, events)
                time.sleep(self.service.config['lp_idle_time'])
            except Exception as e:
                self.error_count += 1
                if self.error_count > 3:
                    logger.error("An error occurred. Exiting the program.")
                    # sys.exit()
                else:
                    logger.error("An error occurred: " + str(e))
                time.sleep(5)

    def handle_entered_room(self, team_index, room_index):
        raise NotImplementedError()

    def handle_chat(self, team_index, room_index, chat):
        raise NotImplementedError()

def testcase_ThreadPool_get_result_success(self):
    """Test case 3: get_result; the results sum to 1 after all tasks finish."""
    jobs = [i for i in xrange(2)]
    pool = ThreadPool(3, test_function, jobs, 0)
    pool.wait_allcomplete()
    sum = 0
    while True:
        try:
            res = pool.get_result()
            arr_res = json.loads(res)
            sum += int(arr_res['url'])
        except Queue.Empty as e:
            self.logging.info(e)
            break
    self.assertEqual(1, sum)

class Listener():
    def __init__(self, redis_conn, channels):
        self.redis_conn = redis_conn
        self.pubsub = self.redis_conn.pubsub()
        self.pubsub.subscribe(channels)
        self.thread_pool = ThreadPool(size=10)

    def work(self, item):
        # Replace this with your own processing logic
        print item["channel"], item["data"]

    def run(self):
        self.thread_pool.start()
        for item in self.pubsub.listen():
            self.thread_pool.append_job(self.work, item)

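# A minimal usage sketch for the Listener above, added for illustration: it assumes a
# Redis server on localhost and the redis-py client; the channel names are made up and
# not taken from the original source.
import redis

if __name__ == '__main__':
    conn = redis.StrictRedis(host='localhost', port=6379)
    listener = Listener(conn, ['chat', 'alerts'])
    listener.run()  # blocks, handing every published message to the pool via append_job
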
def run(self):
    cx, cu = self.db_connection()
    pool = ThreadPool(size=20)
    pool.start()
    file_submission_id = open(FILE_SUBMISSION_ID)
    finished_submissions = [int(item[0]) for item in cu.execute("select submission_id from code")]
    all_submissions = [int(item) for item in file_submission_id.readlines()]
    for line in list(set(all_submissions).difference(set(finished_submissions))):
        sleep(0.2)
        pool.append_job(s.job, line)
    pool.join()
    pool.stop()

def __init__(self, thread_count: int, host: str, port: str, db_name: str,
             user: str, channel_name: str) -> None:
    """
    Class constructor.

    Initializes:
    - the number of threads
    - the database connection
    - a single thread pool

    :param thread_count: number of threads in the thread pool
    :param host: hostname the database is deployed on
    :param port: port used to connect to the database
    :param db_name: name of the database
    :param user: role used to connect to the database
    :param channel_name: name of the channel on which messages arrive from the database
    """
    self._host = host
    self._port = port
    self._db_name = db_name
    self._user = user
    self._thread_count = thread_count
    self._channel_name = channel_name
    self._e = self.connect()
    self.pool_task = ThreadPool(self._thread_count)

def main(event, lambdacontext):
    starttime = time.time()
    queue_url = event.get(c.KEY_SQS_QUEUE_URL, None)
    print "Started consumer with queue url '{}'".format(queue_url)
    context = event.get("context", {})
    context[c.KEY_SQS_QUEUE_URL] = queue_url
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(lambdacontext, 'function_name') else None
    context[c.KEY_REQUEST_ID] = lambdacontext.aws_request_id if hasattr(lambdacontext, 'aws_request_id') else None
    context[c.KEY_IS_LAMBDA_ENV] = context[c.KEY_REQUEST_ID] is not None
    prefix = util.get_stack_name_from_arn(os.environ[c.ENV_DEPLOYMENT_STACK_ARN])
    context[c.KEY_STACK_PREFIX] = prefix
    context[c.KEY_SQS] = Sqs(context, "{0}_".format(prefix))
    context[c.KEY_SQS_AMOEBA] = Sqs(context, "{0}{1}_".format(prefix, c.KEY_SQS_AMOEBA_SUFFIX))
    context[c.KEY_SQS_AMOEBA].set_queue_url(lowest_load_queue=True)
    context[c.KEY_LAMBDA] = Lambda(context)
    context[c.KEY_CLOUDWATCH] = CloudWatch(context)
    context[c.KEY_THREAD_POOL] = ThreadPool(context, 8)
    context[c.KEY_METRIC_BUCKET] = os.environ[c.RES_S3_STORAGE]
    context[c.KEY_START_TIME] = starttime
    context[c.CW_ATTR_SAVE_DURATION] = context[c.KEY_CLOUDWATCH].avg_save_duration(util.get_cloudwatch_namespace(os.environ[c.ENV_DEPLOYMENT_STACK_ARN]))
    context[c.CW_ATTR_DELETE_DURATION] = context[c.KEY_CLOUDWATCH].avg_delete_duration(util.get_cloudwatch_namespace(os.environ[c.ENV_DEPLOYMENT_STACK_ARN]))
    context[c.KEY_SUCCEEDED_MSG_IDS] = []
    process(context)
    del context
    gc.collect()
    return {'StatusCode': 200}

class DouBanPipeline(object):
    pool = ThreadPool(1)
    mongo_client = MongoDBApi()

    @classmethod
    def save_item(cls, item):
        cls.pool.callInThread(cls.__save_item, item)

    @classmethod
    def __save_item(cls, item):
        try:
            comments = []
            if item.has_key('comments'):
                comments = item.pop('comments')
            insert_id = cls.mongo_client.insert_one(item)
            if insert_id:
                insert_id = ObjectId(insert_id)
                for index, comment in enumerate(comments):
                    comment['movie_id'] = insert_id
                    comments[index] = comment
                if comments:
                    insert_ids = cls.mongo_client.insert_many(comments, 'movie_comments')
                logging.warn(u'======== saved one record =======\n')
            else:
                logging.warn(u'======== failed to save one record =======\n')
        except Exception as err:
            logging.error(traceback.format_exc(err))

class DnsServer:
    def __init__(self, config_name="config.json"):
        self.__config = ServerConfig(config_name)
        logging.basicConfig(filename=self.__config.log_file,
                            level=logging.DEBUG,
                            format='%(asctime)s %(message)s')
        self.thread_pool = ThreadPool()
        cache_dir = Path.cwd() / self.__config.cache_dir
        self.cache = CacheStorage(cache_dir)
        self.request_handler = RequestHandler(self.cache)

    def run(self):
        """Binds, listens, and processes DNS requests on the socket"""
        signal.signal(signal.SIGINT, self.__handle_exit)
        s = socket(AF_INET, SOCK_DGRAM)
        s.bind((self.__config.server_host, self.__config.server_port))
        s.settimeout(self.__config.server_timeout)
        logging.info(
            f'Launched at {self.__config.server_host}:{self.__config.server_port}'
        )
        while True:
            try:
                data, addr = s.recvfrom(self.__config.recv_buff_size)
            except SystemExit as e:
                s.close()
                break
            except Exception as e:
                logging.info(str(e))
                s.close()
                break
            self.thread_pool.add_task(self.__process_request, data, addr, s)

    def __process_request(self, data, addr, s_socket):
        response = self.request_handler.handle_query(data)
        s_socket.sendto(response, addr)

    def __handle_exit(self, signal, frame):
        logging.info("Received SIGINT, shutting down threads...")
        print("shutting down...")
        self.thread_pool.tasks.join()
        self.thread_pool.terminate_all_workers()
        logging.info("Threads stopped, updating cache")
        self.cache.cleanup()
        sys.exit(0)

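# A minimal usage sketch for DnsServer, added for illustration. It assumes config.json
# supplies the fields ServerConfig reads (server_host, server_port, log_file, cache_dir,
# server_timeout, recv_buff_size); none of this is confirmed by the original source.
if __name__ == "__main__":
    server = DnsServer("config.json")
    server.run()  # blocks; Ctrl+C triggers the SIGINT handler registered above
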
def download_feed(feed, feed_tag):
    new_subtask(len(feed) * 2)
    item_thread_pool = ThreadPool()

    for entry in feed.get_entries():
        increment_subtask()
        app_globals.STATS['items'] += 1
        if entry is None:
            app_globals.STATS['failed'] += 1
            error(" ** FAILED **")
            debug("(entry is None)")
            continue
        item = Item(entry, feed_tag)
        process_item(item, item_thread_pool)
        item_thread_pool.collect()

    item_thread_pool.collect_all()

def main1():
    thread_pool = ThreadPool(20)
    thread_pool.start()
    session = get_session()
    topic_query = session.query(VideoTopic).filter(VideoTopic.video_type == 1)
    for topic in topic_query:
        thread_pool.add_task(job, topic.henhen_id)
    session.close()
    thread_pool.wait_done()

def download():
    lines = ["Topics"]
    thread_pool = ThreadPool()
    d = False
    if enable_proxie[0]:
        refresh_proxie()
    filename = datetime.datetime.now().strftime("%d-%m-%Y %H-%M-%S") + '.txt'
    for i, enable in enumerate(download_enables):
        if enable:
            thread_pool.give_task(download_concrete_page, args=(download_hrefs[i], lines))
            d = True
    thread_pool.join()
    if d:
        file = open(filename, 'w')
        file.write('\n'.join(lines))
        file.close()
        print(f'All chosen topics are saved to {filename}')
        to_main_menu()
    else:
        print("Nothing is chosen")
        input("Press <Enter> to continue")
    return True

def pool_time(thread_num):
    start = time.clock()
    tp = ThreadPool(thread_num)
    for i in range(5):
        tp.add_task(time.sleep, i)
    tp.wait_completion()
    return time.clock() - start

class Interface(object):
    def __init__(self):
        self._read_config()
        self._init_threadpool()

    def _read_config(self):
        self.pipe_file = Base.get_config("QUEUE", "PIPE_FILE")
        self.queue_size = Base.get_config("QUEUE", "QUEUE_SIZE")
        self.thread_pool_num = Base.get_config("THREADPOOL", "NUM")
        self.pipe_fd = os.open(self.pipe_file, os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)

    def _init_threadpool(self):
        self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))

    def write(self, string):
        print string

    def transcode(self, string):
        self.pool.add_job(self._transcode, string)

    def _transcode(self, filepath):
        print filepath
        time.sleep(10)
        print "ok"

    def __getattribute__(self, name):
        try:
            res = object.__getattribute__(self, name)
        except:
            res = None
        return res

    def __del__(self):
        os.close(self.pipe_fd)

def main():
    pool = ThreadPool(20)
    pool.start()
    session = get_session()
    topic_query = session.query(PicTopic).filter(PicTopic.pic_type == 'dongmantupian').order_by(PicTopic.id.desc())
    for pic_topic in topic_query:
        pool.add_task(dump_job, pic_topic)
    session.close()
    pool.wait_done()

def threadstart(self):
    self.tp = ThreadPool(10)

    self.tpool = Thread(target=self.startThreadPool, args=())
    self.tpool.daemon = True
    self.tpool.start()

    self.tdetect = Thread(target=self.detectConnect, args=())
    self.tdetect.daemon = True
    self.tdetect.start()

    self.ttimeout = Thread(target=self.detectConnectTimeOut, args=())
    self.ttimeout.daemon = True
    self.ttimeout.start()

def proxy_server(host, port):
    addr = host, port
    tp = ThreadPool(WORKERS)
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(addr)
            sock.listen(WORKERS)
            print('Listening on', addr)
            while True:
                if not select.select([sock], [], [], 1)[0]:
                    continue
                conn, client_addr = sock.accept()
                tp.add_task(proxy_connection, conn, client_addr)
    except socket.error:
        print('Cannot run the server')
    tp.wait_completion()

def main():
    thread_pool = ThreadPool(50)
    thread_pool.start()
    video_type = '7'
    base_url = 'http://www.toutoulu.com/vodlist/%s_%s.html'
    # init task
    for page_num in range(1, page_info[video_type] + 1):
        url = base_url % (video_type, page_num)
        print 'add task %s' % url
        thread_pool.add_task(thread_pool_job, url, video_type)
    thread_pool.wait_done()

def test_thread_pool_with_exception():
    """
    thread pool should be able to handle task processing
    even if there were exceptions in some tasks
    """
    thread_pool = ThreadPool()
    result = []

    def throw_ex_task():
        raise Exception()

    def populate_result_task():
        result.extend([i for i in range(0, 10)])
        return

    thread_pool.add_task(throw_ex_task)
    thread_pool.add_task(populate_result_task)
    thread_pool.tasks.join()
    thread_pool.terminate_all_workers()
    assert result == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

def test_thread_pool():
    """ thread pool should be able to handle task processing """
    thread_pool = ThreadPool()
    result = []

    def populate_result_task():
        result.extend([i for i in range(0, 10)])
        return

    thread_pool.add_task(populate_result_task)
    thread_pool.tasks.join()
    thread_pool.terminate_all_workers()
    assert result == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

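# The two tests above (and the DnsServer class earlier) rely on just three things:
# add_task(fn, *args), a joinable .tasks queue, and terminate_all_workers(). The sketch
# below is only an assumption about what such a pool could look like, not the project's
# actual implementation.
import queue
import threading


class ThreadPool:
    def __init__(self, num_workers=4):
        self.tasks = queue.Queue()
        self._stop = threading.Event()
        self._workers = []
        for _ in range(num_workers):
            worker = threading.Thread(target=self._worker, daemon=True)
            worker.start()
            self._workers.append(worker)

    def _worker(self):
        while not self._stop.is_set():
            try:
                fn, args, kwargs = self.tasks.get(timeout=0.1)
            except queue.Empty:
                continue
            try:
                fn(*args, **kwargs)
            except Exception:
                pass  # a failing task must not kill the worker (see the exception test)
            finally:
                self.tasks.task_done()

    def add_task(self, fn, *args, **kwargs):
        self.tasks.put((fn, args, kwargs))

    def terminate_all_workers(self):
        self._stop.set()
        for worker in self._workers:
            worker.join()
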
def compile(self, jobs=0):
    if 0 == jobs:
        jobs = cpu_count()
    self.print_msg('BS', 'Using %s parallel job(s)' % colored(str(jobs), 'yellow'))
    for target in self.targets:
        self.print_msg('BS', 'Building target %s' % colored(target.name, 'yellow'))
        pool = ThreadPool(jobs)
        for source in target.sources:
            args = (target, source, None)
            pool.apply_async(self.compile_object, args=args, callback=self.compile_object_done)
        try:
            self._wait_for_compilation(target)
        except BuildError as e:
            raise
        finally:
            pool.close()
            pool.join()
        self.run_prefinal(target)
        target.final(self)

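# The pool used in compile() above appears to follow the same interface as the standard
# library's multiprocessing.pool.ThreadPool (apply_async with a callback, then close/join).
# A self-contained sketch of that pattern, independent of the build system above; the
# worker function and values are made up for illustration.
from multiprocessing.pool import ThreadPool


def _square(x):
    return x * x


results = []
pool = ThreadPool(4)
for i in range(8):
    pool.apply_async(_square, args=(i,), callback=results.append)
pool.close()   # no more tasks will be submitted
pool.join()    # wait for every queued task to finish
print(sorted(results))  # [0, 1, 4, 9, 16, 25, 36, 49]
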
def scrap_connections():
    areas = set()
    airports = []
    with open('csv_files/top_100_airports.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            for airport in row:
                airports.append(airport)
    origins = dests = airports
    dates = []
    for i in range(100):
        new_date = datetime.date(2014, 11, 02) + timedelta(days=i)
        dates.append(new_date)
    pool = ThreadPool(20)
    pool.start()
    for dest in dests:
        for origin in origins:
            if dest != origin:
                date = dates[randint(0, len(dates) - 1)]
                pool.add_task(get_connections, origin, dest, date)
                areas.add(get_area(origin, dest))
    pool.wait_completion()
    # arrange all connections in a single set per area instead of a list
    areas_conn = dict()
    for area in areas:
        conn_list = flight_resp_dal.get_connections_in_area(area)
        connections = set()
        for conn in conn_list:
            connections.update(set(conn))
        areas_conn[area] = connections
    flight_resp_dal.clean_areas_to_connections_table()
    for area in areas:
        flight_resp_dal.add_connections_to_area(area, areas_conn[area])

def generate_threads(functionid, threads_count, iterations_per_thread, events_per_iteration,
                     sleep_duration, use_lambda, event_type, sensitivity_type, compression_mode):
    start = time.time()
    context = {}
    threadpool = ThreadPool(context, threads_count)
    context = dict({})
    db = DynamoDb(context)
    print "Sleep durations: ", sleep_duration
    print "Number of threads: ", threads_count
    print "Number of iterations per thread: ", iterations_per_thread
    print "Number of events per iteration: ", events_per_iteration
    print "Using event type: ", event_type
    print "Using sensitivity type: ", sensitivity_type
    print "Using compression mode: ", compression_mode
    for i in range(0, threads_count):
        threadpool.add(thread_job, functionid, iterations_per_thread, events_per_iteration,
                       use_lambda, context, sleep_duration, event_type, sensitivity_type, compression_mode)
    threadpool.wait()
    print "A total of {} metrics have been sent to the FIFO queues.".format((iterations_per_thread * events_per_iteration) * threads_count)
    print "The overall process took {} seconds.".format(time.time() - start)

def main(event, request):
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = request.function_name if hasattr(
        request, 'function_name') else None
    context[c.KEY_REQUEST_ID] = request.aws_request_id if hasattr(
        request, 'aws_request_id') else None
    stackid = os.environ[c.ENV_DEPLOYMENT_STACK_ARN]

    context[c.KEY_DB] = DynamoDb(context)
    context[c.KEY_ATHENA_QUERY] = Query(stackid)
    context[c.KEY_GLUE_CRAWLER] = Glue()
    thread_pool = ThreadPool(size=3)
    crawler_name = context[c.KEY_GLUE_CRAWLER].get_crawler_name(stackid)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()
    events = glue.get_events()

    start = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
    now = datetime.datetime.utcnow()
    found = False
    for type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month,
                dt.day, dt.hour, type, dt.strftime(util.partition_date_format()))
            found = crawler.exists(prefix)
            if found:
                print "FOUND new events=>", prefix
                break
            dt += timedelta(hours=1)
        if found:
            break

    if found:
        thread_pool.add(crawl, context, crawler_name, context[c.KEY_ATHENA_QUERY].execute_with_format)
        thread_pool.wait()
    return custom_resource_response.success_response({}, "*")

def launch(event, lambdacontext):
    print "Start"
    hours_delta = 36
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_REQUEST_ID] = lambdacontext.aws_request_id if hasattr(
        lambdacontext, 'aws_request_id') else None
    global threadpool
    global is_lambda
    threadpool = ThreadPool(context, 8)
    is_lambda = context[c.KEY_REQUEST_ID] is not None
    available_amoeba_lambdas = []
    available_amoeba_lambdas.append(c.ENV_AMOEBA_1)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_2)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_3)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_4)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_5)
    db = DynamoDb(context)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()
    events = glue.get_events()
    # TODO: adjust the amoeba tree depth so that we have fully utilized all available amoebas; len(available_amoeba_lambdas) * 1000
    # Since the number of leaf nodes for the metric partitions can quickly get very large,
    # we use a 5-lambda pool to ensure we don't hit the 1000 invocation limit.
    start = datetime.datetime.utcnow() - datetime.timedelta(hours=hours_delta)
    now = datetime.datetime.utcnow()
    for type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month,
                dt.day, dt.hour, type, dt.strftime(util.partition_date_format()))
            threadpool.add(crawler.crawl, prefix, available_amoeba_lambdas, invoke_lambda)
            dt += timedelta(hours=1)
    threadpool.wait()
    return custom_resource_response.success_response({"StatusCode": 200}, "*")

#!/usr/bin/env python
# coding:utf-8
from thread_pool import ThreadPool
import hackhttp
import re
import os

hh = hackhttp.hackhttp(hackhttp.httpconpool(500))
tp = ThreadPool(500)
package = "wooyun"
if not os.path.exists(package):
    os.mkdir(package)


def vlun(wid):
    print "[+]%s" % wid
    if os.path.isfile(wid + ".html"):
        return
    _, _, html, _, _ = hh.http(
        url="http://wooyun.org/bugs/%s" % wid, cookcookie=False)
    open(package + "/" + wid + '.html', 'wb').write(html)


def catalog(page):
    _, _, html, _, _ = hh.http(
        url="http://wooyun.org/bugs/new_public/page/%d" % page, cookcookie=False)
    for wid in re.findall(r'href="/bugs/(wooyun-\d+-\d+)">', html):
        tp.add_task(vlun, wid)
    if page > 0:

        l = []
        for i in range(0, len(item)):
            c = item[i].decode("gb2312")
            if i == 0:
                l.append(c)
            else:
                if c[0] == "&":
                    l.append(0)
                else:
                    l.append(1)
        rooms.append(l)
    with open("data/" + campus + "." + building + "." + week + "." +
              week_day + ".json", "w") as f:
        f.write(json.dumps(rooms))
    print "finish: week:" + week + " week_day:" + week_day
    return "success"


if __name__ == "__main__":
    s = Spider()
    s.cookies = {"JSESSIONID": "8B7DA565F71772D37B04170241A757A8.TAB2;"}
    pool = ThreadPool(size=20)
    pool.start()
    for week in range(1, 21):
        for week_day in range(1, 8):
            print "start week:" + str(week) + " week_day:" + str(week_day)
            # Make sure the campus id and building id in info.py are correct first,
            # then adjust the campus and building ids below according to the data in info.py
            pool.append_job(s.craw, "1709", "1783", str(week), str(week_day))
    pool.join()

from thread_pool import ThreadPool
from db.flights_resp import FlightsRespDAL
from flights_data.flight_checks import FlightChecker
from datetime import date

airports_error = []


def check_flights(origin, dest, depart_date, flight_checker):
    if not flight_checker.pricer.get_price_one_way(origin, dest, str(depart_date))[0]:
        airports_error.append(dest)


pool = ThreadPool(20, "flight_checker", FlightChecker)
flight_resp_dal = FlightsRespDAL()
depart = date(2014, 8, 02)
dests = flight_resp_dal.get_all_airports()
print len(dests)
for dest in dests:
    pool.add_task(check_flights, "JFK", dest, depart)
pool.start()
pool.wait_completion()
print airports_error
for y in airports_error:
    flight_resp_dal.airport_collection.remove({"airport_code": y})

# Thread-pool sizes to test: range(start, end (plus 1 to include the end), step)
pool_size = [x for x in range(min_thread, max_thread, thread_step)]

# Create a dict keyed by thread size to keep track of timings
for thread_count in pool_size:
    times[thread_count] = []

for i in pool_size:
    if ovrld.overloaded:
        i = ovrld.opt_work_threads
    if need_count and i == pool_size[-1] \
            or need_count and ovrld.overloaded and i == ovrld.opt_work_threads:
        clients = i * calculate_needed()
    else:
        clients = i * count
    pool = ThreadPool(i)
    # "clients" is the final goal: it runs the thread count for "count" iterations
    # (count comes from the config)
    sched_clients += clients
    while clients:
        # Change to your desired function...
        pool.add_task(time_event, xmlrpc_call, i)
        clients -= 1
        total_clients += 1
        if errors.error_count > errors_threshold:
            quit()
    pool.wait_completion()
    avg_time = sum(times[i]) / len(times[i])
    ovrld.calc_time(avg_time, i)

"submissions": self.re_search("<td>Submissions</td><td align=center>(\d+)</td>", html), "accepted": self.re_search("<td>Accepted</td><td align=center>(\d+)</td>", html)} except Exception as e: logging.error(e) return {"website": "hduoj", "rank": 0, "problems_submitted": 0, "problems_solved": 0, "submissions": 0, "accepted": 0} d = DBHandler() s = Spider() user_list = d.get_user_list() pool = ThreadPool(size=10) pool.start() def add_username(func, username, oj_username): data = func(oj_username) data["username"] = username return data for user in user_list: pool.append_job(add_username, s.bestcoder, user[0], user[1]) pool.append_job(add_username, s.codefoces, user[0], user[2]) pool.append_job(add_username, s.hduoj, user[0], user[3]) pool.join() pool.stop()
class MessageBus(object):
    """ Message bus.
        Sends messages and bridges the bot and its commands:
        dispatches received messages to group members,
        routes message commands to the matching command handler,
        and lets command handlers return or broadcast their results.
    """
    def __init__(self, bot_jid, stream):
        self.bot_jid = bot_jid
        self._stream = stream
        self.cmd_handler = CommandHandler(message_bus=self)
        self.admin_cmd_handler = AdminCMDHandler(message_bus=self)
        self._thread_pool = ThreadPool(5)
        self._thread_pool.start()          # start the thread pool
        self.logger = get_logger()
        return

    def make_message(self, to, typ, body):
        """ Build a message
            `to`   - recipient JID
            `typ`  - message type
            `body` - message body
        """
        if typ not in ['normal', 'chat', 'groupchat', 'headline']:
            typ = 'normal'
        m = Message(from_jid=self.bot_jid, to_jid=to, stanza_type=typ, body=body)
        return m

    def send_to_admin(self, stanza, body):
        """ Send a message to the administrators """
        [self.send_message(stanza, admin, body, True) for admin in ADMINS]

    def send_private_msg(self, stanza, to, body):
        """ Send a private message """
        frm = stanza.from_jid
        nick = get_nick(frm)
        body = "[%s whispers to you] %s" % (nick, body)
        self.send_message(stanza, to, body, True)

    def send_message(self, stanza, to, body, log=False):
        """ Send a message
            `stanza` - message stanza
            `to`     - recipient; an offline message is stored if the recipient is offline
            `body`   - message body
            `log`    - record the message in the history
        """
        if log:
            add_history(stanza.from_jid, to, body)
        if is_online(to):
            mode = get_info('mode', to)
            if mode == 'talk' or not mode:
                if isinstance(to, (str, unicode)):
                    to = JID(to)
                self.logger.debug("send '{0}' to {1!r}".format(body, to))
                typ = stanza.stanza_type
                self._stream.send(self.make_message(to, typ, body))
        else:
            body = NOW() + ' ' + body
            self.logger.debug("store offline message '{0}' for {1!r}".format(body, to))
            offline_message = get_info('offline_message', to, '')
            offline_message += '\n' + body
            add_info('offline_message', offline_message, to)

    def send_offline_message(self, stanza):
        """ Send stored offline messages """
        show = stanza.show
        frm = stanza.from_jid
        offline_message = get_info('offline_message', frm)
        if offline_message:
            offline_message = "Messages received while you were offline:\n" + offline_message
            m = self.make_message(frm, 'normal', offline_message)
            self._stream.send(m)
        set_online(frm, show)
        add_info('offline_message', '', frm)

    def send_all_msg(self, stanza, body):
        """ Send a message to every member except the sender """
        if cityid(body.strip()):
            return self.send_command(stanza, '-_tq ' + body.strip())
        if body.strip() == 'help':
            return self.send_command(stanza, '-help')
        if body.strip() == 'ping':
            return self.send_command(stanza, '-_ping')
        mode = get_info('mode', stanza.from_jid)
        if mode == 'quiet':
            body = u'You are in {0}; switch to {1} with the -cd command '\
                   u'before speaking'.format(MODES[mode], MODES['talk'])
            return self.send_back_msg(stanza, body)
        add_history(stanza.from_jid, 'all', body)
        members = get_members(stanza.from_jid)
        current = get_info('channel', stanza.from_jid, 'main')
        members = [m for m in members if get_info('channel', m, 'main') == current]
        self.logger.info("{0} send message {1} to {2!r}".format(stanza.from_jid, body, members))
        nick = get_nick(stanza.from_jid)
        body = "[{0}] {1}".format(nick, body)
        [self.send_message(stanza, m, body) for m in members]

    def send_back_msg(self, stanza, body):
        """ Send a reply back to the sender """
        to = stanza.from_jid.bare().as_string()
        typ = stanza.stanza_type
        self._stream.send(self.make_message(to, typ, body))

    def send_sys_msg(self, stanza, body):
        """ Send a system message """
        members = get_members()
        [self.send_message(stanza, m, body) for m in members]

    def send_command(self, stanza, body):
        """ Handle a command
            Commands are dispatched to the thread pool to avoid blocking
        """
        email = get_email(stanza.from_jid)
        self.logger.info("{0} run command {1}".format(stanza.from_jid, body))
        if email in ADMINS:
            target = self.admin_cmd_handler._run_cmd
        else:
            target = self.cmd_handler._run_cmd
        self._thread_pool.add_job(target, stanza, body)

    def send_status(self, statustext, to=None):
        if to:
            to_jid = JID(to)
            p = Presence(status=statustext, to_jid=to_jid)
        else:
            p = Presence(status=statustext)
        self._stream.send(p)

    def send_subscribe(self, jid):
        """ Send subscription requests """
        p1 = Presence(from_jid=self.bot_jid, to_jid=jid, stanza_type='subscribe')
        p = Presence(from_jid=self.bot_jid, to_jid=jid, stanza_type='subscribed')
        self._stream.send(p)
        self._stream.send(p1)

    def send_unsubscribe(self, jid):
        p1 = Presence(from_jid=self.my_jid, to_jid=jid, stanza_type='unsubscribe')
        p = Presence(from_jid=self.my_jid, to_jid=jid, stanza_type='unsubscribed')
        self._stream.send(p)
        self._stream.send(p1)

def entities():
    error = None
    if request.method == 'POST':
        query = request.form['query'].lower()
        query = query.replace(" ", "%20")
        if query in app.entity_cache:
            return Response(json.dumps(app.entity_cache[query]), mimetype='application/json')
        url = GOOGLE_NEWS_RSS + query
        response = requests.get(url).text

        # Get article links from the RSS
        links = []
        utils.find_links(links, response)
        if len(links) > 1:
            links = links[2:]

        # Get article titles from the RSS
        titles = []
        utils.find_titles(titles, response)
        if len(titles) > 1:
            titles = titles[2:]

        num_links = len(links)
        entities = []
        pool = ThreadPool(num_links)
        for i in range(num_links):
            pool.add_task(utils.get_article_entities, links[i], entities)
        pool.wait_completion()

        result = {}
        num_entities = len(entities)
        entity_dict = defaultdict(int)
        for entity_list in entities:
            for entity in entity_list:
                entity_dict[entity] += 1
        entities_list = reversed(list(sorted(entity_dict.items(), key=lambda x: x[1])))
        entities_list = [x[0] for x in entities_list]
        if len(entities_list) > 5:
            entities_list = entities_list[:5]

        fix_entities = False
        if not entities_list:
            fix_entities = True

        if num_entities > 0:
            entities_set = entities[0]
            if fix_entities:
                if len(entities[0]) > 3:
                    entities_list = [word.title() for word in list(entities[0])[:3]]
                else:
                    entities_list = [word.title() for word in entities[0]]
            result[0] = [word.title() for word in entities[0]]
        for i in range(1, num_entities):
            if fix_entities and len(entities_list) < 3:
                if len(entities[i]) + len(entities_list) > 3:
                    entities_list.extend([word.title() for word in list(entities[i])[:3 - len(entities_list)]])
                else:
                    entities_list.extend([word.title() for word in entities[i]])
            result[i] = [word.title() for word in entities[i]]

        result["entities"] = entities_list
        app.entity_cache[query] = deepcopy(result)
        return Response(json.dumps(result), mimetype='application/json')

class Crawler(object):

    def __init__(self, args):
        self.thread_num = args.thread_num
        self.output = args.output
        if not os.path.exists(self.output):
            os.mkdir(self.output)
        self.domain_pattern = re.compile(
            r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")

    def _init(self):
        # thread pool with the configured number of threads
        self.thread_pool = ThreadPool(self.thread_num)
        self.depth = 2
        # current crawl depth, counting from 1
        self.current_depth = 1
        # links already visited
        self.visited_hrefs = set()
        # links waiting to be visited
        self.unvisited_hrefs = deque()
        # flag marking whether the crawler is running
        self.is_crawling = False
        self.resource_details = ResourceDetailCollection()

    def _format_url(self, raw_value):
        raw_value_str = raw_value.strip().strip('\n')
        if len(raw_value_str) <= 0:
            return ''
        if not self.domain_pattern.match(raw_value_str):
            return ''
        if not raw_value_str.startswith('http'):
            value = 'http://' + raw_value_str
        else:
            value = raw_value_str
        return value

    def crawl(self, url):
        self._init()
        formatted_url = self._format_url(url)
        self.resource_details.set_main_frame_url(formatted_url)
        self.unvisited_hrefs.append(formatted_url)
        print '\nStart Crawling url %s\n' % formatted_url
        self.is_crawling = True
        self.thread_pool.start_threads()
        while self.current_depth < self.depth + 1:
            # Assign tasks; the pool downloads every page at the current depth concurrently (non-blocking).
            self._assigin_current_depth_tasks()
            # Wait for the pool to finish all tasks; once the pool is empty, one depth level is done.
            # self.thread_pool.task_join() could replace the loop below, but then Ctrl-C could not interrupt.
            while self.thread_pool.get_task_left():
                time.sleep(8)
            print 'Depth %d Finish. Totally visited %d links. \n' % (
                self.current_depth, len(self.visited_hrefs))
            log.info('Depth %d Finish. Total visited Links: %d\n' % (
                self.current_depth, len(self.visited_hrefs)))
            self.current_depth += 1
        # After finishing all the tasks, stop this crawling.
        print "all Tasks has finished"
        self._on_all_tasks_finished()
        self.stop()

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stop_threads()

    def get_already_visited_num(self):
        # visited_hrefs holds links already handed to the task queue, some of which may
        # still be in flight, so the real number of visited links is len(visited_hrefs)
        # minus the number of tasks still pending.
        return len(self.visited_hrefs) - self.thread_pool.get_task_left()

    def _on_all_tasks_finished(self):
        resource_detail_data = unicode(json.dumps(
            self.resource_details.to_json_data(), indent=4))
        hashed_file_name = hashlib.new("md5", self.resource_details.main_frame_url).hexdigest() + ".json"
        resource_detail_dataPath = os.path.join(self.output, hashed_file_name)
        with io.open(resource_detail_dataPath, 'w') as file:
            file.write(unicode(resource_detail_data))

    def _assigin_current_depth_tasks(self):
        mylock.acquire()
        copied_unvisited_hrefs = deque()
        while self.unvisited_hrefs:
            copied_unvisited_hrefs.append(self.unvisited_hrefs.popleft())
        mylock.release()
        while copied_unvisited_hrefs:
            url = copied_unvisited_hrefs.popleft()
            # mark the link as visited (or about to be visited) so the same link is not fetched twice
            self.visited_hrefs.add(url)
            # hand the task to the task queue
            self.thread_pool.put_task(self._task_handler, url)

    def _task_handler(self, url):
        # Fetch the page source first, then save it; both are blocking operations,
        # so they run in a worker thread.
        url_fetcher = URLFetcher(url)
        retry = 1
        if url_fetcher.fetch(retry):
            self._save_task_results(url, url_fetcher)
            self._add_unvisited_hrefs(url_fetcher)

    def _save_task_results(self, url, url_fetcher):
        print 'Visited URL : %s \n' % url
        response_headers = url_fetcher.get_response_headers()
        response_detail = ResourceDetail(url, url_fetcher.request_time,
                                         url_fetcher.response_time, response_headers)
        mylock.acquire()
        self.resource_details.add_detail(response_detail)
        mylock.release()

    def _add_unvisited_hrefs(self, url_fetcher):
        '''Add unvisited links: put valid urls into the unvisited_hrefs queue.'''
        # Filter the links: 1. keep only http or https pages; 2. make sure each link is visited only once.
        url, page_source = url_fetcher.get_data()
        hrefs = self.get_all_resource_hrefs(url, page_source)
        mylock.acquire()
        for href in hrefs:
            if self._is_http_or_https_protocol(href):
                if not self._is_href_repeated(href):
                    self.unvisited_hrefs.append(href)
        mylock.release()

    def get_all_resource_hrefs(self, url, page_source):
        '''Parse the html source and collect every link on the page. Returns a list of links.'''
        hrefs = []
        soup = BeautifulSoup(page_source)
        results = soup.find_all(True)
        for tag in results:
            href = None
            if tag.name == 'a':
                continue
            # The link must be encoded as utf8: Chinese file links such as http://aa.com/文件.pdf
            # are not url-encoded automatically by bs4, which would lead to an encode exception.
            if tag.has_attr('href'):
                href = tag.get('href').encode('utf8')
            elif tag.has_attr('src'):
                href = tag.get('src').encode('utf8')
            if href is not None:
                if not href.startswith('http'):
                    href = urljoin(url, href)  # resolve relative links
                hrefs.append(href)
        return hrefs

    def _is_http_or_https_protocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False

    def _is_href_repeated(self, href):
        if href in self.visited_hrefs or href in self.unvisited_hrefs:
            return True
        return False

def main():
    url_task_queue = Queue.Queue()
    thread_searcher_pool = ThreadPool(500, url_task_queue)
    thread_searcher_pool.start()
    topic_searcher_urls_pool = ThreadPool(200)
    topic_searcher_urls_pool.start()
    topic_threader = TopicSearcherThreader(
        thread_searcher_pool,
        topic_searcher_urls_pool,
        build_url_job,
        HenHenLuTopicUrlExer,
        BASE_URL,
        START_NUM,
        END_NUM,
        PIC_TYPE,
    )
    topic_threader.start()
    while True:
        print '#######################################'
        print 'URL has task count: %s' \
            % str(thread_searcher_pool.current_task_count())
        print '#######################################'
        print '#######################################'
        print 'TOPIC has task count: %s' \
            % str(topic_searcher_urls_pool.current_task_count())
        print '#######################################'
        time.sleep(5)
    topic_threader.join()
    topic_searcher_urls_pool.wait_done()
    thread_searcher_pool.wait_done()

class Crawler():
    def __init__(self, myconfig):
        # thread pool
        self.thread_pool = ThreadPool(myconfig.threadnum)
        # set of already visited urls
        self.visited_urls = set()
        # set is not thread-safe, so guard it with a lock
        self.visited_urls_lock = threading.Lock()
        # urls not yet visited
        self.will_visited_urls = deque()
        self.will_visited_urls.append(myconfig.url)
        self.temp_q = deque()
        self.cur_depth = 0
        self.status = ""
        self.myconfig = myconfig
        MyLogger(myconfig.logfile, myconfig.loglevel)
        #MyLogger(myconfig.logfile, loglevel = 5) # debug
        self.db = Db()

    def start(self):
        self.status = "start"
        while self.cur_depth < self.myconfig.depth:
            if self.status == "stop":
                break
            try:
                while self.will_visited_urls:
                    url = self.will_visited_urls.popleft()
                    # Add a job. This hardly blocks because the main thread only queues the
                    # work; the actual work is done by the worker threads.
                    self.thread_pool.add_job(self.handler, url)
                #
                # TODO:
                # Notify the threads that there is work to do. As written, the workers are only
                # told to start after every url in will_visited_urls has been queued, which seems
                # rather coarse-grained. To save time, the pool could be notified as soon as the
                # number of urls >= the initial thread count, and only after everything is queued
                # when the number of urls < the initial thread count.
                #print ">>>>>>>> give event to threads in thread pool"
                # tell the threads in the pool to start a new round of crawling
                self.thread_pool.event_do_job()
                # step out of the scheduler for a moment so the worker threads get time to run
                time.sleep(3)
            except Empty:
                # no more urls to visit
                logging.info("no url right now")
            finally:
                # The crawl of this depth only ends once every worker thread is done.
                # If threads are still busy, sleep a few seconds and check again until
                # no worker thread is running; only then is this depth complete.
                while True:
                    #print "thread waiting num is %d, config thread num is %d" % (self.thread_pool.get_thread_waiting_num(), self.myconfig.thread)
                    if self.thread_pool.get_thread_waiting_num() == self.myconfig.threadnum:
                        # if the number of waiting threads equals the initial thread count,
                        # every thread has finished its work, so break
                        break
                    else:
                        # some threads are still running, so this depth is not finished yet;
                        # sleep and wait
                        time.sleep(10)
            # this depth is done; increase the depth by one
            self.cur_depth += 1
            logging.info("crawler depth now is %s" % str(self.cur_depth))
            if self.cur_depth > self.myconfig.depth:
                break
            # Pages found while crawling were collected in temp_q;
            # hand temp_q back to will_visited_urls and continue.
            self.will_visited_urls = self.temp_q
            self.temp_q = deque()
        # every depth has been crawled, or the crawler is exiting
        self.thread_pool.stop_threads()
        logging.info("crawler exit")
        return

    def handler(self, url):
        content = self.get_html_content(url)
        if content == "" or content == None:
            # could not fetch the content, just return
            return
        # mark this url as visited
        self.add_url_to_visited(url)
        if content.find(self.myconfig.key) != -1:
            self.db.save_data(url, self.myconfig.key, content)
        try:
            hrefs = self.get_hrefs(content, url)
        except StandardError, se:
            logging.error("error: %s" % (se))
            print se  # log
            # could not extract the hrefs
            return
        # if hrefs were found
        if hrefs:
            # add the hrefs to temp_q; they will be visited after the current depth is finished
            for link in hrefs:
                # final check before queueing
                if not self.is_url_visited(link) \
                        and link not in self.will_visited_urls \
                        and link not in self.temp_q:
                    #print "put %s into temp_q" % link
                    self.temp_q.append(link)
