async def process_partitions_queue(
    loop: asyncio.BaseEventLoop,
    partitions_queue: asyncio.Queue,
    results_queue: asyncio.Queue,
    server_address: URL,
    mission_template: Template,
    mission_loader: str,
    width: int,
    scale: int,
) -> Awaitable[None]:
    mission_name = mission_loader.split('/', 1)[0]

    async with aiohttp.ClientSession() as http:
        while True:
            partition = await partitions_queue.get()

            if partition is None:
                partitions_queue.task_done()
                return

            await process_partition(
                loop=loop,
                results_queue=results_queue,
                server_address=server_address,
                http=http,
                partition=partition,
                mission_template=mission_template,
                mission_loader=mission_loader,
                mission_name=mission_name,
                width=width,
                scale=scale,
            )
            partitions_queue.task_done()
async def udp_writer(s: socket, oqueue: Queue) -> None:
    """Forward packets to the UDP socket."""
    while True:
        peer, data = await oqueue.get()
        try:
            s.sendto(data, peer)
        finally:
            oqueue.task_done()
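# A minimal sketch of how a writer task like the one above could be driven;
# the socket setup, the peer address, and the payload are illustrative
# assumptions, not part of the original snippet.
import asyncio
import socket

async def main() -> None:
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    s.setblocking(False)
    oqueue: asyncio.Queue = asyncio.Queue()

    writer = asyncio.create_task(udp_writer(s, oqueue))

    # Producers enqueue (peer, payload) pairs.
    await oqueue.put((("127.0.0.1", 9999), b"ping"))

    # join() returns once every enqueued packet has been marked task_done().
    await oqueue.join()
    writer.cancel()

asyncio.run(main())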
async def call(loop, inq: asyncio.Queue):
    while True:
        v = await inq.get()
        logger.debug("consume[S] v:%s", v)
        if v is None:
            inq.task_done()
            break
        v = await afn(v)
        logger.debug("consume[E] v:%s", v)
        inq.task_done()
    await inq.join()
    logger.debug("consume[CLOSE]")
async def __call__(self, inq: asyncio.Queue):
    while True:
        v = await inq.get()
        logger.debug("aggregate[S] v:%s", v)
        if v is None:
            inq.task_done()
            break
        await asyncio.sleep(0.1, loop=self.loop)
        print(v)
        logger.debug("aggregate[E] v:%s", v)
        inq.task_done()
    await inq.join()
    logger.debug("aggregate[CLOSE]")
async def call(loop, inq: asyncio.Queue, outq: asyncio.Queue):
    while True:
        v = await inq.get()
        logger.debug("communicate[S] v:%s", v)
        if v is None:
            inq.task_done()
            break
        v = await afn(v)
        logger.debug("communicate[E] v:%s", v)
        await outq.put(v)
        inq.task_done()
    await inq.join()
    await outq.put(None)
    logger.debug("communicate[CLOSE]")
async def __call__(self, inq: asyncio.Queue, outq: asyncio.Queue):
    while True:
        v = await inq.get()
        logger.debug("communicate[S] v:%s", v)
        if v is None:
            inq.task_done()
            break
        await asyncio.sleep(0.1, loop=self.loop)
        v = v * v
        logger.debug("communicate[E] v:%s", v)
        await outq.put(v)
        inq.task_done()
    await inq.join()
    await outq.put(None)
    logger.debug("communicate[CLOSE]")
async def process_results_queue(
    results_queue: asyncio.Queue,
    total_points: int,
    output_file_path: Path,
) -> Awaitable[None]:
    point_size = calcsize(HEIGHT_PACK_FORMAT)
    output_size = point_size * total_points
    natural_size = humanize.naturalsize(
        output_size, binary=True, format='%.3f',
    )
    LOG.debug(f"output size: {natural_size}")

    processed_points = 0

    output_file_path.parent.parent.mkdir(parents=True, exist_ok=True)

    with output_file_path.open('wb') as f:
        f.truncate(output_size)

        while True:
            data = await results_queue.get()

            if not data:
                results_queue.task_done()
                return

            partition, values = data
            start = partition.start * point_size

            processed_points += (partition.end - partition.start) + 1
            progress = (processed_points / total_points) * 100

            LOG.debug(
                f"gather results for range "
                f"[{partition.start}:{partition.end}], "
                f"progress: {progress:.2f}%"
            )

            f.seek(start)
            f.write(values)

            results_queue.task_done()
async def kafka_producer(client: Producer, conf: DotDict, queue: Queue) -> None:
    """Async producer for Kafka. Pulls messages from the queue."""
    while True:
        msg = await queue.get()
        client.produce(
            conf.kafka_topic,
            key=conf.page_url,
            value=json.dumps(msg),
            on_delivery=_ack_handler,
        )
        client.poll(0)
        queue.task_done()
async def InterfaceSender(Client: IoTHubModuleClient, InterfaceOut: asyncio.Queue):
    try:
        while True:
            data = await InterfaceOut.get()
            print('Interface sender: Message to send.', data)
            msg = json.dumps(data)
            msg = Message(msg)
            try:
                await Client.send_message_to_output(msg, 'InterfaceOut')
                InterfaceOut.task_done()
            except Exception as ex:
                print('Interface sender: Unexpected error in sender: {}'.format(ex))
        print('Interface sender: Finished sending')
    except asyncio.CancelledError:
        print('Interface sender: Task cancelled')
async def worker(name, in_q: asyncio.Queue, output: asyncio.Queue):
    print(f'worker {name} started')
    while True:
        paperid, authorids = await in_q.get()
        if paperid is None:
            in_q.task_done()
            break
        authors = dict()
        for aid in authorids:
            for pos in db.postlist(aid):
                doc = db.get_document(pos.docid)
                data: bytes = doc.get_data()
                authors[aid] = data.decode('utf-8')
        result = [authors.get(int(i), 'MISSING_DATA') for i in authorids]
        s = json.dumps({'PaperId': paperid, 'Author': {'set': result}}) + '\n'
        await output.put(s)
        in_q.task_done()
    print(f'worker {name} stopping')
async def independent_task(queue: asyncio.Queue):
    """This task is independent of its parent (it sends no data back;
    think of something like logging).
    """
    print("starting the child")
    val = await queue.get()
    while val is not None:
        print("Received is %s and processing data" % str(val))
        # Processing takes some time, to demonstrate the queue's capacity.
        await asyncio.sleep(0.5)
        print("Received data processed")
        queue.task_done()
        val = await queue.get()
    # Mark the final None value that terminated the loop as done.
    queue.task_done()
    print("The client is done here")
async def worker(name, in_q: asyncio.Queue, output: asyncio.Queue):
    s = get_async_localhost_session()
    print(f'worker {name} started')
    while True:
        x = await in_q.get()
        if x is None:
            in_q.task_done()
            break
        expr = f'search(mag_papers,q=ConferenceSeriesId:{x["ConferenceSeriesId"]},fl=PaperId, sort="PaperId asc",qt=/export)'
        async with s.collection('mag_papers').stream.expr(expr) as resp:
            response = await resp.json()
        paperids = [doc['PaperId'] for doc in response['result-set']['docs'][:-1]]
        conference_series_name = x['DisplayName']
        for paper in paperids:
            await output.put(
                json.dumps({'PaperId': paper,
                            'ConferenceSeries': {'set': conference_series_name}}) + '\n')
        in_q.task_done()
    print(f'worker {name} stopping')
    await s.close()
async def token_consumer(in_q: asyncio.Queue):
    async with aiofile.AIOFile(sys.argv[3], 'wb') as f:
        write = aiofile.Writer(f)
        eos = False
        while not eos:
            tokens = deque([await in_q.get()])
            while not in_q.empty():
                token = await in_q.get()
                tokens.append(token)
            if tokens[-1] is OES:
                eos = True
                tokens.pop()
            if tokens:
                transform_tasks = [transform_token(t) for t in tokens]
                transformed = await asyncio.gather(*transform_tasks)
                await write(b''.join(transformed))
            in_q.task_done()
        await f.fsync()
async def content_consumer(worker_id: int, queue: asyncio.Queue, tmpdir: str) -> None:
    """Save content to local storage."""
    while True:
        content = await queue.get()
        # Add timeout to see how it works
        if os.environ.get("DEBUG", False):
            await asyncio.sleep(1 + (1 * worker_id))
        filename = os.path.join(tmpdir, f"async_{str(uuid.uuid4())}.mov")
        async with aiofiles.open(filename, "wb") as video_file:
            await video_file.write(content)
        logger.debug(f"[WORKER {worker_id}] Finished writing {filename}")
        queue.task_done()
async def downloader(db: DB, info_queue: Queue):
    """Image downloader for collections."""
    logger.info(f"Task {asyncio.current_task().get_name()} started")
    while True:
        collection_number, collection_name, url_list = await info_queue.get()
        logger.info(f"Start downloading collection {collection_name}, "
                    f"{len(url_list)} images in total")
        # Number of images that failed to download for this collection;
        # if it reaches 10 the whole collection is treated as failed.
        fail_count = 0
        for img_url in url_list:
            await sleep(random.uniform(.5, 2.5))
            file_name = img_url.split("/")[-1]
            # Sanitize the directory name by removing illegal path characters.
            for char in invalid_chars_in_path:
                if char in collection_name:
                    collection_name = collection_name.replace(char, "")
            dir_path = os.path.join(DL_PATH, collection_name)
            if not os.path.exists(dir_path):
                try:
                    os.mkdir(dir_path)
                except NotADirectoryError:
                    os.mkdir(os.path.join(DL_PATH, "unknown"))
            file_path = os.path.join(dir_path, file_name)
            # Skip files that have already been downloaded.
            if os.path.exists(file_path):
                continue
            try:
                await dl_session.get(img_url, file_path=file_path)
                logger.debug(f"{file_path} downloaded")
            except ConnectionError:
                fail_count += 1
        if fail_count < 10:
            db.update_picture_status(collection_number, 1)
            info_queue.task_done()
        else:
            logger.warning(f"Collection {collection_name} failed: "
                           f"too many image downloads failed")
async def bouquet_designs_consumer(
    bouquet_designs_queue: asyncio.Queue,
    flowers_queue: asyncio.Queue,
    bouquets_queue: asyncio.Queue,
):
    """Taking care of bouquets and their state."""
    worker_id = int(random.random() * 1000)
    bd_str = await bouquet_designs_queue.get()
    bd = BouquetDesign.from_str(bd_str)
    logger.info(
        f"Processing of a bd {repr(bd_str)} "
        f"started by worker id {repr(worker_id)}..."
    )
    bouquet = Bouquet(name=bd.name, design=bd)
    while True:
        fl_str = await flowers_queue.get()
        logger.debug(
            f"Flower {repr(fl_str)} received by "
            f"worker id {repr(worker_id)}"
        )
        flower = Flower.from_str(fl_str)
        try:
            await bouquet.use(
                flower, additional_debug_str=f"Worker id {repr(worker_id)}"
            )
        except KeyError:
            # If the flower is not compatible with the design,
            # put it back on the queue and allow the context to
            # switch to another task.
            await flowers_queue.put(fl_str)
            logger.debug(
                f"Worker id {repr(worker_id)} Flower {repr(fl_str)}"
                f" returned to the queue"
            )
            await asyncio.sleep(0)
        else:
            if bouquet.is_ready:
                logger.info(
                    f"Bouquet {repr(bouquet.to_str())} produced by worker"
                    f" {repr(worker_id)}"
                )
                bouquet_designs_queue.task_done()
                await bouquets_queue.put(bouquet.to_str())
                asyncio.current_task().cancel()
async def consumer(q: asyncio.Queue, name):
    """Consumer implementation."""
    progress = tqdm(desc=f'consumer #{name}', leave=False)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        while True:
            url = await q.get()
            if url_is_fetched(url):
                q.task_done()
                continue
            data = await fetch(url, session)
            with open(get_filename_for_write(url), 'w', encoding='utf-8') as file:
                file.write(data.decode('utf-8'))
            progress.update()
            q.task_done()
async def worker(self, queue: asyncio.Queue):
    if self.use_requests:
        await self._requests_worker(queue)
    else:
        async with aiohttp.ClientSession(
                raise_for_status=True,
                headers=[("User-Agent", ua.random)],
                timeout=aiohttp.ClientTimeout(total=60)) as sess:
            while True:
                url = await queue.get()
                try:
                    es.Page.get(id=url)
                    log.info('page existed, skip {}'.format(url))
                except elasticsearch.NotFoundError:
                    try:
                        # resp, html = await fetch.get(url)
                        async with sess.get(url) as resp:
                            log.info('page fetching {}'.format(url))
                            html = await resp.text()
                            log.info('page downloaded {}'.format(url))
                            self.parse(url, resp, html)
                            await asyncio.sleep(3)
                            log.info('page scraped {}'.format(url))
                    except aiohttp.ClientResponseError as e:
                        page = es.Page(
                            from_url=url,
                            resolved_url=str(e.request_info.real_url),
                            http_status=e.status,
                        )
                        page.save()
                        log.info("fetch error & skipped: {}".format(e))
                        log.error(e)
                        self.error_urls.append(url)
                    except Exception as e:
                        log.info("scrape internal error & skipped: {}".format(e))
                        log.error(e)
                        self.error_urls.append(url)
                except Exception as e:
                    log.info("scrape internal error & skipped: {}".format(e))
                    log.error(e)
                    self.error_urls.append(url)
                finally:
                    queue.task_done()
async def LogFileDict_toList(self, queue_df: asyncio.Queue,
                             Logfile_List: Optional[list] = None) -> str:
    """
    :param self: Cisco_Function new instance
    :param Logfile_List: List which stores each line of the Cisco firewall config
    """
    # Version 2.0 (old): appending row by row was a serious performance problem:
    #   for item in Logfile_List:
    #       self.df_cisco = self.df_cisco.append(
    #           self.Analyze_CiscoContent(each_content=item), ignore_index=True)
    # Collecting the generated dicts in a list first improved this
    # from ~34062 ms to ~34 ms (about 1000x).
    # Version 2.1 (new): store each config line as a dict and build a new DataFrame.
    while True:
        insert_dict = await queue_df.get()
        queue_df.task_done()
        if queue_df.empty() and insert_dict == "readline_complete":
            return "Finished! - LogFileDict_toList"
        self.dict_list.append(insert_dict)
class AsyncDownloader:
    def __init__(self, concurrence=10, headers=None):
        self.queue = Queue(concurrence)
        self.headers = headers
        self.concurrence = concurrence

    async def push_task(self, urls):
        for idx, url in enumerate(urls):
            await self.queue.put((idx, url))
            print(f"pushed {idx}")

    async def worker(self):
        async with aiohttp.ClientSession() as session:
            while True:
                # print('worker start')
                idx, url = await self.queue.get()
                # print(f'{idx} start')
                rsp = await self.download(session=session, url=url, headers=self.headers)
                await self.save(rsp)
                self.queue.task_done()

    async def process(self, urls):
        tasks = asyncio.create_task(self.push_task(urls))
        workers = [
            asyncio.create_task(self.worker())
            for _ in range(self.concurrence)
        ]
        await tasks
        await self.queue.join()
        for worker in workers:
            worker.cancel()

    @staticmethod
    async def download(session, url, headers=None, method='GET'):
        async with session.request(method=method, url=url, headers=headers) as rsp:
            data = await rsp.text()
            return data

    async def save(self, data):
        pass
class Test:
    def __init__(self):
        self.que = Queue()
        self.pue = Queue()

    async def consumer(self):
        while True:
            try:
                print('consumer', await self.que.get())
            finally:
                try:
                    self.que.task_done()
                except ValueError:
                    if self.que.empty():
                        print("que empty")

    async def work(self):
        while True:
            try:
                value = await self.pue.get()
                print('producer', value)
                await self.que.put(value)
            finally:
                try:
                    self.pue.task_done()
                except ValueError:
                    if self.pue.empty():
                        print("pue empty")

    async def run(self):
        tasks = [asyncio.ensure_future(self.work()),
                 asyncio.ensure_future(self.consumer())]
        await asyncio.wait([self.pue.put(i) for i in range(10)])
        print('p queue join')
        await self.pue.join()
        print('p queue is done & q queue join')
        await self.que.join()
        print('q queue is done')
        asyncio.gather(*tasks).cancel()
async def update_products(queue_in: asyncio.Queue, queue_out: asyncio.Queue):
    session = FuturesSession()
    while True:
        product = await queue_in.get()
        if product is None:
            queue_in.task_done()
            await queue_out.put(None)
            break
        queue_in.task_done()
        url = product.url if product.update_url is None else product.update_url
        soup = await get_soup(
            url, session, product.shop.cookies
        )  # soup = await get_soup(url, session, shop.cookies)
        try:
            updated_product = await product.shop.product_parser(product, soup)
        except Exception as e:
            logger.exception("Exception occurred: %s", getattr(e, "__dict__", {}))
        await queue_out.put(product)
        print(f'updated {product.shop.name} {product.name}')
async def worker(name, in_q: asyncio.Queue, output: asyncio.Queue):
    connection = await asyncpg.connect(user='******', database='mag')
    print(f'worker {name} started')
    while True:
        paperid, authorids = await in_q.get()
        if paperid is None:
            in_q.task_done()
            break
        query = ', '.join(authorids)
        q = f'SELECT "AuthorId", "DisplayName" FROM authors WHERE "AuthorId" in ({query});'
        # print(f'{name}: {q}')
        authors = await connection.fetch(q)
        authors = dict([tuple(a) for a in authors])
        result = [authors.get(int(i), 'MISSING_DATA') for i in authorids]
        s = json.dumps({'PaperId': paperid, 'Author': {'set': result}}) + '\n'
        # await output.put(s)
        in_q.task_done()
    print(f'worker {name} stopping')
    await connection.close()
async def ffplay(queue: asyncio.Queue):
    """
    Play media asynchronously. Each task runs endlessly until .cancel()
    """
    assert isinstance(FFPLAY, str)
    while True:
        filein = await queue.get()
        cmd = [FFPLAY, "-loglevel", "warning", "-autoexit", str(filein)]
        proc = await asyncio.create_subprocess_exec(*cmd)
        ret = await proc.wait()
        if ret != 0:
            print(filein, "playback failure", file=sys.stderr)
        queue.task_done()
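# A small, hedged driver sketch for the player above, showing the usual
# join()/cancel() shutdown pattern. The FFPLAY lookup and the file list are
# assumptions made for illustration, not part of the original snippet.
import asyncio
import shutil
import sys

FFPLAY = shutil.which("ffplay")  # assumed: ffplay is available on PATH

async def main(files: list) -> None:
    queue: asyncio.Queue = asyncio.Queue()
    player = asyncio.create_task(ffplay(queue))

    for f in files:
        await queue.put(f)

    # join() unblocks once the player has called task_done() for every file.
    await queue.join()
    player.cancel()

if __name__ == "__main__":
    asyncio.run(main(sys.argv[1:]))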
async def worker(q: asyncio.Queue):
    global threads, uLock, tasks
    while True:
        xfn = await q.get()  # type: Thread
        print("x")
        uLock[f"{xfn.uid}"] = False
        if run.get(xfn.uid) is True:
            print("y")
            # Get a "work item" out of the queue.
            # Sleep for the "sleep_for" seconds.
            if xfn.stopped():
                xfn.start()
                threads.append(xfn)
            # Notify the queue that the "work item" has been processed.
            q.task_done()
            q.put_nowait(xfn)
        else:
            print(f"Running?\nproc syst:{xfn}\n{xfn.stopped()}")
async def upload_training(session: aiohttp.ClientSession,
                          training_data_queue: asyncio.Queue,
                          strava_access_token: str):
    while True:
        training_data = await training_data_queue.get()
        data = {'file': training_data, 'data_type': 'tcx'}
        for i in range(3):
            response = await post_training(session, data, strava_access_token)
            print('upload status:', response.status)
            print('upload resp:', await response.read())
            if response.status != 429:
                print("uploaded training")
                training_data_queue.task_done()
                break
            print("sleeping for 15 minutes")
            await asyncio.sleep(60 * 15 + 10)
        else:
            print('ERROR: not uploaded due to rate limits')
async def LogFile_toList(self, queue_df: asyncio.Queue):
    """
    :param self:
    :param queue_df:
    """
    # Version 2.0: the old code block below limited the performance of building
    # the Pandas DataFrame:
    #   for item in content_list:
    #       try:
    #           self.df_topsec = self.df_topsec.append(
    #               self.Analyze_TopSec(each_line=item), ignore_index=True)
    #       except (NameError, TypeError, RuntimeError, IndexError) as err:
    #           config.Logger.log_warning("Below Config is not Supported by this Program! Please Check")
    #           print(item)
    # New Version 2.1:
    while True:
        process_dict = await queue_df.get()
        queue_df.task_done()
        if queue_df.empty() and process_dict == "complete_process":
            return "LogFile_toList Function - complete"
        self.df_dict_list.append(process_dict)
async def collect_number(db: DB, tag_detail_url_queue: Queue, number_queue: Queue):
    """Extract collection numbers from tag detail pages, then store and enqueue
    the numbers that have not been recorded yet."""
    logger.info(f"Task {asyncio.current_task().get_name()} started")
    db_numbers = db.get_all_collection_numbers()
    while True:
        url = await tag_detail_url_queue.get()
        numbers = await extract_number_in_tag(url)
        new_numbers = set(numbers) - set(db_numbers)
        if not new_numbers:
            tag_detail_url_queue.task_done()
            continue
        for number in new_numbers:
            await number_queue.put(number)
        db.batch_add_collection_number(new_numbers)
        tag_detail_url_queue.task_done()
        logger.debug(f"Stored and enqueued {len(new_numbers)} new collection numbers")
class Channel:
    """Holds messages for an Event in Bus"""

    def __init__(self):
        """Creates a Channel which has its own queue of messages"""
        self._queue = Queue()

    def __aiter__(self):
        # __aiter__ must return the async iterator itself, not a coroutine
        # (returning an awaitable is rejected on modern Python).
        return self

    async def __anext__(self):
        data = await self._queue.get()
        self._queue.task_done()
        if data == 'STOP':
            raise StopAsyncIteration
        return data

    async def put(self, data):
        await self._queue.put(data)
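# For context, a minimal sketch of how a channel like this would typically be
# consumed with "async for". The producer coroutine and the 'STOP' sentinel
# wiring below are illustrative assumptions, not part of the original class.
import asyncio
from asyncio import Queue

async def main() -> None:
    channel = Channel()

    async def producer() -> None:
        for msg in ("hello", "world", "STOP"):
            await channel.put(msg)

    asyncio.create_task(producer())

    # Iteration ends when the 'STOP' sentinel raises StopAsyncIteration.
    async for message in channel:
        print("received:", message)

asyncio.run(main())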
async def poll_until_done(session: aiohttp.ClientSession, q1: asyncio.Queue, q2: asyncio.Queue):
    """Poll specific experiment until it's done or failed."""
    counter_max = int(os.getenv("UC1D_POLLING_RETRIES", "30"))
    freq = float(os.getenv("UC1D_POLLING_FREQUENCY", "0.1"))

    while True:
        # Retrieve first item from queue
        id, req_id, href = await q1.get()
        headers = {"X-Request-Id": req_id}

        # Poll status of simulation
        counter = 0
        href_result = None
        while counter < counter_max:
            if counter > (counter_max / 2):
                freq *= 2
            logger.log("REQUEST", f"GET {href}")
            async with session.get(href, headers=headers) as res:
                rep = await res.json()
            status = rep["status"]
            logger.trace(
                f"Polling status of simulation for individual '{id}': {status}"
            )
            if status == "DONE":
                href_result = rep["linkToResult"]
                break
            if status == "FAILED":
                logger.warning("Simulation failed")
                break
            counter += 1
            await asyncio.sleep(freq)

        # Enqueue link to result
        await q2.put((id, req_id, href_result))

        # Indicate that a formerly enqueued task is complete
        q1.task_done()
async def process(self, queue: Queue, futures: List[Future], batch: List[FigmentContext]):
    """Have the Figmentator process a batch"""
    try:
        results = await self.loop.run_in_executor(
            self.executor, self.figmentator.figmentate, batch)

        for future, result in zip(futures, results):
            # Set the result of the future
            future.set_result(result)

            # Need to notify the task queue for each item in the batch
            queue.task_done()
    except Exception as e:  # pylint:disable=broad-except
        logging.error("Caught exception: %s", str(e))
        self.ready.clear()

        for future in futures:
            # Set the exception on the future
            future.set_exception(e)

            # Need to notify the task queue for each item in the batch
            queue.task_done()
async def _subscription_queue_processor(self, queue: asyncio.Queue) -> None:
    try:
        while True:
            item = await queue.get()
            job_state = item["fields"]
            self._state[job_state["id"]] = job_state
            queue.task_done()
            job = self._get_job_no_fetch(job_state["id"])
            if (JobStatus.is_completed(job.status)
                    and job.id in self._job_wait_futures):
                self._job_wait_futures[job.id].set_result(job)
                del self._job_wait_futures[job.id]
    except asyncio.CancelledError:
        logger.debug(
            "core.get_jobs subscription work processing is getting canceled"
        )
        raise
    except Exception as exc:
        logger.exception("exception while processing core.get_jobs data",
                         exc_info=exc)
async def fetch_simulation_result(session: aiohttp.ClientSession, q: asyncio.Queue, q_repr_all: list):
    """Get the simulation result and parse it as dataframe."""
    while True:
        # Retrieve first item from queue
        id, req_id, href = await q.get()
        headers = {"X-Request-Id": req_id}

        # Get simulation result
        logger.info(f"Retrieving result of simulation for individual '{id}'")
        async with session.get(href, headers=headers) as res:
            logger.log("REQUEST", f"GET {href}")
            rep = await res.json()
            logger.trace(json.dumps(rep, indent=JSON_DUMPS_INDENT))

        # Enqueue for post-processing
        q_repr_all.append((id, rep))

        # Indicate that a formerly enqueued task is complete
        q.task_done()
async def handle_main_events(
    run_state: RunState,
    mqtt_send_q: asyncio.Queue,
    garage_events_q: asyncio.Queue,
    main_events_q: asyncio.Queue,
    poller_ticker_q: asyncio.Queue,
):
    handlers = {
        "GarageStateEvent": handle_main_event_garage,
        "MqttMsgEvent": handle_main_event_mqtt,
    }
    while True:
        main_event = await main_events_q.get()
        logger.debug(f"Handling {main_event.event}...")
        handler = handlers.get(main_event.event)
        if handler:
            await handler(main_event, run_state, mqtt_send_q,
                          garage_events_q, poller_ticker_q)
        else:
            logger.error(f"No handler found for {main_event.event}")
        main_events_q.task_done()
async def _download_worker(self, wk_name: str, queue: asyncio.Queue):
    downloaded_prices = []
    try:
        while True:
            day: date = await queue.get()
            tic = monotonic()
            prices = await self._download_pvpc_prices(day)
            took = monotonic() - tic
            queue.task_done()
            if not prices:
                self._logger.warning(
                    "[%s]: Bad download for day: %s in %.3f s",
                    wk_name, day, took
                )
                continue
            downloaded_prices.append((day, prices))
            self._logger.debug(
                "[%s]: Task done for day: %s in %.3f s",
                wk_name, day, took
            )
    except asyncio.CancelledError:
        return downloaded_prices
async def report_sightings(sub_endpoint: str, sightings_queue: asyncio.Queue):
    """
    Starts a ZeroMQ publisher on the given endpoint and publishes sightings
    from the sightings_queue to Threat Bus.
    @param sub_endpoint A host:port string to connect to via ZeroMQ
    @param sightings_queue The queue to receive sightings from
    """
    socket = zmq.Context().socket(zmq.PUB)
    socket.connect(f"tcp://{sub_endpoint}")
    topic = "stix2/sighting"
    logger.info(f"Forwarding sightings to Threat Bus at {sub_endpoint}/{topic}")
    while True:
        sighting = await sightings_queue.get()
        if type(sighting) is not Sighting:
            logger.warning(
                f"Ignoring unknown message type, expected Sighting: {type(sighting)}"
            )
            continue
        socket.send_string(f"{topic} {sighting.serialize()}")
        sightings_queue.task_done()
        logger.debug(f"Reported sighting: {sighting}")
async def stream_to_postgres(self, q: asyncio.Queue):
    try:
        conn = await asyncpg.connect(self.config.conn_uri)
    except Exception as e:  # noqa
        self._exception = e
        self.file_reader_task.cancel()
        return 0
    log.debug('[stream_to_postgres] Connected to %s', self.config.conn_uri)
    num_rows_written = 0
    try:
        await conn.execute(f'''
            CREATE TABLE IF NOT EXISTS {self.config.table_name} (
                {self.schema})''')
        eos = False
        while not eos:
            records = deque([await q.get()])
            while not q.empty():
                record = await q.get()
                records.append(record)
            if records[-1] is EOS:
                eos = True
                records.pop()
            if records:
                status = await conn.copy_records_to_table(
                    self.config.table_name, records=records)
                num_rows_written += parse_insert_status_string(status)
            q.task_done()
            log.debug('[stream_to_postgres] Wrote %d rows', num_rows_written)
    except KeyboardInterrupt:
        log.warning('[stream_to_postgres] User interrupt')
    except asyncio.CancelledError:
        log.warning('[stream_to_postgres] Task cancelled')
        raise
    except Exception as e:  # noqa
        log.error('[stream_to_postgres] Exception: %s', e)
        raise
    finally:
        await conn.close()
        print('[read_file] returning')
    return num_rows_written
class WebCrawler: ''' WebCrawler class, starts at a root domain of a given resource. It starts on the root page, finds all Initialize a new webcrawler instance. @param(basePath): The root of the domain to crawl ''' def __init__(self, basePath, max_tasks=25): # max concurrent tasks self.max_tasks = max_tasks # we have seen this url self.processed = set() # BasePath of url to start crawl, should be root of a domain self.basePath = basePath # event loop, we are not fallbacking to iocp (win32) or select or any sort of other event loop, we will only use asyncio provided event loop self.loop = asyncio.get_event_loop() # create our session, which encapsulates a connection pool self.session = aiohttp.ClientSession(loop=self.loop) # get Queue self.queue = Queue(loop=self.loop) # first url self.queue.put_nowait(self.basePath) # JSON for visualization self.data = [] ''' Check if this is static data ''' def _is_static_(self): # As far as static vs. dynamic, it's because it looks like the resource is cachable (making it "static"). # You need a pragma: no-cache and/or a cache-control: no-cache header for it to really be a dynamic asset. pass ''' Get all static assets on a page ''' def get_static(self, s, url): # hacky but works scripts = [ x['src'] for x in s.findAll('script') if x.has_attr('src') and (x["src"].startswith('/') and not x['src'][1] == '/')] styles = [ x['href'] for x in s.findAll('link') if x.has_attr('href') and x["href"].startswith('/') ] return scripts + styles ''' Cleanup on aiohttp ''' def close(self): try: # aiohttp keeps a TCP connection alive for 30secs, this explicitly closes it self.session.close() except: pass ''' Process is a coroutine which our tasks/workers/threads/coroutines/whatever will do their corresponding work. Each process will fetch their urls from the queue for processing. ''' async def process(self): try: while True: try: # suspend until we get a new url to work on url = await self.queue.get() # remove trailing slash if url[-1] == '/': url = url[:-1] # we have not seen this url, so we fetch it and add it if url not in self.processed: self.processed.add(url) # suspend execution until we get data from our HTTP request resp = await self.fetch(url) if resp != None: # add to sites self.data.append(resp) # go through each link and add them to the queue if we have not traversed them links = [x for x in resp['links'] if x.startswith('/') or x.startswith(url)] for link in links: # formatting if not link.startswith(self.basePath): link = self.basePath + link if '#' in link: link = link[:link.index('#')] # add it to our queue for processing if link not in self.processed: if link != '' and link != None: self.queue.put_nowait(link) # this task is done self.queue.task_done() except Exception as err: pass except asyncio.CancelledError: pass ''' Parsed a url for links and other stuff too ''' def parse(self, data, url): # parse a single url s = soup(data.decode('utf-8', 'ignore'), "html.parser") # get links links = [ x['href'] for x in s.findAll('a') if x.has_attr('href') ] # get assets assets = self.get_static(s, url) # get title title = s.find('title') if title != None: title = title.text else: title = '' return { 'url': url, 'title': title, 'links': links, 'assets': assets } ''' Put our JSONStatham in a file ''' def _save_file(self): # save data with open('sitemap.json', 'w') as sitemapfile: json.dump({ "sitemap": "Sitemap generated for URL {} on {}. 
{} pages parsed.".format(self.basePath, datetime.now(), len(self.processed)), "sites": self.data }, sitemapfile) ''' Start ze crawl ''' def crawl(self): try: # crawl until complete self.loop.run_until_complete(self.__crawl__()) except KeyboardInterrupt: sys.stderr.flush() finally: pass ''' Asynchronous crawl ''' async def __crawl__(self): print('Starting webcrawler on url {}'.format(self.basePath)) t1 = time.time() # make tasks that are processing the queue tasks = [asyncio.ensure_future(self.process(), loop=self.loop) for _ in range(self.max_tasks)] # aggregate tasks and squash exceptions asyncio.gather(*tasks, return_exceptions=True) # all queue items should call task_done for each put await self.queue.join() # cancel tasks for t in tasks: t.cancel() self.close() self.loop.stop() # save JSON file for viewing self._save_file() # print('{} pages processed in {} secs. Data saved in sitemap.json'.format(len(self.processed), time.time() - t1)) # leave exit(1) ''' HTTP request a page. ''' async def fetch(self, url): try: # alright, so i really should be handling redirects myself, but i'm not, because of reasons async with self.session.get(url, allow_redirects=False) as r: assert r.status == 200 # Get the page and parse it resp = self.parse(await r.read(), url) return resp except: self.queue.task_done()
class Crawler(object): """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__( self, roots, scraper=None, data_handler=None, exclude=None, strict=True, # What to crawl. max_redirect=5, max_tries=10, # Per-url limits. max_tasks=10, max_connections_per_host=3, *, loop=None ): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.max_connections_per_host = max_connections_per_host self.scraper = scraper self.data_handler = data_handler self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r"\A[\d\.]*\Z", host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_urls(root) self.t0 = time.time() self.t1 = None def record_statistic( self, url=None, next_url=None, status=None, exception=None, content_type=None, encoding=None, num_urls=0, num_new_urls=0, ): """Record the FetchStatistic for completed / failed URL.""" fetch_statistic = FetchStatistic( url=url, next_url=next_url, status=status, size=0, exception=exception, content_type=content_type, encoding=encoding, num_urls=num_urls, num_new_urls=num_new_urls, ) self.done.append(fetch_statistic) def extract_data(self, root_url, html): raise NotImplementedError("You need to define a extract_data method!") def close(self): """Close resources.""" LOGGER.debug("closing resources") self.session.close() @asyncio.coroutine def parse_links(self, web_page_html, base_url, _content_type, _encoding): """Return a list of links.""" links = set() tree = html.fromstring(web_page_html) tree.make_links_absolute(base_url) urls = [link[2] for link in tree.iterlinks()] for url in urls: defragmented, frag = urllib.parse.urldefrag(url) if verify.url_allowed( defragmented, self.root_domains, exclude=self.exclude ): # Select Valid links, testing against regexp and root_domains links.add(defragmented) if urls: LOGGER.info( "got %r urls from %r new links: %i visited: %i", len(urls), base_url, len(links - self.seen_urls), len(self.seen_urls), ) new_links = [link for link in links.difference(self.seen_urls)] self.record_statistic( url=base_url, content_type=_content_type, encoding=_encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls), ) return new_links def handle_redirect(self, response, url, max_redirect): location = response.headers["location"] next_url = urllib.parse.urljoin(url, location) self.record_statistic(url=url, next_url=next_url, status=response.status) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info("redirect to %r from %r max_redir: %i", next_url, url, max_redirect - 1) self.add_urls(next_url, max_redirect - 1) else: LOGGER.error("redirect limit reached for %r from %r", next_url, url) return @asyncio.coroutine def fetch(self, url, max_redirect, sem): """Fetch one URL.""" tries = 0 web_page = None exception = None _url = None _encoding = None _content_type = None sleep_time = 0 while tries < self.max_tries: try: with (yield from sem): response = yield from asyncio.wait_for( self.session.get(url, allow_redirects=False), 10, 
loop=self.loop ) if tries > 1: LOGGER.debug("try %r for %r success", tries, url) break except Exception as client_error: sleep_time += 5 yield from asyncio.sleep(sleep_time) LOGGER.error("try %r for %r raised %r", tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error("%r failed after %r tries", url, self.max_tries) self.record_statistic(url=url, exception=exception) return (web_page, _url, _content_type, _encoding) try: _url, _content_type, _encoding = get_content_type_and_encoding(response) if is_redirect(response): self.handle_redirect(response, url, max_redirect) web_page = "redirect" elif response.status == 200 and _content_type in ("text/html", "application/xml"): web_page = yield from response.text() else: self.record_statistic( url=response.url, status=response.status, content_type=_content_type, encoding=_encoding ) except Exception as e: print("*******error**********") finally: yield from response.release() return (web_page, _url, _content_type, _encoding) def add_urls(self, urls, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect if not isinstance(urls, str): urls = set(urls) for link in urls.difference(self.seen_urls): self.q.put_nowait((link, max_redirect)) self.seen_urls.update(urls) elif urls not in self.seen_urls: self.q.put_nowait((urls, max_redirect)) self.seen_urls.add(urls) @asyncio.coroutine def work(self, sem): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() # assert url in self.seen_urls web_page, url, content_type, encoding = yield from self.fetch(url, max_redirect, sem) if web_page and web_page != "redirect": new_links = yield from self.parse_links(web_page, url, content_type, encoding) if self.scraper: data = self.scraper.scrape(url, web_page) if self.data_handler: self.data_handler.handle(data) self.add_urls(new_links) self.q.task_done() except (asyncio.CancelledError,): print("error") @asyncio.coroutine def crawl(self): sem = asyncio.Semaphore(value=self.max_connections_per_host, loop=self.loop) """Run the crawler until all finished.""" LOGGER.info("Starting crawl...") workers = [asyncio.Task(self.work(sem), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() self.t1 = time.time() for w in workers: w.cancel()
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',text)) if urls: LOGGER.info('got %r distinct urls from %r',len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get(url, allow_redirects=False) #1 break #2 except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error else: return try: if is_redirect(response): location = response.headers['location'] else: #4 stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() #q.get() Remove and return an item from the queue. If queue is empty, wait until an item is available. #print('url',url, 'max_redirect', max_redirect) assert url in self.seen_urls #assert 断言,异常会直接抛出 yield from self.fetch(url, max_redirect) self.q.task_done() #Indicate that a formerly enqueued task is complete.表明以前排队的任务完成 except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) #put_nowait() Put an item into the queue without blocking.此句实际最先执行 @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() #Block until all items in the queue have been gotten and processed.保持阻塞状态,直到处理了队列中的所有项目为止 self.t1 = time.time() for w in workers: w.cancel()
class SubscribeListener(SubscribeCallback):
    def __init__(self):
        self.connected = False
        self.connected_event = Event()
        self.disconnected_event = Event()
        self.presence_queue = Queue()
        self.message_queue = Queue()
        self.error_queue = Queue()

    def status(self, pubnub, status):
        if utils.is_subscribed_event(status) and not self.connected_event.is_set():
            self.connected_event.set()
        elif utils.is_unsubscribed_event(status) and not self.disconnected_event.is_set():
            self.disconnected_event.set()
        elif status.is_error():
            self.error_queue.put_nowait(status.error_data.exception)

    def message(self, pubnub, message):
        self.message_queue.put_nowait(message)

    def presence(self, pubnub, presence):
        self.presence_queue.put_nowait(presence)

    @asyncio.coroutine
    def _wait_for(self, coro):
        scc_task = asyncio.ensure_future(coro)
        err_task = asyncio.ensure_future(self.error_queue.get())

        yield from asyncio.wait([scc_task, err_task],
                                return_when=asyncio.FIRST_COMPLETED)
        if err_task.done() and not scc_task.done():
            if not scc_task.cancelled():
                scc_task.cancel()
            raise err_task.result()
        else:
            if not err_task.cancelled():
                err_task.cancel()
            return scc_task.result()

    @asyncio.coroutine
    def wait_for_connect(self):
        if not self.connected_event.is_set():
            yield from self._wait_for(self.connected_event.wait())
        else:
            raise Exception("instance is already connected")

    @asyncio.coroutine
    def wait_for_disconnect(self):
        if not self.disconnected_event.is_set():
            yield from self._wait_for(self.disconnected_event.wait())
        else:
            raise Exception("instance is already disconnected")

    @asyncio.coroutine
    def wait_for_message_on(self, *channel_names):
        channel_names = list(channel_names)
        while True:
            try:
                env = yield from self._wait_for(self.message_queue.get())
                if env.channel in channel_names:
                    return env
                else:
                    continue
            finally:
                self.message_queue.task_done()

    @asyncio.coroutine
    def wait_for_presence_on(self, *channel_names):
        channel_names = list(channel_names)
        while True:
            try:
                env = yield from self._wait_for(self.presence_queue.get())
                if env.channel in channel_names:
                    return env
                else:
                    continue
            finally:
                self.presence_queue.task_done()
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = BloomFilter(10000000, 0.01) self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) async def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = await response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = await response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: LOGGER.info("response.url:%s,type:%s", response.url, type(response.url)) LOGGER.info("parse_links url:%s,type:%s", url, type(url)) normalized = urllib.parse.urljoin(str(response.url), url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links) - len(self.seen_urls)) return stat, links async def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = await self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = await self.parse_links(response) self.record_statistic(stat) for link in utils.difference(links, self.seen_urls): # for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) # self.seen_urls.update(links) self.seen_urls.update(links) finally: await response.release() async def work(self): """Process queue items forever.""" try: while True: url, max_redirect = await self.q.get() assert url in self.seen_urls LOGGER.info("url:%s", url) LOGGER.info("max_redirect:%s", max_redirect) await self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) async def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield self.q.join() self.t1 = time.time() for w in workers: w.cancel()
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the Queue.
        # Queue.put() is a coroutine, so use put_nowait() from synchronous code.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        '''Run the crawler until all work is done.'''
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        # Handle redirects ourselves.
        response = yield from self.session.get(
            url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have done this before.
                        return

                    # Remember we have seen this URL.
                    self.seen_urls.add(next_url)

                    # Follow the redirect. One less redirect remains.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set-logic:
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return connection to pool.
            yield from response.release()