class TorskelEventLogController(object):
    """Buffer log events in memory and write them to a database in batches."""

    def __init__(self):
        self.logger = tornado.log.gen_log
        self.queue = Queue()

    def add_log_event(self, event):
        """Queue one log event for a later batched write.

        Only ``dict`` events are accepted; any other value is ignored.

        :param event: the event to enqueue
        :return: None
        """
        if not isinstance(event, dict):
            return
        self.logger.debug(event)
        # The queue is created without a size limit, so the non-blocking
        # variant cannot fail and nothing awaitable is left un-awaited.
        self.queue.put_nowait(event)

    async def write_log_from_queue(self, db, collection_name,
                                   events_writer_func) -> None:
        """Drain up to ``options.task_list_size`` events and write them.

        :param db: database handle, passed through to ``events_writer_func``
        :param collection_name: target collection, passed through unchanged
        :param events_writer_func: awaitable callable invoked as
            ``events_writer_func(db, collection_name, events)``
        :return: None
        """
        qsize = self.queue.qsize()
        if options.show_log_event_writer:
            self.logger.info(f'Writing events... queue size = {qsize}')

        batch_size = min(qsize, options.task_list_size)
        if batch_size <= 0:
            return

        inserts_list = [await self.queue.get() for _ in range(batch_size)]
        # NOTE(review): if the writer raises, the dequeued events are not
        # put back on the queue.
        await events_writer_func(db, collection_name, inserts_list)
class SubscribeListener(SubscribeCallback):
    """Subscribe callback that exposes events and envelopes to coroutines.

    Status changes set the connect/disconnect events; messages and presence
    envelopes are pushed onto separate queues that the ``wait_for_*``
    coroutines consume.
    """

    def __init__(self):
        self.connected = False
        self.connected_event = Event()
        self.disconnected_event = Event()
        self.presence_queue = Queue()
        self.message_queue = Queue()

    def status(self, pubnub, status):
        """Set the connected/disconnected event matching ``status``."""
        if utils.is_subscribed_event(status) and not self.connected_event.is_set():
            self.connected_event.set()
        elif utils.is_unsubscribed_event(status) and not self.disconnected_event.is_set():
            self.disconnected_event.set()

    def message(self, pubnub, message):
        """Queue an incoming message envelope."""
        self.message_queue.put(message)

    def presence(self, pubnub, presence):
        """Queue an incoming presence envelope."""
        self.presence_queue.put(presence)

    @tornado.gen.coroutine
    def wait_for_connect(self):
        """Wait for the connected event.

        :raises Exception: if the connected event is already set
        """
        if self.connected_event.is_set():
            raise Exception("instance is already connected")
        yield self.connected_event.wait()

    @tornado.gen.coroutine
    def wait_for_disconnect(self):
        """Wait for the disconnected event.

        :raises Exception: if the disconnected event is already set
        """
        if self.disconnected_event.is_set():
            raise Exception("instance is already disconnected")
        yield self.disconnected_event.wait()

    @tornado.gen.coroutine
    def _wait_for_envelope_on(self, queue, channel_names):
        """Return the next envelope from ``queue`` on one of ``channel_names``.

        Envelopes for other channels are consumed and dropped.
        """
        channel_names = list(channel_names)
        while True:
            # Fetch outside the try block: task_done() must only be called
            # for an item that was actually retrieved.
            env = yield queue.get()
            try:
                if env.channel in channel_names:
                    raise tornado.gen.Return(env)
            finally:
                queue.task_done()

    @tornado.gen.coroutine
    def wait_for_message_on(self, *channel_names):
        """Return the next message envelope for any of ``channel_names``."""
        env = yield self._wait_for_envelope_on(self.message_queue, channel_names)
        raise tornado.gen.Return(env)

    @tornado.gen.coroutine
    def wait_for_presence_on(self, *channel_names):
        """Return the next presence envelope for any of ``channel_names``."""
        env = yield self._wait_for_envelope_on(self.presence_queue, channel_names)
        raise tornado.gen.Return(env)
def get_file_list(account, **kwargs):
    """Collect the account's files that match the valid file types.

    One search URL per entry of ``VALID_FILETYPES`` is queued and fetched
    with at most ``FETCH_CONCURRENCY`` requests in flight. This function is
    a generator that yields a Future, so it has to be driven as a Tornado
    coroutine.

    :param account: object providing ``get_request(url)`` and ``_id``
    :param kwargs: accepted for interface compatibility; unused here
    :return: list of ``{"title": ..., "value": ...}`` dicts sorted by title
    """
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    working = set()
    data = set()
    size_limit = RESPONSE_SIZE_LIMIT * 1000000

    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return

            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)

            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            app_log.info("Page {} downloaded".format(page_no))

            response_data = json.loads(response.body.decode('utf-8'))
            for entry in response_data:
                # Keep only valid file types below the maximum size limit.
                path = entry['path']
                extension = path.lower().split('.')[-1]
                if extension in VALID_FILETYPES and int(entry['bytes']) < size_limit:
                    data.add((path.lstrip('/'), path))
            app_log.info("Page {} completed".format(page_no))
        finally:
            # Always mark the item done and free the slot, even on errors,
            # so queue.join() and the worker loop are never starved.
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        # Starts a new fetch whenever a semaphore slot is free.
        while True:
            yield sem.acquire()
            fetch_url()

    app_log.info("Gathering filelist for account {}".format(account._id))
    for file_type in VALID_FILETYPES:
        # NOTE(review): the query is the bare extension without a leading
        # dot - confirm that this is what the search endpoint expects.
        url = "https://api.dropbox.com/1/search/auto/?query={}&include_membership=true".format(
            file_type)
        queue.put(url)

    # Start the concurrency worker.
    worker()

    # Wait until every queued URL has been processed (or the timeout hits).
    yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    app_log.info("Finished list retrieval. Found {} items.".format(len(data)))
    return sorted(
        [{"title": title, "value": path} for title, path in data],
        key=lambda f: f['title'])
class Publisher(MQAsyncSub):
    """Handles new data to be passed on to subscribers."""

    def __init__(self):
        self.WSmessages = Queue()
        self.MQmessages = Queue()
        # __init__ returns None; the attribute is kept because it existed
        # before and other code may read it.
        self.sub = MQAsyncSub.__init__(self, zmq.Context(), 'admin', [])
        self.subscribers = set()

    def register(self, subscriber):
        """Register a new subscriber."""
        self.subscribers.add(subscriber)

    def deregister(self, subscriber):
        """Stop publishing to a subscriber.

        Removing a subscriber that is not registered is a no-op, so a
        subscriber may safely be closed more than once.
        """
        self.subscribers.discard(subscriber)

    @gen.coroutine
    def on_message(self, did, msg):
        """Receive message from MQ sub and send to WS."""
        yield self.WSmessages.put({"msgid": did, "content": msg})

    @gen.coroutine
    def submit(self, message):
        """Submit a new message to publish to subscribers."""
        yield self.WSmessages.put(message)

    @gen.coroutine
    def publishToWS(self):
        """Forward every queued WS message to all registered subscribers."""
        while True:
            message = yield self.WSmessages.get()
            if self.subscribers:
                print("Pushing MQ message {} to {} WS subscribers...".format(
                    message, len(self.subscribers)))
                yield [subscriber.submit(message)
                       for subscriber in self.subscribers]

    @gen.coroutine
    def publishToMQ(self):
        """Forward queued JSON messages to the MQ as requests or events.

        A payload with ``mq_request`` and ``data`` is sent as a request to
        ``dst`` (default ``'manager'``); one with ``mq_publish`` and ``data``
        is published as an event. Anything else is ignored.
        """
        ctx = zmq.Context()
        cli = MQSyncReq(ctx)
        pub = MQPub(ctx, 'admin')
        while True:
            message = yield self.MQmessages.get()
            try:
                jsons = json.loads(message)
            except ValueError as err:
                # One malformed payload must not stop the publishing loop.
                print("Ignoring invalid JSON message: {0}".format(err))
                continue
            if not isinstance(jsons, dict):
                print("Ignoring non-object JSON message: {0}".format(jsons))
                continue

            # req/rep
            if 'mq_request' in jsons and 'data' in jsons:
                msg = MQMessage()
                msg.set_action(str(jsons['mq_request']))
                msg.set_data(jsons['data'])
                print("REQ : {0}".format(msg.get()))
                dst = str(jsons['dst']) if 'dst' in jsons else 'manager'
                # Single-argument call form prints the same on Python 2 and 3.
                print(cli.request(dst, msg.get(), timeout=10).get())
            # pub
            elif 'mq_publish' in jsons and 'data' in jsons:
                print("Publish : {0}".format(jsons['data']))
                pub.send_event(jsons['mq_publish'], jsons['data'])
class Decode(object):
    """Queue-driven decoder: reads input batches from ``q``, writes to ``p``."""

    def __init__(self, sess_field):
        self.q = Queue(maxsize=1000)
        self.p = Queue(maxsize=1000)
        self.sess_field = sess_field

    @staticmethod
    def batch_pad(nd):
        """Right-pad every sequence in ``nd`` with PAD_ID to the longest one."""
        max_length = max(map(len, nd))
        return [
            seq + [text_encoder.PAD_ID] * (max_length - len(seq)) for seq in nd
        ]

    @staticmethod
    def _strip_special_tokens(text, tokens=("<pad>", "<EOS>")):
        """Remove whole leading/trailing occurrences of ``tokens`` from ``text``.

        ``str.strip("<pad>")`` treats its argument as a set of characters and
        would also eat ordinary letters such as a trailing ``d`` or ``S``.
        """
        changed = True
        while changed:
            changed = False
            for token in tokens:
                if text.startswith(token):
                    text = text[len(token):]
                    changed = True
                if text.endswith(token):
                    text = text[:-len(token)]
                    changed = True
        return text

    @gen.coroutine
    def decode(self):
        """Restore the model, then decode batches from ``q`` forever.

        Each item taken from ``self.q`` is an iterable of input strings; the
        list of decoded strings is put on ``self.p``.
        """
        log.info("[biz] Decode: model loading ... ")
        saver = tf.train.Saver()
        with tf.Session(config=self.sess_field.sess_config) as sess:
            # Load weights from checkpoint.
            log.info("[biz] Decode: restoring parameters")
            saver.restore(sess, self.sess_field.ckpt)
            log.info("[biz] Decode: model already loaded")

            batch_size = self.sess_field.batch_size
            while True:
                inputs = yield self.q.get()
                log.info("[biz] Decode: " + str(inputs))
                st_time = time.time()
                inputs_numpy = [
                    self.sess_field.encoders["inputs"].encode(i) +
                    [text_encoder.EOS_ID] for i in inputs
                ]
                num_decode_batches = (len(inputs_numpy) - 1) // batch_size + 1
                results = []
                for i in range(num_decode_batches):
                    input_numpy = inputs_numpy[i * batch_size:(i + 1) * batch_size]
                    # Fill a short final batch with EOS-only rows, then pad
                    # every row to the same length.
                    filler = [[text_encoder.EOS_ID]] * (batch_size - len(input_numpy))
                    inputs_numpy_batch = self.batch_pad(input_numpy + filler)
                    feed = {self.sess_field.inputs_ph: inputs_numpy_batch}
                    result = sess.run(self.sess_field.prediction, feed)
                    # Only the first len(input_numpy) rows are real inputs.
                    results += [
                        self._strip_special_tokens(
                            self.sess_field.encoders["targets"].decode(row))
                        for row in result["outputs"][:len(input_numpy)]
                    ]
                # The result queue is bounded: wait for room instead of
                # dropping the Future returned by put().
                yield self.p.put(results)
                log.info("[biz] Decode: source: " + str(inputs))
                log.info("[biz] Decode: target: " + str(results))
                log.info("[biz] Decode: using %s s" % (time.time() - st_time))
def queueStream(self, rdds, oneAtATime=True, default=None):
    """Create stream iterable over RDDs.

    :param rdds: Iterable over RDDs or lists.
    :param oneAtATime: Process one at a time or all.
    :param default: If no more RDDs in ``rdds``, return this. Can be None.
    :rtype: DStream
    :raises ImportError: if tornado (which provides ``Queue``) is missing.

    Example:

    >>> import fast_pyspark_tester
    >>> sc = fast_pyspark_tester.Context()
    >>> ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)
    >>> (
    ...     ssc
    ...     .queueStream([[4], [2], [7]])
    ...     .foreachRDD(lambda rdd: print(rdd.collect()))
    ... )
    >>> ssc.start()
    >>> ssc.awaitTermination(0.35)
    [4]
    [2]
    [7]

    Example testing the default value:

    >>> import fast_pyspark_tester
    >>> sc = fast_pyspark_tester.Context()
    >>> ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)
    >>> (
    ...     ssc
    ...     .queueStream([[4], [2]], default=['placeholder'])
    ...     .foreachRDD(lambda rdd: print(rdd.collect()))
    ... )
    >>> ssc.start()
    >>> ssc.awaitTermination(0.35)
    [4]
    [2]
    ['placeholder']
    """
    deserializer = QueueStreamDeserializer(self._context)
    if default is not None:
        default = deserializer(default)

    if Queue is False:
        # Fail with a clear message instead of falling through to
        # ``Queue()``, which would raise "'bool' object is not callable".
        message = 'Run "pip install tornado" to install tornado.'
        log.error(message)
        raise ImportError(message)

    q = Queue()
    for i in rdds:
        q.put(i)

    qstream = QueueStream(q, oneAtATime, default)
    return DStream(qstream, self, deserializer)
def get_file_list(account, **kwargs):
    """Collect the account's files that match the valid file types.

    One search URL per entry of ``VALID_FILETYPES`` is queued and fetched
    with at most ``FETCH_CONCURRENCY`` requests in flight. This function is
    a generator that yields a Future, so it has to be driven as a Tornado
    coroutine.

    :param account: object providing ``get_request(url)`` and ``_id``
    :param kwargs: accepted for interface compatibility; unused here
    :return: list of ``{"title": ..., "value": ...}`` dicts sorted by title
    """
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    working = set()
    data = set()
    size_limit = RESPONSE_SIZE_LIMIT * 1000000

    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return

            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)

            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            app_log.info("Page {} downloaded".format(page_no))

            response_data = json.loads(response.body.decode('utf-8'))
            for entry in response_data:
                # Keep only valid file types below the maximum size limit.
                path = entry['path']
                extension = path.lower().split('.')[-1]
                if extension in VALID_FILETYPES and int(entry['bytes']) < size_limit:
                    data.add((path.lstrip('/'), path))
            app_log.info("Page {} completed".format(page_no))
        finally:
            # Always mark the item done and free the slot, even on errors,
            # so queue.join() and the worker loop are never starved.
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        # Starts a new fetch whenever a semaphore slot is free.
        while True:
            yield sem.acquire()
            fetch_url()

    app_log.info("Gathering filelist for account {}".format(account._id))
    for file_type in VALID_FILETYPES:
        # NOTE(review): the query is the bare extension without a leading
        # dot - confirm that this is what the search endpoint expects.
        url = "https://api.dropbox.com/1/search/auto/?query={}&include_membership=true".format(
            file_type)
        queue.put(url)

    # Start the concurrency worker.
    worker()

    # Wait until every queued URL has been processed (or the timeout hits).
    yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    app_log.info("Finished list retrieval. Found {} items.".format(len(data)))
    return sorted(
        [{"title": title, "value": path} for title, path in data],
        key=lambda f: f['title'])
class delay(Stream):
    """ Emit upstream results spaced at least ``interval`` apart """
    _graphviz_shape = 'octagon'

    def __init__(self, upstream, interval, loop=None, **kwargs):
        # Fall back to the upstream's loop, then to the current IOLoop.
        if not loop:
            loop = upstream.loop or IOLoop.current()
        self.queue = Queue()
        self.interval = interval

        Stream.__init__(self, upstream, loop=loop, **kwargs)

        self.loop.add_callback(self.cb)

    def update(self, x, who=None):
        """Buffer ``x`` for the background emitter; returns the put Future."""
        return self.queue.put(x)

    @gen.coroutine
    def cb(self):
        """Take queued items one at a time, emit them, then sleep off the
        part of ``interval`` not already spent waiting and emitting."""
        while True:
            started = time()
            item = yield self.queue.get()
            yield self._emit(item)
            remaining = self.interval - (time() - started)
            if remaining > 0:
                yield gen.sleep(remaining)
class gather(Stream):
    """Batch queued items, gather them through the client, emit the results."""

    def __init__(self, child, limit=10, client=None):
        self.client = client or default_client()
        self.condition = Condition()
        self.queue = Queue(maxsize=limit)

        Stream.__init__(self, child)

        self.client.loop.add_callback(self.cb)

    def update(self, x, who=None):
        """Enqueue ``x``; returns the Future of the (bounded) queue put."""
        return self.queue.put(x)

    @gen.coroutine
    def cb(self):
        """Wait for one item, grab everything else already queued, gather
        the batch and emit each result; notify flush waiters when the queue
        is empty afterwards."""
        while True:
            first = yield self.queue.get()
            batch = [first]
            while not self.queue.empty():
                batch.append(self.queue.get_nowait())

            gathered = yield self.client._gather(batch)
            for result in gathered:
                yield self.emit(result)

            if self.queue.empty():
                self.condition.notify_all()

    @gen.coroutine
    def flush(self):
        """Wait on the condition until the queue is observed empty."""
        while not self.queue.empty():
            yield self.condition.wait()
class BaseHandler(RequestHandler):
    """Common base class for subscriber handlers.

    Handlers that work with the data stores from :mod:`tornadose.stores`
    should derive from this class and provide :meth:`publish`.
    """

    def initialize(self, store):
        """Create the message queue and register this handler with ``store``.

        Subclasses needing extra setup must either call this method via
        ``super`` or set the ``store`` attribute and register with the
        store themselves.
        """
        assert isinstance(store, stores.BaseStore)
        self.store = store
        self.messages = Queue()
        store.register(self)

    @gen.coroutine
    def submit(self, message):
        """Queue ``message`` for publication to this subscriber."""
        yield self.messages.put(message)

    def publish(self):
        """Push a message to the subscriber.

        Abstract here; child classes have to override it.
        """
        raise NotImplementedError('publish must be implemented!')
class buffer(Stream):
    """ Hold up to ``n`` results in a queue at this point of the stream

    Letting results pile up at chosen points in the stream can smooth the
    flow through the system when backpressure is applied.
    """
    _graphviz_shape = 'diamond'

    def __init__(self, upstream, n, loop=None, **kwargs):
        # Fall back to the upstream's loop, then to the current IOLoop.
        if not loop:
            loop = upstream.loop or IOLoop.current()
        self.queue = Queue(maxsize=n)

        Stream.__init__(self, upstream, loop=loop, **kwargs)

        self.loop.add_callback(self.cb)

    def update(self, x, who=None):
        """Enqueue ``x``; returns the Future of the bounded queue put."""
        return self.queue.put(x)

    @gen.coroutine
    def cb(self):
        """Forward queued items downstream, one at a time, forever."""
        while True:
            item = yield self.queue.get()
            yield self._emit(item)
class Publisher(object):
    """Handles new data to be passed on to subscribers."""

    def __init__(self):
        self.messages = Queue()
        self.subscribers = set()

    def register(self, subscriber):
        """Register a new subscriber."""
        self.subscribers.add(subscriber)

    def deregister(self, subscriber):
        """Stop publishing to a subscriber.

        Removing a subscriber that is not registered is a no-op, so a
        subscriber may safely be closed more than once.
        """
        self.subscribers.discard(subscriber)

    @gen.coroutine
    def submit(self, message):
        """Submit a new message to publish to subscribers."""
        yield self.messages.put(message)

    @gen.coroutine
    def publish(self):
        """Forward every queued message to all current subscribers, forever.

        Messages that arrive while nobody is subscribed are dropped.
        """
        while True:
            message = yield self.messages.get()
            if self.subscribers:
                # The list is built before yielding, so the set may change
                # while the submissions are in flight.
                yield [
                    subscriber.submit(message)
                    for subscriber in self.subscribers
                ]
def main():
    """Seed the crawl queue, start ten workers and wait for the queue to drain.

    Generator that yields the ``queue.join`` Future (timeout: 300 seconds).
    """
    worker_count = 10

    queue = Queue()
    queue.put("http://www.jianshu.com")

    # Build every worker first, then start them in a second pass.
    workers = [Worker(app, queue) for _ in range(worker_count)]
    for current in workers:
        Log4Spider.debugLog("worker begin:", current)
        current.run()

    Log4Spider.debugLog("waitiing for spiderQueue empty:")
    yield queue.join(timeout=timedelta(seconds=300))
    Log4Spider.debugLog("main done!")
class TestTCPServer(TCPServer):
    """TCP server for tests that records every accepted stream."""

    def __init__(self, family):
        super(TestTCPServer, self).__init__()
        self.streams = []
        self.queue = Queue()

        # Bind on localhost and remember the port of the first socket.
        listeners = bind_sockets(None, 'localhost', family)
        self.add_sockets(listeners)
        first_listener = listeners[0]
        self.port = first_listener.getsockname()[1]

    def handle_stream(self, stream, address):
        """Keep the stream for cleanup and hand it to queue consumers."""
        self.streams.append(stream)
        self.queue.put(stream)

    def stop(self):
        """Stop the server, then close every stream accepted so far."""
        super(TestTCPServer, self).stop()
        for accepted in self.streams:
            accepted.close()
def main():
    """Seed the crawl queue, start ten workers and wait for the queue to drain.

    Generator that yields the ``queue.join`` Future (timeout: 300 seconds).
    """
    worker_count = 10

    queue = Queue()
    queue.put("http://www.jianshu.com")

    # Build every worker first, then start them in a second pass.
    workers = [Worker(app, queue) for _ in range(worker_count)]
    for current in workers:
        Log4Spider.debugLog("worker begin:", current)
        current.run()

    Log4Spider.debugLog("waitiing for spiderQueue empty:")
    yield queue.join(timeout=timedelta(seconds=300))
    Log4Spider.debugLog("main done!")
class CommandQueue():
    """Queue of ``(command, view)`` pairs written out one after another."""

    def __init__(self):
        self.queue = Queue()

    def put(self, item):
        """Enqueue an item; the consumer unpacks it as ``(command, view)``."""
        self.queue.put(item)

    @gen.coroutine
    def process_command(self):
        """Consume items forever: pause 0.1s, then send the command.

        The command is written to the view as ``{command[0]: command[1]}``.
        """
        while True:
            entry = yield self.queue.get()
            try:
                yield gen.sleep(0.1)
                command, view = entry
                payload = {command[0]: command[1]}
                view.write_message(payload)
            finally:
                # Mark the item done even when sending failed.
                self.queue.task_done()
class TestTCPServer(TCPServer):
    """TCP server for tests that records every accepted stream."""

    def __init__(self, family):
        super(TestTCPServer, self).__init__()
        self.streams = []  # type: List[IOStream]
        self.queue = Queue()  # type: Queue[IOStream]
        # Bind to localhost on an OS-assigned port (port 0). A fixed
        # interface address only works on hosts that have that address.
        sockets = bind_sockets(0, "localhost", family)
        self.add_sockets(sockets)
        self.port = sockets[0].getsockname()[1]

    def handle_stream(self, stream, address):
        """Keep the stream for cleanup and hand it to queue consumers."""
        self.streams.append(stream)
        self.queue.put(stream)

    def stop(self):
        """Stop the server, then close every stream accepted so far."""
        super(TestTCPServer, self).stop()
        for stream in self.streams:
            stream.close()
class ConnectionPool(object):
    """Pool of connections backed by a bounded queue of idle connections."""

    def __init__(self, servers, maxsize=15, minsize=1, loop=None, debug=0):
        if loop is None:
            loop = tornado.ioloop.IOLoop.instance()
        if debug:
            logging.basicConfig(
                level=logging.DEBUG,
                format="'%(levelname)s %(asctime)s"
                " %(module)s:%(lineno)d %(process)d %(thread)d %(message)s'")
        self._loop = loop
        self._servers = servers
        self._minsize = minsize
        self._debug = debug
        self._pool = Queue(maxsize)
        self._in_use = set()

    def size(self):
        """Return the number of idle plus in-use connections."""
        return self._pool.qsize() + len(self._in_use)

    @gen.coroutine
    def _create_new_conn(self):
        """Open a new connection to the configured servers."""
        fresh = yield Connection.get_conn(self._servers, self._debug)
        raise gen.Return(fresh)

    @gen.coroutine
    def clear(self):
        """Close the socket of every idle connection in the pool."""
        while not self._pool.empty():
            idle = yield self._pool.get()
            idle.close_socket()

    @gen.coroutine
    def acquire(self):
        """Return an idle connection, creating one when the pool is empty.

        Before handing one out, the pool is filled up to ``minsize``.

        :return: a connection, also recorded as in use
        """
        # Top the pool up to the configured minimum first.
        while self.size() < self._minsize:
            spare = yield self._create_new_conn()
            yield self._pool.put(spare)

        conn = None
        while not conn:
            if not self._pool.empty():
                conn = yield self._pool.get()
            if conn is None:
                conn = yield self._create_new_conn()

        self._in_use.add(conn)
        raise gen.Return(conn)

    def release(self, conn):
        """Return ``conn`` to the pool; close it if the pool rejects it."""
        self._in_use.remove(conn)
        try:
            self._pool.put_nowait(conn)
        except (QueueEmpty, QueueFull):
            conn.close_socket()
class TestChannelConfiguration:
    """Publish/consume round trip through two ``ChannelConfiguration``s."""

    MESSAGE = "test_message"

    def callback(self, channel, method, properties, body):
        """Consumer callback: decode the body and queue it for the test."""
        body = body.decode()
        print(f"consumed: {body}")
        self._message_queue.put(body)

    @gen.coroutine
    def _top(self):
        """Return the queued message, leaving a copy of it on the queue."""
        head = yield self._message_queue.get()
        self._message_queue.put(head)
        return head

    def test_channel_configuration(self, rabbitmq_url, configuration):
        self._message_queue = Queue(maxsize=1)
        loop = IOLoop.current()
        self.io_loop = loop

        connection = AsyncConnection(
            rabbitmq_url, loop, logging.getLogger(__name__))
        publisher = ChannelConfiguration(
            connection, connection.logger, loop, **configuration["publish"])
        receiver = ChannelConfiguration(
            connection, connection.logger, loop, **configuration["receive"])

        # The channel can be created and fetched from the async queue.
        first_channel = loop.run_sync(publisher._get_channel)
        assert first_channel.is_open

        # Publishing must keep using that very same channel.
        loop.run_sync(functools.partial(publisher.publish, self.MESSAGE))
        assert loop.run_sync(publisher._get_channel) == first_channel

        # Consume in the background and wait (up to 10s) for the message.
        loop.spawn_callback(receiver.consume, self.callback)
        received = loop.run_sync(self._top, 10)

        # Stopping the loop ends the consumption.
        loop.stop()
        assert received == self.MESSAGE
class TornadoQuerierBase(object):
    """Base class that runs generated tasks through a pool of coroutines.

    Subclasses implement :meth:`gen_task` (yieldable, producing an iterable
    of tasks) and :meth:`run_task` (yieldable, processing one task).
    """

    def __init__(self):
        self.tasks = TornadoQueue()
        # Defined up front so the attribute exists before prepare() runs.
        self.running = False

    def gen_task(self):
        """Produce the tasks to run; must be overridden."""
        raise NotImplementedError()

    def run_task(self, task):
        """Process a single task; must be overridden."""
        raise NotImplementedError()

    def prepare(self):
        """Hook called at the beginning of :meth:`start`."""
        self.running = True

    def cleanup(self):
        """Hook called at the end of :meth:`start`."""
        self.running = False

    @coroutine
    def run_worker(self, worker_id, f):
        """Consume tasks until the queue is empty.

        :param worker_id: number used in log messages only
        :param f: yieldable callable invoked with each task; its exceptions
            are logged as warnings and do not stop the worker
        """
        while self.tasks.qsize() > 0:
            task = yield self.tasks.get()
            LOG.debug('worker[%d]: current task is %s', worker_id, task)
            try:
                yield f(task)
            except Exception as e:
                LOG.warning(str(e))
            finally:
                self.tasks.task_done()
        LOG.debug('worker[%d]: all tasks done %s', worker_id, self.tasks)

    @coroutine
    def start(self, num_workers=1):
        """Generate the tasks, run them on ``num_workers`` workers and wait.

        :param num_workers: number of concurrent worker coroutines
        """
        self.prepare()
        try:
            # add tasks
            tasks = yield self.gen_task()
            for task in tasks:
                yield self.tasks.put(task)

            # start workers; they run concurrently with the join below
            for worker_id in range(num_workers):
                LOG.debug('starting worker %d', worker_id)
                self.run_worker(worker_id, self.run_task)

            yield self.tasks.join()
        finally:
            # Reset the running flag even when task generation fails.
            self.cleanup()
class buffer(Stream):
    """Hold up to ``n`` results in a queue before passing them downstream."""

    def __init__(self, n, child, loop=None):
        self.queue = Queue(maxsize=n)

        Stream.__init__(self, child, loop=loop)

        self.loop.add_callback(self.cb)

    @gen.coroutine
    def cb(self):
        """Forward queued items downstream, one at a time, forever."""
        while True:
            item = yield self.queue.get()
            yield self.emit(item)

    def update(self, x, who=None):
        """Enqueue ``x``; returns the Future of the bounded queue put."""
        return self.queue.put(x)
class MessageRouter(object):
    """Dispatch queued messages to handlers registered per message type."""

    def __init__(self, message_sender, default_handler=None):
        self.message_sender = message_sender
        self.default_handler = default_handler
        self._queue = Queue()
        self._message_handlers = {}
        self._working = False

    def register_message_handler(self, message, handler):
        """Map the message class ``message`` (by name) to ``handler``."""
        assert isinstance(message, MessageMeta)
        assert hasattr(handler, '__call__')
        self._message_handlers[message.__name__] = handler

    @gen.coroutine
    def put_message(self, message):
        """Queue ``message`` for routing."""
        assert isinstance(message, Message)
        yield self._queue.put(message)

    def stop(self):
        """Clear the working flag that the routing loop checks."""
        self._working = False

    @gen.coroutine
    def start(self):
        """Route queued messages until the working flag is cleared.

        A handler exception is turned into an ``Error`` response and sent
        through ``message_sender``; the loop then continues.
        """
        self._working = True
        while self._working:
            incoming = yield self._queue.get()
            try:
                # TODO: Maybe we need to add special handling for BarrierRequest
                route = self._message_handlers.get(
                    incoming.type, self.default_handler)
                if route:
                    yield route(incoming)
            except Exception:
                error_args = errors.exception_to_error_args(*sys.exc_info())
                error_type, error_subtype, error_message, extended_message = error_args
                reply = Error.from_request(
                    incoming,
                    error_type=error_type,
                    error_subtype=error_subtype,
                    message=error_message,
                    extended_message=extended_message)
                yield self.message_sender.send_message_ignore_response(reply)
            finally:
                self._queue.task_done()
class StreamClient(object):
    """Consumer of a stream with its own bounded item queue."""

    MAX_SIZE = 60

    def __init__(self, steam_id):
        self.id = generate_id()
        self.stream_id = steam_id
        self.queue = Queue(StreamClient.MAX_SIZE)

    def empty(self):
        """Return True when no items are queued."""
        return not self.queue.qsize()

    @coroutine
    def send(self, item):
        """Queue ``item``, waiting while the queue is full."""
        yield self.queue.put(item)

    @coroutine
    def fetch(self):
        """Take the next queued item, mark it done and return it."""
        taken = yield self.queue.get()
        self.queue.task_done()
        return taken
class UploadHandler(BaseHandler):
    """Handler that queues request body chunks and uploads them to S3."""

    executor = ThreadPoolExecutor(max_workers=settings.THREAD_WORKERS)

    def prepare(self, *args, **kwargs):
        """Create the chunk queue and read the declared content length."""
        # Queue to take chunks of data as they are received.
        self.queue = Queue()

        # Raise the body size limit for uploads.
        if self.request.method.lower() == "post":
            self.request.connection.set_max_body_size(
                settings.MAX_STREAMED_SIZE)

        try:
            self.content_length = int(
                self.request.headers.get("Content-Length", "0"))
        except (KeyError, ValueError):
            # headers.get() supplies the default for a missing header;
            # int() raises ValueError for a malformed one.
            self.content_length = 0

        super(UploadHandler, self).prepare(*args, **kwargs)

    @tornado.gen.coroutine
    def data_received(self, chunk):
        """Put each received chunk on the queue."""
        yield self.queue.put(chunk)

    @run_on_executor(executor='executor')
    def background_task(self, obj):
        """Run ``obj.upload_to_s3`` with the chunk queue on the thread pool."""
        # NOTE(review): the queue is handed to another thread - confirm
        # that upload_to_s3 only uses it in a thread-safe way.
        return obj.upload_to_s3(self.queue)

    @tornado.gen.coroutine
    def post(self):
        """Write the entry, reply with its token, then run the upload."""
        obj = UploadFile(body=self.request.body,
                         content_type=self.request.headers.get("Content-Type"),
                         content_length=self.content_length)
        ret = obj.write_entry()
        self.write_json(data={"token": ret})
        yield self.background_task(obj)
class Subscription(WebSocketHandler):
    """Websocket for subscribers."""

    def initialize(self, publisher):
        self.publisher = publisher
        self.messages = Queue()
        self.finished = False

    def open(self):
        """Register with the publisher and start the sender loop."""
        print("New subscriber.")
        self.publisher.register(self)
        self.run()

    def on_close(self):
        self._close()

    def _close(self):
        """Deregister from the publisher; safe to call more than once.

        Both ``on_close`` and a failed ``send`` end up here, so the second
        call must not deregister again.
        """
        if self.finished:
            return
        self.finished = True
        print("Subscriber left.")
        self.publisher.deregister(self)

    @gen.coroutine
    def submit(self, message):
        """Queue a message to be sent over the websocket."""
        yield self.messages.put(message)

    @gen.coroutine
    def run(self):
        """Empty the queue of messages to send to the WS."""
        while not self.finished:
            message = yield self.messages.get()
            if self.finished:
                # Closed while waiting: do not write to a dead socket.
                break
            self.send(message)

    def send(self, message):
        """Write ``message`` to the websocket; close on a closed socket."""
        try:
            self.write_message(message)
        except WebSocketClosedError:
            self._close()

    def on_message(self, content):
        """Receive a message from the websocket and queue it for the MQ."""
        self.publisher.MQmessages.put(content)
class delay(Stream):
    """Emit upstream results spaced at least ``interval`` apart."""

    def __init__(self, interval, child, loop=None):
        self.queue = Queue()
        self.interval = interval

        Stream.__init__(self, child, loop=loop)

        self.loop.add_callback(self.cb)

    def update(self, x, who=None):
        """Buffer ``x`` for the background emitter; returns the put Future."""
        return self.queue.put(x)

    @gen.coroutine
    def cb(self):
        """Take queued items one at a time, emit them, then sleep off the
        part of ``interval`` not already spent waiting and emitting."""
        while True:
            started = time()
            item = yield self.queue.get()
            yield self.emit(item)
            remaining = self.interval - (time() - started)
            if remaining > 0:
                yield gen.sleep(remaining)
class PopularCategories:
    """Running per-label scores fed from a queue of prediction arrays."""

    def __init__(self):
        self.categories = {}
        self.update_queue = Queue()

    @gen.coroutine
    def add_for_processing(self, predictions):
        """Queue one set of predictions for the next processing pass."""
        yield self.update_queue.put(predictions)

    @gen.coroutine
    def process_queue(self):
        """Fold the currently queued predictions in and publish the top 5.

        Does nothing when the queue is empty.
        """
        pending = self.update_queue.qsize()
        if pending > 0:
            for _ in range(pending):
                queued = yield self.update_queue.get()
                try:
                    self._update_categories(queued)
                finally:
                    self.update_queue.task_done()

            # update top 5
            ranked = sorted(self.categories.items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            payload = [to_json_result(label, score)
                       for label, score in ranked[:5]]
            yield update_top_5(payload)

    def _update_categories(self, new_predictions):
        """Merge one prediction array into ``self.categories``.

        A label seen before gets the mean of its stored score and the new
        score; a new label gets the new score as is.
        """
        order = new_predictions.argsort()[0]
        for index in order:
            label = configuration.image_labels[index]
            score = new_predictions[0][index]
            known = label in self.categories
            self.categories[label] = (
                (self.categories[label] + score) / 2 if known else score)
class Subscription(WebSocketHandler):
    """Websocket for subscribers."""

    def initialize(self, publisher):
        self.publisher = publisher
        self.messages = Queue()
        self.finished = False

    def check_origin(self, origin):
        """Accept connections from every origin."""
        return True

    def open(self):
        """Register with the publisher and start the sender loop."""
        print("New subscriber.")
        self.publisher.register(self)
        self.run()

    def on_close(self):
        self._close()

    def _close(self):
        """Deregister from the publisher; safe to call more than once.

        Both ``on_close`` and a failed ``send`` end up here, so the second
        call must not deregister again.
        """
        if self.finished:
            return
        self.finished = True
        print("Subscriber left.")
        self.publisher.deregister(self)

    @gen.coroutine
    def submit(self, message):
        """Queue a message to be sent over the websocket."""
        yield self.messages.put(message)

    @gen.coroutine
    def run(self):
        """Send queued messages until the subscription is finished."""
        while not self.finished:
            message = yield self.messages.get()
            if self.finished:
                # Closed while waiting: do not write to a dead socket.
                break
            self.send(message)

    def send(self, message):
        """Write ``{"value": message}`` to the websocket; close on failure."""
        try:
            self.write_message(dict(value=message))
        except WebSocketClosedError:
            self._close()
class Subscription(WebSocketHandler):
    """Websocket for subscribers."""

    def initialize(self, publisher):
        self.publisher = publisher
        self.messages = Queue()
        self.finished = False

    def open(self):
        """Register with the publisher and start the sender loop."""
        print("New subscriber.")
        self.publisher.register(self)
        self.run()

    def on_close(self):
        self._close()

    def _close(self):
        """Deregister from the publisher; safe to call more than once.

        Both ``on_close`` and a failed ``send`` end up here, so the second
        call must not deregister again.
        """
        if self.finished:
            return
        self.finished = True
        print("Subscriber left.")
        self.publisher.deregister(self)

    @gen.coroutine
    def submit(self, message):
        """Queue a message to be sent over the websocket."""
        yield self.messages.put(message)

    @gen.coroutine
    def run(self):
        """Send queued messages until the subscription is finished."""
        while not self.finished:
            message = yield self.messages.get()
            if self.finished:
                # Closed while waiting: do not write to a dead socket.
                break
            self.send(message)

    def send(self, message):
        """Write ``message`` to the websocket; close on a closed socket."""
        try:
            self.write_message(message)
        except WebSocketClosedError:
            self._close()

    def on_message(self, content):
        """Queue a message received from the websocket for the MQ."""
        self.publisher.MQmessages.put(content)
class QueueStore(BaseStore):
    """Store that broadcasts data through a queue.

    Use it when subscribers must not miss any data: unlike
    :class:`DataStore`, every new message is queued and then sent to the
    clients in order.
    """

    def initialize(self):
        """Create the message queue and start the publishing loop."""
        self.messages = Queue()
        self.publish()

    @gen.coroutine
    def submit(self, message):
        """Queue ``message`` for broadcasting."""
        yield self.messages.put(message)

    @gen.coroutine
    def publish(self):
        """Hand each queued message to every subscriber, forever."""
        while True:
            outgoing = yield self.messages.get()
            if not self.subscribers:
                continue
            yield [each.submit(outgoing) for each in self.subscribers]
class SQSDrain(object):
    """Implementation of IDrain that writes to an AWS SQS queue."""

    def __init__(self, logger, loop, sqs_client, metric_prefix='emitter'):
        self.emitter = sqs_client
        self.logger = logger
        self.loop = loop
        self.metric_prefix = metric_prefix
        self.output_error = Event()
        self.state = RUNNING
        self.sender_tag = 'sender:%s.%s' % (self.__class__.__module__,
                                            self.__class__.__name__)
        self._send_queue = Queue()
        self._should_flush_queue = Event()
        self._flush_handle = None
        self.loop.spawn_callback(self._onSend)

    def _schedule_flush(self):
        """Schedule a flush after MAX_TIMEOUT via ``loop.add_timeout``."""
        self._flush_handle = self.loop.add_timeout(
            MAX_TIMEOUT,
            self._should_flush_queue.set,
        )

    @gen.coroutine
    def _flush_send_batch(self, batch_size):
        """Send up to ``batch_size`` queued messages as one SQS batch.

        The batch is capped at ``emitter.max_messages``. On an ``SQSError``
        the whole batch is put back on the queue; entries reported as failed
        in the response are re-queued as well.
        """
        send_batch = [
            self._send_queue.get_nowait()
            for _ in range(min(batch_size, self.emitter.max_messages))
        ]
        try:
            response = yield self.emitter.send_message_batch(*send_batch)
        except SQSError as err:
            self.logger.exception('Error encountered flushing data to SQS: %s',
                                  err)
            self.output_error.set()
            for msg in send_batch:
                self._send_queue.put_nowait(msg)
        else:
            if response.Failed:
                self.output_error.set()
                for req in response.Failed:
                    self.logger.error('Message failed to send: %s', req.Id)
                    # NOTE(review): this re-queues the failure entry from
                    # the response, not the original message - confirm the
                    # emitter accepts it on the retry.
                    self._send_queue.put_nowait(req)
        finally:
            # Balance every get_nowait() with task_done(), otherwise the
            # join() in close() can never finish. Re-queued items were
            # counted again by put_nowait() above, so the unfinished count
            # never drops to zero while work is still pending.
            for _ in send_batch:
                self._send_queue.task_done()

    @gen.coroutine
    def _onSend(self):
        """Background loop: flush the backlog, then wait for a flush signal."""
        while True:
            qsize = self._send_queue.qsize()
            # This will keep flushing until clear,
            # including items that show up in between flushes
            while qsize > 0:
                try:
                    yield self._flush_send_batch(qsize)
                except Exception as err:
                    self.logger.exception(err)
                    self.output_error.set()
                qsize = self._send_queue.qsize()
            # We've cleared the backlog, remove any possible future flush
            if self._flush_handle:
                self.loop.remove_timeout(self._flush_handle)
                self._flush_handle = None
            self._should_flush_queue.clear()
            yield self._should_flush_queue.wait()

    @gen.coroutine
    def close(self, timeout=None):
        """Mark the drain as closing and wait for the queue to be processed.

        :param timeout: passed to ``Queue.join``
        """
        self.state = CLOSING
        yield self._send_queue.join(timeout)

    def emit_nowait(self, msg):
        """Queue ``msg`` without waiting.

        :raises QueueFull: if ``emitter.max_messages`` or more messages are
            already queued; a flush is signalled before raising
        """
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
            raise QueueFull()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._schedule_flush()
        self.logger.debug("Drain emitting")
        self._send_queue.put_nowait(msg)

    @gen.coroutine
    def emit(self, msg, timeout=None):
        """Queue ``msg``, signalling a flush when the batch size is reached.

        :param timeout: passed to ``Queue.put``
        """
        if self._send_queue.qsize() >= self.emitter.max_messages:
            # Signal flush
            self._should_flush_queue.set()
        elif self._flush_handle is None:
            # Ensure we flush messages at least by MAX_TIMEOUT
            self._schedule_flush()
        yield self._send_queue.put(msg, timeout)
class BaseSpider(object):
    """Base crawler: fetches queued URLs concurrently and reports results
    to the engine. Subclasses set ``url_parser`` and implement
    :meth:`parse_response`."""

    url_parser = None

    def __init__(self, engine, concurrent=3):
        self.engine = engine
        self.concurrency = concurrent
        self.queue = Queue()
        self.http = httpclient.AsyncHTTPClient()

    @property
    def hostname(self):
        return self.url_parser.hostname

    @property
    def url_root(self):
        return self.url_parser.url_root

    @property
    def base_url(self):
        return self.url_parser.base_url

    @gen.coroutine
    def __worker(self):
        """Keep fetching URLs from the queue, one after another."""
        while True:
            yield self.fetch_url()

    @gen.coroutine
    def crawl(self, description, location):
        """Queue the start URL, run the workers and wait for the queue.

        The engine is notified when crawling starts and when it finishes.
        """
        start_url = self.url_parser(description, location)
        self.queue.put(start_url)
        self.engine.notify_started(self)

        for _ in range(self.concurrency):
            self.__worker()

        yield self.queue.join()
        self.engine.notify_finished(self)

    @gen.coroutine
    def fetch_url(self):
        """Take one URL from the queue, fetch and parse it.

        Links returned by :meth:`fetch_links` are queued. Parsed data goes
        to ``engine.write_data``; an HTTP or value error is reported through
        ``engine.write_message`` instead.
        """
        target = yield self.queue.get()
        logger.info('fetching %s' % target)
        try:
            response = yield self.http.fetch(target)
            soup = BeautifulSoup(response.body)
            logger.info('got response %s' % target)

            discovered = yield self.fetch_links(response, soup)
            for link in discovered:
                logger.debug('Added %s to queue' % link)
                yield self.queue.put(link)

            data = yield self.parse_response(response, soup)
            logger.info('Parsed response for %s' % target)
        except (httpclient.HTTPError, ValueError):
            self.engine.write_message('HTTP Error: (%s)' % target,
                                      self.engine.STATUS_ERROR)
        else:
            self.engine.write_data(data)
        finally:
            # Always mark the URL done so crawl()'s join can finish.
            self.queue.task_done()

    @gen.coroutine
    def fetch_links(self, response, soup):
        """Return the URLs to add to the queue; none by default."""
        raise gen.Return([])

    def parse_response(self, response, soup):
        """Extract the data from a response; must be overridden.

        The result should be a list of dicts shaped like this sample:

        {
            'title': 'Job Title',
            'company': 'Company Name',
            'location': 'City/State/Country',
            'tags': ['tag1', 'tag2', 'tag3'],
            'category': 'Software Developer',
            'origin': 'Name of the origin website',
            'url': 'Link to the complete job description',
        }
        """
        raise NotImplementedError
class TornadoSubscriptionManager(SubscriptionManager):
    """Subscription manager driven by the Tornado IOLoop.

    Runs the long-poll subscribe loop, the heartbeat timer and the message
    worker as callbacks/coroutines on ``pubnub_instance.ioloop``.
    """

    def __init__(self, pubnub_instance):
        subscription_manager = self

        self._message_queue = Queue()
        self._consumer_event = Event()
        self._cancellation_event = Event()
        # Serializes subscribe loops: only one may be in flight at a time.
        self._subscription_lock = Semaphore(1)
        # self._current_request_key_object = None
        self._heartbeat_periodic_callback = None
        self._reconnection_manager = TornadoReconnectionManager(pubnub_instance)

        super(TornadoSubscriptionManager, self).__init__(pubnub_instance)
        self._start_worker()

        class TornadoReconnectionCallback(ReconnectionCallback):
            def on_reconnect(self):
                """Restart the subscribe loop and announce the reconnect."""
                subscription_manager.reconnect()

                pn_status = PNStatus()
                pn_status.category = PNStatusCategory.PNReconnectedCategory
                pn_status.error = False

                subscription_manager._subscription_status_announced = True
                subscription_manager._listener_manager.announce_status(pn_status)

        self._reconnection_listener = TornadoReconnectionCallback()
        self._reconnection_manager.set_reconnection_listener(self._reconnection_listener)

    def _set_consumer_event(self):
        """Set the event shared with the message worker."""
        self._consumer_event.set()

    def _message_queue_put(self, message):
        """Enqueue a message for the message worker."""
        self._message_queue.put(message)

    def _start_worker(self):
        """Create the message worker and schedule its ``run`` on the IOLoop."""
        self._consumer = TornadoSubscribeMessageWorker(self._pubnub,
                                                       self._listener_manager,
                                                       self._message_queue,
                                                       self._consumer_event)
        run = stack_context.wrap(self._consumer.run)
        self._pubnub.ioloop.spawn_callback(run)

    def reconnect(self):
        """Clear the stop flag and schedule a new subscribe loop."""
        self._should_stop = False
        self._pubnub.ioloop.spawn_callback(self._start_subscribe_loop)
        # self._register_heartbeat_timer()

    def disconnect(self):
        """Set the stop flag and cancel the heartbeat and subscribe loop."""
        self._should_stop = True
        self._stop_heartbeat_timer()
        self._stop_subscribe_loop()

    @tornado.gen.coroutine
    def _start_subscribe_loop(self):
        """Run one subscribe request and dispatch its outcome.

        Cancels any loop in flight, takes the subscription lock, issues the
        subscribe call and waits for either its result or the cancellation
        event.  On success (or timeout) a new loop is scheduled; on any other
        error the status is announced, reconnection polling is started and
        the manager disconnects.
        """
        self._stop_subscribe_loop()

        yield self._subscription_lock.acquire()

        self._cancellation_event.clear()

        combined_channels = self._subscription_state.prepare_channel_list(True)
        combined_groups = self._subscription_state.prepare_channel_group_list(True)

        if len(combined_channels) == 0 and len(combined_groups) == 0:
            # Nothing to subscribe to.  Give the lock back before leaving;
            # otherwise every later call would block forever on acquire().
            self._subscription_lock.release()
            return

        envelope_future = Subscribe(self._pubnub) \
            .channels(combined_channels).channel_groups(combined_groups) \
            .timetoken(self._timetoken).region(self._region) \
            .filter_expression(self._pubnub.config.filter_expression) \
            .cancellation_event(self._cancellation_event) \
            .future()

        canceller_future = self._cancellation_event.wait()

        wi = tornado.gen.WaitIterator(envelope_future, canceller_future)

        # iterates 2 times: one for result one for cancelled
        while not wi.done():
            try:
                result = yield wi.next()
            except Exception as e:
                # TODO: verify the error will not be eaten
                logger.error(e)
                raise
            else:
                if wi.current_future == envelope_future:
                    e = result
                elif wi.current_future == canceller_future:
                    return
                else:
                    raise Exception("Unexpected future resolved: %s" % str(wi.current_future))

                if e.is_error():
                    # 599 error doesn't works - tornado use this status code
                    # for a wide range of errors, for ex:
                    # HTTP Server Error (599): [Errno -2] Name or service not known
                    if e.status is not None and e.status.category == PNStatusCategory.PNTimeoutCategory:
                        self._pubnub.ioloop.spawn_callback(self._start_subscribe_loop)
                        return

                    logger.error("Exception in subscribe loop: %s" % str(e))

                    if e.status is not None and e.status.category == PNStatusCategory.PNAccessDeniedCategory:
                        e.status.operation = PNOperationType.PNUnsubscribeOperation

                    self._listener_manager.announce_status(e.status)

                    self._reconnection_manager.start_polling()
                    self.disconnect()
                    return
                else:
                    self._handle_endpoint_call(e.result, e.status)
                    self._pubnub.ioloop.spawn_callback(self._start_subscribe_loop)
            finally:
                # Runs on every exit path of the first iteration: cancel the
                # outstanding request, let the loop spin once, then unlock.
                self._cancellation_event.set()
                yield tornado.gen.moment
                self._subscription_lock.release()
                self._cancellation_event.clear()
                # NOTE(review): ``break`` inside ``finally`` discards any
                # in-flight exception or return, so the ``raise`` above never
                # reaches the caller.  Kept as-is to preserve behaviour.
                break

    def _stop_subscribe_loop(self):
        """Signal cancellation to the subscribe request in flight, if any."""
        if self._cancellation_event is not None and not self._cancellation_event.is_set():
            self._cancellation_event.set()

    def _stop_heartbeat_timer(self):
        """Stop the periodic heartbeat callback if one was registered."""
        if self._heartbeat_periodic_callback is not None:
            self._heartbeat_periodic_callback.stop()

    def _register_heartbeat_timer(self):
        """Create and start the periodic heartbeat callback."""
        super(TornadoSubscriptionManager, self)._register_heartbeat_timer()
        self._heartbeat_periodic_callback = PeriodicCallback(
            stack_context.wrap(self._perform_heartbeat_loop),
            self._pubnub.config.heartbeat_interval * TornadoSubscriptionManager.HEARTBEAT_INTERVAL_MULTIPLIER,
            self._pubnub.ioloop)
        self._heartbeat_periodic_callback.start()

    @tornado.gen.coroutine
    def _perform_heartbeat_loop(self):
        """Send one heartbeat and announce its status per the configured verbosity."""
        if self._heartbeat_call is not None:
            # TODO: cancel call
            pass

        cancellation_event = Event()
        state_payload = self._subscription_state.state_payload()
        presence_channels = self._subscription_state.prepare_channel_list(False)
        presence_groups = self._subscription_state.prepare_channel_group_list(False)

        if len(presence_channels) == 0 and len(presence_groups) == 0:
            return

        try:
            envelope = yield self._pubnub.heartbeat() \
                .channels(presence_channels) \
                .channel_groups(presence_groups) \
                .state(state_payload) \
                .cancellation_event(cancellation_event) \
                .future()

            heartbeat_verbosity = self._pubnub.config.heartbeat_notification_options
            # NOTE(review): ``is_error`` is read here without being called,
            # while the subscribe loop calls ``e.is_error()`` on envelopes.
            # If it is a method on the status object too, this test is
            # always truthy - confirm against the status class.
            if envelope.status.is_error:
                # NOTE(review): the original compared against ALL twice
                # ("ALL or ALL"); the second operand was presumably meant to
                # be a failures-only option - confirm before changing.
                if heartbeat_verbosity == PNHeartbeatNotificationOptions.ALL:
                    self._listener_manager.announce_status(envelope.status)
            else:
                if heartbeat_verbosity == PNHeartbeatNotificationOptions.ALL:
                    self._listener_manager.announce_status(envelope.status)

        except PubNubTornadoException:
            pass
            # TODO: check correctness
            # if e.status is not None and e.status.category == PNStatusCategory.PNTimeoutCategory:
            #     self._start_subscribe_loop()
            # else:
            #     self._listener_manager.announce_status(e.status)
        except Exception as e:
            # Report through the module logger, like the subscribe loop does.
            logger.error(e)
        finally:
            cancellation_event.set()

    @tornado.gen.coroutine
    def _send_leave(self, unsubscribe_operation):
        """Send a leave for the operation's channels/groups and announce the status."""
        envelope = yield Leave(self._pubnub) \
            .channels(unsubscribe_operation.channels) \
            .channel_groups(unsubscribe_operation.channel_groups).future()
        self._listener_manager.announce_status(envelope.status)
class Client(object):
    """One connected chat client with an inbound and an outbound queue.

    ``receive`` parses lines from the stream into the outbound queue,
    ``forwarding`` routes outbound messages to room queues, and ``response``
    writes inbound messages back to the stream.
    """

    def __init__(self, server, name, stream):
        self.server = server
        self.name = name
        self.stream = stream
        self.rooms = {}
        self.inqueue = Queue(maxsize=QUEUE_SIZE)
        self.outqueue = Queue(maxsize=QUEUE_SIZE)

    @coroutine
    def forwarding(self):
        """Route each outbound message to the room queue(s) it targets."""
        while True:
            outgoing = yield self.outqueue.get()
            command = outgoing.command

            if command == COMMAND_QUIT:
                # A quit is broadcast to every room this client has joined.
                for joined_room in self.rooms.values():
                    yield joined_room.inqueue.put(outgoing)
            else:
                target_name = outgoing.receiver
                if command == COMMAND_JOIN:
                    target_room = self.server.get_room(target_name)
                    self.rooms[target_name] = target_room
                else:
                    target_room = self.rooms[target_name]
                yield target_room.inqueue.put(outgoing)

            self.outqueue.task_done()

    @coroutine
    def response(self):
        """Write inbound messages to the stream until a quit arrives."""
        global SPEED
        while True:
            incoming = yield self.inqueue.get()
            if incoming.command == COMMAND_QUIT:
                self.stream.close()
                return

            text = "%s %s:%s\n" % (datetime.datetime.now(),
                                   incoming.sender.name,
                                   incoming.content.decode())
            payload = text.encode('utf-8')
            try:
                SPEED += 1
                yield self.stream.write(payload)
            except Exception as err:
                logging.debug(str(err))
                self.stream.close()

    @coroutine
    def receive(self):
        """Read ``<room> <content>`` lines and queue them as messages.

        A read failure queues a quit message and ends the coroutine.  Lines
        that do not split into exactly two space-separated fields are
        ignored.
        """
        while True:
            try:
                raw_line = yield self.stream.read_until(b'\n')
            except Exception as err:
                logging.debug(str(err))
                farewell = Message(self, '', COMMAND_QUIT, 'CONNECTION ERROR')
                yield self.outqueue.put(farewell)
                return

            fields = raw_line.strip().split(b' ')
            if len(fields) != 2:
                continue

            room_name, content = fields
            # First message to a room the client has not joined is a join.
            if room_name in self.rooms:
                command = COMMAND_NORMAL
            else:
                command = COMMAND_JOIN
            yield self.outqueue.put(Message(self, room_name, command, content))
class BlogBackup(object):
    """Log in with a PhantomJS driver and save every blog essay as Markdown.

    Configuration keys read from ``conf``: ``username``, ``passwd``
    (required), ``phantomjs_path`` and ``save_path`` (optional).
    """

    _default_dir_name = 'seg_blog_backup'

    def _generate_save_dir(self):
        """Use (and create if needed) the default directory next to this file."""
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        """Validate the configured save path, or fall back to the default.

        Raises:
            BlogSavePathError: the configured path is missing or not a
                directory.
        """
        if self.save_path:
            if os.path.exists(self.save_path) and \
                    os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError(
                    "'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    def _get_user_cookies(self):
        """Submit the login form and return the driver's cookies.

        Raises:
            PageHtmlChanged: an expected login form element is missing.
            Exception: the form was not replaced after submit, or the
                browser did not leave the login URL within the retry budget.
        """
        url = target_url + login_page_path
        self.driver.get(url)
        try:
            user_input = self.driver.find_element_by_name('mail')
            passwd_input = self.driver.find_element_by_name('password')
            submit_btn = self.driver.find_element_by_class_name('pr20')
        except NoSuchElementException:
            raise PageHtmlChanged(
                "%s login page structure have changed!" % _domain)

        user_input.send_keys(self.username)
        passwd_input.send_keys(self.passwd)
        submit_btn.click()
        try:
            WebDriverWait(self.driver, 3).until(staleness_of(submit_btn))
        except TimeoutException:
            raise Exception("Wrong username or password!")

        WebDriverWait(self.driver, timeout=10).until(has_page_load)
        try_times = 0
        while True:
            time.sleep(1)
            if url != self.driver.current_url:
                return self.driver.get_cookies()

            try_times += 1
            if try_times > 10:
                raise Exception("Getting cookie info failed!")

    def _get_driver(self):
        """Build the PhantomJS driver, honouring ``phantomjs_path`` if set.

        Raises:
            PhantomjsPathError: the configured executable path is unusable.
        """
        if self.phantomjs_path:
            try:
                return webdriver.PhantomJS(
                    executable_path=self.phantomjs_path,
                    service_log_path=os.path.devnull)
            except WebDriverException:
                raise PhantomjsPathError("Phantomjs locate path invalid!")
        else:
            return webdriver.PhantomJS(service_log_path=os.path.devnull)

    def __init__(self, **conf):
        self.username = conf['username']
        self.passwd = conf['passwd']
        self.phantomjs_path = conf.get('phantomjs_path')
        self.save_path = conf.get('save_path')
        self._q = Queue()

        self._parse_save_path()
        self.driver = self._get_driver()
        self._cookies = self._get_user_cookies()

    @gen.coroutine
    def run(self):
        """Collect every essay link, then download them with worker coroutines."""
        self.__filter_cookies()

        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()

        yield self._q.join()

    def __filter_cookies(self):
        """Reduce the driver cookies to a name->value dict for ``_domain``."""
        self._cookies = {k['name']: k['value'] for k in self._cookies
                         if k['domain'] == _domain}

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        """Queue every essay link on a list page and follow pagination."""
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d('.stream-list__item > .summary > h2 > a')
        for link in link_elements:
            yield self._q.put(d(link).attr('href'))

        next_ele = d('.pagination li.next a')
        if next_ele:
            next_page_url = target_url + next_ele.attr('href')
            # Wait for the following pages too, so that the caller only
            # resumes once the whole listing has been queued.
            yield self._fetch_blog_list_page(next_page_url)

    @gen.coroutine
    def _fetch_essay_content(self):
        """Drain the queue, writing each essay to ``<title>.md``.

        Returns once no item arrives within one second.
        """
        import datetime

        # A bare number passed as ``timeout`` is an absolute deadline on the
        # IOLoop clock; a timedelta is what expresses "one second from now".
        wait_limit = datetime.timedelta(seconds=1)

        while True:
            try:
                essay_path = yield self._q.get(timeout=wait_limit)
            except gen.TimeoutError:
                # Nothing was dequeued, so there is nothing to mark done.
                raise gen.Return()

            try:
                essay_url = target_url + essay_path + edit_suffix
                ret = requests.get(essay_url, cookies=self._cookies)
                d = pq(ret.text)
                title = d("#myTitle").val()
                content = d("#myEditor").text()
                file_name = title + '.md'
                real_file_name = os.path.join(self.save_path, file_name)
                with open(real_file_name, 'w') as f:
                    f.write(content.encode('utf8'))
            finally:
                # Exactly one task_done() per successful get().
                self._q.task_done()
class Scraper():
    """Fetch a collection of URLs concurrently and pass each body to a callback.

    Constructing an instance queues the URLs, starts the fetch loop and
    blocks in ``IOLoop.start()`` until every queued URL has been processed.
    """

    def __init__(
            self,
            destinations=None,
            transform=None,
            headers=None,
            max_clients=50,
            maxsize=50,
            connect_timeout=1200,
            request_timeout=600,):
        """Instantiate a tornado async http client to do multiple concurrent requests

        Args:
            destinations: iterable of URLs to fetch (required).
            transform: callable invoked as ``transform(body, url=url)`` for
                each successful response (required).
            headers: dict of request headers; defaults to no extra headers.
            max_clients: ``max_clients`` passed to ``AsyncHTTPClient.configure``.
            maxsize: capacity of the internal URL queue.
            connect_timeout: per-request connect timeout.
            request_timeout: per-request total timeout.

        Raises:
            SystemExit: ``destinations`` or ``transform`` was not supplied.
        """
        if destinations is None or transform is None:
            sys.stderr.write('You must pass both collection of URLS and a transform function')
            raise SystemExit

        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout

        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient",
                                  max_clients=self.max_clients)

        self.http_client = AsyncHTTPClient()
        # Honour the caller's maxsize instead of a hard-coded 50.
        self.queue = Queue(maxsize=self.maxsize)
        self.destinations = destinations
        self.transform = transform
        # A fresh dict per instance; avoids a shared mutable default.
        self.headers = {} if headers is None else headers

        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout,
                 self.request_timeout, self.http_client)

        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        """Put every URL on the queue, waiting whenever the queue is full."""
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout, http_client):
        """Consume the queue forever, starting one fetch per URL.

        Every dequeued URL is marked done exactly once - after its response
        was handled, or immediately if the request could not be built - so
        ``queue.join()`` cannot hang on a failed URL.  Failures are written
        to stderr.
        """
        while True:
            url = yield self.queue.get()

            try:
                request = HTTPRequest(url,
                                      connect_timeout=connect_timeout,
                                      request_timeout=request_timeout,
                                      method="GET",
                                      headers=headers)
            except Exception as e:
                sys.stderr.write('Destination {0} returned error {1}'.format(url, str(e) + '\n'))
                # No request exists for this URL; account for it and move on.
                self.queue.task_done()
                continue

            future = http_client.fetch(request)

            # ``url=url`` pins this iteration's value inside the closure.
            def done_callback(future, url=url):
                try:
                    response = future.result()
                    transform(response.body, url=response.effective_url)
                except Exception as e:
                    sys.stderr.write('Destination {0} returned error {1}'.format(url, str(e) + '\n'))
                finally:
                    self.queue.task_done()

            try:
                future.add_done_callback(done_callback)
            except Exception as e:
                sys.stderr.write(str(e))
                # Retry the URL.  Not yielded on purpose: this coroutine is
                # the only consumer, so waiting on a full queue would stall.
                self.queue.put(url)
                self.queue.task_done()
class ProjectGroomer(object):
    """ Cleans up expired transactions for a project. """

    def __init__(self, project_id, coordinator, zk_client, db_access,
                 thread_pool):
        """ Creates a new ProjectGroomer.

        Args:
          project_id: A string specifying a project ID.
          coordinator: A GroomingCoordinator.
          zk_client: A KazooClient.
          db_access: A DatastoreProxy.
          thread_pool: A ThreadPoolExecutor.
        """
        self.project_id = project_id

        self._coordinator = coordinator
        self._zk_client = zk_client
        self._tornado_zk = TornadoKazoo(self._zk_client)
        self._db_access = db_access
        self._thread_pool = thread_pool
        self._project_node = '/appscale/apps/{}'.format(self.project_id)
        self._containers = []
        self._inactive_containers = set()
        self._batch_resolver = BatchResolver(self.project_id, self._db_access)

        self._zk_client.ensure_path(self._project_node)
        self._zk_client.ChildrenWatch(self._project_node,
                                      self._update_containers)

        self._txid_manual_offset = 0
        self._offset_node = '/'.join([self._project_node, OFFSET_NODE])
        self._zk_client.DataWatch(self._offset_node, self._update_offset)

        self._stop_event = AsyncEvent()
        self._stopped_event = AsyncEvent()

        # Keeps track of cleanup results for each round of grooming.
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = None

        self._worker_queue = AsyncQueue(maxsize=MAX_CONCURRENCY)
        for _ in range(MAX_CONCURRENCY):
            IOLoop.current().spawn_callback(self._worker)

        IOLoop.current().spawn_callback(self.start)

    @gen.coroutine
    def start(self):
        """ Starts the grooming process until the stop event is set. """
        logger.info('Grooming {}'.format(self.project_id))
        while True:
            if self._stop_event.is_set():
                break

            try:
                yield self._groom_project()
            except Exception:
                # Prevent the grooming loop from stopping if an error is
                # encountered.
                logger.exception(
                    'Unexpected error while grooming {}'.format(self.project_id))
                yield gen.sleep(MAX_TX_DURATION)

        self._stopped_event.set()

    @gen.coroutine
    def stop(self):
        """ Stops the grooming process. """
        logger.info('Stopping grooming process for {}'.format(self.project_id))
        self._stop_event.set()
        yield self._stopped_event.wait()

    @gen.coroutine
    def _worker(self):
        """ Processes items in the worker queue.

        A failure while resolving one transaction is logged and the worker
        keeps running.  Without this, each failure would permanently remove
        one consumer, and once all were gone _fetch_and_clean would block on
        the bounded queue forever.
        """
        while True:
            tx_path, composite_indexes = yield self._worker_queue.get()
            try:
                tx_time = yield self._resolve_txid(tx_path, composite_indexes)
                if tx_time is None:
                    # None means the transaction was invalid and removed.
                    self._txids_cleaned += 1
                elif tx_time < self._oldest_valid_tx_time:
                    self._oldest_valid_tx_time = tx_time
            except Exception:
                logger.exception(
                    'Unexpected error while resolving {}'.format(tx_path))
            finally:
                self._worker_queue.task_done()

    def _update_offset(self, new_offset, _):
        """ Watches for updates to the manual offset node.

        Args:
          new_offset: A string specifying the new manual offset.
        """
        self._txid_manual_offset = int(new_offset or 0)

    def _update_containers(self, nodes):
        """ Updates the list of active txid containers.

        Args:
          nodes: A list of strings specifying ZooKeeper nodes.
        """
        # A container named exactly CONTAINER_PREFIX counts as number 1.
        counters = [int(node[len(CONTAINER_PREFIX):] or 1)
                    for node in nodes
                    if node.startswith(CONTAINER_PREFIX)
                    and node not in self._inactive_containers]
        counters.sort()

        containers = [CONTAINER_PREFIX + str(counter) for counter in counters]
        if containers and containers[0] == '{}1'.format(CONTAINER_PREFIX):
            containers[0] = CONTAINER_PREFIX

        self._containers = containers

    @gen.coroutine
    def _groom_project(self):
        """ Runs the grooming process. """
        index = self._coordinator.index
        worker_count = self._coordinator.total_workers

        oldest_valid_tx_time = yield self._fetch_and_clean(index, worker_count)

        # Wait until there's a reasonable chance that some transactions have
        # timed out.
        next_timeout_eta = oldest_valid_tx_time + MAX_TX_DURATION

        # The oldest ignored transaction should still be valid, but ensure
        # that the timeout is not negative.
        next_timeout = max(0, next_timeout_eta - time.time())
        time_to_wait = datetime.timedelta(
            seconds=next_timeout + (MAX_TX_DURATION / 2))

        # Allow the wait to be cut short when a project is removed.
        try:
            yield self._stop_event.wait(timeout=time_to_wait)
        except gen.TimeoutError:
            raise gen.Return()

    @gen.coroutine
    def _remove_path(self, tx_path):
        """ Removes a ZooKeeper node.

        Args:
          tx_path: A string specifying the path to delete.
        """
        try:
            yield self._tornado_zk.delete(tx_path)
        except NoNodeError:
            pass
        except NotEmptyError:
            # The node has children; delete it recursively on the pool.
            yield self._thread_pool.submit(self._zk_client.delete, tx_path,
                                           recursive=True)

    @gen.coroutine
    def _resolve_txid(self, tx_path, composite_indexes):
        """ Cleans up a transaction if it has expired.

        Args:
          tx_path: A string specifying the location of the ZooKeeper node.
          composite_indexes: A list of CompositeIndex objects.
        Returns:
          The transaction start time if still valid, None if invalid because
          this method will also delete it.
        """
        tx_data = yield self._tornado_zk.get(tx_path)
        tx_time = float(tx_data[0])

        _, container, tx_node = tx_path.rsplit('/', 2)
        # NOTE(review): lstrip() removes any leading characters found in
        # COUNTER_NODE_PREFIX rather than the prefix as a unit - confirm the
        # prefix shares no characters with the counter digits.
        tx_node_id = int(tx_node.lstrip(COUNTER_NODE_PREFIX))
        container_count = int(container[len(CONTAINER_PREFIX):] or 1)
        if tx_node_id < 0:
            yield self._remove_path(tx_path)
            raise gen.Return()

        container_size = MAX_SEQUENCE_COUNTER + 1
        automatic_offset = (container_count - 1) * container_size
        txid = self._txid_manual_offset + automatic_offset + tx_node_id

        if txid < 1:
            yield self._remove_path(tx_path)
            raise gen.Return()

        # If the transaction is still valid, return the time it was created.
        if tx_time + MAX_TX_DURATION >= time.time():
            raise gen.Return(tx_time)

        yield self._batch_resolver.resolve(txid, composite_indexes)
        yield self._remove_path(tx_path)
        yield self._batch_resolver.cleanup(txid)

    @gen.coroutine
    def _fetch_and_clean(self, worker_index, worker_count):
        """ Cleans up expired transactions.

        Args:
          worker_index: An integer specifying this worker's index.
          worker_count: An integer specifying the number of total workers.
        Returns:
          A float specifying the time of the oldest valid transaction as a
          unix timestamp.
        """
        self._txids_cleaned = 0
        self._oldest_valid_tx_time = time.time()

        children = []
        for index, container in enumerate(self._containers):
            container_path = '/'.join([self._project_node, container])
            new_children = yield self._tornado_zk.get_children(container_path)

            # An empty container that is not the last one is skipped by
            # later container updates.
            if not new_children and index < len(self._containers) - 1:
                self._inactive_containers.add(container)

            children.extend(['/'.join([container_path, node])
                             for node in new_children])

        logger.debug(
            'Found {} transaction IDs for {}'.format(len(children),
                                                     self.project_id))

        if not children:
            raise gen.Return(self._oldest_valid_tx_time)

        # Refresh these each time so that the indexes are fresh.
        encoded_indexes = yield self._thread_pool.submit(
            self._db_access.get_indices, self.project_id)
        composite_indexes = [CompositeIndex(index) for index in encoded_indexes]

        for tx_path in children:
            tx_node_id = int(tx_path.split('/')[-1].lstrip(COUNTER_NODE_PREFIX))
            # Only resolve transactions that this worker has been assigned.
            if tx_node_id % worker_count != worker_index:
                continue

            yield self._worker_queue.put((tx_path, composite_indexes))

        yield self._worker_queue.join()

        if self._txids_cleaned > 0:
            logger.info('Cleaned up {} expired txids for {}'.format(
                self._txids_cleaned, self.project_id))

        raise gen.Return(self._oldest_valid_tx_time)
class Worker(Server):
    """ Worker Node

    Workers perform two functions:

    1.  **Serve data** from a local dictionary
    2.  **Perform computation** on that data and on data from peers

    Additionally workers keep a Center informed of their data and use that
    Center to gather data from other workers when necessary to perform a
    computation.

    You can start a worker with the ``dworker`` command line application::

        $ dworker scheduler-ip:port

    **State**

    * **data:** ``{key: object}``:
        Dictionary mapping keys to actual values
    * **active:** ``{key}``:
        Set of keys currently under computation
    * **ncores:** ``int``:
        Number of cores used by this worker process
    * **executor:** ``concurrent.futures.ThreadPoolExecutor``:
        Executor used to perform computation
    * **local_dir:** ``path``:
        Path on local machine to store temporary files
    * **center:** ``rpc``:
        Location of center or scheduler.  See ``.ip/.port`` attributes.
    * **name:** ``string``:
        Alias
    * **services:** ``{str: Server}``:
        Auxiliary web servers running on this worker
    * **service_ports:** ``{str: port}``:

    Examples
    --------

    Create centers and workers in Python:

    >>> from distributed import Center, Worker
    >>> c = Center('192.168.0.100', 8787)  # doctest: +SKIP
    >>> w = Worker(c.ip, c.port)  # doctest: +SKIP
    >>> yield w._start(port=8788)  # doctest: +SKIP

    Or use the command line::

        $ dcenter
        Start center at 127.0.0.1:8787

        $ dworker 127.0.0.1:8787
        Start worker at:            127.0.0.1:8788
        Registered with center at:  127.0.0.1:8787

    See Also
    --------
    distributed.center.Center:
    """

    def __init__(self, center_ip, center_port, ip=None, ncores=None,
                 loop=None, local_dir=None, services=None, service_ports=None,
                 name=None, **kwargs):
        self.ip = ip or get_ip()
        self._port = 0
        self.ncores = ncores or _ncores
        self.data = dict()
        self.loop = loop or IOLoop.current()
        self.status = None
        self.local_dir = local_dir or tempfile.mkdtemp(prefix='worker-')
        self.executor = ThreadPoolExecutor(self.ncores)
        # One token per core; executor_submit takes a token before running.
        # https://github.com/tornadoweb/tornado/issues/1595#issuecomment-198551572
        self.thread_tokens = Queue()
        for i in range(self.ncores):
            self.thread_tokens.put_nowait(i)
        self.center = rpc(ip=center_ip, port=center_port)
        self.active = set()
        self.name = name

        if not os.path.exists(self.local_dir):
            os.mkdir(self.local_dir)

        # Uploaded modules are written to local_dir; make them importable.
        if self.local_dir not in sys.path:
            sys.path.insert(0, self.local_dir)

        self.services = {}
        self.service_ports = service_ports or {}
        for k, v in (services or {}).items():
            # A key may be ``(name, port)`` to request a specific port.
            if isinstance(k, tuple):
                k, port = k
            else:
                port = 0

            self.services[k] = v(self)
            self.services[k].listen(port)
            self.service_ports[k] = self.services[k].port

        handlers = {'compute': self.compute,
                    'gather': self.gather,
                    'compute-stream': self.compute_stream,
                    'run': self.run,
                    'get_data': self.get_data,
                    'update_data': self.update_data,
                    'delete_data': self.delete_data,
                    'terminate': self.terminate,
                    'ping': pingpong,
                    'health': self.health,
                    'upload_file': self.upload_file}

        super(Worker, self).__init__(handlers, **kwargs)

    @gen.coroutine
    def _start(self, port=0):
        """Listen on ``port`` and register with the center, retrying until reachable.

        Raises:
            ValueError: the center answered with anything other than 'OK'.
        """
        self.listen(port)
        self.name = self.name or self.address
        for k, v in self.services.items():
            v.listen(0)
            self.service_ports[k] = v.port

        logger.info(' Start worker at: %20s:%d', self.ip, self.port)
        for k, v in self.service_ports.items():
            logger.info(' %16s at: %20s:%d' % (k, self.ip, v))
        logger.info('Waiting to connect to: %20s:%d',
                    self.center.ip, self.center.port)
        while True:
            try:
                resp = yield self.center.register(
                    ncores=self.ncores, address=(self.ip, self.port),
                    keys=list(self.data), services=self.service_ports,
                    name=self.name)
                break
            except (OSError, StreamClosedError):
                logger.debug("Unable to register with scheduler. Waiting")
                yield gen.sleep(0.5)
        if resp != 'OK':
            raise ValueError(resp)
        logger.info(' Registered to: %20s:%d',
                    self.center.ip, self.center.port)
        self.status = 'running'

    def start(self, port=0):
        """Schedule ``_start`` on the worker's event loop."""
        self.loop.add_callback(self._start, port)

    def identity(self, stream):
        """Return the worker's type name, id and center address."""
        return {'type': type(self).__name__, 'id': self.id,
                'center': (self.center.ip, self.center.port)}

    @gen.coroutine
    def _close(self, report=True, timeout=10):
        """Unregister (optionally), stop serving and remove the local directory."""
        if report:
            yield gen.with_timeout(timedelta(seconds=timeout),
                                   self.center.unregister(address=(self.ip, self.port)),
                                   io_loop=self.loop)
        self.center.close_streams()
        self.stop()
        self.executor.shutdown()
        if os.path.exists(self.local_dir):
            shutil.rmtree(self.local_dir)

        for k, v in self.services.items():
            v.stop()
        self.status = 'closed'
        self.stop()

    @gen.coroutine
    def terminate(self, stream, report=True):
        """Handler: close the worker and answer 'OK'."""
        yield self._close(report=report)
        raise Return('OK')

    @property
    def address(self):
        """``'ip:port'`` string for this worker."""
        return '%s:%d' % (self.ip, self.port)

    @property
    def address_tuple(self):
        """``(ip, port)`` tuple for this worker."""
        return (self.ip, self.port)

    @gen.coroutine
    def gather(self, stream=None, who_has=None):
        """Handler: fetch keys not held locally from peers into ``self.data``.

        Returns a dict with ``'status'`` of ``'OK'`` or ``'missing-data'``
        (the latter also carries the missing ``'keys'``).
        """
        who_has = {k: [coerce_to_address(addr) for addr in v]
                   for k, v in who_has.items()
                   if k not in self.data}
        try:
            result = yield gather_from_workers(who_has)
        except KeyError as e:
            # The message needs a placeholder for the extra argument;
            # without one the logging module reports a formatting error.
            logger.warn("Could not find data: %s", e)
            raise Return({'status': 'missing-data',
                          'keys': e.args})
        else:
            self.data.update(result)
            raise Return({'status': 'OK'})

    @gen.coroutine
    def _ready_task(self, function=None, key=None, args=(), kwargs={},
                    task=None, who_has=None):
        """Gather dependencies and deserialize a task ahead of execution.

        Returns a dict whose ``'status'`` is ``'OK'`` (with ``function``,
        ``args``, ``kwargs`` and ``diagnostics``), ``'missing-data'``, or an
        error message when deserialization failed.
        """
        diagnostics = {}
        if who_has:
            local_data = {k: self.data[k] for k in who_has if k in self.data}
            who_has = {k: set(map(coerce_to_address, v))
                       for k, v in who_has.items()
                       if k not in self.data}
            try:
                logger.info("gather %d keys from peers: %s",
                            len(who_has), str(who_has))
                diagnostics['transfer-start'] = time()
                other = yield gather_from_workers(who_has)
                diagnostics['transfer-stop'] = time()
                data = merge(local_data, other)
            except KeyError as e:
                logger.warn("Could not find data for %s", key)
                raise Return({'status': 'missing-data',
                              'keys': e.args,
                              'key': key})
        else:
            data = {}

        try:
            start = default_timer()
            if task is not None:
                task = loads(task)
            if function is not None:
                function = loads(function)
            if args:
                args = loads(args)
            if kwargs:
                kwargs = loads(kwargs)
            diagnostics['deserialization'] = default_timer() - start
        except Exception as e:
            logger.warn("Could not deserialize task", exc_info=True)
            raise Return(assoc(error_message(e), 'key', key))

        if task is not None:
            assert not function and not args and not kwargs
            function = execute_task
            args = (task,)

        # Fill args with data
        args2 = pack_data(args, data)
        kwargs2 = pack_data(kwargs, data)

        raise Return({'status': 'OK',
                      'function': function,
                      'args': args2,
                      'kwargs': kwargs2,
                      'diagnostics': diagnostics,
                      'key': key})

    @gen.coroutine
    def executor_submit(self, key, function, *args, **kwargs):
        """ Safely run function in thread pool executor

        We've run into issues running concurrent.future futures within
        tornado.  Apparently it's advantageous to use timeouts and periodic
        callbacks to ensure things run smoothly.  This can get tricky, so we
        pull it off into an separate method.
        """
        # Wait for a free thread token before submitting.
        token = yield self.thread_tokens.get()
        job_counter[0] += 1
        i = job_counter[0]
        # logger.info("%s:%d Starts job %d, %s", self.ip, self.port, i, key)
        future = self.executor.submit(function, *args, **kwargs)
        pc = PeriodicCallback(lambda: logger.debug("future state: %s - %s",
                                                   key, future._state), 1000)
        pc.start()
        try:
            if sys.version_info < (3, 2):
                yield future
            else:
                # Poll in one-second slices, logging while the job is pending.
                while not future.done() and future._state != 'FINISHED':
                    try:
                        yield gen.with_timeout(timedelta(seconds=1), future,
                                               io_loop=self.loop)
                        break
                    except gen.TimeoutError:
                        logger.info("work queue size: %d",
                                    self.executor._work_queue.qsize())
                        logger.info("future state: %s", future._state)
                        logger.info("Pending job %d: %s", i, future)
        finally:
            pc.stop()
            self.thread_tokens.put(token)

        result = future.result()

        logger.info("Finish job %d, %s", i, key)
        raise gen.Return(result)

    @gen.coroutine
    def compute_stream(self, stream):
        """Handler: serve ``compute-task`` messages over a long-lived stream.

        Results are sent back through a ``BatchedSend``.  The loop ends when
        the stream closes or a ``'close'`` operation is received.
        """
        with log_errors():
            logger.debug("Open compute stream")
            bstream = BatchedSend(interval=10, loop=self.loop)
            bstream.start(stream)

        @gen.coroutine
        def process(msg):
            try:
                result = yield self.compute(report=False, **msg)
                bstream.send(result)
            except Exception as e:
                logger.exception(e)
                bstream.send(assoc(error_message(e), 'key', msg.get('key')))

        with log_errors():
            closed = False
            while not closed:
                try:
                    msgs = yield read(stream)
                except StreamClosedError:
                    break
                if not isinstance(msgs, list):
                    msgs = [msgs]

                for msg in msgs:
                    op = msg.pop('op', None)
                    if op == 'close':
                        # Leave the outer read loop as well; a bare break
                        # would only abandon the rest of this batch.
                        closed = True
                        break
                    elif op == 'compute-task':
                        self.loop.add_callback(process, msg)
                    else:
                        logger.warning("Unknown operation %s, %s", op, msg)

            yield bstream.close()
            logger.info("Close compute stream")

    @gen.coroutine
    def compute(self, stream=None, function=None, key=None, args=(), kwargs={},
                task=None, who_has=None, report=True):
        """ Execute function

        Prepares the task, runs it in the thread pool, stores a successful
        result under ``key`` and optionally reports the key to the center.
        ``key`` is removed from ``self.active`` on every exit path,
        including exceptions.
        """
        self.active.add(key)
        try:
            # Ready function for computation
            msg = yield self._ready_task(function=function, key=key, args=args,
                                         kwargs=kwargs, task=task,
                                         who_has=who_has)
            if msg['status'] != 'OK':
                raise Return(msg)

            function = msg['function']
            args = msg['args']
            kwargs = msg['kwargs']

            # Log and compute in separate thread
            result = yield self.executor_submit(key, apply_function, function,
                                                args, kwargs)

            result['key'] = key
            result.update(msg['diagnostics'])

            if result['status'] == 'OK':
                self.data[key] = result.pop('result')
                if report:
                    response = yield self.center.add_keys(
                        address=(self.ip, self.port), keys=[key])
                    if not response == 'OK':
                        logger.warn('Could not report results to center: %s',
                                    str(response))
            else:
                logger.warn(" Compute Failed\n"
                            "Function: %s\n"
                            "args: %s\n"
                            "kwargs: %s\n",
                            str(funcname(function))[:1000],
                            str(args)[:1000],
                            str(kwargs)[:1000], exc_info=True)

            logger.debug("Send compute response to scheduler: %s, %s", key, msg)
            raise Return(result)
        finally:
            self.active.discard(key)

    @gen.coroutine
    def run(self, stream, function=None, args=(), kwargs={}):
        """Handler: deserialize and call ``function`` on the event loop thread."""
        function = loads(function)
        if args:
            args = loads(args)
        if kwargs:
            kwargs = loads(kwargs)
        try:
            result = function(*args, **kwargs)
        except Exception as e:
            logger.warn(" Run Failed\n"
                        "Function: %s\n"
                        "args: %s\n"
                        "kwargs: %s\n",
                        str(funcname(function))[:1000],
                        str(args)[:1000],
                        str(kwargs)[:1000], exc_info=True)

            response = error_message(e)
        else:
            response = {
                'status': 'OK',
                'result': dumps(result),
            }
        raise Return(response)

    @gen.coroutine
    def update_data(self, stream, data=None, report=True):
        """Handler: store serialized values and optionally report the keys."""
        data = valmap(loads, data)
        self.data.update(data)
        if report:
            response = yield self.center.add_keys(address=(self.ip, self.port),
                                                  keys=list(data))
            assert response == 'OK'
        info = {'nbytes': {k: sizeof(v) for k, v in data.items()},
                'status': 'OK'}
        raise Return(info)

    @gen.coroutine
    def delete_data(self, stream, keys=None, report=True):
        """Handler: drop keys from local storage and optionally report it."""
        for key in keys:
            if key in self.data:
                del self.data[key]
        logger.info("Deleted %d keys", len(keys))
        if report:
            logger.debug("Reporting loss of keys to center")
            yield self.center.remove_keys(address=self.address,
                                          keys=list(keys))
        raise Return('OK')

    def get_data(self, stream, keys=None):
        """Handler: return serialized values for the requested keys held locally."""
        return {k: dumps(self.data[k]) for k in keys if k in self.data}

    def upload_file(self, stream, filename=None, data=None, load=True):
        """Handler: write a file into ``local_dir`` and optionally import it.

        ``.py``/``.pyc`` files are (re)loaded as modules; for ``.egg`` files
        every distribution found inside is (re)loaded.
        """
        out_filename = os.path.join(self.local_dir, filename)
        if isinstance(data, unicode):
            data = data.encode()
        with open(out_filename, 'wb') as f:
            f.write(data)
            f.flush()

        if load:
            try:
                name, ext = os.path.splitext(filename)
                if ext in ('.py', '.pyc'):
                    logger.info("Reload module %s from .py file", name)
                    name = name.split('-')[0]
                    reload(import_module(name))
                if ext == '.egg':
                    sys.path.append(out_filename)
                    # Materialize the result: it must be iterated and then
                    # tested for emptiness, which a one-shot iterator
                    # cannot support.
                    pkgs = list(pkg_resources.find_distributions(out_filename))
                    for pkg in pkgs:
                        logger.info("Load module %s from egg", pkg.project_name)
                        reload(import_module(pkg.project_name))
                    if not pkgs:
                        logger.warning("Found no packages in egg file")
            except Exception as e:
                logger.exception(e)
                return {'status': 'error', 'exception': dumps(e)}
        return {'status': 'OK', 'nbytes': len(data)}

    def health(self, stream=None):
        """ Information about worker

        Always reports ``active``, ``stored`` and ``time``.  When ``psutil``
        is importable, CPU and memory figures are added, plus network and
        disk byte deltas since the previous call (from the second call on).
        """
        d = {'active': len(self.active),
             'stored': len(self.data),
             'time': time()}
        try:
            import psutil
        except ImportError:
            return d

        mem = psutil.virtual_memory()
        d.update({'cpu': psutil.cpu_percent(),
                  'memory': mem.total,
                  'memory-percent': mem.percent})

        try:
            net_io = psutil.net_io_counters()
        except AttributeError:
            net_io = None
        last_net_io = getattr(self, '_last_net_io', None)
        if net_io is not None and last_net_io is not None:
            d['network-send'] = net_io.bytes_sent - last_net_io.bytes_sent
            d['network-recv'] = net_io.bytes_recv - last_net_io.bytes_recv
        self._last_net_io = net_io

        try:
            disk_io = psutil.disk_io_counters()
        except (AttributeError, RuntimeError):
            disk_io = None
        last_disk_io = getattr(self, '_last_disk_io', None)
        # Keep the fresh sample as the next baseline even on the first call;
        # storing None there would suppress disk figures forever.
        if disk_io is not None and last_disk_io is not None:
            d['disk-read'] = disk_io.read_bytes - last_disk_io.read_bytes
            d['disk-write'] = disk_io.write_bytes - last_disk_io.write_bytes
        self._last_disk_io = disk_io

        return d
class PlainStreamHandler(web.RequestHandler, Watcher):
    """
    Provides the job stdout stream via plain HTTP GET.

    The request must carry the query arguments ``project``, ``hash`` and
    ``job``.  Once the job is resolved, the handler registers itself as a
    watcher of that job and relays every update it receives to the client
    as plain text until the connection-end marker arrives.
    """

    @gen.coroutine
    def get(self):
        # Both attributes are consulted by the connection/finish hooks below,
        # which may run even when this method bails out early.
        self.job = None
        self.queue = None

        # Validate the required query arguments in a fixed order, answering
        # with the matching message for the first one that is missing.
        raw_values = []
        for arg_name, missing_msg in (
                ("project", b"no project given\n"),
                ("hash", b"no build hash given\n"),
                ("job", b"no job given\n")):
            try:
                raw_values.append(self.request.query_arguments[arg_name][0])
            except (KeyError, IndexError):
                self.write(missing_msg)
                return

        project_name, build_id, job_name = [
            raw.decode(errors='replace') for raw in raw_values
        ]

        try:
            project = CFG.projects[project_name]
        except KeyError:
            self.write(b"unknown project requested\n")
            return

        build = get_build(project, build_id)
        if not build:
            self.write(("no such build: project %s [%s]\n" % (
                project_name, build_id)).encode())
            return

        self.job = build.jobs.get(job_name)
        if not self.job:
            self.write(("unknown job in project %s [%s]: %s\n" % (
                project_name, build_id, job_name)).encode())
            return

        # the message queue to be sent to the http client;
        # it must exist before watch() because updates land in it.
        self.queue = Queue()

        # request the updates from the watched jobs
        self.job.watch(self)

        # emit the updates and wait until no more are coming
        yield self.watch_job()

    @gen.coroutine
    def watch_job(self):
        """
        Process updates and send them to the client.

        Runs until the ``StopIteration`` marker is taken from the queue,
        flushing after every update, then finishes the request.
        """
        self.set_header("Content-Type", "text/plain")

        while True:
            update = yield self.queue.get()

            # StopIteration (the class itself) is the end-of-stream marker
            # enqueued by on_connection_close().
            if update is StopIteration:
                break

            if isinstance(update, StdOut):
                self.write(update.data.encode())

            elif isinstance(update, JobState):
                if update.is_errored():
                    self.write(
                        ("\x1b[31merror:\x1b[m %s\n" %
                         (update.text)).encode()
                    )
                elif update.is_succeeded():
                    self.write(
                        ("\x1b[32msuccess:\x1b[m %s\n" %
                         (update.text)).encode()
                    )
                elif update.is_finished():
                    self.write(
                        ("\x1b[31mfailed:\x1b[m %s\n" %
                         (update.text)).encode()
                    )

            yield self.flush()

        return self.finish()

    def on_update(self, update):
        """
        Put a message to the stream queue.

        Updates are dropped when no queue exists, i.e. when no job stream
        was ever set up for this request.
        """
        queue = getattr(self, "queue", None)
        if queue is None:
            return
        queue.put(update)

    def on_connection_close(self):
        """ Add a connection-end marker to the queue """
        self.on_update(StopIteration)

    def on_finish(self):
        # `job` is only assigned by get(); other HTTP methods never set it,
        # so look it up defensively instead of assuming the attribute exists.
        job = getattr(self, "job", None)
        if job is not None:
            job.unwatch(self)
class SubscribeListener(SubscribeCallback):
    """
    Subscribe callback that exposes incoming events as awaitable helpers.

    Status callbacks set the connect/disconnect events or enqueue the
    reported exception; message and presence callbacks enqueue their
    envelopes.  The ``wait_for_*`` coroutines consume those events and
    queues, racing every wait against the error queue.
    """

    def __init__(self):
        self.connected = False
        self.connected_event = Event()
        self.disconnected_event = Event()
        self.presence_queue = Queue()
        self.message_queue = Queue()
        self.error_queue = Queue()

    def status(self, pubnub, status):
        """Translate a status callback into an event or a queued error."""
        if utils.is_subscribed_event(status) and not self.connected_event.is_set():
            self.connected_event.set()
        elif utils.is_unsubscribed_event(status) and not self.disconnected_event.is_set():
            self.disconnected_event.set()
        elif status.is_error():
            self.error_queue.put_nowait(status.error_data.exception)

    def message(self, pubnub, message):
        """Enqueue an incoming message envelope."""
        self.message_queue.put(message)

    def presence(self, pubnub, presence):
        """Enqueue an incoming presence envelope."""
        self.presence_queue.put(presence)

    @tornado.gen.coroutine
    def _wait_for(self, coro):
        """
        Wait for ``coro`` or for the next queued error, whichever is first.

        :param coro: future to wait on
        :returns: the result of ``coro`` when it resolves first
        :raises: the queued exception when the error queue resolves first
        """
        error = self.error_queue.get()
        wi = tornado.gen.WaitIterator(coro, error)

        while not wi.done():
            result = yield wi.next()

            if wi.current_future == coro:
                # Use the fully-qualified name like the rest of the class;
                # a bare `gen` is not referenced anywhere else here.
                # NOTE(review): the pending `error` getter is left behind
                # when `coro` wins - confirm whether it can swallow a later
                # error.
                raise tornado.gen.Return(result)
            elif wi.current_future == error:
                raise result
            else:
                raise Exception("Unexpected future resolved: %s" % str(wi.current_future))

    @tornado.gen.coroutine
    def wait_for_connect(self):
        """Wait for the connected event; raise if it is already set."""
        if not self.connected_event.is_set():
            yield self._wait_for(self.connected_event.wait())
        else:
            raise Exception("instance is already connected")

    @tornado.gen.coroutine
    def wait_for_disconnect(self):
        """Wait for the disconnected event; raise if it is already set."""
        if not self.disconnected_event.is_set():
            yield self._wait_for(self.disconnected_event.wait())
        else:
            raise Exception("instance is already disconnected")

    @tornado.gen.coroutine
    def wait_for_message_on(self, *channel_names):
        """
        Return the next message envelope whose channel is in ``channel_names``.

        Envelopes for other channels are consumed and discarded.  An error
        surfaced by ``_wait_for`` propagates to the caller.
        """
        channel_names = list(channel_names)
        while True:
            env = yield self._wait_for(self.message_queue.get())
            # Only acknowledge the queue once an item was really dequeued;
            # doing it on the error path would unbalance the task counter.
            try:
                if env.channel in channel_names:
                    raise tornado.gen.Return(env)
            finally:
                self.message_queue.task_done()

    @tornado.gen.coroutine
    def wait_for_presence_on(self, *channel_names):
        """
        Return the next presence envelope whose channel is in ``channel_names``.

        Envelopes for other channels are consumed and discarded.  If the
        wait fails, the coroutine stops and resolves to ``None``.
        """
        channel_names = list(channel_names)
        while True:
            try:
                env = yield self._wait_for(self.presence_queue.get())
            except Exception:  # best effort: give up waiting on any error
                break
            # Acknowledge only items that were actually taken off the queue.
            try:
                if env.channel in channel_names:
                    raise tornado.gen.Return(env)
            finally:
                self.presence_queue.task_done()
class Application(object):
    """
    Gossip application: tracks known nodes, pings them, and relays gossip.

    Message handlers are instantiated once per message type and stored in
    ``self.handlers``; gossip is buffered in ``gossip_inbox`` and
    ``gossip_outbox``.
    """

    def __init__(self, routes, node, pipe):
        """
        Application instantiates and registers handlers for each message type, and routes messages to the
        pre-instantiated instances of each message handler

        :param routes: list of tuples in the form of (<message type str>, <MessageHandler class>)
        :param node: Node instance of the local node
        :param pipe: Instance of multiprocessing.Pipe for communicating with the parent process
        """
        # We don't really have to worry about synchronization
        # so long as we're careful about explicit context switching
        self.nodes = {node.node_id: node}

        self.local_node = node
        self.handlers = {}

        self.tcpclient = TCPClient()

        self.gossip_inbox = Queue()
        self.gossip_outbox = Queue()

        self.sequence_number = 0

        if routes:
            self.add_handlers(routes)

        self.pipe = pipe
        self.ioloop = IOLoop.current()

        # Set while at least one non-local node is known; get_random_node()
        # blocks on it.
        self.add_node_event = Event()

    def next_sequence_number(self):
        """Increment and return the ping sequence counter."""
        self.sequence_number += 1
        return self.sequence_number

    @coroutine
    def ping_random_node(self):
        """Ping one randomly chosen node; mark it suspect on timeout."""
        node = yield self.get_random_node()
        LOGGER.debug('{} pinging random node: {}'.format(self.local_node.node_id,
                                                         node.node_id))
        try:
            yield self.ping(node)
        except TimeoutError:
            self.mark_suspect(node)

    @coroutine
    def add_node(self, node):
        """Register ``node`` if its id is unknown and signal availability."""
        if node.node_id not in self.nodes:
            LOGGER.debug('Adding node {} to {}'.format(node, self.nodes))
            self.add_node_event.set()
            self.nodes[node.node_id] = node
            LOGGER.debug('Added node {} to {}'.format(node, self.nodes))

    @coroutine
    def remove_node(self, node):
        """Forget ``node``; clear the availability event if none remain."""
        if node.node_id in self.nodes:
            del self.nodes[node.node_id]

            # get_other_nodes is a coroutine and has to be *called*; yielding
            # the bound method itself is not a yieldable object.
            other_nodes = yield self.get_other_nodes()
            if not other_nodes:
                self.add_node_event.clear()

    def add_handlers(self, handlers):
        """
        Instantiate and register one handler per message type.

        :param handlers: iterable of (message type, handler class) pairs
        """
        for message_type, handler_cls in handlers:
            assert message_type in MESSAGE_TYPES, (
                'Message type {!r} not found in MESSAGE TYPES {}'.format(
                    message_type,
                    MESSAGE_TYPES.keys()
                )
            )
            self.handlers[message_type] = handler_cls(self)

    def route_stream_message(self, stream, message_type, message):
        """Build the message object and hand it to its registered handler."""
        LOGGER.debug('{!r} received {} message from {!r}'.format(self, message_type, stream))
        message_cls = MESSAGE_TYPES[message_type]
        message_obj = message_cls(**message)

        handler = self.handlers[message_type]
        LOGGER.debug('Routing {} to {}'.format(message_type, handler))
        handler(stream, message_obj)

    @coroutine
    def send_message(self, stream, message):
        """Write ``message`` to ``stream``; a closed stream is only logged."""
        LOGGER.debug('Sending message {!r} to {}'.format(message.MESSAGE_TYPE, stream))
        try:
            yield stream.write(message.to_msgpack)
        except StreamClosedError:
            LOGGER.warn('Unable to send {} to {} - stream closed'.format(message.MESSAGE_TYPE, stream))

    @coroutine
    def _get_next_message(self, stream):
        """
        Read and decode the next message from ``stream``.

        :returns: the decoded message with its ``type`` key removed, or
            ``None`` when the stream was closed
        """
        # get the next message from the stream
        unpacker = msgpack.Unpacker()
        try:
            wire_bytes = yield with_timeout(
                datetime.timedelta(seconds=PING_TIMEOUT),
                stream.read_bytes(4096, partial=True)
            )
        except StreamClosedError:
            LOGGER.warn('Unable to get next message from {} - stream closed'.format(stream))
        else:
            unpacker.feed(wire_bytes)
            LOGGER.debug('Deserializing object from stream {}'.format(stream))
            message = unpacker.next()
            message.pop('type')
            raise Return(message)

    @coroutine
    def ping(self, node):
        """
        Ping a node

        :param node: Instance of Node to ping
        :returns: Boolean, True if successful/False if fail
        """
        host = node.addr
        port = node.port

        LOGGER.debug('pinging {}:{}'.format(host, port))
        ping = Ping(seqno=self.next_sequence_number(),
                    node=node,
                    sender=self.local_node)

        # Connect to the node
        try:
            stream = yield self.tcpclient.connect(host, port)
        except StreamClosedError:
            LOGGER.error('Unable to connect from {} to {} (pinging host)'.format(self.local_node.node_id, node.node_id))
            raise Return(False)

        try:
            # Send the ping
            LOGGER.debug('Sending {!r} to {!r}'.format(ping.MESSAGE_TYPE, node))
            yield self.send_message(stream, ping)

            # Wait for an ACK message in response
            LOGGER.debug('Getting next message from {}:{}'.format(host, port))
            message = yield self._get_next_message(stream)
            if message is None:
                raise Return(False)

            ack = Ack(**message)
            LOGGER.debug('Received {!r} from {!r} (response to {!r})'.format(ack.MESSAGE_TYPE,
                                                                             node.node_id,
                                                                             ping.MESSAGE_TYPE))

            # Check that the ACK sequence number matches the PING sequence number
            if ack.seqno == ping.seqno:
                LOGGER.debug('Sequence number matches. Node {} looks good to {}!'.format(node.node_id,
                                                                                         self.local_node.node_id))
                # Process the gossip messages tacked onto the ACK message's payload
                for message in ack.payload:
                    try:
                        self.gossip_inbox.put_nowait(message)
                    except QueueFull:
                        LOGGER.error('Unable to add {} message from {} to gossip inbox'.format(message.MESSAGE_TYPE,
                                                                                               node.node_id))
                # mark the node as ALIVE in self.nodes
                self.mark_alive(node)

                # Send gossip that this node is alive
                self.queue_gossip_send(
                    Alive(node=node, sender=self.local_node)
                )

                raise Return(True)
            else:
                raise Return(False)
        finally:
            stream.close()

    @coroutine
    def ack(self, stream, seqno):
        """
        Answer a ping with an ACK carrying buffered gossip.

        Up to ``ACK_PAYLOAD_SIZE`` messages are taken from the gossip outbox
        and attached as payload.
        """
        payload = []
        for _ in xrange(ACK_PAYLOAD_SIZE):
            try:
                gossip = self.gossip_outbox.get_nowait()
                payload.append(gossip)
            except QueueEmpty:
                break

        ack = Ack(seqno=seqno, payload=payload)
        LOGGER.debug('Trying to send ack: {}'.format(ack))
        try:
            yield stream.write(ack.to_msgpack)
        except StreamClosedError:
            LOGGER.error('Unable to connect from {} to stream (acking PING)'.format(self.local_node.node_id))
        else:
            # Only claim success when the write did not fail.
            LOGGER.debug('Sent ack to {}'.format(stream))

    @coroutine
    def _change_node_state(self, node, state):
        """
        Because Tornado has explicit context switching, we don't need to worry much about synchronization here
        """
        LOGGER.debug('{} knows about {}: {}'.format(self.local_node.node_id, node.node_id, state))
        self.add_node(node)
        self.nodes[node.node_id].state = state

    @coroutine
    def mark_alive(self, node):
        """Mark ``node`` ALIVE unless it is the local node."""
        if node.node_id != self.local_node.node_id:
            LOGGER.debug('Marking {} ALIVE'.format(node.node_id))
            self._change_node_state(node, State.ALIVE)

    @coroutine
    def mark_dead(self, node):
        """Mark ``node`` DEAD."""
        self._change_node_state(node, State.DEAD)

    @coroutine
    def mark_suspect(self, node):
        """Mark ``node`` SUSPECT."""
        self._change_node_state(node, State.SUSPECT)

    @coroutine
    def ingest_gossip_inbox(self):
        """Consume the gossip inbox forever, applying and re-queuing gossip."""
        while True:
            LOGGER.debug('checking inbox')
            message = yield self.gossip_inbox.get()
            LOGGER.debug('Received message {} from gossip inbox'.format(message.MESSAGE_TYPE))
            if message.MESSAGE_TYPE == Alive.MESSAGE_TYPE:
                self.mark_alive(message.sender)
                self.mark_alive(message.node)
                self.queue_gossip_send(message)

            elif message.MESSAGE_TYPE == Suspect.MESSAGE_TYPE:
                self.mark_alive(message.sender)
                self.mark_suspect(message.node)
                self.queue_gossip_send(message)

            elif message.MESSAGE_TYPE == Dead.MESSAGE_TYPE:
                self.mark_alive(message.sender)
                self.mark_dead(message.node)
                self.queue_gossip_send(message)

    @coroutine
    def queue_gossip_send(self, message):
        """
        If the message is gossipable, add it to the outbox
        """
        try:
            next_incarnation = message.next_incarnation
            next_incarnation.sender = self.local_node
        except message.MaxIncarnationsReached:
            LOGGER.debug('Max incarnations reached for {}! No gossip 4 u'.format(message.MESSAGE_TYPE))
        else:
            LOGGER.debug('Enqueuing {} gossips for {}'.format(GOSSIP_PEERS, message))
            for _ in xrange(GOSSIP_PEERS):
                yield self.gossip_outbox.put(next_incarnation)

    @coroutine
    def send_buffered_gossip(self):
        """Forever: take one outbox message and send it to a random node."""
        while True:
            random_node = yield self.get_random_node()
            message = yield self.gossip_outbox.get()
            LOGGER.debug('{} connecting to {} for gossip'.format(self.local_node, random_node))
            try:
                stream = yield self.tcpclient.connect(random_node.addr, random_node.port)
            except StreamClosedError:
                LOGGER.error('Unable to connect from {} to {} (sending gossip)'.format(self.local_node.node_id,
                                                                                       random_node.node_id))
                LOGGER.warning('Putting the gossip back on our queue')
                try:
                    self.gossip_outbox.put_nowait(message)
                except QueueFull:
                    LOGGER.error('Unable to put gossip back onto the queue. Giving up!')
            else:
                try:
                    LOGGER.debug('{} gossipping with {}'.format(self.local_node.node_id, random_node.node_id))
                    yield self.send_message(stream, message)
                finally:
                    stream.close()

    @coroutine
    def get_other_nodes(self, exclude=None):
        """
        Return the ids of all known nodes except the excluded ones.

        :param exclude: iterable of nodes to leave out; defaults to the
            local node only
        :returns: list of node ids (keys of ``self.nodes``)
        """
        if exclude is None:
            exclude = (self.local_node,)

        exclude_node_ids = [n.node_id for n in exclude]

        # Iterating the dict yields node ids, not Node objects.
        raise Return([n for n in self.nodes if n not in exclude_node_ids])

    @coroutine
    def get_random_node(self, exclude=None):
        """Wait until another node is known, then return a random one."""
        LOGGER.debug('Waiting for more nodes')
        yield self.add_node_event.wait()
        LOGGER.debug('Getting non-self random node')

        other_nodes = yield self.get_other_nodes(exclude=exclude)
        LOGGER.debug('{} got something! choices: {}'.format(self.local_node.node_id, other_nodes))
        assert other_nodes

        node_id = random.choice(other_nodes)
        raise Return(self.nodes[node_id])
class CounterCache(threading.Thread):
    """
    Double-buffered counter cache flushed periodically to the database.

    Messages taken from the queue are accumulated into the *active* cache
    (selected by ``m_CacheFlag``).  The thread loop periodically swaps the
    flag, writes the now-inactive cache to the database and resets it.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.m_queue = Queue()
        self.m_CacheFlag = 1  # 1 -> m_Cache_A is active, 2 -> m_Cache_B
        self.m_CounterCache = None
        self.m_Cache_A = defaultdict()
        self.m_Cache_B = defaultdict()
        self.database = Database(redis_conf = REDISEVER, password = STATUS_REDIS_PASS)
        self.cacheInit(self.m_Cache_A)
        self.cacheInit(self.m_Cache_B)

    def switchCache(self):
        """Return the cache that is currently active for writing."""
        if self.m_CacheFlag == 1:
            return self.m_Cache_A
        elif self.m_CacheFlag == 2:
            return self.m_Cache_B

    def chageCacheFlag(self):
        """Swap the active cache (1 <-> 2)."""
        if self.m_CacheFlag == 1:
            self.m_CacheFlag = 2
        elif self.m_CacheFlag == 2:
            self.m_CacheFlag = 1

    def clearCache(self):
        """Empty and re-initialise the cache that is NOT currently active."""
        if self.m_CacheFlag == 1:
            self.m_Cache_B.clear()
            self.cacheInit(self.m_Cache_B)
        elif self.m_CacheFlag == 2:
            self.m_Cache_A.clear()
            self.cacheInit(self.m_Cache_A)

    def cacheInit(self, cache):
        """Populate ``cache`` with zeroed counter tables."""
        cache['pid_info'] = defaultdict(int)
        cache['eid_info'] = {
            'pv': defaultdict(int),
            'exchange_price': defaultdict(int),
        }
        cache['adx_info'] = {
            'pv': defaultdict(int),
            'exchange_price': defaultdict(int),
        }
        cache['aid_info'] = {
            'exchange_price': defaultdict(int),
        }

    @tornado.gen.coroutine
    def queueMsgPut(self, msg):
        """Enqueue one message for counting."""
        yield self.m_queue.put(msg)

    @tornado.gen.coroutine
    def queueMsgGet(self):
        """Consume the queue forever, counting every message."""
        while True:
            msg = yield self.m_queue.get()
            logger.info('QueueGet:%r' % msg)
            self.cacheInfoPut(msg)

    def cacheInfoPut(self, info):
        """
        Add one message to the active cache.

        Only messages with ``type == 1``, a truthy ``eid`` and ``aid`` and a
        ``price`` that is not ``None`` are counted; everything else is
        ignored.  Always returns ``None``.
        """
        cache = self.switchCache()

        # Missing keys read as None, which fails the check below.
        msg_type = info.get('type')
        eid = info.get('eid')
        price = info.get('price')
        aid = info.get('aid')

        if msg_type == 1 and eid and (price is not None) and aid:
            # pv
            cache['aid_info']['exchange_price'][aid] += price
            cache['eid_info']['pv'][eid] += 1
            cache['eid_info']['exchange_price'][eid] += price
        return None

    def cacheDura(self):
        """Write the inactive cache's counters to the database."""
        cache = None
        if self.m_CacheFlag == 1:
            cache = self.m_Cache_B
        if self.m_CacheFlag == 2:
            cache = self.m_Cache_A

        if 'eid_info' in cache:
            it_p = cache['eid_info']['exchange_price']
            it_m = cache['eid_info']['pv']
            for eid in it_p:
                self.database.incEidHourSp(eid, it_p[eid])
                logger.debug("increase Order:%r Money:%r OK!" % (eid, it_p[eid]))
            for eid in it_m:
                self.database.incEidShow(eid, it_m[eid])
                logger.debug("increase Order:%r PV:%r OK!" % (eid,it_m[eid]))

        if 'aid_info' in cache:
            it_a = cache['aid_info']['exchange_price']
            for aid in it_a:
                self.database.incAidHourSp(aid, it_a[aid])
                self.database.decAdvBidSpend(aid, "-%.3f" % (float(it_a[aid])/1000))
                logger.debug("increase Advertiser:%s Money:%s!" % (aid, str(float(it_a[aid])/1000)) )

    def run(self):
        """Thread body: every CACHE_DUR_FREQ seconds swap, flush and reset."""
        while True:
            try:
                time.sleep( CACHE_DUR_FREQ )
                self.chageCacheFlag()
                self.cacheDura()
                self.clearCache()
            except Exception as e:
                # Keep the flusher alive; a failed cycle is only logged.
                logger.error(e)
def get_file_list(account, **kwargs):
    """
    Collect the files of ``account`` whose type is in VALID_FILETYPES.

    One search URL per file type is queued; result pages are downloaded
    concurrently (bounded by FETCH_CONCURRENCY) and every ``@odata.nextLink``
    found is queued as well.  This is a generator that yields the queue's
    join future, so it has to be driven by a coroutine runner.

    :param account: object providing ``get_request(url)`` and ``_id``
    :returns: list of ``{"title", "value"}`` dicts sorted by title
    """
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()
    data = []
    ids = set()  # file ids already collected, to drop duplicates

    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                return
            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
            response_data = json.loads(response.body.decode('utf-8'))

            # follow pagination
            url = response_data.get('@odata.nextLink')
            if url is not None:
                queue.put(url)

            for entry in response_data.get('value', []):
                # the last four characters, minus dots, act as the extension
                if entry['name'][-4:].strip('.').lower() in VALID_FILETYPES:
                    if entry['id'] not in ids:
                        ids.add(entry['id'])
                        data.append({
                            "title": entry['parentReference']['path'].split(':')[1].lstrip('/') + '/' + entry['name'],
                            "value": entry['id']
                        })
            app_log.info("Page {} completed".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        # Start a fetch whenever a concurrency slot is free.
        while True:
            yield sem.acquire()
            fetch_url()

    app_log.info("Gathering filelist for account {}".format(account._id))
    for file_type in VALID_FILETYPES:
        url = "https://api.onedrive.com/v1.0/drive/root/view.search?top=1000&select=parentReference,name,id,size&q={}" \
            .format(file_type)
        queue.put(url)

    # start our concurrency worker
    worker()

    # wait until we're done
    yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    app_log.info("Finished list retrieval. Found {} items.".format(
        len(data)))
    return sorted(data, key=lambda f: f['title'])
class TornadoTransmission():
    """
    Event transmission built on Tornado's IOLoop.

    Events handed to ``send`` are buffered on ``self.pending``; the
    ``_sender`` coroutine batches them and posts each batch to the API.
    One response dict per event is placed on ``self.responses``.
    """

    def __init__(self, max_concurrent_batches=10, block_on_send=False,
                 block_on_response=False, max_batch_size=100,
                 send_frequency=timedelta(seconds=0.25),
                 user_agent_addition=''):
        if not has_tornado:
            raise ImportError(
                'TornadoTransmission requires tornado, but it was not found.'
            )

        self.block_on_send = block_on_send
        self.block_on_response = block_on_response
        self.max_batch_size = max_batch_size
        self.send_frequency = send_frequency

        user_agent = "libhoney-py/" + VERSION
        if user_agent_addition:
            user_agent += " " + user_agent_addition

        self.http_client = AsyncHTTPClient(
            force_instance=True,
            defaults=dict(user_agent=user_agent))

        # libhoney adds events to the pending queue for us to send
        self.pending = Queue(maxsize=1000)
        # we hand back responses from the API on the responses queue
        self.responses = Queue(maxsize=2000)

        # maps each in-flight HTTPRequest to its start time and events
        self.batch_data = {}
        self.sd = statsd.StatsClient(prefix="libhoney")
        self.batch_sem = Semaphore(max_concurrent_batches)

    def start(self):
        """Schedule the sender loop on the current IOLoop."""
        ioloop.IOLoop.current().spawn_callback(self._sender)

    def send(self, ev):
        '''send accepts an event and queues it to be sent'''
        self.sd.gauge("queue_length", self.pending.qsize())
        try:
            if self.block_on_send:
                # NOTE(review): put() returns a Future that is not awaited
                # here, so this does not actually block - confirm intent.
                self.pending.put(ev)
            else:
                self.pending.put_nowait(ev)
            self.sd.incr("messages_queued")
        except QueueFull:
            response = {
                "status_code": 0,
                "duration": 0,
                "metadata": ev.metadata,
                "body": "",
                "error": "event dropped; queue overflow",
            }
            if self.block_on_response:
                self.responses.put(response)
            else:
                try:
                    self.responses.put_nowait(response)
                except QueueFull:
                    # if the response queue is full when trying to add an event
                    # queue is full response, just skip it.
                    pass
            self.sd.incr("queue_overflow")

    # We're using the older decorator/yield model for compatibility with
    # Python versions before 3.5.
    # See: http://www.tornadoweb.org/en/stable/guide/coroutines.html#python-3-5-async-and-await
    @gen.coroutine
    def _sender(self):
        '''_sender is the control loop that pulls events off the `self.pending`
        queue and submits batches for actual sending.

        A batch is flushed when it reaches `max_batch_size`, when
        `send_frequency` has elapsed since the last flush, or when waiting
        for the next event times out.  A `None` event flushes what is left
        and ends the loop.
        '''
        events = []
        last_flush = time.time()
        while True:
            try:
                ev = yield self.pending.get(timeout=self.send_frequency)
                if ev is None:
                    # signals shutdown
                    yield self._flush(events)
                    return
                events.append(ev)
                if (len(events) >= self.max_batch_size or
                        time.time() - last_flush > self.send_frequency.total_seconds()):
                    yield self._flush(events)
                    events = []
                    # Restart the interval, otherwise every following event
                    # would be flushed on its own once the interval passed.
                    last_flush = time.time()
            except TimeoutError:
                yield self._flush(events)
                events = []
                last_flush = time.time()

    @gen.coroutine
    def _flush(self, events):
        """Send ``events`` as one batch per destination; no-op when empty."""
        if not events:
            return
        for dest, group in group_events_by_destination(events).items():
            yield self._send_batch(dest, group)

    @gen.coroutine
    def _send_batch(self, destination, events):
        ''' Makes a single batch API request with the given list of events. The
        `destination` argument contains the write key, API host and dataset
        name used to build the request.'''
        start = time.time()
        status_code = 0

        try:
            # enforce max_concurrent_batches
            yield self.batch_sem.acquire()
            url = urljoin(urljoin(destination.api_host, "/1/batch/"),
                          destination.dataset)
            payload = []
            for ev in events:
                event_time = ev.created_at.isoformat()
                # naive timestamps get a "Z" suffix
                if ev.created_at.tzinfo is None:
                    event_time += "Z"
                payload.append({
                    "time": event_time,
                    "samplerate": ev.sample_rate,
                    "data": ev.fields()
                })
            req = HTTPRequest(
                url,
                method='POST',
                headers={
                    "X-Honeycomb-Team": destination.writekey,
                    "Content-Type": "application/json",
                },
                body=json.dumps(payload, default=json_default_handler),
            )
            self.http_client.fetch(req, self._response_callback)
            # store the events that were sent so we can process responses later
            # it is important that we delete these eventually, or we'll run into memory issues
            self.batch_data[req] = {"start": start, "events": events}
        except Exception as e:
            # Catch all exceptions and hand them to the responses queue.
            self._enqueue_errors(status_code, e, start, events)
        finally:
            # NOTE(review): the slot is released once the request has been
            # dispatched, not when its response arrives - confirm intent.
            self.batch_sem.release()

    def _enqueue_errors(self, status_code, error, start, events):
        """Enqueue one error response per event and count each failure."""
        for ev in events:
            self.sd.incr("send_errors")
            self._enqueue_response(status_code, "", error, start, ev.metadata)

    def _response_callback(self, resp):
        """Turn a batch HTTP response into per-event responses."""
        # resp.request should be the same HTTPRequest object built by _send_batch
        # and mapped to values in batch_data
        events = self.batch_data[resp.request]["events"]
        start = self.batch_data[resp.request]["start"]
        try:
            status_code = resp.code
            resp.rethrow()

            statuses = [d["status"] for d in json.loads(resp.body)]
            for ev, status in zip(events, statuses):
                self._enqueue_response(status, "", None, start, ev.metadata)
                self.sd.incr("messages_sent")
        except Exception as e:
            self._enqueue_errors(status_code, e, start, events)
            self.sd.incr("send_errors")
        finally:
            # clean up the data for this batch
            del self.batch_data[resp.request]

    def _enqueue_response(self, status_code, body, error, start, metadata):
        """Build a response dict and put it on the responses queue."""
        resp = {
            "status_code": status_code,
            "body": body,
            "error": error,
            "duration": (time.time() - start) * 1000,
            "metadata": metadata
        }
        if self.block_on_response:
            self.responses.put(resp)
        else:
            try:
                self.responses.put_nowait(resp)
            except QueueFull:
                pass

    def close(self):
        '''call close to send all in-flight requests and shut down the
            senders nicely. Times out after max 20 seconds per sending thread
            plus 10 seconds for the response queue'''
        # A bare number is read by the queue as an absolute clock value, so
        # the 10 second allowance has to be given as a timedelta.
        wait = timedelta(seconds=10)
        try:
            self.pending.put(None, wait)
        except QueueFull:
            pass
        # signal to the responses queue that nothing more is coming.
        try:
            self.responses.put(None, wait)
        except QueueFull:
            pass

    def get_response_queue(self):
        ''' return the responses queue on to which will be sent the response
        objects from each event send'''
        return self.responses
class BlogBackup(object):
    """
    Back up a user's blog essays as Markdown files.

    Construction logs in (to obtain cookies) and resolves the save
    directory; ``run`` queues every essay link found on the blog list pages
    and then downloads each essay into ``save_path``.
    """

    _default_dir_name = "seg_blog_backup"

    def _generate_save_dir(self):
        """Use (and create if needed) the default directory next to this file."""
        cur_dir = os.path.dirname(__file__)
        self.save_path = os.path.join(cur_dir, self._default_dir_name)
        if not os.path.isdir(self.save_path):
            os.mkdir(self.save_path)

    def _parse_save_path(self):
        """
        Validate the configured save path or fall back to the default one.

        :raises BlogSavePathError: if a path was given but is not an
            existing directory
        """
        if self.save_path:
            if os.path.exists(self.save_path) and os.path.isdir(self.save_path):
                return
            else:
                raise BlogSavePathError("'%s' not exists or is not dir!" % self.save_path)
        else:
            self._generate_save_dir()

    @staticmethod
    def parse_token_from_html(content):
        """
        Extract the 32-character login token from the page source.

        The quoted fragments of the ``SF.token`` assignment are joined and
        every ``[m,n]`` pair found removes the slice ``m:n`` from the result.

        :param content: HTML text of the page
        :returns: the token string
        :raises PageHtmlChanged: if no 32-character token can be derived
        """
        overall_pat = re.compile(r"SF.token =.*?,\s+_\w+ = [\d,\[\]]+;", re.DOTALL)
        overall_res = overall_pat.search(content)
        if overall_res:
            overall_content = overall_res.group()
            # remove /* */ type annotation
            filter_res = re.sub(r"(/\*[/a-zA-Z\d' ]+\*/)", "", overall_content)
            str_list = re.findall(r"(?<!//)'([a-zA-Z\d]+)'", filter_res, re.DOTALL)
            filter_list = re.findall(r"\[(\d+),(\d+)\]", overall_content)
            ret = "".join(str_list)

            if filter_list:
                for m, n in filter_list:
                    ret = ret[: int(m)] + ret[int(n) :]
            if len(ret) == 32:
                return ret

        raise PageHtmlChanged("website login token has changed")

    def _get_user_cookies(self):
        """Log in with the configured credentials and return the cookies."""
        s = requests.Session()
        s.headers.update(headers)
        rep = s.get(target_url)
        post_url = "%s%s?_=%s" % (target_url, login_api_path, self.parse_token_from_html(rep.text))
        data = {"mail": self.username, "password": self.passwd}
        s.post(post_url, data=data)
        return s.cookies

    def __init__(self, **conf):
        """
        :param conf: requires ``username`` and ``passwd``; ``save_path`` is
            optional
        """
        self.username = conf["username"]
        self.passwd = conf["passwd"]
        self.save_path = conf.get("save_path")
        self._q = Queue()
        self._cookies = self._get_user_cookies()
        self._parse_save_path()

    @gen.coroutine
    def run(self):
        """Queue all essay links, start the workers and wait for the queue."""
        start_url = target_url + blog_path
        yield self._fetch_blog_list_page(start_url)
        for _ in xrange(cpu_count()):
            self._fetch_essay_content()

        yield self._q.join()

    @gen.coroutine
    def _fetch_blog_list_page(self, page_link):
        """Queue every essay link of ``page_link`` and of all following pages."""
        ret = requests.get(page_link, cookies=self._cookies)
        d = pq(ret.text)
        link_elements = d(".stream-list__item > .summary > h2 > a")
        for link in link_elements:
            yield self._q.put(d(link).attr("href"))

        next_ele = d(".pagination li.next a")
        if next_ele:
            next_page_url = target_url + next_ele.attr("href")
            # Wait for the following pages too, so the caller only resumes
            # once every link has been queued.
            yield self._fetch_blog_list_page(next_page_url)

    @gen.coroutine
    def _fetch_essay_content(self):
        """Worker: download queued essays until the queue stays empty."""
        # Local import: the queue reads a bare number as an absolute clock
        # value, so the one second wait must be expressed as a timedelta.
        from datetime import timedelta

        while True:
            try:
                essay_path = yield self._q.get(timeout=timedelta(seconds=1))
            except gen.TimeoutError:
                # Nothing was dequeued, so there is nothing to mark done.
                raise gen.Return()

            try:
                essay_url = target_url + essay_path + edit_suffix
                ret = requests.get(essay_url, cookies=self._cookies)
                d = pq(ret.text)
                title = d("#myTitle").val()
                content = d("#myEditor").text()
                real_file_name = os.path.join(self.save_path, title + ".md")
                logger.info("is backup essay: %s" % title)
                with open(real_file_name, "w") as f:
                    f.write(content.encode("utf8"))
            finally:
                # Exactly one task_done() per item taken from the queue.
                self._q.task_done()
class Model:
    """
    A loaded network that classifies queued requests in the background.

    Construction loads the config, initialises the devices and the engine,
    and spawns ``classify_in_background`` on the current IOLoop.
    """

    def __init__(self, config_file):
        """
        :param config_file: path of the config file to load
        """
        self.lock = locks.Lock()
        self.classification_queue = Queue()

        print('loading config %s' % config_file, file=log.v5)
        # Load and setup config
        try:
            self.config = Config.Config()
            self.config.load_file(config_file)
            self.pause_after_first_seq = self.config.float('pause_after_first_seq', 0.2)
            self.batch_size = self.config.int('batch_size', 5000)
            self.max_seqs = self.config.int('max_seqs', -1)
        except Exception:
            print('Error: loading config %s failed' % config_file, file=log.v1)
            raise

        try:
            self.devices = self._init_devices()
        except Exception:
            print('Error: Loading devices for config %s failed' % config_file, file=log.v1)
            raise

        print('Starting engine for config %s' % config_file, file=log.v5)
        self.engine = Engine.Engine(self.devices)
        try:
            self.engine.init_network_from_config(config=self.config)
        except Exception:
            print('Error: Loading network for config %s failed' % config_file, file=log.v1)
            raise

        IOLoop.current().spawn_callback(self.classify_in_background)

        self.last_used = datetime.datetime.now()

    def _init_devices(self):
        """
        Initiates the required devices for a config. Same as the funtion initDevices in
        rnn.py, but working on ``self.config``.

        Blocks until every device reports that it is initialized.

        :return: A list with the devices used.
        """
        oldDeviceConfig = ",".join(self.config.list('device', ['default']))
        if "device" in TheanoFlags:
            # This is important because Theano likely already has initialized that device.
            # The config lives on the instance; there is no local `config`.
            self.config.set("device", TheanoFlags["device"])
            print("Devices: Use %s via THEANO_FLAGS instead of %s." % (TheanoFlags["device"], oldDeviceConfig),
                  file=log.v4)
        devArgs = get_devices_init_args(self.config)
        assert len(devArgs) > 0
        devices = [Device(**kwargs) for kwargs in devArgs]
        for device in devices:
            while not device.initialized:
                time.sleep(0.25)
        if devices[0].blocking:
            print("Devices: Used in blocking / single proc mode.", file=log.v4)
        else:
            print("Devices: Used in multiprocessing mode.", file=log.v4)
        return devices

    @tornado.gen.coroutine
    def classify_in_background(self):
        """
        Forever: collect all waiting requests, classify them as one dataset
        and resolve each request's future with its result.
        """
        while True:
            requests = []
            # fetch first request
            r = yield self.classification_queue.get()
            requests.append(r)
            # grab all other waiting requests
            try:
                while True:
                    requests.append(self.classification_queue.get_nowait())
            except QueueEmpty:
                pass

            output_dim = {}
            # Do dataset creation and classification.
            dataset = StaticDataset(data=[r.data for r in requests], output_dim=output_dim)
            dataset.init_seq_order()
            batches = dataset.generate_batches(recurrent_net=self.engine.network.recurrent,
                                               batch_size=self.batch_size,
                                               max_seqs=self.max_seqs)

            with (yield self.lock.acquire()):
                ctt = ForwardTaskThread(self.engine.network, self.devices, dataset, batches)
                yield ctt.join()

                try:
                    # hand every sequence result to the request that asked for it
                    for i in range(dataset.num_seqs):
                        requests[i].future.set_result(ctt.result[i])
                        self.classification_queue.task_done()
                except Exception as e:
                    print('exception', e)
                    raise

    @tornado.gen.coroutine
    def classify(self, data):
        """
        Queue ``data`` for classification and wait for the result.

        :param data: input handed to ``ClassificationRequest``
        :return: the result set on the request's future
        """
        self.last_used = datetime.datetime.now()
        request = ClassificationRequest(data)

        yield self.classification_queue.put(request)
        yield request.future

        return request.future.result()
class TaskLogger(object):
    """
    Posts log lines and a final result for one task to the task service.

    Depending on ``engine`` the public ``log`` and ``result`` attributes are
    bound to a blocking ``requests`` implementation or to Tornado
    coroutines.  Every message carries an increasing sequence number.
    """

    def __init__(self, task_id, engine=EngineType.REQUESTS, io_loop=None,
                 task_url=TASK_URL, wrap=False, tenant=None):
        """
        :param task_id: id added to every URL as ``task_id``
        :param engine: ``EngineType.REQUESTS`` or ``EngineType.TORNADO``
        :param io_loop: loop for the Tornado client; current loop if unset
        :param task_url: base URL the ``log``/``result`` paths are joined to
        :param wrap: wrap payloads in JSON message objects when true
        :param tenant: added as ``tenant`` query parameter when wrapping
        :raises TaskLoggerError: for any other ``engine`` value
        """
        self.task_id = task_id
        self.task_url = task_url
        self._seq = 0
        self._partial_log_url = self._get_partial_url('log')
        self._partial_result_url = self._get_partial_url('result')

        self.wrap = wrap
        if wrap and tenant:
            tenant_param = {'tenant': tenant}
            self._partial_log_url = update_query_params(
                self._partial_log_url, tenant_param)
            self._partial_result_url = update_query_params(
                self._partial_result_url, tenant_param)

        # Bind the public entry points to the selected transport.
        if engine == EngineType.REQUESTS:
            self.log = self._log_by_requests
            self.result = self._result_by_requests
        elif engine == EngineType.TORNADO:
            loop = io_loop or IOLoop.current()
            self._http_client = AsyncHTTPClient(io_loop=loop)
            self._queue = Queue()
            self.log = self._log_by_tornado
            self.result = self._result_by_tornado
        else:
            raise TaskLoggerError('',
                                  reason='engine only supports {}'.format(
                                      EngineType.types_str()))

    def _get_partial_url(self, partial_name):
        """Join ``partial_name`` onto the task URL and add the task id."""
        joined = urljoin(self.task_url, partial_name)
        return update_query_params(joined, {'task_id': self.task_id})

    def _get_log_url(self, seq):
        """Return the log URL for sequence number ``seq``."""
        return update_query_params(self._partial_log_url, {'seq': seq})

    def _get_result_url(self, seq, exit_code=0):
        """Return the result URL for ``seq`` and ``exit_code``."""
        params = {
            'seq': seq,
            'exit_code': exit_code
        }
        return update_query_params(self._partial_result_url, params)

    def _log_by_requests(self, log):
        """Send one log line synchronously."""
        self._seq += 1
        current = self._seq
        target = self._get_log_url(current)
        payload = self._create_log(log, current)
        self._send_by_requests(target, payload)

    def _result_by_requests(self, result, exit_code=0):
        """Send the task result synchronously."""
        self._seq += 1
        current = self._seq
        target = self._get_result_url(current, exit_code)
        payload = self._create_result(result, current, exit_code=exit_code)
        self._send_by_requests(target, payload)

    @staticmethod
    def _send_by_requests(url, data):
        """POST ``data``; raise TaskLoggerError unless the status is 200."""
        res = requests.post(url, data=data, verify=False)
        if res.status_code == 200:
            return
        raise TaskLoggerError(data, reason=res.reason)

    @gen.coroutine
    def _log_by_tornado(self, log):
        """
        Send one log line asynchronously.

        A token is put on the internal queue while the send is in flight so
        that ``_result_by_tornado`` can wait for all pending logs.
        """
        yield self._queue.put(1)
        self._seq += 1
        current = self._seq
        target = self._get_log_url(current)
        payload = self._create_log(log, current)
        try:
            yield self._send_by_tornado(target, payload)
        finally:
            # take the token back and mark it finished
            yield self._queue.get()
            self._queue.task_done()

    @gen.coroutine
    def _result_by_tornado(self, result, exit_code=0):
        """Wait for in-flight logs, then send the result asynchronously."""
        yield self._queue.join()
        self._seq += 1
        current = self._seq
        target = self._get_result_url(current, exit_code)
        payload = self._create_result(result, current, exit_code=exit_code)
        yield self._send_by_tornado(target, payload)

    @gen.coroutine
    def _send_by_tornado(self, url, data):
        """
        POST ``data`` as JSON content.

        :raises TaskLoggerError: if the request fails or the response code
            is not 200
        """
        try:
            response = yield self._http_client.fetch(
                url,
                method='POST',
                headers={'Content-Type': 'application/json'},
                validate_cert=False,
                body=data)
        except Exception as exc:
            detail = exc
            # include the response body when the error carries a response
            if hasattr(exc, 'response') and exc.response:
                detail = 'url:{}, exc:{}, body:{}'.format(url, exc,
                                                          exc.response.body)
            raise TaskLoggerError(data, str(detail))
        else:
            if response.code != 200:
                raise TaskLoggerError(data, reason=response.body)

    def _create_log(self, log, seq):
        """Return the request body for a log line (newline appended)."""
        assert isinstance(log, basestring)
        text = log + '\n'
        if not self.wrap:
            return text
        log_msg = TaskLogMessage(task_id=self.task_id, log=text, seq=seq)
        return json_encode({'messages': log_msg})

    def _create_result(self, result, seq, exit_code):
        """Return the request body for a result (newline appended)."""
        assert isinstance(result, basestring)
        text = result + '\n'
        if not self.wrap:
            return text
        result_msg = TaskResultMessage(task_id=self.task_id, result=text,
                                       seq=seq, exit_code=exit_code)
        return json_encode({'messages': result_msg})
class TornadoTransmission():
    """Queue events and send them in batches using Tornado's async client.

    Events handed to :meth:`send` are placed on ``self.pending``; the
    :meth:`_sender` coroutine (started by :meth:`start`) drains that queue,
    groups events by destination and posts each group as one batch request.
    One response dict per event is placed on ``self.responses``.
    """

    def __init__(self, max_concurrent_batches=10, block_on_send=False,
                 block_on_response=False, max_batch_size=100,
                 send_frequency=0.25, user_agent_addition=''):
        """Create the HTTP client, queues, stats client and semaphore.

        :param max_concurrent_batches: initial value of the batch semaphore
        :param block_on_send: use ``put`` instead of ``put_nowait`` on the
            pending queue
        :param block_on_response: use ``put`` instead of ``put_nowait`` on the
            responses queue
        :param max_batch_size: number of buffered events that triggers a flush
        :param send_frequency: flush interval, in seconds (a
            ``datetime.timedelta`` is accepted as well)
        :param user_agent_addition: text appended to the user agent string
        :raises ImportError: if tornado is not available
        """
        if not has_tornado:
            raise ImportError('TornadoTransmission requires tornado, but it was not found.')

        self.block_on_send = block_on_send
        self.block_on_response = block_on_response

        self.max_batch_size = max_batch_size
        self.send_frequency = send_frequency

        user_agent = "libhoney-py/" + VERSION
        if user_agent_addition:
            user_agent += " " + user_agent_addition

        self.http_client = AsyncHTTPClient(
            force_instance=True,
            defaults=dict(user_agent=user_agent))

        # libhoney adds events to the pending queue for us to send
        self.pending = Queue(maxsize=1000)
        # we hand back responses from the API on the responses queue
        self.responses = Queue(maxsize=2000)

        # maps each in-flight HTTPRequest to its start time and events
        self.batch_data = {}
        self.sd = statsd.StatsClient(prefix="libhoney")
        self.batch_sem = Semaphore(max_concurrent_batches)

    def start(self):
        """Spawn the sender loop on the current IOLoop."""
        ioloop.IOLoop.current().spawn_callback(self._sender)

    def send(self, ev):
        '''send accepts an event and queues it to be sent'''
        self.sd.gauge("queue_length", self.pending.qsize())
        try:
            if self.block_on_send:
                # NOTE(review): the Future returned by put() is not awaited
                # here, so this call does not actually block the caller.
                self.pending.put(ev)
            else:
                self.pending.put_nowait(ev)
            self.sd.incr("messages_queued")
        except QueueFull:
            response = {
                "status_code": 0,
                "duration": 0,
                "metadata": ev.metadata,
                "body": "",
                "error": "event dropped; queue overflow",
            }
            if self.block_on_response:
                self.responses.put(response)
            else:
                try:
                    self.responses.put_nowait(response)
                except QueueFull:
                    # if the response queue is full when trying to add an event
                    # queue is full response, just skip it.
                    pass
            self.sd.incr("queue_overflow")

    def _send_interval(self):
        """Return ``send_frequency`` as a ``datetime.timedelta``."""
        from datetime import timedelta

        if isinstance(self.send_frequency, timedelta):
            return self.send_frequency
        return timedelta(seconds=self.send_frequency)

    # We're using the older decorator/yield model for compatibility with
    # Python versions before 3.5.
    # See: http://www.tornadoweb.org/en/stable/guide/coroutines.html#python-3-5-async-and-await
    @gen.coroutine
    def _sender(self):
        '''_sender is the control loop that pulls events off the `self.pending`
        queue and submits batches for actual sending.

        A batch is flushed when it reaches `max_batch_size` events, when
        `send_frequency` has elapsed since the last flush, or when no event
        arrives within `send_frequency`. A `None` item flushes what is
        buffered and ends the loop.
        '''
        events = []
        last_flush = time.time()
        while True:
            interval = self._send_interval()
            try:
                # Tornado treats a bare number as an absolute deadline on the
                # IOLoop clock; a timedelta is needed for a relative wait.
                ev = yield self.pending.get(timeout=interval)
            except TimeoutError:
                yield self._flush(events)
                events = []
                last_flush = time.time()
                continue

            if ev is None:
                # signals shutdown
                yield self._flush(events)
                return

            events.append(ev)
            batch_full = len(events) >= self.max_batch_size
            overdue = time.time() - last_flush > interval.total_seconds()
            if batch_full or overdue:
                yield self._flush(events)
                events = []
                # restart the interval, otherwise every later event would be
                # considered overdue and flushed on its own
                last_flush = time.time()

    @gen.coroutine
    def _flush(self, events):
        """Send ``events`` as one batch per destination; no-op when empty."""
        if not events:
            return
        for dest, group in group_events_by_destination(events).items():
            yield self._send_batch(dest, group)

    @gen.coroutine
    def _send_batch(self, destination, events):
        ''' Makes a single batch API request with the given list of events. The
        `destination` argument contains the write key, API host and dataset
        name used to build the request.

        Any exception raised while building or dispatching the request is
        reported on the responses queue, once per event.'''
        start = time.time()
        status_code = 0
        try:
            # enforce max_concurrent_batches
            yield self.batch_sem.acquire()
            url = urljoin(urljoin(destination.api_host, "/1/batch/"),
                          destination.dataset)
            payload = []
            for ev in events:
                event_time = ev.created_at.isoformat()
                if ev.created_at.tzinfo is None:
                    event_time += "Z"
                payload.append({
                    "time": event_time,
                    "samplerate": ev.sample_rate,
                    "data": ev.fields()})
            req = HTTPRequest(
                url,
                method='POST',
                headers={
                    "X-Honeycomb-Team": destination.writekey,
                    "Content-Type": "application/json",
                },
                body=json.dumps(payload, default=json_default_handler),
            )
            self.http_client.fetch(req, self._response_callback)
            # store the events that were sent so we can process responses later
            # it is important that we delete these eventually, or we'll run into memory issues
            self.batch_data[req] = {"start": start, "events": events}
        except Exception as e:
            # Catch all exceptions and hand them to the responses queue.
            self._enqueue_errors(status_code, e, start, events)
        finally:
            # NOTE(review): fetch() above is not awaited, so the semaphore is
            # released once the request is dispatched, not when the response
            # arrives - confirm whether that is the intended limit.
            self.batch_sem.release()

    def _enqueue_errors(self, status_code, error, start, events):
        """Queue one error response per event and count each as a send error."""
        for ev in events:
            self.sd.incr("send_errors")
            self._enqueue_response(status_code, "", error, start, ev.metadata)

    def _response_callback(self, resp):
        """Translate a batch HTTP response into per-event responses.

        The batch entry recorded by ``_send_batch`` is always removed from
        ``batch_data`` afterwards.
        """
        # resp.request should be the same HTTPRequest object built by _send_batch
        # and mapped to values in batch_data
        events = self.batch_data[resp.request]["events"]
        start = self.batch_data[resp.request]["start"]
        try:
            status_code = resp.code
            resp.rethrow()

            statuses = [d["status"] for d in json.loads(resp.body)]
            for ev, status in zip(events, statuses):
                self._enqueue_response(status, "", None, start, ev.metadata)
                self.sd.incr("messages_sent")
        except Exception as e:
            self._enqueue_errors(status_code, e, start, events)
            self.sd.incr("send_errors")
        finally:
            # clean up the data for this batch
            del self.batch_data[resp.request]

    def _enqueue_response(self, status_code, body, error, start, metadata):
        """Put a response dict on the responses queue.

        ``duration`` is the time since ``start`` in milliseconds. When not
        blocking on responses, a full queue silently drops the response.
        """
        resp = {
            "status_code": status_code,
            "body": body,
            "error": error,
            "duration": (time.time() - start) * 1000,
            "metadata": metadata
        }
        if self.block_on_response:
            self.responses.put(resp)
        else:
            try:
                self.responses.put_nowait(resp)
            except QueueFull:
                pass

    def close(self):
        '''call close to flush buffered events and shut down the sender
        nicely. A `None` sentinel is put on the pending queue and on the
        responses queue; if a queue is full, each put waits up to 10 seconds
        for room.'''
        from datetime import timedelta

        # A timedelta makes the 10 seconds relative to now; a bare number
        # would be read by Tornado as an absolute (long past) deadline.
        wait = timedelta(seconds=10)
        try:
            self.pending.put(None, wait)
        except QueueFull:
            pass
        # signal to the responses queue that nothing more is coming.
        try:
            self.responses.put(None, wait)
        except QueueFull:
            pass

    def get_response_queue(self):
        ''' return the responses queue on to which will be sent the response
        objects from each event send'''
        return self.responses
class Worker(Server):
    """ Worker Node

    Workers perform two functions:

    1.  **Serve data** from a local dictionary
    2.  **Perform computation** on that data and on data from peers

    Additionally workers keep a Center informed of their data and use that
    Center to gather data from other workers when necessary to perform a
    computation.

    You can start a worker with the ``dworker`` command line application::

        $ dworker scheduler-ip:port

    **State**

    * **data:** ``{key: object}``:
        Dictionary mapping keys to actual values
    * **active:** ``{key}``:
        Set of keys currently under computation
    * **ncores:** ``int``:
        Number of cores used by this worker process
    * **executor:** ``concurrent.futures.ThreadPoolExecutor``:
        Executor used to perform computation
    * **local_dir:** ``path``:
        Path on local machine to store temporary files
    * **center:** ``rpc``:
        Location of center or scheduler.  See ``.ip/.port`` attributes.
    * **name:** ``string``:
        Alias
    * **services:** ``{str: Server}``:
        Auxiliary web servers running on this worker
    * **service_ports:** ``{str: port}``:
        Port of each auxiliary service

    Examples
    --------

    Create centers and workers in Python:

    >>> from distributed import Center, Worker
    >>> c = Center('192.168.0.100', 8787)  # doctest: +SKIP
    >>> w = Worker(c.ip, c.port)  # doctest: +SKIP
    >>> yield w._start(port=8788)  # doctest: +SKIP

    Or use the command line::

       $ dcenter
       Start center at 127.0.0.1:8787

       $ dworker 127.0.0.1:8787
       Start worker at:            127.0.0.1:8788
       Registered with center at:  127.0.0.1:8787

    See Also
    --------
    distributed.center.Center:
    """

    def __init__(self, center_ip, center_port, ip=None, ncores=None,
                 loop=None, local_dir=None, services=None, service_ports=None,
                 name=None, **kwargs):
        """Set up state, the local directory, services and RPC handlers.

        ``services`` maps a name (or a ``(name, port)`` tuple) to a callable
        that is invoked with this worker and then asked to listen.  Remaining
        keyword arguments are forwarded to ``Server.__init__``.
        """
        self.ip = ip or get_ip()
        self._port = 0
        self.ncores = ncores or _ncores
        self.data = dict()
        self.loop = loop or IOLoop.current()
        self.status = None
        self.local_dir = local_dir or tempfile.mkdtemp(prefix='worker-')
        self.executor = ThreadPoolExecutor(self.ncores)
        # One token per core; executor_submit takes a token before submitting.
        # https://github.com/tornadoweb/tornado/issues/1595#issuecomment-198551572
        self.thread_tokens = Queue()
        for i in range(self.ncores):
            self.thread_tokens.put_nowait(i)
        self.center = rpc(ip=center_ip, port=center_port)
        self.active = set()
        self.name = name

        if not os.path.exists(self.local_dir):
            os.mkdir(self.local_dir)

        # Uploaded modules are written to local_dir and must be importable.
        if self.local_dir not in sys.path:
            sys.path.insert(0, self.local_dir)

        self.services = {}
        self.service_ports = service_ports or {}
        for k, v in (services or {}).items():
            if isinstance(k, tuple):
                k, port = k
            else:
                port = 0

            self.services[k] = v(self)
            self.services[k].listen(port)
            self.service_ports[k] = self.services[k].port

        handlers = {
            'compute': self.compute,
            'gather': self.gather,
            'compute-stream': self.compute_stream,
            'run': self.run,
            'get_data': self.get_data,
            'update_data': self.update_data,
            'delete_data': self.delete_data,
            'terminate': self.terminate,
            'ping': pingpong,
            'health': self.health,
            'upload_file': self.upload_file
        }

        super(Worker, self).__init__(handlers, **kwargs)

    @gen.coroutine
    def _start(self, port=0):
        """Listen on ``port`` and register with the center.

        Registration is retried every 0.5 seconds while it fails with
        ``OSError`` or ``StreamClosedError``.

        Raises ``ValueError`` if the center answers anything other than
        ``'OK'``.
        """
        self.listen(port)
        self.name = self.name or self.address
        for k, v in self.services.items():
            v.listen(0)
            self.service_ports[k] = v.port

        logger.info(' Start worker at: %20s:%d', self.ip, self.port)
        for k, v in self.service_ports.items():
            logger.info(' %16s at: %20s:%d', k, self.ip, v)
        logger.info('Waiting to connect to: %20s:%d',
                    self.center.ip, self.center.port)
        while True:
            try:
                resp = yield self.center.register(
                    ncores=self.ncores, address=(self.ip, self.port),
                    keys=list(self.data), services=self.service_ports,
                    name=self.name)
                break
            except (OSError, StreamClosedError):
                logger.debug("Unable to register with scheduler. Waiting")
                yield gen.sleep(0.5)
        if resp != 'OK':
            raise ValueError(resp)
        logger.info(' Registered to: %20s:%d',
                    self.center.ip, self.center.port)
        self.status = 'running'

    def start(self, port=0):
        """Schedule :meth:`_start` on this worker's event loop."""
        self.loop.add_callback(self._start, port)

    def identity(self, stream):
        """Return the type name, id and center address of this worker."""
        return {
            'type': type(self).__name__,
            'id': self.id,
            'center': (self.center.ip, self.center.port)
        }

    @gen.coroutine
    def _close(self, report=True, timeout=10):
        """Shut the worker down.

        When ``report`` is true the worker first unregisters from the center,
        waiting at most ``timeout`` seconds.  Afterwards streams, the server,
        the executor, the local directory and all services are torn down.
        """
        if report:
            yield gen.with_timeout(
                timedelta(seconds=timeout),
                self.center.unregister(address=(self.ip, self.port)),
                io_loop=self.loop)
        self.center.close_streams()
        self.stop()
        self.executor.shutdown()
        if os.path.exists(self.local_dir):
            shutil.rmtree(self.local_dir)

        for k, v in self.services.items():
            v.stop()
        self.status = 'closed'
        # NOTE(review): stop() was already called above; presumably it is
        # idempotent - confirm before removing either call.
        self.stop()

    @gen.coroutine
    def terminate(self, stream, report=True):
        """Handler: close the worker and answer ``'OK'``."""
        yield self._close(report=report)
        raise Return('OK')

    @property
    def address(self):
        """Address of this worker as an ``'ip:port'`` string."""
        return '%s:%d' % (self.ip, self.port)

    @property
    def address_tuple(self):
        """Address of this worker as an ``(ip, port)`` tuple."""
        return (self.ip, self.port)

    @gen.coroutine
    def gather(self, stream=None, who_has=None):
        """Handler: fetch keys that are not held locally from other workers.

        ``who_has`` maps each key to the addresses holding it.  Returns
        ``{'status': 'OK'}`` on success, or a ``'missing-data'`` status with
        the missing keys when ``gather_from_workers`` raises ``KeyError``.
        """
        who_has = {k: [coerce_to_address(addr) for addr in v]
                   for k, v in who_has.items()
                   if k not in self.data}
        try:
            result = yield gather_from_workers(who_has)
        except KeyError as e:
            # The message needs a placeholder for the exception; without it
            # the logging call itself fails to format.
            logger.warning("Could not find data: %s", e)
            raise Return({'status': 'missing-data',
                          'keys': e.args})
        else:
            self.data.update(result)
            raise Return({'status': 'OK'})

    @gen.coroutine
    def _ready_task(self, function=None, key=None, args=(), kwargs={},
                    task=None, who_has=None):
        """Collect dependencies and deserialize a task before execution.

        Missing dependencies are gathered from peers and reported to the
        center; ``function``/``args``/``kwargs``/``task`` are deserialized
        with ``loads``; data is then substituted into the arguments.

        Returns a dict whose ``'status'`` is ``'OK'`` (with the callable,
        packed arguments and diagnostics), ``'missing-data'``, or the error
        message produced by ``error_message`` when deserialization fails.
        """
        who_has = who_has or {}
        diagnostics = {}
        data = {k: self.data[k] for k in who_has if k in self.data}
        who_has = {k: set(map(coerce_to_address, v))
                   for k, v in who_has.items()
                   if k not in self.data}
        if who_has:
            try:
                logger.info("gather %d keys from peers: %s",
                            len(who_has), str(who_has))
                diagnostics['transfer-start'] = time()
                other = yield gather_from_workers(who_has)
                diagnostics['transfer-stop'] = time()
                self.data.update(other)
                yield self.center.add_keys(address=self.address,
                                           keys=list(other))
                data.update(other)
            except KeyError as e:
                logger.warning("Could not find data for %s", key)
                raise Return({'status': 'missing-data',
                              'keys': e.args,
                              'key': key})

        try:
            start = default_timer()
            if task is not None:
                task = loads(task)
            if function is not None:
                function = loads(function)
            if args:
                args = loads(args)
            if kwargs:
                kwargs = loads(kwargs)
            diagnostics['deserialization'] = default_timer() - start
        except Exception as e:
            logger.warning("Could not deserialize task", exc_info=True)
            raise Return(assoc(error_message(e), 'key', key))

        # A raw task is run through execute_task instead of a plain function.
        if task is not None:
            assert not function and not args and not kwargs
            function = execute_task
            args = (task,)

        # Fill args with data
        args2 = pack_data(args, data)
        kwargs2 = pack_data(kwargs, data)

        raise Return({'status': 'OK',
                      'function': function,
                      'args': args2,
                      'kwargs': kwargs2,
                      'diagnostics': diagnostics,
                      'key': key})

    @gen.coroutine
    def executor_submit(self, key, function, *args, **kwargs):
        """ Safely run function in thread pool executor

        We've run into issues running concurrent.future futures within
        tornado.  Apparently it's advantageous to use timeouts and periodic
        callbacks to ensure things run smoothly.  This can get tricky, so we
        pull it off into an separate method.

        A token is taken from ``thread_tokens`` before the job is submitted
        and handed back once waiting on the future has finished.
        """
        token = yield self.thread_tokens.get()
        job_counter[0] += 1
        i = job_counter[0]
        # logger.info("%s:%d Starts job %d, %s", self.ip, self.port, i, key)
        future = self.executor.submit(function, *args, **kwargs)
        pc = PeriodicCallback(lambda: logger.debug("future state: %s - %s",
                                                   key, future._state), 1000)
        pc.start()
        try:
            if sys.version_info < (3, 2):
                yield future
            else:
                # Poll in one-second slices and log progress on each timeout.
                while not future.done() and future._state != 'FINISHED':
                    try:
                        yield gen.with_timeout(timedelta(seconds=1), future,
                                               io_loop=self.loop)
                        break
                    except gen.TimeoutError:
                        logger.info("work queue size: %d",
                                    self.executor._work_queue.qsize())
                        logger.info("future state: %s", future._state)
                        logger.info("Pending job %d: %s", i, future)
        finally:
            pc.stop()
            self.thread_tokens.put(token)

        result = future.result()

        logger.info("Finish job %d, %s", i, key)
        raise gen.Return(result)

    @gen.coroutine
    def compute_stream(self, stream):
        """Handler: serve compute requests arriving on a long-lived stream.

        Each ``'compute-task'`` message is scheduled on the loop and its
        result (or error message) is sent back through a ``BatchedSend``.
        The loop ends when the stream closes or a ``'close'`` op is received.
        """
        with log_errors():
            logger.debug("Open compute stream")
            bstream = BatchedSend(interval=10, loop=self.loop)
            bstream.start(stream)

        @gen.coroutine
        def process(msg):
            try:
                result = yield self.compute(report=False, **msg)
                bstream.send(result)
            except Exception as e:
                logger.exception(e)
                bstream.send(assoc(error_message(e), 'key', msg.get('key')))

        with log_errors():
            closed = False
            while not closed:
                try:
                    msgs = yield read(stream)
                except StreamClosedError:
                    break
                if not isinstance(msgs, list):
                    msgs = [msgs]

                for msg in msgs:
                    op = msg.pop('op', None)
                    if op == 'close':
                        # A bare break would only leave this for-loop; the
                        # flag makes the outer read loop stop as well.
                        closed = True
                        break
                    elif op == 'compute-task':
                        self.loop.add_callback(process, msg)
                    else:
                        logger.warning("Unknown operation %s, %s", op, msg)

            yield bstream.close()
            logger.info("Close compute stream")

    @gen.coroutine
    def compute(self, stream=None, function=None, key=None, args=(),
                kwargs={}, task=None, who_has=None, report=True):
        """ Execute function

        Prepares the task with :meth:`_ready_task`, runs it in the executor
        and, on success, stores the result under ``key`` (reporting the new
        key to the center when ``report`` is true).  Returns the result
        message, or the ``_ready_task`` message if preparation failed.
        """
        self.active.add(key)
        try:
            # Ready function for computation
            msg = yield self._ready_task(function=function, key=key,
                                         args=args, kwargs=kwargs, task=task,
                                         who_has=who_has)
            if msg['status'] != 'OK':
                raise Return(msg)

            function = msg['function']
            args = msg['args']
            kwargs = msg['kwargs']

            # Log and compute in separate thread
            result = yield self.executor_submit(key, apply_function, function,
                                                args, kwargs)

            result['key'] = key
            result.update(msg['diagnostics'])

            if result['status'] == 'OK':
                self.data[key] = result.pop('result')
                if report:
                    response = yield self.center.add_keys(
                        address=(self.ip, self.port), keys=[key])
                    if not response == 'OK':
                        logger.warning(
                            'Could not report results to center: %s',
                            str(response))
            else:
                logger.warning(" Compute Failed\n"
                               "Function: %s\n"
                               "args: %s\n"
                               "kwargs: %s\n",
                               str(funcname(function))[:1000],
                               str(args)[:1000],
                               str(kwargs)[:1000], exc_info=True)

            logger.debug("Send compute response to scheduler: %s, %s",
                         key, msg)
            raise Return(result)
        finally:
            # Also runs when an exception propagates, so a failed task does
            # not stay listed as active.
            self.active.discard(key)

    @gen.coroutine
    def run(self, stream, function=None, args=(), kwargs={}):
        """Handler: deserialize and call ``function`` on the event loop.

        Returns ``{'status': 'OK', 'result': ...}`` with the serialized
        result, or the ``error_message`` of the raised exception.
        """
        function = loads(function)
        if args:
            args = loads(args)
        if kwargs:
            kwargs = loads(kwargs)
        try:
            result = function(*args, **kwargs)
        except Exception as e:
            logger.warning(" Run Failed\n"
                           "Function: %s\n"
                           "args: %s\n"
                           "kwargs: %s\n",
                           str(funcname(function))[:1000],
                           str(args)[:1000],
                           str(kwargs)[:1000], exc_info=True)

            response = error_message(e)
        else:
            response = {
                'status': 'OK',
                'result': dumps(result),
            }
        raise Return(response)

    @gen.coroutine
    def update_data(self, stream, data=None, report=True):
        """Handler: deserialize ``data`` and merge it into local storage.

        Returns the status and the ``sizeof`` of every stored value.
        """
        data = valmap(loads, data)
        self.data.update(data)
        if report:
            response = yield self.center.add_keys(
                address=(self.ip, self.port), keys=list(data))
            assert response == 'OK'
        info = {'nbytes': {k: sizeof(v) for k, v in data.items()},
                'status': 'OK'}
        raise Return(info)

    @gen.coroutine
    def delete_data(self, stream, keys=None, report=True):
        """Handler: drop ``keys`` from local storage, optionally reporting."""
        for key in keys:
            if key in self.data:
                del self.data[key]
        logger.info("Deleted %d keys", len(keys))
        if report:
            logger.debug("Reporting loss of keys to center")
            yield self.center.remove_keys(address=self.address,
                                          keys=list(keys))
        raise Return('OK')

    def get_data(self, stream, keys=None):
        """Handler: return the serialized values of the locally held keys."""
        return {k: dumps(self.data[k]) for k in keys if k in self.data}

    def upload_file(self, stream, filename=None, data=None, load=True):
        """Handler: write ``data`` to ``local_dir`` and optionally import it.

        With ``load`` true, ``.py``/``.pyc`` files are (re)imported as
        modules and ``.egg`` files are appended to ``sys.path`` and each
        distribution found in them is (re)imported.  Returns an ``'OK'``
        status with the number of bytes written, or an ``'error'`` status
        with the serialized exception when loading fails.
        """
        out_filename = os.path.join(self.local_dir, filename)
        if isinstance(data, unicode):
            data = data.encode()
        with open(out_filename, 'wb') as f:
            f.write(data)
            f.flush()

        if load:
            try:
                name, ext = os.path.splitext(filename)
                if ext in ('.py', '.pyc'):
                    logger.info("Reload module %s from .py file", name)
                    name = name.split('-')[0]
                    reload(import_module(name))
                if ext == '.egg':
                    sys.path.append(out_filename)
                    # find_distributions yields lazily; materialize it so the
                    # emptiness check below is meaningful.
                    pkgs = list(pkg_resources.find_distributions(out_filename))
                    for pkg in pkgs:
                        logger.info("Load module %s from egg",
                                    pkg.project_name)
                        reload(import_module(pkg.project_name))
                    if not pkgs:
                        logger.warning("Found no packages in egg file")
            except Exception as e:
                logger.exception(e)
                return {'status': 'error', 'exception': dumps(e)}
        return {'status': 'OK', 'nbytes': len(data)}

    def health(self, stream=None):
        """ Information about worker

        Always reports the number of active and stored keys and the current
        time.  When ``psutil`` can be imported, CPU and memory figures are
        added, as well as network and disk byte counts relative to the
        previous call (absent on the first call).
        """
        d = {'active': len(self.active),
             'stored': len(self.data),
             'time': time()}
        try:
            import psutil
            mem = psutil.virtual_memory()
            d.update({'cpu': psutil.cpu_percent(),
                      'memory': mem.total,
                      'memory-percent': mem.percent})

            try:
                net_io = psutil.net_io_counters()
                d['network-send'] = net_io.bytes_sent - self._last_net_io.bytes_sent
                d['network-recv'] = net_io.bytes_recv - self._last_net_io.bytes_recv
            except AttributeError:
                # first call: no previous counters recorded yet
                pass
            self._last_net_io = net_io

            try:
                disk_io = psutil.disk_io_counters()
                d['disk-read'] = disk_io.read_bytes - self._last_disk_io.read_bytes
                d['disk-write'] = disk_io.write_bytes - self._last_disk_io.write_bytes
            except AttributeError:
                # first call: no previous counters recorded yet
                pass
            self._last_disk_io = disk_io

        except ImportError:
            pass
        return d
class Publisher(MQAsyncSub):
    """Handles new data to be passed on to subscribers."""

    def __init__(self):
        """Create the zmq context, both message queues and the MQ publisher."""
        self.ctx = zmq.Context()
        # messages received from the MQ, waiting to be pushed to subscribers
        self.WSmessages = Queue()
        # messages waiting to be sent to the MQ
        self.MQmessages = Queue()
        # NOTE(review): this stores the return value of __init__ in self.sub.
        self.sub = MQAsyncSub.__init__(self, self.ctx, 'admin', [])
        self.pub = MQPub(self.ctx, 'admin-ws')
        self.subscribers = set()

    def register(self, subscriber):
        """Register a new subscriber."""
        self.subscribers.add(subscriber)

    def deregister(self, subscriber):
        """Stop publishing to a subscriber.

        Unknown subscribers are ignored.
        """
        self.subscribers.discard(subscriber)

    @gen.coroutine
    def on_message(self, did, msg):
        """Receive message from MQ sub and send to WS."""
        yield self.WSmessages.put({"msgid": did, "content": msg})

    @gen.coroutine
    def publishToWS(self):
        """Forever forward queued MQ messages to every subscriber."""
        while True:
            message = yield self.WSmessages.get()
            if self.subscribers:
                # submit to all subscribers and wait for all of them
                yield [subscriber.submit(message)
                       for subscriber in self.subscribers]

    @gen.coroutine
    def publishToMQ(self):
        """Forever take queued messages and hand them to :meth:`sendToMQ`."""
        while True:
            message = yield self.MQmessages.get()
            self.sendToMQ(message)

    def sendToMQ(self, message):
        """Decode a JSON message and send it to the MQ.

        A message with ``mq_request`` and ``data`` is sent as a request to
        ``dst`` (default ``'manager'``) and the reply, if any, is printed.
        A message with ``mq_publish`` and ``data`` is published as an event.
        Any error is printed rather than raised.
        """
        try:
            jsons = json.loads(message)
            # req/rep
            if 'mq_request' in jsons and 'data' in jsons:
                cli = MQSyncReq(zmq.Context())
                try:
                    msg = MQMessage()
                    msg.set_action(str(jsons['mq_request']))
                    msg.set_data(jsons['data'])
                    print(u"REQ : {0}".format(msg.get()))
                    dst = str(jsons['dst']) if 'dst' in jsons else 'manager'
                    res = cli.request(dst, msg.get(), timeout=10)
                    if res:
                        print(res.get())
                finally:
                    # release the request client even when the request fails
                    cli.shutdown()
            # pub
            elif 'mq_publish' in jsons and 'data' in jsons:
                self.pub.send_event(jsons['mq_publish'], jsons['data'])
        except Exception as e:
            print(u"Error sending mq message: {0}".format(e))
class Kernel(SingletonConfigurable): #--------------------------------------------------------------------------- # Kernel interface #--------------------------------------------------------------------------- # attribute to override with a GUI eventloop = Any(None) @observe('eventloop') def _update_eventloop(self, change): """schedule call to eventloop from IOLoop""" loop = ioloop.IOLoop.current() if change.new is not None: loop.add_callback(self.enter_eventloop) session = Instance(Session, allow_none=True) profile_dir = Instance('IPython.core.profiledir.ProfileDir', allow_none=True) shell_stream = Instance(ZMQStream, allow_none=True) shell_streams = List( help="""Deprecated shell_streams alias. Use shell_stream .. versionchanged:: 6.0 shell_streams is deprecated. Use shell_stream. """) @default("shell_streams") def _shell_streams_default(self): warnings.warn( "Kernel.shell_streams is deprecated in yapkernel 6.0. Use Kernel.shell_stream", DeprecationWarning, stacklevel=2, ) if self.shell_stream is not None: return [self.shell_stream] else: return [] @observe("shell_streams") def _shell_streams_changed(self, change): warnings.warn( "Kernel.shell_streams is deprecated in yapkernel 6.0. Use Kernel.shell_stream", DeprecationWarning, stacklevel=2, ) if len(change.new) > 1: warnings.warn( "Kernel only supports one shell stream. Additional streams will be ignored.", RuntimeWarning, stacklevel=2, ) if change.new: self.shell_stream = change.new[0] control_stream = Instance(ZMQStream, allow_none=True) debug_shell_socket = Any() control_thread = Any() iopub_socket = Any() iopub_thread = Any() stdin_socket = Any() log = Instance(logging.Logger, allow_none=True) # identities: int_id = Integer(-1) ident = Unicode() @default('ident') def _default_ident(self): return str(uuid.uuid4()) # This should be overridden by wrapper kernels that implement any real # language. 
language_info = { 'name': 'Prolog (YAP)', 'mimetype': 'text/x-prolog', 'file_extension': '.yap', } # any links that should go in the help menu help_links = List() # Private interface _darwin_app_nap = Bool( True, help="""Whether to use appnope for compatibility with OS X App Nap. Only affects OS X >= 10.9. """).tag(config=True) # track associations with current request _allow_stdin = Bool(False) _parents = Dict({"shell": {}, "control": {}}) _parent_ident = Dict({'shell': b'', 'control': b''}) @property def _parent_header(self): warnings.warn( "Kernel._parent_header is deprecated in yapkernel 6. Use .get_parent()", DeprecationWarning, stacklevel=2, ) return self.get_parent(channel="shell") # Time to sleep after flushing the stdout/err buffers in each execute # cycle. While this introduces a hard limit on the minimal latency of the # execute cycle, it helps prevent output synchronization problems for # clients. # Units are in seconds. The minimum zmq latency on local host is probably # ~150 microseconds, set this to 500us for now. We may need to increase it # a little if it's not enough after more interactive testing. _execute_sleep = Float(0.0005).tag(config=True) # Frequency of the kernel's event loop. # Units are in seconds, kernel subclasses for GUI toolkits may need to # adapt to milliseconds. _poll_interval = Float(0.01).tag(config=True) stop_on_error_timeout = Float( 0.0, config=True, help="""time (in seconds) to wait for messages to arrive when aborting queued requests after an error. Requests that arrive within this window after an error will be cancelled. Increase in the event of unusually slow network causing significant delays, which can manifest as e.g. "Run all" in a notebook aborting some, but not all, messages after an error. """) # If the shutdown was requested over the network, we leave here the # necessary reply message so it can be sent by our registered atexit # handler. 
This ensures that the reply is only sent to clients truly at # the end of our shutdown process (which happens after the underlying # IPython shell's own shutdown). _shutdown_message = None # This is a dict of port number that the kernel is listening on. It is set # by record_ports and used by connect_request. _recorded_ports = Dict() # set of aborted msg_ids aborted = Set() # Track execution count here. For IPython, we override this to use the # execution count we store in the shell. execution_count = 0 msg_types = [ 'execute_request', 'complete_request', 'inspect_request', 'history_request', 'comm_info_request', 'kernel_info_request', 'connect_request', 'shutdown_request', 'is_complete_request', 'interrupt_request', # deprecated: 'apply_request', ] # add deprecated ipyparallel control messages control_msg_types = msg_types + [ 'clear_request', 'abort_request', 'debug_request' ] def __init__(self, **kwargs): super(Kernel, self).__init__(**kwargs) # Build dict of handlers for message types self.shell_handlers = {} for msg_type in self.msg_types: self.shell_handlers[msg_type] = getattr(self, msg_type) self.control_handlers = {} for msg_type in self.control_msg_types: self.control_handlers[msg_type] = getattr(self, msg_type) self.control_queue = Queue() def dispatch_control(self, msg): self.control_queue.put_nowait(msg) async def poll_control_queue(self): while True: msg = await self.control_queue.get() # handle tracers from _flush_control_queue if isinstance(msg, (concurrent.futures.Future, asyncio.Future)): msg.set_result(None) continue await self.process_control(msg) async def _flush_control_queue(self): """Flush the control queue, wait for processing of any pending messages""" if self.control_thread: control_loop = self.control_thread.io_loop # concurrent.futures.Futures are threadsafe # and can be used to await across threads tracer_future = concurrent.futures.Future() awaitable_future = asyncio.wrap_future(tracer_future) else: control_loop = self.io_loop 
tracer_future = awaitable_future = asyncio.Future() def _flush(): # control_stream.flush puts messages on the queue self.control_stream.flush() # put Future on the queue after all of those, # so we can wait for all queued messages to be processed self.control_queue.put(tracer_future) control_loop.add_callback(_flush) return awaitable_future async def process_control(self, msg): """dispatch control requests""" idents, msg = self.session.feed_identities(msg, copy=False) try: msg = self.session.deserialize(msg, content=True, copy=False) except Exception: self.log.error("Invalid Control Message", exc_info=True) return self.log.debug("Control received: %s", msg) # Set the parent message for side effects. self.set_parent(idents, msg, channel='control') self._publish_status('busy', 'control') header = msg['header'] msg_type = header['msg_type'] handler = self.control_handlers.get(msg_type, None) if handler is None: self.log.error("UNKNOWN CONTROL MESSAGE TYPE: %r", msg_type) else: try: result = handler(self.control_stream, idents, msg) if inspect.isawaitable(result): await result except Exception: self.log.error("Exception in control handler:", exc_info=True) sys.stdout.flush() sys.stderr.flush() self._publish_status('idle', 'control') # flush to ensure reply is sent self.control_stream.flush(zmq.POLLOUT) def should_handle(self, stream, msg, idents): """Check whether a shell-channel message should be handled Allows subclasses to prevent handling of certain messages (e.g. aborted requests). """ msg_id = msg['header']['msg_id'] if msg_id in self.aborted: # is it safe to assume a msg_id will not be resubmitted? 
self.aborted.remove(msg_id) self._send_abort_reply(stream, msg, idents) return False return True async def dispatch_shell(self, msg): """dispatch shell requests""" # flush control queue before handling shell requests await self._flush_control_queue() idents, msg = self.session.feed_identities(msg, copy=False) try: msg = self.session.deserialize(msg, content=True, copy=False) except Exception: self.log.error("Invalid Message", exc_info=True) return # Set the parent message for side effects. self.set_parent(idents, msg, channel='shell') self._publish_status('busy', 'shell') msg_type = msg['header']['msg_type'] # Only abort execute requests if self._aborting and msg_type == 'execute_request': self._send_abort_reply(self.shell_stream, msg, idents) self._publish_status('idle', 'shell') # flush to ensure reply is sent before # handling the next request self.shell_stream.flush(zmq.POLLOUT) return # Print some info about this message and leave a '--->' marker, so it's # easier to trace visually the message chain when debugging. Each # handler prints its message at the end. self.log.debug('\n*** MESSAGE TYPE:%s***', msg_type) self.log.debug(' Content: %s\n --->\n ', msg['content']) if not self.should_handle(self.shell_stream, msg, idents): return handler = self.shell_handlers.get(msg_type, None) if handler is None: self.log.warning("Unknown message type: %r", msg_type) else: self.log.debug("%s: %s", msg_type, msg) try: self.pre_handler_hook() except Exception: self.log.debug("Unable to signal in pre_handler_hook:", exc_info=True) try: result = handler(self.shell_stream, idents, msg) if inspect.isawaitable(result): await result except Exception: self.log.error("Exception in message handler:", exc_info=True) except KeyboardInterrupt: # Ctrl-c shouldn't crash the kernel here. 
self.log.error("KeyboardInterrupt caught in kernel.") finally: try: self.post_handler_hook() except Exception: self.log.debug("Unable to signal in post_handler_hook:", exc_info=True) sys.stdout.flush() sys.stderr.flush() self._publish_status('idle', 'shell') # flush to ensure reply is sent before # handling the next request self.shell_stream.flush(zmq.POLLOUT) def pre_handler_hook(self): """Hook to execute before calling message handler""" # ensure default_int_handler during handler call self.saved_sigint_handler = signal(SIGINT, default_int_handler) def post_handler_hook(self): """Hook to execute after calling message handler""" signal(SIGINT, self.saved_sigint_handler) def enter_eventloop(self): """enter eventloop""" self.log.info("Entering eventloop %s", self.eventloop) # record handle, so we can check when this changes eventloop = self.eventloop if eventloop is None: self.log.info("Exiting as there is no eventloop") return def advance_eventloop(): # check if eventloop changed: if self.eventloop is not eventloop: self.log.info("exiting eventloop %s", eventloop) return if self.msg_queue.qsize(): self.log.debug("Delaying eventloop due to waiting messages") # still messages to process, make the eventloop wait schedule_next() return self.log.debug("Advancing eventloop %s", eventloop) try: eventloop(self) except KeyboardInterrupt: # Ctrl-C shouldn't crash the kernel self.log.error("KeyboardInterrupt caught in kernel") pass if self.eventloop is eventloop: # schedule advance again schedule_next() def schedule_next(): """Schedule the next advance of the eventloop""" # flush the eventloop every so often, # giving us a chance to handle messages in the meantime self.log.debug("Scheduling eventloop advance") self.io_loop.call_later(0.001, advance_eventloop) # begin polling the eventloop schedule_next() async def do_one_iteration(self): """Process a single shell message Any pending control messages will be flushed as well .. 
versionchanged:: 5 This is now a coroutine """ # flush messages off of shell stream into the message queue self.shell_stream.flush() # process at most one shell message per iteration await self.process_one(wait=False) async def process_one(self, wait=True): """Process one request Returns None if no message was handled. """ if wait: t, dispatch, args = await self.msg_queue.get() else: try: t, dispatch, args = self.msg_queue.get_nowait() except asyncio.QueueEmpty: return None await dispatch(*args) async def dispatch_queue(self): """Coroutine to preserve order of message handling Ensures that only one message is processing at a time, even when the handler is async """ while True: try: await self.process_one() except Exception: self.log.exception("Error in message handler") _message_counter = Any(help="""Monotonic counter of messages """, ) @default('_message_counter') def _message_counter_default(self): return itertools.count() def schedule_dispatch(self, dispatch, *args): """schedule a message for dispatch""" idx = next(self._message_counter) self.msg_queue.put_nowait(( idx, dispatch, args, )) # ensure the eventloop wakes up self.io_loop.add_callback(lambda: None) def start(self): """register dispatchers for streams""" self.io_loop = ioloop.IOLoop.current() self.msg_queue = Queue() self.io_loop.add_callback(self.dispatch_queue) self.control_stream.on_recv(self.dispatch_control, copy=False) if self.control_thread: control_loop = self.control_thread.io_loop else: control_loop = self.io_loop asyncio.run_coroutine_threadsafe(self.poll_control_queue(), control_loop.asyncio_loop) self.shell_stream.on_recv( partial( self.schedule_dispatch, self.dispatch_shell, ), copy=False, ) # publish idle status self._publish_status('starting', 'shell') def record_ports(self, ports): """Record the ports that this kernel is using. The creator of the Kernel instance must call this methods if they want the :meth:`connect_request` method to return the port numbers. 
""" self._recorded_ports = ports #--------------------------------------------------------------------------- # Kernel request handlers #--------------------------------------------------------------------------- def _publish_execute_input(self, code, parent, execution_count): """Publish the code request on the iopub stream.""" self.session.send(self.iopub_socket, 'execute_input', { 'code': code, 'execution_count': execution_count }, parent=parent, ident=self._topic('execute_input')) def _publish_status(self, status, channel, parent=None): """send status (busy/idle) on IOPub""" self.session.send( self.iopub_socket, "status", {"execution_state": status}, parent=parent or self.get_parent(channel), ident=self._topic("status"), ) def _publish_debug_event(self, event): self.session.send( self.iopub_socket, "debug_event", event, parent=self.get_parent("control"), ident=self._topic("debug_event"), ) def set_parent(self, ident, parent, channel='shell'): """Set the current parent request Side effects (IOPub messages) and replies are associated with the request that caused them via the parent_header. The parent identity is used to route input_request messages on the stdin channel. """ self._parent_ident[channel] = ident self._parents[channel] = parent def get_parent(self, channel="shell"): """Get the parent request associated with a channel. .. versionadded:: 6 Parameters ---------- channel : str the name of the channel ('shell' or 'control') Returns ------- message : dict the parent message for the most recent request on the channel. """ return self._parents.get(channel, {}) def send_response(self, stream, msg_or_type, content=None, ident=None, buffers=None, track=False, header=None, metadata=None, channel='shell'): """Send a response to the message we're currently processing. This accepts all the parameters of :meth:`jupyter_client.session.Session.send` except ``parent``. This relies on :meth:`set_parent` having been called for the current message. 
""" return self.session.send( stream, msg_or_type, content, self.get_parent(channel), ident, buffers, track, header, metadata, ) def init_metadata(self, parent): """Initialize metadata. Run at the beginning of execution requests. """ # FIXME: `started` is part of ipyparallel # Remove for yapkernel 5.0 return { 'started': now(), } def finish_metadata(self, parent, metadata, reply_content): """Finish populating metadata. Run after completing an execution request. """ return metadata async def execute_request(self, stream, ident, parent): """handle an execute_request""" try: content = parent['content'] code = content['code'] silent = content['silent'] store_history = content.get('store_history', not silent) user_expressions = content.get('user_expressions', {}) allow_stdin = content.get('allow_stdin', False) except Exception: self.log.error("Got bad msg: ") self.log.error("%s", parent) return stop_on_error = content.get('stop_on_error', True) metadata = self.init_metadata(parent) # Re-broadcast our input for the benefit of listening clients, and # start computing output if not silent: self.execution_count += 1 self._publish_execute_input(code, parent, self.execution_count) reply_content = self.do_execute( code, silent, store_history, user_expressions, allow_stdin, ) if inspect.isawaitable(reply_content): reply_content = await reply_content # Flush output before sending the reply. sys.stdout.flush() sys.stderr.flush() # FIXME: on rare occasions, the flush doesn't seem to make it to the # clients... This seems to mitigate the problem, but we definitely need # to better understand what's going on. if self._execute_sleep: time.sleep(self._execute_sleep) # Send the reply. 
reply_content = json_clean(reply_content) metadata = self.finish_metadata(parent, metadata, reply_content) reply_msg = self.session.send(stream, 'execute_reply', reply_content, parent, metadata=metadata, ident=ident) self.log.debug("%s", reply_msg) if not silent and reply_msg['content'][ 'status'] == 'error' and stop_on_error: await self._abort_queues() def do_execute(self, code, silent, store_history=True, user_expressions=None, allow_stdin=False): """Execute user code. Must be overridden by subclasses. """ raise NotImplementedError async def complete_request(self, stream, ident, parent): content = parent['content'] code = content['code'] cursor_pos = content['cursor_pos'] matches = self.do_complete(code, cursor_pos) if inspect.isawaitable(matches): matches = await matches matches = json_clean(matches) self.session.send(stream, "complete_reply", matches, parent, ident) def do_complete(self, code, cursor_pos): """Override in subclasses to find completions. """ return { 'matches': [], 'cursor_end': cursor_pos, 'cursor_start': cursor_pos, 'metadata': {}, 'status': 'ok' } async def inspect_request(self, stream, ident, parent): content = parent['content'] reply_content = self.do_inspect( content['code'], content['cursor_pos'], content.get('detail_level', 0), ) if inspect.isawaitable(reply_content): reply_content = await reply_content # Before we send this object over, we scrub it for JSON usage reply_content = json_clean(reply_content) msg = self.session.send(stream, 'inspect_reply', reply_content, parent, ident) self.log.debug("%s", msg) def do_inspect(self, code, cursor_pos, detail_level=0): """Override in subclasses to allow introspection. 
""" return {'status': 'ok', 'data': {}, 'metadata': {}, 'found': False} async def history_request(self, stream, ident, parent): content = parent['content'] reply_content = self.do_history(**content) if inspect.isawaitable(reply_content): reply_content = await reply_content reply_content = json_clean(reply_content) msg = self.session.send(stream, 'history_reply', reply_content, parent, ident) self.log.debug("%s", msg) def do_history(self, hist_access_type, output, raw, session=None, start=None, stop=None, n=None, pattern=None, unique=False): """Override in subclasses to access history. """ return {'status': 'ok', 'history': []} async def connect_request(self, stream, ident, parent): if self._recorded_ports is not None: content = self._recorded_ports.copy() else: content = {} content['status'] = 'ok' msg = self.session.send(stream, 'connect_reply', content, parent, ident) self.log.debug("%s", msg) @property def kernel_info(self): return { 'protocol_version': kernel_protocol_version, 'implementation': self.implementation, 'implementation_version': self.implementation_version, 'language_info': self.language_info, 'banner': self.banner, 'help_links': self.help_links, } async def kernel_info_request(self, stream, ident, parent): content = {'status': 'ok'} content.update(self.kernel_info) msg = self.session.send(stream, 'kernel_info_reply', content, parent, ident) self.log.debug("%s", msg) async def comm_info_request(self, stream, ident, parent): content = parent['content'] target_name = content.get('target_name', None) # Should this be moved to ipkernel? 
if hasattr(self, 'comm_manager'): comms = { k: dict(target_name=v.target_name) for (k, v) in self.comm_manager.comms.items() if v.target_name == target_name or target_name is None } else: comms = {} reply_content = dict(comms=comms, status='ok') msg = self.session.send(stream, 'comm_info_reply', reply_content, parent, ident) self.log.debug("%s", msg) async def interrupt_request(self, stream, ident, parent): pid = os.getpid() pgid = os.getpgid(pid) if os.name == "nt": self.log.error("Interrupt message not supported on Windows") else: # Prefer process-group over process if pgid and hasattr(os, "killpg"): try: os.killpg(pgid, SIGINT) return except OSError: pass try: os.kill(pid, SIGINT) except OSError: pass content = parent['content'] self.session.send(stream, 'interrupt_reply', content, parent, ident=ident) return async def shutdown_request(self, stream, ident, parent): content = self.do_shutdown(parent['content']['restart']) if inspect.isawaitable(content): content = await content self.session.send(stream, 'shutdown_reply', content, parent, ident=ident) # same content, but different msg_id for broadcasting on IOPub self._shutdown_message = self.session.msg('shutdown_reply', content, parent) self._at_shutdown() self.log.debug('Stopping control ioloop') control_io_loop = self.control_stream.io_loop control_io_loop.add_callback(control_io_loop.stop) self.log.debug('Stopping shell ioloop') shell_io_loop = self.shell_stream.io_loop shell_io_loop.add_callback(shell_io_loop.stop) def do_shutdown(self, restart): """Override in subclasses to do things when the frontend shuts down the kernel. 
""" return {'status': 'ok', 'restart': restart} async def is_complete_request(self, stream, ident, parent): content = parent['content'] code = content['code'] reply_content = self.do_is_complete(code) if inspect.isawaitable(reply_content): reply_content = await reply_content reply_content = json_clean(reply_content) reply_msg = self.session.send(stream, 'is_complete_reply', reply_content, parent, ident) self.log.debug("%s", reply_msg) def do_is_complete(self, code): """Override in subclasses to find completions. """ return {'status': 'unknown'} async def debug_request(self, stream, ident, parent): content = parent['content'] reply_content = self.do_debug_request(content) if inspect.isawaitable(reply_content): reply_content = await reply_content reply_content = json_clean(reply_content) reply_msg = self.session.send(stream, 'debug_reply', reply_content, parent, ident) self.log.debug("%s", reply_msg) async def do_debug_request(self, msg): raise NotImplementedError #--------------------------------------------------------------------------- # Engine methods (DEPRECATED) #--------------------------------------------------------------------------- async def apply_request(self, stream, ident, parent): self.log.warning( "apply_request is deprecated in kernel_base, moving to ipyparallel." 
) try: content = parent['content'] bufs = parent['buffers'] msg_id = parent['header']['msg_id'] except Exception: self.log.error("Got bad msg: %s", parent, exc_info=True) return md = self.init_metadata(parent) reply_content, result_buf = self.do_apply(content, bufs, msg_id, md) # flush i/o sys.stdout.flush() sys.stderr.flush() md = self.finish_metadata(parent, md, reply_content) self.session.send(stream, 'apply_reply', reply_content, parent=parent, ident=ident, buffers=result_buf, metadata=md) def do_apply(self, content, bufs, msg_id, reply_metadata): """DEPRECATED""" raise NotImplementedError #--------------------------------------------------------------------------- # Control messages (DEPRECATED) #--------------------------------------------------------------------------- async def abort_request(self, stream, ident, parent): """abort a specific msg by id""" self.log.warning( "abort_request is deprecated in kernel_base. It is only part of IPython parallel" ) msg_ids = parent['content'].get('msg_ids', None) if isinstance(msg_ids, str): msg_ids = [msg_ids] if not msg_ids: self._abort_queues() for mid in msg_ids: self.aborted.add(str(mid)) content = dict(status='ok') reply_msg = self.session.send(stream, 'abort_reply', content=content, parent=parent, ident=ident) self.log.debug("%s", reply_msg) async def clear_request(self, stream, idents, parent): """Clear our namespace.""" self.log.warning( "clear_request is deprecated in kernel_base. 
It is only part of IPython parallel" ) content = self.do_clear() self.session.send(stream, 'clear_reply', ident=idents, parent=parent, content=content) def do_clear(self): """DEPRECATED since 4.0.3""" raise NotImplementedError #--------------------------------------------------------------------------- # Protected interface #--------------------------------------------------------------------------- def _topic(self, topic): """prefixed topic for IOPub messages""" base = "kernel.%s" % self.ident return ("%s.%s" % (base, topic)).encode() _aborting = Bool(False) async def _abort_queues(self): self.shell_stream.flush() self._aborting = True def stop_aborting(): self.log.info("Finishing abort") self._aborting = False asyncio.get_event_loop().call_later(self.stop_on_error_timeout, stop_aborting) def _send_abort_reply(self, stream, msg, idents): """Send a reply to an aborted request""" self.log.info( f"Aborting {msg['header']['msg_id']}: {msg['header']['msg_type']}") reply_type = msg["header"]["msg_type"].rsplit("_", 1)[0] + "_reply" status = {"status": "aborted"} md = self.init_metadata(msg) md = self.finish_metadata(msg, md, status) md.update(status) self.session.send( stream, reply_type, metadata=md, content=status, parent=msg, ident=idents, ) def _no_raw_input(self): """Raise StdinNotImplementedError if active frontend doesn't support stdin.""" raise StdinNotImplementedError("raw_input was called, but this " "frontend does not support stdin.") def getpass(self, prompt='', stream=None): """Forward getpass to frontends Raises ------ StdinNotImplementedError if active frontend doesn't support stdin. """ if not self._allow_stdin: raise StdinNotImplementedError( "getpass was called, but this frontend does not support input requests." 
) if stream is not None: import warnings warnings.warn( "The `stream` parameter of `getpass.getpass` will have no effect when using yapkernel", UserWarning, stacklevel=2, ) return self._input_request( prompt, self._parent_ident["shell"], self.get_parent("shell"), password=True, ) def raw_input(self, prompt=''): """Forward raw_input to frontends Raises ------ StdinNotImplementedError if active frontend doesn't support stdin. """ if not self._allow_stdin: raise StdinNotImplementedError( "raw_input was called, but this frontend does not support input requests." ) return self._input_request( str(prompt), self._parent_ident["shell"], self.get_parent("shell"), password=False, ) def _input_request(self, prompt, ident, parent, password=False): # Flush output before making the request. sys.stderr.flush() sys.stdout.flush() # flush the stdin socket, to purge stale replies while True: try: self.stdin_socket.recv_multipart(zmq.NOBLOCK) except zmq.ZMQError as e: if e.errno == zmq.EAGAIN: break else: raise # Send the input request. content = json_clean(dict(prompt=prompt, password=password)) self.session.send(self.stdin_socket, 'input_request', content, parent, ident=ident) # Await a response. while True: try: # Use polling with select() so KeyboardInterrupts can get # through; doing a blocking recv() means stdin reads are # uninterruptible on Windows. We need a timeout because # zmq.select() is also uninterruptible, but at least this # way reads get noticed immediately and KeyboardInterrupts # get noticed fairly quickly by human response time standards. 
rlist, _, xlist = zmq.select([self.stdin_socket], [], [self.stdin_socket], 0.01) if rlist or xlist: ident, reply = self.session.recv(self.stdin_socket) if (ident, reply) != (None, None): break except KeyboardInterrupt: # re-raise KeyboardInterrupt, to truncate traceback raise KeyboardInterrupt("Interrupted by user") from None except Exception: self.log.warning("Invalid Message:", exc_info=True) try: value = reply["content"]["value"] except Exception: self.log.error("Bad input_reply: %s", parent) value = '' if value == '\x04': # EOF raise EOFError return value def _at_shutdown(self): """Actions taken at shutdown by the kernel, called by python's atexit. """ if self._shutdown_message is not None: self.session.send(self.iopub_socket, self._shutdown_message, ident=self._topic('shutdown')) self.log.debug("%s", self._shutdown_message) self.control_stream.flush(zmq.POLLOUT)
__author__ = 'zhangxa'

import sys

# make the project root importable; the path is relative to the working directory
sys.path.append("../..")

from app.application import Application
from workers.worker import Worker
from tornado.queues import Queue

# Each route is (url regex, dotted handler path, keyword arguments dict).
_URL_ROUTES = [
    (r"^http://www.baidu.com.*$", "urlHandler.urlHandler.UrlSeekHandler", {"a": 10, "b": 3}),
    (r"^http://www.jianshu.com/([0-9]+)/([0-9])+", "urlHandler.urlHandler.UrlBaseHandler", {"a": 3}),
]

app = Application(_URL_ROUTES)


def _run_demo():
    """Seed a queue with one URL, build a Worker and look up a handler for a sample URL."""
    url_queue = Queue()
    url_queue.put("http://www.jianshu.com")
    demo_worker = Worker(app, url_queue)
    # the lookup result is not used; only the worker itself is printed
    demo_worker._find_url_handler("http://www.jianshu.com/1234/4")
    print(demo_worker)


if __name__ == "__main__":
    _run_demo()
def get_file_list(account, **kwargs):
    """Collect the files of a OneDrive account whose type is in VALID_FILETYPES.

    One search URL per entry of ``VALID_FILETYPES`` is queued; every fetched
    page may queue its ``@odata.nextLink`` follow-up.  At most
    ``FETCH_CONCURRENCY`` fetches run at a time.

    This is a generator-based coroutine body (it uses ``yield``).
    NOTE(review): no ``gen.coroutine`` decorator is visible on this function
    here -- confirm how callers drive it.

    :param account: object providing ``get_request(url)`` and ``_id``
    :param kwargs: accepted but not used
    :return: list of ``{"title": ..., "value": ...}`` dicts sorted by title
    :raises: whatever ``queue.join`` raises when ``MAXIMUM_REQ_TIME`` seconds
        elapse; the timeout is not handled here
    """
    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()
    data = []
    ids = set()

    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                # already being (or been) fetched; the finally clause still
                # marks the queue item done and frees the semaphore slot
                return
            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
            response_data = json.loads(response.body.decode("utf-8"))

            # queue the follow-up page, if the response links to one
            url = response_data.get("@odata.nextLink", None)
            if url is not None:
                queue.put(url)

            for file in response_data.get("value", []):
                # last four characters minus dots, so both ".ext" and
                # four-letter extensions are compared against the valid types
                if file["name"][-4:].strip(".").lower() in VALID_FILETYPES:
                    if file["id"] not in ids:
                        ids.add(file["id"])
                        data.append(
                            {
                                "title": file["parentReference"]["path"].split(":")[1].lstrip("/")
                                + "/"
                                + file["name"],
                                "value": file["id"],
                            }
                        )
            app_log.info("Page {} completed".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        # NOTE(review): this loop never exits; once the queue drains the
        # spawned fetch_url coroutines stay parked on queue.get().
        while True:
            yield sem.acquire()
            fetch_url()

    app_log.info("Gathering filelist for account {}".format(account._id))
    for file_type in VALID_FILETYPES:
        # NOTE(review): the original did ``".".join([file_type])``, which is a
        # no-op on a one-element list; a leading dot may have been intended.
        url = "https://api.onedrive.com/v1.0/drive/root/view.search?top=1000&select=parentReference,name,id,size&q={}".format(
            file_type
        )
        queue.put(url)

    # start our concurrency worker
    worker()

    # wait until we're done
    yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    app_log.info("Finished list retrieval. Found {} items.".format(len(data)))
    return sorted(data, key=lambda f: f["title"])
def get_data(cls, account, source_filter, limit=100, skip=0):
    """
    Gathers commit information from GH
    GET https://api.github.com/repos/:owner/:repo/commits
    Header: Accept: application/vnd.github.v3+json

    The commit list is paged first, then each commit URL is fetched with at
    most ``FETCH_CONCURRENCY`` requests in flight.  The result is streamed
    through ``cls.write`` as a JSON array of per-commit summaries.

    This is a generator-based coroutine body (it uses ``yield``).

    :param cls: object whose ``write`` method receives the output
    :param account: enabled account providing ``get_request`` and ``_id``
    :param source_filter: raw filter, wrapped in GitHubRepositoryDateFilter
    :param limit: maximum number of commits, or None for no limit
    :param skip: accepted but not used
    :raises ValueError: account missing/disabled, or no repository in the filter
    :raises HTTPError: 413 when more than 500 commits are queued
    """
    if not account or not account.enabled:
        raise ValueError('cannot gather information without a valid account')
    client = AsyncHTTPClient()

    source_filter = GitHubRepositoryDateFilter(source_filter)
    if source_filter.repository is None:
        raise ValueError('required parameter projects missing')

    default_headers = {"Content-Type": "application/json", "Accept": "application/vnd.github.v3+json"}

    # first we grab our list of commits
    uri = "https://api.github.com/repos/{}/commits".format(source_filter.repository)
    qs = source_filter.get_qs()
    if qs != '':
        uri = uri + '?' + qs
    app_log.info("Starting retrieval of commit list for account {}".format(account._id))

    # a second '?' would corrupt the query string when the filter already
    # contributed one, so pick the separator from what the URI contains
    separator = '&' if '?' in uri else '?'
    if limit is not None and limit <= 100:
        # we can handle our limit right here
        uri += "{}per_page={}".format(separator, limit)
    elif limit is None:
        uri += "{}per_page=100".format(separator)  # maximum number per page for GitHub API

    taken = 0

    queue = Queue()
    sem = BoundedSemaphore(FETCH_CONCURRENCY)
    done, working = set(), set()

    while uri is not None:
        app_log.info(
            "({}) Retrieving next page, received {} commits thus far".format(account._id, taken))
        req = account.get_request(uri, headers=default_headers)
        response = yield client.fetch(req)

        page_data = json.loads(response.body.decode('utf-8'))
        taken += len(page_data)
        for item in page_data:
            queue.put(item.get('url', None))

        # checked right after enqueuing so the guard also applies to the
        # final page, before the loop is left
        if queue.qsize() > 500:
            raise HTTPError(413, 'too many commits')

        if limit is None or taken < limit:
            # parse the Link header from GitHub (https://developer.github.com/v3/#pagination)
            links = parse_link_header(response.headers.get('Link', ''))
            uri = links.get('next', None)
        else:
            break

    app_log.info("({}) Commit list retrieved, fetching info for {} commits".format(account._id, taken))

    # open our list
    cls.write('[')

    # our worker to actually fetch the info
    @gen.coroutine
    def fetch_url():
        current_url = yield queue.get()
        try:
            if current_url in working:
                # duplicate URL; the finally clause still settles the queue
                return
            page_no = len(working)
            app_log.info("Fetching page {}".format(page_no))
            working.add(current_url)
            req = account.get_request(current_url)
            client = AsyncHTTPClient()
            response = yield client.fetch(req)
            response_data = json.loads(response.body.decode('utf-8'))
            files = response_data['files']
            obj = {
                'date': response_data['commit']['author']['date'],
                'author': response_data['commit']['author']['name'],
                'added_files': sum(1 for file in files if file['status'] == 'added'),
                'deleted_files': sum(1 for file in files if file['status'] == 'deleted'),
                'modified_files': sum(1 for file in files if file['status'] == 'modified'),
                'additions': response_data['stats']['additions'],
                'deletions': response_data['stats']['deletions']
            }
            # separator, write and bookkeeping happen without yielding in
            # between, so entries cannot interleave
            if done:
                cls.write(',')
            cls.write(json.dumps(obj))
            done.add(current_url)
            app_log.info("Page {} downloaded".format(page_no))
        finally:
            queue.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        # NOTE(review): this loop never exits; once the queue drains the
        # spawned fetch_url coroutines stay parked on queue.get().
        while True:
            yield sem.acquire()
            fetch_url()

    # start our concurrency worker
    worker()
    try:
        # wait until we're done
        yield queue.join(timeout=timedelta(seconds=MAXIMUM_REQ_TIME))
    except gen.TimeoutError:
        app_log.warning("Request exceeds maximum time, cutting response short")
    finally:
        # close our list
        cls.write(']')
    app_log.info("Finished retrieving commits for {}".format(account._id))