def on_start(self):
    super(VizTransformProcForMatplotlibGraphs, self).on_start()
    #assert len(self.streams)==1
    self.initDataFlag = True
    self.graph_data = {}  # Stores a dictionary of variables : [List of values]

    # Need some clients
    self.rr_cli = ResourceRegistryServiceProcessClient(process=self, node=self.container.node)
    self.pubsub_cli = PubsubManagementServiceClient(node=self.container.node)

    # extract the various parameters passed to the transform process
    self.out_stream_id = self.CFG.get('process').get('publish_streams').get('visualization_service_submit_stream_id')

    # Create a publisher on the output stream
    #stream_route = self.pubsub_cli.register_producer(stream_id=self.out_stream_id)
    out_stream_pub_registrar = StreamPublisherRegistrar(process=self.container, node=self.container.node)
    self.out_stream_pub = out_stream_pub_registrar.create_publisher(stream_id=self.out_stream_id)

    self.data_product_id = self.CFG.get('data_product_id')
    self.stream_def_id = self.CFG.get("stream_def_id")
    self.stream_def = self.rr_cli.read(self.stream_def_id)

    # Start the thread responsible for keeping track of time and generating graphs
    # Mutex for ensuring proper concurrent communications between threads
    self.lock = RLock()
    self.rendering_proc = Greenlet(self.rendering_thread)
    self.rendering_proc.start()

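Every snippet in this section follows the same basic pattern: a gevent RLock serializes access to state that several greenlets share. A minimal, self-contained sketch of that pattern is below; it uses gevent.lock.RLock, the current home of the class that older snippets here import from gevent.coros, and the Counter class and greenlet counts are illustrative only.

import gevent
from gevent.lock import RLock  # older gevent exposed this as gevent.coros.RLock

class Counter(object):
    """Toy shared resource: increments must not interleave."""
    def __init__(self):
        self.value = 0
        self.lock = RLock()  # re-entrant: the owning greenlet may acquire it again

    def increment(self, n=1000):
        for _ in range(n):
            with self.lock:       # serialize access across greenlets
                current = self.value
                gevent.sleep(0)   # force a greenlet switch inside the critical section
                self.value = current + 1

counter = Counter()
workers = [gevent.spawn(counter.increment) for _ in range(5)]
gevent.joinall(workers)
print(counter.value)  # 5000; without the lock the interleaving would lose updates
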
def __init__(self, filename, mode='r', buffer=16 << 10):
    modes = os.O_LARGEFILE | os.O_CREAT
    self._offset = 0
    self._buffer_size = buffer
    if buffer:
        self._buffer_lock = RLock()
    self._read = False
    self._write = False
    self._read_buf = None
    self._write_buf = None
    self._eof = False  # Optimization to limit calls
    self._append = False  # Append Mode writes ignore offset
    self._stay_alive = gevent.spawn(_keep_awake)
    if mode.startswith('r') or '+' in mode:
        self._read = True
        self._read_buf = bytearray()
        if '+' not in mode:
            modes |= os.O_RDONLY
    if mode.startswith('w') or mode.startswith('a') or '+' in mode:
        if mode.startswith('w'):
            modes |= os.O_TRUNC
        self._write = True
        self._write_buf = bytearray()
        self._flush = False
        if '+' not in mode:
            modes |= os.O_WRONLY
    if '+' in mode:
        modes |= os.O_RDWR
    if mode.startswith('a'):
        modes |= os.O_APPEND
        self._append = True
    self._fd = os.open(filename, modes)

class GeventScheduler(Scheduler):
    """A scheduler that dispatches tasks via Gevent"""

    def __init__(self):
        Scheduler.__init__(self)
        from gevent.coros import RLock
        self._lock = RLock()

    def start(self):
        """Spawn a greenlet for the main event loop."""
        self.greenlet = gevent.spawn(self._run)

    def stop(self):
        """Stop the scheduler and wait for the thread to finish."""
        Scheduler.stop(self)
        try:
            self.greenlet.kill(block=False)
        except AttributeError:
            pass

    def _acquire_lock(self):
        """Lock the thread's task queue."""
        self._lock.acquire()

    def _release_lock(self):
        """Release the lock on the thread's task queue."""
        self._lock.release()

class Deliverator(object):
    """
    The deliverator holds the channels that will be used to deliver
    the replies that come over a resilient connection
    """

    def __init__(self):
        self._log = logging.getLogger("Deliverator")
        self._active_requests = dict()
        self._lock = RLock()

    def add_request(self, message_id):
        """
        Add a message_id, return a channel (gevent.queue.Queue)

        When the web_server's pull server gets a reply for this message id,
        it will push the message into the queue. The caller can block on the
        queue, waiting for the reply.

        We can't use the zero size 'channel' queue because the web server
        moves on after 8 of 10 retrieves and nobody is waiting on the last
        two. So we use a size of one, and it is the caller's responsibility
        to clean up unused channels.
        """
        channel = Queue(maxsize=1)

        self._lock.acquire()
        try:
            if message_id in self._active_requests:
                raise ValueError("Duplicate request '%s'" % (message_id, ))
            self._active_requests[message_id] = channel
        finally:
            self._lock.release()

        return channel

    def deliver_reply(self, message):
        """
        Deliver the reply message over the channel for its message-id
        and discard the channel
        """
        self._lock.acquire()
        try:
            channel = self._active_requests.pop(message.control["message-id"])
        except KeyError:
            channel = None
        finally:
            self._lock.release()

        if channel is None:
            self._log.error("undeliverable message %s" % (message.control, ))
        else:
            channel.put((message.control, message.body, ))

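A hedged usage sketch for the Deliverator above: FakeMessage and its fields are invented for illustration, and the imports are guesses at what the class relies on (gevent.queue.Queue bound as Queue plus a gevent RLock).

import logging
import gevent
from gevent.queue import Queue   # the Queue used by add_request
from gevent.lock import RLock    # stands in for the RLock the class expects

logging.basicConfig(level=logging.DEBUG)

class FakeMessage(object):
    """Illustrative stand-in for whatever the pull server receives."""
    def __init__(self, message_id, body):
        self.control = {"message-id": message_id}
        self.body = body

deliverator = Deliverator()
channel = deliverator.add_request("request-1")

# Some other greenlet eventually receives the reply and hands it over
gevent.spawn_later(0.1, deliverator.deliver_reply, FakeMessage("request-1", b"payload"))

control, body = channel.get(timeout=5)   # blocks until the reply is delivered
print('%s %r' % (control["message-id"], body))
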
class ThreadSafeFSM(InstrumentFSM):
    def __init__(self, states, events, enter_event, exit_event):
        self._lock = RLock()
        super(ThreadSafeFSM, self).__init__(states, events, enter_event, exit_event)

    def on_event(self, event, *args, **kwargs):
        with self._lock:
            return super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)

    def on_event_if_free(self, event, *args, **kwargs):
        if not self._lock.acquire(blocking=False):
            raise FSMLockedError
        try:
            retval = super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)
        finally:
            self._lock.release()
        return retval

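A sketch of the difference between on_event and on_event_if_free. InstrumentFSM and FSMLockedError come from the surrounding instrument-agent framework and are not shown in this section, so minimal stubs stand in for them here; paste the ThreadSafeFSM class above between the stubs and the usage code.

import gevent
from gevent.lock import RLock

class FSMLockedError(Exception):
    """Stub: raised when the FSM is busy handling another event."""

class InstrumentFSM(object):
    """Stub base class: a real FSM would dispatch per-state handlers."""
    def __init__(self, states, events, enter_event, exit_event):
        pass
    def on_event(self, event, *args, **kwargs):
        gevent.sleep(0.2)          # pretend the handler does some work
        return 'handled %s' % event

# (paste the ThreadSafeFSM definition from above here)

fsm = ThreadSafeFSM(states=None, events=None, enter_event=None, exit_event=None)

busy = gevent.spawn(fsm.on_event, 'EVENT_A')   # holds the lock for ~0.2s
gevent.sleep(0.05)
try:
    fsm.on_event_if_free('EVENT_B')            # refuses to wait for the lock
except FSMLockedError:
    print('FSM busy, event rejected')
print(busy.get())                              # 'handled EVENT_A'
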
def on_start(self):  #pragma no cover
    #--------------------------------------------------------------------------------
    # Explicit on_start
    #--------------------------------------------------------------------------------

    # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
    # We want explicit management of the thread and subscriber object for ingestion
    TransformStreamProcess.on_start(self)

    self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
    self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
    self.thread_lock = RLock()

    #--------------------------------------------------------------------------------
    # Normal on_start after this point
    #--------------------------------------------------------------------------------

    BaseIngestionWorker.on_start(self)
    self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
    self.add_endpoint(self._rpc_server)

    self.event_publisher = EventPublisher(OT.DatasetModified)
    self.stored_value_manager = StoredValueManager(self.container)

    self.lookup_docs = self.CFG.get_safe('process.lookup_docs', [])
    self.input_product = self.CFG.get_safe('process.input_product', '')
    self.new_lookups = Queue()
    self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
    self.add_endpoint(self.lookup_monitor)

    self.connection_id = ''
    self.connection_index = None

    self.start_listener()

def on_start(self):  #pragma no cover
    #super(TransformWorker,self).on_start()
    #--------------------------------------------------------------------------------
    # Explicit on_start
    #--------------------------------------------------------------------------------

    # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
    # We want explicit management of the thread and subscriber object for ingestion
    #todo: check how to manage multi queue subscription (transform scenario 3)
    TransformStreamProcess.on_start(self)

    #todo: can the subscription be changed or updated when new dataprocesses are added ?
    self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
    self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
    self.thread_lock = RLock()
    self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
    self.add_endpoint(self._rpc_server)

    self.start_listener()

    #todo: determine and publish appropriate set of status events
    self.event_publisher = EventPublisher(OT.DataProcessStatusEvent)

def __init__(self, canvas):
    self.canvas = canvas
    self.socket = None
    self.connect_ts = time.time()
    # And this is used to limit clients to X messages per tick
    # We start at 0 (instead of x) to add a reconnect-penalty.
    self.lock = RLock()

def __init__(self, fileserver_ip=None, fileserver_port=None):
    self.privatekeys = {}  # Onion: Privatekey
    self.site_onions = {}  # Site address: Onion
    self.tor_exe = "tools/tor/tor.exe"
    self.has_meek_bridges = os.path.isfile("tools/tor/PluggableTransports/meek-client.exe")
    self.tor_process = None
    self.log = logging.getLogger("TorManager")
    self.start_onions = None
    self.conn = None
    self.lock = RLock()

    if config.tor == "disable":
        self.enabled = False
        self.start_onions = False
        self.setStatus("Disabled")
    else:
        self.enabled = True
        self.setStatus("Waiting")

    if fileserver_port:
        self.fileserver_port = fileserver_port
    else:
        self.fileserver_port = config.fileserver_port

    self.ip, self.port = config.tor_controller.split(":")
    self.port = int(self.port)

    self.proxy_ip, self.proxy_port = config.tor_proxy.split(":")
    self.proxy_port = int(self.proxy_port)

def __init__(self, socket, LAST_MESSAGES, rabbitcox):
    (ip, port) = socket.getpeername()
    self.logger = logging.getLogger('radiovisserver.stompserver.' + ip + '.' + str(port))

    self.socket = socket

    # Buffer for incoming data
    self.incomingData = ''

    # Topics the client subscribed to
    self.topics = []

    # Queue of messages
    self.queue = queue.Queue()

    # Lock used when sending frames
    self.lock = RLock()

    # Mapping channel -> id for subscriptions
    self.idsByChannels = {}

    # Mapping id -> channel for subscriptions
    self.channelsByIds = {}

    # Last messages
    self.LAST_MESSAGES = LAST_MESSAGES

    # RabbitCox
    self.rabbitcox = rabbitcox

    # Station id, if authenticated
    self.station_id = None

def __init__(self, server, ip, port, sock=None, target_onion=None, is_tracker_connection=False):
    self.sock = sock
    self.ip = ip
    self.port = port
    self.cert_pin = None
    if "#" in ip:
        self.ip, self.cert_pin = ip.split("#")
    self.target_onion = target_onion  # Requested onion address
    self.id = server.last_connection_id
    server.last_connection_id += 1
    self.protocol = "?"
    self.type = "?"

    if helper.isPrivateIp(self.ip) and self.ip not in config.ip_local:
        self.is_private_ip = True
    else:
        self.is_private_ip = False
    self.is_tracker_connection = is_tracker_connection

    self.server = server
    self.unpacker = None  # Stream incoming socket messages here
    self.req_id = 0  # Last request id
    self.handshake = {}  # Handshake info got from peer
    self.crypt = None  # Connection encryption method
    self.sock_wrapped = False  # Socket wrapped to encryption

    self.connected = False
    self.event_connected = gevent.event.AsyncResult()  # Resolves when the handshake is received
    self.closed = False

    # Stats
    self.start_time = time.time()
    self.last_recv_time = 0
    self.last_message_time = 0
    self.last_send_time = 0
    self.last_sent_time = 0
    self.incomplete_buff_recv = 0
    self.bytes_recv = 0
    self.bytes_sent = 0
    self.last_ping_delay = None
    self.last_req_time = 0
    self.last_cmd_sent = None
    self.last_cmd_recv = None
    self.bad_actions = 0
    self.sites = 0
    self.cpu_time = 0.0
    self.send_lock = RLock()

    self.name = None
    self.updateName()

    self.waiting_requests = {}  # Waiting sent requests
    self.waiting_streams = {}  # Waiting response file streams

class RedisConnWrapper(object):
    _db = {}
    dumb = True
    db_lock = RLock()

    def lock__db(func):
        def gen(self, *args, **kwargs):
            # Hold the lock for the duration of the call and hand back
            # whatever the wrapped method returns.
            self.db_lock.acquire()
            try:
                return func(self, *args, **kwargs)
            finally:
                self.db_lock.release()
        return gen

    def get(self, key):
        return self._db.get(key)

    @lock__db
    def set(self, key, value):
        self._db[key] = value

    @lock__db
    def incr(self, key):
        if self._db.get(key):
            self._db[key] += 1
        else:
            self._db[key] = 1

    def smembers(self, set_key):
        return self._db.get(set_key)

    @lock__db
    def spop(self, set_key):
        if type(self._db.get(set_key)) != set:
            return None
        return self._db[set_key].pop()

    @lock__db
    def srem(self, set_key, value):
        if type(self._db.get(set_key)) != set:
            return False
        else:
            try:
                self._db[set_key].remove(value)
                return True
            except KeyError:
                return False

    @lock__db
    def sadd(self, set_key, value):
        if type(self._db.get(set_key)) != set:
            self._db[set_key] = set()
        self._db[set_key].add(value)

    def __getattr__(self, name):
        raise RedisImportError(
            'You use dumb redis storage that doesn\'t '
            'support this function,\nyou should install redis-server '
            'and redis-py')

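A short usage sketch for the in-memory fallback above. RedisImportError is not defined in this section, so a stub stands in for it; the key names and counts are illustrative.

import gevent
from gevent.lock import RLock

class RedisImportError(Exception):
    """Stub for the error raised on unsupported operations."""

# (paste the RedisConnWrapper definition from above here)

store = RedisConnWrapper()
store.set('hits', 0)

def worker():
    for _ in range(100):
        store.incr('hits')                   # runs under db_lock
        store.sadd('seen', gevent.getcurrent())

gevent.joinall([gevent.spawn(worker) for _ in range(3)])
print(store.get('hits'))             # 300
print(len(store.smembers('seen')))   # 3 distinct greenlets
try:
    store.hset('h', 'k', 'v')        # not implemented by the dumb wrapper
except RedisImportError as e:
    print('unsupported: %s' % e)
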
def publish(self, topic, data):
    lock = self.lock
    if not lock:
        lock = RLock()
        self.lock = lock
    with lock:
        return RedisInterconnect.publish(self, topic, data)

def __init__(self, pa):
    """
    Initializes all status parameters according to the immediate children
    of this platform and starts the related subscribers.

    The PlatformAgent must have been already initialized to properly
    access the handled elements.

    Note that the number of subscribers and entries for other related
    status information will increase and decrease as we get device_added
    and device_removed events.

    @param pa   The associated platform agent object to access the
                elements handled by this helper.
    """
    assert pa._platform_id is not None
    assert pa._children_resource_ids is not None

    self._agent = pa
    self._platform_id = pa._platform_id
    self.resource_id = pa.resource_id
    self._children_resource_ids = pa._children_resource_ids
    self._event_publisher = pa._event_publisher
    self.aparam_child_agg_status = pa.aparam_child_agg_status
    self.aparam_aggstatus = pa.aparam_aggstatus
    self.aparam_rollup_status = pa.aparam_rollup_status

    # All EventSubscribers created: {origin: EventSubscriber, ...}
    self._event_subscribers = {}

    # {pid: origin ...} the origin (resource_id) of each PID used in
    # ProcessLifecycleEvent subscribers
    self._rids = {}

    # set to False by a call to destroy
    self._active = True

    # RLock to synchronize access to the various mutable variables here.
    self._lock = RLock()

    # init statuses, and subscribers for the given children
    with self._lock:
        # initialize my own statuses:
        for status_name in AggregateStatusType._str_map.keys():
            self.aparam_aggstatus[status_name] = DeviceStatusType.STATUS_UNKNOWN
            self.aparam_rollup_status[status_name] = DeviceStatusType.STATUS_UNKNOWN

        # do status preparations for the immediate children
        for origin in pa._children_resource_ids:
            self._prepare_new_child(origin)

    # diagnostics report on demand:
    self._diag_sub = None
    self._start_diagnostics_subscriber()

class ElasticConnection(object):
    if _use_gevent:
        session = None
        session_lock = RLock()

    def __init__(self, timeout=None, **params):
        self.status_code = 0
        self.timeout = timeout
        self.encoding = None
        self.headers = {'Content-Type': 'Application/json; charset=utf-8'}
        if params.has_key('encoding'):
            self.encoding = 'utf8'
            del params['encoding']
        if _use_gevent:
            if ElasticConnection.session is None:
                ElasticConnection.session_lock.acquire()
                ElasticConnection.session = requests.Session(**params)
                ElasticConnection.session_lock.release()
        else:
            self.session = requests.Session(**params)

    def get(self, url):
        try:
            response = self.session.get(url, headers=self.headers, timeout=self.timeout)
        except requests.ConnectionError as e:
            self.status_code = 0
            return {'error': e.message}
        self.status_code = response.status_code
        return json.loads(response.content, encoding=self.encoding)

    def post(self, url, data):
        body = json.dumps(data)
        try:
            response = self.session.post(url, data=body, headers=self.headers, timeout=self.timeout)
        except requests.ConnectionError as e:
            self.status_code = 0
            return {'error': e.message}
        self.status_code = response.status_code
        return json.loads(response.content, encoding=self.encoding)

    def put(self, url, data):
        body = json.dumps(data)
        try:
            response = self.session.put(url, data=body, headers=self.headers, timeout=self.timeout)
        except requests.ConnectionError as e:
            self.status_code = 0
            return {'error': e.message}
        self.status_code = response.status_code
        return json.loads(response.content, encoding=self.encoding)

    def delete(self, url):
        try:
            response = self.session.delete(url, headers=self.headers, timeout=self.timeout)
        except requests.ConnectionError as e:
            self.status_code = 0
            return {'error': e.message}
        self.status_code = response.status_code
        return json.loads(response.content, encoding=self.encoding)

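The gevent branch above lazily creates one shared requests.Session under a class-level RLock. A generic sketch of that initialization pattern follows, with invented names and with the None check repeated inside the lock so two racing greenlets cannot both build a session; it is not the ElasticConnection API itself.

import requests
from gevent.lock import RLock

class SharedClient(object):
    _session = None
    _session_lock = RLock()

    @classmethod
    def session(cls):
        # Double-checked lazy init: cheap check first, re-check under the lock.
        if cls._session is None:
            with cls._session_lock:
                if cls._session is None:
                    cls._session = requests.Session()
        return cls._session
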
def __init__(self, params):
    self.total = 0
    self.processed = 0
    from gevent.coros import RLock
    self.lock = RLock()
    for item in params:
        for test_stage in item['test_stages']:
            for test_name in item['test_stages'][test_stage]:
                self.total += 1

def __init__(self, fileserver_ip=None, fileserver_port=None):
    self.privatekeys = {}  # Onion: Privatekey
    self.site_onions = {}  # Site address: Onion
    self.tor_exe = "tools/tor/tor.exe"
    self.tor_process = None
    self.log = logging.getLogger("TorManager")
    self.start_onions = None
    self.conn = None
    #self.trackers = []
    #self.trackers_key = {}
    self.lock = RLock()

    if config.tor == "disable":
        self.enabled = False
        self.start_onions = False
        self.setStatus("Disabled")
    else:
        self.enabled = True
        self.setStatus("Waiting")

    if fileserver_port:
        self.fileserver_port = fileserver_port
    else:
        self.fileserver_port = config.fileserver_port

    self.ip, self.port = config.tor_controller.split(":")
    self.port = int(self.port)

    self.proxy_ip, self.proxy_port = config.tor_proxy.split(":")
    self.proxy_port = int(self.proxy_port)

    # Test proxy port
    if config.tor != "disable":
        try:
            assert self.connect(), "No connection"
            self.log.debug("Tor proxy port %s check ok" % config.tor_proxy)
        except Exception, err:
            self.log.info("Starting self-bundled Tor, due to Tor proxy port %s check error: %s" % (config.tor_proxy, err))
            self.enabled = False
            # Change to self-bundled Tor ports
            from lib.PySocks import socks
            self.port = 49051
            self.proxy_port = 49050
            socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", self.proxy_port)
            if os.path.isfile(self.tor_exe):  # Already downloaded: sync mode
                self.startTor()
            else:  # Not downloaded yet: async mode
                gevent.spawn(self.startTor)

def __init__(self, filter="ALL", eventjson=True, pool_size=5000, trace=False): self._is_eventjson = eventjson # Callbacks for reading events and sending responses. self._response_callbacks = { 'api/response': self._api_response, 'command/reply': self._command_reply, 'text/disconnect-notice': self._disconnect_notice, 'text/event-json': self._event_json, 'text/event-plain': self._event_plain } # Closing state flag self._closing_state = False # Default event filter. self._filter = filter # Commands pool list self._commands_pool = [] # Lock to force eventsocket commands to be sequential. self._lock = RLock() # Sets connected to False. self.connected = False # Sets greenlet handler to None self._g_handler = None # Build events callbacks dict self._event_callbacks = {} for meth in dir(self): if meth[:3] == 'on_': event_name = meth[3:].upper() func = getattr(self, meth, None) if func: self._event_callbacks[event_name] = func unbound = getattr(self, 'unbound_event', None) self._event_callbacks['unbound_event'] = unbound # Set greenlet spawner if pool_size > 0: self.pool = gevent.pool.Pool(pool_size) self._spawn = self.pool.spawn else: self._spawn = gevent.spawn_raw # set tracer try: logger = self.log except AttributeError: logger = None if logger and trace is True: self.trace = self._trace else: self.trace = self._notrace
def __new__(cls, *args, **kw):
    self = object.__new__(cls)
    object.__setattr__(self, '_local__args', (args, kw))
    object.__setattr__(self, '_local__lock', RLock())
    dicts = WeakKeyDictionary()
    object.__setattr__(self, '_local__dicts', dicts)

    if (args or kw) and (cls.__init__ is object.__init__):
        raise TypeError("Initialization arguments are not supported")

    # We need to create the greenlet dict in anticipation of
    # __init__ being called, to make sure we don't call it again ourselves.
    dict = object.__getattribute__(self, '__dict__')
    dicts[getcurrent()] = dict
    return self

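This __new__ belongs to a greenlet-local storage class; gevent ships a finished implementation as gevent.local.local. A quick demonstration of the per-greenlet behavior it sets up, with made-up attribute and label names:

import gevent
from gevent.local import local

store = local()
store.name = 'main'

def worker(label):
    store.name = label   # each greenlet writes to its own copy
    gevent.sleep(0)
    return store.name

greenlets = [gevent.spawn(worker, 'g%d' % i) for i in range(3)]
gevent.joinall(greenlets)
print([g.value for g in greenlets])  # ['g0', 'g1', 'g2']
print(store.name)                    # 'main': untouched by the workers
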
def __init__(self):
    self.logger = logging.getLogger('radiovisserver.watchdog')

    # The lock to modify the list of channels
    self.channels_lock = RLock()

    # List of channels
    self.channels = []

    # Last message, by channel
    self.channels_last_message = {}

    # List of ids, by channel
    self.id_by_channel = {}

    # Init lists
    self.get_channels()

def __init__(self, platform_id, attr_info, get_attribute_values, notify_driver_event):
    """
    @param platform_id Platform ID
    @param attr_info Attribute information
    @param get_attribute_values Function to retrieve attribute values for
           the specific platform, called like this:
           get_attribute_values([attr_id], from_time)
           for each attr_id in the platform.
    @param notify_driver_event Callback to notify whenever a value is
           retrieved.
    """
    self._platform_id = platform_id
    self._attr_info = attr_info
    self._get_attribute_values = get_attribute_values
    self._notify_driver_event = notify_driver_event

    log.debug("%r: PlatformResourceMonitor instance created", self._platform_id)

    # _monitors: dict { rate_secs: ResourceMonitor }
    self._monitors = {}

    # buffers used by the monitoring greenlets to put retrieved data in
    # and by the publisher greenlet to process that data to construct
    # aggregated AttributeValueDriverEvent objects that the platform
    # agent finally process to create and publish granules.
    self._buffers = {}

    # to synchronize access to the buffers
    self._lock = RLock()

    # publishing rate in seconds, set by _set_publisher_rate
    self._pub_rate = None
    self._publisher_active = False

    # for debugging purposes
    self._pp = pprint.PrettyPrinter()

def __init__(self, *args, **kwargs):
    super(ReplayProcess, self).__init__(*args, **kwargs)
    self.lock = RLock()

class ReplayProcess(BaseReplayProcess): process_type = 'standalone' def __init__(self, *args, **kwargs): super(ReplayProcess, self).__init__(*args, **kwargs) self.lock = RLock() def on_start(self): self.query = self.CFG.get_safe('process.query', {}) self.delivery_format = self.CFG.get_safe('process.delivery_format', {}) self.datastore_name = self.CFG.get_safe('process.datastore_name', 'dm_datastore') definition_id = self.delivery_format.get('definition_id') rrsc = ResourceRegistryServiceProcessClient(process=self, node=self.container.node) definition = rrsc.read(definition_id) self.definition = definition.container self.fields = self.delivery_format.get('fields', None) self.view_name = self.CFG.get_safe('process.view_name', 'datasets/dataset_by_id') self.key_id = self.CFG.get_safe('process.key_id') self.stream_id = self.CFG.get_safe('process.publish_streams.output') if not self.stream_id: raise Inconsistent( 'The replay process requires a stream id. Invalid configuration!' ) self.data_stream_id = self.definition.data_stream_id self.encoding_id = self.definition.identifiables[ self.data_stream_id].encoding_id self.element_type_id = self.definition.identifiables[ self.data_stream_id].element_type_id self.element_count_id = self.definition.identifiables[ self.data_stream_id].element_count_id self.data_record_id = self.definition.identifiables[ self.element_type_id].data_record_id self.field_ids = self.definition.identifiables[ self.data_record_id].field_ids self.domain_ids = self.definition.identifiables[ self.data_record_id].domain_ids self.time_id = self.definition.identifiables[ self.domain_ids[0]].temporal_coordinate_vector_id def execute_replay(self): ''' @brief Spawns a greenlet to take care of the query and work ''' if not hasattr(self, 'output'): raise Inconsistent( 'The replay process requires an output stream publisher named output. Invalid configuration!' 
) datastore_name = self.datastore_name key_id = self.key_id view_name = self.view_name opts = { 'start_key': [key_id, 0], 'end_key': [key_id, 2], 'include_docs': True } g = Greenlet(self._query, datastore_name=datastore_name, view_name=view_name, opts=opts, callback=lambda results: self._publish_query(results)) g.start() def _query(self, datastore_name='dm_datastore', view_name='posts/posts_by_id', opts={}, callback=None): ''' @brief Makes the couch query and then callsback to publish @param datastore_name Name of the datastore @param view_name The name of the design view where the data is organized @param opts options to pass @param callback the content handler ''' db = self.container.datastore_manager.get_datastore( datastore_name, DataStore.DS_PROFILE.SCIDATA, self.CFG) ret = db.query_view(view_name=view_name, opts=opts) callback(ret) def _publish_query(self, results): ''' @brief Publishes the appropriate data based on the delivery format and data returned from query @param results The query results from the couch query ''' if results is None: log.info('No Results') return publish_queue = self._parse_results(results) for item in publish_queue: log.debug('Item in queue: %s' % type(item)) granule = self._merge(publish_queue) if not granule: return # no dataset if self.delivery_format.has_key('fields'): res = self.subset(granule, self.delivery_format['fields']) granule = res if self.delivery_format.has_key('time'): granule = self.time_subset(granule, self.delivery_format['time']) total_records = granule.identifiables[self.element_count_id].value granule.identifiables[self.element_count_id].constraint.intervals = [ [0, total_records - 1], ] if self.delivery_format.has_key('records'): assert isinstance(self.delivery_format['records'], int), 'delivery format is incorrectly formatted.' for chunk in self._records(granule, self.delivery_format['records']): self.lock.acquire() self.output.publish(chunk) self.lock.release() return self.lock.acquire() self.output.publish(granule) self.lock.release() def _parse_results(self, results): ''' @brief Switch-case logic for what packet types replay can handle and how to handle @param results List of results returned from couch view @return A queue of msgs parsed and formatted to be iterated through and published. ''' log.debug('called _parse_results') publish_queue = [] for result in results: assert ('doc' in result) packet = result['doc'] if isinstance(packet, BlogBase): packet.is_replay = True self.lock.acquire() self.output.publish(packet) self.lock.release() continue if isinstance(packet, StreamDefinitionContainer): continue # Ignore if isinstance(packet, StreamGranuleContainer): packet = self._parse_granule(packet) log.debug('Got packet') if packet: log.debug('Appending packet') publish_queue.append(packet) continue log.info('Unknown packet type in replay.') return publish_queue def _records(self, granule, n): ''' @brief Yields n records from a granule per iteration @param granule consisting of dataset @param n number of records to yield ''' bin_size = n record_count = granule.identifiables[self.element_count_id].value i = 0 while (i + bin_size) < record_count: log.debug('Yielding %d to %d', i, i + bin_size) yield self._slice(granule, slice(i, i + bin_size)) i += bin_size if i < record_count: yield self._slice(granule, slice(i, i + bin_size)) return def _pair_up(self, granule): ''' @brief Creates a list of tuples consisting of acquire_data friendly var_names and full values_paths @param granule consisting of full dataset. 
@return list of tuples ''' fields = self._list_data(self.definition, granule) pairs = list() for i in fields.values(): pairs.append((i.split('/').pop(), i)) return pairs def _find_vp(self, pairs, var_name): ''' @brief Determines the value path based on the acquire_data friendly var_name @param pairs List of tuples consisting of pair-wise var_name/value_path @param var_name Desired var_name @return Associated value_path ''' for pair in pairs: if var_name == pair[0]: return pair[1] return def _slice(self, granule, slice_): ''' @brief Creates a granule which is a slice of the granule parameter @param granule the superset @param slice_ The slice values for which to create the granule @return Crafted subset granule of the parameter granule. ''' retval = copy.deepcopy(granule) fields = self._list_data(self.definition, granule) record_count = slice_.stop - slice_.start assert record_count > 0, 'slice is malformed' pairs = self._pair_up(granule) var_names = list([i[0] for i in pairs]) # Get the var_names from the pairs log.debug('var_names: %s', var_names) file_path = self._get_hdf_from_string( granule.identifiables[self.data_stream_id].values) codec = HDFEncoder() vectors = acquire_data([file_path], var_names, record_count, slice_).next() for row, value in vectors.iteritems(): vp = self._find_vp(pairs, row) # Determine the range_id reverse dictionary lookup #@todo: improve this pattern for field, path in fields.iteritems(): if vp == path: range_id = field break bounds_id = retval.identifiables[range_id].bounds_id # Recalculate the bounds for this fields and update the granule range = value['range'] retval.identifiables[bounds_id].value_pair[0] = float(range[0]) retval.identifiables[bounds_id].value_pair[1] = float(range[1]) codec.add_hdf_dataset(vp, value['values']) record_count = len(value['values']) #----- DEBUGGING --------- log.debug('slice- row: %s', row) log.debug('slice- value_path: %s', vp) log.debug('slice- range_id: %s', range_id) log.debug('slice- bounds_id: %s', bounds_id) log.debug('slice- limits: %s', value['range']) #------------------------- retval.identifiables[self.element_count_id].value = record_count hdf_string = codec.encoder_close() self._patch_granule(retval, hdf_string) FileSystem.unlink(file_path) return retval def _parse_granule(self, granule): ''' @brief Ensures the granule is valid and gets some metadata from the granule for building the dataset @param granule raw granule straight from couch @return metadata in the granule as well as the granule itself if valid. 
''' granule.stream_resource_id = self.stream_id element_count_id = self.element_count_id encoding_id = self.encoding_id record_count = granule.identifiables[element_count_id].value sha1 = granule.identifiables[encoding_id].sha1 or None # If there are no records then this is not a proper granule if not (record_count > 0): log.debug('Granule had no record count discarding.') return None # No encoding, no packet if not encoding_id in granule.identifiables: log.debug('Granule had no encoding discarding.') return None if not sha1: log.debug('Granule had no sha1') return None filepath = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.hdf5') if not os.path.exists(filepath): log.debug('File with sha1 does not exist') return None return {'granule': granule, 'records': record_count, 'sha1': sha1} @staticmethod def merge_granule(definition, granule1, granule2): ''' @brief Merges two granules based on the definition @param definition Stream Definition @param granule1 First Granule @param granule2 Second Granule @return Returns granule1 which is then merged with granule2 and the file pair for indexing @description granule1 := granule1 U granule2 ''' import numpy as np assert isinstance( definition, StreamDefinitionContainer), 'object is not a definition.' assert isinstance(granule1, StreamGranuleContainer), 'object is not a granule.' encoding_id = DefinitionTree.get( definition, '%s.encoding_id' % definition.data_stream_id) if not granule2: pair = (granule1.identifiables['time_bounds'].value_pair[0], '%s.hdf5' % granule1.identifiables[encoding_id].sha1) return {'granule': granule1, 'files': [pair]} assert isinstance(granule2, StreamGranuleContainer), 'object is not a granule.' assert granule1.identifiables.has_key( 'time_bounds' ), 'object has no time bounds and therefore is invalid.' assert granule2.identifiables.has_key( 'time_bounds' ), 'object has no time bounds and therefore is invalid.' 
#------------------------------------------------------------------------------------- # First step is figure out where each granule belongs on the timeline # We do this with a tuple consisting of the point in the timeline and the filename # These will get stable sorted later #------------------------------------------------------------------------------------- pair1 = (granule1.identifiables['time_bounds'].value_pair[0], '%s.hdf5' % granule1.identifiables[encoding_id].sha1) pair2 = (granule2.identifiables['time_bounds'].value_pair[0], '%s.hdf5' % granule2.identifiables[encoding_id].sha1) files = [] if encoding_id in granule1.identifiables: if granule1.identifiables[encoding_id].sha1: files.append('%s.hdf5' % granule1.identifiables[encoding_id].sha1) if encoding_id in granule2.identifiables: if granule2.identifiables[encoding_id].sha1: files.append('%s.hdf5' % granule2.identifiables[encoding_id].sha1) element_count_id = DefinitionTree.get( definition, '%s.element_count_id' % definition.data_stream_id) record_count = 0 if element_count_id in granule1.identifiables: record_count += granule1.identifiables[element_count_id].value if element_count_id in granule2.identifiables: record_count += granule2.identifiables[element_count_id].value if not element_count_id in granule1.identifiables: granule1.identifiables[element_count_id] = CountElement() granule1.identifiables[element_count_id].value = record_count else: granule1.identifiables[element_count_id].value = record_count fields1 = ReplayProcess._list_data(definition, granule1) fields2 = ReplayProcess._list_data(definition, granule2) #@todo albeit counterintuitive an intersection is the only thing I can support merged_paths = {} for k, v in fields1.iteritems(): if fields2.has_key(k): merged_paths[k] = v for k, v in granule2.identifiables.iteritems(): # Switch(value): # Case Bounds: if isinstance(v, QuantityRangeElement): # If its not in granule1 just throw it in there if k not in granule1.identifiables: granule1.identifiables[k] = v else: bounds1 = granule1.identifiables[k].value_pair bounds2 = granule2.identifiables[k].value_pair bounds = np.append(bounds1, bounds2) granule1.identifiables[k].value_pair = [ np.nanmin(bounds), np.nanmax(bounds) ] if isinstance(v, RangeSet): #Including coordinate axis if merged_paths.has_key( k) and not granule1.identifiables.has_key(k): granule1.identifiables[k] = v # Copy it over # Now make sure granule1 doesnt have excess stuff del_list = [] for k, v in granule1.identifiables.iteritems(): if isinstance(v, RangeSet): if not merged_paths.has_key(k): del_list.append(k) for item in del_list: del granule1.identifiables[item] return {'granule': granule1, 'files': [pair1, pair2]} @staticmethod def _list_data(definition, granule): ''' @brief Lists all the fields in the granule based on the Stream Definition @param definition Stream Definition @param granule Stream Granule @return dict of field_id : values_path for each field_id that exists ''' from interface.objects import StreamDefinitionContainer, StreamGranuleContainer, RangeSet, CoordinateAxis assert isinstance( definition, StreamDefinitionContainer), 'object is not a definition.' assert isinstance( granule, StreamGranuleContainer ), 'object is not a granule. 
its a %s' % type(granule) retval = {} for key, value in granule.identifiables.iteritems(): if isinstance(value, RangeSet): values_path = value.values_path or definition.identifiables[ key].values_path retval[key] = values_path elif isinstance(value, CoordinateAxis): values_path = value.values_path or definition.identifiables[ key].values_path retval[key] = values_path return retval def _merge(self, msgs): ''' @brief Merges all the granules and datasets into one large dataset (Union) @param msgs raw granules from couch @return complete dataset @description n D := U [ msgs_i ] i=0 ''' granule = None file_list = list() count = len(msgs) used_vals = list() #------------------------------------------------------------------------------------- # Merge each granule to another granule one by one. # After each merge operation keep track of what files belong where on the timeline #------------------------------------------------------------------------------------- for i in xrange(count): if i == 0: granule = msgs[0]['granule'] psc = PointSupplementConstructor( point_definition=self.definition) res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None) granule = res['granule'] file_pair = res['files'] log.debug('file_pair: %s', file_pair) if file_pair[0] not in file_list and file_pair[0][ 0] not in used_vals: file_list.append(tuple(file_pair[0])) used_vals.append(file_pair[0][0]) else: res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule']) granule = res['granule'] file_pair = res['files'] log.debug('file_pair: %s', file_pair) if file_pair[0] not in file_list and file_pair[0][ 0] not in used_vals: file_list.append(tuple(file_pair[0])) used_vals.append(file_pair[0][0]) if file_pair[1] not in file_list and file_pair[1][ 0] not in used_vals: file_list.append(tuple(file_pair[1])) used_vals.append(file_pair[1][0]) if not granule: return log.debug('file_list: %s', file_list) #------------------------------------------------------------------------------------- # Order the lists using a stable sort from python (by the first value in the tuples # Then peel off just the file names # Then get the appropriate URL for the file using FileSystem #------------------------------------------------------------------------------------- file_list.sort() file_list = list(i[1] for i in file_list) file_list = list([ FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list ]) pairs = self._pair_up(granule) var_names = list([i[0] for i in pairs]) record_count = granule.identifiables[self.element_count_id].value codec = HDFEncoder() log.debug('acquire_data:') log.debug('\tfile_list: %s', file_list) log.debug('\tfields: %s', var_names) log.debug('\trecords: %s', record_count) data = acquire_data(file_list, var_names, record_count).next() for row, value in data.iteritems(): value_path = self._find_vp(pairs, row) codec.add_hdf_dataset(value_path, nparray=value['values']) #------------------------------------------------------------------------------------- # Debugging #------------------------------------------------------------------------------------- log.debug('row: %s', row) log.debug('value path: %s', value_path) log.debug('value: %s', value['values']) hdf_string = codec.encoder_close() self._patch_granule(granule, hdf_string) return granule def _patch_granule(self, granule, hdf_string): ''' @brief Adds the hdf_string and sha1 to the granule @param granule Stream Granule @param hdf_string string consisting of raw bytes 
from an hdf5 file ''' granule.identifiables[self.data_stream_id].values = hdf_string granule.identifiables[self.encoding_id].sha1 = hashlib.sha1( hdf_string).hexdigest().upper() def time_subset(self, granule, time_bounds): ''' @brief Obtains a subset of the granule dataset based on the specified time_bounds @param granule Dataset @param time_bounds tuple consisting of a lower and upper bound @return A subset of the granule's dataset based on the time boundaries. ''' assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.' lower = time_bounds[0] - 1 upper = time_bounds[1] granule = self._slice(granule, slice(lower, upper)) return granule def _get_time_index(self, granule, timeval): ''' @brief Obtains the index where a time's value is @param granule must be a complete dataset (hdf_string provided) @param timeval the vector value @return Index value for timeval or closest approx such that timeval is IN the subset ''' assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.' assert granule.identifiables[ self.data_stream_id].values, 'hdf_string is not provided.' hdf_string = granule.identifiables[self.data_stream_id].values file_path = self._get_hdf_from_string(hdf_string) #------------------------------------------------------------------------------------- # Determine the field_id for the temporal coordinate vector (aka time) #------------------------------------------------------------------------------------- time_field = self.definition.identifiables[ self.time_id].coordinate_ids[0] value_path = granule.identifiables[ time_field].values_path or self.definition.identifiables[ time_field].values_path record_count = granule.identifiables[self.element_count_id].value #------------------------------------------------------------------------------------- # Go through the time vector and get the indexes that correspond to the timeval # It will find a value such that # t_n <= i < t_(n+1), where i is the index #------------------------------------------------------------------------------------- var_name = value_path.split('/').pop() res = acquire_data([file_path], [var_name], record_count).next() time_vector = res[var_name]['values'] retval = 0 for i in xrange(len(time_vector)): if time_vector[i] == timeval: retval = i break elif i == 0 and time_vector[i] > timeval: retval = i break elif (i + 1) < len(time_vector): # not last val if time_vector[i] < timeval and time_vector[i + 1] > timeval: retval = i break else: # last val retval = i break FileSystem.unlink(file_path) return retval def _get_hdf_from_string(self, hdf_string): ''' @param hdf_string binary string consisting of an HDF5 file. @return temporary file (full path) where the string was written to. @note client's responsible to unlink when finished. ''' f = FileSystem.mktemp() f.write(hdf_string) retval = f.name f.close() return retval def subset(self, granule, coverages): ''' @param granule @return dataset subset based on the fields ''' assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.' 
field_ids = self.field_ids element_count_id = self.element_count_id values_path = list() domain_ids = list() coverage_ids = list() coverages = list(coverages) log.debug('Coverages include %s of type %s', coverages, type(coverages)) #----------------------------------------------------------------------------------------------------------- # Iterate through the fields IAW stream definition and check for rangesets and coordinate axises # - If its a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain) # - If its a rangeset make sure that it's part of what the client asked for, if not discard it #----------------------------------------------------------------------------------------------------------- for field_id in field_ids: range_id = self.definition.identifiables[field_id].range_id #------------------------------------------------------------------------------------- # Coordinate Axis # - Keep track of this in our domains # - Add it to the paths we need to grab from the file(s) #------------------------------------------------------------------------------------- if isinstance(self.definition.identifiables[range_id], CoordinateAxis): log.debug('got a domain: %s' % range_id) domain_ids.append(field_id) if granule.identifiables.has_key(range_id): value_path = granule.identifiables[ range_id].values_path or self.definition.identifiables[ range_id].values_path values_path.append(value_path) else: value_path = self.definition.identifiables[ range_id].values_path values_path.append(value_path) continue #------------------------------------------------------------------------------------- # Range Set # - If it's part of the coverages we want to keep # - Add it to the list of ranges we're tracking # - Add the value path to the paths we're tracking. 
#------------------------------------------------------------------------------------- if isinstance(self.definition.identifiables[range_id], RangeSet): # If its a rangeset, a specified coverage and the granule has it, add it to the list if field_id in coverages: if granule.identifiables.has_key(range_id): log.debug('got a range: %s' % range_id) coverage_ids.append(field_id) if granule.identifiables.has_key(range_id): value_path = granule.identifiables[ range_id].values_path or self.definition.identifiables[ range_id].values_path values_path.append(value_path) else: value_path = self.definition.identifiables[ range_id].values_path values_path.append(value_path) continue # ---- # We need to track the range and bounds because, # you guessed it, we need to update the bounds # ---- range_id = self.definition.identifiables[field_id].range_id bounds_id = self.definition.identifiables[range_id].bounds_id #--- # Lastly, if the field is there and we don't want it, we need to strip it #--- if not (field_id in coverages): log.debug('%s doesn\'t belong in %s.', field_id, coverages) log.debug('rebool: %s', bool(field_id in coverages)) if granule.identifiables.has_key(range_id): log.debug('Removing %s from granule', range_id) del granule.identifiables[range_id] if granule.identifiables.has_key(bounds_id): log.debug('Removing %s from granule', bounds_id) del granule.identifiables[bounds_id] log.debug('Domains: %s', domain_ids) log.debug('Ranges: %s', coverage_ids) log.debug('Values_paths: %s', values_path) file_path = self._get_hdf_from_string( granule.identifiables[self.data_stream_id].values) full_coverage = list(domain_ids + coverage_ids) log.debug('Full coverage: %s' % full_coverage) log.debug('Calling acquire_data with: %s, %s, %s', [file_path], values_path, granule.identifiables[element_count_id].value) codec = HDFEncoder() pairs = self._pair_up(granule) var_names = list([i[0] for i in pairs]) record_count = granule.identifiables[self.element_count_id].value data = acquire_data([file_path], var_names, record_count).next() for row, value in data.iteritems(): vp = self._find_vp(pairs, row) codec.add_hdf_dataset(vp, value['values']) hdf_string = codec.encoder_close() self._patch_granule(granule, hdf_string) FileSystem.unlink(file_path) return granule
file ''' granule.identifiables[self.data_stream_id].values = hdf_string granule.identifiables[self.encoding_id].sha1 = hashlib.sha1(hdf_string).hexdigest().upper() def time_subset(self, granule, time_bounds): ''' @brief Obtains a subset of the granule dataset based on the specified time_bounds @param granule Dataset @param time_bounds tuple consisting of a lower and upper bound @return A subset of the granule's dataset based on the time boundaries. ''' assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.' lower = time_bounds[0]-1 upper = time_bounds[1] granule = self._slice(granule, slice(lower,upper)) return granule def _get_time_index(self, granule, timeval): ''' @brief Obtains the index where a time's value is @param granule must be a complete dataset (hdf_string provided) @param timeval the vector value @return Index value for timeval or closest approx such that timeval is IN the subset ''' assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.' assert granule.identifiables[self.data_stream_id].values, 'hdf_string is not provided.' hdf_string = granule.identifiables[self.data_stream_id].values file_path = self._get_hdf_from_string(hdf_string) #------------------------------------------------------------------------------------- # Determine the field_id for the temporal coordinate vector (aka time) #------------------------------------------------------------------------------------- time_field = self.definition.identifiables[self.time_id].coordinate_ids[0] value_path = granule.identifiables[time_field].values_path or self.definition.identifiables[time_field].values_path record_count = granule.identifiables[self.element_count_id].value #------------------------------------------------------------------------------------- # Go through the time vector and get the indexes that correspond to the timeval # It will find a value such that # t_n <= i < t_(n+1), where i is the index #------------------------------------------------------------------------------------- var_name = value_path.split('/').pop() res = acquire_data([file_path], [var_name], record_count).next() time_vector = res[var_name]['values'] retval = 0 for i in xrange(len(time_vector)): if time_vector[i] == timeval: retval = i break elif i==0 and time_vector[i] > timeval: retval = i break elif (i+1) < len(time_vector): # not last val if time_vector[i] < timeval and time_vector[i+1] > timeval: retval = i break else: # last val retval = i break FileSystem.unlink(file_path) return retval def _get_hdf_from_string(self, hdf_string): ''' @param hdf_string binary string consisting of an HDF5 file. @return temporary file (full path) where the string was written to. @note client's responsible to unlink when finished. ''' f = FileSystem.mktemp() f.write(hdf_string) retval = f.name f.close() return retval def subset(self,granule,coverages): ''' @param granule @return dataset subset based on the fields ''' assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.' field_ids = self.field_ids element_count_id = self.element_count_id values_path = list() domain_ids = list() coverage_ids = list() coverages = list(coverages) log.debug('Coverages include %s of type %s', coverages, type(coverages)) #----------------------------------------------------------------------------------------------------------- # Iterate through the fields IAW stream definition and check for rangesets and coordinate axises # - If its a coordinate axis, it belongs regardless of what the client desires. 
(It's part of the domain) # - If its a rangeset make sure that it's part of what the client asked for, if not discard it #----------------------------------------------------------------------------------------------------------- for field_id in field_ids: range_id = self.definition.identifiables[field_id].range_id #------------------------------------------------------------------------------------- # Coordinate Axis # - Keep track of this in our domains # - Add it to the paths we need to grab from the file(s) #------------------------------------------------------------------------------------- if isinstance(self.definition.identifiables[range_id], CoordinateAxis): log.debug('got a domain: %s' % range_id) domain_ids.append(field_id) if granule.identifiables.has_key(range_id): value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path values_path.append(value_path) else: value_path = self.definition.identifiables[range_id].values_path values_path.append(value_path) continue #------------------------------------------------------------------------------------- # Range Set # - If it's part of the coverages we want to keep # - Add it to the list of ranges we're tracking # - Add the value path to the paths we're tracking. #------------------------------------------------------------------------------------- if isinstance(self.definition.identifiables[range_id], RangeSet): # If its a rangeset, a specified coverage and the granule has it, add it to the list if field_id in coverages: if granule.identifiables.has_key(range_id): log.debug('got a range: %s' % range_id) coverage_ids.append(field_id) if granule.identifiables.has_key(range_id): value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path values_path.append(value_path) else: value_path = self.definition.identifiables[range_id].values_path values_path.append(value_path) continue # ---- # We need to track the range and bounds because, # you guessed it, we need to update the bounds # ---- range_id = self.definition.identifiables[field_id].range_id bounds_id = self.definition.identifiables[range_id].bounds_id #--- # Lastly, if the field is there and we don't want it, we need to strip it #--- if not (field_id in coverages): log.debug('%s doesn\'t belong in %s.', field_id, coverages) log.debug('rebool: %s', bool(field_id in coverages)) if granule.identifiables.has_key(range_id): log.debug('Removing %s from granule', range_id) del granule.identifiables[range_id] if granule.identifiables.has_key(bounds_id): log.debug('Removing %s from granule', bounds_id) del granule.identifiables[bounds_id] log.debug('Domains: %s', domain_ids) log.debug('Ranges: %s', coverage_ids) log.debug('Values_paths: %s', values_path) file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values) full_coverage = list(domain_ids + coverage_ids) log.debug('Full coverage: %s' % full_coverage) log.debug('Calling acquire_data with: %s, %s, %s', [file_path],values_path,granule.identifiables[element_count_id].value) codec = HDFEncoder() pairs = self._pair_up(granule) var_names = list([i[0] for i in pairs]) record_count = granule.identifiables[self.element_count_id].value data = acquire_data([file_path], var_names, record_count).next() for row,value in data.iteritems(): vp = self._find_vp(pairs, row) codec.add_hdf_dataset(vp, value['values']) hdf_string = codec.encoder_close() self._patch_granule(granule,hdf_string) FileSystem.unlink(file_path) return 
granule
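# A minimal standalone sketch (not part of the original class) of the
# var_name/value_path pairing that _pair_up() and _find_vp() above rely on:
# acquire_data() wants bare variable names, while the granule stores full HDF
# paths, so the last path component is used as the lookup key. The field ids
# and paths below are made-up sample data.
def pair_up(fields):
    # fields maps field_id -> values_path
    return [(path.split('/')[-1], path) for path in fields.values()]

def find_vp(pairs, var_name):
    # reverse lookup: bare variable name -> full values_path
    for name, path in pairs:
        if name == var_name:
            return path
    return None

if __name__ == '__main__':
    fields = {'temp_range_id': 'fields/temperature', 'time_axis_id': 'coordinates/time'}
    pairs = pair_up(fields)
    assert find_vp(pairs, 'time') == 'coordinates/time'
    assert find_vp(pairs, 'temperature') == 'fields/temperature'
    assert find_vp(pairs, 'salinity') is None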
class ReplayProcess(BaseReplayProcess): process_type="standalone" def __init__(self, *args, **kwargs): super(ReplayProcess, self).__init__(*args,**kwargs) #@todo Init stuff # mutex for shared resources between threads self.lock = RLock() def on_start(self): ''' Creates a publisher for each stream_id passed in as publish_streams Creates an attribute with the name matching the stream name which corresponds to the publisher ex: say we have publish_streams:{'output': my_output_stream_id } then the instance has an attribute output which corresponds to the publisher for the stream in my_output_stream_id ''' self.stream_publisher_registrar = StreamPublisherRegistrar(process=self,node=self.container.node) # Get the query self.query = self.CFG.get_safe('process.query',{}) # Get the delivery_format self.delivery_format = self.CFG.get_safe('process.delivery_format',{}) self.datastore_name = self.CFG.get_safe('process.datastore_name','dm_datastore') self.view_name = self.CFG.get_safe('process.view_name','datasets/dataset_by_id') self.key_id = self.CFG.get_safe('process.key_id') # Get a stream_id for this process self.stream_id = self.CFG.get_safe('process.publish_streams.output',{}) if not (self.stream_id and hasattr(self,'output')): raise RuntimeError('The replay agent requires an output stream publisher named output. Invalid configuration!') def _records(self, records, n): """ Given a list of records, yield at most n at a time """ while True: yval = [] try: for i in xrange(n): yval = yval + [records.pop(0)] yield yval except IndexError: if yval: yield yval break def _publish_query(self, results): ''' Callback to publish the specified results ''' #----------------------- # Iteration #----------------------- # - Go through the results, if the user had include_docs=True in the options field # then the full document is in result.doc; however if the query did not include_docs, # then only the doc_id is provided in the result.value. # # - What this allows us to do is limit the amount of traffic in information for large queries. # If we only are making a query in a sequence of queries (such as map and reduce) then we don't # care about the full document, yet, we only care about the doc id and will retrieve the document later. # - Example: # Imagine the blogging example, we want the latest blog by author George and all the comments for that blog # The series of queries would go, post_by_updated -> posts_by_author -> posts_join_comments and then # in the last query we'll set include_docs to true and parse the docs. 
#----------------------- log.warn('results: %s', results) for result in results: log.warn('REPLAY Result: %s' % result) assert('doc' in result) replay_obj_msg = result['doc'] if isinstance(replay_obj_msg, BlogBase): replay_obj_msg.is_replay = True self.lock.acquire() self.output.publish(replay_obj_msg) self.lock.release() elif isinstance(replay_obj_msg, StreamDefinitionContainer): replay_obj_msg.stream_resource_id = self.stream_id elif isinstance(replay_obj_msg, StreamGranuleContainer): # Override the resource_stream_id so ingestion doesn't reingest, also this is a NEW stream (replay) replay_obj_msg.stream_resource_id = self.stream_id datastream = None sha1 = None for key, identifiable in replay_obj_msg.identifiables.iteritems(): if isinstance(identifiable, DataStream): datastream = identifiable elif isinstance(identifiable, Encoding): sha1 = identifiable.sha1 if sha1: # if there is an encoding # Get the file from disk filename = FileSystem.get_url(FS.CACHE, sha1, ".hdf5") log.warn('Replay reading from filename: %s' % filename) hdf_string = '' try: with open(filename, mode='rb') as f: hdf_string = f.read() f.close() # Check the Sha1 retreived_hdfstring_sha1 = hashlib.sha1(hdf_string).hexdigest().upper() if sha1 != retreived_hdfstring_sha1: raise ReplayProcessException('The sha1 mismatch between the sha1 in datastream and the sha1 of hdf_string in the saved file in hdf storage') except IOError: log.warn('No HDF file found!') #@todo deal with this situation? How? hdf_string = 'HDF File %s not found!' % filename # set the datastream.value field! datastream.values = hdf_string else: log.warn('No encoding in the StreamGranuleContainer!') self.lock.acquire() self.output.publish(replay_obj_msg) self.lock.release() else: log.warn('Unknown type retrieved in DOC!') #@todo: log when there are not results if results is None: log.warn('No results found in replay query!') else: log.debug('Published replay!') def execute_replay(self): log.debug('(Replay Agent %s)', self.name) # Handle the query datastore_name = self.datastore_name key_id = self.key_id # Got the post ID, pull the post and the comments view_name = self.view_name opts = { 'start_key':[key_id, 0], 'end_key':[key_id,2], 'include_docs': True } g = Greenlet(self._query,datastore_name=datastore_name, view_name=view_name, opts=opts, callback=lambda results: self._publish_query(results)) g.start() def _query(self,datastore_name='dm_datastore', view_name='posts/posts_by_id', opts={}, callback=None): ''' Performs the query action ''' log.debug('Couch Query:\n\t%s\n\t%s\n\t%s', datastore_name, view_name, opts) #@todo: Fix this datastore management profile with correct data profile in near future db = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.EXAMPLES, self.CFG) ret = db.query_view(view_name=view_name,opts=opts) callback(ret)
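# Illustrative sketch, not taken from the original service: the pattern
# _publish_query() uses above, where a shared gevent RLock serializes
# publishes coming from several greenlets. FakePublisher stands in for the
# real stream publisher; gevent.coros is used to match the snippets here
# (newer gevent exposes the same class as gevent.lock.RLock).
import gevent
from gevent.coros import RLock

class FakePublisher(object):
    def __init__(self):
        self.published = []
    def publish(self, msg):
        self.published.append(msg)

lock = RLock()
output = FakePublisher()

def replay(messages):
    for msg in messages:
        lock.acquire()
        try:
            output.publish(msg)
        finally:
            lock.release()
        gevent.sleep(0)  # yield so other greenlets can interleave their replays

if __name__ == '__main__':
    jobs = [gevent.spawn(replay, ['granule-%d-%d' % (i, j) for j in range(3)])
            for i in range(4)]
    gevent.joinall(jobs)
    assert len(output.published) == 12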
raw_data = resp.read() try: data = json.loads(raw_data) except: raise BuyException("Could not parse json: {}".format(raw_data)) if "erroMsg" in data: raise BuyException("Error: {}".format(raw_data)) if data["isFraud"] == "true": raise BuyException("Should not be fraud: {}".format(raw_data)) if __name__ == "__main__": from gevent.coros import RLock stats_lock = RLock() errors = [] success_times = [] error_times = [] total_requests = 0 REPORT_EACH = 50 hostname = "dcf-ces63.appspot.com" print "Testing", hostname def run(customer, nreqs): global total_requests for i in range(nreqs): t = time.time() last_error = None try: customer.buy_something()
class aioFile(object): """a buffered File like object that uses pyaio and gevent""" def __init__(self, filename, mode='r', buffer=16<<10): modes = os.O_LARGEFILE | os.O_CREAT self._offset = 0 self._buffer_size = buffer if buffer: self._buffer_lock = RLock() self._read = False self._write = False self._read_buf = None self._write_buf = None self._eof = False # Optimization to limit calls self._append = False # Append Mode writes ignore offset self._stay_alive = gevent.spawn(_keep_awake); if mode.startswith('r') or '+' in mode: self._read = True self._read_buf = bytearray() if '+' not in mode: modes |= os.O_RDONLY if mode.startswith('w') or mode.startswith('a') or '+' in mode: if mode.startswith('w'): modes |= os.O_TRUNC self._write = True self._write_buf = bytearray() self._flush = False if '+' not in mode: modes |= os.O_WRONLY if '+' in mode: modes |= os.O_RDWR if mode.startswith('a'): modes |= os.O_APPEND self._append = True self._fd = os.open(filename, modes) def _clear_read_buf(self): if self._read: self._eof = False del self._read_buf[0:] def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() def close(self): self.flush() os.close(self._fd) self._stay_alive.kill() def stat(self): return os.fstat(self._fd) def seek(self, pos, how=os.SEEK_SET): """Change the file pos, will clear read cache and flush writes """ \ """This will also clear the EOF flag for the file""" offset = self._offset if how != os.SEEK_CUR and how != os.SEEK_END and how != os.SEEK_SET: raise OSError(14, 'Invalid seek point use os.SEEK_SET, os.SEEK_CUR, os.SEEK_END') if how == os.SEEK_CUR: offset += pos elif how == os.SEEK_END: #Ugh this could be harry if we have outstanding writes offset = self.stat().st_size + pos else: offset = pos if offset < 0: raise OSError(14, 'File Position invalid, less than 0') #Even if the pos didn't change fix the buffers and EOF self._clear_read_buf() if not self._append: # DON'T FLUSH on seek with append self.flush() self._offset = offset return offset def flush(self): """Flush write buffer""" if self._write and self._buffer_size: self._flush = True while len(self._write_buf): self.write(None) self._flush = False def _read_file(self): fbuf = bytearray() while True: part = self.read(16 << 10) # Read 16k if part is None: # EOF break fbuf.extend(part) return fbuf def write(self, buf, offset=None): """write a buffer object to file""" if not self._write: raise IOError(9, 'Bad file descriptor') if not self._append and self._buffer_size and self._read_buf: # We should clear read cache self._clear_read_buf() if offset is None: offset = self._offset write_size = self._buffer_size if not self._buffer_size and buf: write_size = len(buf) if not self._append and offset != self._offset: self.seek(offset) # Makes sure we write our buffer #If we buffer we use the global buffer if not we use a local buffer if self._buffer_size: lbuf = self._write_buf self._buffer_lock.acquire() if buf: # The a memoryview of the buffer lbuf.extend(buf) # pushed to pyaio so we need to lock else: lbuf = buf while lbuf and len(lbuf) >= self._buffer_size \ or (self._flush and lbuf): result = AsyncResult() def _write_results(rcode, errno): result.set((rcode, errno)) pyaio.aio_write(self._fd, memoryview(lbuf)[0:write_size], offset, _write_results) rcode, errno = result.get() #SLEEP if rcode < 0: # Some kind of error raise IOError(errno, 'AIO Write Error %d' % errno) # Clean up buffer (of actually written bytes) if self._buffer_size: del lbuf[0:rcode] else: lbuf = None self._offset = 
offset = offset + rcode # Move the file offset if self._buffer_size: self._buffer_lock.release() if buf: return len(buf) else: return 0 def read(self, size=0, offset=None): """read a size of bytes from the file, or entire file if 0 """ \ """for speed we assume EOF after first short read""" if not self._read: raise IOError(9, 'Bad file descriptor') if not self._append and self._buffer_size and self._write_buf: self.flush() if offset is None: offset = self._offset if offset != self._offset: self.seek(offset) # To make sure we blow away our read cache if size == 0: # Attempt to read entire file and return in a single return return self._read_file() else: rbuf = bytearray() # Holding Place for multiple reads while len(rbuf) < size: # People get what they ask for # If we don't want to buffer then just read what they want if len(self._read_buf) < size - len(rbuf) and not self._eof: #Ok we are buffer short so lets do a read result = AsyncResult() def _read_results(buf, rcode, errno): result.set((buf, rcode, errno)) read_size = size - len(rbuf) if self._buffer_size: # If we buffer read buffer instead read_size = self._buffer_size pyaio.aio_read(self._fd, offset, read_size, _read_results) buf, rcode, errno = result.get() #SLEEP if rcode < 0: # Some kind of error raise IOError(errno, 'AIO Read Error %d' % errno) #Rcode will be the bytes read so lets push the offset self._offset = offset = offset + rcode if self._buffer_size: self._read_buf.extend(buf) else: rbuf = buf # Pass through because we are not buffering if rcode == 0 or rcode < read_size: # Good Enough self._eof = True #Do a buffer read toread = size - len(rbuf) if self._buffer_size: rbuf.extend(memoryview(self._read_buf)[0:toread]) #Clean up read buffer del self._read_buf[0:toread] if not self._read_buf and self._eof: # Empty buffer and eof break if self._eof and not rbuf: return None #EOF NO DATA else: return rbuf
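# Hedged usage sketch for the aioFile class above. It assumes pyaio is
# installed and that the module-level _keep_awake helper referenced in
# __init__ exists; the file paths are placeholders.
def copy_file(src_path, dst_path, chunk=16 << 10):
    with aioFile(src_path, 'r') as src, aioFile(dst_path, 'w') as dst:
        while True:
            data = src.read(chunk)
            if data is None:   # aioFile.read() returns None at EOF
                break
            dst.write(data)
        # buffered writes are flushed by close(), which __exit__ calls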
def __init__(self, next_sink): super(RefCountedSink, self).__init__() self._ref_count = 0 self._open_lock = RLock() self._open_ar = None self.next_sink = next_sink
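# Only __init__ of RefCountedSink is shown above; the usual shape of a
# ref-counted open/close guarded by such a lock looks like the sketch below.
# RefCountedResource and its callbacks are invented for illustration, not the
# original class's implementation.
from gevent.coros import RLock

class RefCountedResource(object):
    def __init__(self, open_fn, close_fn):
        self._ref_count = 0
        self._lock = RLock()
        self._open_fn = open_fn
        self._close_fn = close_fn

    def acquire(self):
        with self._lock:
            if self._ref_count == 0:
                self._open_fn()    # first user actually opens
            self._ref_count += 1

    def release(self):
        with self._lock:
            self._ref_count -= 1
            if self._ref_count == 0:
                self._close_fn()   # last user actually closes

if __name__ == '__main__':
    calls = []
    res = RefCountedResource(lambda: calls.append('open'), lambda: calls.append('close'))
    res.acquire()
    res.acquire()
    res.release()
    res.release()
    assert calls == ['open', 'close']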
class VizTransformProcForMatplotlibGraphs(TransformDataProcess): """ This class is used for instantiating worker processes that have subscriptions to data streams and convert incoming data from CDM format to Matplotlib graphs """ def on_start(self): super(VizTransformProcForMatplotlibGraphs, self).on_start() #assert len(self.streams)==1 self.initDataFlag = True self.graph_data = { } # Stores a dictionary of variables : [List of values] # Need some clients self.rr_cli = ResourceRegistryServiceProcessClient( process=self, node=self.container.node) self.pubsub_cli = PubsubManagementServiceClient( node=self.container.node) # extract the various parameters passed to the transform process self.out_stream_id = self.CFG.get('process').get( 'publish_streams').get('visualization_service_submit_stream_id') # Create a publisher on the output stream #stream_route = self.pubsub_cli.register_producer(stream_id=self.out_stream_id) out_stream_pub_registrar = StreamPublisherRegistrar( process=self.container, node=self.container.node) self.out_stream_pub = out_stream_pub_registrar.create_publisher( stream_id=self.out_stream_id) self.data_product_id = self.CFG.get('data_product_id') self.stream_def_id = self.CFG.get("stream_def_id") self.stream_def = self.rr_cli.read(self.stream_def_id) # Start the thread responsible for keeping track of time and generating graphs # Mutex for ensuring proper concurrent communications between threads self.lock = RLock() self.rendering_proc = Greenlet(self.rendering_thread) self.rendering_proc.start() def process(self, packet): log.debug('(%s): Received Viz Data Packet' % self.name) #log.debug('(%s): - Processing: %s' % (self.name,packet)) # parse the incoming data psd = PointSupplementStreamParser( stream_definition=self.stream_def.container, stream_granule=packet) # re-arrange incoming data into an easy to parse dictionary vardict = {} arrLen = None for varname in psd.list_field_names(): vardict[varname] = psd.get_values(varname) arrLen = len(vardict[varname]) if self.initDataFlag: # look at the incoming packet and store for varname in psd.list_field_names(): self.lock.acquire() self.graph_data[varname] = [] self.lock.release() self.initDataFlag = False # If code reached here, the graph data storage has been initialized. Just add values # to the list with self.lock: for varname in psd.list_field_names(): self.graph_data[varname].extend(vardict[varname]) def rendering_thread(self): from copy import deepcopy # Service Client # init Matplotlib fig = Figure() ax = fig.add_subplot(111) canvas = FigureCanvas(fig) imgInMem = StringIO.StringIO() while True: # Sleep for a pre-decided interval. 
Should be specifiable in a YAML file gevent.sleep(20) # If there's no data, wait # Lock is used here to make sure the entire vector exists start to finish; this assures that the data won't change while it is being plotted working_set = None with self.lock: if len(self.graph_data) == 0: continue else: working_set = deepcopy(self.graph_data) # For the simple case of testing, let's plot all time-variant variables one at a time xAxisVar = 'time' xAxisFloatData = working_set[xAxisVar] for varName, varData in working_set.iteritems(): if varName == 'time' or varName == 'height' or varName == 'longitude' or varName == 'latitude': continue yAxisVar = varName yAxisFloatData = working_set[varName] # Generate the plot ax.plot(xAxisFloatData, yAxisFloatData, 'ro') ax.set_xlabel(xAxisVar) ax.set_ylabel(yAxisVar) ax.set_title(yAxisVar + ' vs ' + xAxisVar) ax.set_autoscale_on(False) # generate filename for the output image fileName = yAxisVar + '_vs_' + xAxisVar + '.png' # Save the figure to the in memory file canvas.print_figure(imgInMem, format="png") imgInMem.seek(0) # submit resulting table back using the out stream publisher msg = { "viz_product_type": "matplotlib_graphs", "data_product_id": self.data_product_id, "image_obj": imgInMem.getvalue(), "image_name": fileName } self.out_stream_pub.publish(msg) #clear the canvas for the next image ax.clear()
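# Sketch of the snapshot pattern rendering_thread() uses above: copy the
# shared dictionary under the lock, then do the slow work (plotting) on the
# copy so writers in process() are never blocked for long. The variable names
# here are illustrative only.
from copy import deepcopy
from gevent.coros import RLock

shared_data = {'temperature': [1.0, 2.0, 3.0]}
lock = RLock()

def snapshot():
    with lock:
        return deepcopy(shared_data)   # consistent copy taken under the lock

working_set = snapshot()
# ...slow rendering would run on working_set here, without holding the lock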
''' basic data-parallel main function module ''' from gevent.coros import RLock from gevent.pool import Pool import logging from data import load, save_result, strip_ephemeral from serial_process import process, required_actions_count, print_progress_info logger = logging.getLogger(__name__) PROCESSED = 0 TOTAL = 0 REPORT_LOCK = RLock() def target(ostream, params, parallel_tests, sorted_mode): global PROCESSED, REPORT_LOCK, TOTAL for result in process(params, pool_size=parallel_tests, sorted_mode=sorted_mode): with REPORT_LOCK: PROCESSED += 1 print_progress_info(PROCESSED, TOTAL) save_result(ostream, strip_ephemeral(result)) def main(conf, istream, ostream, test_whitelist, test_blacklist, stage_whitelist, stage_blacklist, tags_whitelist, tags_blacklist,
def __init__(self, *args, **kwargs): super(GreenQueuePool, self).__init__(*args, **kwargs) if self._overflow_lock is not None: self._overflow_lock = RLock()
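# Hedged sketch of how a pool class like GreenQueuePool above might be wired
# into SQLAlchemy. It assumes the class derives from sqlalchemy.pool.QueuePool
# (only __init__ is shown in the snippet) and that the SQLAlchemy version in
# use exposes the _overflow_lock attribute the snippet itself touches; the
# database URL is a placeholder.
from gevent.coros import RLock
from sqlalchemy import create_engine
from sqlalchemy.pool import QueuePool

class GreenQueuePool(QueuePool):
    def __init__(self, *args, **kwargs):
        super(GreenQueuePool, self).__init__(*args, **kwargs)
        # swap the threading lock guarding pool overflow for a gevent RLock
        # so greenlets waiting on the pool do not block the hub
        if self._overflow_lock is not None:
            self._overflow_lock = RLock()

engine = create_engine('postgresql://localhost/example', poolclass=GreenQueuePool)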
def __init__(self, node): transport = TSocket.TSocket(str(node.host), int(node.port)) self.transport = TTransport.TBufferedTransport(transport) protocol = TBinaryProtocol.TBinaryProtocol(transport) self.client = AtlasNode.Client(protocol) self.lock = RLock()
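# Hedged sketch (not from the original source) of how the lock created above
# is typically used: a Thrift client is not safe for concurrent calls over a
# single transport, so each RPC is serialized. The ping() method on
# AtlasNode.Client is assumed purely for illustration.
def call_node(node_client):
    node_client.lock.acquire()
    try:
        if not node_client.transport.isOpen():
            node_client.transport.open()
        return node_client.client.ping()   # assumed RPC; substitute a real one
    finally:
        node_client.lock.release()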
def __init__(self, addr, timeout=2): self.addr = addr self.sock = None self.unpacker = None self.timeout = timeout self.lock = RLock()
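# Illustrative sketch of how the addr/sock/unpacker/lock state above might be
# used for a request/response call. The msgpack framing and the (host, port)
# form of addr are assumptions -- the snippet only shows that an unpacker
# attribute exists.
import socket
import msgpack
from gevent.coros import RLock

class LockedClient(object):
    def __init__(self, addr, timeout=2):
        self.addr = addr
        self.sock = None
        self.unpacker = None
        self.timeout = timeout
        self.lock = RLock()

    def _connect(self):
        self.sock = socket.create_connection(self.addr, self.timeout)
        self.unpacker = msgpack.Unpacker()

    def request(self, obj):
        with self.lock:                      # one request/response at a time
            if self.sock is None:
                self._connect()
            self.sock.sendall(msgpack.packb(obj))
            while True:
                data = self.sock.recv(4096)
                if not data:
                    raise IOError('connection closed by peer')
                self.unpacker.feed(data)
                for msg in self.unpacker:
                    return msg               # first complete message is the reply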
LINE_PER_FILE = 10000 # HTTPError 403: IP request limit exceeded when going through a proxy # 10022 IP requests out of rate limit (IP request frequency over the limit) # 10023 User requests out of rate limit (user request frequency over the limit) # 10024 User requests for (%s) out of rate limit (user requests to special endpoint (%s) over the limit) ERROR_NORMAL = 0 ERROR_API = -1 ERROR_RATE = -2 task_queue = gevent.queue.JoinableQueue(10000) result_queue = gevent.queue.JoinableQueue(10000) log_queue = gevent.queue.JoinableQueue(1000) live_signal = 0 log_lock = RLock() live_lock = RLock() logger = get_logger(LOG_FILE) def wait_time(proxy): try: rl = api.rate_limit(proxy=proxy) except Exception, e: rl = None if rl: if rl['remaining_ip_hits'] > 1 and rl['remaining_user_hits'] > 1: return 1 return rl['reset_time_in_seconds'] + 1 now = datetime.now()
def __init__(self, states, events, enter_event, exit_event): self._lock = RLock() super(ThreadSafeFSM, self).__init__(states, events, enter_event, exit_event)
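# Sketch of the pattern the ThreadSafeFSM __init__ above sets up: wrap every
# state transition in the RLock so concurrent greenlets cannot interleave
# transitions. The tiny ToyFSM base class and its on_event() method are
# invented here for illustration; the real base class is not shown in the
# snippet.
from gevent.coros import RLock

class ToyFSM(object):
    def __init__(self, initial_state, transitions):
        self.state = initial_state
        self.transitions = transitions     # (state, event) -> next_state

    def on_event(self, event):
        self.state = self.transitions[(self.state, event)]
        return self.state

class LockedFSM(ToyFSM):
    def __init__(self, initial_state, transitions):
        self._lock = RLock()
        super(LockedFSM, self).__init__(initial_state, transitions)

    def on_event(self, event):
        with self._lock:                   # serialize transitions across greenlets
            return super(LockedFSM, self).on_event(event)

if __name__ == '__main__':
    fsm = LockedFSM('idle', {('idle', 'start'): 'running', ('running', 'stop'): 'idle'})
    assert fsm.on_event('start') == 'running'
    assert fsm.on_event('stop') == 'idle'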