def on_start(self):
        super(VizTransformProcForMatplotlibGraphs, self).on_start()
        #assert len(self.streams)==1
        self.initDataFlag = True
        self.graph_data = {}  # Stores a dictionary of variables : [List of values]

        # Need some clients
        self.rr_cli = ResourceRegistryServiceProcessClient(
            process=self, node=self.container.node)
        self.pubsub_cli = PubsubManagementServiceClient(
            node=self.container.node)

        # extract the various parameters passed to the transform process
        self.out_stream_id = self.CFG.get('process').get(
            'publish_streams').get('visualization_service_submit_stream_id')

        # Create a publisher on the output stream
        #stream_route = self.pubsub_cli.register_producer(stream_id=self.out_stream_id)
        out_stream_pub_registrar = StreamPublisherRegistrar(
            process=self.container, node=self.container.node)
        self.out_stream_pub = out_stream_pub_registrar.create_publisher(
            stream_id=self.out_stream_id)

        self.data_product_id = self.CFG.get('data_product_id')
        self.stream_def_id = self.CFG.get("stream_def_id")
        self.stream_def = self.rr_cli.read(self.stream_def_id)

        # Start the thread responsible for keeping track of time and generating graphs
        # Mutex for ensuring proper concurrent communications between threads
        self.lock = RLock()
        self.rendering_proc = Greenlet(self.rendering_thread)
        self.rendering_proc.start()
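The RLock created at the end of this on_start is what lets the message callback and the rendering greenlet share graph_data safely. A minimal sketch of that pattern, with illustrative names (GraphDataHolder, process and rendering_thread are stand-ins, not the actual transform):

import gevent
from gevent.lock import RLock  # older gevent exposed this as gevent.coros.RLock


class GraphDataHolder(object):
    """Toy stand-in for the transform: one writer callback, one rendering greenlet."""

    def __init__(self):
        self.graph_data = {}  # variable name -> list of values
        self.lock = RLock()

    def process(self, var_name, values):
        # Called for every incoming packet; mutate shared state under the lock.
        with self.lock:
            self.graph_data.setdefault(var_name, []).extend(values)

    def rendering_thread(self):
        # Periodically snapshot the data; the real code would render a graph here.
        for _ in range(3):
            gevent.sleep(0.1)
            with self.lock:
                snapshot = dict(self.graph_data)
            print(snapshot)


holder = GraphDataHolder()
renderer = gevent.spawn(holder.rendering_thread)
holder.process('temp', [1, 2, 3])
renderer.join()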
Example #2
 def __init__(self, filename, mode='r', buffer=16 << 10):
     modes = os.O_LARGEFILE | os.O_CREAT
     self._offset = 0
     self._buffer_size = buffer
     if buffer:
         self._buffer_lock = RLock()
     self._read = False
     self._write = False
     self._read_buf = None
     self._write_buf = None
     self._eof = False  # Optimization to limit calls
     self._append = False  # Append Mode writes ignore offset
     self._stay_alive = gevent.spawn(_keep_awake)
     if mode.startswith('r') or '+' in mode:
         self._read = True
         self._read_buf = bytearray()
         if '+' not in mode:
             modes |= os.O_RDONLY
     if mode.startswith('w') or mode.startswith('a') or '+' in mode:
         if mode.startswith('w'):
             modes |= os.O_TRUNC
         self._write = True
         self._write_buf = bytearray()
         self._flush = False
         if '+' not in mode:
             modes |= os.O_WRONLY
     if '+' in mode:
         modes |= os.O_RDWR
     if mode.startswith('a'):
         modes |= os.O_APPEND
         self._append = True
     self._fd = os.open(filename, modes)
Example #3
    class GeventScheduler(Scheduler):
        """A scheduler that dispatches tasks via Gevent"""

        def __init__(self):
            Scheduler.__init__(self)
            from gevent.coros import RLock
            self._lock = RLock()

        def start(self):
            """Spawn a greenlet for the main event loop."""
            self.greenlet = gevent.spawn(self._run)

        def stop(self):
            """Stop the scheduler and wait for the thread to finish."""
            Scheduler.stop(self)
            try:
                self.greenlet.kill(block=False)
            except AttributeError:
                pass

        def _acquire_lock(self):
            """Lock the thread's task queue."""
            self._lock.acquire()

        def _release_lock(self):
            """Release the lock on the thread's task queue."""
            self._lock.release()
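A reentrant lock matters for these _acquire_lock/_release_lock hooks because the same greenlet may end up acquiring the lock while it already holds it (for instance, if queue maintenance is triggered from inside a locked section). A standalone sketch of that reentrancy, independent of the Scheduler base class:

from gevent.lock import RLock

lock = RLock()


def inner():
    with lock:  # same greenlet re-acquires without deadlocking
        print("still holding the lock, depth 2")


def outer():
    with lock:  # first acquisition
        inner()


outer()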
Example #4
class Deliverator(object):
    """
    The deliverator holds the channels that will be used to deliver 
    the replies that come over a resilient connection
    """
    def __init__(self):
        self._log = logging.getLogger("Deliverator")
        self._active_requests = dict()
        self._lock = RLock()

    def add_request(self, message_id):
        """
        Add a message_id

        return a channel (gevent.queue.Queue)

        When the web_server's pull server gets a reply for this message id
        it will push the message into the queue. The caller can block on the
        queue, waiting for the reply.

        We can't use the zero size 'channel' queue because the web server moves 
        on after 8 of 10 retrieves and nobody is waiting on the last two.

        So we use a size of one, and it is the caller's responsibility to clean
        up unused channels.
        """
        channel = Queue(maxsize=1)

        self._lock.acquire()
        try:
            if message_id in self._active_requests:
                raise ValueError("Duplicate request '%s'" % (message_id, ))
            self._active_requests[message_id] = channel
        finally:
            self._lock.release()

        return channel

    def deliver_reply(self, message):
        """
        Deliver the reply message over the channel for its message-id

        And discard the channel
        """
        self._lock.acquire()
        try:
            channel = self._active_requests.pop(message.control["message-id"])
        except KeyError:
            channel = None
        finally:
            self._lock.release()

        if channel is None:
            self._log.error("undeliverable message %s" % (message.control, ))
        else:
            channel.put((
                message.control,
                message.body,
            ))
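Typical use of the Deliverator above, following its docstring: register a message id, block on the returned queue with a timeout, and let a receiver greenlet deliver the reply. A hedged usage sketch that reuses the Deliverator class defined above (and assumes its usual imports); the _Reply class is a stand-in for the real reply message:

import gevent
from gevent.queue import Empty


class _Reply(object):
    """Stand-in for a real reply message with control headers and a body."""

    def __init__(self, message_id, body):
        self.control = {"message-id": message_id}
        self.body = body


deliverator = Deliverator()                 # the class defined just above
channel = deliverator.add_request("req-1")  # register before sending the request

# Some receiver greenlet eventually pushes the reply for that message id.
gevent.spawn_later(0.1, deliverator.deliver_reply, _Reply("req-1", "payload"))

try:
    control, body = channel.get(timeout=1.0)
    print(body)
except Empty:
    # The caller owns cleanup of unused channels, as the docstring warns.
    deliverator._active_requests.pop("req-1", None)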
Example #5
class ThreadSafeFSM(InstrumentFSM):
    def __init__(self, states, events, enter_event, exit_event):
        self._lock = RLock()
        super(ThreadSafeFSM, self).__init__(states, events, enter_event, exit_event)
    def on_event(self, event, *args, **kwargs):
        with self._lock:
            return super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)
    def on_event_if_free(self, event, *args, **kwargs):
        if not self._lock.acquire(blocking=False):
            raise FSMLockedError
        try:
            retval = super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)
        finally:
            self._lock.release()
        return retval
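The on_event_if_free pattern above, reduced to its essence: try a non-blocking acquire and fail fast if another greenlet is mid-transition. A standalone sketch with plain stand-ins for FSMLockedError and the FSM machinery:

import gevent
from gevent.lock import RLock


class LockedError(Exception):
    """Stand-in for FSMLockedError."""


lock = RLock()


def handle_if_free():
    if not lock.acquire(blocking=False):
        raise LockedError("another greenlet is handling an event")
    try:
        gevent.sleep(0.2)  # pretend to run a state transition
        return "handled"
    finally:
        lock.release()


busy = gevent.spawn(handle_if_free)
gevent.sleep(0.05)  # let the first greenlet grab the lock
try:
    handle_if_free()
except LockedError as err:
    print(err)
busy.join()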
Example #8
    def on_start(self): #pragma no cover
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------

        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion

        TransformStreamProcess.on_start(self)
        
        self.queue_name = self.CFG.get_safe('process.queue_name',self.id)
        self.subscriber = StreamSubscriber(process=self, exchange_name=self.queue_name, callback=self.receive_callback)
        self.thread_lock = RLock()
        
        #--------------------------------------------------------------------------------
        # Normal on_start after this point
        #--------------------------------------------------------------------------------

        BaseIngestionWorker.on_start(self)
        self._rpc_server = self.container.proc_manager._create_listening_endpoint(from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.event_publisher = EventPublisher(OT.DatasetModified)
        self.stored_value_manager = StoredValueManager(self.container)

        self.lookup_docs = self.CFG.get_safe('process.lookup_docs',[])
        self.input_product = self.CFG.get_safe('process.input_product','')
        self.new_lookups = Queue()
        self.lookup_monitor = EventSubscriber(event_type=OT.ExternalReferencesUpdatedEvent, callback=self._add_lookups, auto_delete=True)
        self.add_endpoint(self.lookup_monitor)
        self.connection_id = ''
        self.connection_index = None
        
        self.start_listener()
Example #9
    def on_start(self):  #pragma no cover
        #super(TransformWorker,self).on_start()
        #--------------------------------------------------------------------------------
        # Explicit on_start
        #--------------------------------------------------------------------------------

        # Skip TransformStreamListener and go to StreamProcess to avoid the subscriber being created
        # We want explicit management of the thread and subscriber object for ingestion
        #todo: check how to manage multi queue subscription (transform scenario 3)

        TransformStreamProcess.on_start(self)

        #todo: can the subscription be changed or updated when new dataprocesses are added ?
        self.queue_name = self.CFG.get_safe('process.queue_name', self.id)
        self.subscriber = StreamSubscriber(process=self,
                                           exchange_name=self.queue_name,
                                           callback=self.receive_callback)
        self.thread_lock = RLock()

        self._rpc_server = self.container.proc_manager._create_listening_endpoint(
            from_name=self.id, process=self)
        self.add_endpoint(self._rpc_server)

        self.start_listener()

        #todo: determine and publish appropriate set of status events
        self.event_publisher = EventPublisher(OT.DataProcessStatusEvent)
Example #11
 def __init__(self, canvas):
     self.canvas = canvas
     self.socket = None
     self.connect_ts = time.time()
     # And this is used to limit clients to X messages per tick
     # We start at 0 (instead of x) to add a reconnect-penalty.
     self.lock = RLock()
Example #12
    def __init__(self, fileserver_ip=None, fileserver_port=None):
        self.privatekeys = {}  # Onion: Privatekey
        self.site_onions = {}  # Site address: Onion
        self.tor_exe = "tools/tor/tor.exe"
        self.has_meek_bridges = os.path.isfile(
            "tools/tor/PluggableTransports/meek-client.exe")
        self.tor_process = None
        self.log = logging.getLogger("TorManager")
        self.start_onions = None
        self.conn = None
        self.lock = RLock()

        if config.tor == "disable":
            self.enabled = False
            self.start_onions = False
            self.setStatus("Disabled")
        else:
            self.enabled = True
            self.setStatus("Waiting")

        if fileserver_port:
            self.fileserver_port = fileserver_port
        else:
            self.fileserver_port = config.fileserver_port

        self.ip, self.port = config.tor_controller.split(":")
        self.port = int(self.port)

        self.proxy_ip, self.proxy_port = config.tor_proxy.split(":")
        self.proxy_port = int(self.proxy_port)
Example #13
    def __init__(self, socket, LAST_MESSAGES, rabbitcox):
        (ip, port) = socket.getpeername()

        self.logger = logging.getLogger('radiovisserver.stompserver.' + ip +
                                        '.' + str(port))

        self.socket = socket
        # Buffer for incoming data
        self.incomingData = ''
        # Topics the client subscribed to
        self.topics = []
        # Queue of messages
        self.queue = queue.Queue()
        # Lock to send frame
        self.lock = RLock()

        # Mapping channel -> id for subscriptions
        self.idsByChannels = {}

        # Mapping id -> channel for subscriptions
        self.channelsByIds = {}

        # Last messages
        self.LAST_MESSAGES = LAST_MESSAGES

        # RabbitCox
        self.rabbitcox = rabbitcox

        # Station id, if authenticated
        self.station_id = None
Example #14
    def __init__(self,
                 server,
                 ip,
                 port,
                 sock=None,
                 target_onion=None,
                 is_tracker_connection=False):
        self.sock = sock
        self.ip = ip
        self.port = port
        self.cert_pin = None
        if "#" in ip:
            self.ip, self.cert_pin = ip.split("#")
        self.target_onion = target_onion  # Requested onion address
        self.id = server.last_connection_id
        server.last_connection_id += 1
        self.protocol = "?"
        self.type = "?"

        if helper.isPrivateIp(self.ip) and self.ip not in config.ip_local:
            self.is_private_ip = True
        else:
            self.is_private_ip = False
        self.is_tracker_connection = is_tracker_connection

        self.server = server
        self.unpacker = None  # Stream incoming socket messages here
        self.req_id = 0  # Last request id
        self.handshake = {}  # Handshake info got from peer
        self.crypt = None  # Connection encryption method
        self.sock_wrapped = False  # Socket wrapped to encryption

        self.connected = False
        self.event_connected = gevent.event.AsyncResult()  # Resolved when the handshake is received
        self.closed = False

        # Stats
        self.start_time = time.time()
        self.last_recv_time = 0
        self.last_message_time = 0
        self.last_send_time = 0
        self.last_sent_time = 0
        self.incomplete_buff_recv = 0
        self.bytes_recv = 0
        self.bytes_sent = 0
        self.last_ping_delay = None
        self.last_req_time = 0
        self.last_cmd_sent = None
        self.last_cmd_recv = None
        self.bad_actions = 0
        self.sites = 0
        self.cpu_time = 0.0
        self.send_lock = RLock()

        self.name = None
        self.updateName()

        self.waiting_requests = {}  # Waiting sent requests
        self.waiting_streams = {}  # Waiting response file streams
Example #15
    class RedisConnWrapper(object):

        _db = {}
        dumb = True
        db_lock = RLock()

        def lock__db(func):
            def gen(self, *args, **kwargs):
                self.db_lock.acquire()
                try:
                    return func(self, *args, **kwargs)
                finally:
                    self.db_lock.release()

            return gen

        def get(self, key):
            return self._db.get(key)

        @lock__db
        def set(self, key, value):
            self._db[key] = value

        @lock__db
        def incr(self, key):
            if self._db.get(key):
                self._db[key] += 1
            else:
                self._db[key] = 1

        def smembers(self, set_key):
            return self._db.get(set_key)

        @lock__db
        def spop(self, set_key):
            if type(self._db.get(set_key)) != set:
                return None
            self._db[set_key].pop()

        @lock__db
        def srem(self, set_key, value):
            if type(self._db.get(set_key)) != set:
                return False
            else:
                try:
                    self._db[set_key].remove(value)
                    return True
                except KeyError:
                    return False

        @lock__db
        def sadd(self, set_key, value):
            if type(self._db.get(set_key)) != set:
                self._db[set_key] = set()
            self._db[set_key].add(value)

        def __getattr__(self, name):
            raise RedisImportError(
                'You use dumb redis storage that doesn\'t'
                'support this function,\n you should install redis-server'
                'and redis-py')
Example #16
    def publish(self, topic, data):
        lock = self.lock
        if not lock:
            lock = RLock()
            self.lock = lock

        with lock:
            return RedisInterconnect.publish(self, topic, data)
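The publish above creates its lock lazily on first use. An equally simple alternative, sketched here against a dummy base class standing in for RedisInterconnect, is to create the lock once in __init__ so publish never has to check:

from gevent.lock import RLock


class Interconnect(object):
    """Dummy stand-in for RedisInterconnect from the example above."""

    def publish(self, topic, data):
        return (topic, data)


class LockedInterconnect(Interconnect):
    def __init__(self):
        super(LockedInterconnect, self).__init__()
        self.lock = RLock()  # created once, up front

    def publish(self, topic, data):
        with self.lock:
            return super(LockedInterconnect, self).publish(topic, data)


print(LockedInterconnect().publish("events", {"x": 1}))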
Example #18
    def __init__(self, pa):
        """
        Initializes all status parameters according to the immediate
        children of this platform and starts the related subscribers.

        The PlatformAgent must have been already initialized to properly
        access the handled elements.

        Note that the number of subscribers and entries for other
        related status information will increase and decrease
        as we get device_added and device_removed events.

        @param pa   The associated platform agent object to access the
                    elements handled by this helper.
        """

        assert pa._platform_id is not None
        assert pa._children_resource_ids is not None

        self._agent = pa

        self._platform_id = pa._platform_id
        self.resource_id = pa.resource_id
        self._children_resource_ids = pa._children_resource_ids
        self._event_publisher = pa._event_publisher
        self.aparam_child_agg_status = pa.aparam_child_agg_status
        self.aparam_aggstatus = pa.aparam_aggstatus
        self.aparam_rollup_status = pa.aparam_rollup_status

        # All EventSubscribers created: {origin: EventSubscriber, ...}
        self._event_subscribers = {}

        # {pid: origin ...} the origin (resource_id) of each PID used in
        # ProcessLifecycleEvent subscribers
        self._rids = {}

        # set to False by a call to destroy
        self._active = True

        # RLock to synchronize access to the various mutable variables here.
        self._lock = RLock()

        # init statuses, and subscribers for the given children
        with self._lock:
            # initialize my own statuses:
            for status_name in AggregateStatusType._str_map.keys():
                self.aparam_aggstatus[
                    status_name] = DeviceStatusType.STATUS_UNKNOWN
                self.aparam_rollup_status[
                    status_name] = DeviceStatusType.STATUS_UNKNOWN

            # do status preparations for the immediate children
            for origin in pa._children_resource_ids:
                self._prepare_new_child(origin)

        # diagnostics report on demand:
        self._diag_sub = None
        self._start_diagnostics_subscriber()
Example #19
class ElasticConnection(object):
    if _use_gevent:
        session = None
        session_lock = RLock()

    def __init__(self, timeout=None, **params):
        self.status_code = 0
        self.timeout = timeout
        self.encoding = None
        self.headers = {'Content-Type': 'Application/json; charset=utf-8'}
        if params.has_key('encoding'):
            self.encoding = 'utf8'
            del params['encoding']
        if _use_gevent:
            if ElasticConnection.session is None:
                ElasticConnection.session_lock.acquire()
                ElasticConnection.session = requests.Session(**params)
                ElasticConnection.session_lock.release()
        else:
            self.session = requests.Session(**params)

    def get(self, url):
        try:
            response = self.session.get(url, headers=self.headers, timeout=self.timeout)
        except requests.ConnectionError as e:
            self.status_code = 0
            return {'error': e.message}
        self.status_code = response.status_code
        return json.loads(response.content, encoding=self.encoding)

    def post(self, url, data):
        body = json.dumps(data)
        try:
            response = self.session.post(url, data=body, headers=self.headers, timeout=self.timeout)
        except requests.ConnectionError as e:
            self.status_code = 0
            return {'error': e.message}
        self.status_code = response.status_code
        return json.loads(response.content, encoding=self.encoding)

    def put(self, url, data):
        body = json.dumps(data)
        try:
            response = self.session.post(url, data=body, headers=self.headers, timeout=self.timeout)
        except requests.ConnectionError as e:
            self.status_code = 0
            return {'error': e.message}
        self.status_code = response.status_code
        return json.loads(response.content, encoding=self.encoding)

    def delete(self, url):
        try:
            response = self.session.delete(url, headers=self.headers, timeout=self.timeout)
        except requests.ConnectionError as e:
            self.status_code = 0
            return {'error': e.message}
        self.status_code = response.status_code
        return json.loads(response.content, encoding=self.encoding)
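The class-level session_lock above guards creation of the one shared requests.Session, but the None check happens before the lock is taken, so in principle two greenlets could both try to create it. A common variant, shown as a sketch rather than the library's actual code, re-checks under the lock:

import requests
from gevent.lock import RLock


class SharedSession(object):
    _session = None
    _session_lock = RLock()

    @classmethod
    def get(cls):
        if cls._session is None:          # fast path, no lock
            with cls._session_lock:
                if cls._session is None:  # re-check once the lock is held
                    cls._session = requests.Session()
        return cls._session


assert SharedSession.get() is SharedSession.get()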
Example #20
 def __init__(self, params):
     self.total = 0
     self.processed = 0
     from gevent.coros import RLock
     self.lock = RLock()
     for item in params:
         for test_stage in item['test_stages']:
             for test_name in item['test_stages'][test_stage]:
                 self.total += 1
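The lock in this progress tracker presumably protects processed, which worker greenlets bump as tests finish; the original only shows the counting of total. A minimal sketch of that pattern (the mark_done method is illustrative, not from the original):

import gevent
from gevent.lock import RLock


class Progress(object):
    def __init__(self, total):
        self.total = total
        self.processed = 0
        self.lock = RLock()

    def mark_done(self):
        with self.lock:
            self.processed += 1


progress = Progress(total=10)
gevent.joinall([gevent.spawn(progress.mark_done) for _ in range(10)])
print("%d of %d" % (progress.processed, progress.total))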
Example #21
    def __init__(self, fileserver_ip=None, fileserver_port=None):
        self.privatekeys = {}  # Onion: Privatekey
        self.site_onions = {}  # Site address: Onion
        self.tor_exe = "tools/tor/tor.exe"
        self.tor_process = None
        self.log = logging.getLogger("TorManager")
        self.start_onions = None
        self.conn = None
        #self.trackers = []
        #self.trackers_key = {}

        self.lock = RLock()

        if config.tor == "disable":
            self.enabled = False
            self.start_onions = False
            self.setStatus("Disabled")
        else:
            self.enabled = True
            self.setStatus("Waiting")

        if fileserver_port:
            self.fileserver_port = fileserver_port
        else:
            self.fileserver_port = config.fileserver_port

        self.ip, self.port = config.tor_controller.split(":")
        self.port = int(self.port)

        self.proxy_ip, self.proxy_port = config.tor_proxy.split(":")
        self.proxy_port = int(self.proxy_port)

        # Test proxy port
        if config.tor != "disable":
            try:
                assert self.connect(), "No connection"
                self.log.debug("Tor proxy port %s check ok" % config.tor_proxy)
            except Exception, err:
                self.log.info(
                    "Starting self-bundled Tor, due to Tor proxy port %s check error: %s"
                    % (config.tor_proxy, err))
                self.enabled = False
                # Change to self-bundled Tor ports
                from lib.PySocks import socks
                self.port = 49051
                self.proxy_port = 49050
                socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1",
                                      self.proxy_port)
                if os.path.isfile(self.tor_exe):  # Already downloaded: sync mode
                    self.startTor()
                else:  # Not downloaded yet: async mode
                    gevent.spawn(self.startTor)
Example #22
 def __init__(self,
              filter="ALL",
              eventjson=True,
              pool_size=5000,
              trace=False):
     self._is_eventjson = eventjson
     # Callbacks for reading events and sending responses.
     self._response_callbacks = {
         'api/response': self._api_response,
         'command/reply': self._command_reply,
         'text/disconnect-notice': self._disconnect_notice,
         'text/event-json': self._event_json,
         'text/event-plain': self._event_plain
     }
     # Closing state flag
     self._closing_state = False
     # Default event filter.
     self._filter = filter
     # Commands pool list
     self._commands_pool = []
     # Lock to force eventsocket commands to be sequential.
     self._lock = RLock()
     # Sets connected to False.
     self.connected = False
     # Sets greenlet handler to None
     self._g_handler = None
     # Build events callbacks dict
     self._event_callbacks = {}
     for meth in dir(self):
         if meth[:3] == 'on_':
             event_name = meth[3:].upper()
             func = getattr(self, meth, None)
             if func:
                 self._event_callbacks[event_name] = func
     unbound = getattr(self, 'unbound_event', None)
     self._event_callbacks['unbound_event'] = unbound
     # Set greenlet spawner
     if pool_size > 0:
         self.pool = gevent.pool.Pool(pool_size)
         self._spawn = self.pool.spawn
     else:
         self._spawn = gevent.spawn_raw
     # set tracer
     try:
         logger = self.log
     except AttributeError:
         logger = None
     if logger and trace is True:
         self.trace = self._trace
     else:
         self.trace = self._notrace
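The comment says the lock forces eventsocket commands to be sequential: each command holds the lock for the whole request/response exchange so replies cannot interleave. A reduced sketch of that idea (CommandChannel, send_command and the list standing in for the socket are all illustrative):

import gevent
from gevent.lock import RLock


class CommandChannel(object):
    def __init__(self):
        self._lock = RLock()
        self._log = []  # stand-in for the real socket transport

    def send_command(self, command):
        # Hold the lock for the whole request/response exchange so two
        # greenlets can never interleave their commands on the wire.
        with self._lock:
            self._log.append("sent: %s" % command)
            gevent.sleep(0)  # pretend to wait for the reply
            self._log.append("reply for: %s" % command)


chan = CommandChannel()
gevent.joinall([gevent.spawn(chan.send_command, c) for c in ("uptime", "status")])
print(chan._log)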
Example #23
    def __new__(cls, *args, **kw):
        self = object.__new__(cls)
        object.__setattr__(self, '_local__args', (args, kw))
        object.__setattr__(self, '_local__lock', RLock())
        dicts = WeakKeyDictionary()
        object.__setattr__(self, '_local__dicts', dicts)

        if (args or kw) and (cls.__init__ is object.__init__):
            raise TypeError("Initialization arguments are not supported")

        # We need to create the greenlet dict in anticipation of
        # __init__ being called, to make sure we don't call it again ourselves.
        dict = object.__getattribute__(self, '__dict__')
        dicts[getcurrent()] = dict
        return self
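This __new__ is the core of a greenlet-local implementation: one dict per greenlet, keyed weakly by the greenlet itself. gevent ships a ready-made version with the same behavior; a tiny demonstration:

import gevent
from gevent.local import local  # gevent's own greenlet-local storage

state = local()
state.name = "main"


def worker(label):
    state.name = label  # each greenlet sees only its own attribute
    gevent.sleep(0)
    print("%s -> %s" % (gevent.getcurrent(), state.name))


gevent.joinall([gevent.spawn(worker, "g1"), gevent.spawn(worker, "g2")])
print("main still sees: %s" % state.name)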
Example #24
	def __init__(self):

		self.logger = logging.getLogger('radiovisserver.watchdog')

		# The lock to modify the list of channels
		self.channels_lock = RLock()
		
		# List of channels
		self.channels = []

		# Last message, by channel
		self.channels_last_message = {}

		# List of ids, by channel
		self.id_by_channel = {}

		# Init lists
		self.get_channels()
Example #25
    def __init__(self, platform_id, attr_info, get_attribute_values,
                 notify_driver_event):
        """
        @param platform_id Platform ID
        @param attr_info Attribute information
        @param get_attribute_values Function to retrieve attribute
                 values for the specific platform, called like this:
                 get_attribute_values([attr_id], from_time)
                 for each attr_id in the platform.
        @param notify_driver_event Callback to notify whenever a value is
                retrieved.
        """

        self._platform_id = platform_id
        self._attr_info = attr_info
        self._get_attribute_values = get_attribute_values
        self._notify_driver_event = notify_driver_event

        log.debug("%r: PlatformResourceMonitor instance created",
                  self._platform_id)

        # _monitors: dict { rate_secs: ResourceMonitor }
        self._monitors = {}

        # buffers used by the monitoring greenlets to put retrieved data in
        # and by the publisher greenlet to process that data to construct
        # aggregated AttributeValueDriverEvent objects that the platform
        # agent finally process to create and publish granules.
        self._buffers = {}

        # to synchronize access to the buffers
        self._lock = RLock()

        # publishing rate in seconds, set by _set_publisher_rate
        self._pub_rate = None
        self._publisher_active = False

        # for debugging purposes
        self._pp = pprint.PrettyPrinter()
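The buffers and lock set up here support a producer/consumer split: monitoring greenlets append retrieved attribute values, and the publisher greenlet drains them to build granules. A reduced sketch of that interaction (the Buffers class, keys and value shapes are illustrative):

import gevent
from gevent.lock import RLock


class Buffers(object):
    def __init__(self):
        self._buffers = {}  # attr_id -> list of (timestamp, value) samples
        self._lock = RLock()

    def add(self, attr_id, sample):
        with self._lock:
            self._buffers.setdefault(attr_id, []).append(sample)

    def drain(self):
        # Swap the dict out under the lock so publishing can happen lock-free.
        with self._lock:
            data, self._buffers = self._buffers, {}
        return data


bufs = Buffers()
gevent.joinall([gevent.spawn(bufs.add, "temperature", (i, 20.0 + i)) for i in range(3)])
print(bufs.drain())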
Example #26
    def __init__(self, *args, **kwargs):

        super(ReplayProcess, self).__init__(*args, **kwargs)
        self.lock = RLock()
Example #27
class ReplayProcess(BaseReplayProcess):

    process_type = 'standalone'

    def __init__(self, *args, **kwargs):

        super(ReplayProcess, self).__init__(*args, **kwargs)
        self.lock = RLock()

    def on_start(self):

        self.query = self.CFG.get_safe('process.query', {})

        self.delivery_format = self.CFG.get_safe('process.delivery_format', {})
        self.datastore_name = self.CFG.get_safe('process.datastore_name',
                                                'dm_datastore')

        definition_id = self.delivery_format.get('definition_id')
        rrsc = ResourceRegistryServiceProcessClient(process=self,
                                                    node=self.container.node)
        definition = rrsc.read(definition_id)
        self.definition = definition.container

        self.fields = self.delivery_format.get('fields', None)

        self.view_name = self.CFG.get_safe('process.view_name',
                                           'datasets/dataset_by_id')
        self.key_id = self.CFG.get_safe('process.key_id')
        self.stream_id = self.CFG.get_safe('process.publish_streams.output')

        if not self.stream_id:
            raise Inconsistent(
                'The replay process requires a stream id. Invalid configuration!'
            )

        self.data_stream_id = self.definition.data_stream_id
        self.encoding_id = self.definition.identifiables[
            self.data_stream_id].encoding_id
        self.element_type_id = self.definition.identifiables[
            self.data_stream_id].element_type_id
        self.element_count_id = self.definition.identifiables[
            self.data_stream_id].element_count_id
        self.data_record_id = self.definition.identifiables[
            self.element_type_id].data_record_id
        self.field_ids = self.definition.identifiables[
            self.data_record_id].field_ids
        self.domain_ids = self.definition.identifiables[
            self.data_record_id].domain_ids
        self.time_id = self.definition.identifiables[
            self.domain_ids[0]].temporal_coordinate_vector_id

    def execute_replay(self):
        '''
        @brief Spawns a greenlet to take care of the query and work
        '''
        if not hasattr(self, 'output'):
            raise Inconsistent(
                'The replay process requires an output stream publisher named output. Invalid configuration!'
            )

        datastore_name = self.datastore_name
        key_id = self.key_id

        view_name = self.view_name

        opts = {
            'start_key': [key_id, 0],
            'end_key': [key_id, 2],
            'include_docs': True
        }

        g = Greenlet(self._query,
                     datastore_name=datastore_name,
                     view_name=view_name,
                     opts=opts,
                     callback=lambda results: self._publish_query(results))
        g.start()

    def _query(self,
               datastore_name='dm_datastore',
               view_name='posts/posts_by_id',
               opts={},
               callback=None):
        '''
        @brief Makes the couch query and then callsback to publish
        @param datastore_name Name of the datastore
        @param view_name The name of the design view where the data is organized
        @param opts options to pass
        @param callback the content handler
        '''
        db = self.container.datastore_manager.get_datastore(
            datastore_name, DataStore.DS_PROFILE.SCIDATA, self.CFG)

        ret = db.query_view(view_name=view_name, opts=opts)

        callback(ret)

    def _publish_query(self, results):
        '''
        @brief Publishes the appropriate data based on the delivery format and data returned from query
        @param results The query results from the couch query
        '''

        if results is None:
            log.info('No Results')
            return

        publish_queue = self._parse_results(results)
        for item in publish_queue:
            log.debug('Item in queue: %s' % type(item))
        granule = self._merge(publish_queue)
        if not granule:
            return  # no dataset

        if self.delivery_format.has_key('fields'):
            res = self.subset(granule, self.delivery_format['fields'])
            granule = res

        if self.delivery_format.has_key('time'):
            granule = self.time_subset(granule, self.delivery_format['time'])

        total_records = granule.identifiables[self.element_count_id].value
        granule.identifiables[self.element_count_id].constraint.intervals = [
            [0, total_records - 1],
        ]

        if self.delivery_format.has_key('records'):
            assert isinstance(self.delivery_format['records'],
                              int), 'delivery format is incorrectly formatted.'

            for chunk in self._records(granule,
                                       self.delivery_format['records']):
                self.lock.acquire()
                self.output.publish(chunk)
                self.lock.release()
            return

        self.lock.acquire()
        self.output.publish(granule)
        self.lock.release()

    def _parse_results(self, results):
        '''
        @brief Switch-case logic for what packet types replay can handle and how to handle
        @param results List of results returned from couch view
        @return A queue of msgs parsed and formatted to be iterated through and published.
        '''
        log.debug('called _parse_results')
        publish_queue = []

        for result in results:
            assert ('doc' in result)

            packet = result['doc']

            if isinstance(packet, BlogBase):
                packet.is_replay = True
                self.lock.acquire()
                self.output.publish(packet)
                self.lock.release()
                continue

            if isinstance(packet, StreamDefinitionContainer):
                continue  # Ignore

            if isinstance(packet, StreamGranuleContainer):
                packet = self._parse_granule(packet)
                log.debug('Got packet')
                if packet:
                    log.debug('Appending packet')
                    publish_queue.append(packet)
                continue

            log.info('Unknown packet type in replay.')

        return publish_queue

    def _records(self, granule, n):
        '''
        @brief Yields n records from a granule per iteration
        @param granule consisting of dataset
        @param n number of records to yield
        '''
        bin_size = n
        record_count = granule.identifiables[self.element_count_id].value

        i = 0
        while (i + bin_size) < record_count:
            log.debug('Yielding %d to %d', i, i + bin_size)
            yield self._slice(granule, slice(i, i + bin_size))
            i += bin_size
        if i < record_count:
            yield self._slice(granule, slice(i, i + bin_size))
        return

    def _pair_up(self, granule):
        '''
        @brief Creates a list of tuples consisting of acquire_data friendly var_names and full values_paths
        @param granule consisting of full dataset.
        @return list of tuples
        '''
        fields = self._list_data(self.definition, granule)
        pairs = list()
        for i in fields.values():
            pairs.append((i.split('/').pop(), i))
        return pairs

    def _find_vp(self, pairs, var_name):
        '''
        @brief Determines the value path based on the acquire_data friendly var_name
        @param pairs List of tuples consisting of pair-wise var_name/value_path
        @param var_name Desired var_name
        @return Associated value_path
        '''
        for pair in pairs:
            if var_name == pair[0]:
                return pair[1]
        return

    def _slice(self, granule, slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition, granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0]
                          for i in pairs])  # Get the var_names from the pairs
        log.debug('var_names: %s', var_names)
        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path], var_names, record_count,
                               slice_).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id reverse dictionary lookup
            #@todo: improve this pattern
            for field, path in fields.iteritems():
                if vp == path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            range = value['range']
            retval.identifiables[bounds_id].value_pair[0] = float(range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------

        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval

    def _parse_granule(self, granule):
        '''
        @brief Ensures the granule is valid and gets some metadata from the granule for building the dataset
        @param granule raw granule straight from couch
        @return metadata in the granule as well as the granule itself if valid.
        '''

        granule.stream_resource_id = self.stream_id

        element_count_id = self.element_count_id
        encoding_id = self.encoding_id

        record_count = granule.identifiables[element_count_id].value
        sha1 = granule.identifiables[encoding_id].sha1 or None

        # If there are no records then this is not a proper granule
        if not (record_count > 0):
            log.debug('Granule had no record count discarding.')
            return None

        # No encoding, no packet
        if not encoding_id in granule.identifiables:
            log.debug('Granule had no encoding discarding.')
            return None

        if not sha1:
            log.debug('Granule had no sha1')
            return None

        filepath = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.hdf5')

        if not os.path.exists(filepath):
            log.debug('File with sha1 does not exist')
            return None

        return {'granule': granule, 'records': record_count, 'sha1': sha1}

    @staticmethod
    def merge_granule(definition, granule1, granule2):
        '''
        @brief Merges two granules based on the definition
        @param definition Stream Definition
        @param granule1 First Granule
        @param granule2 Second Granule
        @return Returns granule1 which is then merged with granule2 and the file pair for indexing

        @description granule1 := granule1 U granule2
        '''
        import numpy as np

        assert isinstance(
            definition,
            StreamDefinitionContainer), 'object is not a definition.'
        assert isinstance(granule1,
                          StreamGranuleContainer), 'object is not a granule.'
        encoding_id = DefinitionTree.get(
            definition, '%s.encoding_id' % definition.data_stream_id)

        if not granule2:
            pair = (granule1.identifiables['time_bounds'].value_pair[0],
                    '%s.hdf5' % granule1.identifiables[encoding_id].sha1)
            return {'granule': granule1, 'files': [pair]}

        assert isinstance(granule2,
                          StreamGranuleContainer), 'object is not a granule.'

        assert granule1.identifiables.has_key(
            'time_bounds'
        ), 'object has no time bounds and therefore is invalid.'

        assert granule2.identifiables.has_key(
            'time_bounds'
        ), 'object has no time bounds and therefore is invalid.'

        #-------------------------------------------------------------------------------------
        # First step is figure out where each granule belongs on the timeline
        # We do this with a tuple consisting of the point in the timeline and the filename
        # These will get stable sorted later
        #-------------------------------------------------------------------------------------

        pair1 = (granule1.identifiables['time_bounds'].value_pair[0],
                 '%s.hdf5' % granule1.identifiables[encoding_id].sha1)

        pair2 = (granule2.identifiables['time_bounds'].value_pair[0],
                 '%s.hdf5' % granule2.identifiables[encoding_id].sha1)

        files = []

        if encoding_id in granule1.identifiables:
            if granule1.identifiables[encoding_id].sha1:
                files.append('%s.hdf5' %
                             granule1.identifiables[encoding_id].sha1)
        if encoding_id in granule2.identifiables:
            if granule2.identifiables[encoding_id].sha1:
                files.append('%s.hdf5' %
                             granule2.identifiables[encoding_id].sha1)

        element_count_id = DefinitionTree.get(
            definition, '%s.element_count_id' % definition.data_stream_id)
        record_count = 0
        if element_count_id in granule1.identifiables:
            record_count += granule1.identifiables[element_count_id].value
        if element_count_id in granule2.identifiables:
            record_count += granule2.identifiables[element_count_id].value

        if not element_count_id in granule1.identifiables:
            granule1.identifiables[element_count_id] = CountElement()
            granule1.identifiables[element_count_id].value = record_count
        else:
            granule1.identifiables[element_count_id].value = record_count

        fields1 = ReplayProcess._list_data(definition, granule1)
        fields2 = ReplayProcess._list_data(definition, granule2)
        #@todo: albeit counterintuitive, an intersection is the only thing I can support
        merged_paths = {}
        for k, v in fields1.iteritems():
            if fields2.has_key(k):
                merged_paths[k] = v

        for k, v in granule2.identifiables.iteritems():
            # Switch(value):

            # Case Bounds:
            if isinstance(v, QuantityRangeElement):
                # If its not in granule1 just throw it in there
                if k not in granule1.identifiables:
                    granule1.identifiables[k] = v
                else:
                    bounds1 = granule1.identifiables[k].value_pair
                    bounds2 = granule2.identifiables[k].value_pair
                    bounds = np.append(bounds1, bounds2)
                    granule1.identifiables[k].value_pair = [
                        np.nanmin(bounds),
                        np.nanmax(bounds)
                    ]

            if isinstance(v, RangeSet):  #Including coordinate axis
                if merged_paths.has_key(
                        k) and not granule1.identifiables.has_key(k):
                    granule1.identifiables[k] = v  # Copy it over

        # Now make sure granule1 doesn't have excess stuff
        del_list = []
        for k, v in granule1.identifiables.iteritems():
            if isinstance(v, RangeSet):
                if not merged_paths.has_key(k):
                    del_list.append(k)

        for item in del_list:
            del granule1.identifiables[item]

        return {'granule': granule1, 'files': [pair1, pair2]}

    @staticmethod
    def _list_data(definition, granule):
        '''
        @brief Lists all the fields in the granule based on the Stream Definition
        @param definition Stream Definition
        @param granule Stream Granule
        @return dict of field_id : values_path for each field_id that exists
        '''
        from interface.objects import StreamDefinitionContainer, StreamGranuleContainer, RangeSet, CoordinateAxis
        assert isinstance(
            definition,
            StreamDefinitionContainer), 'object is not a definition.'
        assert isinstance(
            granule, StreamGranuleContainer
        ), 'object is not a granule. its a %s' % type(granule)
        retval = {}
        for key, value in granule.identifiables.iteritems():
            if isinstance(value, RangeSet):
                values_path = value.values_path or definition.identifiables[
                    key].values_path
                retval[key] = values_path

            elif isinstance(value, CoordinateAxis):
                values_path = value.values_path or definition.identifiables[
                    key].values_path
                retval[key] = values_path

        return retval

    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description
             n
        D := U [ msgs_i ]
            i=0
        '''
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------

        for i in xrange(count):
            if i == 0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(
                    point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])

            else:
                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the lists using a stable sort from python (by the first value in the tuples
        # Then peel off just the file names
        # Then get the appropriate URL for the file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([
            FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i)
            for i in file_list
        ])

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row, value in data.iteritems():
            value_path = self._find_vp(pairs, row)
            codec.add_hdf_dataset(value_path, nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)
        return granule

    def _patch_granule(self, granule, hdf_string):
        '''
        @brief Adds the hdf_string and sha1 to the granule
        @param granule Stream Granule
        @param hdf_string string consisting of raw bytes from an hdf5 file
        '''
        granule.identifiables[self.data_stream_id].values = hdf_string
        granule.identifiables[self.encoding_id].sha1 = hashlib.sha1(
            hdf_string).hexdigest().upper()

    def time_subset(self, granule, time_bounds):
        '''
        @brief Obtains a subset of the granule dataset based on the specified time_bounds
        @param granule Dataset
        @param time_bounds tuple consisting of a lower and upper bound
        @return A subset of the granule's dataset based on the time boundaries.
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        lower = time_bounds[0] - 1
        upper = time_bounds[1]
        granule = self._slice(granule, slice(lower, upper))
        return granule

    def _get_time_index(self, granule, timeval):
        '''
        @brief Obtains the index where a time's value is
        @param granule must be a complete dataset (hdf_string provided)
        @param timeval the vector value
        @return Index value for timeval or closest approx such that timeval is IN the subset
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        assert granule.identifiables[
            self.data_stream_id].values, 'hdf_string is not provided.'

        hdf_string = granule.identifiables[self.data_stream_id].values
        file_path = self._get_hdf_from_string(hdf_string)

        #-------------------------------------------------------------------------------------
        # Determine the field_id for the temporal coordinate vector (aka time)
        #-------------------------------------------------------------------------------------

        time_field = self.definition.identifiables[
            self.time_id].coordinate_ids[0]
        value_path = granule.identifiables[
            time_field].values_path or self.definition.identifiables[
                time_field].values_path
        record_count = granule.identifiables[self.element_count_id].value

        #-------------------------------------------------------------------------------------
        # Go through the time vector and get the indexes that correspond to the timeval
        # It will find a value such that
        # t_n <= i < t_(n+1), where i is the index
        #-------------------------------------------------------------------------------------

        var_name = value_path.split('/').pop()
        res = acquire_data([file_path], [var_name], record_count).next()
        time_vector = res[var_name]['values']
        retval = 0
        for i in xrange(len(time_vector)):
            if time_vector[i] == timeval:
                retval = i
                break
            elif i == 0 and time_vector[i] > timeval:
                retval = i
                break
            elif (i + 1) < len(time_vector):  # not last val
                if time_vector[i] < timeval and time_vector[i + 1] > timeval:
                    retval = i
                    break
            else:  # last val
                retval = i
                break
        FileSystem.unlink(file_path)
        return retval

    def _get_hdf_from_string(self, hdf_string):
        '''
        @param hdf_string binary string consisting of an HDF5 file.
        @return temporary file (full path) where the string was written to.
        @note the caller is responsible for unlinking the file when finished.
        '''
        f = FileSystem.mktemp()
        f.write(hdf_string)
        retval = f.name
        f.close()
        return retval

    def subset(self, granule, coverages):
        '''
        @param granule Dataset granule
        @param coverages List of field_ids to restrict the rangesets to
        @return dataset subset based on the requested fields
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id

        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages,
                  type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW stream definition and check for rangesets and coordinate axes
        #  - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
        #-----------------------------------------------------------------------------------------------------------

        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id],
                          CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[
                        range_id].values_path or self.definition.identifiables[
                            range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[
                        range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If it's a rangeset that the client asked for and the granule has it, add it to the list
                if field_id in coverages:
                    if granule.identifiables.has_key(range_id):
                        log.debug('got a range: %s' % range_id)
                        coverage_ids.append(field_id)
                        if granule.identifiables.has_key(range_id):
                            value_path = granule.identifiables[
                                range_id].values_path or self.definition.identifiables[
                                    range_id].values_path
                            values_path.append(value_path)
                        else:
                            value_path = self.definition.identifiables[
                                range_id].values_path
                            values_path.append(value_path)
                        continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id

                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],
                  values_path, granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row, value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)

        FileSystem.unlink(file_path)

        return granule
class ReplayProcess(BaseReplayProcess):

    process_type = 'standalone'

    def __init__(self, *args, **kwargs):

        super(ReplayProcess,self).__init__(*args,**kwargs)
        self.lock = RLock()

    def on_start(self):

        self.query = self.CFG.get_safe('process.query',{})

        self.delivery_format = self.CFG.get_safe('process.delivery_format',{})
        self.datastore_name = self.CFG.get_safe('process.datastore_name','dm_datastore')

        definition_id = self.delivery_format.get('definition_id')
        rrsc = ResourceRegistryServiceProcessClient(process=self, node=self.container.node)
        definition = rrsc.read(definition_id)
        self.definition = definition.container

        self.fields = self.delivery_format.get('fields',None)

        self.view_name = self.CFG.get_safe('process.view_name','datasets/dataset_by_id')
        self.key_id = self.CFG.get_safe('process.key_id')
        self.stream_id = self.CFG.get_safe('process.publish_streams.output')

        if not self.stream_id:
            raise Inconsistent('The replay process requires a stream id. Invalid configuration!')

        self.data_stream_id = self.definition.data_stream_id
        self.encoding_id = self.definition.identifiables[self.data_stream_id].encoding_id
        self.element_type_id = self.definition.identifiables[self.data_stream_id].element_type_id
        self.element_count_id = self.definition.identifiables[self.data_stream_id].element_count_id
        self.data_record_id = self.definition.identifiables[self.element_type_id].data_record_id
        self.field_ids = self.definition.identifiables[self.data_record_id].field_ids
        self.domain_ids = self.definition.identifiables[self.data_record_id].domain_ids
        self.time_id = self.definition.identifiables[self.domain_ids[0]].temporal_coordinate_vector_id

    def execute_replay(self):
        '''
        @brief Spawns a greenlet to take care of the query and work
        '''
        if not hasattr(self, 'output'):
            raise Inconsistent('The replay process requires an output stream publisher named output. Invalid configuration!')

        datastore_name = self.datastore_name
        key_id = self.key_id

        view_name = self.view_name

        opts = {
            'start_key':[key_id,0],
            'end_key':[key_id,2],
            'include_docs':True
        }

        g = Greenlet(self._query,datastore_name=datastore_name, view_name=view_name, opts=opts,
            callback=lambda results: self._publish_query(results))
        g.start()

    def _query(self,datastore_name='dm_datastore', view_name='posts/posts_by_id', opts={}, callback=None):
        '''
        @brief Makes the couch query and then calls back to publish
        @param datastore_name Name of the datastore
        @param view_name The name of the design view where the data is organized
        @param opts options to pass to the view query
        @param callback the content handler
        '''
        db = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA, self.CFG)

        ret = db.query_view(view_name=view_name,opts=opts)

        callback(ret)

    def _publish_query(self, results):
        '''
        @brief Publishes the appropriate data based on the delivery format and data returned from query
        @param results The query results from the couch query
        '''

        if results is None:
            log.info('No Results')
            return

        publish_queue = self._parse_results(results)
        for item in publish_queue:
            log.debug('Item in queue: %s' % type(item))
        granule = self._merge(publish_queue)
        if not granule:
            return # no dataset

        if self.delivery_format.has_key('fields'):
            res = self.subset(granule,self.delivery_format['fields'])
            granule = res

        if self.delivery_format.has_key('time'):
            granule = self.time_subset(granule, self.delivery_format['time'])

        total_records = granule.identifiables[self.element_count_id].value
        granule.identifiables[self.element_count_id].constraint.intervals = [[0, total_records-1],]


        if self.delivery_format.has_key('records'):
            assert isinstance(self.delivery_format['records'], int), 'delivery format is incorrectly formatted.'

            for chunk in self._records(granule,self.delivery_format['records']):
                self.lock.acquire()
                self.output.publish(chunk)
                self.lock.release()
            return


        self.lock.acquire()
        self.output.publish(granule)
        self.lock.release()

    def _parse_results(self, results):
        '''
        @brief Switch-case logic for what packet types replay can handle and how to handle
        @param results List of results returned from couch view
        @return A queue of msgs parsed and formatted to be iterated through and published.
        '''
        log.debug('called _parse_results')
        publish_queue = []

        for result in results:
            assert('doc' in result)

            packet = result['doc']

            if isinstance(packet, BlogBase):
                packet.is_replay = True
                self.lock.acquire()
                self.output.publish(packet)
                self.lock.release()
                continue

            if isinstance(packet, StreamDefinitionContainer):
                continue # Ignore

            if isinstance(packet, StreamGranuleContainer):
                packet = self._parse_granule(packet)
                log.debug('Got packet')
                if packet:
                    log.debug('Appending packet')
                    publish_queue.append(packet)
                continue

            log.info('Unknown packet type in replay.')

        return publish_queue

    def _records(self, granule, n):
        '''
        @brief Yields n records from a granule per iteration
        @param granule consisting of dataset
        @param n number of records to yield
        '''
        bin_size = n
        record_count = granule.identifiables[self.element_count_id].value

        i=0
        while (i+bin_size) < record_count:
            log.debug('Yielding %d to %d', i, i+bin_size)
            yield self._slice(granule,slice(i,i+bin_size))
            i+=bin_size
        if i < record_count:
            yield self._slice(granule, slice(i,i+bin_size))
        return
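    # Sketch of the intended chunking (hypothetical sizes): with 25 records and
    # n=10 the loop above yields slices [0:10) and [10:20), and the trailing
    # if-branch yields a final slice for the remaining records (20-24).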

    def _pair_up(self, granule):
        '''
        @brief Creates a list of tuples consisting of acquire_data friendly var_names and full values_paths
        @param granule consisting of full dataset.
        @return list of tuples
        '''
        fields = self._list_data(self.definition, granule)
        pairs = list()
        for i in fields.values():
            pairs.append((i.split('/').pop(),i))
        return pairs
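    # Example of the pairing (hypothetical path): a values_path of
    # '/fields/temperature' yields the pair ('temperature', '/fields/temperature').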

    def _find_vp(self, pairs, var_name):
        '''
        @brief Determines the value path based on the acquire_data friendly var_name
        @param pairs List of tuples consisting of pair-wise var_name/value_path
        @param var_name Desired var_name
        @return Associated value_path
        '''
        for pair in pairs:
            if var_name == pair[0]:
                return pair[1]
        return

    def _slice(self,granule,slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition,granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs]) # Get the var_names from the pairs
        log.debug('var_names: %s',var_names)
        file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path],var_names,record_count,slice_ ).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id reverse dictionary lookup
            #@todo: improve this pattern
            for field,path in fields.iteritems():
                if vp==path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            range = value['range']
            retval.identifiables[bounds_id].value_pair[0] = float(range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------


        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval
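    # Usage sketch (assuming a granule with at least 10 records):
    #   first_ten = self._slice(granule, slice(0, 10))
    # returns a deep copy whose HDF payload, bounds and element count cover
    # only the first ten records of the original granule.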


    def _parse_granule(self, granule):
        '''
        @brief Ensures the granule is valid and gets some metadata from the granule for building the dataset
        @param granule raw granule straight from couch
        @return metadata in the granule as well as the granule itself if valid.
        '''

        granule.stream_resource_id = self.stream_id

        element_count_id = self.element_count_id
        encoding_id = self.encoding_id

        record_count = granule.identifiables[element_count_id].value
        sha1 = granule.identifiables[encoding_id].sha1 or None

        # If there are no records then this is not a proper granule
        if not (record_count > 0):
            log.debug('Granule had no record count; discarding.')
            return None

        # No encoding, no packet
        if encoding_id not in granule.identifiables:
            log.debug('Granule had no encoding; discarding.')
            return None

        if not sha1:
            log.debug('Granule had no sha1')
            return None


        filepath = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.hdf5')

        if not os.path.exists(filepath):
            log.debug('File with sha1 does not exist')
            return None

        return {
            'granule':granule,
            'records':record_count,
            'sha1':sha1
        }

    @staticmethod
    def merge_granule(definition, granule1, granule2):
        '''
        @brief Merges two granules based on the definition
        @param definition Stream Definition
        @param granule1 First Granule
        @param granule2 Second Granule
        @return A dict containing granule1 (merged in place with granule2) and the file pairs used for indexing

        @description granule1 := granule1 U granule2
        '''
        import numpy as np

        assert isinstance(definition,StreamDefinitionContainer), 'object is not a definition.'
        assert isinstance(granule1, StreamGranuleContainer), 'object is not a granule.'
        encoding_id = DefinitionTree.get(definition,'%s.encoding_id' % definition.data_stream_id)

        if not granule2:
            pair = (
                granule1.identifiables['time_bounds'].value_pair[0],
                '%s.hdf5' % granule1.identifiables[encoding_id].sha1
                )
            return {
                'granule':granule1,
                'files':[pair]
            }

        assert isinstance(granule2, StreamGranuleContainer), 'object is not a granule.'

        assert granule1.identifiables.has_key('time_bounds'), 'object has no time bounds and therefore is invalid.'

        assert granule2.identifiables.has_key('time_bounds'), 'object has no time bounds and therefore is invalid.'

        #-------------------------------------------------------------------------------------
        # First step is figure out where each granule belongs on the timeline
        # We do this with a tuple consisting of the point in the timeline and the filename
        # These will get stable sorted later
        #-------------------------------------------------------------------------------------

        pair1 = (
            granule1.identifiables['time_bounds'].value_pair[0],
            '%s.hdf5' % granule1.identifiables[encoding_id].sha1
            )

        pair2 = (
            granule2.identifiables['time_bounds'].value_pair[0],
            '%s.hdf5' % granule2.identifiables[encoding_id].sha1
            )

        files = []

        if encoding_id in granule1.identifiables:
            if granule1.identifiables[encoding_id].sha1:
                files.append('%s.hdf5' % granule1.identifiables[encoding_id].sha1)
        if encoding_id in granule2.identifiables:
            if granule2.identifiables[encoding_id].sha1:
                files.append('%s.hdf5' % granule2.identifiables[encoding_id].sha1)

        element_count_id = DefinitionTree.get(definition,'%s.element_count_id' % definition.data_stream_id)
        record_count = 0
        if element_count_id in granule1.identifiables:
            record_count += granule1.identifiables[element_count_id].value
        if element_count_id in granule2.identifiables:
            record_count += granule2.identifiables[element_count_id].value

        if not element_count_id in granule1.identifiables:
            granule1.identifiables[element_count_id] = CountElement()
            granule1.identifiables[element_count_id].value = record_count
        else:
            granule1.identifiables[element_count_id].value = record_count

        fields1 = ReplayProcess._list_data(definition, granule1)
        fields2 = ReplayProcess._list_data(definition, granule2)
        #@todo albeit counterintuitive an intersection is the only thing I can support
        merged_paths = {}
        for k,v in fields1.iteritems():
            if fields2.has_key(k):
                merged_paths[k] = v



        for k,v in granule2.identifiables.iteritems():
            # Switch(value):

            # Case Bounds:
            if isinstance(v, QuantityRangeElement):
                # If its not in granule1 just throw it in there
                if k not in granule1.identifiables:
                    granule1.identifiables[k] = v
                else:
                    bounds1 = granule1.identifiables[k].value_pair
                    bounds2 = granule2.identifiables[k].value_pair
                    bounds = np.append(bounds1,bounds2)
                    granule1.identifiables[k].value_pair = [np.nanmin(bounds), np.nanmax(bounds)]


            if isinstance(v, RangeSet): #Including coordinate axis
                if merged_paths.has_key(k) and not granule1.identifiables.has_key(k):
                    granule1.identifiables[k] = v # Copy it over

        # Now make sure granule1 doesn't have excess stuff
        del_list = []
        for k,v in granule1.identifiables.iteritems():
            if isinstance(v, RangeSet):
                if not merged_paths.has_key(k):
                    del_list.append(k)

        for item in del_list:
            del granule1.identifiables[item]



        return {
            'granule':granule1,
            'files':[pair1, pair2]
        }
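        # Sketch of the returned structure (hypothetical start times and sha1s):
        # {'granule': <granule1, now granule1 U granule2>,
        #  'files': [(0.0, 'AAAA....hdf5'), (3600.0, 'BBBB....hdf5')]}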




    @staticmethod
    def _list_data(definition, granule):
        '''
        @brief Lists all the fields in the granule based on the Stream Definition
        @param definition Stream Definition
        @param granule Stream Granule
        @return dict of field_id : values_path for each field_id that exists
        '''
        from interface.objects import StreamDefinitionContainer, StreamGranuleContainer, RangeSet, CoordinateAxis
        assert isinstance(definition, StreamDefinitionContainer), 'object is not a definition.'
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule. its a %s' % type(granule)
        retval = {}
        for key, value in granule.identifiables.iteritems():
            if isinstance(value, RangeSet):
                values_path = value.values_path or definition.identifiables[key].values_path
                retval[key] = values_path

            elif isinstance(value, CoordinateAxis):
                values_path = value.values_path or definition.identifiables[key].values_path
                retval[key] = values_path

        return retval
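        # Example return value (hypothetical identifiers and paths):
        # {'temp_range_id': '/fields/temp', 'time_coord_id': '/coordinates/time'}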



    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description
             n
        D := U [ msgs_i ]
            i=0
        '''
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------


        for i in xrange(count):
            if i==0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append( tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])


            else:
                res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append( tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the lists using a stable sort from python (by the first value in the tuples)
        # Then peel off just the file names
        # Then get the appropriate URL for the file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list])

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row,value in data.iteritems():
            value_path = self._find_vp(pairs,row)
            codec.add_hdf_dataset(value_path,nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule,hdf_string)
        return granule

    def _patch_granule(self, granule, hdf_string):
        '''
        @brief Adds the hdf_string and sha1 to the granule
        @param granule Stream Granule
        @param hdf_string string consisting of raw bytes from an hdf5 file
        '''
        granule.identifiables[self.data_stream_id].values = hdf_string
        granule.identifiables[self.encoding_id].sha1 = hashlib.sha1(hdf_string).hexdigest().upper()


    def time_subset(self, granule, time_bounds):
        '''
        @brief Obtains a subset of the granule dataset based on the specified time_bounds
        @param granule Dataset
        @param time_bounds tuple consisting of a lower and upper bound
        @return A subset of the granule's dataset based on the time boundaries.
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        lower = time_bounds[0]-1
        upper = time_bounds[1]
        granule = self._slice(granule, slice(lower,upper))
        return granule



    def _get_time_index(self, granule, timeval):
        '''
        @brief Obtains the index where a time's value is
        @param granule must be a complete dataset (hdf_string provided)
        @param timeval the vector value
        @return Index value for timeval or closest approx such that timeval is IN the subset
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        assert granule.identifiables[self.data_stream_id].values, 'hdf_string is not provided.'

        hdf_string = granule.identifiables[self.data_stream_id].values
        file_path = self._get_hdf_from_string(hdf_string)

        #-------------------------------------------------------------------------------------
        # Determine the field_id for the temporal coordinate vector (aka time)
        #-------------------------------------------------------------------------------------

        time_field = self.definition.identifiables[self.time_id].coordinate_ids[0]
        value_path = granule.identifiables[time_field].values_path or self.definition.identifiables[time_field].values_path
        record_count = granule.identifiables[self.element_count_id].value

        #-------------------------------------------------------------------------------------
        # Go through the time vector and get the indexes that correspond to the timeval
        # It will find a value such that
        # t_n <= i < t_(n+1), where i is the index
        #-------------------------------------------------------------------------------------


        var_name = value_path.split('/').pop()
        res = acquire_data([file_path], [var_name], record_count).next()
        time_vector = res[var_name]['values']
        retval = 0
        for i in xrange(len(time_vector)):
            if time_vector[i] == timeval:
                retval = i
                break
            elif i==0 and time_vector[i] > timeval:
                retval = i
                break
            elif (i+1) < len(time_vector): # not last val
                if time_vector[i] < timeval and time_vector[i+1] > timeval:
                    retval = i
                    break
            else: # last val
                retval = i
                break
        FileSystem.unlink(file_path)
        return retval

    def _get_hdf_from_string(self, hdf_string):
        '''
        @param hdf_string binary string consisting of an HDF5 file.
        @return temporary file (full path) where the string was written to.
        @note the caller is responsible for unlinking the file when finished.
        '''
        f = FileSystem.mktemp()
        f.write(hdf_string)
        retval = f.name
        f.close()
        return retval


    def subset(self,granule,coverages):
        '''
        @param granule Dataset granule
        @param coverages List of field_ids to restrict the rangesets to
        @return dataset subset based on the requested fields
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id


        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages, type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW stream definition and check for rangesets and coordinate axes
        #  - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
        #-----------------------------------------------------------------------------------------------------------


        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------


            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If it's a rangeset that the client asked for and the granule has it, add it to the list
                if field_id in coverages:
                    if granule.identifiables.has_key(range_id):
                        log.debug('got a range: %s' % range_id)
                        coverage_ids.append(field_id)
                        if granule.identifiables.has_key(range_id):
                            value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                            values_path.append(value_path)
                        else:
                            value_path = self.definition.identifiables[range_id].values_path
                            values_path.append(value_path)
                        continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id


                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],values_path,granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row,value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule,hdf_string)

        FileSystem.unlink(file_path)

        return granule
Exemple #29
0
class ReplayProcess(BaseReplayProcess):
    process_type="standalone"
    def __init__(self, *args, **kwargs):
        super(ReplayProcess, self).__init__(*args,**kwargs)
        #@todo Init stuff
        # mutex for shared resources between threads
        self.lock = RLock()
        
    def on_start(self):
        '''
        Creates a publisher for each stream_id passed in as publish_streams
        Creates an attribute with the name matching the stream name which corresponds to the publisher
        ex: say we have publish_streams:{'output': my_output_stream_id }
          then the instance has an attribute output which corresponds to the publisher for the stream
          in my_output_stream_id
        '''
        self.stream_publisher_registrar = StreamPublisherRegistrar(process=self,node=self.container.node)


        # Get the query
        self.query = self.CFG.get_safe('process.query',{})

        # Get the delivery_format
        self.delivery_format = self.CFG.get_safe('process.delivery_format',{})
        self.datastore_name = self.CFG.get_safe('process.datastore_name','dm_datastore')

        self.view_name = self.CFG.get_safe('process.view_name','datasets/dataset_by_id')
        self.key_id = self.CFG.get_safe('process.key_id')
        # Get a stream_id for this process
        self.stream_id = self.CFG.get_safe('process.publish_streams.output',{})



        if not (self.stream_id and hasattr(self,'output')):
            raise RuntimeError('The replay agent requires an output stream publisher named output. Invalid configuration!')



    def _records(self, records, n):
        """
        Given a list of records, yield at most n at a time
        """
        while True:
            yval = []
            try:
                for i in xrange(n):
                    yval = yval + [records.pop(0)]
                yield yval
            except IndexError:
                if yval:
                    yield yval
                break
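    # Usage sketch (hypothetical input list):
    #   list(self._records([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]
    # (the trailing short chunk is yielded by the IndexError branch).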

    def _publish_query(self, results):
        '''
        Callback to publish the specified results
        '''
        #-----------------------
        # Iteration
        #-----------------------
        #  - Go through the results, if the user had include_docs=True in the options field
        #    then the full document is in result.doc; however if the query did not include_docs,
        #    then only the doc_id is provided in the result.value.
        #
        #  - This lets us limit the amount of data transferred for large queries.
        #    If we are only making one query in a sequence of queries (such as map and reduce) then we don't
        #    care about the full document yet; we only care about the doc id and will retrieve the document later.
        #  - Example:
        #      Imagine the blogging example, we want the latest blog by author George and all the comments for that blog
        #      The series of queries would go, post_by_updated -> posts_by_author -> posts_join_comments and then
        #      in the last query we'll set include_docs to true and parse the docs.
        #-----------------------
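        # Sketch of a typical result row (hypothetical values): with include_docs=True a row
        # looks like {'id': ..., 'key': [...], 'value': ..., 'doc': <full document>}, whereas
        # without include_docs only 'id', 'key' and 'value' are present.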


        log.warn('results: %s', results)

        if results is None:
            log.warn('No results found in replay query!')
            return

        for result in results:
            log.warn('REPLAY Result: %s' % result)



            assert('doc' in result)

            replay_obj_msg = result['doc']

            if isinstance(replay_obj_msg, BlogBase):
                replay_obj_msg.is_replay = True

                self.lock.acquire()
                self.output.publish(replay_obj_msg)
                self.lock.release()

            elif isinstance(replay_obj_msg, StreamDefinitionContainer):

                replay_obj_msg.stream_resource_id = self.stream_id


            elif isinstance(replay_obj_msg, StreamGranuleContainer):

                # Override the resource_stream_id so ingestion doesn't reingest, also this is a NEW stream (replay)
                replay_obj_msg.stream_resource_id = self.stream_id

                datastream = None
                sha1 = None

                for key, identifiable in replay_obj_msg.identifiables.iteritems():
                    if isinstance(identifiable, DataStream):
                        datastream = identifiable
                    elif isinstance(identifiable, Encoding):
                        sha1 = identifiable.sha1

                if sha1: # if there is an encoding

                    # Get the file from disk
                    filename = FileSystem.get_url(FS.CACHE, sha1, ".hdf5")

                    log.warn('Replay reading from filename: %s' % filename)

                    hdf_string = ''
                    try:
                        with open(filename, mode='rb') as f:
                            hdf_string = f.read()

                            # Check the sha1
                            retrieved_hdfstring_sha1 = hashlib.sha1(hdf_string).hexdigest().upper()

                            if sha1 != retrieved_hdfstring_sha1:
                                raise ReplayProcessException('sha1 mismatch between the sha1 in the datastream and the sha1 of the hdf_string read from the saved file in hdf storage')

                    except IOError:
                        log.warn('No HDF file found!')
                        #@todo deal with this situation? How?
                        hdf_string = 'HDF File %s not found!' % filename

                    # set the datastream.value field!
                    datastream.values = hdf_string

                else:
                    log.warn('No encoding in the StreamGranuleContainer!')

                self.lock.acquire()
                self.output.publish(replay_obj_msg)
                self.lock.release()


            else:
                log.warn('Unknown type retrieved in DOC!')



        log.debug('Published replay!')


    def execute_replay(self):
        log.debug('(Replay Agent %s)', self.name)

        # Handle the query
        datastore_name = self.datastore_name
        key_id = self.key_id


        # Got the post ID, pull the post and the comments
        view_name = self.view_name
        opts = {
            'start_key':[key_id, 0],
            'end_key':[key_id,2],
            'include_docs': True
        }
        g = Greenlet(self._query,datastore_name=datastore_name, view_name=view_name, opts=opts,
            callback=lambda results: self._publish_query(results))
        g.start()




    def _query(self,datastore_name='dm_datastore', view_name='posts/posts_by_id', opts={}, callback=None):
        '''
        Performs the query action
        '''
        log.debug('Couch Query:\n\t%s\n\t%s\n\t%s', datastore_name, view_name, opts)
        #@todo: Fix this datastore management profile with correct data profile in near future
        db = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.EXAMPLES, self.CFG)


        ret = db.query_view(view_name=view_name,opts=opts)

        callback(ret)
Exemple #30
0
        raw_data = resp.read()
        try:
            data = json.loads(raw_data)
        except ValueError:
            raise BuyException("Could not parse json: {}".format(raw_data))

        if "erroMsg" in data:
            raise BuyException("Error: {}".format(raw_data))

        if data["isFraud"] == "true":
            raise BuyException("Should not be fraud: {}".format(raw_data))


if __name__ == "__main__":
    from gevent.coros import RLock
    stats_lock = RLock()
    errors = []
    success_times = []
    error_times = []
    total_requests = 0
    REPORT_EACH = 50
    hostname = "dcf-ces63.appspot.com"
    print "Testing", hostname

    def run(customer, nreqs):
        global total_requests
        for i in range(nreqs):
            t = time.time()
            last_error = None
            try:
                customer.buy_something()
Exemple #31
0
class aioFile(object):
    """a buffered File like object that uses pyaio and gevent"""
    def __init__(self, filename, mode='r', buffer=16<<10):
        modes = os.O_LARGEFILE | os.O_CREAT
        self._offset = 0
        self._buffer_size = buffer
        if buffer:
            self._buffer_lock = RLock()
        self._read = False
        self._write = False
        self._read_buf = None
        self._write_buf = None
        self._eof = False   # Optimization to limit calls
        self._append = False   # Append Mode writes ignore offset
        self._stay_alive = gevent.spawn(_keep_awake)
        if mode.startswith('r') or '+' in mode:
            self._read = True
            self._read_buf = bytearray()
            if '+' not in mode:
                modes |= os.O_RDONLY
        if mode.startswith('w') or mode.startswith('a') or '+' in mode:
            if mode.startswith('w'):
                modes |= os.O_TRUNC
            self._write = True
            self._write_buf = bytearray()
            self._flush = False
            if '+' not in mode:
                modes |= os.O_WRONLY
        if '+' in mode:
            modes |= os.O_RDWR
        if mode.startswith('a'):
            modes |= os.O_APPEND
            self._append = True
        self._fd = os.open(filename, modes)

    def _clear_read_buf(self):
        if self._read:
            self._eof = False
            del self._read_buf[0:]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self.flush()
        os.close(self._fd)
        self._stay_alive.kill()

    def stat(self):
        return os.fstat(self._fd)

    def seek(self, pos, how=os.SEEK_SET):
        """Change the file position; this clears the read cache and flushes writes.
        It also clears the EOF flag for the file."""
        offset = self._offset
        if how != os.SEEK_CUR and how != os.SEEK_END and how != os.SEEK_SET:
            raise OSError(14,
                'Invalid seek point; use os.SEEK_SET, os.SEEK_CUR or os.SEEK_END')
        if how == os.SEEK_CUR:
            offset += pos
        elif how == os.SEEK_END:
            #Ugh this could be hairy if we have outstanding writes
            offset = self.stat().st_size + pos
        else:
            offset = pos
        if offset < 0:
            raise OSError(14, 'File Position invalid, less than 0')
        #Even if the pos didn't change fix the buffers and EOF
        self._clear_read_buf()
        if not self._append:   # DON'T FLUSH on seek with append
            self.flush()
        self._offset = offset
        return offset
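        # e.g. (hypothetical offsets) seek(0, os.SEEK_END) positions at EOF and
        # seek(-10, os.SEEK_END) positions 10 bytes before EOF; both clear the
        # read cache and, unless in append mode, flush pending writes.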

    def flush(self):
        """Flush write buffer"""
        if self._write and self._buffer_size:
            self._flush = True
            while len(self._write_buf):
                self.write(None)
            self._flush = False

    def _read_file(self):
        fbuf = bytearray()
        while True:
            part = self.read(16 << 10)  # Read 16k
            if part is None:  # EOF
                break
            fbuf.extend(part)
        return fbuf

    def write(self, buf, offset=None):
        """write a buffer object to file"""
        if not self._write:
            raise IOError(9, 'Bad file descriptor')
        if not self._append and self._buffer_size and self._read_buf:
            # We should clear the read cache
            self._clear_read_buf()
        if offset is None:
            offset = self._offset
        write_size = self._buffer_size
        if not self._buffer_size and buf:
            write_size = len(buf)
        if not self._append and offset != self._offset:
            self.seek(offset)  # Makes sure we write our buffer

        # If we buffer, we use the shared write buffer; if not, we use a local buffer
        if self._buffer_size:
            lbuf = self._write_buf
            # A memoryview of this buffer is handed to pyaio, so we need to hold the lock
            self._buffer_lock.acquire()
            if buf:
                lbuf.extend(buf)
        else:
            lbuf = buf

        while lbuf and len(lbuf) >= self._buffer_size \
                or (self._flush and lbuf):
            result = AsyncResult()
            def _write_results(rcode, errno):
                result.set((rcode, errno))
            pyaio.aio_write(self._fd, memoryview(lbuf)[0:write_size],
                            offset, _write_results)
            rcode, errno = result.get()  #SLEEP

            if rcode < 0:   # Some kind of error
                raise IOError(errno, 'AIO Write Error %d' % errno)
            # Clean up buffer (of actually written bytes)
            if self._buffer_size:
                del lbuf[0:rcode]
            else:
                lbuf = None
            self._offset = offset = offset + rcode  # Move the file offset
        if self._buffer_size:
            self._buffer_lock.release()
        if buf:
            return len(buf)
        else:
            return 0

    def read(self, size=0, offset=None):
        """Read size bytes from the file, or the entire file if size is 0.
        For speed we assume EOF after the first short read."""
        if not self._read:
            raise IOError(9, 'Bad file descriptor')
        if not self._append and self._buffer_size and self._write_buf:
            self.flush()
        if offset is None:
            offset = self._offset
        if offset != self._offset:
            self.seek(offset)  # To make sure we blow away our read cache
        if size == 0:  # Attempt to read the entire file and return it in a single call
            return self._read_file()
        else:
            rbuf = bytearray()  # Holding Place for multiple reads
            while len(rbuf) < size:  # People get what they ask for
                # If we don't want to buffer then just read what they want
                if len(self._read_buf) < size - len(rbuf) and not self._eof:
                    # OK, the read buffer is short, so let's do a read
                    result = AsyncResult()
                    def _read_results(buf, rcode, errno):
                        result.set((buf, rcode, errno))
                    read_size = size - len(rbuf)
                    if self._buffer_size:   # If we buffer read buffer instead
                        read_size = self._buffer_size
                    pyaio.aio_read(self._fd, offset, read_size, _read_results)
                    buf, rcode, errno = result.get()  #SLEEP
                    if rcode < 0:  # Some kind of error
                        raise IOError(errno, 'AIO Read Error %d' % errno)
                    # rcode will be the number of bytes read, so advance the offset
                    self._offset = offset = offset + rcode
                    if self._buffer_size:
                        self._read_buf.extend(buf)
                    else:
                        rbuf = buf  # Pass through because we are not buffering
                    if rcode == 0 or rcode < read_size:  # Good Enough
                        self._eof = True
                #Do a buffer read
                toread = size - len(rbuf)
                if self._buffer_size:
                    rbuf.extend(memoryview(self._read_buf)[0:toread])
                    #Clean up read buffer
                    del self._read_buf[0:toread]
                if not self._read_buf and self._eof:  # Empty buffer and eof
                    break
            if self._eof and not rbuf:
                return None  #EOF NO DATA
            else:
                return rbuf
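# A minimal, hypothetical usage sketch for the aioFile class above (not part of the
# original example); it assumes pyaio/gevent are available and that the path used
# below is writable.
def _aiofile_roundtrip_sketch(path='/tmp/aiofile_sketch.dat'):
    with aioFile(path, mode='w') as f:
        f.write(b'hello aio')          # buffered; flushed when the file is closed
    with aioFile(path, mode='r') as f:
        data = f.read()                # size=0 reads the whole file
    return bytes(data)                 # -> 'hello aio'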
Exemple #32
0
 def __init__(self, next_sink):
     super(RefCountedSink, self).__init__()
     self._ref_count = 0
     self._open_lock = RLock()
     self._open_ar = None
     self.next_sink = next_sink
class VizTransformProcForMatplotlibGraphs(TransformDataProcess):
    """
    This class is used for instantiating worker processes that have subscriptions to data streams and convert
    incoming data from CDM format to Matplotlib graphs

    """
    def on_start(self):
        super(VizTransformProcForMatplotlibGraphs, self).on_start()
        #assert len(self.streams)==1
        self.initDataFlag = True
        self.graph_data = {
        }  # Stores a dictionary of variables : [List of values]

        # Need some clients
        self.rr_cli = ResourceRegistryServiceProcessClient(
            process=self, node=self.container.node)
        self.pubsub_cli = PubsubManagementServiceClient(
            node=self.container.node)

        # extract the various parameters passed to the transform process
        self.out_stream_id = self.CFG.get('process').get(
            'publish_streams').get('visualization_service_submit_stream_id')

        # Create a publisher on the output stream
        #stream_route = self.pubsub_cli.register_producer(stream_id=self.out_stream_id)
        out_stream_pub_registrar = StreamPublisherRegistrar(
            process=self.container, node=self.container.node)
        self.out_stream_pub = out_stream_pub_registrar.create_publisher(
            stream_id=self.out_stream_id)

        self.data_product_id = self.CFG.get('data_product_id')
        self.stream_def_id = self.CFG.get("stream_def_id")
        self.stream_def = self.rr_cli.read(self.stream_def_id)

        # Start the thread responsible for keeping track of time and generating graphs
        # Mutex for ensuring proper concurrent communications between threads
        self.lock = RLock()
        self.rendering_proc = Greenlet(self.rendering_thread)
        self.rendering_proc.start()

    def process(self, packet):
        log.debug('(%s): Received Viz Data Packet' % self.name)
        #log.debug('(%s):   - Processing: %s' % (self.name,packet))

        # parse the incoming data
        psd = PointSupplementStreamParser(
            stream_definition=self.stream_def.container, stream_granule=packet)

        # re-arrange incoming data into an easy to parse dictionary
        vardict = {}
        arrLen = None
        for varname in psd.list_field_names():
            vardict[varname] = psd.get_values(varname)
            arrLen = len(vardict[varname])

        if self.initDataFlag:
            # look at the incoming packet and store
            for varname in psd.list_field_names():
                self.lock.acquire()
                self.graph_data[varname] = []
                self.lock.release()

            self.initDataFlag = False

        # If code reached here, the graph data storage has been initialized. Just add values
        # to the list
        with self.lock:
            for varname in psd.list_field_names():
                self.graph_data[varname].extend(vardict[varname])

    def rendering_thread(self):
        from copy import deepcopy
        # Service Client

        # init Matplotlib
        fig = Figure()
        ax = fig.add_subplot(111)
        canvas = FigureCanvas(fig)
        imgInMem = StringIO.StringIO()
        while True:

            # Sleep for a pre-decided interval. Should be specifiable in a YAML file
            gevent.sleep(20)

            # If there's no data, wait
            # The lock is used here to make sure the entire vector exists start to finish; this ensures
            # that the data won't be modified while it is being copied
            working_set = None
            with self.lock:
                if len(self.graph_data) == 0:
                    continue
                else:
                    working_set = deepcopy(self.graph_data)

            # For the simple case of testing, lets plot all time variant variables one at a time
            xAxisVar = 'time'
            xAxisFloatData = working_set[xAxisVar]

            for varName, varData in working_set.iteritems():
                if varName == 'time' or varName == 'height' or varName == 'longitude' or varName == 'latitude':
                    continue

                yAxisVar = varName
                yAxisFloatData = working_set[varName]

                # Generate the plot

                ax.plot(xAxisFloatData, yAxisFloatData, 'ro')
                ax.set_xlabel(xAxisVar)
                ax.set_ylabel(yAxisVar)
                ax.set_title(yAxisVar + ' vs ' + xAxisVar)
                ax.set_autoscale_on(False)

                # generate filename for the output image
                fileName = yAxisVar + '_vs_' + xAxisVar + '.png'
                # Save the figure to the in-memory file, resetting the buffer
                # first so bytes left over from a previous (larger) image are
                # not carried into this one
                imgInMem.seek(0)
                imgInMem.truncate()
                canvas.print_figure(imgInMem, format="png")

                # submit resulting table back using the out stream publisher
                msg = {
                    "viz_product_type": "matplotlib_graphs",
                    "data_product_id": self.data_product_id,
                    "image_obj": imgInMem.getvalue(),
                    "image_name": fileName
                }
                self.out_stream_pub.publish(msg)

                #clear the canvas for the next image
                ax.clear()
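
# A minimal, self-contained sketch (not from the original source) of the
# locking pattern used above: the producer appends to the shared series under
# the RLock, while the renderer takes a deepcopy snapshot under the same lock
# and then plots from the copy, so it never sees a half-updated vector.
# Note: older gevent exposes RLock as gevent.coros.RLock (as in this file);
# newer releases provide it as gevent.lock.RLock.
from copy import deepcopy

import gevent
from gevent.coros import RLock

shared_series = {'time': [], 'temperature': []}
series_lock = RLock()

def producer():
    for i in range(100):
        with series_lock:
            shared_series['time'].append(i)
            shared_series['temperature'].append(20.0 + 0.1 * i)
        gevent.sleep(0)

def renderer():
    for _ in range(5):
        gevent.sleep(0.01)
        with series_lock:
            snapshot = deepcopy(shared_series)  # consistent copy
        # plot from `snapshot` here, outside the lock
        print('rendering %d points' % len(snapshot['time']))

gevent.joinall([gevent.spawn(producer), gevent.spawn(renderer)])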
Exemple #35
0
'''
basic data-parallel main function module
'''

from gevent.coros import RLock
from gevent.pool import Pool

import logging
from data import load, save_result, strip_ephemeral
from serial_process import process, required_actions_count, print_progress_info
logger = logging.getLogger(__name__)

PROCESSED = 0
TOTAL = 0

REPORT_LOCK = RLock()


def target(ostream, params, parallel_tests, sorted_mode):
    global PROCESSED, REPORT_LOCK, TOTAL
    for result in process(params,
                          pool_size=parallel_tests,
                          sorted_mode=sorted_mode):
        with REPORT_LOCK:
            PROCESSED += 1
            print_progress_info(PROCESSED, TOTAL)
            save_result(ostream, strip_ephemeral(result))
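
# A hypothetical driver (not part of the original module) showing how target()
# could be fanned out across several parameter sets with a gevent Pool while
# REPORT_LOCK keeps progress reporting and result saving serialized. The
# `param_sets` and `total` arguments are illustrative assumptions.
def _run_all(ostream, param_sets, total, parallel_tests=4, sorted_mode=False):
    global TOTAL
    TOTAL = total
    pool = Pool(len(param_sets))
    for params in param_sets:
        pool.spawn(target, ostream, params, parallel_tests, sorted_mode)
    pool.join()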


def main(conf, istream, ostream, test_whitelist, test_blacklist,
         stage_whitelist, stage_blacklist, tags_whitelist, tags_blacklist,
Exemple #36
0
 def __init__(self):
     self._log = logging.getLogger("Deliverator")
     self._active_requests = dict()
     self._lock = RLock()
Exemple #37
0
 def __init__(self):
     Scheduler.__init__(self)
     from gevent.coros import RLock
     self._lock = RLock()
Exemple #38
0
 def __init__(self, *args, **kwargs):
     super(GreenQueuePool, self).__init__(*args, **kwargs)
     if self._overflow_lock is not None:
         self._overflow_lock = RLock()
Exemple #39
0
 def __init__(self, *args, **kwargs):
     super(ReplayProcess, self).__init__(*args,**kwargs)
     #@todo Init stuff
     # mutex for shared resources between threads
     self.lock = RLock()
Exemple #40
0
 def __init__(self, node):
     transport = TSocket.TSocket(str(node.host), int(node.port))
     self.transport = TTransport.TBufferedTransport(transport)
     protocol = TBinaryProtocol.TBinaryProtocol(transport)
     self.client = AtlasNode.Client(protocol)
     self.lock = RLock()
Exemple #41
0
 def __init__(self, *args, **kwargs):
     super(ReplayProcess, self).__init__(*args, **kwargs)
     self.lock = RLock()
Exemple #42
0
 def __init__(self, addr, timeout=2):
     self.addr = addr
     self.sock = None
     self.unpacker = None
     self.timeout = timeout
     self.lock = RLock()
Exemple #43
0
class aioFile(object):
    """a buffered File like object that uses pyaio and gevent"""
    def __init__(self, filename, mode='r', buffer=16 << 10):
        modes = os.O_LARGEFILE | os.O_CREAT
        self._offset = 0
        self._buffer_size = buffer
        if buffer:
            self._buffer_lock = RLock()
        self._read = False
        self._write = False
        self._read_buf = None
        self._write_buf = None
        self._eof = False  # Optimization to limit calls
        self._append = False  # Append Mode writes ignore offset
        self._stay_alive = gevent.spawn(_keep_awake)
        if mode.startswith('r') or '+' in mode:
            self._read = True
            self._read_buf = bytearray()
            if '+' not in mode:
                modes |= os.O_RDONLY
        if mode.startswith('w') or mode.startswith('a') or '+' in mode:
            if mode.startswith('w'):
                modes |= os.O_TRUNC
            self._write = True
            self._write_buf = bytearray()
            self._flush = False
            if '+' not in mode:
                modes |= os.O_WRONLY
        if '+' in mode:
            modes |= os.O_RDWR
        if mode.startswith('a'):
            modes |= os.O_APPEND
            self._append = True
        self._fd = os.open(filename, modes)

    def _clear_read_buf(self):
        if self._read:
            self._eof = False
            del self._read_buf[0:]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self.flush()
        os.close(self._fd)
        self._stay_alive.kill()

    def stat(self):
        return os.fstat(self._fd)

    def seek(self, pos, how=os.SEEK_SET):
        """Change the file pos, will clear read cache and flush writes """ \
        """This will also clear the EOF flag for the file"""
        offset = self._offset
        if how != os.SEEK_CUR and how != os.SEEK_END and how != os.SEEK_SET:
            raise OSError(
                14,
                'Invalid seek point use os.SEEK_SET, os.SEEK_CUR, os.SEEK_END')
        if how == os.SEEK_CUR:
            offset += pos
        elif how == os.SEEK_END:
            #Ugh, this could be hairy if we have outstanding writes
            offset = self.stat().st_size + pos
        else:
            offset = pos
        if offset < 0:
            raise OSError(14, 'File Position invalid, less than 0')
        #Even if the pos didn't change fix the buffers and EOF
        self._clear_read_buf()
        if not self._append:  # DON'T FLUSH on seek with append
            self.flush()
        self._offset = offset
        return offset

    def flush(self):
        """Flush write buffer"""
        if self._write and self._buffer_size:
            self._flush = True
            while len(self._write_buf):
                self.write(None)
            self._flush = False

    def _read_file(self):
        fbuf = bytearray()
        while True:
            part = self.read(16 << 10)  # Read 16k
            if part is None:  # EOF
                break
            fbuf.extend(part)
        return fbuf

    def write(self, buf, offset=None):
        """write a buffer object to file"""
        if not self._write:
            raise IOError(9, 'Bad file descriptor')
        if not self._append and self._buffer_size and self._read_buf:
            # We should clear read cache
            self._clear_read_buf()
        if offset is None:
            offset = self._offset
        write_size = self._buffer_size
        if not self._buffer_size and buf:
            write_size = len(buf)
        if not self._append and offset != self._offset:
            self.seek(offset)  # Makes sure we write our buffer

        #If we buffer, we use the shared global buffer; if not, a local buffer
        if self._buffer_size:
            lbuf = self._write_buf
            # A memoryview of this shared buffer is handed to pyaio below, so
            # hold the lock; it is released in the finally clause even if an
            # AIO write fails
            self._buffer_lock.acquire()
            if buf:
                lbuf.extend(buf)
        else:
            lbuf = buf

        try:
            while lbuf and len(lbuf) >= self._buffer_size \
                    or (self._flush and lbuf):
                result = AsyncResult()

                def _write_results(rcode, errno):
                    result.set((rcode, errno))

                pyaio.aio_write(self._fd,
                                memoryview(lbuf)[0:write_size], offset,
                                _write_results)
                rcode, errno = result.get()  #SLEEP

                if rcode < 0:  # Some kind of error
                    raise IOError(errno, 'AIO Write Error %d' % errno)
                # Clean up buffer (of actually written bytes)
                if self._buffer_size:
                    del lbuf[0:rcode]
                else:
                    lbuf = None
                self._offset = offset = offset + rcode  # Move the file offset
        finally:
            # Always release the buffer lock, even if the AIO write raised
            if self._buffer_size:
                self._buffer_lock.release()
        if buf:
            return len(buf)
        else:
            return 0

    def read(self, size=0, offset=None):
        """read a size of bytes from the file, or entire file if 0 """ \
        """for speed we assume EOF after first short read"""
        if not self._read:
            raise IOError(9, 'Bad file descriptor')
        if not self._append and self._buffer_size and self._write_buf:
            self.flush()
        if offset is None:
            offset = self._offset
        if offset != self._offset:
            self.seek(offset)  # To make sure we blow away our read cache
        if size == 0:  # Attempt to read entire file and return in a single return
            return self._read_file()
        else:
            rbuf = bytearray()  # Holding Place for multiple reads
            while len(rbuf) < size:  # People get what they ask for
                # If we don't want to buffer then just read what they want
                if len(self._read_buf) < size - len(rbuf) and not self._eof:
                    #Ok we are buffer short so lets do a read
                    result = AsyncResult()

                    def _read_results(buf, rcode, errno):
                        result.set((buf, rcode, errno))

                    read_size = size - len(rbuf)
                    if self._buffer_size:  # If we buffer read buffer instead
                        read_size = self._buffer_size
                    pyaio.aio_read(self._fd, offset, read_size, _read_results)
                    buf, rcode, errno = result.get()  #SLEEP
                    if rcode < 0:  # Some kind of error
                        raise IOError(errno, 'AIO Read Error %d' % errno)
                    #Rcode will be the bytes read so lets push the offset
                    self._offset = offset = offset + rcode
                    if self._buffer_size:
                        self._read_buf.extend(buf)
                    else:
                        rbuf = buf  # Pass through because we are not buffering
                    if rcode == 0 or rcode < read_size:  # Good Enough
                        self._eof = True
                #Do a buffer read
                toread = size - len(rbuf)
                if self._buffer_size:
                    rbuf.extend(memoryview(self._read_buf)[0:toread])
                    #Clean up read buffer
                    del self._read_buf[0:toread]
                if not self._read_buf and self._eof:  # Empty buffer and eof
                    break
            if self._eof and not rbuf:
                return None  #EOF NO DATA
            else:
                return rbuf
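
# A minimal usage sketch of aioFile (not part of the original source); it
# assumes pyaio is importable and that the module-level _keep_awake helper
# referenced in __init__ exists. Reads and writes go through the internal
# buffers above, the real I/O is dispatched via pyaio, and the calling
# greenlet sleeps on an AsyncResult until each request completes.
def _copy_file(src_path, dst_path):
    with aioFile(src_path, 'r') as src, aioFile(dst_path, 'w') as dst:
        data = src.read()           # size=0 reads the whole file
        if data:
            dst.write(bytes(data))  # buffered; flushed when dst is closed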
Exemple #44
0
LINE_PER_FILE = 10000

# HTTPError 403: IP request rate limit exceeded when going through a proxy
# 10022   IP requests out of rate limit   (IP request frequency over the limit)
# 10023   User requests out of rate limit (user request frequency over the limit)
# 10024   User requests for (%s) out of rate limit    (user requests to the special endpoint (%s) over the limit)
ERROR_NORMAL = 0
ERROR_API = -1
ERROR_RATE = -2

task_queue = gevent.queue.JoinableQueue(10000)
result_queue = gevent.queue.JoinableQueue(10000)
log_queue = gevent.queue.JoinableQueue(1000)

live_signal = 0
log_lock = RLock()
live_lock = RLock()
logger = get_logger(LOG_FILE)


def wait_time(proxy):
    try:
        rl = api.rate_limit(proxy=proxy)
    except Exception, e:
        rl = None

    if rl:
        if rl['remaining_ip_hits'] > 1 and rl['remaining_user_hits'] > 1:
            return 1
        return rl['reset_time_in_seconds'] + 1
    now = datetime.now()
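
# A hypothetical worker sketch (not from the original source) showing how the
# queues and locks declared above are commonly wired together with gevent:
# workers pull items from task_queue, push results to result_queue, and use
# log_lock so log lines from different greenlets do not interleave. The
# crawl_one() helper is an illustrative placeholder.
def _worker(proxy):
    while True:
        item = task_queue.get()
        try:
            result_queue.put(crawl_one(item, proxy))  # illustrative helper
        except Exception, e:
            with log_lock:
                logger.error('task %r failed: %s' % (item, e))
        finally:
            task_queue.task_done()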
Exemple #45
0
 def __init__(self, states, events, enter_event, exit_event):
     self._lock = RLock()
     super(ThreadSafeFSM, self).__init__(states, events, enter_event, exit_event)
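
# A minimal sketch (not from the original source) of why a *reentrant* lock is
# the right choice here: an FSM event handler that runs while the lock is held
# may itself trigger another event on the same object from the same greenlet,
# which re-acquires the lock without deadlocking. The _Counter class below is
# purely illustrative.
from gevent.coros import RLock

class _Counter(object):
    def __init__(self):
        self._lock = RLock()
        self._value = 0

    def increment(self):
        with self._lock:
            self._value += 1
            self._log()            # re-enters the lock below

    def _log(self):
        with self._lock:           # same greenlet: RLock permits re-entry
            print('value is now %d' % self._value)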