Example No. 1
class GeventScheduler(Scheduler):
    """A scheduler that dispatches tasks via Gevent"""

    def __init__(self):
        Scheduler.__init__(self)
        from gevent.coros import RLock
        self._lock = RLock()

    def start(self):
        """Spawn a greenlet for the main event loop."""
        self.greenlet = gevent.spawn(self._run)

    def stop(self):
        """Stop the scheduler and wait for the thread to finish."""
        Scheduler.stop(self)
        try:
            self.greenlet.kill(block=False)
        except AttributeError:
            pass

    def _acquire_lock(self):
        """Lock the thread's task queue."""
        self._lock.acquire()

    def _release_lock(self):
        """Release the lock on the thread's task queue."""
        self._lock.release()
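The class above imports RLock from gevent.coros, the legacy location; recent gevent releases expose it from gevent.lock. Below is a minimal, hedged sketch (not part of the original project) showing a portable import and the same reentrant acquire/release pairing; the Scheduler base class is assumed to live elsewhere and is not needed here.

# Sketch only: portable RLock import plus the acquire/release pattern used by
# _acquire_lock()/_release_lock() above.
try:
    from gevent.lock import RLock       # current gevent location
except ImportError:
    from gevent.coros import RLock      # legacy location used in the example

lock = RLock()

def acquire_task_queue():
    """Equivalent of GeventScheduler._acquire_lock()."""
    lock.acquire()

def release_task_queue():
    """Equivalent of GeventScheduler._release_lock()."""
    lock.release()

# RLock is reentrant: nested acquire/release pairs from the same greenlet
# do not deadlock.
acquire_task_queue()
acquire_task_queue()
release_task_queue()
release_task_queue()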
Example No. 2
class Deliverator(object):
    """
    The deliverator holds the channels that will be used to deliver 
    the replies that come over a resilient connection
    """
    def __init__(self):
        self._log = logging.getLogger("Deliverator")
        self._active_requests = dict()
        self._lock = RLock()

    def add_request(self, message_id):
        """
        Add a message_id

        return a channel (gevent.queue.Queue)

        When the web_server's pull server gets a reply for this message id
        it will push the message into the queue. The caller can block on the
        queue, waiting for the reply.

        We can't use the zero size 'channel' queue because the web server moves 
        on after 8 of 10 retrieves and nobody is waiting on the last two.

        So we use a size of one, and it is the caller's responsibility to clean
        up unused channels.
        """
        channel = Queue(maxsize=1)

        self._lock.acquire()
        try:
            if message_id in self._active_requests:
                raise ValueError("Duplicate request '%s'" % (message_id, ))
            self._active_requests[message_id] = channel
        finally:
            self._lock.release()

        return channel

    def deliver_reply(self, message):
        """
        Deliver the reply message over the channel for its message-id

        And discard the channel
        """
        self._lock.acquire()
        try:
            channel = self._active_requests.pop(message.control["message-id"])
        except KeyError:
            channel = None
        finally:
            self._lock.release()

        if channel is None:
            self._log.error("undeliverable message %s" % (message.control, ))
        else:
            channel.put((
                message.control,
                message.body,
            ))
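A short usage sketch for the Deliverator above: one greenlet blocks on the channel returned by add_request() while the main greenlet delivers the reply. The message object is a hypothetical stand-in with the control and body attributes the class expects, and the class itself is assumed to be importable together with its logging, RLock, and gevent.queue.Queue dependencies.

# Usage sketch (assumes gevent and the Deliverator class above are available).
import gevent
from collections import namedtuple

FakeMessage = namedtuple("FakeMessage", ["control", "body"])   # hypothetical stand-in

deliverator = Deliverator()
channel = deliverator.add_request("request-1")

def waiter():
    control, body = channel.get()          # blocks until deliver_reply() runs
    print("reply for %s: %r" % (control["message-id"], body))

greenlet = gevent.spawn(waiter)
deliverator.deliver_reply(FakeMessage(control={"message-id": "request-1"},
                                      body=b"payload"))
greenlet.join()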
Example No. 3
class Deliverator(object):
    """
    The deliverator holds the channels that will be used to deliver 
    the replies that come over a resilient connection
    """
    def __init__(self):
        self._log = logging.getLogger("Deliverator")
        self._active_requests = dict()
        self._lock = RLock()

    def add_request(self, message_id):
        """
        Add a message_id

        return a channel (gevent.queue.Queue)

        When the web_server's pull server gets a reply for this message id
        it will push the message into the queue. The caller can block on the
        queue, waiting for the reply.

        We can't use the zero size 'channel' queue because the web server moves 
        on after 8 of 10 retrieves and nobody is waiting on the last two.

        So we use a size of one, and it is the caller's responsibility to clean
        up unused channels.
        """
        channel = Queue(maxsize=1)

        self._lock.acquire()
        try:
            if message_id in self._active_requests:
                raise ValueError("Duplicate request '%s'" % (message_id, ))
            self._active_requests[message_id] = channel
        finally:
            self._lock.release()

        return channel

    def deliver_reply(self, message):
        """
        Deliver the reply message over the channel for its message-id

        And discard the channel
        """
        self._lock.acquire()
        try:
            channel = self._active_requests.pop(message.control["message-id"])
        except KeyError:
            channel = None
        finally:
            self._lock.release()
        
        if channel is None:
            self._log.error("undeliverable message %s" % (message.control, ))
        else:
            channel.put((message.control, message.body, ))
Example No. 4
class ThreadSafeFSM(InstrumentFSM):
    def __init__(self, states, events, enter_event, exit_event):
        self._lock = RLock()
        super(ThreadSafeFSM, self).__init__(states, events, enter_event, exit_event)
    def on_event(self, event, *args, **kwargs):
        with self._lock:
            return super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)
    def on_event_if_free(self, event, *args, **kwargs):
        if not self._lock.acquire(blocking=False):
            raise FSMLockedError
        try:
            retval = super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)
        finally:
            self._lock.release()
        return retval
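The on_event_if_free() method shows the "fail fast if the lock is taken" idiom: acquire(blocking=False) returns False instead of waiting. A hedged, self-contained sketch of that idiom follows, with a local exception standing in for FSMLockedError and the FSM machinery omitted.

# Sketch of the non-blocking acquire used by on_event_if_free().
from gevent.lock import RLock     # gevent.coros.RLock in older gevent releases

class LockedError(Exception):
    """Raised when the lock is busy and we refuse to wait for it."""

_lock = RLock()

def run_if_free(func, *args, **kwargs):
    if not _lock.acquire(blocking=False):   # give up immediately if busy
        raise LockedError("resource busy")
    try:
        return func(*args, **kwargs)
    finally:
        _lock.release()

print(run_if_free(lambda x: x * 2, 21))     # 42 when the lock is free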
Example No. 5
class ThreadSafeFSM(InstrumentFSM):
    def __init__(self, states, events, enter_event, exit_event):
        self._lock = RLock()
        super(ThreadSafeFSM, self).__init__(states, events, enter_event, exit_event)
    def on_event(self, event, *args, **kwargs):
        with self._lock:
            return super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)
    def on_event_if_free(self, event, *args, **kwargs):
        if not self._lock.acquire(blocking=False):
            raise FSMLockedError
        try:
            retval = super(ThreadSafeFSM, self).on_event(event, *args, **kwargs)
        finally:
            self._lock.release()
        return retval
Example No. 6
class aioFile(object):
    """a buffered File like object that uses pyaio and gevent"""
    def __init__(self, filename, mode='r', buffer=16<<10):
        modes = os.O_LARGEFILE | os.O_CREAT
        self._offset = 0
        self._buffer_size = buffer
        if buffer:
            self._buffer_lock = RLock()
        self._read = False
        self._write = False
        self._read_buf = None
        self._write_buf = None
        self._eof = False   # Optimization to limit calls
        self._append = False   # Append Mode writes ignore offset
        self._stay_alive = gevent.spawn(_keep_awake)
        if mode.startswith('r') or '+' in mode:
            self._read = True
            self._read_buf = bytearray()
            if '+' not in mode:
                modes |= os.O_RDONLY
        if mode.startswith('w') or mode.startswith('a') or '+' in mode:
            if mode.startswith('w'):
                modes |= os.O_TRUNC
            self._write = True
            self._write_buf = bytearray()
            self._flush = False
            if '+' not in mode:
                modes |= os.O_WRONLY
        if '+' in mode:
            modes |= os.O_RDWR
        if mode.startswith('a'):
            modes |= os.O_APPEND
            self._append = True
        self._fd = os.open(filename, modes)

    def _clear_read_buf(self):
        if self._read:
            self._eof = False
            del self._read_buf[0:]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self.flush()
        os.close(self._fd)
        self._stay_alive.kill()

    def stat(self):
        return os.fstat(self._fd)

    def seek(self, pos, how=os.SEEK_SET):
        """Change the file pos, will clear read cache and flush writes """ \
        """This will also clear the EOF flag for the file"""
        offset = self._offset
        if how != os.SEEK_CUR and how != os.SEEK_END and how != os.SEEK_SET:
            raise OSError(14,
                'Invalid seek point use os.SEEK_SET, os.SEEK_CUR, os.SEEK_END')
        if how == os.SEEK_CUR:
            offset += pos
        elif how == os.SEEK_END:
            #Ugh this could be hairy if we have outstanding writes
            offset = self.stat().st_size + pos
        else:
            offset = pos
        if offset < 0:
            raise OSError(14, 'File Position invalid, less than 0')
        #Even if the pos didn't change fix the buffers and EOF
        self._clear_read_buf()
        if not self._append:   # DON'T FLUSH on seek with append
            self.flush()
        self._offset = offset
        return offset

    def flush(self):
        """Flush write buffer"""
        if self._write and self._buffer_size:
            self._flush = True
            while len(self._write_buf):
                self.write(None)
            self._flush = False

    def _read_file(self):
        fbuf = bytearray()
        while True:
            part = self.read(16 << 10)  # Read 16k
            if part is None:  # EOF
                break
            fbuf.extend(part)
        return fbuf

    def write(self, buf, offset=None):
        """write a buffer object to file"""
        if not self._write:
            raise IOError(9, 'Bad file descriptor')
        if not self._append and self._buffer_size and self._read_buf:
            # We should clear read cache
            self._clear_read_buf()
        if offset is None:
            offset = self._offset
        write_size = self._buffer_size
        if not self._buffer_size and buf:
            write_size = len(buf)
        if not self._append and offset != self._offset:
            self.seek(offset)  # Makes sure we write our buffer

        #If we buffer we use the global buffer if not we use a local buffer
        if self._buffer_size:
            lbuf = self._write_buf
            self._buffer_lock.acquire()
            if buf:
                # A memoryview of this buffer is pushed to pyaio, so we need
                # to hold the lock while extending it
                lbuf.extend(buf)
        else:
            lbuf = buf

        while lbuf and len(lbuf) >= self._buffer_size \
                or (self._flush and lbuf):
            result = AsyncResult()
            def _write_results(rcode, errno):
                result.set((rcode, errno))
            pyaio.aio_write(self._fd, memoryview(lbuf)[0:write_size],
                            offset, _write_results)
            rcode, errno = result.get()  #SLEEP

            if rcode < 0:   # Some kind of error
                raise IOError(errno, 'AIO Write Error %d' % errno)
            # Clean up buffer (of actually written bytes)
            if self._buffer_size:
                del lbuf[0:rcode]
            else:
                lbuf = None
            self._offset = offset = offset + rcode  # Move the file offset
        if self._buffer_size:
            self._buffer_lock.release()
        if buf:
            return len(buf)
        else:
            return 0

    def read(self, size=0, offset=None):
        """read a size of bytes from the file, or entire file if 0 """ \
        """for speed we assume EOF after first short read"""
        if not self._read:
            raise IOError(9, 'Bad file descriptor')
        if not self._append and self._buffer_size and self._write_buf:
            self.flush()
        if offset is None:
            offset = self._offset
        if offset != self._offset:
            self.seek(offset)  # To make sure we blow away our read cache
        if size == 0:  # Attempt to read entire file and return in a single return
            return self._read_file()
        else:
            rbuf = bytearray()  # Holding Place for multiple reads
            while len(rbuf) < size:  # People get what they ask for
                # If we don't want to buffer then just read what they want
                if len(self._read_buf) < size - len(rbuf) and not self._eof:
                    #Ok we are buffer short so lets do a read
                    result = AsyncResult()
                    def _read_results(buf, rcode, errno):
                        result.set((buf, rcode, errno))
                    read_size = size - len(rbuf)
                    if self._buffer_size:   # If we buffer read buffer instead
                        read_size = self._buffer_size
                    pyaio.aio_read(self._fd, offset, read_size, _read_results)
                    buf, rcode, errno = result.get()  #SLEEP
                    if rcode < 0:  # Some kind of error
                        raise IOError(errno, 'AIO Read Error %d' % errno)
                    #Rcode will be the bytes read so lets push the offset
                    self._offset = offset = offset + rcode
                    if self._buffer_size:
                        self._read_buf.extend(buf)
                    else:
                        rbuf = buf  # Pass through because we are not buffering
                    if rcode == 0 or rcode < read_size:  # Good Enough
                        self._eof = True
                #Do a buffer read
                toread = size - len(rbuf)
                if self._buffer_size:
                    rbuf.extend(memoryview(self._read_buf)[0:toread])
                    #Clean up read buffer
                    del self._read_buf[0:toread]
                if not self._read_buf and self._eof:  # Empty buffer and eof
                    break
            if self._eof and not rbuf:
                return None  #EOF NO DATA
            else:
                return rbuf
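In write() above, _buffer_lock is held while the shared bytearray is extended and handed (as a memoryview) to pyaio, and released once the written bytes are trimmed. A reduced sketch of that locking pattern, with pyaio omitted and illustrative names only:

# Sketch of the _buffer_lock pattern from aioFile.write(): the shared write
# buffer is only extended and trimmed while the RLock is held. The actual
# pyaio.aio_write() call is omitted and simulated by the trim below.
import gevent
from gevent.lock import RLock     # gevent.coros.RLock in older gevent releases

buffer_lock = RLock()
write_buf = bytearray()

def buffered_write(data):
    buffer_lock.acquire()
    try:
        write_buf.extend(data)            # a memoryview of this would go to pyaio
        del write_buf[0:len(data)]        # drop the bytes once "written"
    finally:
        buffer_lock.release()

jobs = [gevent.spawn(buffered_write, b"x" * 64) for _ in range(4)]
gevent.joinall(jobs)
print(len(write_buf))                     # 0: every chunk was flushed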
Example No. 7
class ReplayProcess(BaseReplayProcess):

    process_type = 'standalone'

    def __init__(self, *args, **kwargs):

        super(ReplayProcess, self).__init__(*args, **kwargs)
        self.lock = RLock()

    def on_start(self):

        self.query = self.CFG.get_safe('process.query', {})

        self.delivery_format = self.CFG.get_safe('process.delivery_format', {})
        self.datastore_name = self.CFG.get_safe('process.datastore_name',
                                                'dm_datastore')

        definition_id = self.delivery_format.get('definition_id')
        rrsc = ResourceRegistryServiceProcessClient(process=self,
                                                    node=self.container.node)
        definition = rrsc.read(definition_id)
        self.definition = definition.container

        self.fields = self.delivery_format.get('fields', None)

        self.view_name = self.CFG.get_safe('process.view_name',
                                           'datasets/dataset_by_id')
        self.key_id = self.CFG.get_safe('process.key_id')
        self.stream_id = self.CFG.get_safe('process.publish_streams.output')

        if not self.stream_id:
            raise Inconsistent(
                'The replay process requires a stream id. Invalid configuration!'
            )

        self.data_stream_id = self.definition.data_stream_id
        self.encoding_id = self.definition.identifiables[
            self.data_stream_id].encoding_id
        self.element_type_id = self.definition.identifiables[
            self.data_stream_id].element_type_id
        self.element_count_id = self.definition.identifiables[
            self.data_stream_id].element_count_id
        self.data_record_id = self.definition.identifiables[
            self.element_type_id].data_record_id
        self.field_ids = self.definition.identifiables[
            self.data_record_id].field_ids
        self.domain_ids = self.definition.identifiables[
            self.data_record_id].domain_ids
        self.time_id = self.definition.identifiables[
            self.domain_ids[0]].temporal_coordinate_vector_id

    def execute_replay(self):
        '''
        @brief Spawns a greenlet to take care of the query and work
        '''
        if not hasattr(self, 'output'):
            raise Inconsistent(
                'The replay process requires an output stream publisher named output. Invalid configuration!'
            )

        datastore_name = self.datastore_name
        key_id = self.key_id

        view_name = self.view_name

        opts = {
            'start_key': [key_id, 0],
            'end_key': [key_id, 2],
            'include_docs': True
        }

        g = Greenlet(self._query,
                     datastore_name=datastore_name,
                     view_name=view_name,
                     opts=opts,
                     callback=lambda results: self._publish_query(results))
        g.start()

    def _query(self,
               datastore_name='dm_datastore',
               view_name='posts/posts_by_id',
               opts={},
               callback=None):
        '''
        @brief Makes the couch query and then callsback to publish
        @param datastore_name Name of the datastore
        @param view_name The name of the design view where the data is organized
        @param opts options to pass
        @param callback the content handler
        '''
        db = self.container.datastore_manager.get_datastore(
            datastore_name, DataStore.DS_PROFILE.SCIDATA, self.CFG)

        ret = db.query_view(view_name=view_name, opts=opts)

        callback(ret)

    def _publish_query(self, results):
        '''
        @brief Publishes the appropriate data based on the delivery format and data returned from query
        @param results The query results from the couch query
        '''

        if results is None:
            log.info('No Results')
            return

        publish_queue = self._parse_results(results)
        for item in publish_queue:
            log.debug('Item in queue: %s' % type(item))
        granule = self._merge(publish_queue)
        if not granule:
            return  # no dataset

        if self.delivery_format.has_key('fields'):
            res = self.subset(granule, self.delivery_format['fields'])
            granule = res

        if self.delivery_format.has_key('time'):
            granule = self.time_subset(granule, self.delivery_format['time'])

        total_records = granule.identifiables[self.element_count_id].value
        granule.identifiables[self.element_count_id].constraint.intervals = [
            [0, total_records - 1],
        ]

        if self.delivery_format.has_key('records'):
            assert isinstance(self.delivery_format['records'],
                              int), 'delivery format is incorrectly formatted.'

            for chunk in self._records(granule,
                                       self.delivery_format['records']):
                self.lock.acquire()
                self.output.publish(chunk)
                self.lock.release()
            return

        self.lock.acquire()
        self.output.publish(granule)
        self.lock.release()

    def _parse_results(self, results):
        '''
        @brief Switch-case logic for what packet types replay can handle and how to handle
        @param results List of results returned from couch view
        @return A queue of msgs parsed and formatted to be iterated through and published.
        '''
        log.debug('called _parse_results')
        publish_queue = []

        for result in results:
            assert ('doc' in result)

            packet = result['doc']

            if isinstance(packet, BlogBase):
                packet.is_replay = True
                self.lock.acquire()
                self.output.publish(packet)
                self.lock.release()
                continue

            if isinstance(packet, StreamDefinitionContainer):
                continue  # Ignore

            if isinstance(packet, StreamGranuleContainer):
                packet = self._parse_granule(packet)
                log.debug('Got packet')
                if packet:
                    log.debug('Appending packet')
                    publish_queue.append(packet)
                continue

            log.info('Unknown packet type in replay.')

        return publish_queue

    def _records(self, granule, n):
        '''
        @brief Yields n records from a granule per iteration
        @param granule consisting of dataset
        @param n number of records to yield
        '''
        bin_size = n
        record_count = granule.identifiables[self.element_count_id].value

        i = 0
        while (i + bin_size) < record_count:
            log.debug('Yielding %d to %d', i, i + bin_size)
            yield self._slice(granule, slice(i, i + bin_size))
            i += bin_size
        if i < record_count:
            yield self._slice(granule, slice(i, i + bin_size))
        return

    def _pair_up(self, granule):
        '''
        @brief Creates a list of tuples consisting of acquire_data friendly var_names and full values_paths
        @param granule consisting of full dataset.
        @return list of tuples
        '''
        fields = self._list_data(self.definition, granule)
        pairs = list()
        for i in fields.values():
            pairs.append((i.split('/').pop(), i))
        return pairs

    def _find_vp(self, pairs, var_name):
        '''
        @brief Determines the value path based on the acquire_data friendly var_name
        @param pairs List of tuples consisting of pair-wise var_name/value_path
        @param var_name Desired var_name
        @return Associated value_path
        '''
        for pair in pairs:
            if var_name == pair[0]:
                return pair[1]
        return

    def _slice(self, granule, slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition, granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0]
                          for i in pairs])  # Get the var_names from the pairs
        log.debug('var_names: %s', var_names)
        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path], var_names, record_count,
                               slice_).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id reverse dictionary lookup
            #@todo: improve this pattern
            for field, path in fields.iteritems():
                if vp == path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            range = value['range']
            retval.identifiables[bounds_id].value_pair[0] = float(range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------

        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval

    def _parse_granule(self, granule):
        '''
        @brief Ensures the granule is valid and gets some metadata from the granule for building the dataset
        @param granule raw granule straight from couch
        @return metadata in the granule as well as the granule itself if valid.
        '''

        granule.stream_resource_id = self.stream_id

        element_count_id = self.element_count_id
        encoding_id = self.encoding_id

        record_count = granule.identifiables[element_count_id].value
        sha1 = granule.identifiables[encoding_id].sha1 or None

        # If there are no records then this is not a proper granule
        if not (record_count > 0):
            log.debug('Granule had no record count discarding.')
            return None

        # No encoding, no packet
        if not encoding_id in granule.identifiables:
            log.debug('Granule had no encoding discarding.')
            return None

        if not sha1:
            log.debug('Granule had no sha1')
            return None

        filepath = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.hdf5')

        if not os.path.exists(filepath):
            log.debug('File with sha1 does not exist')
            return None

        return {'granule': granule, 'records': record_count, 'sha1': sha1}

    @staticmethod
    def merge_granule(definition, granule1, granule2):
        '''
        @brief Merges two granules based on the definition
        @param definition Stream Definition
        @param granule1 First Granule
        @param granule2 Second Granule
        @return Returns granule1 which is then merged with granule2 and the file pair for indexing

        @description granule1 := granule1 U granule2
        '''
        import numpy as np

        assert isinstance(
            definition,
            StreamDefinitionContainer), 'object is not a definition.'
        assert isinstance(granule1,
                          StreamGranuleContainer), 'object is not a granule.'
        encoding_id = DefinitionTree.get(
            definition, '%s.encoding_id' % definition.data_stream_id)

        if not granule2:
            pair = (granule1.identifiables['time_bounds'].value_pair[0],
                    '%s.hdf5' % granule1.identifiables[encoding_id].sha1)
            return {'granule': granule1, 'files': [pair]}

        assert isinstance(granule2,
                          StreamGranuleContainer), 'object is not a granule.'

        assert granule1.identifiables.has_key(
            'time_bounds'
        ), 'object has no time bounds and therefore is invalid.'

        assert granule2.identifiables.has_key(
            'time_bounds'
        ), 'object has no time bounds and therefore is invalid.'

        #-------------------------------------------------------------------------------------
        # First step is figure out where each granule belongs on the timeline
        # We do this with a tuple consisting of the point in the timeline and the filename
        # These will get stable sorted later
        #-------------------------------------------------------------------------------------

        pair1 = (granule1.identifiables['time_bounds'].value_pair[0],
                 '%s.hdf5' % granule1.identifiables[encoding_id].sha1)

        pair2 = (granule2.identifiables['time_bounds'].value_pair[0],
                 '%s.hdf5' % granule2.identifiables[encoding_id].sha1)

        files = []

        if encoding_id in granule1.identifiables:
            if granule1.identifiables[encoding_id].sha1:
                files.append('%s.hdf5' %
                             granule1.identifiables[encoding_id].sha1)
        if encoding_id in granule2.identifiables:
            if granule2.identifiables[encoding_id].sha1:
                files.append('%s.hdf5' %
                             granule2.identifiables[encoding_id].sha1)

        element_count_id = DefinitionTree.get(
            definition, '%s.element_count_id' % definition.data_stream_id)
        record_count = 0
        if element_count_id in granule1.identifiables:
            record_count += granule1.identifiables[element_count_id].value
        if element_count_id in granule2.identifiables:
            record_count += granule2.identifiables[element_count_id].value

        if not element_count_id in granule1.identifiables:
            granule1.identifiables[element_count_id] = CountElement()
            granule1.identifiables[element_count_id].value = record_count
        else:
            granule1.identifiables[element_count_id].value = record_count

        fields1 = ReplayProcess._list_data(definition, granule1)
        fields2 = ReplayProcess._list_data(definition, granule2)
        #@todo albeit counterintuitive an intersection is the only thing I can support
        merged_paths = {}
        for k, v in fields1.iteritems():
            if fields2.has_key(k):
                merged_paths[k] = v

        for k, v in granule2.identifiables.iteritems():
            # Switch(value):

            # Case Bounds:
            if isinstance(v, QuantityRangeElement):
                # If its not in granule1 just throw it in there
                if k not in granule1.identifiables:
                    granule1.identifiables[k] = v
                else:
                    bounds1 = granule1.identifiables[k].value_pair
                    bounds2 = granule2.identifiables[k].value_pair
                    bounds = np.append(bounds1, bounds2)
                    granule1.identifiables[k].value_pair = [
                        np.nanmin(bounds),
                        np.nanmax(bounds)
                    ]

            if isinstance(v, RangeSet):  #Including coordinate axis
                if merged_paths.has_key(
                        k) and not granule1.identifiables.has_key(k):
                    granule1.identifiables[k] = v  # Copy it over

        # Now make sure granule1 doesn't have excess stuff
        del_list = []
        for k, v in granule1.identifiables.iteritems():
            if isinstance(v, RangeSet):
                if not merged_paths.has_key(k):
                    del_list.append(k)

        for item in del_list:
            del granule1.identifiables[item]

        return {'granule': granule1, 'files': [pair1, pair2]}

    @staticmethod
    def _list_data(definition, granule):
        '''
        @brief Lists all the fields in the granule based on the Stream Definition
        @param definition Stream Definition
        @param granule Stream Granule
        @return dict of field_id : values_path for each field_id that exists
        '''
        from interface.objects import StreamDefinitionContainer, StreamGranuleContainer, RangeSet, CoordinateAxis
        assert isinstance(
            definition,
            StreamDefinitionContainer), 'object is not a definition.'
        assert isinstance(
            granule, StreamGranuleContainer
        ), 'object is not a granule. its a %s' % type(granule)
        retval = {}
        for key, value in granule.identifiables.iteritems():
            if isinstance(value, RangeSet):
                values_path = value.values_path or definition.identifiables[
                    key].values_path
                retval[key] = values_path

            elif isinstance(value, CoordinateAxis):
                values_path = value.values_path or definition.identifiables[
                    key].values_path
                retval[key] = values_path

        return retval

    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description
             n
        D := U [ msgs_i ]
            i=0
        '''
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------

        for i in xrange(count):
            if i == 0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(
                    point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])

            else:
                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the lists using a stable sort from python (by the first value in the tuples
        # Then peel off just the file names
        # Then get the appropriate URL for the file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([
            FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i)
            for i in file_list
        ])

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row, value in data.iteritems():
            value_path = self._find_vp(pairs, row)
            codec.add_hdf_dataset(value_path, nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)
        return granule

    def _patch_granule(self, granule, hdf_string):
        '''
        @brief Adds the hdf_string and sha1 to the granule
        @param granule Stream Granule
        @param hdf_string string consisting of raw bytes from an hdf5 file
        '''
        granule.identifiables[self.data_stream_id].values = hdf_string
        granule.identifiables[self.encoding_id].sha1 = hashlib.sha1(
            hdf_string).hexdigest().upper()

    def time_subset(self, granule, time_bounds):
        '''
        @brief Obtains a subset of the granule dataset based on the specified time_bounds
        @param granule Dataset
        @param time_bounds tuple consisting of a lower and upper bound
        @return A subset of the granule's dataset based on the time boundaries.
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        lower = time_bounds[0] - 1
        upper = time_bounds[1]
        granule = self._slice(granule, slice(lower, upper))
        return granule

    def _get_time_index(self, granule, timeval):
        '''
        @brief Obtains the index where a time's value is
        @param granule must be a complete dataset (hdf_string provided)
        @param timeval the vector value
        @return Index value for timeval or closest approx such that timeval is IN the subset
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        assert granule.identifiables[
            self.data_stream_id].values, 'hdf_string is not provided.'

        hdf_string = granule.identifiables[self.data_stream_id].values
        file_path = self._get_hdf_from_string(hdf_string)

        #-------------------------------------------------------------------------------------
        # Determine the field_id for the temporal coordinate vector (aka time)
        #-------------------------------------------------------------------------------------

        time_field = self.definition.identifiables[
            self.time_id].coordinate_ids[0]
        value_path = granule.identifiables[
            time_field].values_path or self.definition.identifiables[
                time_field].values_path
        record_count = granule.identifiables[self.element_count_id].value

        #-------------------------------------------------------------------------------------
        # Go through the time vector and get the indexes that correspond to the timeval
        # It will find a value such that
        # t_n <= i < t_(n+1), where i is the index
        #-------------------------------------------------------------------------------------

        var_name = value_path.split('/').pop()
        res = acquire_data([file_path], [var_name], record_count).next()
        time_vector = res[var_name]['values']
        retval = 0
        for i in xrange(len(time_vector)):
            if time_vector[i] == timeval:
                retval = i
                break
            elif i == 0 and time_vector[i] > timeval:
                retval = i
                break
            elif (i + 1) < len(time_vector):  # not last val
                if time_vector[i] < timeval and time_vector[i + 1] > timeval:
                    retval = i
                    break
            else:  # last val
                retval = i
                break
        FileSystem.unlink(file_path)
        return retval

    def _get_hdf_from_string(self, hdf_string):
        '''
        @param hdf_string binary string consisting of an HDF5 file.
        @return temporary file (full path) where the string was written to.
        @note client's responsible to unlink when finished.
        '''
        f = FileSystem.mktemp()
        f.write(hdf_string)
        retval = f.name
        f.close()
        return retval

    def subset(self, granule, coverages):
        '''
        @param granule
        @return dataset subset based on the fields
        '''
        assert isinstance(granule,
                          StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id

        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages,
                  type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW stream definition and check for rangesets and coordinate axises
        #  - If its a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If its a rangeset make sure that it's part of what the client asked for, if not discard it
        #-----------------------------------------------------------------------------------------------------------

        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id],
                          CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[
                        range_id].values_path or self.definition.identifiables[
                            range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[
                        range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If its a rangeset, a specified coverage and the granule has it, add it to the list
                if field_id in coverages:
                    if granule.identifiables.has_key(range_id):
                        log.debug('got a range: %s' % range_id)
                        coverage_ids.append(field_id)
                        if granule.identifiables.has_key(range_id):
                            value_path = granule.identifiables[
                                range_id].values_path or self.definition.identifiables[
                                    range_id].values_path
                            values_path.append(value_path)
                        else:
                            value_path = self.definition.identifiables[
                                range_id].values_path
                            values_path.append(value_path)
                        continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id

                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(
            granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],
                  values_path, granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row, value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)

        FileSystem.unlink(file_path)

        return granule
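_publish_query() and _parse_results() above bracket every self.output.publish() call with explicit lock.acquire()/lock.release() pairs. gevent's RLock also works as a context manager, so the same serialization can be expressed with a with block; a hedged sketch follows, with a stand-in publisher rather than the ION output stream object.

# Sketch: serializing publish() calls with the RLock used as a context manager,
# equivalent to the acquire()/release() pairs in _publish_query().
import gevent
from gevent.lock import RLock

class FakePublisher(object):        # illustrative stand-in, not the real publisher
    def __init__(self):
        self.published = []
    def publish(self, item):
        self.published.append(item)

lock = RLock()
output = FakePublisher()

def publish_chunks(chunks):
    for chunk in chunks:
        with lock:                  # same effect as lock.acquire()/lock.release()
            output.publish(chunk)

workers = [gevent.spawn(publish_chunks, range(i, i + 3)) for i in (0, 10, 20)]
gevent.joinall(workers)
print(len(output.published))        # 9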
Example No. 8
class aioFile(object):
    """a buffered File like object that uses pyaio and gevent"""
    def __init__(self, filename, mode='r', buffer=16 << 10):
        modes = os.O_LARGEFILE | os.O_CREAT
        self._offset = 0
        self._buffer_size = buffer
        if buffer:
            self._buffer_lock = RLock()
        self._read = False
        self._write = False
        self._read_buf = None
        self._write_buf = None
        self._eof = False  # Optimization to limit calls
        self._append = False  # Append Mode writes ignore offset
        self._stay_alive = gevent.spawn(_keep_awake)
        if mode.startswith('r') or '+' in mode:
            self._read = True
            self._read_buf = bytearray()
            if '+' not in mode:
                modes |= os.O_RDONLY
        if mode.startswith('w') or mode.startswith('a') or '+' in mode:
            if mode.startswith('w'):
                modes |= os.O_TRUNC
            self._write = True
            self._write_buf = bytearray()
            self._flush = False
            if '+' not in mode:
                modes |= os.O_WRONLY
        if '+' in mode:
            modes |= os.O_RDWR
        if mode.startswith('a'):
            modes |= os.O_APPEND
            self._append = True
        self._fd = os.open(filename, modes)

    def _clear_read_buf(self):
        if self._read:
            self._eof = False
            del self._read_buf[0:]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        self.flush()
        os.close(self._fd)
        self._stay_alive.kill()

    def stat(self):
        return os.fstat(self._fd)

    def seek(self, pos, how=os.SEEK_SET):
        """Change the file pos, will clear read cache and flush writes """ \
        """This will also clear the EOF flag for the file"""
        offset = self._offset
        if how != os.SEEK_CUR and how != os.SEEK_END and how != os.SEEK_SET:
            raise OSError(
                14,
                'Invalid seek point use os.SEEK_SET, os.SEEK_CUR, os.SEEK_END')
        if how == os.SEEK_CUR:
            offset += pos
        elif how == os.SEEK_END:
            #Ugh this could be hairy if we have outstanding writes
            offset = self.stat().st_size + pos
        else:
            offset = pos
        if offset < 0:
            raise OSError(14, 'File Position invalid, less than 0')
        #Even if the pos didn't change fix the buffers and EOF
        self._clear_read_buf()
        if not self._append:  # DON'T FLUSH on seek with append
            self.flush()
        self._offset = offset
        return offset

    def flush(self):
        """Flush write buffer"""
        if self._write and self._buffer_size:
            self._flush = True
            while len(self._write_buf):
                self.write(None)
            self._flush = False

    def _read_file(self):
        fbuf = bytearray()
        while True:
            part = self.read(16 << 10)  # Read 16k
            if part is None:  # EOF
                break
            fbuf.extend(part)
        return fbuf

    def write(self, buf, offset=None):
        """write a buffer object to file"""
        if not self._write:
            raise IOError(9, 'Bad file descriptor')
        if not self._append and self._buffer_size and self._read_buf:
            # We should clear read cache
            self._clear_read_buf()
        if offset is None:
            offset = self._offset
        write_size = self._buffer_size
        if not self._buffer_size and buf:
            write_size = len(buf)
        if not self._append and offset != self._offset:
            self.seek(offset)  # Makes sure we write our buffer

        #If we buffer we use the global buffer if not we use a local buffer
        if self._buffer_size:
            lbuf = self._write_buf
            self._buffer_lock.acquire()
            if buf:
                # A memoryview of this buffer is pushed to pyaio, so we need
                # to hold the lock while extending it
                lbuf.extend(buf)
        else:
            lbuf = buf

        while lbuf and len(lbuf) >= self._buffer_size \
                or (self._flush and lbuf):
            result = AsyncResult()

            def _write_results(rcode, errno):
                result.set((rcode, errno))

            pyaio.aio_write(self._fd,
                            memoryview(lbuf)[0:write_size], offset,
                            _write_results)
            rcode, errno = result.get()  #SLEEP

            if rcode < 0:  # Some kind of error
                raise IOError(errno, 'AIO Write Error %d' % errno)
            # Clean up buffer (of actually written bytes)
            if self._buffer_size:
                del lbuf[0:rcode]
            else:
                lbuf = None
            self._offset = offset = offset + rcode  # Move the file offset
        if self._buffer_size:
            self._buffer_lock.release()
        if buf:
            return len(buf)
        else:
            return 0

    def read(self, size=0, offset=None):
        """read a size of bytes from the file, or entire file if 0 """ \
        """for speed we assume EOF after first short read"""
        if not self._read:
            raise IOError(9, 'Bad file descriptor')
        if not self._append and self._buffer_size and self._write_buf:
            self.flush()
        if offset is None:
            offset = self._offset
        if offset != self._offset:
            self.seek(offset)  # To make sure we blow away our read cache
        if size == 0:  # Attempt to read entire file and return in a single return
            return self._read_file()
        else:
            rbuf = bytearray()  # Holding Place for multiple reads
            while len(rbuf) < size:  # People get what they ask for
                # If we don't want to buffer then just read what they want
                if len(self._read_buf) < size - len(rbuf) and not self._eof:
                    #Ok we are buffer short so lets do a read
                    result = AsyncResult()

                    def _read_results(buf, rcode, errno):
                        result.set((buf, rcode, errno))

                    read_size = size - len(rbuf)
                    if self._buffer_size:  # If we buffer read buffer instead
                        read_size = self._buffer_size
                    pyaio.aio_read(self._fd, offset, read_size, _read_results)
                    buf, rcode, errno = result.get()  #SLEEP
                    if rcode < 0:  # Some kind of error
                        raise IOError(errno, 'AIO Read Error %d' % errno)
                    #Rcode will be the bytes read so lets push the offset
                    self._offset = offset = offset + rcode
                    if self._buffer_size:
                        self._read_buf.extend(buf)
                    else:
                        rbuf = buf  # Pass through because we are not buffering
                    if rcode == 0 or rcode < read_size:  # Good Enough
                        self._eof = True
                #Do a buffer read
                toread = size - len(rbuf)
                if self._buffer_size:
                    rbuf.extend(memoryview(self._read_buf)[0:toread])
                    #Clean up read buffer
                    del self._read_buf[0:toread]
                if not self._read_buf and self._eof:  # Empty buffer and eof
                    break
            if self._eof and not rbuf:
                return None  #EOF NO DATA
            else:
                return rbuf
Example No. 9
class VizTransformProcForMatplotlibGraphs(TransformDataProcess):
    """
    This class is used for instantiating worker processes that have subscriptions to data streams and convert
    incoming data from CDM format to Matplotlib graphs

    """
    def on_start(self):
        super(VizTransformProcForMatplotlibGraphs, self).on_start()
        #assert len(self.streams)==1
        self.initDataFlag = True
        self.graph_data = {}  # Stores a dictionary of variables : [List of values]

        # Need some clients
        self.rr_cli = ResourceRegistryServiceProcessClient(
            process=self, node=self.container.node)
        self.pubsub_cli = PubsubManagementServiceClient(
            node=self.container.node)

        # extract the various parameters passed to the transform process
        self.out_stream_id = self.CFG.get('process').get(
            'publish_streams').get('visualization_service_submit_stream_id')

        # Create a publisher on the output stream
        #stream_route = self.pubsub_cli.register_producer(stream_id=self.out_stream_id)
        out_stream_pub_registrar = StreamPublisherRegistrar(
            process=self.container, node=self.container.node)
        self.out_stream_pub = out_stream_pub_registrar.create_publisher(
            stream_id=self.out_stream_id)

        self.data_product_id = self.CFG.get('data_product_id')
        self.stream_def_id = self.CFG.get("stream_def_id")
        self.stream_def = self.rr_cli.read(self.stream_def_id)

        # Start the thread responsible for keeping track of time and generating graphs
        # Mutex for ensuring proper concurrent communications between threads
        self.lock = RLock()
        self.rendering_proc = Greenlet(self.rendering_thread)
        self.rendering_proc.start()

    def process(self, packet):
        log.debug('(%s): Received Viz Data Packet' % self.name)
        #log.debug('(%s):   - Processing: %s' % (self.name,packet))

        # parse the incoming data
        psd = PointSupplementStreamParser(
            stream_definition=self.stream_def.container, stream_granule=packet)

        # re-arrange incoming data into an easy to parse dictionary
        vardict = {}
        arrLen = None
        for varname in psd.list_field_names():
            vardict[varname] = psd.get_values(varname)
            arrLen = len(vardict[varname])

        if self.initDataFlag:
            # look at the incoming packet and store
            for varname in psd.list_field_names():
                self.lock.acquire()
                self.graph_data[varname] = []
                self.lock.release()

            self.initDataFlag = False

        # If code reached here, the graph data storage has been initialized. Just add values
        # to the list
        with self.lock:
            for varname in psd.list_field_names():
                self.graph_data[varname].extend(vardict[varname])

    def rendering_thread(self):
        from copy import deepcopy
        # Service Client

        # init Matplotlib
        fig = Figure()
        ax = fig.add_subplot(111)
        canvas = FigureCanvas(fig)
        imgInMem = StringIO.StringIO()
        while True:

            # Sleep for a pre-decided interval. Should be specifiable in a YAML file
            gevent.sleep(20)

            # If there's no data, wait
            # Lock is used here to make sure the entire vector exists start to finish; this assures that the data won't change while the copy is made
            working_set = None
            with self.lock:
                if len(self.graph_data) == 0:
                    continue
                else:
                    working_set = deepcopy(self.graph_data)

            # For the simple case of testing, lets plot all time variant variables one at a time
            xAxisVar = 'time'
            xAxisFloatData = working_set[xAxisVar]

            for varName, varData in working_set.iteritems():
                if varName == 'time' or varName == 'height' or varName == 'longitude' or varName == 'latitude':
                    continue

                yAxisVar = varName
                yAxisFloatData = working_set[varName]

                # Generate the plot

                ax.plot(xAxisFloatData, yAxisFloatData, 'ro')
                ax.set_xlabel(xAxisVar)
                ax.set_ylabel(yAxisVar)
                ax.set_title(yAxisVar + ' vs ' + xAxisVar)
                ax.set_autoscale_on(False)

                # generate filename for the output image
                fileName = yAxisVar + '_vs_' + xAxisVar + '.png'
                # Save the figure to the in-memory file, resetting the buffer first so
                # bytes left over from a previous, larger image are not included
                imgInMem.seek(0)
                imgInMem.truncate()
                canvas.print_figure(imgInMem, format="png")
                imgInMem.seek(0)

                # submit resulting table back using the out stream publisher
                msg = {
                    "viz_product_type": "matplotlib_graphs",
                    "data_product_id": self.data_product_id,
                    "image_obj": imgInMem.getvalue(),
                    "image_name": fileName
                }
                self.out_stream_pub.publish(msg)

                # clear the axes for the next image
                ax.clear()
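
The transform above follows a simple producer/renderer split: the process callback appends incoming values to a shared dictionary while holding a gevent RLock, and a background greenlet periodically deep-copies that dictionary and renders from the snapshot, so the lock is never held during plotting. The following is a minimal, standalone sketch of that pattern; the class and method names are illustrative and not part of the ION interfaces.

import gevent
from copy import deepcopy
from gevent import Greenlet
try:                                  # gevent >= 1.0
    from gevent.lock import RLock
except ImportError:                   # older gevent, as used in the examples above
    from gevent.coros import RLock

class PeriodicRenderer(object):
    def __init__(self, interval=20, render=None):
        self.interval = interval
        self.render = render or (lambda snapshot: None)
        self.lock = RLock()
        self.graph_data = {}
        self.greenlet = Greenlet(self._loop)
        self.greenlet.start()

    def append(self, varname, values):
        # Data path: mutate the shared dictionary only while holding the lock
        with self.lock:
            self.graph_data.setdefault(varname, []).extend(values)

    def _loop(self):
        while True:
            gevent.sleep(self.interval)
            with self.lock:
                if not self.graph_data:
                    continue
                snapshot = deepcopy(self.graph_data)   # copy under the lock
            self.render(snapshot)                      # render outside the lock

Deep-copying under the lock keeps the critical section proportional to the size of the data rather than to the rendering time.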
Example #10
class VizTransformProcForMatplotlibGraphs(TransformDataProcess):

    """
    This class is used for instantiating worker processes that subscribe to data streams and convert
    the incoming data from CDM format into Matplotlib graphs

    """
    def on_start(self):
        super(VizTransformProcForMatplotlibGraphs,self).on_start()
        #assert len(self.streams)==1
        self.initDataFlag = True
        self.graph_data = {} # Stores a dictionary of variables : [List of values]

        # Need some clients
        self.rr_cli = ResourceRegistryServiceProcessClient(process = self, node = self.container.node)
        self.pubsub_cli = PubsubManagementServiceClient(node=self.container.node)

        # extract the various parameters passed to the transform process
        self.out_stream_id = self.CFG.get('process').get('publish_streams').get('visualization_service_submit_stream_id')

        # Create a publisher on the output stream
        #stream_route = self.pubsub_cli.register_producer(stream_id=self.out_stream_id)
        out_stream_pub_registrar = StreamPublisherRegistrar(process=self.container, node=self.container.node)
        self.out_stream_pub = out_stream_pub_registrar.create_publisher(stream_id=self.out_stream_id)

        self.data_product_id = self.CFG.get('data_product_id')
        self.stream_def_id = self.CFG.get("stream_def_id")
        self.stream_def = self.rr_cli.read(self.stream_def_id)

        # Start the thread responsible for keeping track of time and generating graphs
        # Mutex for ensuring proper concurrent communications between threads
        self.lock = RLock()
        self.rendering_proc = Greenlet(self.rendering_thread)
        self.rendering_proc.start()

    def process(self, packet):
        log.debug('(%s): Received Viz Data Packet' % self.name )
        #log.debug('(%s):   - Processing: %s' % (self.name,packet))

        # parse the incoming data
        psd = PointSupplementStreamParser(stream_definition=self.stream_def.container, stream_granule=packet)

        # re-arrange incoming data into an easy to parse dictionary
        vardict = {}
        arrLen = None
        for varname in psd.list_field_names():
            vardict[varname] = psd.get_values(varname)
            arrLen = len(vardict[varname])

        if self.initDataFlag:
            # look at the incoming packet and initialize storage for each field
            with self.lock:
                for varname in psd.list_field_names():
                    self.graph_data[varname] = []

            self.initDataFlag = False

        # If code reached here, the graph data storage has been initialized. Just add values
        # to the list
        with self.lock:
            for varname in psd.list_field_names():
                self.graph_data[varname].extend(vardict[varname])


    def rendering_thread(self):
        from copy import deepcopy
        # Service Client

        # init Matplotlib
        fig = Figure()
        ax = fig.add_subplot(111)
        canvas = FigureCanvas(fig)
        imgInMem = StringIO.StringIO()
        while True:

            # Sleep for a pre-decided interval. Should be specifiable in a YAML file
            gevent.sleep(20)

            # If there's no data, wait
            # The lock is used so the entire vector is copied in one step, which ensures
            # the data won't change while the snapshot is being taken.
            working_set = None
            with self.lock:
                if len(self.graph_data) == 0:
                    continue
                else:
                    working_set = deepcopy(self.graph_data)


            # For the simple case of testing, let's plot all time-variant variables one at a time
            xAxisVar = 'time'
            xAxisFloatData = working_set[xAxisVar]

            for varName, varData in working_set.iteritems():
                if varName == 'time' or varName == 'height' or varName == 'longitude' or varName == 'latitude':
                    continue

                yAxisVar = varName
                yAxisFloatData = working_set[varName]

                # Generate the plot

                ax.plot(xAxisFloatData, yAxisFloatData, 'ro')
                ax.set_xlabel(xAxisVar)
                ax.set_ylabel(yAxisVar)
                ax.set_title(yAxisVar + ' vs ' + xAxisVar)
                ax.set_autoscale_on(False)

                # generate filename for the output image
                fileName = yAxisVar + '_vs_' + xAxisVar + '.png'
                # Save the figure to the in-memory file, resetting the buffer first so
                # bytes left over from a previous, larger image are not included
                imgInMem.seek(0)
                imgInMem.truncate()
                canvas.print_figure(imgInMem, format="png")
                imgInMem.seek(0)

                # submit resulting table back using the out stream publisher
                msg = {"viz_product_type": "matplotlib_graphs",
                       "data_product_id": self.data_product_id,
                       "image_obj": imgInMem.getvalue(),
                       "image_name": fileName}
                self.out_stream_pub.publish(msg)

                # clear the axes for the next image
                ax.clear()
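
For reference, the figure-to-bytes step in the rendering loop above only needs the Agg backend: a Figure is attached to a FigureCanvasAgg and print_figure writes PNG bytes into any file-like object. A short, self-contained sketch, with made-up sample data and labels:

import io
from matplotlib.figure import Figure
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

fig = Figure()
ax = fig.add_subplot(111)
canvas = FigureCanvas(fig)

ax.plot([0, 1, 2, 3], [10.0, 10.5, 9.8, 11.2], 'ro')
ax.set_xlabel('time')
ax.set_ylabel('temperature')
ax.set_title('temperature vs time')

buf = io.BytesIO()                      # the Python 2 code above uses StringIO.StringIO
canvas.print_figure(buf, format='png')  # render the figure as PNG into the buffer
png_bytes = buf.getvalue()

ax.clear()                              # reuse the same axes for the next plot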
Example #11
class ReplayProcess(BaseReplayProcess):

    process_type = 'standalone'

    def __init__(self, *args, **kwargs):

        super(ReplayProcess,self).__init__(*args,**kwargs)
        self.lock = RLock()

    def on_start(self):

        self.query = self.CFG.get_safe('process.query',{})

        self.delivery_format = self.CFG.get_safe('process.delivery_format',{})
        self.datastore_name = self.CFG.get_safe('process.datastore_name','dm_datastore')

        definition_id = self.delivery_format.get('definition_id')
        rrsc = ResourceRegistryServiceProcessClient(process=self, node=self.container.node)
        definition = rrsc.read(definition_id)
        self.definition = definition.container

        self.fields = self.delivery_format.get('fields',None)

        self.view_name = self.CFG.get_safe('process.view_name','datasets/dataset_by_id')
        self.key_id = self.CFG.get_safe('process.key_id')
        self.stream_id = self.CFG.get_safe('process.publish_streams.output')

        if not self.stream_id:
            raise Inconsistent('The replay process requires a stream id. Invalid configuration!')

        self.data_stream_id = self.definition.data_stream_id
        self.encoding_id = self.definition.identifiables[self.data_stream_id].encoding_id
        self.element_type_id = self.definition.identifiables[self.data_stream_id].element_type_id
        self.element_count_id = self.definition.identifiables[self.data_stream_id].element_count_id
        self.data_record_id = self.definition.identifiables[self.element_type_id].data_record_id
        self.field_ids = self.definition.identifiables[self.data_record_id].field_ids
        self.domain_ids = self.definition.identifiables[self.data_record_id].domain_ids
        self.time_id = self.definition.identifiables[self.domain_ids[0]].temporal_coordinate_vector_id

    def execute_replay(self):
        '''
        @brief Spawns a greenlet to take care of the query and work
        '''
        if not hasattr(self, 'output'):
            raise Inconsistent('The replay process requires an output stream publisher named output. Invalid configuration!')

        datastore_name = self.datastore_name
        key_id = self.key_id

        view_name = self.view_name

        opts = {
            'start_key':[key_id,0],
            'end_key':[key_id,2],
            'include_docs':True
        }

        g = Greenlet(self._query,datastore_name=datastore_name, view_name=view_name, opts=opts,
            callback=lambda results: self._publish_query(results))
        g.start()

    def _query(self, datastore_name='dm_datastore', view_name='posts/posts_by_id', opts=None, callback=None):
        '''
        @brief Makes the couch query and then calls back to publish
        @param datastore_name Name of the datastore
        @param view_name The name of the design view where the data is organized
        @param opts options to pass to the view query
        @param callback the content handler
        '''
        opts = opts or {}  # avoid sharing a mutable default argument across calls
        db = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.SCIDATA, self.CFG)

        ret = db.query_view(view_name=view_name,opts=opts)

        callback(ret)

    def _publish_query(self, results):
        '''
        @brief Publishes the appropriate data based on the delivery format and data returned from query
        @param results The query results from the couch query
        '''

        if results is None:
            log.info('No Results')
            return

        publish_queue = self._parse_results(results)
        for item in publish_queue:
            log.debug('Item in queue: %s' % type(item))
        granule = self._merge(publish_queue)
        if not granule:
            return # no dataset

        if self.delivery_format.has_key('fields'):
            res = self.subset(granule,self.delivery_format['fields'])
            granule = res

        if self.delivery_format.has_key('time'):
            granule = self.time_subset(granule, self.delivery_format['time'])

        total_records = granule.identifiables[self.element_count_id].value
        granule.identifiables[self.element_count_id].constraint.intervals = [[0, total_records-1],]


        if self.delivery_format.has_key('records'):
            assert isinstance(self.delivery_format['records'], int), 'delivery format is incorrectly formatted.'

            for chunk in self._records(granule,self.delivery_format['records']):
                with self.lock:
                    self.output.publish(chunk)
            return


        with self.lock:
            self.output.publish(granule)

    def _parse_results(self, results):
        '''
        @brief Switch-case logic for which packet types replay can handle and how to handle them
        @param results List of results returned from couch view
        @return A queue of msgs parsed and formatted to be iterated through and published.
        '''
        log.debug('called _parse_results')
        publish_queue = []

        for result in results:
            assert('doc' in result)

            packet = result['doc']

            if isinstance(packet, BlogBase):
                packet.is_replay = True
                with self.lock:
                    self.output.publish(packet)
                continue

            if isinstance(packet, StreamDefinitionContainer):
                continue # Ignore

            if isinstance(packet, StreamGranuleContainer):
                packet = self._parse_granule(packet)
                log.debug('Got packet')
                if packet:
                    log.debug('Appending packet')
                    publish_queue.append(packet)
                continue

            log.info('Unknown packet type in replay.')

        return publish_queue

    def _records(self, granule, n):
        '''
        @brief Yields n records from a granule per iteration
        @param granule consisting of dataset
        @param n number of records to yield
        '''
        bin_size = n
        record_count = granule.identifiables[self.element_count_id].value

        i=0
        while (i+bin_size) < record_count:
            log.debug('Yielding %d to %d', i, i+bin_size)
            yield self._slice(granule,slice(i,i+bin_size))
            i+=bin_size
        if i < record_count:
            yield self._slice(granule, slice(i, record_count))
        return

    def _pair_up(self, granule):
        '''
        @brief Creates a list of tuples consisting of acquire_data friendly var_names and full values_paths
        @param granule consisting of full dataset.
        @return list of tuples
        '''
        fields = self._list_data(self.definition, granule)
        pairs = list()
        for i in fields.values():
            pairs.append((i.split('/').pop(),i))
        return pairs

    def _find_vp(self, pairs, var_name):
        '''
        @brief Determines the value path based on the acquire_data friendly var_name
        @param pairs List of tuples consisting of pair-wise var_name/value_path
        @param var_name Desired var_name
        @return Associated value_path
        '''
        for pair in pairs:
            if var_name == pair[0]:
                return pair[1]
        return

    def _slice(self,granule,slice_):
        '''
        @brief Creates a granule which is a slice of the granule parameter
        @param granule the superset
        @param slice_ The slice values for which to create the granule
        @return Crafted subset granule of the parameter granule.
        '''
        retval = copy.deepcopy(granule)
        fields = self._list_data(self.definition,granule)
        record_count = slice_.stop - slice_.start
        assert record_count > 0, 'slice is malformed'
        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs]) # Get the var_names from the pairs
        log.debug('var_names: %s',var_names)
        file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
        codec = HDFEncoder()
        vectors = acquire_data([file_path],var_names,record_count,slice_ ).next()

        for row, value in vectors.iteritems():
            vp = self._find_vp(pairs, row)
            # Determine the range_id reverse dictionary lookup
            #@todo: improve this pattern
            for field,path in fields.iteritems():
                if vp==path:
                    range_id = field
                    break
            bounds_id = retval.identifiables[range_id].bounds_id
            # Recalculate the bounds for this field and update the granule
            value_range = value['range']  # renamed so the builtin range() is not shadowed
            retval.identifiables[bounds_id].value_pair[0] = float(value_range[0])
            retval.identifiables[bounds_id].value_pair[1] = float(value_range[1])
            codec.add_hdf_dataset(vp, value['values'])
            record_count = len(value['values'])
            #----- DEBUGGING ---------
            log.debug('slice- row: %s', row)
            log.debug('slice- value_path: %s', vp)
            log.debug('slice- range_id: %s', range_id)
            log.debug('slice- bounds_id: %s', bounds_id)
            log.debug('slice- limits: %s', value['range'])
            #-------------------------


        retval.identifiables[self.element_count_id].value = record_count
        hdf_string = codec.encoder_close()
        self._patch_granule(retval, hdf_string)
        FileSystem.unlink(file_path)
        return retval


    def _parse_granule(self, granule):
        '''
        @brief Ensures the granule is valid and gets some metadata from the granule for building the dataset
        @param granule raw granule straight from couch
        @return metadata in the granule as well as the granule itself if valid.
        '''

        granule.stream_resource_id = self.stream_id

        element_count_id = self.element_count_id
        encoding_id = self.encoding_id

        record_count = granule.identifiables[element_count_id].value

        # If there are no records then this is not a proper granule
        if not (record_count > 0):
            log.debug('Granule had no record count; discarding.')
            return None

        # No encoding, no packet (check presence before reading the sha1)
        if encoding_id not in granule.identifiables:
            log.debug('Granule had no encoding; discarding.')
            return None

        sha1 = granule.identifiables[encoding_id].sha1 or None
        if not sha1:
            log.debug('Granule had no sha1; discarding.')
            return None


        filepath = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.hdf5')

        if not os.path.exists(filepath):
            log.debug('File with sha1 does not exist')
            return None

        return {
            'granule':granule,
            'records':record_count,
            'sha1':sha1
        }

    @staticmethod
    def merge_granule(definition, granule1, granule2):
        '''
        @brief Merges two granules based on the definition
        @param definition Stream Definition
        @param granule1 First Granule
        @param granule2 Second Granule
        @return Returns granule1 which is then merged with granule2 and the file pair for indexing

        @description granule1 := granule1 U granule2
        '''
        import numpy as np

        assert isinstance(definition,StreamDefinitionContainer), 'object is not a definition.'
        assert isinstance(granule1, StreamGranuleContainer), 'object is not a granule.'
        encoding_id = DefinitionTree.get(definition,'%s.encoding_id' % definition.data_stream_id)

        if not granule2:
            pair = (
                granule1.identifiables['time_bounds'].value_pair[0],
                '%s.hdf5' % granule1.identifiables[encoding_id].sha1
                )
            return {
                'granule':granule1,
                'files':[pair]
            }

        assert isinstance(granule2, StreamGranuleContainer), 'object is not a granule.'

        assert granule1.identifiables.has_key('time_bounds'), 'object has no time bounds and therefore is invalid.'

        assert granule2.identifiables.has_key('time_bounds'), 'object has no time bounds and therefore is invalid.'

        #-------------------------------------------------------------------------------------
        # First step is figure out where each granule belongs on the timeline
        # We do this with a tuple consisting of the point in the timeline and the filename
        # These will get stable sorted later
        #-------------------------------------------------------------------------------------

        pair1 = (
            granule1.identifiables['time_bounds'].value_pair[0],
            '%s.hdf5' % granule1.identifiables[encoding_id].sha1
            )

        pair2 = (
            granule2.identifiables['time_bounds'].value_pair[0],
            '%s.hdf5' % granule2.identifiables[encoding_id].sha1
            )

        files = []

        if encoding_id in granule1.identifiables:
            if granule1.identifiables[encoding_id].sha1:
                files.append('%s.hdf5' % granule1.identifiables[encoding_id].sha1)
        if encoding_id in granule2.identifiables:
            if granule2.identifiables[encoding_id].sha1:
                files.append('%s.hdf5' % granule2.identifiables[encoding_id].sha1)

        element_count_id = DefinitionTree.get(definition,'%s.element_count_id' % definition.data_stream_id)
        record_count = 0
        if element_count_id in granule1.identifiables:
            record_count += granule1.identifiables[element_count_id].value
        if element_count_id in granule2.identifiables:
            record_count += granule2.identifiables[element_count_id].value

        if element_count_id not in granule1.identifiables:
            granule1.identifiables[element_count_id] = CountElement()
        granule1.identifiables[element_count_id].value = record_count

        fields1 = ReplayProcess._list_data(definition, granule1)
        fields2 = ReplayProcess._list_data(definition, granule2)
        #@todo albeit counterintuitive an intersection is the only thing I can support
        merged_paths = {}
        for k,v in fields1.iteritems():
            if fields2.has_key(k):
                merged_paths[k] = v



        for k,v in granule2.identifiables.iteritems():
            # Switch(value):

            # Case Bounds:
            if isinstance(v, QuantityRangeElement):
                # If its not in granule1 just throw it in there
                if k not in granule1.identifiables:
                    granule1.identifiables[k] = v
                else:
                    bounds1 = granule1.identifiables[k].value_pair
                    bounds2 = granule2.identifiables[k].value_pair
                    bounds = np.append(bounds1,bounds2)
                    granule1.identifiables[k].value_pair = [np.nanmin(bounds), np.nanmax(bounds)]


            if isinstance(v, RangeSet): #Including coordinate axis
                if merged_paths.has_key(k) and not granule1.identifiables.has_key(k):
                    granule1.identifiables[k] = v # Copy it over

        # Now make sure granule1 doesn't have excess stuff
        del_list = []
        for k,v in granule1.identifiables.iteritems():
            if isinstance(v, RangeSet):
                if not merged_paths.has_key(k):
                    del_list.append(k)

        for item in del_list:
            del granule1.identifiables[item]



        return {
            'granule':granule1,
            'files':[pair1, pair2]
        }




    @staticmethod
    def _list_data(definition, granule):
        '''
        @brief Lists all the fields in the granule based on the Stream Definition
        @param definition Stream Definition
        @param granule Stream Granule
        @return dict of field_id : values_path for each field_id that exists
        '''
        from interface.objects import StreamDefinitionContainer, StreamGranuleContainer, RangeSet, CoordinateAxis
        assert isinstance(definition, StreamDefinitionContainer), 'object is not a definition.'
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule. its a %s' % type(granule)
        retval = {}
        for key, value in granule.identifiables.iteritems():
            if isinstance(value, RangeSet):
                values_path = value.values_path or definition.identifiables[key].values_path
                retval[key] = values_path

            elif isinstance(value, CoordinateAxis):
                values_path = value.values_path or definition.identifiables[key].values_path
                retval[key] = values_path

        return retval



    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description
             n
        D := U [ msgs_i ]
            i=0
        '''
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------


        for i in xrange(count):
            if i==0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append( tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])


            else:
                res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append( tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the lists using a stable sort from python (by the first value in the tuples)
        # Then peel off just the file names
        # Then get the appropriate URL for the file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list])

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row,value in data.iteritems():
            value_path = self._find_vp(pairs,row)
            codec.add_hdf_dataset(value_path,nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule,hdf_string)
        return granule

    def _patch_granule(self, granule, hdf_string):
        '''
        @brief Adds the hdf_string and sha1 to the granule
        @param granule Stream Granule
        @param hdf_string string consisting of raw bytes from an hdf5 file
        '''
        granule.identifiables[self.data_stream_id].values = hdf_string
        granule.identifiables[self.encoding_id].sha1 = hashlib.sha1(hdf_string).hexdigest().upper()


    def time_subset(self, granule, time_bounds):
        '''
        @brief Obtains a subset of the granule dataset based on the specified time_bounds
        @param granule Dataset
        @param time_bounds tuple consisting of a lower and upper bound
        @return A subset of the granule's dataset based on the time boundaries.
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        lower = time_bounds[0]-1
        upper = time_bounds[1]
        granule = self._slice(granule, slice(lower,upper))
        return granule



    def _get_time_index(self, granule, timeval):
        '''
        @brief Obtains the index where a time's value is
        @param granule must be a complete dataset (hdf_string provided)
        @param timeval the vector value
        @return Index value for timeval or closest approx such that timeval is IN the subset
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        assert granule.identifiables[self.data_stream_id].values, 'hdf_string is not provided.'

        hdf_string = granule.identifiables[self.data_stream_id].values
        file_path = self._get_hdf_from_string(hdf_string)

        #-------------------------------------------------------------------------------------
        # Determine the field_id for the temporal coordinate vector (aka time)
        #-------------------------------------------------------------------------------------

        time_field = self.definition.identifiables[self.time_id].coordinate_ids[0]
        value_path = granule.identifiables[time_field].values_path or self.definition.identifiables[time_field].values_path
        record_count = granule.identifiables[self.element_count_id].value

        #-------------------------------------------------------------------------------------
        # Go through the time vector and get the indexes that correspond to the timeval
        # It will find a value such that
        # t_n <= i < t_(n+1), where i is the index
        #-------------------------------------------------------------------------------------


        var_name = value_path.split('/').pop()
        res = acquire_data([file_path], [var_name], record_count).next()
        time_vector = res[var_name]['values']
        retval = 0
        for i in xrange(len(time_vector)):
            if time_vector[i] == timeval:
                retval = i
                break
            elif i==0 and time_vector[i] > timeval:
                retval = i
                break
            elif (i+1) < len(time_vector): # not last val
                if time_vector[i] < timeval and time_vector[i+1] > timeval:
                    retval = i
                    break
            else: # last val
                retval = i
                break
        FileSystem.unlink(file_path)
        return retval

    def _get_hdf_from_string(self, hdf_string):
        '''
        @param hdf_string binary string consisting of an HDF5 file.
        @return temporary file (full path) where the string was written to.
        @note the client is responsible for unlinking the file when finished.
        '''
        f = FileSystem.mktemp()
        f.write(hdf_string)
        retval = f.name
        f.close()
        return retval


    def subset(self,granule,coverages):
        '''
        @param granule Dataset granule to subset
        @param coverages list of field ids (coverages) to retain
        @return dataset subset based on the fields
        '''
        assert isinstance(granule, StreamGranuleContainer), 'object is not a granule.'
        field_ids = self.field_ids
        element_count_id = self.element_count_id


        values_path = list()
        domain_ids = list()
        coverage_ids = list()
        coverages = list(coverages)
        log.debug('Coverages include %s of type %s', coverages, type(coverages))
        #-----------------------------------------------------------------------------------------------------------
        # Iterate through the fields IAW stream definition and check for rangesets and coordinate axes
        #  - If it's a coordinate axis, it belongs regardless of what the client desires. (It's part of the domain)
        #  - If it's a rangeset, make sure that it's part of what the client asked for; if not, discard it
        #-----------------------------------------------------------------------------------------------------------


        for field_id in field_ids:

            range_id = self.definition.identifiables[field_id].range_id

            #-------------------------------------------------------------------------------------
            # Coordinate Axis
            # - Keep track of this in our domains
            # - Add it to the paths we need to grab from the file(s)
            #-------------------------------------------------------------------------------------

            if isinstance(self.definition.identifiables[range_id], CoordinateAxis):
                log.debug('got a domain: %s' % range_id)
                domain_ids.append(field_id)
                if granule.identifiables.has_key(range_id):
                    value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                else:
                    value_path = self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                continue

            #-------------------------------------------------------------------------------------
            # Range Set
            # - If it's part of the coverages we want to keep
            #   - Add it to the list of ranges we're tracking
            #   - Add the value path to the paths we're tracking.
            #-------------------------------------------------------------------------------------


            if isinstance(self.definition.identifiables[range_id], RangeSet):
                # If it's a rangeset, a requested coverage, and the granule has it, add it to the list
                if field_id in coverages and granule.identifiables.has_key(range_id):
                    log.debug('got a range: %s' % range_id)
                    coverage_ids.append(field_id)
                    value_path = granule.identifiables[range_id].values_path or self.definition.identifiables[range_id].values_path
                    values_path.append(value_path)
                    continue

                # ----
                # We need to track the range and bounds because,
                # you guessed it, we need to update the bounds
                # ----

                range_id = self.definition.identifiables[field_id].range_id
                bounds_id = self.definition.identifiables[range_id].bounds_id


                #---
                # Lastly, if the field is there and we don't want it, we need to strip it
                #---

                if not (field_id in coverages):
                    log.debug('%s doesn\'t belong in %s.', field_id, coverages)
                    log.debug('rebool: %s', bool(field_id in coverages))
                    if granule.identifiables.has_key(range_id):
                        log.debug('Removing %s from granule', range_id)
                        del granule.identifiables[range_id]
                    if granule.identifiables.has_key(bounds_id):
                        log.debug('Removing %s from granule', bounds_id)
                        del granule.identifiables[bounds_id]

        log.debug('Domains: %s', domain_ids)
        log.debug('Ranges: %s', coverage_ids)
        log.debug('Values_paths: %s', values_path)

        file_path = self._get_hdf_from_string(granule.identifiables[self.data_stream_id].values)
        full_coverage = list(domain_ids + coverage_ids)

        log.debug('Full coverage: %s' % full_coverage)
        log.debug('Calling acquire_data with: %s, %s, %s', [file_path],values_path,granule.identifiables[element_count_id].value)

        codec = HDFEncoder()

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        data = acquire_data([file_path], var_names, record_count).next()
        for row,value in data.iteritems():
            vp = self._find_vp(pairs, row)
            codec.add_hdf_dataset(vp, value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule,hdf_string)

        FileSystem.unlink(file_path)

        return granule
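
The _records and _publish_query methods above boil down to a chunked-publish pattern: a generator yields fixed-size slices of the dataset and each resulting chunk is published while holding the mutex. A minimal, generic sketch of that pattern follows; publish is any callable standing in for the stream publisher, and the names are illustrative only.

try:                                  # gevent >= 1.0
    from gevent.lock import RLock
except ImportError:                   # older gevent, as in the examples above
    from gevent.coros import RLock

_publish_lock = RLock()

def iter_slices(record_count, chunk_size):
    # Yield slice objects covering [0, record_count) in steps of chunk_size
    start = 0
    while start < record_count:
        stop = min(start + chunk_size, record_count)
        yield slice(start, stop)
        start = stop

def publish_in_chunks(dataset, publish, chunk_size):
    # dataset only needs to support len() and slicing; publish is any callable
    for s in iter_slices(len(dataset), chunk_size):
        with _publish_lock:
            publish(dataset[s])

For example, a 25-record dataset published with chunk_size=10 produces chunks of 10, 10 and 5 records.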
Example #12
class ReplayProcess(BaseReplayProcess):
    process_type="standalone"
    def __init__(self, *args, **kwargs):
        super(ReplayProcess, self).__init__(*args,**kwargs)
        #@todo Init stuff
        # mutex for shared resources between threads
        self.lock = RLock()
        
    def on_start(self):
        '''
        Creates a publisher for each stream_id passed in via publish_streams.
        Creates an attribute named after each stream which holds the corresponding publisher.
        ex: given publish_streams:{'output': my_output_stream_id }
          the instance has an attribute 'output' which is the publisher for the stream
          in my_output_stream_id
        '''
        self.stream_publisher_registrar = StreamPublisherRegistrar(process=self,node=self.container.node)


        # Get the query
        self.query = self.CFG.get_safe('process.query',{})

        # Get the delivery_format
        self.delivery_format = self.CFG.get_safe('process.delivery_format',{})
        self.datastore_name = self.CFG.get_safe('process.datastore_name','dm_datastore')

        self.view_name = self.CFG.get_safe('process.view_name','datasets/dataset_by_id')
        self.key_id = self.CFG.get_safe('process.key_id')
        # Get a stream_id for this process
        self.stream_id = self.CFG.get_safe('process.publish_streams.output',{})



        if not (self.stream_id and hasattr(self,'output')):
            raise RuntimeError('The replay agent requires an output stream publisher named output. Invalid configuration!')



    def _records(self, records, n):
        """
        Given a list of records, yield at most n at a time
        """
        while True:
            yval = []
            try:
                for i in xrange(n):
                    yval = yval + [records.pop(0)]
                yield yval
            except IndexError:
                if yval:
                    yield yval
                break

    def _publish_query(self, results):
        '''
        Callback to publish the specified results
        '''
        #-----------------------
        # Iteration
        #-----------------------
        #  - Go through the results, if the user had include_docs=True in the options field
        #    then the full document is in result.doc; however if the query did not include_docs,
        #    then only the doc_id is provided in the result.value.
        #
        #  - What this allows us to do is limit the amount of traffic in information for large queries.
        #    If we only are making a query in a sequence of queries (such as map and reduce) then we don't
        #    care about the full document, yet, we only care about the doc id and will retrieve the document later.
        #  - Example:
        #      Imagine the blogging example, we want the latest blog by author George and all the comments for that blog
        #      The series of queries would go, post_by_updated -> posts_by_author -> posts_join_comments and then
        #      in the last query we'll set include_docs to true and parse the docs.
        #-----------------------
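        # Illustration only (hypothetical ids and keys) of the two row shapes described above
        # for a CouchDB-style view query:
        #   include_docs=True  -> {'id': 'p1', 'key': ['p1', 0], 'value': ..., 'doc': <full document>}
        #   include_docs=False -> {'id': 'p1', 'key': ['p1', 0], 'value': <doc id / reduced value>}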


        log.warn('results: %s', results)

        # Guard against an empty result set before iterating (iterating None would raise)
        if results is None:
            log.warn('No results found in replay query!')
            return

        for result in results:
            log.warn('REPLAY Result: %s' % result)



            assert('doc' in result)

            replay_obj_msg = result['doc']

            if isinstance(replay_obj_msg, BlogBase):
                replay_obj_msg.is_replay = True

                with self.lock:
                    self.output.publish(replay_obj_msg)

            elif isinstance(replay_obj_msg, StreamDefinitionContainer):

                replay_obj_msg.stream_resource_id = self.stream_id


            elif isinstance(replay_obj_msg, StreamGranuleContainer):

                # Override the resource_stream_id so ingestion doesn't reingest, also this is a NEW stream (replay)
                replay_obj_msg.stream_resource_id = self.stream_id

                datastream = None
                sha1 = None

                for key, identifiable in replay_obj_msg.identifiables.iteritems():
                    if isinstance(identifiable, DataStream):
                        datastream = identifiable
                    elif isinstance(identifiable, Encoding):
                        sha1 = identifiable.sha1

                if sha1: # if there is an encoding

                    # Get the file from disk
                    filename = FileSystem.get_url(FS.CACHE, sha1, ".hdf5")

                    log.warn('Replay reading from filename: %s' % filename)

                    hdf_string = ''
                    try:
                        with open(filename, mode='rb') as f:
                            hdf_string = f.read()

                            # Check the sha1
                            retrieved_hdfstring_sha1 = hashlib.sha1(hdf_string).hexdigest().upper()

                            if sha1 != retrieved_hdfstring_sha1:
                                raise ReplayProcessException('sha1 mismatch between the sha1 in the datastream and the sha1 of the hdf_string read from HDF storage')

                    except IOError:
                        log.warn('No HDF file found!')
                        #@todo deal with this situation? How?
                        hdf_string = 'HDF File %s not found!' % filename

                    # set the datastream.value field!
                    datastream.values = hdf_string

                else:
                    log.warn('No encoding in the StreamGranuleContainer!')

                with self.lock:
                    self.output.publish(replay_obj_msg)


            else:
                log.warn('Unknown type retrieved in DOC!')



        log.debug('Published replay!')


    def execute_replay(self):
        log.debug('(Replay Agent %s)', self.name)

        # Handle the query
        datastore_name = self.datastore_name
        key_id = self.key_id


        # Got the post ID, pull the post and the comments
        view_name = self.view_name
        opts = {
            'start_key':[key_id, 0],
            'end_key':[key_id,2],
            'include_docs': True
        }
        g = Greenlet(self._query,datastore_name=datastore_name, view_name=view_name, opts=opts,
            callback=lambda results: self._publish_query(results))
        g.start()

    def _query(self, datastore_name='dm_datastore', view_name='posts/posts_by_id', opts=None, callback=None):
        '''
        Performs the query action
        '''
        opts = opts or {}  # avoid sharing a mutable default argument across calls
        log.debug('Couch Query:\n\t%s\n\t%s\n\t%s', datastore_name, view_name, opts)
        #@todo: Fix this datastore management profile with correct data profile in near future
        db = self.container.datastore_manager.get_datastore(datastore_name, DataStore.DS_PROFILE.EXAMPLES, self.CFG)


        ret = db.query_view(view_name=view_name,opts=opts)

        callback(ret)
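
Both replay variants use the same dispatch idiom: execute_replay wraps the (potentially blocking) couch query in a Greenlet and hands the result to a callback, so the calling context is not blocked on I/O. A condensed, hypothetical sketch of that idiom; query_fn is a stand-in, not the container datastore API.

from gevent import Greenlet

def run_query_async(query_fn, callback, **query_kwargs):
    # Run query_fn(**query_kwargs) in a greenlet and pass its result to callback
    g = Greenlet(lambda: callback(query_fn(**query_kwargs)))
    g.start()
    return g

# Example usage with plain Python stand-ins:
# run_query_async(lambda view_name, opts: [], callback=lambda rows: None,
#                 view_name='posts/posts_by_id', opts={'include_docs': True})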
Example #13
        else:
            error = ERROR_API
    except urllib2.HTTPError, e:
        error_msg = e
        if e.getcode() == 403:
            error = ERROR_RATE
        else:
            error = ERROR_NORMAL
    except Exception, e:
        error_msg = e
        error = ERROR_NORMAL

    if error is None:
        return res
    else:
        with log_lock:
            logger.critical('@%s@: %s' % (uids, error_msg))
        return error


def _worker_finish():
    global live_signal
    with live_lock:
        live_signal -= 1
        print 'a worker quit!'
        print 'workers remaining: %d' % live_signal


def worker(proxy_index):