def read_nowait(self, flatten=False):
    """
    Same as :meth:`read` but this is not a coroutine. This should only be
    used for unit testing.

    Args:
        flatten: return an unstructured array (flat 2D matrix) with
          timestamps in the first column

    Returns:
        numpy.ndarray

    >>> data = pipe.read_nowait()
    [1, 2, 3]

    """
    if self._failed:
        raise PipeError('pipe failed')
    # if reread is set just return the old data
    if self._reread:
        self._reread = False
        if len(self.read_buffer) == 0:
            raise PipeError("No data left to reread")
        return self._format_data(self.read_buffer, flatten)
    # if the queue is empty and we have old data, just return the old data
    if self.queue.empty() and len(self.read_buffer) > 0:
        return self._format_data(self.read_buffer, flatten)
    # if the buffer and queue are empty and the pipe is closed, the pipe is exhausted
    if self.queue.empty() and len(self.read_buffer) == 0 and self.closed:
        raise EmptyPipe()
    # do not wait for new data; return whatever is available now
    return self._read(flatten)
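# Usage sketch for read_nowait() in a unit test. The LocalPipe constructor
# arguments and the "float32_2" layout are illustrative assumptions, not
# part of the API shown above.
pipe = LocalPipe(layout="float32_2", name="demo")
pipe.write_nowait(np.array([[1000, 0.5, 1.5],
                            [1001, 0.6, 1.6]]))
data = pipe.read_nowait()  # structured array, no event loop required
pipe.consume(len(data))    # flush the rows once they have been processed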
async def close_interval(self):
    """
    Signal a break in the data stream. This should be used to indicate
    missing data. Data returned from :meth:`read` will be chunked by
    interval boundaries.
    """
    if self.direction == Pipe.DIRECTION.INPUT:
        raise PipeError("cannot write to an input pipe")
    raise PipeError("abstract method must be implemented by child")
async def flush_cache(self):
    """
    Force a pipe flush even if the cache is not full. Raises an error
    if caching is not enabled.
    """
    if self.direction == Pipe.DIRECTION.INPUT:
        raise PipeError("cannot control cache on input pipes")
    raise PipeError("abstract method must be implemented by child")
async def close_interval(self):
    if self._failed:
        raise PipeError('pipe failed')
    if self.closed:
        raise PipeError("Cannot write to a closed pipe")
    if self.debug:
        print("[%s:write] closing interval" % self.name)
    if self._caching:
        await self.flush_cache()
    await self.queue.put(None)
def reread_last(self):
    """
    The next read will return only unconsumed data from the previous read
    and no new data from the source. The end_of_interval flag is maintained.
    """
    if self.direction == Pipe.DIRECTION.OUTPUT:
        raise PipeError("cannot read from an output pipe")
    raise PipeError("abstract method must be implemented by child")
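# Usage sketch: replay unconsumed rows without pulling new data from the
# source ('pipe' is an input pipe; the consume amount is illustrative).
async def replay_example(pipe):
    data = await pipe.read()
    pipe.consume(len(data) // 2)  # consume only the first half
    pipe.reread_last()
    tail = await pipe.read()      # only the unconsumed half, no new data
    return tail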
@staticmethod
def _validate_data(data):
    if type(data) is not np.ndarray:
        raise PipeError("invalid data type: must be a structured array or 2D matrix")
    # an empty array is not an error, but there is nothing to write
    try:
        if (len(data) == 0) or len(data[0]) == 0:
            log.info("pipe write called with no data")
            return False
    except TypeError:
        raise PipeError("invalid data type: must be a structured array or 2D matrix")
    return True
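# Illustration of the validation rules above (the dtype is an assumption
# for a two-element float32 stream): structured and 2D arrays pass, empty
# arrays return False, and non-ndarray input raises PipeError.
good = np.zeros(4, dtype=[('timestamp', 'i8'), ('data', 'f4', 2)])  # passes
empty = np.zeros((0, 3))  # logs "pipe write called with no data" -> False
bad = [1, 2, 3]           # not an ndarray -> PipeError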
def consume(self, num_rows):
    if num_rows == 0:
        return  # nothing to do
    if num_rows < 0:
        raise PipeError("consume called with negative offset: %d" % num_rows)
    if num_rows > self.last_index:
        raise PipeError("cannot consume %d rows: only %d available"
                        % (num_rows, self.last_index))
    # shift the unconsumed rows to the front of the buffer
    self.buffer = np.roll(self.buffer, -1 * num_rows)
    self.last_index -= num_rows
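# Illustration of the np.roll mechanics above: consuming rows shifts the
# remaining data to the front of the buffer and shrinks last_index, so
# rows past last_index are stale and will be overwritten by later reads.
buffer = np.array([10, 20, 30, 40])
buffer = np.roll(buffer, -2)  # -> [30, 40, 10, 20]; with last_index now 2,
                              # only [30, 40] are live rows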
def close_nowait(self):
    """
    Same as :meth:`close` but this is not a coroutine. This should only be
    used for unit testing.
    """
    if len(self.subscribers) > 0:
        raise PipeError("cannot close_nowait with subscribers, use async")
    if self.close_cb is not None:
        raise PipeError("close_cb cannot be executed, use async")
    self.closed = True
def consume(self, num_rows):
    if num_rows == 0:
        return  # nothing to do
    if num_rows < 0:
        raise PipeError("consume called with negative offset: %d" % num_rows)
    if num_rows > len(self.read_buffer):
        raise PipeError("cannot consume %d rows: only %d available"
                        % (num_rows, len(self.read_buffer)))
    if self.debug:
        print("[%s:read] consumed %d rows" % (self.name, num_rows))
    self.read_buffer = self.read_buffer[num_rows:]
async def read_all(self, flatten=False, maxrows=int(1e5), error_on_overflow=False) -> np.ndarray:
    """
    Read stream data. By default this method returns a structured array
    with ``timestamp`` and ``data`` fields. The pipe is automatically
    closed. This method is a coroutine.

    Args:
        flatten: return an unstructured array (flat 2D matrix) with
          timestamps in the first column
        maxrows: the maximum number of rows to read from the pipe
        error_on_overflow: raise a PipeError if the pipe is not empty
          after reading maxrows

    Returns:
        numpy.ndarray

    >>> data = await pipe.read_all(flatten=True)
    [1, 2, 3]

    """
    if self.direction == Pipe.DIRECTION.OUTPUT:
        raise PipeError("cannot read from an output pipe")
    maxrows = int(maxrows)  # slicing requires an integer
    data = None
    while True:
        try:
            new_data = await self.read(flatten)
            self.consume(len(new_data))
        except PipeError:
            break
        if data is None:
            data = new_data
            if len(data) > maxrows:
                await self.close()
                if error_on_overflow:
                    raise PipeError("More than [%d] rows, increase maxrows or disable error_on_overflow" % maxrows)
                return data[:maxrows]
        else:
            if len(data) + len(new_data) > maxrows:
                await self.close()
                if error_on_overflow:
                    raise PipeError("More than [%d] rows, increase maxrows or disable error_on_overflow" % maxrows)
                remaining_rows = maxrows - len(data)
                if flatten:
                    data = np.vstack((data, new_data[:remaining_rows]))
                else:
                    data = np.hstack((data, new_data[:remaining_rows]))
                break
            if flatten:
                data = np.vstack((data, new_data))
            else:
                data = np.hstack((data, new_data))
    if data is None:
        raise PipeError("No data in pipe")
    return data
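# Usage sketch for read_all() (pipe construction is omitted; names are
# illustrative). The pipe is closed automatically when the stream ends.
async def dump_stream(pipe):
    data = await pipe.read_all(maxrows=10000)
    return data['timestamp'], data['data']  # structured-array fields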
def consume(self, num_rows):
    """
    Flush data from the read buffer. The next call to :meth:`read` will
    return any unflushed data followed by new incoming data.

    Args:
        num_rows: number of rows to flush from the read buffer
    """
    if self.direction == Pipe.DIRECTION.OUTPUT:
        raise PipeError("cannot consume from an output pipe")
    raise PipeError("abstract method must be implemented by child")
def enable_cache(self, lines: int):
    """
    Turn on caching for pipe writes. Data is only transmitted once the
    cache is full. This improves system performance, especially if
    :meth:`write` is called rapidly with short arrays. Once enabled,
    caching cannot be disabled.

    Args:
        lines: cache size
    """
    if self.direction == Pipe.DIRECTION.INPUT:
        raise PipeError("cannot control cache on input pipes")
    raise PipeError("abstract method must be implemented by child")
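# Usage sketch: with a 1000-row cache, rapid short writes are batched into
# one transmission per 1000 rows (the cache size and blocks are illustrative).
async def cached_writer(pipe, blocks):
    pipe.enable_cache(1000)
    for block in blocks:         # e.g. a few rows per block
        await pipe.write(block)  # buffered until the cache fills
    await pipe.flush_cache()     # push any remainder before closing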
async def write(self, data):
    """
    Write timestamped data to the pipe. Timestamps must be monotonically
    increasing and should not overlap with existing stream data in the
    database. This method is a coroutine.

    Args:
        data (numpy.ndarray): May be a structured array with ``timestamp``
          and ``data`` fields or an unstructured array with timestamps in
          the first column.

    >>> await pipe.write([[1000, 2, 3], [1001, 3, 4]])

    """
    if self.direction == Pipe.DIRECTION.INPUT:
        raise PipeError("cannot write to an input pipe")
    raise PipeError("abstract method must be implemented by child")
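# Usage sketch showing both accepted input formats (the dtype is an
# assumption for a two-element float32 stream).
async def write_example(pipe):
    # unstructured: timestamps in the first column
    await pipe.write(np.array([[1000, 2.0, 3.0],
                               [1001, 3.0, 4.0]]))
    # structured: explicit timestamp and data fields
    sarray = np.zeros(2, dtype=[('timestamp', 'i8'), ('data', 'f4', 2)])
    sarray['timestamp'] = [1002, 1003]
    sarray['data'] = [[4.0, 5.0], [5.0, 6.0]]
    await pipe.write(sarray)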
async def read(self, flatten=False):
    if flatten:
        raise Exception("Not Implemented")
    if self._reread:
        self._reread = False
        if self.last_block is None or len(self.last_block) == 0:
            raise PipeError("No data left to reread")
        return self.last_block
    if len(self.data_blocks) == 0 and self.last_block is None:
        raise EmptyPipe()
    if len(self.data_blocks) != 0:
        block = self.data_blocks.popleft()
        if len(self.data_blocks) == 0:
            self.interval_break = True
            self._last_read = True
        elif self.data_blocks[0] is None:
            self.data_blocks.popleft()
            self.interval_break = True
        else:
            self.interval_break = False
        if self.last_block is not None:
            self.last_block = np.hstack((self.last_block, block))
        else:
            self.last_block = block
    return self.last_block
async def flush_cache(self):
    if self.closed:
        raise PipeError("Cannot write to a closed pipe")
    if self._cache_index > 0:
        await self._write(self._cache[:self._cache_index])
        self._cache_index = 0
        self._cache = np.empty(len(self._cache), self.dtype)
async def close_interval(self):
    if self.closed:
        raise PipeError("Pipe is closed")
    if self.writer is None:
        return  # nothing has been written yet so nothing to close
    if self._caching:
        await self.flush_cache()
    self.writer.write(interval_token(self.layout).tobytes())
    await self.writer.drain()
async def read(self, flatten=False) -> np.ndarray:
    if self._failed:
        await self.close()
        raise PipeError('pipe failed')
    # if reread is set just return the old data
    if self._reread:
        self._reread = False
        if len(self.read_buffer) == 0:
            raise PipeError("No data left to reread")
        return self._format_data(self.read_buffer, flatten)
    self._interval_break = False

    # NOTE: an earlier version returned buffered data immediately when the
    # queue was empty, but that could starve the writer so the pipe was
    # never closed:
    # if self.queue.empty() and len(self.read_buffer) > 0:
    #     await asyncio.sleep(self.TIMEOUT_INTERVAL)
    #     return self._format_data(self.read_buffer, flatten)

    # otherwise, wait for at least one block
    while self.queue.empty():
        # if self._last_read:
        #     raise EmptyPipe()  # trying to re-read old data
        # if the buffer and queue are empty and the pipe is closed,
        # no more data is coming
        if self.queue.empty() and self.closed:
            self._last_read = True
            # from now on the is_empty flag will be set, but an error is
            # only raised once all the remaining data has been consumed
            break  # return unconsumed data
        await asyncio.sleep(self.TIMEOUT_INTERVAL)
    data_block = self._read(flatten)
    # NOTE: read may return an empty array if the producer closes the current
    # interval after all of its data has been consumed. This typically happens
    # when a module fails and has to be restarted: the inserter pipe has no
    # data (it has probably already been read), but the terminating worker
    # appends an interval-closing block [None] to the pipe. If the producer
    # has also closed the pipe there is no reason to pass back empty data,
    # so raise an EmptyPipe exception instead.
    if len(data_block) == 0 and self.closed:
        raise EmptyPipe()
    return data_block
async def write(self, data: np.ndarray):
    if self._failed:
        await self.close()
        raise PipeError('pipe failed')
    if self.closed:
        raise PipeError("Cannot write to a closed pipe")
    if not self._validate_data(data):
        return
    # convert into a structured array
    sarray = self._apply_dtype(data)
    if self._caching:
        for row in sarray:
            self._cache[self._cache_index] = row
            self._cache_index += 1
            if self._cache_index >= len(self._cache):
                await self.flush_cache()
    else:
        await self._write(sarray)
def close_interval_nowait(self):
    if self.closed:
        raise PipeError("Pipe is closed")
    if self.writer is None:
        return  # nothing has been written yet so nothing to close
    if self._cache_index > 0:
        log.warning("dumping %d rows of cached data on %s"
                    % (self._cache_index, self.name))
        self._cache = np.empty(len(self._cache), self.dtype)
        self._cache_index = 0
    self.writer.write(interval_token(self.layout).tobytes())
def close_interval_nowait(self):
    """
    Same as :meth:`close_interval` but this is not a coroutine. This
    should only be used for unit testing.
    """
    if self._failed:
        raise PipeError('pipe failed')
    if self.debug:
        print("[%s:write] closing interval" % self.name)
    self.queue.put_nowait(None)
def subscribe(self, pipe):
    if self.direction == Pipe.DIRECTION.INPUT:
        raise PipeError("cannot subscribe to an input pipe")
    self.subscribers.append(pipe)

    def unsubscribe():
        i = self.subscribers.index(pipe)
        del self.subscribers[i]
    return unsubscribe
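# Usage sketch: fan writes out to a second pipe, then detach it
# (the pipe names are illustrative).
async def mirror_example(source_pipe, mirror_pipe, data):
    unsubscribe = source_pipe.subscribe(mirror_pipe)
    await source_pipe.write(data)  # mirror_pipe receives a copy
    unsubscribe()                  # mirror_pipe stops receiving data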
def _apply_dtype(self, data: np.ndarray) -> np.ndarray:
    """convert [data] to the pipe's [dtype]"""
    if data.ndim == 1:
        # already a structured array, just verify its data type
        if data.dtype != self.dtype:
            raise PipeError("wrong dtype for 1D (structured) array " +
                            "[%s] != req type [%s]" % (data.dtype, self.dtype))
        return data
    elif data.ndim == 2:
        # convert to a structured array
        sarray = np.zeros(data.shape[0], dtype=self.dtype)
        try:
            sarray['timestamp'] = data[:, 0]
            # need the squeeze in case sarray['data'] is 1 dimensional
            sarray['data'] = np.squeeze(data[:, 1:])
            return sarray
        except (IndexError, ValueError):
            raise PipeError("wrong number of fields for this data type")
    else:
        raise PipeError("wrong number of dimensions in array")
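# Illustration of the 2D-to-structured conversion above (the dtype is an
# assumption for a two-element float32 stream).
dtype = np.dtype([('timestamp', 'i8'), ('data', 'f4', 2)])
raw = np.array([[1000, 0.5, 1.5],
                [1001, 0.7, 1.7]])
sarray = np.zeros(2, dtype=dtype)
sarray['timestamp'] = raw[:, 0]
sarray['data'] = np.squeeze(raw[:, 1:])  # squeeze matters for single-element streams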
async def read(self, flatten=False) -> np.ndarray:
    """
    Read stream data. By default this method returns a structured array
    with ``timestamp`` and ``data`` fields. This method is a coroutine.

    Args:
        flatten: return an unstructured array (flat 2D matrix) with
          timestamps in the first column

    Returns:
        numpy.ndarray

    >>> data = await pipe.read()
    [1, 2, 3]
    >>> data = await pipe.read(flatten=True)  # the same data is returned again
    [1, 2, 3]
    >>> pipe.consume(len(data))  # the next call to read will return only new data

    """
    if self.direction == Pipe.DIRECTION.OUTPUT:
        raise PipeError("cannot read from an output pipe")
    raise PipeError("abstract method must be implemented by child")
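# Usage sketch: the canonical read/consume loop for an input pipe, per the
# EmptyPipe behavior in the methods above ('process' is a placeholder for
# application logic).
async def read_loop(pipe):
    while True:
        try:
            data = await pipe.read()  # unconsumed rows plus any new rows
        except EmptyPipe:
            break                     # producer closed and data is exhausted
        process(data)
        pipe.consume(len(data))       # leave rows unconsumed to see them again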
def write_nowait(self, data):
    if self._failed:
        raise PipeError('pipe failed')
    if self.closed:
        raise PipeError("Cannot write to a closed pipe")
    if not self._validate_data(data):
        return
    # convert into a structured array
    sarray = self._apply_dtype(data)
    # send data to subscribers
    for pipe in self.subscribers:
        if type(pipe) is LocalPipe:
            p: LocalPipe = pipe  # to appease the type checker
            p.write_nowait(sarray)
        else:
            raise PipeError("cannot write_nowait to subscriber [%s]" % pipe.name)
    self.queue.put_nowait(sarray)
    self.queued_rows += len(sarray)
    if self.debug:
        print("[%s:write] queueing block with [%d] rows" % (self.name, len(sarray)))
async def write(self, data):
    if self.closed:
        raise PipeError("Cannot write to a closed pipe")
    if not self._validate_data(data):
        return
    # make sure the dtype is structured
    sdata = self._apply_dtype(data)
    if self._caching:
        for row in sdata:
            self._cache[self._cache_index] = row
            self._cache_index += 1
            if self._cache_index >= len(self._cache):
                await self.flush_cache()
    else:
        await self._write(sdata)
def reread_last(self):
    if len(self.read_buffer) == 0:
        raise PipeError("No data left to reread")
    self._reread = True
def write_nowait(self, data):
    if self._closed:
        raise PipeError("Cannot write to a closed pipe")
    self.data_blocks.append(data)
def change_layout(self, layout: str):
    raise PipeError("layout cannot be changed")
async def read(self, flatten=False) -> np.ndarray:
    if self.reader is None:
        self.reader, self._reader_close = await self.reader_factory()
    if self.closed:
        # this happens if close is called before the first read
        if self._reader_close is not None:
            self._reader_close()
        raise PipeError("Cannot read from a closed pipe")
    rowbytes = self.dtype.itemsize
    # rows still storable in the buffer: subtract rows already present and
    # rows pending in the unprocessed byte buffer
    max_rows = self.BUFFER_SIZE - (self.last_index +
                                   len(self.unprocessed_np_buffer) // rowbytes)
    if max_rows == 0:
        # the buffer is full; it must be consumed before a new read
        return self._format_data(self.buffer[:self.last_index], flatten)
    # if reread is set just return the old data
    if self._reread:
        self._reread = False
        if self.last_index == 0:
            raise PipeError("No data left to reread")
        return self._format_data(self.buffer[:self.last_index], flatten)
    # make sure we get at least one full row of data from the read
    # (the row size depends on the data type)
    raw = b''
    while True:
        new_data = b''
        if self.reader.at_eof():
            # do not raise an exception, but is_empty() will return True
            # if self._last_read:
            #     raise EmptyPipe()  # this data has already been read once
            if (len(self.unprocessed_np_buffer) == 0
                    and self.last_index == 0):
                raise EmptyPipe()
            if len(self.unprocessed_np_buffer) == 0:
                # no new data is coming in, read() will just return
                # previously viewed data
                self._last_read = True
            break
        try:
            new_data = await asyncio.wait_for(
                self.reader.read(max_rows * rowbytes),
                self.TIMEOUT_INTERVAL)
        except asyncio.TimeoutError:
            pass
        raw += new_data
        if len(raw) < rowbytes:
            await asyncio.sleep(0.1)
        else:
            break
    # extra_bytes: number of leftover bytes after % rowbytes
    # byte_buffer: the extra_bytes from the last read
    # unprocessed_np_buffer: data left over from an interval break in the previous read
    extra_bytes = (len(raw) + len(self.byte_buffer)) % rowbytes
    if extra_bytes > 0:
        np_buffer = self.byte_buffer + raw[:-extra_bytes]
        self.byte_buffer = raw[-extra_bytes:]
    elif len(self.byte_buffer) > 0:
        np_buffer = self.byte_buffer + raw
        self.byte_buffer = b''
    else:
        # common case: byte_buffer is empty and there are no extra bytes
        np_buffer = raw
        self.byte_buffer = b''
    # append the unprocessed np_buffer from the previous read
    if len(self.unprocessed_np_buffer) > 0:
        self.unprocessed_np_buffer = self.unprocessed_np_buffer + np_buffer
        # check whether all the data can be processed; if not, store
        # the extra in unprocessed_np_buffer
        max_bytes = max_rows * rowbytes
        if len(self.unprocessed_np_buffer) <= max_bytes:
            np_buffer = self.unprocessed_np_buffer
            self.unprocessed_np_buffer = b''
        else:
            np_buffer = self.unprocessed_np_buffer[:max_bytes]
            self.unprocessed_np_buffer = self.unprocessed_np_buffer[max_bytes:]
    # check for an interval break
    self.interval_break = False
    loc = find_interval_token(np_buffer, self.layout)
    if loc is not None:
        self.unprocessed_np_buffer = np_buffer[loc[1]:] + self.unprocessed_np_buffer
        np_buffer = np_buffer[:loc[0]]
        self.interval_break = True
    data = np.frombuffer(np_buffer, dtype=self.dtype)
    # append the data onto the buffer
    self.buffer[self.last_index:self.last_index + len(data)] = data
    self.last_index += len(data)
    return self._format_data(self.buffer[:self.last_index], flatten)
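# Illustration of the partial-row bookkeeping above: with 12-byte rows, a
# 30-byte read leaves 6 trailing bytes that wait in byte_buffer until the
# next read completes the row (the sizes are illustrative).
rowbytes = 12
raw = bytes(30)
extra_bytes = len(raw) % rowbytes  # 6 bytes of an incomplete row
np_buffer = raw[:-extra_bytes]     # 24 bytes -> two complete rows
byte_buffer = raw[-extra_bytes:]   # saved for the next read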
def reread_last(self):
    if self.last_index == 0:
        raise PipeError("No data left to reread")
    self._reread = True