async def _extract_data(conn: asyncpg.Connection, stream: DataStream, callback,
                        decimation_level: int = 1,
                        start: Optional[int] = None,
                        end: Optional[int] = None,
                        block_size=50000):
    if decimation_level > 1:
        layout = stream.decimated_layout
    else:
        layout = stream.layout

    table_name = "data.stream%d" % stream.id
    if decimation_level > 1:
        table_name += "_%d" % decimation_level

    # extract by interval
    query = "SELECT time FROM data.stream%d_intervals " % stream.id
    query += psql_helpers.query_time_bounds(start, end)
    try:
        boundary_records = await conn.fetch(query)
    except asyncpg.UndefinedTableError:
        # no data tables
        data = np.array([], dtype=pipes.compute_dtype(layout))
        await callback(data, layout, decimation_level)
        return

    # add the requested end as the final boundary
    boundary_records += [{'time': end}]
    for i in range(len(boundary_records)):
        record = boundary_records[i]
        end = record['time']
        # extract the interval data in blocks of [block_size] rows
        while True:
            query = "SELECT * FROM %s " % table_name
            query += psql_helpers.query_time_bounds(start, end)
            query += " ORDER BY time ASC LIMIT %d" % block_size
            psql_bytes = BytesIO()
            try:
                await conn.copy_from_query(query, format='binary',
                                           output=psql_bytes)
            except asyncpg.UndefinedTableError:
                # interval table exists but not the data table
                data = np.array([], dtype=pipes.compute_dtype(layout))
                await callback(data, layout, decimation_level)
                return
            psql_bytes.seek(0)
            dtype = pipes.compute_dtype(layout)
            np_data = psql_helpers.bytes_to_data(psql_bytes, dtype)
            await callback(np_data, layout, decimation_level)

            if len(np_data) < block_size:
                break
            # continue from just after the last retrieved timestamp
            start = np_data['timestamp'][-1] + 1
        # do not put an interval token at the end of the data
        if i < len(boundary_records) - 1:
            await callback(pipes.interval_token(layout), layout,
                           decimation_level)
        start = end
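# Example callback for _extract_data (a minimal sketch, not part of the
# module): accumulate the extracted blocks into a single array. Only the
# callback signature (data, layout, decimation_level) is taken from the
# function above; the names below are illustrative.
async def collect_blocks(conn: asyncpg.Connection, stream: DataStream) -> np.ndarray:
    blocks = []

    async def accumulate(data: np.ndarray, layout: str, decimation_level: int):
        # interval tokens arrive through the same callback as data blocks
        blocks.append(data)

    await _extract_data(conn, stream, accumulate)
    if len(blocks) == 0:
        return np.array([], dtype=pipes.compute_dtype(stream.layout))
    return np.concatenate(blocks)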
async def extract(self, stream: DataStream, start: Optional[int], end: Optional[int],
                  callback: Callable[[np.ndarray, str, int], Coroutine],
                  max_rows: int = None, decimation_level=None):
    # figure out appropriate decimation level
    if decimation_level is None:
        if max_rows is None:
            decimation_level = 1
        else:
            # find out how much data this represents
            count = await self._count_by_path(compute_path(stream),
                                              start, end)
            if count > 0:
                desired_decimation = np.ceil(count / max_rows)
                decimation_level = int(4 ** np.ceil(
                    np.log(desired_decimation) /
                    np.log(self.decimation_factor)))
            else:
                # create an empty array with the right data type
                data = np.array([], dtype=pipes.compute_dtype(stream.layout))
                await callback(data, stream.layout, 1)
                return
            # make sure the target decimation level exists and has data
            try:
                path = compute_path(stream, decimation_level)
                if (await self._count_by_path(path, start, end)) == 0:
                    # no data in the decimated path
                    raise errors.InsufficientDecimationError(
                        "required level is empty")
            except errors.DataError as e:
                if ERRORS.NO_SUCH_STREAM.value in str(e):
                    # no decimated data or required level does not exist
                    raise errors.InsufficientDecimationError(
                        "required level %d does not exist" % decimation_level)
                # some other error, propagate it up
                raise e  # pragma: no cover
    elif max_rows is not None:
        # two constraints, make sure we aren't going to return too much data
        count = await self._count_by_path(
            compute_path(stream, decimation_level), start, end)
        if count > max_rows:
            raise errors.InsufficientDecimationError(
                "actual_rows(%d) > max_rows(%d)" % (count, max_rows))

    # retrieve data from stream
    path = compute_path(stream, decimation_level)
    if decimation_level > 1:
        layout = stream.decimated_layout
    else:
        layout = stream.layout
    try:
        await self._extract_by_path(path, start, end, layout, callback)
    except aiohttp.ClientError as e:
        raise errors.DataError(str(e))
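# Worked example of the decimation-level calculation above (a sketch that
# assumes decimation_factor is 4, matching the hard-coded base of the
# exponent): 1,000,000 rows reduced to at most 10,000 requires a decimation
# of at least 100, which rounds up to the next power of 4.
import numpy as np

count, max_rows, decimation_factor = 1_000_000, 10_000, 4
desired_decimation = np.ceil(count / max_rows)           # 100.0
level = int(4 ** np.ceil(np.log(desired_decimation) /
                         np.log(decimation_factor)))     # 4 ** 4 = 256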
async def callback(data, layout, decimation_factor):
    self.assertEqual(decimation_factor, 1)
    self.assertEqual(self.stream1.layout, layout)
    self.assertEqual(pipes.compute_dtype(self.stream1.layout), data.dtype)
    if len(data) == 0:
        self.assertEqual(0, len(data['timestamp']))
    else:
        for row in data:
            self.assertEqual(row, pipes.interval_token(self.stream1.layout))
async def extract(self, stream: 'DataStream', start: Optional[int], end: Optional[int],
                  callback: Callable[[np.ndarray, str, int], Coroutine],
                  max_rows: int = None, decimation_level=None):
    conn = await self.pool.acquire()
    # limit time bounds to range of base stream
    (start, end) = await psql_helpers.limit_time_bounds(conn, stream, start, end)
    # figure out appropriate decimation level
    if decimation_level is None:
        if max_rows is None:
            decimation_level = 1
        else:
            # find out how much data this represents
            count = await psql_helpers.get_row_count(conn, stream,
                                                     start, end)
            if count > 0:
                desired_decimation = np.ceil(count / max_rows)
                decimation_level = int(4 ** np.ceil(
                    np.log(desired_decimation) /
                    np.log(self.decimation_factor)))
                # print("count=%d, max_rows=%d, desired_decim=%d, decim_level=%d" % (
                #     count, max_rows, desired_decimation, decimation_level))
            else:
                # create an empty array with the right data type
                data = np.array([], dtype=pipes.compute_dtype(stream.layout))
                await callback(data, stream.layout, 1)
                await self.pool.release(conn)
                return
    try:
        await _extract_data(conn, stream, callback, decimation_level,
                            start, end, block_size=self.extract_block_size)
    finally:
        await self.pool.release(conn)
async def _copy_interval(istart, iend, bar):
    # print("[%s] -> [%s]" % (timestamp_to_human(istart), timestamp_to_human(iend)))
    if nilmdb_source:
        src_params = {'path': source,
                      'binary': 1,
                      'start': istart,
                      'end': iend}
        src_url = "{server}/stream/extract".format(server=source_node)
        src_headers = {}
        src_ssl = None
    else:
        src_params = {'id': src_stream.id, 'start': istart, 'end': iend}
        src_url = "{server}/data".format(server=source_node.session.url)
        src_headers = {"X-API-KEY": source_node.session.key}
        src_ssl = source_node.session.ssl_context
    async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=None)) as session:
        async with session.get(src_url, params=src_params,
                               headers=src_headers, ssl=src_ssl) as src_response:
            if src_response.status != 200:
                msg = await src_response.text()
                if msg == 'this stream has no data':
                    # This is not an error because a previous copy may have
                    # been interrupted. This will cause the destination to
                    # have an interval gap where the source has no data.
                    # Example: source: |** *******|
                    #          dest:   |** | |*******|
                    #                      ^--- looks like missing data but
                    #                           there's nothing in the source
                    return  # ignore empty intervals
                raise click.ClickException(
                    "Error reading from source: %s" % msg)

            pipe = pipes.InputPipe(stream=dest_stream,
                                   reader=src_response.content)

            async def _data_sender():
                last_ts = istart
                try:
                    while True:
                        data = await pipe.read()
                        pipe.consume(len(data))
                        if len(data) > 0:
                            cur_ts = data[-1]['timestamp']
                            yield data.tobytes()
                            # total time extents of this chunk
                            bar.update(cur_ts - last_ts)
                            last_ts = cur_ts
                        # if pipe.end_of_interval:
                        #     yield pipes.interval_token(dest_stream.layout). \
                        #         tostring()
                except pipes.EmptyPipe:
                    pass
                bar.update(iend - last_ts)

            if nilmdb_dest:
                dst_params = {"start": istart,
                              "end": iend,
                              "path": destination,
                              "binary": 1}
                dst_url = "{server}/stream/insert".format(server=dest_node)
                await _send_nilmdb_data(dst_url,
                                        dst_params,
                                        _data_sender(),
                                        pipes.compute_dtype(dest_stream.layout),
                                        session)
            else:
                dst_url = "{server}/data".format(server=dest_node.session.url)
                dst_params = {"id": dest_stream.id}
                dst_headers = {"X-API-KEY": dest_node.session.key}
                dst_ssl = dest_node.session.ssl_context
                async with session.post(dst_url,
                                        params=dst_params,
                                        data=_data_sender(),
                                        headers=dst_headers,
                                        ssl=dst_ssl,
                                        chunked=True) as dest_response:
                    if dest_response.status != 200:
                        msg = await dest_response.text()
                        raise errors.ApiError(
                            "Error writing to destination: %s" % msg)
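# The copy above streams data between HTTP endpoints without buffering a whole
# interval in memory: aiohttp accepts an async generator as the request body
# and sends each yielded bytes object as an HTTP chunk, which is how
# _data_sender() is consumed. A minimal sketch of the same pattern (the URL
# and payload are placeholders, not Joule endpoints):
import aiohttp

async def stream_upload(url: str) -> int:
    async def chunks():
        for i in range(3):
            yield b"chunk %d\n" % i  # each yield becomes one HTTP chunk

    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=chunks(), chunked=True) as resp:
            return resp.status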
async def _run():
    nonlocal stream_path
    # Open the file and make sure it is the right type
    try:
        hdf_root = h5py.File(file, 'r')
        hdf_timestamp = hdf_root['timestamp']
        hdf_data = hdf_root['data']
        start = hdf_timestamp[0, 0]
        end = hdf_timestamp[-1, 0]
        # make sure the lengths of both datasets are the same
        if len(hdf_data) != len(hdf_timestamp):
            raise click.ClickException(
                "Length of [data] and [timestamp] datasets must match")
        # if a stream is not specified see if one is in the data file
        if stream_path is None:
            try:
                stream_path = hdf_root.attrs['path']
            except KeyError:
                raise click.ClickException(
                    "Specify a target stream with --stream")
    except OSError:
        raise click.ClickException("Data file [%s] must be hdf5 format" % file)
    except KeyError:
        raise click.ClickException(
            "Data file must contain [data] and [timestamp] datasets")

    # get the stream object from the API
    try:
        stream_obj = await config.node.data_stream_get(stream_path)
        print("Destination stream: %s" % stream_path)
        stream_info = await config.node.data_stream_info(stream_path)
        # make sure the datatypes match
        dtype = compute_dtype(stream_obj.layout)
        if dtype[1].base != hdf_data.dtype:
            raise click.ClickException(
                "Incompatible datatypes, stream is [%s] and data file is [%s]" %
                (dtype[1].base, hdf_data.dtype))
        # make sure the number of elements match
        if len(stream_obj.elements) != hdf_data.shape[1]:
            raise click.ClickException(
                "DataStream has [%d] elements but data file has [%d] elements" %
                (len(stream_obj.elements), hdf_data.shape[1]))
        # check if there is existing data in this time period
        if stream_info.rows > 0 and (start < stream_info.end and
                                     end >= stream_info.start):
            # confirm overwrite
            if not click.confirm(
                    "This will remove existing data between %s - %s" %
                    (timestamp_to_human(start), timestamp_to_human(end))):
                click.echo("Cancelled")
                return
            await config.node.data_delete(stream_obj, start, end + 1)
    except errors.ApiError as e:
        if '404' not in str(e):
            raise click.ClickException(str(e))
        # this stream doesn't exist, create it from the hdf attributes
        stream_obj = await _create_stream(stream_path, hdf_root, config.node)

    pipe = await config.node.data_write(stream_obj)
    # progress bar for the ingest
    bar_ctx = click.progressbar(length=len(hdf_data), label='ingesting data')
    bar = bar_ctx.__enter__()
    for idx in range(0, len(hdf_data), BLOCK_SIZE):
        ts = hdf_timestamp[idx:idx + BLOCK_SIZE]
        data = hdf_data[idx:idx + BLOCK_SIZE]
        sdata = np.empty(len(ts), dtype=compute_dtype(stream_obj.layout))
        sdata['timestamp'][:, None] = ts
        sdata['data'] = data
        await pipe.write(sdata)
        bar.update(len(data))
    await pipe.close()
    bar_ctx.__exit__(None, None, None)
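# A minimal sketch of building an input file that the ingest routine above
# accepts, assuming a stream with two float32 elements. [timestamp] must be a
# (rows, 1) column of integer timestamps and [data] must be
# (rows, n_elements); the optional 'path' attribute names the target stream
# when --stream is omitted. The file name and stream path are placeholders.
import h5py
import numpy as np

rows = 1000
with h5py.File("example.h5", "w") as hdf_root:
    ts = np.arange(rows, dtype=np.int64)[:, None]  # shape (rows, 1)
    hdf_root.create_dataset("timestamp", data=ts)
    hdf_root.create_dataset("data",
                            data=np.random.rand(rows, 2).astype(np.float32))
    hdf_root.attrs["path"] = "/demo/ingest-example"  # hypothetical stream path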