Example #1
async def _extract_data(conn: asyncpg.Connection,
                        stream: DataStream,
                        callback: Callable[[np.ndarray, str, int], Coroutine],
                        decimation_level: int = 1,
                        start: Optional[int] = None,
                        end: Optional[int] = None,
                        block_size: int = 50000):
    if decimation_level > 1:
        layout = stream.decimated_layout
    else:
        layout = stream.layout

    table_name = "data.stream%d" % stream.id
    if decimation_level > 1:
        table_name += "_%d" % decimation_level
    # extract by interval
    query = "SELECT time FROM data.stream%d_intervals " % stream.id
    query += psql_helpers.query_time_bounds(start, end)
    try:
        boundary_records = await conn.fetch(query)
    except asyncpg.UndefinedTableError:
        # no data tables
        data = np.array([], dtype=pipes.compute_dtype(layout))
        await callback(data, layout, decimation_level)
        return

    boundary_records += [{'time': end}]
    for i in range(len(boundary_records)):
        record = boundary_records[i]
        end = record['time']
        # extract the interval data in blocks of at most block_size rows
        while True:
            query = "SELECT * FROM %s " % table_name
            query += psql_helpers.query_time_bounds(start, end)
            query += " ORDER BY time ASC LIMIT %d" % block_size
            psql_bytes = BytesIO()
            try:
                await conn.copy_from_query(query,
                                           format='binary',
                                           output=psql_bytes)
            except asyncpg.UndefinedTableError:
                # interval table exists but not the data table
                data = np.array([], dtype=pipes.compute_dtype(layout))
                await callback(data, layout, decimation_level)
                return
            psql_bytes.seek(0)
            dtype = pipes.compute_dtype(layout)
            np_data = psql_helpers.bytes_to_data(psql_bytes, dtype)
            await callback(np_data, layout, decimation_level)

            if len(np_data) < block_size:
                break
            start = np_data['timestamp'][-1] + 1
        # do not put an interval token at the end of the data
        if i < len(boundary_records) - 1:
            await callback(pipes.interval_token(layout), layout,
                           decimation_level)
        start = end
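
A minimal sketch of how this helper might be driven from the surrounding store class; the pool and stream objects here are assumptions, and the collector simply accumulates every block (interval tokens included) in memory:

async def collect_all(pool, stream, start=None, end=None):
    # hypothetical caller for _extract_data(): gather every block the
    # callback receives, including interval-token rows
    blocks = []

    async def collector(data, layout, decimation_level):
        blocks.append(data)

    conn = await pool.acquire()
    try:
        await _extract_data(conn, stream, collector,
                            decimation_level=1, start=start, end=end)
    finally:
        await pool.release(conn)
    return blocks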
Example #2
    async def extract(self,
                      stream: DataStream,
                      start: Optional[int],
                      end: Optional[int],
                      callback: Callable[[np.ndarray, str, int], Coroutine],
                      max_rows: Optional[int] = None,
                      decimation_level: Optional[int] = None):
        # figure out appropriate decimation level
        if decimation_level is None:
            if max_rows is None:
                decimation_level = 1
            else:
                # find out how much data this represents
                count = await self._count_by_path(compute_path(stream), start,
                                                  end)
                if count > 0:
                    desired_decimation = np.ceil(count / max_rows)
                    decimation_level = int(4**np.ceil(
                        np.log(desired_decimation) /
                        np.log(self.decimation_factor)))
                else:
                    # create an empty array with the right data type
                    data = np.array([],
                                    dtype=pipes.compute_dtype(stream.layout))
                    await callback(data, stream.layout, 1)
                    return
                # make sure the target decimation level exists and has data
                try:
                    path = compute_path(stream, decimation_level)
                    if (await self._count_by_path(path, start, end)) == 0:
                        # no data in the decimated path
                        raise errors.InsufficientDecimationError(
                            "required level is empty")
                except errors.DataError as e:
                    if ERRORS.NO_SUCH_STREAM.value in str(e):
                        # no decimated data or required level does not exist
                        raise errors.InsufficientDecimationError(
                            "required level %d does not exist" %
                            decimation_level)
                    # some other error, propagate it up
                    raise e  # pragma: no cover
        elif max_rows is not None:
            # two constraints, make sure we aren't going to return too much data
            count = await self._count_by_path(
                compute_path(stream, decimation_level), start, end)
            if count > max_rows:
                raise errors.InsufficientDecimationError(
                    "actual_rows(%d) > max_rows(%d)" % (count, max_rows))

        # retrieve data from stream
        path = compute_path(stream, decimation_level)
        if decimation_level > 1:
            layout = stream.decimated_layout
        else:
            layout = stream.layout
        try:
            await self._extract_by_path(path, start, end, layout, callback)
        except aiohttp.ClientError as e:
            raise errors.DataError(str(e))
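
To make the decimation arithmetic above concrete, a small worked sketch (assuming a decimation factor of 4, the value implied by the exponent base):

import numpy as np

# e.g. 1,000,000 rows available but at most 3,000 requested
count, max_rows, decimation_factor = 1_000_000, 3_000, 4
desired_decimation = np.ceil(count / max_rows)  # 334
level = int(decimation_factor ** np.ceil(
    np.log(desired_decimation) / np.log(decimation_factor)))
print(level)  # 1024 -- the smallest power of 4 that brings 1e6 rows under 3,000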
Example #3
    async def callback(data, layout, decimation_factor):
        self.assertEqual(decimation_factor, 1)
        self.assertEqual(self.stream1.layout, layout)
        self.assertEqual(pipes.compute_dtype(self.stream1.layout), data.dtype)
        if len(data) == 0:
            self.assertEqual(0, len(data['timestamp']))
        else:
            for row in data:
                self.assertEqual(row, pipes.interval_token(self.stream1.layout))
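
In the test this callback would typically be handed straight to the store's extract call, roughly as below (the store attribute name is an assumption):

    # hypothetical invocation inside the test coroutine
    await self.store.extract(self.stream1, start=None, end=None,
                             callback=callback)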
Example #4
    async def extract(self,
                      stream: 'DataStream',
                      start: Optional[int],
                      end: Optional[int],
                      callback: Callable[[np.ndarray, str, int], Coroutine],
                      max_rows: Optional[int] = None,
                      decimation_level: Optional[int] = None):
        conn = await self.pool.acquire()
        # limit time bounds to range of base stream
        (start,
         end) = await psql_helpers.limit_time_bounds(conn, stream, start, end)
        # figure out appropriate decimation level
        if decimation_level is None:
            if max_rows is None:
                decimation_level = 1
            else:
                # find out how much data this represents
                count = await psql_helpers.get_row_count(
                    conn, stream, start, end)
                if count > 0:
                    desired_decimation = np.ceil(count / max_rows)

                    decimation_level = int(4**np.ceil(
                        np.log(desired_decimation) /
                        np.log(self.decimation_factor)))
                    # print("count=%d, max_rows=%d,desired_decim=%d,decim_level=%d" % (
                    #    count, max_rows, desired_decimation, decimation_level))
                else:
                    # create an empty array with the right data type
                    data = np.array([],
                                    dtype=pipes.compute_dtype(stream.layout))
                    await callback(data, stream.layout, 1)
                    await self.pool.release(conn)
                    return
        try:
            await _extract_data(conn,
                                stream,
                                callback,
                                decimation_level,
                                start,
                                end,
                                block_size=self.extract_block_size)
        finally:
            await self.pool.release(conn)
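
A common shape for the callback argument is a small coroutine that forwards each extracted block onward, for example into a joule pipe; store, stream, and output_pipe below are assumptions, not part of the original code:

async def extract_to_pipe(store, stream, output_pipe, max_rows=10_000):
    # hypothetical consumer: push every non-empty block into an output pipe
    async def forward(data, layout, decimation_level):
        if len(data) > 0:
            await output_pipe.write(data)  # pipe write is async (see Example #6)

    await store.extract(stream, start=None, end=None,
                        callback=forward, max_rows=max_rows)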
Example #5
    async def _copy_interval(istart, iend, bar):
        #print("[%s] -> [%s]" % (timestamp_to_human(istart), timestamp_to_human(iend)))
        if nilmdb_source:
            src_params = {
                'path': source,
                'binary': 1,
                'start': istart,
                'end': iend
            }
            src_url = "{server}/stream/extract".format(server=source_node)
            src_headers = {}
            src_ssl = None
        else:
            src_params = {'id': src_stream.id, 'start': istart, 'end': iend}
            src_url = "{server}/data".format(server=source_node.session.url)
            src_headers = {"X-API-KEY": source_node.session.key}
            src_ssl = source_node.session.ssl_context
        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(
                total=None)) as session:
            async with session.get(src_url,
                                   params=src_params,
                                   headers=src_headers,
                                   ssl=src_ssl) as src_response:
                if src_response.status != 200:
                    msg = await src_response.text()
                    if msg == 'this stream has no data':
                        # This is not an error because a previous copy may have been interrupted
                        # This will cause the destination to have an interval gap where the source has no data
                        # Example:   source:  |**     *******|
                        #            dest:    |** |  |*******|
                        #                          ^--- looks like missing data but there's nothing in the source
                        return  # ignore empty intervals
                    raise click.ClickException(
                        "Error reading from source: %s" % msg)

                pipe = pipes.InputPipe(stream=dest_stream,
                                       reader=src_response.content)

                async def _data_sender():

                    last_ts = istart
                    try:
                        while True:
                            data = await pipe.read()
                            pipe.consume(len(data))
                            if len(data) > 0:
                                cur_ts = data[-1]['timestamp']
                                yield data.tobytes()
                                # advance the bar by the time span covered by this chunk
                                bar.update(cur_ts - last_ts)
                                last_ts = cur_ts
                            # if pipe.end_of_interval:
                            #    yield pipes.interval_token(dest_stream.layout). \
                            #        tostring()
                    except pipes.EmptyPipe:
                        pass
                    bar.update(iend - last_ts)

                if nilmdb_dest:
                    dst_params = {
                        "start": istart,
                        "end": iend,
                        "path": destination,
                        "binary": 1
                    }
                    dst_url = "{server}/stream/insert".format(server=dest_node)
                    await _send_nilmdb_data(
                        dst_url, dst_params, _data_sender(),
                        pipes.compute_dtype(dest_stream.layout), session)
                else:
                    dst_url = "{server}/data".format(
                        server=dest_node.session.url)
                    dst_params = {"id": dest_stream.id}
                    dst_headers = {"X-API-KEY": dest_node.session.key}
                    dst_ssl = dest_node.session.ssl_context
                    async with session.post(dst_url,
                                            params=dst_params,
                                            data=_data_sender(),
                                            headers=dst_headers,
                                            ssl=dst_ssl,
                                            chunked=True) as dest_response:
                        if dest_response.status != 200:
                            msg = await dest_response.text()
                            raise errors.ApiError(
                                "Error writing to destination: %s" % msg)
Example #6
    async def _run():
        nonlocal stream_path
        # Open the file and make sure it is the right type
        try:
            hdf_root = h5py.File(file, 'r')
            hdf_timestamp = hdf_root['timestamp']
            hdf_data = hdf_root['data']
            start = hdf_timestamp[0, 0]
            end = hdf_timestamp[-1, 0]
            # make sure both datasets have the same length
            if len(hdf_data) != len(hdf_timestamp):
                raise click.ClickException(
                    "Length of [data] and [timestamp] datasets must match")
            # if a stream is not specified see if one is in the data file
            if stream_path is None:
                try:
                    stream_path = hdf_root.attrs['path']
                except KeyError:
                    raise click.ClickException(
                        "Specify a target stream with --stream")
        except OSError:
            raise click.ClickException(
                "Data file [%s] must be in HDF5 format" % file)
        except KeyError:
            raise click.ClickException(
                "Data file must contain [data] and [timestamp] datasets")

        # get the stream object from the API
        try:
            stream_obj = await config.node.data_stream_get(stream_path)
            print("Destination stream: %s" % stream_path)

            stream_info = await config.node.data_stream_info(stream_path)
            # make sure the datatypes match
            dtype = compute_dtype(stream_obj.layout)
            if dtype[1].base != hdf_data.dtype:
                raise click.ClickException(
                    "Incompatible datatypes: stream is [%s] and data file is [%s]"
                    % (dtype[1].base, hdf_data.dtype))
            # make sure the number of elements match
            if len(stream_obj.elements) != hdf_data.shape[1]:
                raise click.ClickException(
                    "DataStream has [%d] elements but data file has [%d] elements"
                    % (len(stream_obj.elements), hdf_data.shape[1]))
            # check if there is existing data in this time period
            if stream_info.rows > 0 and (start < stream_info.end
                                         and end >= stream_info.start):
                # confirm overwrite
                if not click.confirm(
                        "This will remove existing data between %s - %s" %
                        (timestamp_to_human(start), timestamp_to_human(end))):
                    click.echo("Cancelled")
                    return
                await config.node.data_delete(stream_obj, start, end + 1)
        except errors.ApiError as e:
            if '404' not in str(e):
                raise click.ClickException(str(e))
            # this stream doesn't exist, create it from the hdf attributes
            stream_obj = await _create_stream(stream_path, hdf_root,
                                              config.node)

        pipe = await config.node.data_write(stream_obj)

        # progress bar for writing data to the stream
        with click.progressbar(length=len(hdf_data),
                               label='ingesting data') as bar:
            for idx in range(0, len(hdf_data), BLOCK_SIZE):
                ts = hdf_timestamp[idx:idx + BLOCK_SIZE]
                data = hdf_data[idx:idx + BLOCK_SIZE]
                sdata = np.empty(len(ts), dtype=compute_dtype(stream_obj.layout))
                sdata['timestamp'][:, None] = ts
                sdata['data'] = data
                await pipe.write(sdata)
                bar.update(len(data))
            await pipe.close()
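
For reference, a sketch of building an HDF5 file in the layout this routine expects: an Nx1 timestamp dataset, an NxM data dataset, and an optional path attribute. The file name, stream path, element dtype, and microsecond timestamp unit are assumptions:

import h5py
import numpy as np

N, M = 1000, 3
with h5py.File('example.h5', 'w') as f:
    # assumed microsecond timestamps, stored one column wide as read above
    ts = (np.arange(N, dtype='i8') * 1000).reshape(N, 1)
    f.create_dataset('timestamp', data=ts)
    f.create_dataset('data', data=np.random.random((N, M)).astype('float32'))
    f.attrs['path'] = '/demo/ingest_target'  # hypothetical destination stream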