Example 1
    def test_hdf5_persist_decimate(self):
        # Test HDF5 writing and reading back with min/max decimation
        ds_schema_str = """
        type: scion_data_schema_1
        description: Schema for test datasets
        attributes:
          basic_shape: 1d_timeseries
          time_variable: time
          persistence:
            format: hdf5
            layout: vars_individual
            row_increment: 1000
            time_index_step: 1000
        variables:
          - name: time
            base_type: ntp_time
            storage_dtype: i8
            unit: ""
            description: NTPv4 timestamp
          - name: var1
            base_type: float
            storage_dtype: f8
            unit: ""
            description: Sample value
          - name: random1
            base_type: float
            storage_dtype: f8
            unit: ""
            description: Random values
        """
        ds_schema = yaml.load(ds_schema_str)
        ds_id = create_simple_unique_id()
        ds_filename = self.container.file_system.get(
            "%s/%s%s.hdf5" % (DS_BASE_PATH, DS_FILE_PREFIX, ds_id))

        self.hdf5_persist = DatasetHDF5Persistence.get_persistence(
            ds_id, ds_schema, "hdf5")
        self.hdf5_persist.require_dataset()

        self.assertTrue(os.path.exists(ds_filename))
        self.addCleanup(os.remove, ds_filename)

        # Add 10000 values in packets of 100
        for i in xrange(100):
            packet = self._get_data_packet(i * 100, 100)
            self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res), 3)
        self.assertEqual(len(data_res["time"]), 10000)

        data_res = self.hdf5_persist.get_data(
            dict(max_rows=999, decimate=True, decimate_method="minmax"))
        self.assertEqual(len(data_res), 3)
        self.assertLessEqual(len(data_res["time"]), 1000)
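The tests call a _get_data_packet(start_idx, num_rows) helper that is not shown in these examples. The sketch below is a hypothetical stand-in, assuming a packet is simply a dict of equal-length arrays keyed by variable name, with samples spaced 10 seconds apart and var1 equal to the row index (the assertions in the tests further down imply that pattern); the real helper presumably builds whatever packet object extend_dataset actually expects, so treat this purely as an illustration.

    import random
    import numpy as np

    def make_test_packet(start_idx, num_rows, base_time=1000000000):
        # Hypothetical stand-in for the _get_data_packet test helper: one row
        # every 10 seconds, var1 equal to the row index, random1 random floats.
        rows = np.arange(start_idx, start_idx + num_rows)
        return dict(
            time=(base_time + 10 * rows).astype("i8"),
            var1=rows.astype("f8"),
            random1=np.array([random.random() for _ in rows], dtype="f8"),
        )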
Example 2
    def get_asset_data(self, asset_id='', data_format='', data_filter=None):
        asset_obj = self._validate_resource_id("asset_id", asset_id,
                                               RT.Instrument)
        dataset_objs, _ = self.rr.find_objects(asset_id,
                                               PRED.hasDataset,
                                               RT.Dataset,
                                               id_only=False)
        if not dataset_objs:
            raise BadRequest("Could not find dataset")
        dataset_obj = dataset_objs[0]

        from ion.data.persist.hdf5_dataset import DatasetHDF5Persistence
        persistence = DatasetHDF5Persistence(dataset_obj._id,
                                             dataset_obj.schema_definition,
                                             "hdf5")
        data_filter1 = dict(transpose_time=True,
                            time_format="unix_millis",
                            max_rows=1000)
        data_filter1.update(data_filter or {})

        data_info = dict(dataset_id=dataset_obj._id,
                         ts_generated=get_ion_ts(),
                         data={},
                         info={},
                         num_rows=0)

        if data_filter1.get("get_info", None) is True:
            data_info["variables"] = [
                var_info["name"]
                for var_info in dataset_obj.schema_definition["variables"]
            ]
            data_info["schema"] = dataset_obj.schema_definition
            res_info = persistence.get_data_info(data_filter1)
            data_info["info"].update(res_info)

        if data_filter1.get("include_data", True):
            raw_data = persistence.get_data(data_filter=data_filter1)
            data_info["data"] = raw_data
            data_info["num_rows"] = len(
                raw_data.values()[0]) if raw_data else 0

        return data_info
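As a usage illustration, the filter keys read above (get_info, include_data, max_rows, plus the transpose_time/time_format defaults) can be combined by the caller; the asset_mgmt service handle below is hypothetical.

    # Metadata only: variable names, schema and dataset info, no bulk data
    info = asset_mgmt.get_asset_data(
        asset_id=asset_id,
        data_filter=dict(get_info=True, include_data=False))

    # Bulk data capped at 500 rows, using the default unix_millis time format
    res = asset_mgmt.get_asset_data(asset_id=asset_id,
                                    data_filter=dict(max_rows=500))
    print(res["num_rows"], sorted(res["data"].keys()))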
Example 3
    def download_asset_data(self,
                            asset_id='',
                            data_format='',
                            data_filter=None):
        asset_obj = self._validate_resource_id("asset_id", asset_id,
                                               RT.Instrument)
        dataset_objs, _ = self.rr.find_objects(asset_id,
                                               PRED.hasDataset,
                                               RT.Dataset,
                                               id_only=False)
        if not dataset_objs:
            raise BadRequest("Could not find dataset")
        dataset_obj = dataset_objs[0]

        if data_format and data_format != "hdf5":
            raise BadRequest("Unsupported download data format")

        from ion.data.persist.hdf5_dataset import DatasetHDF5Persistence
        persistence = DatasetHDF5Persistence(dataset_obj._id,
                                             dataset_obj.schema_definition,
                                             "hdf5")
        data_filter1 = dict(transpose_time=True,
                            time_format="unix_millis",
                            max_rows=100000,
                            start_time=get_ion_ts_millis() - 86400000)
        data_filter1.update(data_filter or {})
        temp_filename = persistence.get_data_copy(data_filter=data_filter1)

        resp_hdrs = {
            "Content-Disposition":
            'attachment; filename="ds_%s.hdf5"' % asset_obj._id
        }

        mr = MediaResponse(media_mimetype="application/octet-stream",
                           body=temp_filename,
                           internal_encoding="filename",
                           response_headers=resp_hdrs)

        return mr
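The default filter here restricts the download to at most 100000 rows from the last 24 hours (86400000 ms); both limits can be overridden through data_filter. A hedged example call, again with a hypothetical asset_mgmt handle:

    # Download roughly the last 7 days instead of the default 24 hours
    week_ms = 7 * 86400000
    media = asset_mgmt.download_asset_data(
        asset_id=asset_id,
        data_format="hdf5",
        data_filter=dict(start_time=get_ion_ts_millis() - week_ms,
                         max_rows=500000))
    # media.body holds the temporary filename (internal_encoding="filename")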
Example 4
    def test_hdf5_persist(self):
        # Test HDF5 writing, time indexing, array extension etc
        ds_schema_str = """
        type: scion_data_schema_1
        description: Schema for test datasets
        attributes:
          basic_shape: 1d_timeseries
          time_variable: time
          persistence:
            format: hdf5
            layout: vars_individual
            row_increment: 1000
            time_index_step: 1000
        variables:
          - name: time
            base_type: ntp_time
            storage_dtype: i8
            unit: ""
            description: NTPv4 timestamp
          - name: var1
            base_type: float
            storage_dtype: f8
            unit: ""
            description: Sample value
          - name: random1
            base_type: float
            storage_dtype: f8
            unit: ""
            description: Random values
        """
        ds_schema = yaml.load(ds_schema_str)
        ds_id = create_simple_unique_id()
        ds_filename = self.container.file_system.get(
            "%s/%s%s.hdf5" % (DS_BASE_PATH, DS_FILE_PREFIX, ds_id))

        self.hdf5_persist = DatasetHDF5Persistence.get_persistence(
            ds_id, ds_schema, "hdf5")
        self.hdf5_persist.require_dataset()

        self.assertTrue(os.path.exists(ds_filename))
        self.addCleanup(os.remove, ds_filename)

        # Add 100 values in packets of 10
        for i in xrange(10):
            packet = self._get_data_packet(i * 10, 10)
            self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res), 3)
        self.assertEqual(len(data_res["time"]), 100)
        self.assertEqual(len(data_res["var1"]), 100)
        self.assertEqual(len(data_res["random1"]), 100)
        self.assertEqual(data_res["var1"][1], 1.0)

        with HDFLockingFile(ds_filename, "r") as hdff:
            ds_time = hdff["vars/time"]
            cur_idx = ds_time.attrs["cur_row"]
            self.assertEqual(cur_idx, 100)
            self.assertEqual(len(ds_time), 1000)

            ds_tidx = hdff[DS_TIMEIDX_PATH]
            cur_tidx = ds_tidx.attrs["cur_row"]
            self.assertEqual(cur_tidx, 1)
            self.assertEqual(len(ds_tidx), 1000)

        # Add 1000 values in packets of 10
        for i in xrange(100):
            packet = self._get_data_packet(100 + i * 10, 10)
            self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res["time"]), 1100)

        with HDFLockingFile(ds_filename, "r") as hdff:
            ds_time = hdff["vars/time"]
            cur_idx = ds_time.attrs["cur_row"]
            self.assertEqual(cur_idx, 1100)
            self.assertEqual(len(ds_time), 2000)

            ds_tidx = hdff[DS_TIMEIDX_PATH]
            cur_tidx = ds_tidx.attrs["cur_row"]
            self.assertEqual(cur_tidx, 2)
            self.assertEqual(len(ds_tidx), 1000)

            self.assertEqual(ds_time[0], ds_tidx[0][0])
            self.assertEqual(ds_time[1000], ds_tidx[1][0])

        info_res = self.hdf5_persist.get_data_info()

        self.assertEqual(info_res["ds_rows"], 1100)
        self.assertEqual(info_res["ts_first"], 1000000000.0)
        self.assertEqual(info_res["ts_last"], 1000010990.0)
Example 5
    def test_hdf5_persist_prune(self):
        # Test auto-pruning
        ds_schema_str = """
type: scion_data_schema_1
description: Schema for test datasets
attributes:
  basic_shape: 1d_timeseries
  time_variable: time
  persistence:
    format: hdf5
    layout: vars_individual
    row_increment: 1000
    time_index_step: 1000
  pruning:
    trigger_mode: on_ingest
    prune_mode: max_age_rel
    prune_action: rewrite
    trigger_age: 1000.0
    retain_age: 500.0
variables:
  - name: time
    base_type: ntp_time
    storage_dtype: i8
    unit: ""
    description: NTPv4 timestamp
  - name: var1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Sample value
  - name: random1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Random values
"""
        ds_schema = yaml.load(ds_schema_str)
        ds_id = create_simple_unique_id()
        ds_filename = self.container.file_system.get(
            "%s/%s%s.hdf5" % (DS_BASE_PATH, DS_FILE_PREFIX, ds_id))

        self.hdf5_persist = DatasetHDF5Persistence.get_persistence(
            ds_id, ds_schema, "hdf5")
        self.hdf5_persist.require_dataset()

        self.assertTrue(os.path.exists(ds_filename))
        self.addCleanup(os.remove, ds_filename)

        # Add 100 values in packets of 10 (right up to the prune trigger)
        for i in xrange(10):
            packet = self._get_data_packet(i * 10, 10)
            self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res["time"]), 100)
        self.assertEqual(len(data_res["var1"]), 100)
        self.assertEqual(len(data_res["random1"]), 100)
        self.assertEqual(data_res["var1"][1], 1.0)

        log.info("*** STEP 2: First prune")

        # Add 2 values (stepping across the prune trigger - inclusive boundary)
        packet = self._get_data_packet(100, 2)
        self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res["time"]), 51)
        self.assertEqual(len(data_res["var1"]), 51)
        self.assertEqual(len(data_res["random1"]), 51)
        self.assertEqual(data_res["var1"][0], 51.0)
        self.assertEqual(data_res["var1"][50], 101.0)

        log.info("*** STEP 3: Additional data")

        # Add 50 more values in mixed-size packets (right up to the next prune trigger)
        packet = self._get_data_packet(102, 8)
        self.hdf5_persist.extend_dataset(packet)
        for i in xrange(4):
            packet = self._get_data_packet(110 + i * 10, 10)
            self.hdf5_persist.extend_dataset(packet)

        packet = self._get_data_packet(150, 2)
        self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res["time"]), 101)
        self.assertEqual(data_res["var1"][0], 51.0)
        self.assertEqual(data_res["var1"][100], 151.0)

        log.info("*** STEP 4: Second prune")

        packet = self._get_data_packet(152, 1)
        self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res["time"]), 51)
        self.assertEqual(data_res["var1"][0], 102.0)
        self.assertEqual(data_res["var1"][50], 152.0)

        log.info("*** STEP 5: Third prune")

        packet = self._get_data_packet(153, 100)
        self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res["time"]), 51)
        self.assertEqual(data_res["var1"][0], 202.0)
        self.assertEqual(data_res["var1"][50], 252.0)