Example #1
 def test_time_string_fidelity(self):
     it1 = NTP4Time()
     ntp_str = it1.to_string()
     it2 = NTP4Time.from_string(ntp_str)
     
     self.assertEquals(it1.seconds, it2.seconds)
     self.assertTrue(np.abs(it1.useconds - it2.useconds) <= 1)
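
The one-microsecond tolerance is inherent to the format: an NTP v4 timestamp stores the sub-second part as a 32-bit binary fraction (about 233 ps resolution), so converting microseconds to the fraction and back can shift the value by up to one microsecond. A minimal sketch of that round trip, independent of NTP4Time:

    def useconds_to_fraction(us):
        # Scale microseconds into NTP's 32-bit binary fraction of a second
        return us * 2**32 // 10**6

    def fraction_to_useconds(frac):
        # Scale back, rounding to the nearest microsecond
        return int(round(frac * 10**6 / float(2**32)))

    for us in (0, 1, 123456, 999999):
        assert abs(fraction_to_useconds(useconds_to_fraction(us)) - us) <= 1
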
Example #2
    def _prune_dataset(self, data_file):
        if not self.prune_mode:
            return
        if self.prune_mode == "max_age_rel":
            # Prune when the first timestamp is older than trigger_age relative to the most recent timestamp
            trigger_age = float(self.pruning_attrs.get("trigger_age", 0))
            retain_age = float(self.pruning_attrs.get("retain_age", 0))
            if trigger_age <= 0.0 or retain_age <= 0.0 or trigger_age < retain_age:
                raise BadRequest("Bad pruning trigger_age or retain_age")
            var_ds = data_file["vars/%s" % self.time_var]
            cur_idx = var_ds.attrs["cur_row"]
            if not len(var_ds) or not cur_idx:
                return
            min_ts = NTP4Time.from_ntp64(var_ds[0].tostring()).to_unix()
            max_ts = NTP4Time.from_ntp64(var_ds[cur_idx - 1].tostring()).to_unix()
            if min_ts + trigger_age >= max_ts:
                return

            # Retain only rows from (max_ts - retain_age) onward; copy them to a new file (start_time in millis)
            start_time = (max_ts - retain_age) * 1000
            log.info("PRUNING dataset now: mode=%s, start_time=%s", self.prune_mode, int(start_time))
            copy_filename = self._get_data_copy(data_file, data_filter=dict(start_time=start_time))
        elif self.prune_mode == "max_age_abs":
            # Prune when the first timestamp is older than trigger_age relative to the current timestamp
            raise NotImplementedError()
        elif self.prune_mode == "max_rows":
            raise NotImplementedError()
        elif self.prune_mode == "max_size":
            raise NotImplementedError()
        else:
            raise BadRequest("Invalid prune_mode: %s" % self.prune_mode)

        if not copy_filename:
            return

        # Replace the data file with the pruned copy.
        # Heed race conditions so that waiting processes don't lock the old file first
        ds_filename = self._get_ds_filename()
        ds_filename_bak = ds_filename + ".bak"
        if os.path.exists(ds_filename_bak):
            os.remove(ds_filename_bak)

        data_file.close()  # Note: Inter-process race condition possible because close removes the lock
        shutil.move(ds_filename, ds_filename_bak)
        shutil.move(copy_filename, ds_filename)  # Note: There may be a cross-device-link error here
        # os.remove(ds_filename_bak)
        log.info("Pruning successful. Replaced dataset with pruned file.")
        return True
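
Stripped of the HDF5 plumbing, the max_age_rel rule above is plain timestamp arithmetic: prune only once the dataset spans more than trigger_age seconds, then keep just the trailing retain_age seconds. A standalone sketch of the same decision (function and names are illustrative):

    def prune_window(min_ts, max_ts, trigger_age, retain_age):
        # Returns the new start time in Unix seconds, or None if no pruning is due
        if trigger_age <= 0.0 or retain_age <= 0.0 or trigger_age < retain_age:
            raise ValueError("Bad pruning trigger_age or retain_age")
        if min_ts + trigger_age >= max_ts:
            return None
        return max_ts - retain_age

    assert prune_window(0, 50, 60, 30) is None   # span still below trigger_age
    assert prune_window(0, 100, 60, 30) == 70    # keep only the last 30 seconds
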
Example #3
 def gte_time(data_val, cmp_val, allow_equal=True):
     # Supports NTP4 timestamps and Unix millis (i8); time_type is assumed to come from the enclosing scope
     if time_type == "ntp_time":
         if allow_equal:
             return NTP4Time.from_ntp64(data_val.tostring()).to_unix() >= cmp_val
         else:
             return NTP4Time.from_ntp64(data_val.tostring()).to_unix() > cmp_val
     else:
         if allow_equal:
             return data_val >= cmp_val
         else:
             return data_val > cmp_val
Example #4
 def gte_time(data_val, cmp_val, allow_equal=True):
     # Supports NTP4 timestamps and Unix millis (i8); time_type is assumed to come from the enclosing scope
     if time_type == "ntp_time":
         if allow_equal:
             return NTP4Time.from_ntp64(data_val.tostring()).to_unix() >= cmp_val
         else:
             return NTP4Time.from_ntp64(data_val.tostring()).to_unix() > cmp_val
     else:
         if allow_equal:
             return data_val >= cmp_val
         else:
             return data_val > cmp_val
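
Note that time_type is not defined inside gte_time itself, so in context it is presumably captured from an enclosing scope. A hedged sketch of that closure pattern (make_gte_time and to_unix are illustrative names, not part of the original API):

    def make_gte_time(time_type, to_unix=None):
        # to_unix would wrap e.g. lambda v: NTP4Time.from_ntp64(v.tostring()).to_unix()
        def gte_time(data_val, cmp_val, allow_equal=True):
            if time_type == "ntp_time":
                data_val = to_unix(data_val)
            return data_val >= cmp_val if allow_equal else data_val > cmp_val
        return gte_time

    gte = make_gte_time("unix_millis")
    assert gte(5, 5) and not gte(5, 5, allow_equal=False)
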
Example #5
    def get_data_info(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_info = {}
            max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]

            res_info["ds_rows"] = cur_idx
            res_info["ds_size"] = len(ds_time)
            res_info["file_size"] = os.path.getsize(ds_filename)
            res_info["file_name"] = ds_filename
            res_info["vars"] = list(data_file["vars"])

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            res_info["need_expand"] = self.expand_info.get("need_expand", False)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            res_info["should_decimate"] = should_decimate
            res_info["need_decimate"] = bool(should_decimate and end_row - start_row > max_rows)

            res_info["ts_first"] = NTP4Time.from_ntp64(ds_time.value[0].tostring()).to_unix()
            res_info["ts_last"] = NTP4Time.from_ntp64(ds_time.value[cur_idx - 1].tostring()).to_unix()
            res_info["ts_first_str"] = get_datetime_str(res_info["ts_first"] * 1000, local_time=False)
            res_info["ts_last_str"] = get_datetime_str(res_info["ts_last"] * 1000, local_time=False)

            res_info["ds_samples"] = cur_idx * self.expand_info["num_steps"] if res_info["need_expand"] else cur_idx

            res_info["filter_start_row"] = start_row
            res_info["filter_end_row"] = end_row
            res_info["filter_max_rows"] = max_rows
            res_info["filter_ts_first"] = NTP4Time.from_ntp64(ds_time.value[start_row].tostring()).to_unix()
            res_info["filter_ts_last"] = NTP4Time.from_ntp64(ds_time.value[end_row - 1].tostring()).to_unix()
            res_info["filter_ts_first_str"] = get_datetime_str(res_info["filter_ts_first"] * 1000, local_time=False)
            res_info["filter_ts_last_str"] = get_datetime_str(res_info["filter_ts_last"] * 1000, local_time=False)

            return res_info

        finally:
            data_file.close()
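
The layout this method assumes is one HDF5 dataset per variable under vars/, preallocated to len(ds) rows, with a cur_row attribute marking how many rows actually hold data. A minimal sketch of that layout in plain h5py (file name illustrative):

    import h5py
    import numpy as np

    with h5py.File("/tmp/example_ds.hdf5", "w") as f:
        ds = f.create_dataset("vars/time", shape=(100,), dtype="i8")
        ds[0:3] = np.array([10, 20, 30], dtype="i8")
        ds.attrs["cur_row"] = 3  # rows 0..2 are real data; the rest is preallocated fill
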
Example #6
    def _prune_dataset(self, data_file):
        if not self.prune_mode:
            return
        if self.prune_mode == "max_age_rel":
            # Prune when the first timestamp is older than trigger_age relative to the most recent timestamp
            trigger_age = float(self.pruning_attrs.get("trigger_age", 0))
            retain_age = float(self.pruning_attrs.get("retain_age", 0))
            if trigger_age <= 0.0 or retain_age <= 0.0 or trigger_age < retain_age:
                raise BadRequest("Bad pruning trigger_age or retain_age")
            var_ds = data_file["vars/%s" % self.time_var]
            cur_idx = var_ds.attrs["cur_row"]
            if not len(var_ds) or not cur_idx:
                return
            min_ts = NTP4Time.from_ntp64(var_ds[0].tostring()).to_unix()
            max_ts = NTP4Time.from_ntp64(var_ds[cur_idx - 1].tostring()).to_unix()
            if min_ts + trigger_age >= max_ts:
                return

            # Retain only rows from (max_ts - retain_age) onward; copy them to a new file (start_time in millis)
            start_time = (max_ts - retain_age) * 1000
            log.info("PRUNING dataset now: mode=%s, start_time=%s", self.prune_mode, int(start_time))
            copy_filename = self._get_data_copy(data_file, data_filter=dict(start_time=start_time))
        elif self.prune_mode == "max_age_abs":
            # Prune when the first timestamp is older than trigger_age relative to the current timestamp
            raise NotImplementedError()
        elif self.prune_mode == "max_rows":
            raise NotImplementedError()
        elif self.prune_mode == "max_size":
            raise NotImplementedError()
        else:
            raise BadRequest("Invalid prune_mode: %s" % self.prune_mode)

        if not copy_filename:
            return

        # Replace the data file with the pruned copy.
        # Heed race conditions so that waiting processes don't lock the old file first
        ds_filename = self._get_ds_filename()
        ds_filename_bak = ds_filename + ".bak"
        if os.path.exists(ds_filename_bak):
            os.remove(ds_filename_bak)

        data_file.close()  # Note: Inter-process race condition possible because close removes the lock
        shutil.move(ds_filename, ds_filename_bak)
        shutil.move(copy_filename, ds_filename)  # Note: There may be a cross-device-link error here
        # os.remove(ds_filename_bak)
        log.info("Pruning successful. Replaced dataset with pruned file.")
        return True
Example #7
    def acquire_samples(self, max_samples=0):

        sample = [NTP4Time.utcnow().to_ntp64(), psutil.cpu_percent()]

        sample_desc = dict(cols=["time", "cpu_percent"], data=[sample])

        return sample_desc
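
One psutil detail matters for a polling agent like this: cpu_percent() called without an interval measures utilization since the previous call, so the very first call returns 0.0 and meaningful values require periodic sampling. A quick check:

    import time
    import psutil

    print(psutil.cpu_percent())  # first call has no reference point and returns 0.0
    time.sleep(0.5)
    print(psutil.cpu_percent())  # CPU utilization since the previous call
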
Example #8
 def build_packet_from_samples(cls, samples, **kwargs):
     num_samples = len(samples["data"])
     dtype_parts = []
     for coldef in samples["cols"]:
         if coldef == "time":
             dtype_parts.append((coldef, "i8"))
         elif "coltypes" in samples and coldef in samples["coltypes"]:
             dtype_parts.append((coldef, samples["coltypes"][coldef]))
         else:
             dtype_parts.append((coldef, "f8"))
     dt = np.dtype(dtype_parts)
     data_array = np.zeros(num_samples, dtype=dt)
     for row_num, data_row in enumerate(samples["data"]):
         row_tuple = tuple(NTP4Time.np_from_string(dv) if isinstance(dv, basestring) else dv
                           for dv in data_row)
         data_array[row_num] = np.array(row_tuple, dtype=dt)
     data = samples.copy()
     data["data"] = data_array
     new_packet = DataPacket(ts_created=get_ion_ts(), data=data)
     for attr in new_packet.__dict__.keys():
         if attr in ('data', 'ts_created'):
             continue
         if attr in kwargs:
             setattr(new_packet, attr, kwargs[attr])
     return new_packet
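
The builder packs each row into a NumPy structured array: one named field per column, i8 for the 8-byte NTP64 timestamp, f8 for everything else unless coltypes overrides it. A minimal sketch of that construction:

    import numpy as np

    dt = np.dtype([("time", "i8"), ("cpu_percent", "f8")])
    data_array = np.zeros(2, dtype=dt)
    data_array[0] = (1234567890, 42.5)
    print(data_array["cpu_percent"])  # columns are addressable by name -> [42.5, 0.0]
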
Example #9
    def acquire_samples(self, max_samples=0):

        sample = [NTP4Time.utcnow().to_ntp64(), psutil.cpu_percent()]

        sample_desc = dict(cols=["time", "cpu_percent"],
                           data=[sample])

        return sample_desc
Example #10
    def test_ntp_compatability(self):
        unix_day = NTP4Time(datetime.datetime(1970, 1, 1))
        self.assertEquals(unix_day.era, 0)
        self.assertEquals(unix_day.seconds, 2208988800)

        utc_day = NTP4Time(datetime.datetime(1972, 1, 1))
        self.assertEquals(utc_day.era, 0)
        self.assertEquals(utc_day.seconds, 2272060800)

        millen_day = NTP4Time(datetime.datetime(2000, 1, 1))
        self.assertEquals(millen_day.era, 0)
        self.assertEquals(millen_day.seconds, 3155673600)

        ntp_era1 = NTP4Time(datetime.datetime(2036, 2, 8))
        self.assertEquals(ntp_era1.era, 1)
        self.assertEquals(ntp_era1.seconds, 63104)
        self.assertEquals(ntp_era1.to_unix(), 2086041600.)
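
These expected values follow directly from the NTP epoch: seconds are counted from 1900-01-01 UTC in a 32-bit field, the Unix epoch sits 2208988800 s later, and the field wraps into era 1 on 2036-02-07. A worked check with plain datetime arithmetic:

    import datetime

    NTP_EPOCH = datetime.datetime(1900, 1, 1)
    UNIX_OFFSET = 2208988800  # seconds from 1900-01-01 to 1970-01-01

    total = int((datetime.datetime(2036, 2, 8) - NTP_EPOCH).total_seconds())
    era, seconds = divmod(total, 2**32)  # the 32-bit seconds field wraps once per era
    assert (era, seconds) == (1, 63104)
    assert total - UNIX_OFFSET == 2086041600  # matches to_unix() above
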
Example #11
 def _extract_row(self, pkt, cols):
     row = []
     for c in cols:
         for ch in pkt['channels']:
             if ch['chan'] == c:
                 row.append(tuple(ch['data']))
                 break
     orbtime = pkt['channels'][0]['time']
     row.append(NTP4Time(orbtime).to_ntp64())
     return row
Example #12
    def acquire_samples(self, max_samples=0):
        ts = time.time()
        sample = [NTP4Time.utcnow().to_ntp64(),
                  20 * math.sin(10 * ts) + 5,
                  10 * math.sin(15 * ts) + 10,
                  random.random()*100]

        sample_desc = dict(cols=["time", "wave1", "wave2", "random1"],
                           data=[sample])

        return sample_desc
Example #13
    def acquire_samples(self, max_samples=0):
        log.debug('CDIP_DataAgentPlugin.acquire_samples')

        # Read from the server and extract the last sample.
        data = requests.get(self.streaming_args.url)
        m = None
        for m in re.finditer(pattern, data.text, flags=re.MULTILINE):
            pass
        if not m:
            log.warning('CDIP_DataAgentPlugin.acquire_samples: No data found.')
            return None

        year = int(m.group(1))
        month = int(m.group(2))
        day = int(m.group(3))
        hour = int(m.group(4))
        minute = int(m.group(5))
        Hs = float(m.group(6))
        Tp = float(m.group(7))
        Dp = int(m.group(8))
        Ta = float(m.group(9))
        Temp = float(m.group(10))

        # Create sample.
        # [ntp64_ts, Hs, Tp, Dp, Ta, Temp]
        # ['\xdb\x07\x00,\x00\x00\x00\x00', 2.66, 9.09, 328, 6.67, 12.2]
        dt = datetime.datetime(year, month, day, hour, minute)
        ts = NTP4Time(dt).to_ntp64()
        sample = [ts, Hs, Tp, Dp, Ta, Temp]

        # Compare to last reading.
        if self.last_sample == sample:
            log.debug('CDIP_DataAgentPlugin.acquire_samples: No new data.')
            return None

        # Update, pack and return.
        log.debug('CDIP_DataAgentPlugin.acquire_samples: Got new data.')
        log.debug('CDIP data: %s' % str(sample))
        self.last_sample = sample
        sample_desc = dict(cols=["time", "Hs", "Tp", "Dp", "Ta", "Temp"],
                           data=[sample])
        return sample_desc
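
The module-level pattern is not shown in this snippet; whatever its exact form, it must expose ten capture groups in the order consumed above (year, month, day, hour, minute, Hs, Tp, Dp, Ta, Temp). A purely hypothetical sketch of such a pattern for a whitespace-separated CDIP text row:

    import re

    # Hypothetical layout: YYYY MM DD HH mm Hs Tp Dp Ta Temp
    pattern = (r"^(\d{4})\s+(\d{2})\s+(\d{2})\s+(\d{2})\s+(\d{2})\s+"
               r"([\d.]+)\s+([\d.]+)\s+(\d+)\s+([\d.]+)\s+([\d.]+)\s*$")

    m = re.match(pattern, "2016 04 15 12 30 2.66 9.09 328 6.67 12.2")
    assert m and len(m.groups()) == 10
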
Example #14
    def acquire_samples(self, max_samples=0):
        if len(self.samples) <= self.sample_index:
            log.warn("Out of samples at index %s", self.sample_index)
            self.sample_index += 1
            return None

        data_row = self.samples[self.sample_index]
        self.sample_index += 1

        sample = [
            NTP4Time(data_row["time"]).to_ntp64(),
            tuple(data_row["sample_vector"])
        ]

        sample_desc = dict(cols=["time", "sample_vector"],
                           coltypes=dict(sample_vector="10i2"),
                           data=[sample])

        print sample_desc

        return sample_desc
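
The coltypes entry gives sample_vector a sub-array dtype: "10i2" declares ten 16-bit integers per row, which is how the tuple fits a single structured-array field when the builder from Example #8 assembles the packet. A quick check of that dtype:

    import numpy as np

    dt = np.dtype([("time", "i8"), ("sample_vector", "10i2")])
    row = np.zeros(1, dtype=dt)
    row[0] = (0, tuple(range(10)))
    assert row["sample_vector"].shape == (1, 10)  # one row, ten i2 values
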
Example #15
 def build_packet_from_samples(cls, samples, **kwargs):
     num_samples = len(samples["data"])
     dtype_parts = []
     for coldef in samples["cols"]:
         if coldef == "time":
             dtype_parts.append((coldef, "i8"))
         else:
             dtype_parts.append((coldef, "f8"))
     dt = np.dtype(dtype_parts)
     data_array = np.zeros(num_samples, dtype=dt)
     for row_num, data_row in enumerate(samples["data"]):
         row_tuple = tuple(NTP4Time.np_from_string(dv) if isinstance(dv, basestring) else dv for dv in data_row)
         data_array[row_num] = np.array(row_tuple, dtype=dt)
     data = samples.copy()
     data["data"] = data_array
     new_packet = DataPacket(ts_created=get_ion_ts(), data=data)
     for attr in new_packet.__dict__.keys():
         if attr in ('data', 'ts_created'):
             continue
         if attr in kwargs:
             setattr(new_packet, attr, kwargs[attr])
     return new_packet
Example #16
    def _get_data_packet(self, index, num_rows=1):
        """ Return a data packet with number of samples.
        The index indicates the offset from the starting timestamp, 10 sec per sample."""
        base_ts = 1000000000
        index_ts = base_ts + 10 * index

        # Core samples as provided by agent.acquire_samples
        sample_list = []
        for i in xrange(num_rows):
            ts = index_ts + i * 10
            sample = [
                NTP4Time(ts).to_ntp64(),
                float(index + i),
                random.random() * 100
            ]

            sample_list.append(sample)

        sample_desc = dict(cols=["time", "var1", "random1"], data=sample_list)

        packet = DataPacketBuilder.build_packet_from_samples(
            sample_desc, resource_id="ds_id", stream_name="basic_streams")

        return packet
Example #17
    def extend_dataset(self, packet):
        """
        Adds values from a data packet to the dataset and updates indexes and metadata
        """
        ingest_ts = NTP4Time.utcnow()
        num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
        ds_filename = self._get_ds_filename()
        data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
        file_closed = False
        try:
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                # Get index values from time var
                if self.time_var not in packet.data["cols"]:
                    raise BadRequest("Packet has no time")
                var_ds = data_file["vars/%s" % self.time_var]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                var_ds.attrs["cur_row"] += num_rows

                # Fill variables with values from packet or NaN
                for var_name in self.var_defs_map.keys():
                    var_ds = data_file["vars/%s" % var_name]
                    if cur_idx + num_rows > cur_size:
                        self._resize_dataset(var_ds, num_rows)
                    if var_name in packet.data["cols"]:
                        data_slice = packet.data["data"][:][var_name]
                        var_ds[cur_idx:cur_idx + num_rows] = data_slice
                    else:
                        # Leave the initial fill value (zeros)
                        # var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                        pass

                extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
                if extra_vars:
                    log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                var_ds = data_file["vars/%s" % DS_VARIABLES]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                ds_var_names = [var_info["name"] for var_info in self.var_defs]
                pvi = {col_name: col_idx for col_idx, col_name in enumerate(packet.data["cols"]) if col_name in ds_var_names}
                for row_idx in xrange(num_rows):
                    row_data = packet.data["data"][row_idx]
                    row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                    var_ds[cur_idx + row_idx] = row_vals
                var_ds.attrs["cur_row"] += num_rows

            # Update time_ingest (ts, begin row, count)
            ds_tingest = data_file[DS_TIMEINGEST_PATH]
            if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
                self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
            ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
            ds_tingest.attrs["cur_row"] += 1

            # Update time index
            self._update_time_index(data_file, num_rows, cur_idx=cur_idx)

            # Check if pruning is necessary
            if self.prune_trigger_mode == "on_ingest" and self.prune_mode:
                file_closed = self._prune_dataset(data_file)

            # HDF5Tools.dump_hdf5(data_file, with_data=True)
        except Exception:
            log.exception("Error extending dataset %s HDF5 file" % self.dataset_id)
            raise
        finally:
            if not file_closed:
                data_file.close()
Example #18
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            log.info("Row date interval: %s : %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            if end_row-start_row > max_rows:
                log.info("Truncating %s rows to %s max rows (from the end)", end_row-start_row, max_rows)

            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                ds_time = data_file["vars/%s" % self.time_var]
                cur_idx = ds_time.attrs["cur_row"]
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored", var_name)
                        continue
                    ds_var = data_file[ds_path]
                    data_array = ds_var[max(start_row, end_row-max_rows, 0):end_row]
                    if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                        if time_format == "unix_millis":
                            data_array = [int(1000*NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                        else:
                            data_array = data_array.tolist()
                    else:
                        data_array = data_array.tolist()
                    if var_name == self.time_var:
                        time_slice = data_array

                    res_data[var_name] = data_array

                # At this point we have dict with variable to data array mapping with target (unix) timestamps
                self._expand_packed_rows(res_data, data_filter)

                if data_filter.get("transpose_time", False) is True:
                    time_series = res_data.pop(self.time_var)
                    for var_name, var_series in res_data.iteritems():
                        res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

                # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                raise NotImplementedError()

            return res_data

        finally:
            data_file.close()
Example #19
    def extend_dataset(self, packet):
        """
        Adds values from a data packet to the dataset and updates indexes and metadata
        """
        ingest_ts = NTP4Time.utcnow()
        num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
        ds_filename = self._get_ds_filename()
        data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
        file_closed = False
        try:
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                # Get index values from time var
                if self.time_var not in packet.data["cols"]:
                    raise BadRequest("Packet has no time")
                var_ds = data_file["vars/%s" % self.time_var]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                var_ds.attrs["cur_row"] += num_rows

                # Fill variables with values from packet or NaN
                for var_name in self.var_defs_map.keys():
                    var_ds = data_file["vars/%s" % var_name]
                    if cur_idx + num_rows > cur_size:
                        self._resize_dataset(var_ds, num_rows)
                    if var_name in packet.data["cols"]:
                        data_slice = packet.data["data"][:][var_name]
                        var_ds[cur_idx : cur_idx + num_rows] = data_slice
                    else:
                        # Leave the initial fill value (zeros)
                        # var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                        pass

                extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
                if extra_vars:
                    log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                var_ds = data_file["vars/%s" % DS_VARIABLES]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                ds_var_names = [var_info["name"] for var_info in self.var_defs]
                pvi = {
                    col_name: col_idx
                    for col_idx, col_name in enumerate(packet.data["cols"])
                    if col_name in ds_var_names
                }
                for row_idx in xrange(num_rows):
                    row_data = packet.data["data"][row_idx]
                    row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                    var_ds[cur_idx + row_idx] = row_vals
                var_ds.attrs["cur_row"] += num_rows

            # Update time_ingest (ts, begin row, count)
            ds_tingest = data_file[DS_TIMEINGEST_PATH]
            if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
                self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
            ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
            ds_tingest.attrs["cur_row"] += 1

            # Update time index
            self._update_time_index(data_file, num_rows, cur_idx=cur_idx)

            # Check if pruning is necessary
            if self.prune_trigger_mode == "on_ingest" and self.prune_mode:
                file_closed = self._prune_dataset(data_file)

            # HDF5Tools.dump_hdf5(data_file, with_data=True)
        except Exception:
            log.exception("Error extending dataset %s HDF5 file" % self.dataset_id)
            raise
        finally:
            if not file_closed:
                data_file.close()
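
The append pattern here is preallocate-and-track: datasets are created larger than needed, cur_row marks the logical end, and _resize_dataset grows the file only when an append would overflow. A minimal sketch of the same pattern with a resizable h5py dataset (file and names illustrative):

    import h5py
    import numpy as np

    with h5py.File("/tmp/append_demo.hdf5", "w") as f:
        ds = f.create_dataset("vars/var1", shape=(4,), maxshape=(None,), dtype="f8")
        ds.attrs["cur_row"] = 0

        new_rows = np.array([1.0, 2.0, 3.0])
        cur = ds.attrs["cur_row"]
        if cur + len(new_rows) > len(ds):
            ds.resize((cur + len(new_rows),))  # stands in for _resize_dataset
        ds[cur:cur + len(new_rows)] = new_rows
        ds.attrs["cur_row"] = cur + len(new_rows)
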
Example #20
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            log.info("Get data for row interval %s to %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            if end_row - start_row > max_rows:
                if should_decimate:
                    log.info("Decimating %s rows to satisfy %s max rows", end_row - start_row, max_rows_org)
                else:
                    log.info(
                        "Truncating %s rows to %s max rows, %s unexpanded", end_row - start_row, max_rows_org, max_rows
                    )

            if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
                raise NotImplementedError()

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                start_row_act = start_row if should_decimate else max(start_row, end_row - max_rows, 0)
                data_array = ds_var[start_row_act:end_row]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            # At this point res_data is dict mapping varname to data array. Time values are target (unix) timestamps
            self._expand_packed_rows(res_data, data_filter)

            self._decimate_rows(res_data, data_filter)

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

            return res_data

        finally:
            data_file.close()
Example #21
    def test_unix_time_fidelity(self):
        ts = time.time()
        it1 = NTP4Time(ts)

        ts_2 = it1.to_unix()
        self.assertTrue(np.abs(ts - ts_2) <= 1e-3)
Example #22
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            log.info("Get data for row interval %s to %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            if end_row - start_row > max_rows:
                if should_decimate:
                    log.info("Decimating %s rows to satisfy %s max rows", end_row - start_row, max_rows_org)
                else:
                    log.info("Truncating %s rows to %s max rows, %s unexpanded", end_row - start_row, max_rows_org, max_rows)

            if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
                raise NotImplementedError()

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                start_row_act = start_row if should_decimate else max(start_row, end_row - max_rows, 0)
                data_array = ds_var[start_row_act:end_row]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            # At this point res_data is dict mapping varname to data array. Time values are target (unix) timestamps
            self._expand_packed_rows(res_data, data_filter)

            self._decimate_rows(res_data, data_filter)

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

            return res_data

        finally:
            data_file.close()
Example #23
    def get_data_info(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_info = {}
            max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]

            res_info["ds_rows"] = cur_idx
            res_info["ds_size"] = len(ds_time)
            res_info["file_size"] = os.path.getsize(ds_filename)
            res_info["file_name"] = ds_filename
            res_info["vars"] = list(data_file["vars"])

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            res_info["need_expand"] = self.expand_info.get("need_expand", False)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            res_info["should_decimate"] = should_decimate
            res_info["need_decimate"] = bool(should_decimate and end_row - start_row > max_rows)

            res_info["ts_first"] = NTP4Time.from_ntp64(ds_time.value[0].tostring()).to_unix()
            res_info["ts_last"] = NTP4Time.from_ntp64(ds_time.value[cur_idx - 1].tostring()).to_unix()
            res_info["ts_first_str"] = get_datetime_str(res_info["ts_first"] * 1000, local_time=False)
            res_info["ts_last_str"] = get_datetime_str(res_info["ts_last"] * 1000, local_time=False)

            res_info["ds_samples"] = cur_idx * self.expand_info["num_steps"] if res_info["need_expand"] else cur_idx

            res_info["filter_start_row"] = start_row
            res_info["filter_end_row"] = end_row
            res_info["filter_max_rows"] = max_rows
            res_info["filter_ts_first"] = NTP4Time.from_ntp64(ds_time.value[start_row].tostring()).to_unix()
            res_info["filter_ts_last"] = NTP4Time.from_ntp64(ds_time.value[end_row - 1].tostring()).to_unix()
            res_info["filter_ts_first_str"] = get_datetime_str(res_info["filter_ts_first"] * 1000, local_time=False)
            res_info["filter_ts_last_str"] = get_datetime_str(res_info["filter_ts_last"] * 1000, local_time=False)

            return res_info

        finally:
            data_file.close()
Example #24
    def extend_dataset(self, packet):
        """
        Adds values from a data packet to the dataset and updates indexes and metadata
        """
        ingest_ts = NTP4Time.utcnow()
        num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
        ds_filename = self._get_ds_filename()
        data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
        try:
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                # Get index values from time var
                if self.time_var not in packet.data["cols"]:
                    raise BadRequest("Packet has no time")
                var_ds = data_file["vars/%s" % self.time_var]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                var_ds.attrs["cur_row"] += num_rows

                # Fill variables with values from packet or NaN
                for var_name in self.var_defs_map.keys():
                    var_ds = data_file["vars/%s" % var_name]
                    if cur_idx + num_rows > cur_size:
                        self._resize_dataset(var_ds, num_rows)
                    if var_name in packet.data["cols"]:
                        data_slice = packet.data["data"][:][var_name]
                        var_ds[cur_idx:cur_idx+num_rows] = data_slice
                    else:
                        # Leave the initial fill value (zeros)
                        #var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                        pass

                extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
                if extra_vars:
                    log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                var_ds = data_file["vars/%s" % DS_VARIABLES]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                ds_var_names = [var_info["name"] for var_info in self.var_defs]
                pvi = {col_name: col_idx for col_idx, col_name in enumerate(packet.data["cols"]) if col_name in ds_var_names}
                for row_idx in xrange(num_rows):
                    row_data = packet.data["data"][row_idx]
                    row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                    var_ds[cur_idx+row_idx] = row_vals
                var_ds.attrs["cur_row"] += num_rows

            # Update time_ingest (ts, begin row, count)
            ds_tingest = data_file[DS_TIMEINGEST_PATH]
            if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
                self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
            ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
            ds_tingest.attrs["cur_row"] += 1

            # Update time_idx (every nth row's time)
            new_idx_row = (cur_idx + num_rows + self.time_idx_step - 1) / self.time_idx_step
            old_idx_row = (cur_idx + self.time_idx_step - 1) / self.time_idx_step
            num_tidx_rows = new_idx_row - old_idx_row
            time_ds = data_file["vars/%s" % (self.time_var if self.ds_layout == DS_LAYOUT_INDIVIDUAL else DS_VARIABLES)]
            time_idx_rows = [time_ds[idx_row*self.time_idx_step] for idx_row in xrange(old_idx_row, new_idx_row)]
            if time_idx_rows:
                ds_tidx = data_file[DS_TIMEIDX_PATH]
                tidx_cur_row = ds_tidx.attrs["cur_row"]
                if tidx_cur_row + num_tidx_rows > len(ds_tidx):
                    self._resize_dataset(ds_tidx, num_tidx_rows, INTERNAL_ROW_INCREMENT)
                ds_tidx[tidx_cur_row:tidx_cur_row+num_tidx_rows] = time_idx_rows
                ds_tidx.attrs["cur_row"] += num_tidx_rows

            #HDF5Tools.dump_hdf5(data_file, with_data=True)
        finally:
            data_file.close()
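
The time-index bookkeeping above relies on Python 2 integer division: (n + step - 1) / step is ceiling division, i.e. the number of index rows needed to cover n data rows. Under Python 3 the same expression would need //. A worked check:

    def ceil_div(n, step):
        return (n + step - 1) // step  # // makes the intent explicit in Python 2 and 3

    # With time_idx_step = 10: 10 data rows need 1 index row, 11 rows need 2
    assert ceil_div(10, 10) == 1
    assert ceil_div(11, 10) == 2
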
Example #25
 def test_time_ntp_fidelity(self):
     it1 = NTP4Time()
     ntp_ts = it1.to_ntp64()
     it2 = NTP4Time.from_ntp64(ntp_ts)
     self.assertEquals(it1.seconds, it2.seconds)
     self.assertTrue(np.abs(it1.useconds - it2.useconds) <= 1)
Example #26
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows = data_filter.get("max_rows", 999999999)
            time_slice = None
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                time_ds = data_file["vars/%s" % self.time_var]
                cur_idx = time_ds.attrs["cur_row"]
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored", var_name)
                        continue
                    var_ds = data_file[ds_path]
                    data_array = var_ds[max(cur_idx-max_rows, 0):cur_idx]
                    if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                        if time_format == "unix_millis":
                            data_array = [int(1000*NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                        else:
                            data_array = data_array.tolist()
                    else:
                        data_array = data_array.tolist()
                    if var_name == self.time_var:
                        time_slice = data_array

                    res_data[var_name] = data_array

                if data_filter.get("transpose_time", False) is True:
                    time_series = res_data.pop(self.time_var)
                    for var_name, var_series in res_data.iteritems():
                        res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

                # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                raise NotImplementedError()

            start_time = data_filter.get("start_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            if time_slice and res_data and start_time:
                start_time = int(start_time)
                time_idx = len(time_slice)
                for idx, tv in enumerate(time_slice):
                    if tv == start_time and start_time_include:
                        time_idx = idx
                        break
                    elif tv > start_time:
                        time_idx = idx
                        break
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = var_series[time_idx:]

            return res_data

        finally:
            data_file.close()
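
Since time_slice is sorted ascending, the linear scan for start_time above could equivalently use binary search, with start_time_include mapping onto the bisect_left/bisect_right distinction. A hedged sketch (start_index is an illustrative name):

    from bisect import bisect_left, bisect_right

    def start_index(time_slice, start_time, include_start=True):
        # First index with tv >= start_time (inclusive) or tv > start_time (exclusive)
        cut = bisect_left if include_start else bisect_right
        return cut(time_slice, start_time)

    assert start_index([10, 20, 30], 20) == 1
    assert start_index([10, 20, 30], 20, include_start=False) == 2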