Code example #1
File: hdf5_dataset.py Project: scionrep/scioncc
 def gte_time(data_val, cmp_val, allow_equal=True):
     # Support NTP4 timestamps and Unix millis (i8); time_type comes from the enclosing scope
     if time_type == "ntp_time":
         if allow_equal:
             return NTP4Time.from_ntp64(data_val.tostring()).to_unix() >= cmp_val
         else:
             return NTP4Time.from_ntp64(data_val.tostring()).to_unix() > cmp_val
     else:
         if allow_equal:
             return data_val >= cmp_val
         else:
             return data_val > cmp_val
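
For reference, a minimal sketch of how a comparator like gte_time could be used to locate the first row at or after a cutoff. NTP4Time comes from the snippets on this page; find_start_row is an illustrative helper, not the project's actual _get_row_interval, and a real implementation would likely bisect rather than scan linearly.

    def find_start_row(ds_time, cur_idx, cutoff_unix, time_type="ntp_time"):
        def gte_time(data_val, cmp_val, allow_equal=True):
            # Support NTP4 timestamps and Unix millis (i8)
            if time_type == "ntp_time":
                ts = NTP4Time.from_ntp64(data_val.tostring()).to_unix()
                return ts >= cmp_val if allow_equal else ts > cmp_val
            return data_val >= cmp_val if allow_equal else data_val > cmp_val

        for idx in xrange(cur_idx):  # Python 2, matching the source snippets
            if gte_time(ds_time[idx], cutoff_unix):
                return idx
        return cur_idx  # cutoff lies past the newest row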
Code example #2
File: hdf5_dataset.py Project: scionrep/scioncc
    def get_data_info(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_info = {}
            max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]

            res_info["ds_rows"] = cur_idx
            res_info["ds_size"] = len(ds_time)
            res_info["file_size"] = os.path.getsize(ds_filename)
            res_info["file_name"] = ds_filename
            res_info["vars"] = list(data_file["vars"])

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            res_info["need_expand"] = self.expand_info.get("need_expand", False)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            res_info["should_decimate"] = should_decimate
            res_info["need_decimate"] = bool(should_decimate and end_row - start_row > max_rows)

            res_info["ts_first"] = NTP4Time.from_ntp64(ds_time.value[0].tostring()).to_unix()
            res_info["ts_last"] = NTP4Time.from_ntp64(ds_time.value[cur_idx - 1].tostring()).to_unix()
            res_info["ts_first_str"] = get_datetime_str(res_info["ts_first"] * 1000, local_time=False)
            res_info["ts_last_str"] = get_datetime_str(res_info["ts_last"] * 1000, local_time=False)

            res_info["ds_samples"] = cur_idx * self.expand_info["num_steps"] if res_info["need_expand"] else cur_idx

            res_info["filter_start_row"] = start_row
            res_info["filter_end_row"] = end_row
            res_info["filter_max_rows"] = max_rows
            res_info["filter_ts_first"] = NTP4Time.from_ntp64(ds_time.value[start_row].tostring()).to_unix()
            res_info["filter_ts_last"] = NTP4Time.from_ntp64(ds_time.value[end_row - 1].tostring()).to_unix()
            res_info["filter_ts_first_str"] = get_datetime_str(res_info["filter_ts_first"] * 1000, local_time=False)
            res_info["filter_ts_last_str"] = get_datetime_str(res_info["filter_ts_last"] * 1000, local_time=False)

            return res_info

        finally:
            data_file.close()
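
A hypothetical call against get_data_info, assuming a dataset object exposing the method above; the timestamps are illustrative unix-millisecond values, the unit this code's time filters operate in:

    info = dataset.get_data_info(data_filter=dict(
        start_time=1420070400000,   # unix millis (assumed example value)
        end_time=1420074000000,
        max_rows=1000,
        decimate=True))
    print info["ds_rows"], info["filter_start_row"], info["filter_end_row"]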
Code example #3
File: hdf5_dataset.py Project: scionrep/scioncc
    def _prune_dataset(self, data_file):
        if not self.prune_mode:
            return
        if self.prune_mode == "max_age_rel":
            # Prunes if first timestamp older than trigger compared to most recent timestamp
            trigger_age = float(self.pruning_attrs.get("trigger_age", 0))
            retain_age = float(self.pruning_attrs.get("retain_age", 0))
            if trigger_age <= 0.0 or retain_age <= 0.0 or trigger_age < retain_age:
                raise BadRequest("Bad pruning trigger_age or retain_age")
            var_ds = data_file["vars/%s" % self.time_var]
            cur_idx = var_ds.attrs["cur_row"]
            if not len(var_ds) or not cur_idx:
                return
            min_ts = NTP4Time.from_ntp64(var_ds[0].tostring()).to_unix()
            max_ts = NTP4Time.from_ntp64(var_ds[cur_idx - 1].tostring()).to_unix()
            if min_ts + trigger_age >= max_ts:
                return

            # Find the first row at or after (max_ts - retain_age) and drop everything before it
            start_time = (max_ts - retain_age) * 1000
            log.info("PRUNING dataset now: mode=%s, start_time=%s", self.prune_mode, int(start_time))
            copy_filename = self._get_data_copy(data_file, data_filter=dict(start_time=start_time))
        elif self.prune_mode == "max_age_abs":
            # Prunes if first timestamp older than trigger compared to current timestamp
            raise NotImplementedError()
        elif self.prune_mode == "max_rows":
            raise NotImplementedError()
        elif self.prune_mode == "max_size":
            raise NotImplementedError()
        else:
            raise BadRequest("Invalid prune_mode: %s" % self.prune_mode)

        if not copy_filename:
            return

        # Do the replace of data file with the copy.
        # Make sure to heed race conditions so that waiting processes won't lock the file first
        ds_filename = self._get_ds_filename()
        ds_filename_bak = ds_filename + ".bak"
        if os.path.exists(ds_filename_bak):
            os.remove(ds_filename_bak)

        data_file.close()  # Note: Inter-process race condition possible because close removes the lock
        shutil.move(ds_filename, ds_filename_bak)
        shutil.move(copy_filename, ds_filename)  # Note: There may be a cross-device-link error here
        # os.remove(ds_filename_bak)
        log.info("Pruning successful. Replaced dataset with pruned file.")
        return True
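
For "max_age_rel" pruning, the method reads trigger_age and retain_age from self.pruning_attrs. A sketch of a plausible configuration follows; the values are assumptions, and the code itself only validates trigger_age >= retain_age > 0, in seconds, the unit of the unix timestamps it compares against:

    pruning_attrs = dict(
        trigger_age=30 * 86400.0,  # prune once the data spans more than 30 days
        retain_age=7 * 86400.0)    # then keep only the most recent 7 days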
Code example #4
File: hdf5_dataset.py Project: edwardhunter/scioncc
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows = data_filter.get("max_rows", 999999999)
            time_slice = None
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                time_ds = data_file["vars/%s" % self.time_var]
                cur_idx = time_ds.attrs["cur_row"]
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored", var_name)
                        continue
                    var_ds = data_file[ds_path]
                    data_array = var_ds[max(cur_idx-max_rows, 0):cur_idx]
                    if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                        if time_format == "unix_millis":
                            data_array = [int(1000*NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                        else:
                            data_array = data_array.tolist()
                    else:
                        data_array = data_array.tolist()
                    if var_name == self.time_var:
                        time_slice = data_array

                    res_data[var_name] = data_array

                if data_filter.get("transpose_time", False) is True:
                    time_series = res_data.pop(self.time_var)
                    for var_name, var_series in res_data.iteritems():
                        res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

                # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                raise NotImplementedError()

            start_time = data_filter.get("start_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            if time_slice and res_data and start_time:
                start_time = int(start_time)
                time_idx = len(time_slice)
                for idx, tv in enumerate(time_slice):
                    if tv == start_time and start_time_include:
                        time_idx = idx
                        break
                    elif tv > start_time:
                        time_idx = idx
                        break
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = var_series[time_idx:]

            return res_data

        finally:
            data_file.close()
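
To make the transpose_time branch concrete, here is a small runnable sketch of the reshaping it performs (the variable names and values are illustrative):

    time_series = [1000, 2000]                  # unix millis
    res_data = {"temp": [20.1, 20.3]}
    for var_name, var_series in res_data.iteritems():   # Python 2, as in the source
        res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]
    # res_data == {"temp": [(1000, 20.1), (2000, 20.3)]}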
Code example #5
File: hdf5_dataset.py Project: scionrep/scioncc
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            log.info("Get data for row interval %s to %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            if end_row - start_row > max_rows:
                if should_decimate:
                    log.info("Decimating %s rows to satisfy %s max rows", end_row - start_row, max_rows_org)
                else:
                    log.info(
                        "Truncating %s rows to %s max rows, %s unexpanded", end_row - start_row, max_rows_org, max_rows
                    )

            if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
                raise NotImplementedError()

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                start_row_act = start_row if should_decimate else max(start_row, end_row - max_rows, 0)
                data_array = ds_var[start_row_act:end_row]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            # At this point res_data is dict mapping varname to data array. Time values are target (unix) timestamps
            self._expand_packed_rows(res_data, data_filter)

            self._decimate_rows(res_data, data_filter)

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

            return res_data

        finally:
            data_file.close()
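
A hypothetical call exercising this newer get_data's row-interval and decimation filters; the dataset object, variable names, and timestamps are illustrative:

    data = dataset.get_data(data_filter=dict(
        variables=["time", "temp"],   # must exist under vars/ in the HDF5 file
        start_time=1420070400000,     # unix millis, inclusive by default
        end_time=1420074000000,
        max_rows=500,
        decimate=True,                # decimate rather than truncate from the end
        time_format="unix_millis"))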
Code example #6
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            log.info("Row date interval: %s : %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            if end_row-start_row > max_rows:
                log.info("Truncating %s rows to %s max rows (from the end)", end_row-start_row, max_rows)

            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                ds_time = data_file["vars/%s" % self.time_var]
                cur_idx = ds_time.attrs["cur_row"]
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored", var_name)
                        continue
                    ds_var = data_file[ds_path]
                    data_array = ds_var[max(start_row, end_row-max_rows, 0):end_row]
                    if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                        if time_format == "unix_millis":
                            data_array = [int(1000*NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                        else:
                            data_array = data_array.tolist()
                    else:
                        data_array = data_array.tolist()
                    if var_name == self.time_var:
                        time_slice = data_array

                    res_data[var_name] = data_array

                # At this point we have dict with variable to data array mapping with target (unix) timestamps
                self._expand_packed_rows(res_data, data_filter)

                if data_filter.get("transpose_time", False) is True:
                    time_series = res_data.pop(self.time_var)
                    for var_name, var_series in res_data.iteritems():
                        res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

                # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                raise NotImplementedError()

            return res_data

        finally:
            data_file.close()
Code example #7
 def test_time_ntp_fidelity(self):
     # Round-trip an NTP4Time value through its packed 64-bit NTP representation
     it1 = NTP4Time()
     ntp_ts = it1.to_ntp64()
     it2 = NTP4Time.from_ntp64(ntp_ts)
     self.assertEquals(it1.seconds, it2.seconds)
     self.assertTrue(np.abs(it1.useconds - it2.useconds) <= 1)  # np: numpy
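
The same round-trip outside a test class, as a minimal sketch using only the NTP4Time calls that appear in the snippets above (to_ntp64, from_ntp64, and the seconds/useconds fields):

    import numpy as np

    t1 = NTP4Time()               # capture "now"
    packed = t1.to_ntp64()        # packed 64-bit NTP v4 timestamp
    t2 = NTP4Time.from_ntp64(packed)
    assert t1.seconds == t2.seconds                  # whole seconds survive exactly
    assert np.abs(t1.useconds - t2.useconds) <= 1    # fraction may lose ~1 microsecond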