def _prune_dataset(self, data_file): if not self.prune_mode: return if self.prune_mode == "max_age_rel": # Prunes if first timestamp older than trigger compared to most recent timestamp trigger_age = float(self.pruning_attrs.get("trigger_age", 0)) retain_age = float(self.pruning_attrs.get("retain_age", 0)) if trigger_age <= 0.0 or retain_age <= 0.0 or trigger_age < retain_age: raise BadRequest("Bad pruning trigger_age or retain_age") var_ds = data_file["vars/%s" % self.time_var] cur_idx = var_ds.attrs["cur_row"] if not len(var_ds) or not cur_idx: return min_ts = NTP4Time.from_ntp64(var_ds[0].tostring()).to_unix() max_ts = NTP4Time.from_ntp64(var_ds[cur_idx - 1].tostring()).to_unix() if min_ts + trigger_age >= max_ts: return # Find the first index that is lower or equal to retain_age and delete gap start_time = (max_ts - retain_age) * 1000 log.info("PRUNING dataset now: mode=%s, start_time=%s", self.prune_mode, int(start_time)) copy_filename = self._get_data_copy( data_file, data_filter=dict(start_time=start_time)) elif self.prune_mode == "max_age_abs": # Prunes if first timestamp older than trigger compared to current timestamp raise NotImplementedError() elif self.prune_mode == "max_rows": raise NotImplementedError() elif self.prune_mode == "max_size": raise NotImplementedError() else: raise BadRequest("Invalid prune_mode: %s" % self.prune_mode) if not copy_filename: return # Do the replace of data file with the copy. # Make sure to heed race conditions so that waiting processes won't lock the file first ds_filename = self._get_ds_filename() ds_filename_bak = ds_filename + ".bak" if os.path.exists(ds_filename_bak): os.remove(ds_filename_bak) data_file.close( ) # Note: Inter-process race condition possible because close removes the lock shutil.move(ds_filename, ds_filename_bak) shutil.move( copy_filename, ds_filename) # Note: There may be a cross-device-link error here # os.remove(ds_filename_bak) log.info("Pruning successful. Replaced dataset with pruned file.") return True
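# Illustrative sketch (not part of the class): the "max_age_rel" arithmetic above,
# shown standalone with hypothetical values. Pruning triggers only once the span
# between oldest and newest timestamp exceeds trigger_age; rows older than
# (newest - retain_age) are then dropped, which is why retain_age <= trigger_age
# must hold. Timestamps here are unix seconds, the cutoff is unix millis, matching
# the start_time computed above.
def _max_age_rel_cutoff(min_ts, max_ts, trigger_age, retain_age):
    """Return the pruning cutoff in millis, or None if pruning is not triggered."""
    if min_ts + trigger_age >= max_ts:
        return None                       # dataset span still below the trigger
    return (max_ts - retain_age) * 1000   # keep only rows newer than this


assert _max_age_rel_cutoff(0, 500, trigger_age=600, retain_age=300) is None
assert _max_age_rel_cutoff(0, 1000, trigger_age=600, retain_age=300) == 700000
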
def gte_time(data_val, cmp_val, allow_equal=True):
    # Support NTP4 timestamps and Unix millis (i8)
    if time_type == "ntp_time":
        if allow_equal:
            return NTP4Time.from_ntp64(data_val.tostring()).to_unix() >= cmp_val
        else:
            return NTP4Time.from_ntp64(data_val.tostring()).to_unix() > cmp_val
    else:
        if allow_equal:
            return data_val >= cmp_val
        else:
            return data_val > cmp_val

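# Hedged usage sketch: gte_time closes over time_type from its enclosing scope
# (assumed here to resolve as a module global for illustration). For the plain
# i8 branch it reduces to a simple comparison; the values are hypothetical
# Unix-millis timestamps.
time_type = "unix_millis"
assert gte_time(1500000000000, 1500000000000) is True                      # equal, allow_equal defaults True
assert gte_time(1500000000000, 1500000000000, allow_equal=False) is False  # strict comparison
assert gte_time(1500000000001, 1500000000000, allow_equal=False) is True
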
def get_data_info(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_info = {}
        max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True
        ds_time = data_file["vars/%s" % self.time_var]
        cur_idx = ds_time.attrs["cur_row"]
        res_info["ds_rows"] = cur_idx
        res_info["ds_size"] = len(ds_time)
        res_info["file_size"] = os.path.getsize(ds_filename)
        res_info["file_name"] = ds_filename
        res_info["vars"] = list(data_file["vars"])
        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        res_info["need_expand"] = self.expand_info.get("need_expand", False)
        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
        res_info["should_decimate"] = should_decimate
        res_info["need_decimate"] = bool(should_decimate and end_row - start_row > max_rows)
        res_info["ts_first"] = NTP4Time.from_ntp64(ds_time.value[0].tostring()).to_unix()
        res_info["ts_last"] = NTP4Time.from_ntp64(ds_time.value[cur_idx - 1].tostring()).to_unix()
        res_info["ts_first_str"] = get_datetime_str(res_info["ts_first"] * 1000, local_time=False)
        res_info["ts_last_str"] = get_datetime_str(res_info["ts_last"] * 1000, local_time=False)
        res_info["ds_samples"] = cur_idx * self.expand_info["num_steps"] if res_info["need_expand"] else cur_idx
        res_info["filter_start_row"] = start_row
        res_info["filter_end_row"] = end_row
        res_info["filter_max_rows"] = max_rows
        res_info["filter_ts_first"] = NTP4Time.from_ntp64(ds_time.value[start_row].tostring()).to_unix()
        res_info["filter_ts_last"] = NTP4Time.from_ntp64(ds_time.value[end_row - 1].tostring()).to_unix()
        res_info["filter_ts_first_str"] = get_datetime_str(res_info["filter_ts_first"] * 1000, local_time=False)
        res_info["filter_ts_last_str"] = get_datetime_str(res_info["filter_ts_last"] * 1000, local_time=False)
        return res_info
    finally:
        data_file.close()

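# Hedged usage sketch: dataset_handler is a hypothetical instance of this class and
# the filter values are illustrative. get_data_info() reads only metadata plus the
# first/last timestamps, so it is cheap compared to get_data().
info = dataset_handler.get_data_info(data_filter=dict(
    start_time=1500000000000,  # unix millis, inclusive by default
    max_rows=1000,
    decimate=True,
))
if info:
    print("%s rows, filter rows %s..%s, range %s .. %s" % (
        info["ds_rows"], info["filter_start_row"], info["filter_end_row"],
        info["ts_first_str"], info["ts_last_str"]))
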
def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True
        time_slice = None

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        log.info("Get data for row interval %s to %s", start_row, end_row)

        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
        if end_row - start_row > max_rows:
            if should_decimate:
                log.info("Decimating %s rows to satisfy %s max rows", end_row - start_row, max_rows_org)
            else:
                log.info("Truncating %s rows to %s max rows, %s unexpanded", end_row - start_row, max_rows_org, max_rows)

        if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
            raise NotImplementedError()

        ds_time = data_file["vars/%s" % self.time_var]
        cur_idx = ds_time.attrs["cur_row"]
        for var_name in read_vars:
            ds_path = "vars/%s" % var_name
            if ds_path not in data_file:
                log.warn("Variable '%s' not in dataset - ignored", var_name)
                continue
            ds_var = data_file[ds_path]
            start_row_act = start_row if should_decimate else max(start_row, end_row - max_rows, 0)
            data_array = ds_var[start_row_act:end_row]
            if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                if time_format == "unix_millis":
                    data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                else:
                    data_array = data_array.tolist()
            else:
                data_array = data_array.tolist()
            if var_name == self.time_var:
                time_slice = data_array
            res_data[var_name] = data_array

        # At this point res_data is dict mapping varname to data array. Time values are target (unix) timestamps
        self._expand_packed_rows(res_data, data_filter)
        self._decimate_rows(res_data, data_filter)

        if data_filter.get("transpose_time", False) is True:
            time_series = res_data.pop(self.time_var)
            for var_name, var_series in res_data.iteritems():
                res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

        return res_data
    finally:
        data_file.close()

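# Hedged usage sketch (hypothetical handler and variable names): fetch a time window,
# with timestamps returned as unix millis and transpose_time pairing each value with
# its timestamp. Note the time variable must be among the requested variables for
# transpose_time to have a time series to pop.
data = dataset_handler.get_data(data_filter=dict(
    variables=["time", "temp"],  # "time"/"temp" are placeholder variable names
    start_time=1500000000000,    # unix millis
    end_time=1500003600000,
    max_rows=500,
    decimate=True,               # decimate instead of truncating when over max_rows
    transpose_time=True,         # yields [(time, value), ...] per variable
))
for ts, value in data.get("temp", []):
    pass  # e.g. plot or aggregate; ts is unix millis
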
def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows = data_filter.get("max_rows", 999999999)
        time_slice = None
        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            time_ds = data_file["vars/%s" % self.time_var]
            cur_idx = time_ds.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                var_ds = data_file[ds_path]
                data_array = var_ds[max(cur_idx - max_rows, 0):cur_idx]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array
                res_data[var_name] = data_array

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]
            # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array
        elif self.ds_layout == DS_LAYOUT_COMBINED:
            raise NotImplementedError()

        start_time = data_filter.get("start_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        if time_slice and res_data and start_time:
            start_time = int(start_time)
            time_idx = len(time_slice)
            for idx, tv in enumerate(time_slice):
                if tv == start_time and start_time_include:
                    time_idx = idx
                    break
                elif tv > start_time:
                    time_idx = idx
                    break
            for var_name, var_series in res_data.iteritems():
                res_data[var_name] = var_series[time_idx:]

        return res_data
    finally:
        data_file.close()

def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True
        time_slice = None
        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        log.info("Row date interval: %s : %s", start_row, end_row)
        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
        if end_row - start_row > max_rows:
            log.info("Truncating %s rows to %s max rows (from the end)", end_row - start_row, max_rows)
        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                data_array = ds_var[max(start_row, end_row - max_rows, 0):end_row]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array
                res_data[var_name] = data_array

            # At this point we have dict with variable to data array mapping with target (unix) timestamps
            self._expand_packed_rows(res_data, data_filter)

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]
            # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array
        elif self.ds_layout == DS_LAYOUT_COMBINED:
            raise NotImplementedError()

        return res_data
    finally:
        data_file.close()

def test_time_ntp_fidelity(self):
    it1 = NTP4Time()
    ntp_ts = it1.to_ntp64()
    it2 = NTP4Time.from_ntp64(ntp_ts)
    self.assertEquals(it1.seconds, it2.seconds)
    self.assertTrue(np.abs(it1.useconds - it2.useconds) <= 1)
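
# Why the 1-microsecond tolerance above: a hedged sketch of the round-trip, assuming
# the standard NTPv4 short format where the sub-second part is a 32-bit fraction in
# units of 2**-32 s. Converting microseconds -> fraction -> microseconds can then
# lose up to one microsecond to integer truncation, which is what the test allows.
def _usec_roundtrip(usec):
    frac = int(usec * (2 ** 32) / 1e6)  # microseconds to NTP fraction
    return int(frac * 1e6 / (2 ** 32))  # NTP fraction back to microseconds


assert all(abs(_usec_roundtrip(u) - u) <= 1 for u in (0, 1, 123456, 999999))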