def test_downsample_with_default_validators_raises_error_with_multiple_messages(
):
    """Expect one ``ValueError`` whose message lists every failed validation."""
    # Craft an input that breaks three checks at the same time.
    broken = np.random.standard_normal((4, 3))  # three columns, not two
    broken[:, 0] = [1, 2, 2, 3]  # repeated x value: not strictly increasing
    broken[2, 1] = np.nan  # missing y value

    expected_message = ("data does not have 2 columns; "
                        "data contains NaN values; "
                        "first column is not strictly increasing")

    with pytest.raises(ValueError) as exc:
        lttb.downsample(broken, 3)

    assert exc.match(expected_message)
def test_downsampling():
    """Downsampling the CSV fixture to 100 points yields a (100, 2) array."""
    table = np.genfromtxt('tests/timeseries.csv', delimiter=',', names=True)

    # Pack the named 'X' and 'Y' columns into an (n, 2) array for lttb.
    points = np.array([table['X'], table['Y']]).T

    reduced = lttb.downsample(points, 100)
    assert reduced.shape == (100, 2)
def _format_timestamps(timestamps):
    """Render epoch timestamps as ``'%y-%m-%d %H:%M:%S'`` strings."""
    return [
        datetime.datetime.fromtimestamp(ts).strftime('%y-%m-%d %H:%M:%S')
        for ts in timestamps
    ]


def load_data(graph, data_slice=None):
    """Build one scatter-trace dict per series named in ``graph``.

    Reads the module-level ``DATA`` and ``FILENAME``. Column 0 of ``DATA``
    is passed to ``datetime.fromtimestamp`` and used as the x axis; the y
    column of each series is located by a case-insensitive lookup of its
    name in ``labels``, shifted by ``LABEL_OFFSET``.

    Args:
        graph: iterable of series names to plot.
        data_slice: optional ``(lower, upper)`` pair compared against
            column 0 of ``DATA``; ``None`` selects every row.

    Returns:
        ``(traces, filename)`` where ``traces`` is a list of dicts with the
        keys ``x``, ``y``, ``type``, ``mode`` and ``name``.

    Raises:
        ValueError: if a name in ``graph`` is not found in ``labels``.
        IndexError: if no row falls inside ``data_slice``.
    """
    app_logger.info('building graphs')
    app_logger.info('load_data DATA.shape %r %r' % (DATA.shape, DATA.length))
    history = DATA
    filename = FILENAME

    if data_slice is None:
        data_slice = range(DATA.length)
    else:
        start = np.where(history[:, 0] > data_slice[0])[0][0]
        stop = np.where(history[:, 0] < data_slice[1])[0][-1]
        # NOTE(review): range() excludes ``stop``, so the last row whose
        # timestamp is below the upper bound is dropped -- confirm intended.
        data_slice = range(start, stop)

    x = _format_timestamps(history[data_slice, 0])
    # Only series with more than 1000 points are downsampled.
    use_lttb = len(x) > 1000

    # Loop-invariant: lower-case the labels once, not once per series.
    lower_labels = [label.lower() for label in labels]

    newdata = []
    for name in graph:
        # LABEL_OFFSET could be 2 if there is a human readable date column.
        idx = lower_labels.index(name.lower()) + LABEL_OFFSET

        if use_lttb:
            # Lazy %-style arguments need a placeholder in the message;
            # without one the logging module fails while formatting.
            app_logger.debug('data_slice len %d', len(data_slice))
            hdata = np.column_stack((history[data_slice, 0],
                                     history[data_slice, idx]))
            app_logger.debug('hdata.shape %r', hdata.shape)
            downsize = lttb.downsample(hdata, n_out=1000)
            assert downsize.shape == (1000, 2)
            # Each downsampled series keeps its own subset of timestamps.
            x = _format_timestamps(downsize[:, 0])
            y = list(downsize[:, 1])
        else:
            y = list(history[data_slice, idx])

        newdata.append({
            'x': x,
            'y': y,
            'type': 'scatter',
            'mode': 'markers+lines',
            'name': '%s' % name
        })

    app_logger.info('done building graphs')
    return newdata, filename
def process_series(self):
    """Return ``self.draw_set``, downsampled when it exceeds ``N_MAX`` rows.

    The row count of the first series decides for the whole set. When it is
    at most ``N_MAX`` the original ``draw_set`` object is returned unchanged;
    otherwise every series is reduced to ``N_MAX`` points with lttb and
    rebuilt as a ``pd.Series`` carrying the original name.
    """
    if self.draw_set[0].shape[0] <= N_MAX:
        return self.draw_set

    def _shrink(series):
        # lttb works on an (n, 2) array of (index, value) pairs.
        pairs = np.array([series.index, series]).T
        kept = lttb.downsample(pairs, N_MAX)
        return pd.Series(kept[:, 1], index=kept[:, 0], name=series.name)

    return [_shrink(series) for series in self.draw_set]
def create_per_profile_df(root_dir: str, organisms: List[str]):
    """Assemble a downsampled precision-recall table, one chunk per profile.

    Profiles whose ``cm.P`` does not exceed ``config.min_hmmer_hits`` are
    skipped. For the rest, duplicate recall values are collapsed, the curve
    is reduced to at most ``config.downsample`` points with lttb, and the
    e-value belonging to each kept point is looked up again by recall.

    Returns:
        A DataFrame sorted by profile, clan, recall and precision, with a
        fresh integer index.
    """
    per_profile = _prof_confusion_matrices(root_dir, organisms, False)
    clans = pfam.Clans()

    frames = []
    for profile, cm in progress(per_profile.items()):
        pr = cm.pr_curve
        if cm.P <= config.min_hmmer_hits:
            continue

        scores = cm.sample_scores
        assert len(pr.recall) == len(scores)

        # Keep the first occurrence of every distinct recall value.
        recall, first_seen = np.unique(pr.recall, return_index=True)
        precision = pr.precision[first_seen]
        scores = scores[first_seen]

        curve = np.stack((recall, precision), axis=1)
        points = lttb.downsample(curve, n_out=min(config.downsample,
                                                  len(recall)))

        # Map each surviving recall back to its row to recover the e-value.
        kept_scores = scores[np.searchsorted(recall, points[:, 0])]

        frame = pd.DataFrame()
        frame[config.label.recall] = points[:, 0]
        frame[config.label.precision] = points[:, 1]
        frame["profile"] = profile
        frame["clan"] = _clan_name(clans.get(profile))
        frame[config.label.auc] = pr.auc
        frame[config.label.hmmer_hits] = cm.P
        frame["e-value"] = kept_scores
        frames.append(frame)

    sort_keys = [
        "profile",
        "clan",
        config.label.recall,
        config.label.precision,
    ]
    return pd.concat(frames).sort_values(sort_keys).reset_index(drop=True)
def process_zoom(self, xlim):
    """Slice every series to the ``xlim`` window, downsampling if needed.

    With a truthy ``self.timestamp`` both bounds are resolved through
    ``get_nearest_index``; otherwise they are treated as positions and
    clamped to ``[0, len(self.draw_set[0])]``. Windows of at most ``N_MAX``
    rows are returned as plain slices; larger ones are reduced to ``N_MAX``
    points per series with lttb.
    """
    if self.timestamp:
        lo = get_nearest_index(xlim[0], self.timestamp)
        hi = get_nearest_index(xlim[1], self.timestamp)
    else:
        lo = max(int(xlim[0]), 0)
        hi = min(int(xlim[1]) + 1, len(self.draw_set[0]))

    window = [series.iloc[lo:hi] for series in self.draw_set]
    if hi - lo <= N_MAX:
        return window

    def _shrink(series):
        # lttb works on an (n, 2) array of (index, value) pairs.
        pairs = np.array([series.index, series]).T
        kept = lttb.downsample(pairs, N_MAX)
        return pd.Series(kept[:, 1], index=kept[:, 0], name=series.name)

    return [_shrink(series) for series in window]
def create_per_organism_df(root_dir: str, organisms: List[str],
                           downsample: int):
    """Assemble a downsampled precision-recall table, one chunk per organism.

    For every organism, duplicate recall values are collapsed, points with
    recall at or below 0.005 are dropped (unless that would drop all of
    them), and the curve is reduced with lttb. The e-value belonging to
    each kept point is looked up again by recall.

    Args:
        root_dir: passed through to ``_fetch_organism``.
        organisms: organism identifiers to process.
        downsample: maximum number of points kept per organism. Curves with
            fewer points are kept at their own length.

    Returns:
        The per-organism frames concatenated with a fresh integer index.
    """
    dfs = []
    for organism in progress(organisms):
        result, cm = _fetch_organism(root_dir, organism)
        pr = cm.pr_curve
        evalues = cm.sample_scores
        assert len(pr.recall) == len(evalues)

        # Keep the first occurrence of every distinct recall value.
        x, idx = np.unique(pr.recall, return_index=True)
        y = pr.precision[idx]
        evalues = evalues[idx]

        # Skip the leading points with recall <= 0.005; if no point lies
        # above that threshold, keep everything.
        above = np.where(x > 0.005)[0]
        first = above[0] if len(above) else 0
        x = x[first:]
        y = y[first:]
        evalues = evalues[first:]

        # Clamp so that short curves do not make lttb reject ``n_out``;
        # this matches what create_per_profile_df does.
        n_out = min(downsample, len(x))
        matrix = lttb.downsample(np.stack((x, y), axis=1), n_out=n_out)

        # Map each surviving recall back to its row to recover the e-value.
        idx = np.searchsorted(x, matrix[:, 0])
        evalues = evalues[idx]

        df = pd.DataFrame()
        df[config.label.recall] = matrix[:, 0]
        df[config.label.precision] = matrix[:, 1]
        df["organism"] = organism
        df[config.label.auc] = pr.auc
        df[config.label.hmmer_hits] = cm.P
        df["domain"] = result.accession.domain
        df["e-value"] = evalues
        dfs.append(df)

    return pd.concat(dfs).reset_index(drop=True)
def lttb_ops(x: pd.DataFrame):
    """Split ``x`` into one single-column DataFrame per column.

    ``n_out`` is not a parameter; it is read from the enclosing scope.
    Frames with at most ``n_out`` rows are only split. Larger frames are
    additionally reduced to ``n_out`` rows per column with lttb, using the
    (datetime) index converted to int64 as the x axis.

    Returns:
        A list of DataFrames in column order, each holding one column.
    """
    if x.shape[0] <= n_out:
        # Few enough rows already: just split into multiple dataframes.
        return [pd.DataFrame(index=x.index, data=x[col]) for col in x.columns]

    # lttb needs numeric x values, so take the index as int64.
    stamps = x.index.astype("int64").values

    ret = []
    for col in x.columns:
        data = np.array((stamps, x[col])).T
        output = lttb.downsample(data, n_out)

        # ``data`` may be float64, which cannot hold nanosecond timestamps
        # exactly. Locate each kept row in the input and take its timestamp
        # from the exact int64 values instead of converting the float back.
        # Assumes lttb returns a subset of the input rows.
        rows = np.searchsorted(data[:, 0], output[:, 0])
        df = pd.DataFrame(index=stamps[rows].astype('datetime64[ns]'),
                          data={col: output[:, 1]})
        ret.append(df)
    return ret
def test_invalid_n_out_raises_error(n_out):
    """``downsample`` must raise ``ValueError`` for the supplied ``n_out``."""
    valid_points = gen_valid_data(6)

    # The data itself is valid, so only ``n_out`` can trigger the error.
    with pytest.raises(ValueError):
        lttb.downsample(valid_points, n_out)
def test_downsampling_random_data_retains_variation(data, n_out):
    """Downsampled y values keep at least 95% of the input's variance."""
    # Discard generated cases that ask for more points than are available.
    assume(n_out <= len(data))

    reduced = lttb.downsample(data, n_out)

    original_variance = np.var(data[:, 1])
    reduced_variance = np.var(reduced[:, 1])
    assert reduced_variance >= 0.95 * original_variance
def test_downsampled_random_data_is_correct_shape(data, n_out):
    """The downsampled array has exactly ``n_out`` rows and two columns."""
    # Discard generated cases that ask for more points than are available.
    assume(n_out <= len(data))

    reduced = lttb.downsample(data, n_out)
    expected_shape = (n_out, 2)
    assert reduced.shape == expected_shape
def test_downsampling_test_data_retains_variation(n_out):
    """Downsampling the bundled test data keeps the y variance above 29.5."""
    reduced = lttb.downsample(load_test_data(), n_out)

    reduced_variance = np.var(reduced[:, 1])
    assert reduced_variance >= 29.5  # var(data) == 30.9968
def test_downsampled_test_data_is_correct_shape(n_out):
    """Downsampling the bundled test data yields an ``(n_out, 2)`` array."""
    reduced = lttb.downsample(load_test_data(), n_out)

    expected_shape = (n_out, 2)
    assert reduced.shape == expected_shape