def _load_signal(signal, test_split):
    """Load the train and test partitions of a signal.

    Args:
        signal (str): Name or path of the signal to load.
        test_split (float or bool): If a float, split the signal using it
            as the test proportion. If otherwise truthy, load the
            pre-split ``-train`` and ``-test`` signals. If falsy, use the
            whole signal as both train and test.

    Returns:
        tuple: ``(train, test)`` partitions.
    """
    if isinstance(test_split, float):
        # load_signal performs the split itself and returns both parts
        return load_signal(signal, test_size=test_split)

    if test_split:
        # pre-split signals are published under suffixed names
        return load_signal(signal + '-train'), load_signal(signal + '-test')

    data = load_signal(signal)
    return data, data
def _add_dataset(explorer, args):
    """Register a dataset in the explorer from parsed CLI arguments.

    Missing fields are inferred: the signal name defaults to the dataset
    name, and missing start/stop timestamps are read from the signal data
    itself. Exits with status 1 if the signal cannot be loaded.

    Args:
        explorer: Explorer instance exposing ``add_dataset``.
        args: Parsed argparse namespace with the dataset attributes.
    """
    if args.signal is None:
        args.signal = args.name

    if not (args.start and args.stop):
        path_or_name = args.location or args.name
        try:
            data = load_signal(path_or_name, None, args.timestamp_column, args.value_column)
        except HTTPError:
            print('Unknown signal: {}'.format(path_or_name))
            sys.exit(1)
        else:
            # fall back to the observed time range for any missing bound
            timestamps = data['timestamp']
            args.start = args.start or timestamps.min()
            args.stop = args.stop or timestamps.max()

    explorer.add_dataset(
        args.name,
        args.signal,
        args.satellite,
        args.start,
        args.stop,
        args.location,
        args.timestamp_column,
        args.value_column,
        args.user,
    )
def test_load_signal_test_size(isfile_mock, load_csv_mock):
    """load_signal with ``test_size`` returns a (train, test) split."""
    # setup
    isfile_mock.return_value = True
    load_csv_mock.return_value = pd.DataFrame({
        'timestamp': list(range(10)),
        'value': list(range(10, 20))
    })

    # run
    returned = load_signal('a/path/to/a.csv', test_size=0.33)

    # assert
    assert isinstance(returned, tuple)
    assert len(returned) == 2

    train, test = returned

    expected_train = pd.DataFrame({
        'timestamp': list(range(7)),
        'value': [float(value) for value in range(10, 17)]
    })
    pd.testing.assert_frame_equal(train, expected_train)

    # the test partition keeps the original (non-reset) index
    expected_test = pd.DataFrame({
        'timestamp': list(range(7, 10)),
        'value': [float(value) for value in range(17, 20)]
    }, index=range(7, 10))
    pd.testing.assert_frame_equal(test, expected_test)
def add_signal(self, name, dataset, data_location=None, start_time=None,
               stop_time=None, timestamp_column=None, value_column=None):
    """Add a new Signal object to the database.

    The signal needs to be given a name and be associated to a Dataset.

    Args:
        name (str):
            Name of the Signal.
        dataset (Dataset or ObjectID or str):
            Dataset object which the created Signal belongs to or the
            corresponding ObjectId.
        data_location (str):
            Path to the CSV containing the Signal data. If the signal is
            one of the signals provided by Orion, this can be omitted and
            the signal will be loaded based on the signal name.
        start_time (int):
            Optional. Minimum timestamp to use for this signal. If not
            provided this defaults to the minimum timestamp found in the
            signal data.
        stop_time (int):
            Optional. Maximum timestamp to use for this signal. If not
            provided this defaults to the maximum timestamp found in the
            signal data.
        timestamp_column (int):
            Optional. Index of the timestamp column.
        value_column (int):
            Optional. Index of the value column.

    Raises:
        NotUniqueError:
            If a Signal with the same name already exists for this Dataset.

    Returns:
        Signal
    """
    data_location = data_location or name
    data = load_signal(data_location, None, timestamp_column, value_column)
    timestamps = data['timestamp']

    # Compare against None (not truthiness) so an explicit 0 timestamp,
    # which is a valid epoch value, is not silently replaced.
    if start_time is None:
        start_time = timestamps.min()
    if stop_time is None:
        stop_time = timestamps.max()

    dataset = self.get_dataset(dataset)
    return schema.Signal.insert(name=name, dataset=dataset, start_time=start_time,
                                stop_time=stop_time, data_location=data_location,
                                timestamp_column=timestamp_column,
                                value_column=value_column, created_by=self.user)
def _evaluate_on_signal(pipeline, signal, metrics):
    """Run a pipeline over one signal and score the detected anomalies.

    Args:
        pipeline: Pipeline to analyze the signal with.
        signal (str): Name of the signal to load.
        metrics (dict): Mapping of metric name to scorer callable taking
            ``(truth, anomalies, data)``.

    Returns:
        dict: Metric name to computed score.
    """
    data = load_signal(signal)
    anomalies = analyze(pipeline, data)
    truth = load_anomalies(signal)

    scores = {}
    for name, scorer in metrics.items():
        scores[name] = scorer(truth, anomalies, data)

    return scores
def load(self):
    """Load this object's signal data, clipped to its configured time range.

    Returns:
        pandas.DataFrame: Signal data with ``timestamp`` restricted to
        ``[start_time, stop_time]`` when those bounds are set.
    """
    data = load_signal(self.data_location, None, self.timestamp_column, self.value_column)

    # Check against None (not truthiness) so a bound of 0 — a valid
    # epoch timestamp — is still applied instead of being skipped.
    if self.start_time is not None:
        data = data[data['timestamp'] >= self.start_time].copy()

    if self.stop_time is not None:
        data = data[data['timestamp'] <= self.stop_time].copy()

    return data
def load_dataset(self, dataset):
    """Load the data of a Dataset, clipped to its configured time range.

    Args:
        dataset: Dataset object with ``data_location``, ``name``,
            ``timestamp_column``, ``value_column`` and optional
            ``start_time``/``stop_time`` attributes.

    Returns:
        pandas.DataFrame: Signal data with ``timestamp`` restricted to
        ``[start_time, stop_time]`` when those bounds are set.
    """
    path_or_name = dataset.data_location or dataset.name
    LOGGER.info("Loading dataset %s", path_or_name)

    data = load_signal(path_or_name, None, dataset.timestamp_column, dataset.value_column)

    # Check against None (not truthiness) so a bound of 0 — a valid
    # epoch timestamp — is still applied instead of being skipped.
    if dataset.start_time is not None:
        data = data[data['timestamp'] >= dataset.start_time].copy()

    if dataset.stop_time is not None:
        data = data[data['timestamp'] <= dataset.stop_time].copy()

    return data
def load_signal(self, signal):
    """Load the data of a Signal, clipped to its configured time range.

    Args:
        signal: Signal object with ``data_location``, ``name``,
            ``timestamp_column``, ``value_column`` and optional
            ``start_time``/``stop_time`` attributes.

    Returns:
        pandas.DataFrame: Signal data with ``timestamp`` restricted to
        ``[start_time, stop_time]`` when those bounds are set.
    """
    path_or_name = signal.data_location or signal.name
    # Fixed copy-pasted message: this method loads a signal, not a dataset.
    LOGGER.info("Loading signal %s", path_or_name)

    data = load_signal(path_or_name, None, signal.timestamp_column, signal.value_column)

    # Check against None (not truthiness) so a bound of 0 — a valid
    # epoch timestamp — is still applied instead of being skipped.
    if signal.start_time is not None:
        data = data[data['timestamp'] >= signal.start_time].copy()

    if signal.stop_time is not None:
        data = data[data['timestamp'] <= signal.stop_time].copy()

    return data
def test_load_signal_filename(isfile_mock, load_csv_mock):
    """When given an existing file path, load_signal delegates to load_csv."""
    # setup
    isfile_mock.return_value = True

    # run
    returned = load_signal('a/path/to/a.csv')

    # assert
    load_csv_mock.assert_called_once_with('a/path/to/a.csv', None, None)
    assert returned == load_csv_mock.return_value
def _evaluate_on_signal(pipeline, signal, metrics, holdout=True):
    """Fit a pipeline on a signal, score it on the test split and time it.

    Args:
        pipeline: Pipeline to analyze the signal with.
        signal (str): Base name of the signal.
        metrics (dict): Mapping of metric name to scorer callable taking
            ``(truth, anomalies, test)``.
        holdout (bool): If True, train on the ``-train`` split; otherwise
            train on the full signal. Testing always uses ``-test``.

    Returns:
        dict: Metric name to score, plus ``elapsed`` seconds.
    """
    train_name = signal + '-train' if holdout else signal
    train = load_signal(train_name)
    test = load_signal(signal + '-test')

    # time the analysis itself, not the data loading
    start = datetime.utcnow()
    anomalies = analyze(pipeline, train, test)
    elapsed = datetime.utcnow() - start

    truth = load_anomalies(signal)

    scores = {
        name: scorer(truth, anomalies, test)
        for name, scorer in metrics.items()
    }
    scores['elapsed'] = elapsed.total_seconds()

    return scores
def test_load_signal_nasa_signal_name(isfile_mock, load_csv_mock, lns_mock):
    """When the path is not a file, load_signal falls back to the NASA loader."""
    # setup
    isfile_mock.return_value = False

    # run
    returned = load_signal('S-1')

    # assert
    lns_mock.assert_called_once_with('S-1')
    load_csv_mock.assert_not_called()
    assert returned == lns_mock.return_value
def test_load_signal_filename(isfile_mock, load_csv_mock):
    """When given an existing file path, load_signal delegates to load_csv."""
    # setup
    frame = pd.DataFrame({
        'timestamp': list(range(10)),
        'value': [float(value) for value in range(10, 20)]
    })
    load_csv_mock.return_value = frame
    isfile_mock.return_value = True

    # run
    returned = load_signal('a/path/to/a.csv')

    # assert
    load_csv_mock.assert_called_once_with('a/path/to/a.csv', None, None)
    pd.testing.assert_frame_equal(returned, frame)
def test_load_signal_nasa_signal_name(isfile_mock, load_csv_mock, lns_mock):
    """When the path is not a file, load_signal falls back to the NASA loader."""
    # setup
    frame = pd.DataFrame({
        'timestamp': list(range(10)),
        'value': [float(value) for value in range(10, 20)]
    })
    lns_mock.return_value = frame
    isfile_mock.return_value = False

    # run
    returned = load_signal('S-1')

    # assert
    lns_mock.assert_called_once_with('S-1')
    load_csv_mock.assert_not_called()
    pd.testing.assert_frame_equal(returned, frame)
def add_dataset(self, name, signal_set, satellite_id=None, start_time=None,
                stop_time=None, location=None, timestamp_column=None,
                value_column=None, user_id=None):
    """Find or insert a Dataset record for the given signal.

    Args:
        name (str): Name of the Dataset; also used as the data location
            when ``location`` is not given.
        signal_set (str): Name of the signal set this Dataset belongs to.
        satellite_id (str): Optional. Identifier of the satellite.
        start_time (int): Optional. Minimum timestamp to use. Defaults to
            the minimum timestamp found in the signal data.
        stop_time (int): Optional. Maximum timestamp to use. Defaults to
            the maximum timestamp found in the signal data.
        location (str): Optional. Path to the CSV with the signal data.
        timestamp_column (int): Optional. Index of the timestamp column.
        value_column (int): Optional. Index of the value column.
        user_id (str): Optional. Identifier of the creating user.

    Returns:
        Dataset: The existing or newly inserted Dataset.
    """
    location = location or name
    data = load_signal(location, None, timestamp_column, value_column)
    timestamps = data['timestamp']

    # Compare against None (not truthiness) so an explicit 0 timestamp,
    # which is a valid epoch value, is not silently replaced.
    if start_time is None:
        start_time = timestamps.min()
    if stop_time is None:
        stop_time = timestamps.max()

    return model.Dataset.find_or_insert(
        name=name,
        signal_set=signal_set,
        satellite_id=satellite_id,
        start_time=start_time,
        stop_time=stop_time,
        data_location=location,
        timestamp_column=timestamp_column,
        value_column=value_column,
        created_by=user_id
    )