def test_time_series_default_values():
    """Check the time/error defaults used when only measurements are given.

    Three inputs are exercised: one channel, a rectangular multi-channel
    array, and a ragged list of channels of different lengths.
    """
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel: defaults are flat arrays.
    single = TimeSeries(None, m[0], None)
    n_obs = m.shape[1]
    expected_time = np.linspace(0., time_series.DEFAULT_MAX_TIME, n_obs)
    expected_error = np.repeat(time_series.DEFAULT_ERROR_VALUE, n_obs)
    npt.assert_allclose(single.time, expected_time)
    npt.assert_allclose(single.error, expected_error)
    assert single.n_channels == 1

    # Rectangular multi-channel input: the first channel carries the same
    # defaults.
    stacked = TimeSeries(None, m, None)
    npt.assert_allclose(stacked.time[0], expected_time)
    npt.assert_allclose(stacked.error[0], expected_error)
    assert stacked.n_channels == n_channels

    # Ragged input: channel i keeps only its first i + 2 samples.
    def _truncate(channels):
        return [chan[0:i + 2] for i, chan in enumerate(channels)]

    t = _truncate(t)
    m = _truncate(m)
    e = _truncate(e)
    ragged = TimeSeries(None, m, None)
    for i in range(n_channels):
        n_obs_i = len(m[i])
        npt.assert_allclose(
            ragged.time[i],
            np.linspace(0., time_series.DEFAULT_MAX_TIME, n_obs_i))
        npt.assert_allclose(
            ragged.error[i],
            np.repeat(time_series.DEFAULT_ERROR_VALUE, n_obs_i))
    assert ragged.n_channels == n_channels
def test_transform_ts_files():
    """`transform_ts_files` with a train/test split must return two outputs."""
    transform_type = "Train/Test Split"
    # (target, count) pairs, built in this order: 4 class1, then 8 class2.
    class_sizes = (('class1', 4), ('class2', 8))
    time_series = [
        TimeSeries(*sample_values(), target=target)
        for target, count in class_sizes
        for _ in range(count)
    ]

    output = transformation.transform_ts_files(time_series, transform_type)

    npt.assert_equal(len(output), 2)
def test_train_test_split_ratios():
    """An explicit 50/50 `train_test_split` must halve the data set.

    The input is 4 'class1' and 8 'class2' series; each of the two returned
    subsets is expected to hold half of the 12 series.
    """
    n_class1 = 4
    n_class2 = 8
    time_series = [TimeSeries(*sample_values(), target='class1')
                   for _ in range(n_class1)]
    time_series += [TimeSeries(*sample_values(), target='class2')
                    for _ in range(n_class2)]

    outputs = transformation.train_test_split(
        time_series, test_size=0.5, train_size=0.5)

    # Integer division keeps the expected size an int, like the len() it is
    # compared against.
    expected_size = len(time_series) // 2
    npt.assert_equal(len(outputs[1]), expected_size)
    npt.assert_equal(len(outputs[0]), expected_size)
def sample_ts_files(size, targets=[None]):
    """Yield paths of `size` temporary netCDF time-series files.

    Each file holds a `TimeSeries` built from `sample_values()` and written
    with `to_netcdf`. Targets are taken from `targets` in order and reused
    cyclically until `size` files exist.

    Parameters
    ----------
    size : int
        Number of files to create.
    targets : list
        Targets to assign to the series. The default list is only read,
        never mutated.

    Yields
    ------
    list of str
        Paths of the created ``.nc`` files inside a fresh temporary
        directory.

    Notes
    -----
    The temporary directory is removed in a ``finally`` clause, so it is
    cleaned up even when file creation fails or the consumer raises while
    the generator is suspended at the ``yield``.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        paths = []
        for target in islice(cycle(targets), size):
            t, m, e = sample_values()
            name = str(uuid.uuid4())
            path = pjoin(temp_dir, '{}.nc'.format(name))
            ts = TimeSeries(t, m, e, target=target, path=path, name=name)
            ts.to_netcdf(path)
            paths.append(path)
        yield paths
    finally:
        shutil.rmtree(temp_dir)
def sample_ts_files(size, labels=[None]):
    """Yield paths of `size` temporary ``.npz`` time-series files.

    Each file holds a `TimeSeries` built from `sample_values()` and written
    with `save`. Labels are taken from `labels` in order and reused
    cyclically until `size` files exist.

    Parameters
    ----------
    size : int
        Number of files to create.
    labels : list
        Labels to assign to the series. The default list is only read,
        never mutated.

    Yields
    ------
    list of str
        Paths of the created ``.npz`` files inside a fresh temporary
        directory.

    Notes
    -----
    The temporary directory is removed in a ``finally`` clause, so it is
    cleaned up even when file creation fails or the consumer raises while
    the generator is suspended at the ``yield``.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        paths = []
        for label in islice(cycle(labels), size):
            t, m, e = sample_values()
            name = str(uuid.uuid4())
            path = pjoin(temp_dir, '{}.npz'.format(name))
            ts = TimeSeries(t, m, e, label=label, path=path, name=name)
            ts.save(path)
            paths.append(path)
        yield paths
    finally:
        shutil.rmtree(temp_dir)
def test_time_series_init_2d():
    """Multi-channel construction with full and with shared time/error."""
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Every channel supplies its own time and error arrays.
    full = TimeSeries(t, m, e)
    for stored, given in ((full.time, t),
                          (full.measurement, m),
                          (full.error, e)):
        assert stored.shape == given.shape and np.allclose(stored, given)
    assert full.n_channels == n_channels

    # One time/error array is given for all channels: the stored arrays must
    # take the shape of the measurements.
    shared = TimeSeries(t[0], m, e[0])
    assert shared.time.shape == m.shape
    assert np.allclose(shared.time[0], t[0])
    assert shared.measurement.shape == m.shape
    assert np.allclose(shared.measurement, m)
    assert shared.error.shape == m.shape
    assert np.allclose(shared.error[0], e[0])
    assert shared.n_channels == n_channels
def sample_ts_files(size, targets=[None]):
    """Yield paths of `size` temporary netCDF time-series files.

    Each file holds a `TimeSeries` built from `sample_values()` and written
    with `to_netcdf`. Targets are taken from `targets` in order and reused
    cyclically until `size` files exist.

    Parameters
    ----------
    size : int
        Number of files to create.
    targets : list
        Targets to assign to the series. The default list is only read,
        never mutated.

    Yields
    ------
    list of str
        Paths of the created ``.nc`` files inside a fresh temporary
        directory.

    Notes
    -----
    The temporary directory is removed in a ``finally`` clause, so it is
    cleaned up even when file creation fails or the consumer raises while
    the generator is suspended at the ``yield``.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        paths = []
        for target in islice(cycle(targets), size):
            t, m, e = sample_values()
            name = str(uuid.uuid4())
            path = pjoin(temp_dir, '{}.nc'.format(name))
            ts = TimeSeries(t, m, e, target=target, path=path, name=name)
            ts.to_netcdf(path)
            paths.append(path)
        yield paths
    finally:
        shutil.rmtree(temp_dir)
def sample_ts_files(size, labels=[None]):
    """Yield paths of `size` temporary ``.npz`` time-series files.

    Each file holds a `TimeSeries` built from `sample_values()` and written
    with `save`. Labels are taken from `labels` in order and reused
    cyclically until `size` files exist.

    Parameters
    ----------
    size : int
        Number of files to create.
    labels : list
        Labels to assign to the series. The default list is only read,
        never mutated.

    Yields
    ------
    list of str
        Paths of the created ``.npz`` files inside a fresh temporary
        directory.

    Notes
    -----
    The temporary directory is removed in a ``finally`` clause, so it is
    cleaned up even when file creation fails or the consumer raises while
    the generator is suspended at the ``yield``.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        paths = []
        for label in islice(cycle(labels), size):
            t, m, e = sample_values()
            name = str(uuid.uuid4())
            path = pjoin(temp_dir, '{}.npz'.format(name))
            ts = TimeSeries(t, m, e, label=label, path=path, name=name)
            ts.save(path)
            paths.append(path)
        yield paths
    finally:
        shutil.rmtree(temp_dir)
def test_time_series_init_1d():
    """A single-channel `TimeSeries` must expose arrays matching its inputs."""
    t, m, e = sample_time_series(channels=1)

    ts = TimeSeries(t, m, e)

    for stored, given in ((ts.time, t), (ts.measurement, m), (ts.error, e)):
        assert stored.shape == given.shape and np.allclose(stored, given)
    assert ts.n_channels == 1
def test_train_test_split():
    """The train/test transform must put half of each class in each subset."""
    # Unevenly-labeled mock data, built in this order: 4 class1, 8 class2.
    class_sizes = (('class1', 4), ('class2', 8))
    transform_type = "Train/Test Split"
    time_series = [
        TimeSeries(*sample_values(), target=target)
        for target, count in class_sizes
        for _ in range(count)
    ]

    # Seed NumPy's global RNG immediately before the transform call.
    np.random.seed(0)
    train, test = transformation.transform_ts_files(time_series,
                                                    transform_type)

    for target, count in class_sizes:
        for subset in (train, test):
            n_in_subset = sum(ts.target == target for ts in subset)
            npt.assert_equal(n_in_subset, count / 2)
def test_time_series_init_ragged():
    """`TimeSeries` must accept channels of different lengths."""
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Channel i keeps only its first i + 2 samples, so no two lengths match.
    def _truncate(channels):
        return [chan[0:i + 2] for i, chan in enumerate(channels)]

    t = _truncate(t)
    m = _truncate(m)
    e = _truncate(e)

    ts = TimeSeries(t, m, e)

    for stored, given in ((ts.time, t), (ts.measurement, m), (ts.error, e)):
        assert all(np.allclose(stored[i], given[i]) for i in range(len(t)))
    assert ts.n_channels == n_channels
def to_cesium_dataset(X):
    """Convert a tslearn-compatible dataset into a list of cesium TimeSeries.

    Each series of the dataset becomes one cesium ``TimeSeries`` whose
    measurement is the series cut to the length reported by ``ts_size``:
    flattened to 1-D when it has a single feature column, transposed
    otherwise.

    Parameters
    ----------
    X : array-like, shape = (n_ts, sz, d)
        tslearn-formatted dataset to be cast to cesium format. It is passed
        through ``check_dataset`` first.

    Returns
    -------
    list of cesium TimeSeries
        cesium-formatted dataset
        (cf. `link <http://cesium-ml.org/docs/api/cesium.time_series.html#cesium.time_series.TimeSeries>`_)

    Raises
    ------
    ImportError
        If cesium cannot be imported.

    Examples
    --------
    >>> tslearn_arr = numpy.random.randn(3, 16, 1)
    >>> cesium_ds = to_cesium_dataset(tslearn_arr)
    >>> len(cesium_ds)
    3
    >>> cesium_ds[0].measurement.shape
    (16,)
    >>> tslearn_arr = numpy.random.randn(3, 16, 2)
    >>> cesium_ds = to_cesium_dataset(tslearn_arr)
    >>> len(cesium_ds)
    3
    >>> cesium_ds[0].measurement.shape
    (2, 16)
    >>> tslearn_arr = [[1, 2, 3], [1, 2, 3, 4]]
    >>> cesium_ds = to_cesium_dataset(tslearn_arr)
    >>> len(cesium_ds)
    2
    >>> cesium_ds[0].measurement.shape
    (3,)

    Notes
    -----
    Conversion from/to cesium format requires cesium to be installed.
    """  # noqa: E501
    try:
        from cesium.time_series import TimeSeries
    except ImportError:
        raise ImportError("Conversion from/to cesium cannot be performed "
                          "if cesium is not installed.")

    def _to_measurement(series):
        # Cut to the length given by ts_size before reshaping.
        trimmed = series[:ts_size(series)]
        if series.shape[1] != 1:
            return trimmed.transpose()
        return trimmed.reshape((-1, ))

    converted = []
    for series in check_dataset(X):
        converted.append(TimeSeries(m=_to_measurement(series)))
    return converted
def test_time_series_sort():
    """`TimeSeries` must sort each channel by time and reorder m/e to match."""
    # Single channel with its first two time stamps swapped.
    t, m, e = sample_time_series(channels=1)
    t[:2] = t[1::-1]
    ts = TimeSeries(t, m, e)
    order = np.argsort(t)
    npt.assert_allclose(ts.time, np.sort(t))
    npt.assert_allclose(ts.measurement, m[order])
    npt.assert_allclose(ts.error, e[order])

    # Several channels, each with its own unsorted time axis.
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)
    t[:, :2] = t[:, 1::-1]
    ts = TimeSeries(t, m, e)
    for i, m_i in enumerate(m):
        order_i = np.argsort(t[i])
        npt.assert_allclose(ts.time[i], np.sort(t[i]))
        npt.assert_allclose(ts.measurement[i], m_i[order_i])
        npt.assert_allclose(ts.error[i], e[i][order_i])

    # Several channels sharing the first channel's time and error arrays.
    ts = TimeSeries(t[0], m, e[0])
    for i, m_i in enumerate(m):
        shared_order = np.argsort(t[0])
        npt.assert_allclose(ts.time[i], np.sort(t[0]))
        npt.assert_allclose(ts.measurement[i], m_i[shared_order])
        npt.assert_allclose(ts.error[i], e[0][shared_order])
def test_time_series_netCDF():
    """A `TimeSeries` written to netCDF must read back equal to itself."""

    def _assert_roundtrip(ts):
        # Write to the shared test path and compare with what is read back.
        ts.to_netcdf(TEST_TS_PATH)
        assert_ts_equal(ts, time_series.from_netcdf(TEST_TS_PATH))

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel.
    _assert_roundtrip(TimeSeries(t[0], m[0], e[0]))

    # Several channels sharing one time/error array.
    _assert_roundtrip(TimeSeries(t[0], m, e[0]))

    # Ragged channels: channel i keeps its first i + 2 samples.
    t = [chan[0:i + 2] for i, chan in enumerate(t)]
    m = [chan[0:i + 2] for i, chan in enumerate(m)]
    e = [chan[0:i + 2] for i, chan in enumerate(e)]
    _assert_roundtrip(TimeSeries(t, m, e))
def test_channels_iterator():
    """`TimeSeries.channels()` must yield one (t, m, e) triple per channel."""

    def _assert_channels_match(ts, t, m, e):
        # Compare channel i of the iterator with row i of each input.
        for i, (t_i, m_i, e_i) in zip(range(n_channels), ts.channels()):
            npt.assert_allclose(t_i, t[i])
            npt.assert_allclose(m_i, m[i])
            npt.assert_allclose(e_i, e[i])

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel: every yielded triple equals the first input row.
    ts = TimeSeries(t[0], m[0], e[0])
    for t_i, m_i, e_i in ts.channels():
        npt.assert_allclose(t_i, t[0])
        npt.assert_allclose(m_i, m[0])
        npt.assert_allclose(e_i, e[0])

    # Rectangular multi-channel input.
    _assert_channels_match(TimeSeries(t, m, e), t, m, e)

    # Ragged input: channel i keeps its first i + 2 samples.
    t = [chan[0:i + 2] for i, chan in enumerate(t)]
    m = [chan[0:i + 2] for i, chan in enumerate(m)]
    e = [chan[0:i + 2] for i, chan in enumerate(e)]
    _assert_channels_match(TimeSeries(t, m, e), t, m, e)
def test_time_series_npz(tmpdir):
    """A `TimeSeries` saved to ``.npz`` must load back equal to itself.

    Parameters
    ----------
    tmpdir
        Directory in which each case writes its own uniquely named file.
    """

    def _assert_roundtrip(ts):
        # Each case gets a fresh file name so the cases cannot collide.
        ts_path = os.path.join(str(tmpdir), str(uuid4()) + '.npz')
        ts.save(ts_path)
        assert_ts_equal(ts, time_series.load(ts_path))

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel.
    _assert_roundtrip(TimeSeries(t[0], m[0], e[0]))

    # Several channels sharing one time/error array.
    _assert_roundtrip(TimeSeries(t[0], m, e[0]))

    # Ragged channels: channel i keeps its first i + 2 samples.
    t = [chan[0:i + 2] for i, chan in enumerate(t)]
    m = [chan[0:i + 2] for i, chan in enumerate(m)]
    e = [chan[0:i + 2] for i, chan in enumerate(e)]
    _assert_roundtrip(TimeSeries(t, m, e))
def get_ts_obj(object_id, meta_data, ts_data, pbmap):
    """Build the multi-channel `TimeSeries` for one object.

    Parameters
    ----------
    object_id
        Identifier matched against the ``object_id`` column of both tables.
    meta_data
        Table queried for the object's row; its ``target`` column, when
        present, supplies the label.
    ts_data
        Table with ``object_id``, ``passband``, ``mjd``, ``flux`` and
        ``flux_err`` columns.
    pbmap : dict
        Keys are the passband values to select (one channel per key);
        values are used as channel names.

    Returns
    -------
    tuple
        ``(object_id, TimeSeries)``.
    """
    channel_names = list(pbmap.values())

    # `@object_id` in the query strings refers to this function's parameter,
    # so both queries must stay at function level.
    meta_row = meta_data.query('object_id == @object_id')
    # NOTE(review): the label is the `target` column selection of the query
    # result, passed through as-is — confirm that is what TimeSeries expects.
    label = meta_row['target'] if 'target' in meta_data.columns else None

    observations = ts_data.query('object_id == @object_id')
    band_masks = [(observations['passband'] == pb) for pb in pbmap]

    def _split_by_band(column):
        # One array per passband, in pbmap key order.
        return [observations[column][mask].values for mask in band_masks]

    times = _split_by_band('mjd')
    fluxes = _split_by_band('flux')
    flux_errors = _split_by_band('flux_err')

    ts_obj = TimeSeries(t=times, m=fluxes, e=flux_errors, label=label,
                        name=object_id, channel_names=channel_names)
    return (object_id, ts_obj)
def read_lightcurves():
    """Read the training light curves and wrap each object in a TimeSeries.

    Relies on the module-level names ``datadir``, ``metadata``, ``nobjects``,
    ``pbmap`` and ``pbnames``.

    Returns
    -------
    OrderedDict
        Maps each ``object_id`` to its multi-channel cesium ``TimeSeries``,
        in metadata row order.
    """
    # Load the light curve data from the csv file into an Astropy table.
    lcfilename = f'{datadir}/training_set.csv'
    lcdata = Table.read(lcfilename, format='csv')

    # One cesium TimeSeries per object, one channel per passband in pbmap.
    tsdict = OrderedDict()
    for idx in trange(nobjects, desc="Building Timeseries"):
        record = metadata[idx]
        obj_id = record['object_id']
        label = record['target']
        meta = {
            'z': record['hostgal_photoz'],
            'zerr': record['hostgal_photoz_err'],
            'mwebv': record['mwebv'],
        }

        light_curve = lcdata[lcdata['object_id'] == obj_id]
        band_masks = [(light_curve['passband'] == pb) for pb in pbmap]
        times = [light_curve['mjd'][mask].data for mask in band_masks]
        fluxes = [light_curve['flux'][mask].data for mask in band_masks]
        flux_errors = [light_curve['flux_err'][mask].data
                       for mask in band_masks]

        tsdict[obj_id] = TimeSeries(t=times, m=fluxes, e=flux_errors,
                                    label=label, name=obj_id,
                                    meta_features=meta,
                                    channel_names=pbnames)

    # Drop the local reference to the full table before returning.
    del lcdata
    return tsdict
def test_channels_iterator():
    """`TimeSeries.channels()` must yield one (t, m, e) triple per channel."""

    def _assert_channels_match(ts, t, m, e):
        # Compare channel i of the iterator with row i of each input.
        for i, (t_i, m_i, e_i) in zip(range(n_channels), ts.channels()):
            npt.assert_allclose(t_i, t[i])
            npt.assert_allclose(m_i, m[i])
            npt.assert_allclose(e_i, e[i])

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel: every yielded triple equals the first input row.
    ts = TimeSeries(t[0], m[0], e[0])
    for t_i, m_i, e_i in ts.channels():
        npt.assert_allclose(t_i, t[0])
        npt.assert_allclose(m_i, m[0])
        npt.assert_allclose(e_i, e[0])

    # Rectangular multi-channel input.
    _assert_channels_match(TimeSeries(t, m, e), t, m, e)

    # Ragged input: channel i keeps its first i + 2 samples.
    t = [chan[0:i+2] for i, chan in enumerate(t)]
    m = [chan[0:i+2] for i, chan in enumerate(m)]
    e = [chan[0:i+2] for i, chan in enumerate(e)]
    _assert_channels_match(TimeSeries(t, m, e), t, m, e)
def featurize_data(pbmap, pbnames, lcdata, metadata, nobjects, featurefile):
    """
    ***Feature extractor for PLaSTiCC***

    Builds one cesium TimeSeries per object, then either loads the feature
    set stored in ``featurefile`` (when that file exists) or computes it with
    ``worker`` in a process pool, imputes it and saves it to ``featurefile``.
    The features are returned as a pandas DataFrame with a trailing
    ``target`` column.
    Features described in file: feature_sets.

    Parameters
    ----------
    pbmap : dict
        Keys are the passband values to select (one channel per key); values
        are used to suffix the output column names.
    pbnames
        Channel names handed to each TimeSeries.
    lcdata
        Light-curve table with ``object_id``, ``passband``, ``mjd``,
        ``flux`` and ``flux_err`` columns.
    metadata
        Per-object table, indexed by row number and by column name.
    nobjects : int
        Number of metadata rows to process.
    featurefile
        Path of the feature-set cache file.

    Returns
    -------
    pandas.DataFrame
        One row per object, NaNs replaced via ``np.nan_to_num``.

    Created on Mon Apr 29 19:30:52 2019
    @author: luisarribas
    """

    def _say(*lines):
        # Print each progress line on its own row.
        for line in lines:
            print(line)

    _say("", "EXTRACTING FEATURES", "===================", "",
         "Building Timeseries....wait", "===========================")

    # ---- Build one multi-channel TimeSeries per object ----
    tsdict = OrderedDict()
    for idx in range(nobjects):
        record = metadata[idx]
        obj_id = record['object_id']
        label = record['target']
        meta = {
            'zBand': record['zBand'],
            'z': record['hostgal_photoz'],
            'zerr': record['hostgal_photoz_err'],
            'mag': record['magnitude'],
            'u-b': record['u-b'],
            'b-v': record['b-v'],
        }
        light_curve = lcdata[lcdata['object_id'] == obj_id]
        band_masks = [(light_curve['passband'] == pb) for pb in pbmap]
        times = [light_curve['mjd'][mask].data for mask in band_masks]
        fluxes = [light_curve['flux'][mask].data for mask in band_masks]
        flux_errors = [light_curve['flux_err'][mask].data
                       for mask in band_masks]
        tsdict[obj_id] = TimeSeries(t=times, m=fluxes, e=flux_errors,
                                    label=label, name=obj_id,
                                    meta_features=meta,
                                    channel_names=pbnames)
    _say("", "OK!", " ")

    # ---- Feature extraction with cesium (cached in featurefile) ----
    warnings.simplefilter('ignore')
    if not os.path.exists(featurefile):
        features_list = []
        _say("", "Computing features....wait", "==========================")
        with schwimmbad.MultiPool() as pool:
            # imap is consumed while the pool is still open.
            features_list.extend(pool.imap(worker, list(tsdict.values())))
        featuretable = featurize.assemble_featureset(
            features_list=features_list, time_series=tsdict.values())
        featurize.impute_featureset(fset=featuretable, strategy='constant',
                                    value=0,
                                    max_value=18446744073709551000,
                                    inplace=True)
        featurize.save_featureset(fset=featuretable, path=featurefile)
    else:
        _say("", "Loading features from file....wait",
             "==================================")
        featuretable, _ = featurize.load_featureset(featurefile)
    _say("", "OK!", " ")

    # ---- Flatten the (feature, passband) columns into a DataFrame ----
    raw_columns = featuretable.columns.values
    flat_names = ['{}_{}'.format(feat, pbmap.get(band, 'meta'))
                  for feat, band in raw_columns]
    feature_cols = [featuretable[col] for col in raw_columns]
    feature_table = Table(feature_cols, names=flat_names, masked=False)
    feature_table['target'] = metadata['target']
    dense_values = np.nan_to_num(feature_table.to_pandas())
    # 'target' joins the name list only after the first table is built.
    flat_names.append('target')
    allfeats = Table(dense_values, names=flat_names,
                     masked=False).to_pandas()

    print("")
    print("Extracted features = ", len(allfeats.columns))
    print("==========================")
    print("")
    print("Nan Values detected = ", sum(len(allfeats) - allfeats.count()))
    print("==========================")
    return allfeats
#where the length equals how many total observations that object has in the time series data pbind = [np.asarray(thislc['passband'] == pb) for pb in pbmap] #Store the times of all the measurements for this object_id. this will be a list with length 6 (one for # each passband) where each of the 6 items is a list of length = how many observations that object #has for JUST THAT PASSBAND in the time series data t = [np.asarray(thislc.loc[:,'mjd'][mask]) for mask in pbind] #Store the flux measurements taken for this object_id. This is the same shape as t m = [np.asarray(thislc['flux'][mask]) for mask in pbind] #Store the flux measurement errors for this object_id. This is the same shape as t e = [np.asarray(thislc['flux_err'][mask]) for mask in pbind] #Create the TimeSeries() object using the variables defined above, and append it to the ordered dict. tsdict[thisid] = TimeSeries(t=t, m=m, e=e, name=thisid, meta_features=meta,channel_names=pbnames) #Occasionally print progress so we can assess speed if i % 1000000 == 0: print(str(i) + ' done out of ' + str(nobjects)) #Empty list for the actual features we want the TimeSeries() object to facilitate features_list = [] print("Generating features from objects and storing to a table...") with multiprocessing.Pool() as pool: #Apply the worker function to each object in the dict of TimeSeries() objects. see ./testWorker.py #this returns a single row for each object with hundreds of features determined by ./testWorker.py results = pool.imap(worker, list(tsdict.values())) i = 0
def test_time_series_npz():
    """A `TimeSeries` saved to ``.npz`` must load back equal to itself."""

    def _assert_roundtrip(ts):
        # Save to the shared test path and compare with what is loaded back.
        ts.save(TEST_TS_PATH)
        assert_ts_equal(ts, time_series.load(TEST_TS_PATH))

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel.
    _assert_roundtrip(TimeSeries(t[0], m[0], e[0]))

    # Several channels sharing one time/error array.
    _assert_roundtrip(TimeSeries(t[0], m, e[0]))

    # Ragged channels: channel i keeps its first i + 2 samples.
    t = [chan[0:i+2] for i, chan in enumerate(t)]
    m = [chan[0:i+2] for i, chan in enumerate(m)]
    e = [chan[0:i+2] for i, chan in enumerate(e)]
    _assert_roundtrip(TimeSeries(t, m, e))
def test_time_series_npz(tmpdir):
    """A `TimeSeries` saved to ``.npz`` must load back equal to itself.

    Parameters
    ----------
    tmpdir
        Directory in which each case writes its own uniquely named file.
    """

    def _assert_roundtrip(ts):
        # Each case gets a fresh file name so the cases cannot collide.
        ts_path = os.path.join(str(tmpdir), str(uuid4()) + '.npz')
        ts.save(ts_path)
        assert_ts_equal(ts, time_series.load(ts_path))

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel.
    _assert_roundtrip(TimeSeries(t[0], m[0], e[0]))

    # Several channels sharing one time/error array.
    _assert_roundtrip(TimeSeries(t[0], m, e[0]))

    # Ragged channels: channel i keeps its first i + 2 samples.
    t = [chan[0:i + 2] for i, chan in enumerate(t)]
    m = [chan[0:i + 2] for i, chan in enumerate(m)]
    e = [chan[0:i + 2] for i, chan in enumerate(e)]
    _assert_roundtrip(TimeSeries(t, m, e))