def test_nddataset_binary_operation_with_other_1D():
    coord1 = Coord(np.linspace(0.0, 10.0, 10))
    coord2 = Coord(np.linspace(1.0, 5.5, 5))
    d1 = NDDataset(np.random.random((10, 5)), coordset=[coord1, coord2])
    d2 = d1[0]
    # this should work independently of the value of the coordinates
    # on dimension y
    d3 = d1 - d2
    assert_array_equal(d3.data, d1.data - d2.data)
def __init__(self, data=None, coordset=None, coordunits=None, coordtitles=None, **kwargs):
    super().__init__(data, **kwargs)

    self._parent = None

    # eventually set the coordinates with optional units and title
    if isinstance(coordset, CoordSet):
        self.set_coordset(**coordset)
    else:
        if coordset is None:
            coordset = [None] * self.ndim
        if coordunits is None:
            coordunits = [None] * self.ndim
        if coordtitles is None:
            coordtitles = [None] * self.ndim

        _coordset = []
        for c, u, t in zip(coordset, coordunits, coordtitles):
            if not isinstance(c, CoordSet):
                if isinstance(c, LinearCoord):
                    coord = LinearCoord(c)
                else:
                    coord = Coord(c)
                if u is not None:
                    coord.units = u
                if t is not None:
                    coord.title = t
            else:
                if u:  # pragma: no cover
                    warning_(
                        "units have been set for a CoordSet, but they will be ignored "
                        "(units are only defined at the coordinate level)")
                if t:  # pragma: no cover
                    warning_(
                        "title will be ignored as titles are only defined "
                        "at the coordinate level")
                coord = c

            _coordset.append(coord)

        if _coordset and set(_coordset) != {Coord()}:
            # if there are no coordinates, do nothing
            self.set_coordset(*_coordset)
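A minimal usage sketch of this constructor, assuming the usual top-level import (`import spectrochempy as scp`, as in the doctests further below); coordset, coordunits and coordtitles each take one entry per dimension:

import numpy as np
import spectrochempy as scp

# one coordinate, unit and title per dimension (y first, then x)
ds = scp.NDDataset(
    np.random.random((3, 4)),
    coordset=[np.arange(3.0), np.arange(4.0)],
    coordunits=["s", "cm^-1"],
    coordtitles=["time", "wavenumber"],
)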
def stack(*datasets):
    """
    Stack of |NDDataset| objects along a new dimension.

    Any number of |NDDataset| objects can be stacked. For this operation
    to be defined the following must be true:

    #. all inputs must be valid dataset objects;
    #. units of data and axis must be compatible (rescaling is applied
       automatically if necessary).

    Parameters
    ----------
    *datasets : a series of |NDDataset|
        The datasets to be stacked.

    Returns
    -------
    out
        A |NDDataset| created from the stack of the `datasets` datasets.

    See Also
    --------
    concatenate : Concatenate |NDDataset| objects along a given dimension.

    Examples
    --------
    >>> A = scp.read('irdata/nh4y-activation.spg', protocol='omnic')
    >>> B = scp.read('irdata/nh4y-activation.scp')
    >>> C = scp.stack(A, B)
    >>> print(C)
    NDDataset: [float64] a.u. (shape: (z:2, y:55, x:5549))
    """
    datasets = _get_copy(datasets)

    shapes = {ds.shape for ds in datasets}
    if len(shapes) != 1:
        raise DimensionsCompatibilityError(
            "all input arrays must have the same shape")

    # prepend a new dimension
    for i, dataset in enumerate(datasets):
        dataset._data = dataset.data[np.newaxis]
        dataset._mask = dataset.mask[np.newaxis]
        newcoord = Coord([i], labels=[dataset.name])
        newcoord.name = (OrderedSet(DEFAULT_DIM_NAME) - dataset._dims).pop()
        dataset.add_coordset(newcoord)
        dataset.dims = [newcoord.name] + dataset.dims

    return concatenate(*datasets, dims=0)
def test_coord_unit_conversion_operators(operation, result_units):
    in_km = Coord(data=np.linspace(4000, 1000, 10),
                  units='km',
                  mask=None,
                  title='something')
    scalar = 2.
    operator_km = getattr(in_km, operation)
    combined = operator_km(scalar)
    debug_(f'{operation}, {combined}')
    assert_equal_units(combined.units, result_units)
def __getitem__(self, items, **kwargs):

    saveditems = items

    # coordinate selection to test first
    if isinstance(items, str):
        try:
            return self._coordset[items]
        except Exception:
            pass

    # slicing
    new, items = super().__getitem__(items, return_index=True)

    if new is None:
        return None

    if self._coordset is not None:
        names = self._coordset.names  # all names of the current coordinates
        new_coords = [None] * len(names)
        for i, item in enumerate(items):
            # get the corresponding dimension name in the dims list
            name = self.dims[i]
            # get the corresponding index in the coordinate's names list
            idx = names.index(name)
            if self._coordset[idx].is_empty:
                new_coords[idx] = Coord(None, name=name)
            elif isinstance(item, slice):
                # add the slice on the corresponding coordinates on the dim
                # to the new list of coordinates
                if not isinstance(self._coordset[idx], CoordSet):
                    new_coords[idx] = self._coordset[idx][item]
                else:
                    # we must slice all internal coordinates
                    newc = []
                    for c in self._coordset[idx]:
                        newc.append(c[item])
                    # we reverse to be sure the order will be kept for
                    # internal coordinates
                    new_coords[idx] = CoordSet(*newc[::-1], name=name)
                    # set the same default coord
                    new_coords[idx]._default = self._coordset[idx]._default
                    new_coords[idx]._is_same_dim = self._coordset[idx]._is_same_dim
            elif isinstance(item, (np.ndarray, list)):
                new_coords[idx] = self._coordset[idx][item]

        new.set_coordset(*new_coords, keepnames=True)

    new.history = f"Slice extracted: ({saveditems})"
    return new
def test_coord_unit_conversion_operators_a(operation, result_units):
    print(operation, result_units)
    in_km = Coord(data=np.linspace(4000, 1000, 10),
                  units='km',
                  mask=None,
                  title='something')
    scalar_in_m = 2. * ur.m
    operator_km = getattr(in_km, operation)
    combined = operator_km(scalar_in_m)
    assert_equal_units(combined.units, result_units)
def test_nddataset_add_mismatch_coords():
    coord1 = Coord(np.arange(5.0))
    coord2 = Coord(np.arange(1.0, 5.5, 1.0))
    d1 = NDDataset(np.ones((5, 5)), coordset=[coord1, coord2])
    d2 = NDDataset(np.ones((5, 5)), coordset=[coord2, coord1])
    with pytest.raises(CoordinateMismatchError) as exc:
        d1 -= d2
    assert str(exc.value).startswith(
        "\nCoord.data attributes are not almost equal")
    with pytest.raises(CoordinateMismatchError) as exc:
        d1 += d2
    assert str(exc.value).startswith(
        "\nCoord.data attributes are not almost equal")
    # TODO: make more tests like this for various functions
def test_coord_add_units_with_different_scale():
    d1 = Coord.arange(3.0, units="m")
    d2 = Coord.arange(3.0, units="cm")

    x = d1 + 1.0 * ur.cm
    assert x.data[1] == 1.01

    x = d1 + d2
    assert x.data[1] == 1.01

    x = d2 + d1
    assert x.data[1] == 101.0

    d1 += d2
    assert d1.data[1] == 1.01

    d2 += d1
    assert d2.data[1] == 102.0
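The rescaling rule these assertions rely on can be reproduced with pint alone (assumed here to be the units library behind `ur`): the result of an addition is expressed in the left operand's units.

import pint

ureg = pint.UnitRegistry()
a = 1.0 * ureg.m
b = 1.0 * ureg.cm
print(a + b)  # 1.01 meter       -> right operand rescaled to the left's units
print(b + a)  # 101.0 centimeter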
def test_coord_not_implemented(name):
    coord0 = Coord(data=np.linspace(4000, 1000, 10),
                   units='cm^-1',
                   mask=None,
                   title='wavelength')
    with pytest.raises(NotImplementedError):
        getattr(coord0, name)()
def test_IRIS():
    X = NDDataset.read_omnic(os.path.join('irdata', 'CO@Mo_Al2O3.SPG'))

    p = [
        0.00300, 0.00400, 0.00900, 0.01400, 0.02100, 0.02600, 0.03600,
        0.05100, 0.09300, 0.15000, 0.20300, 0.30000, 0.40400, 0.50300,
        0.60200, 0.70200, 0.80100, 0.90500, 1.00400
    ]

    X.coordset.update(y=Coord(p, title='pressure', units='torr'))
    # Using the `update` method is mandatory because it will preserve the
    # name. Indeed, setting X.coordset[0] = Coord(...) directly fails
    # unless the name is specified: Coord(..., name='y')

    # set the optimization parameters, perform the analysis
    # and plot the results
    param = {
        'epsRange': [-8, -1, 20],
        'lambdaRange': [-7, -5, 3],
        'kernel': 'langmuir'
    }

    X_ = X[:, 2250.:1950.]
    X_.plot()

    iris = IRIS(X_, param, verbose=True)
    f = iris.f
    X_hat = iris.reconstruct()

    iris.plotlcurve(scale='ln')
    f[0].plot(method='map', plottitle=True)
    X_hat[0].plot(plottitle=True)
    show()
def test_linearcoord():
    coord1 = Coord([1, 2.5, 4, 5])
    coord2 = Coord(np.array([1, 2.5, 4, 5]))
    assert coord2 == coord1

    coord3 = Coord(range(10))
    coord4 = Coord(np.arange(10))
    assert coord4 == coord3

    coord5 = coord4.copy()
    coord5 += 1
    assert np.all(coord5.data == coord4.data + 1)
    assert coord5 is not None
    coord5.linear = True

    coord6 = Coord(linear=True, offset=2.0, increment=2.0, size=10)
    assert np.all(coord6.data == (coord4.data + 1.0) * 2.)

    LinearCoord(offset=2.0, increment=2.0, size=10)

    coord0 = LinearCoord.linspace(200., 300., 3,
                                  labels=['cold', 'normal', 'hot'],
                                  units="K",
                                  title='temperature')
    coord1 = LinearCoord.linspace(0., 60., 100,
                                  labels=None,
                                  units="minutes",
                                  title='time-on-stream')
    coord2 = LinearCoord.linspace(4000., 1000., 100,
                                  labels=None,
                                  units="cm^-1",
                                  title='wavenumber')

    assert coord0.size == 3
    assert coord1.size == 100
    assert coord2.size == 100

    coordc = coord0.copy()
    assert coord0 == coordc

    coordc = coord1.copy()
    assert coord1 == coordc
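A linear coordinate is just an affine grid; a plain-numpy sketch of what `Coord(linear=True, offset=2.0, increment=2.0, size=10)` is asserted to produce above:

import numpy as np

offset, increment, size = 2.0, 2.0, 10
data = offset + increment * np.arange(size)  # [2., 4., ..., 20.]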
def test_coord_unary_ufuncs_simple_data(name):
    coord0 = Coord(data=np.linspace(4000, 1000, 10),
                   units='km',
                   mask=None,
                   title='something')
    f = getattr(np, name)
    r = f(coord0)
    assert isinstance(r, Coord)
def test_coord_slicing():
    # slicing by index
    coord0 = Coord(data=np.linspace(4000, 1000, 10),
                   mask=None,
                   title="wavelength")
    assert coord0[0] == 4000.0

    coord1 = Coord(data=np.linspace(4000, 1000, 10),
                   units="cm^-1",
                   mask=None,
                   title="wavelength")
    c1 = coord1[0]
    assert isinstance(c1.values, Quantity)
    assert coord1[0].values == 4000.0 * (1.0 / ur.cm)

    # slicing with labels
    labs = list("abcdefghij")
    coord0 = Coord(
        data=np.linspace(4000, 1000, 10),
        labels=labs,
        units="cm^-1",
        mask=None,
        title="wavelength",
    )
    assert coord0[0].values == 4000.0 * (1.0 / ur.cm)
    assert isinstance(coord0[0].values, Quantity)
    assert coord0[2] == coord0["c"]
    assert coord0["c":"d"] == coord0[2:4]  # label included

    # slicing only-labels coordinates
    y = list("abcdefghij")
    a = Coord(labels=y, name="x")
    assert a.name == "x"
    assert isinstance(a.labels, np.ndarray)
    assert_array_equal(a.values, a.labels)
def ev(self):
    """
    Explained variance (|NDDataset|).
    """
    size = self.s.size
    ev = self.s ** 2 / (size - 1)
    ev.name = "ev"
    ev.title = "explained variance"
    ev.set_coordset(
        Coord(None,
              labels=[f"#{i + 1}" for i in range(size)],
              title="Components"))
    return ev
def sv(self):
    """
    Singular values (|NDDataset|).
    """
    size = self.s.size
    sv = self.s.copy()
    sv.name = "sv"
    sv.title = "singular values"
    sv.set_coordset(
        Coord(None,
              labels=[f"#{i + 1}" for i in range(size)],
              title="Components"))
    return sv
def update(self, **kwargs):
    """
    Update specific coordinates in the CoordSet.

    Parameters
    ----------
    **kwargs
        Only keywords among the CoordSet.names are allowed - they denote
        the name of a dimension.
    """
    dims = kwargs.keys()
    for dim in list(dims):
        if dim in self.names:
            # we can replace the given coordinates
            idx = self.names.index(dim)
            self[idx] = Coord(kwargs.pop(dim), name=dim)
def _add_omnic_info(dataset, **kwargs):
    # get the time and name
    name = desc = dataset.name

    # modify the dataset metadata
    dataset.units = "absorbance"
    dataset.title = "absorbance"
    dataset.name = name
    dataset.description = "Dataset from .csv file: {}\n".format(desc)
    dataset.history = (str(datetime.now(timezone.utc)) +
                       ":read from omnic exported csv file \n")
    dataset.origin = "omnic"

    # Set the NDDataset date
    dataset._date = datetime.now(timezone.utc)
    dataset._modified = dataset.date

    # x axis
    dataset.x.units = "cm^-1"

    # y axis ?
    if "_" in name:
        name, dat = name.split("_")
        # if needed convert weekday name to English
        dat = dat.replace("Lun", "Mon")
        dat = dat[:3].replace("Mar", "Tue") + dat[3:]
        dat = dat.replace("Mer", "Wed")
        dat = dat.replace("Jeu", "Thu")
        dat = dat.replace("Ven", "Fri")
        dat = dat.replace("Sam", "Sat")
        dat = dat.replace("Dim", "Sun")
        # convert month name to English
        dat = dat.replace("Aout", "Aug")

        # get the dates
        acqdate = datetime.strptime(dat, "%a %b %d %H-%M-%S %Y")

        # Transform back to timestamp for storage in the Coord object;
        # use datetime.fromtimestamp(d, timezone.utc) to transform back
        # to a datetime object
        timestamp = acqdate.timestamp()

        dataset.y = Coord(np.array([timestamp]), name="y")
        dataset.set_coordtitles(y="acquisition timestamp (GMT)",
                                x="wavenumbers")
        dataset.y.labels = np.array([[acqdate], [name]])
        dataset.y.units = "s"

    return dataset
def _make_concentrations_matrix(*profiles):
    from spectrochempy.core.dataset.coord import Coord
    from spectrochempy.core.dataset.nddataset import NDDataset

    t = Coord(np.linspace(0, 10, 50), units='hour', title='time')
    c = []
    for p in profiles:
        c.append(p(t.data))
    ct = np.vstack(c)
    ct = ct - ct.min()
    ct = ct / np.sum(ct, axis=0)
    ct = NDDataset(data=ct,
                   title='concentration',
                   coordset=[range(len(ct)), t])
    return ct
def _make_spectra_matrix(pos, width, ampl):
    from spectrochempy.core.dataset.coord import Coord
    from spectrochempy.core.dataset.nddataset import NDDataset
    from spectrochempy.core.fitting.models import gaussianmodel

    x = Coord(np.linspace(6000.0, 1000.0, 4000),
              units='cm^-1',
              title='wavenumbers')
    s = []
    for args in zip(ampl, width, pos):
        s.append(gaussianmodel().f(x.data, *args))
    st = np.vstack(s)
    st = NDDataset(data=st,
                   units='absorbance',
                   title='absorbance',
                   coordset=[range(len(st)), x])
    return st
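A hedged usage sketch of the two helpers above; the kinetic profiles passed as callables are made up for illustration (any callable of the time array works):

import numpy as np

growing = lambda t: 1.0 - np.exp(-t / 2.0)   # hypothetical growing species
decaying = lambda t: np.exp(-t / 2.0)        # hypothetical decaying species

C = _make_concentrations_matrix(growing, decaying)   # (2, 50) concentrations
St = _make_spectra_matrix(pos=(2000.0, 4000.0),
                          width=(200.0, 300.0),
                          ampl=(1.0, 0.5))           # (2, 4000) spectra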
def get_conc(self, n_pc=None):
    """
    Computes abstract concentration profiles (first in - first out).

    Parameters
    ----------
    n_pc : int, optional
        Number of pure species for which the concentration profile must
        be computed. If None, profiles are computed for all K species.

    Returns
    -------
    concentrations
        Concentration profile.
    """
    M, K = self.f_ev.shape
    if n_pc is None:
        n_pc = K
    n_pc = min(K, n_pc)

    f = self.f_ev
    b = self.b_ev

    xcoord = Coord(range(n_pc), title="PS#")
    c = NDDataset(
        np.zeros((M, n_pc)),
        coordset=CoordSet(y=self._X.y, x=xcoord),
        name=f"C_EFA[{self._X.name}]",
        title="relative concentration",
        description="Concentration profile from EFA",
        history=f"{datetime.now(timezone.utc)}: created by spectrochempy",
    )
    if self._X.is_masked:
        masked_rows = np.all(self._X.mask, axis=-1)
    else:
        masked_rows = np.array([False] * M)

    for i in range(M):
        if masked_rows[i]:
            c[i] = MASKED
            continue
        c[i] = np.min((f.data[i, :n_pc], b.data[i, :n_pc][::-1]), axis=0)
    return c
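A plain-numpy sketch of the first-in-first-out rule implemented by the loop above: for each observation, the abstract concentration is the elementwise minimum of the forward eigenvalues and the reversed backward eigenvalues (data made up here):

import numpy as np

M, K, n_pc = 20, 3, 3
f = np.random.random((M, K))  # forward EFA eigenvalues (hypothetical)
b = np.random.random((M, K))  # backward EFA eigenvalues (hypothetical)
c = np.minimum(f[:, :n_pc], b[:, :n_pc][:, ::-1])  # vectorized form of the loop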
def _valid_coordset(self, coords):
    # used in coords_validate and setattr
    if coords is None:
        return

    for k, coord in enumerate(coords):

        if (coord is not None and not isinstance(coord, CoordSet)
                and coord.data is None):
            continue

        # For coord to be acceptable, we require at least a NDArray,
        # a NDArray subclass or a CoordSet
        if not isinstance(coord, (LinearCoord, Coord, CoordSet)):
            if isinstance(coord, NDArray):
                coord = coords[k] = Coord(coord)
            else:
                raise TypeError(
                    "Coordinates must be an instance or a subclass of Coord, "
                    f"NDArray or CoordSet, but an instance of {type(coord)} "
                    "has been passed")

        if self.dims and coord.name in self.dims:
            # check the validity of the given coordinates in terms of size
            # (if it corresponds to one of the dims)
            size = coord.size

            if self.implements("NDDataset"):
                idx = self._get_dims_index(coord.name)[0]  # idx in self.dims
                if size != self._data.shape[idx]:
                    raise ValueError(
                        f"the size of a coordinates array must be None or be"
                        f" equal to that of the respective `{coord.name}`"
                        f" data dimension but coordinate size={size} !="
                        f" data shape[{idx}]={self._data.shape[idx]}")
            else:
                # bypass this checking for any other derived type
                # (should be done in the subclass)
                pass

    coords._parent = self
    return coords
def __setattr__(self, key, value):

    if key in DEFAULT_DIM_NAME:  # syntax such as ds.x, ds.y, etc...
        # Note the above test is important to avoid errors with traitlets
        # even if it looks redundant with the following
        if key in self.dims:
            if self._coordset is None:
                # we need to create a coordset first
                self.set_coordset(
                    dict((self.dims[i], None) for i in range(self.ndim)))
            idx = self._coordset.names.index(key)
            _coordset = self._coordset
            listcoord = False
            if isinstance(value, list):
                listcoord = all([isinstance(item, Coord) for item in value])
            if listcoord:
                _coordset[idx] = list(CoordSet(value).to_dict().values())[0]
                _coordset[idx].name = key
                _coordset[idx]._is_same_dim = True
            elif isinstance(value, CoordSet):
                if len(value) > 1:
                    value = CoordSet(value)
                _coordset[idx] = list(value.to_dict().values())[0]
                _coordset[idx].name = key
                _coordset[idx]._is_same_dim = True
            elif isinstance(value, (Coord, LinearCoord)):
                value.name = key
                _coordset[idx] = value
            else:
                _coordset[idx] = Coord(value, name=key)
            _coordset = self._valid_coordset(_coordset)
            self._coordset.set(_coordset)
        else:
            raise AttributeError(f"Coordinate `{key}` is not used.")
    else:
        super().__setattr__(key, value)
def _read_txt(*args, **kwargs):
    # read Labspec *.txt files or series

    dataset, filename = args
    content = kwargs.get("content", False)

    if content:
        pass
        # fid = io.StringIO(content)
        # TODO: get the list of lines from the content string
    else:
        fid = open(filename, "r", encoding="utf-8")
        try:
            lines = fid.readlines()
        except UnicodeDecodeError:
            fid = open(filename, "r", encoding="latin-1")
            lines = fid.readlines()
        fid.close()

    if len(lines) == 0:
        return

    # Metadata
    meta = Meta()
    i = 0
    while lines[i].startswith("#"):
        key, val = lines[i].split("=")
        key = key[1:]
        if key in meta.keys():
            key = f"{key} {i}"
        meta[key] = val.strip()
        i += 1

    # The .txt extension is fairly common. We determine non-labspec files
    # based on the absence of a few keys. Two types of files (1D or 2D)
    # are considered:
    labspec_keys_1D = ["Acq. time (s)", "Dark correction"]
    labspec_keys_2D = ["Exposition", "Grating"]

    if all(keywd in meta.keys() for keywd in labspec_keys_1D):
        pass
    elif all(keywd in meta.keys() for keywd in labspec_keys_2D):
        pass
    else:
        # this is not a labspec txt file
        return

    # read spec
    rawdata = np.genfromtxt(lines[i:], delimiter="\t")

    # populate the dataset
    if rawdata.shape[1] == 2:
        data = rawdata[:, 1][np.newaxis]
        _x = Coord(rawdata[:, 0], title="Raman shift", units="1/cm")
        _y = Coord(None, title="Time", units="s")
        date_acq, _y = _transf_meta(_y, meta)
    else:
        data = rawdata[1:, 1:]
        _x = Coord(rawdata[0, 1:], title="Raman shift", units="1/cm")
        _y = Coord(rawdata[1:, 0], title="Time", units="s")
        date_acq, _y = _transf_meta(_y, meta)

    # try to transform to linear coord
    _x.linear = True

    # if successful, linear should still be True
    if _x.linear:
        _x = LinearCoord(_x)

    # set dataset metadata
    dataset.data = data
    dataset.set_coordset(y=_y, x=_x)
    dataset.title = "Counts"
    dataset.units = None
    dataset.name = filename.stem
    dataset.meta = meta

    # date_acq is the acquisition date at start (first moment of acquisition)
    dataset.description = "Spectrum acquisition : " + str(date_acq)

    # Set the NDDataset date
    dataset._date = datetime.datetime.now(datetime.timezone.utc)
    dataset._modified = dataset.date

    # Set origin, description and history
    dataset.history = f"{dataset.date}:imported from LabSpec6 text file {filename}"

    return dataset
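A sketch of the two tab-separated layouts this reader distinguishes, with made-up numbers; `np.genfromtxt` accepts a list of strings, which is how the remaining lines are parsed above:

import numpy as np

# 1D file: two columns (Raman shift, counts)
raw = np.genfromtxt(["100.0\t5.0", "101.0\t6.0"], delimiter="\t")
assert raw.shape[1] == 2  # -> data = raw[:, 1][np.newaxis]

# 2D series: first row = Raman shifts, first column = times
raw = np.genfromtxt(["0.0\t100.0\t101.0",
                     "1.0\t5.0\t6.0",
                     "2.0\t7.0\t8.0"], delimiter="\t")
x, y, data = raw[0, 1:], raw[1:, 0], raw[1:, 1:]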
def __init__(self, dataset, centered=True, standardized=False, scaled=False):
    """
    Parameters
    ----------
    dataset : |NDDataset| object
        The input dataset has shape (M, N). M is the number of
        observations (for example a series of IR spectra) while N is the
        number of features (for example the wavenumbers measured in each
        IR spectrum).
    centered : bool, optional, default:True
        If True the data are centered around the mean values:
        :math:`X' = X - mean(X)`.
    standardized : bool, optional, default:False
        If True the data are scaled to unit standard deviation:
        :math:`X' = X / \\sigma`.
    scaled : bool, optional, default:False
        If True the data are scaled in the interval [0-1]:
        :math:`X' = (X - min(X)) / (max(X) - min(X))`.
    """
    self.prefs = dataset.preferences

    self._X = X = dataset

    Xsc = X.copy()

    # mean center the dataset
    # -----------------------
    self._centered = centered
    if centered:
        self._center = center = np.mean(X, axis=0)
        Xsc = X - center
        Xsc.title = "centered %s" % X.title

    # Standardization
    # ---------------
    self._standardized = standardized
    if standardized:
        self._std = np.std(Xsc, axis=0)
        Xsc /= self._std
        Xsc.title = "standardized %s" % Xsc.title

    # Scaling
    # -------
    self._scaled = scaled
    if scaled:
        self._min = np.min(Xsc, axis=0)
        self._ampl = np.ptp(Xsc, axis=0)
        Xsc -= self._min
        Xsc /= self._ampl
        Xsc.title = "scaled %s" % Xsc.title

    self._Xscaled = Xsc

    # perform SVD
    # -----------
    svd = SVD(Xsc)
    sigma = svd.s.diag()
    U = svd.U
    VT = svd.VT

    # select n_pc loadings & compute scores
    # -------------------------------------

    # loadings
    LT = VT
    LT.title = 'loadings (L^T) of ' + X.name
    LT.history = 'Created by PCA'

    # scores
    S = dot(U, sigma)
    S.title = 'scores (S) of ' + X.name
    S.set_coordset(y=X.y,
                   x=Coord(None,
                           labels=['#%d' % (i + 1)
                                   for i in range(svd.s.size)],
                           title='principal component'))
    S.description = 'scores (S) of ' + X.name
    S.history = 'Created by PCA'

    self._LT = LT
    self._S = S

    # other attributes
    # ----------------
    self._sv = svd.sv
    self._sv.x.title = 'PC #'

    self._ev = svd.ev
    self._ev.x.title = 'PC #'

    self._ev_ratio = svd.ev_ratio
    self._ev_ratio.x.title = 'PC #'

    self._ev_cum = svd.ev_cum
    self._ev_cum.x.title = 'PC #'

    return
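The decomposition performed here reduces to a few lines of plain numpy (a sketch under the default centered=True, with made-up data): after centering, the SVD gives the scores S = U @ diag(s) and the loadings L^T = V^T, so that X is recovered as S @ L^T.

import numpy as np

X = np.random.random((10, 50))             # hypothetical (M x N) data matrix
Xc = X - X.mean(axis=0)                    # centering, as with centered=True
U, s, VT = np.linalg.svd(Xc, full_matrices=False)
S = U @ np.diag(s)                         # scores
LT = VT                                    # loadings
assert np.allclose(Xc, S @ LT)             # exact reconstruction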
def align(dataset, *others, **kwargs):
    """
    Align individual |NDDataset| along given dimensions using various methods.

    Parameters
    ----------
    dataset : |NDDataset|
        Dataset on which we want to align other objects.
    *others : |NDDataset|
        Objects to align.
    dim : str, optional, default='x'
        Along which axis to perform the alignment.
    dims : list of str, optional, default=None
        Align along all dims defined in dims (if dim is also defined,
        then dims have higher priority).
    method : enum ['outer', 'inner', 'first', 'last', 'interpolate'], optional, default='outer'
        Which method to use for the alignment. If align is defined:

        * 'outer' means that a union of the different coordinates is
          achieved (missing values are masked);
        * 'inner' means that the intersection of the coordinates is used;
        * 'first' means that the first dataset is used as reference;
        * 'last' means that the last dataset is used as reference;
        * 'interpolate' means that interpolation is performed relative to
          the first dataset.
    interpolate_method : enum ['linear', 'pchip'], optional, default='linear'
        Method of interpolation to perform for the alignment.
    interpolate_sampling : 'auto', int or float, optional, default='auto'
        * 'auto' : sampling is determined automatically from the existing data;
        * int : if an integer value is specified, the sampling interval for
          the interpolated data will be split into this number of points;
        * float : if a float value is provided, it determines the interval
          between the interpolated data.
    coord : |Coord|, optional, default=None
        Coordinates to use for alignment. Ignores those corresponding to
        the dimensions to align.
    copy : bool, optional, default=True
        If False then the returned objects will share memory with the
        original objects, whenever it is possible: in principle only if
        reindexing is not necessary.

    Returns
    -------
    aligned_datasets : tuple of |NDDataset|
        Same objects as datasets with dimensions aligned.

    Raises
    ------
    ValueError
        Issued when the dimensions given in the `dim` or `dims` argument
        are not compatible (units, titles, etc...).
    """
    # DEVELOPER NOTE
    # There are probably better methods, but to simplify dealing with
    # LinearCoord, we transform them into Coord before treatment (going
    # back to linear if possible at the end of the process)
    # TODO: Perform an alignment along numeric labels
    # TODO: add example in docs

    # copy objects?
    copy = kwargs.pop('copy', True)

    # make a single list with dataset and the remaining objects
    objects = [dataset] + list(others)

    # should we align on given external coordinates
    extern_coord = kwargs.pop('coord', None)
    if extern_coord and extern_coord.implements('LinearCoord'):
        extern_coord = Coord(extern_coord, linear=False, copy=True)

    # what's the method to use (by default='outer')
    method = kwargs.pop('method', 'outer')

    # trivial cases where alignment is not possible or unnecessary
    if not objects:
        warning_('No object provided for alignment!')
        return None

    if (len(objects) == 1 and objects[0].implements('NDDataset')
            and extern_coord is None):
        # no alignment necessary
        return objects

    # evaluate on which axis we align
    axis, dims = dataset.get_axis(only_first=False, **kwargs)

    # check compatibility of the dims and prepare the dimension for alignment
    for axis, dim in zip(axis, dims):

        # get all objects to align
        _objects = {}
        _nobj = 0

        for idx, object in enumerate(objects):

            if not object.implements('NDDataset'):
                error_(f'Bad object(s) found: {object}. Note that only '
                       f'NDDataset objects are accepted for alignment')
                return None

            _objects[_nobj] = {
                'obj': object.copy(),
                'idx': idx,
            }
            _nobj += 1

        _last = _nobj - 1

        # get the reference object (by default the first, except if method
        # is set to 'last')
        ref_obj_index = 0
        if method == 'last':
            ref_obj_index = _last

        ref_obj = _objects[ref_obj_index]['obj']

        # as we will sort their coordinates at some point, we need to know
        # if the coordinates need to be reversed at the end of the
        # alignment process
        reversed = ref_obj.coordset[dim].reversed
        if reversed:
            ref_obj.sort(descend=False, dim=dim, inplace=True)

        # get the coordset corresponding to the reference object
        ref_obj_coordset = ref_obj.coordset

        # get the coordinate for the reference dimension
        ref_coord = ref_obj_coordset[dim]

        # as we will sort their coordinates at some point, we need to know
        # if the coordinates need to be reversed at the end of the
        # alignment process
        reversed = ref_coord.reversed

        # prepare a new Coord object to store the final new dimension
        new_coord = ref_coord.copy()

        ndec = get_n_decimals(new_coord.data.max(), 1.e-5)

        if new_coord.implements('LinearCoord'):
            new_coord = Coord(new_coord, linear=False, copy=True)

        # loop on all objects
        for index, object in _objects.items():

            obj = object['obj']

            if obj is ref_obj:
                # not necessary to compare with itself!
                continue

            if reversed:
                obj.sort(descend=False, dim=dim, inplace=True)

            # get the current object coordinates and check compatibility
            coord = obj.coordset[dim]
            if coord.implements('LinearCoord') or coord.linear:
                coord = Coord(coord, linear=False, copy=True)

            if not coord.is_units_compatible(ref_coord):
                # not compatible, stop everything
                raise UnitsCompatibilityError(
                    'NDataset to align must have compatible units!')

            # do units transform if necessary so coords can be compared
            if coord.units != ref_coord.units:
                coord.ito(ref_coord)

            # adjust the new_coord depending on the method of alignment
            new_coord_data = set(np.around(new_coord.data, ndec))
            coord_data = set(np.around(coord.data, ndec))

            if method in ['outer', 'interpolate']:
                # in this case we do a union of the coords (masking the
                # missing values). For method='interpolate', the
                # interpolation will be performed in a second step
                new_coord._data = sorted(coord_data | new_coord_data)
            elif method == 'inner':
                # take only the intersection of the coordinates
                # and generate a warning if the result is empty
                new_coord._data = sorted(coord_data & new_coord_data)
            elif method in ['first', 'last']:
                # we take the reference coordinates already determined as
                # basis (masking the missing values)
                continue
            else:
                raise NotImplementedError(f'The method {method} is unknown!')

        # Now perform alignment of all objects on the new coordinates
        for index, object in _objects.items():

            obj = object['obj']

            # get the dim index for the given object
            dim_index = obj.dims.index(dim)

            # prepare slicing keys ; set slice(None) for the untouched
            # dimensions preceding the dimension of interest
            prepend_keys = [slice(None)] * dim_index

            # New objects for obj must be created with the new coordinates

            # change the data shape
            new_obj_shape = list(obj.shape)
            new_obj_shape[dim_index] = len(new_coord)
            new_obj_data = np.full(new_obj_shape, np.NaN)

            # create new dataset for obj and ref_objects
            if copy:
                new_obj = obj.copy()
            else:
                new_obj = obj

            # update the data and mask
            coord = obj.coordset[dim]
            coord_data = set(np.around(coord.data, ndec))

            dim_loc = new_coord._loc2index(sorted(coord_data))
            loc = tuple(prepend_keys + [dim_loc])

            new_obj._data = new_obj_data

            # mask all the data, then unmask later the relevant data in
            # the next step
            if not new_obj.is_masked:
                new_obj.mask = MASKED
                new_obj.mask[loc] = False
            else:
                mask = new_obj.mask.copy()
                new_obj.mask = MASKED
                new_obj.mask[loc] = mask

            # set the data for the loc
            new_obj._data[loc] = obj.data

            # update the coordinates
            new_coordset = obj.coordset.copy()
            if coord.is_labeled:
                label_shape = list(coord.labels.shape)
                label_shape[0] = new_coord.size
                new_coord._labels = np.zeros(tuple(label_shape)).astype(
                    coord.labels.dtype)
                new_coord._labels[:] = '--'
                new_coord._labels[dim_loc] = coord.labels
            setattr(new_coordset, dim, new_coord)
            new_obj._coordset = new_coordset

            # reversed?
            if reversed:
                # we must reverse the given coordinates
                new_obj.sort(descend=reversed, dim=dim, inplace=True)

            # update the _objects
            _objects[index]['obj'] = new_obj

        if method == 'interpolate':
            warning_('Interpolation not yet implemented - for now '
                     'equivalent to `outer`')

        # the new transformed objects must be in the same order as the
        # passed objects, and the missing values must be masked (for the
        # moment they are defined as NaN)
        for index, object in _objects.items():
            obj = object['obj']
            # obj[np.where(np.isnan(obj))] = MASKED  # mask NaN values
            obj[np.where(np.isnan(obj))] = 99999999999999.
            # replace NaN values (to simplify comparisons)
            idx = int(object['idx'])
            objects[idx] = obj

        # we also transform into linear coord if possible?
        pass  # TODO:

    # Now return
    return tuple(objects)
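The coordinate-merging step at the heart of the 'outer' and 'inner' methods can be isolated in a few lines (a sketch with made-up coordinate arrays, using the same round-then-set approach as above):

import numpy as np

ndec = 2
c1 = set(np.around(np.array([1.0, 2.0, 3.0]), ndec))
c2 = set(np.around(np.array([2.0, 3.0, 4.0]), ndec))
outer = sorted(c1 | c2)  # union -> [1.0, 2.0, 3.0, 4.0]; missing values get masked
inner = sorted(c1 & c2)  # intersection -> [2.0, 3.0]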
def concatenate(*datasets, **kwargs):
    """
    Concatenation of |NDDataset| objects along a given axis.

    Any number of |NDDataset| objects can be concatenated (by default
    along the last dimension). For this operation to be defined the
    following must be true:

    #. all inputs must be valid |NDDataset| objects;
    #. units of data must be compatible;
    #. concatenation is along the axis specified or the last one;
    #. along the non-concatenated dimensions, shapes must match.

    Parameters
    ----------
    *datasets : positional |NDDataset| arguments
        The dataset(s) to be concatenated to the current dataset. The
        datasets must have the same shape, except in the dimension
        corresponding to axis (the last, by default).
    **kwargs
        Optional keyword parameters (see Other Parameters).

    Returns
    -------
    out
        A |NDDataset| created from the concatenation of the |NDDataset|
        input objects.

    Other Parameters
    ----------------
    dims : str, optional, default='x'
        The dimension along which the operation is applied.
    axis : int, optional
        The axis along which the operation is applied.

    See Also
    --------
    stack : Stack of |NDDataset| objects along a new dimension.

    Examples
    --------
    >>> A = scp.read('irdata/nh4y-activation.spg', protocol='omnic')
    >>> B = scp.read('irdata/nh4y-activation.scp')
    >>> C = scp.concatenate(A[10:], B[3:5], A[:10], axis=0)
    >>> A[10:].shape, B[3:5].shape, A[:10].shape, C.shape
    ((45, 5549), (2, 5549), (10, 5549), (57, 5549))

    or

    >>> D = A.concatenate(B, B, axis=0)
    >>> A.shape, B.shape, D.shape
    ((55, 5549), (55, 5549), (165, 5549))

    >>> E = A.concatenate(B, axis=1)
    >>> A.shape, B.shape, E.shape
    ((55, 5549), (55, 5549), (55, 11098))
    """
    # check deprecated use
    if "force_stack" in kwargs:
        warn("force_stack is not used anymore, use stack() instead",
             DeprecationWarning)
        return stack(*datasets)

    # get a copy of input datasets so that input data are not modified
    datasets = _get_copy(datasets)

    # get axis from arguments
    axis, dim = datasets[0].get_axis(**kwargs)

    # check shapes, except for dim along which concatenation will be done
    shapes = {ds.shape[:axis] + ds.shape[axis + 1:] for ds in datasets}
    if len(shapes) != 1:
        raise DimensionsCompatibilityError(
            "all input arrays must have the same shape")

    # check units
    units = tuple(set(ds.units for ds in datasets))
    if len(units) == 1:
        units = datasets[0].units
    else:
        # check compatibility
        for i, u1 in enumerate(units[:-1]):
            for u2 in units[i + 1:]:
                if u1.dimensionality != u2.dimensionality:
                    raise UnitsCompatibilityError(
                        f"Units of the data are {[str(u) for u in units]}. "
                        f"The datasets can't be concatenated")
        # should be compatible, so convert
        units = datasets[0].units
        for ds in datasets[1:]:
            if ds.units != units:
                ds.ito(units)

    # concatenate or stack the data array + mask
    # ------------------------------------------
    sss = []
    for i, dataset in enumerate(datasets):
        d = dataset.masked_data
        sss.append(d)

    sconcat = np.ma.concatenate(sss, axis=axis)
    data = np.asarray(sconcat)
    mask = sconcat.mask

    # now manage coordinates and labels
    coords = datasets[0].coordset
    if coords is not None:
        if not coords[dim].is_empty:
            labels = []
            if coords[dim].is_labeled:
                for ds in datasets:
                    labels.append(ds.coordset[dim].labels)
            if coords[dim].implements() in ["Coord", "LinearCoord"]:
                coords[dim] = Coord(coords[dim], linear=False)
                if labels != []:
                    coords[dim]._labels = np.concatenate(labels)
            elif coords[dim].implements("CoordSet"):
                if labels != []:
                    labels = np.array(labels)
                    for i, coord in enumerate(coords[dim]):
                        if labels[:, i].size != 0:
                            coord._labels = np.concatenate(
                                [label for label in labels[:, i]])
            coords[dim]._data = np.concatenate(
                tuple(ds.coordset[dim].data for ds in datasets))

    out = dataset.copy()
    out._data = data
    if coords is not None:
        out._coordset[dim] = coords[dim]
    out._mask = mask
    out._units = units

    out.description = f"Concatenation of {len(datasets)} datasets:\n"
    out.description += "( {}".format(datasets[0].name)
    out.title = datasets[0].title

    authortuple = (datasets[0].author, )

    for dataset in datasets[1:]:

        if out.title != dataset.title:
            warn("Different data title => the title is that of the "
                 "1st dataset")

        if not (dataset.author in authortuple):
            authortuple = authortuple + (dataset.author, )
            out.author = " & ".join([str(author) for author in authortuple])

        out.description += ", {}".format(dataset.name)

    out.description += " )"
    out._date = out._modified = datetime.datetime.now(datetime.timezone.utc)
    out._history = [str(out.date) + ": Created by concatenate"]

    return out