def concatenate(*datasets, **kwargs):
    """
    Join |NDDataset| objects end to end along one of their existing dimensions.

    Any number of |NDDataset| objects may be given (by default they are joined
    along the last dimension). The operation is defined only when:

    #. every input is a valid |NDDataset| object;
    #. the units of the data are compatible;
    #. the join is made along the requested axis, or the last one;
    #. the shapes agree along every dimension that is not concatenated.

    Parameters
    ----------
    *datasets : positional |NDDataset| arguments
        The dataset(s) to join. They must share the same shape, except in the
        dimension corresponding to axis (the last, by default).
    **kwargs
        Optional keyword parameters (see Other Parameters).

    Returns
    --------
    out
        A |NDDataset| holding the concatenation of the input |NDDataset| objects.

    Other Parameters
    ----------------
    dims : str, optional, default='x'
        The dimension along which the operation is applied.
    axis : int, optional
        The axis along which the operation is applied.

    See Also
    ---------
    stack : Stack of |NDDataset| objects along a new dimension.

    Examples
    --------
    >>> A = scp.read('irdata/nh4y-activation.spg', protocol='omnic')
    >>> B = scp.read('irdata/nh4y-activation.scp')
    >>> C = scp.concatenate(A[10:], B[3:5], A[:10], axis=0)
    >>> A[10:].shape, B[3:5].shape, A[:10].shape, C.shape
    ((45, 5549), (2, 5549), (10, 5549), (57, 5549))

    or

    >>> D = A.concatenate(B, B, axis=0)
    >>> A.shape, B.shape, D.shape
    ((55, 5549), (55, 5549), (165, 5549))

    >>> E = A.concatenate(B, axis=1)
    >>> A.shape, B.shape, E.shape
    ((55, 5549), (55, 5549), (55, 11098))
    """
    # legacy keyword: stacking is delegated to stack()
    if "force_stack" in kwargs:
        warn("force_stack not used anymore, use stack() instead", DeprecationWarning)
        return stack(datasets)

    # operate on copies so the caller's datasets are never modified
    datasets = _get_copy(datasets)

    # resolve the axis index and the dimension name from the keyword arguments
    axis, dim = datasets[0].get_axis(**kwargs)

    # once the concatenation axis is dropped, every shape must be the same
    reduced_shapes = set()
    for ds in datasets:
        reduced_shapes.add(ds.shape[:axis] + ds.shape[axis + 1:])
    if len(reduced_shapes) != 1:
        raise DimensionsCompatibilityError(
            "all input arrays must have the same shape")

    # units: every pair of distinct units must share the same dimensionality
    distinct_units = tuple({ds.units for ds in datasets})
    several_units = len(distinct_units) != 1
    if several_units:
        for pos, left in enumerate(distinct_units[:-1]):
            for right in distinct_units[pos + 1:]:
                if left.dimensionality != right.dimensionality:
                    raise UnitsCompatibilityError(
                        f"Units of the data are {[str(u) for u in distinct_units]}. The datasets can't be concatenated"
                    )
    units = datasets[0].units
    if several_units:
        # compatible but different: bring everything to the units of the first
        for ds in datasets[1:]:
            if ds.units != units:
                ds.ito(units)

    # concatenate the data arrays together with their masks
    # -----------------------------------------------------
    merged = np.ma.concatenate([ds.masked_data for ds in datasets], axis=axis)
    data = np.asarray(merged)
    mask = merged.mask

    # coordinates and labels along the concatenated dimension
    coords = datasets[0].coordset
    if coords is not None and not coords[dim].is_empty:
        labels = []
        if coords[dim].is_labeled:
            labels = [ds.coordset[dim].labels for ds in datasets]
        has_labels = len(labels) > 0

        if coords[dim].implements() in ["Coord", "LinearCoord"]:
            # a linear coordinate cannot be extended: de-linearize it first
            coords[dim] = Coord(coords[dim], linear=False)
            if has_labels:
                coords[dim]._labels = np.concatenate(labels)
        elif coords[dim].implements("CoordSet"):
            if has_labels:
                labels = np.array(labels)
                for i, coord in enumerate(coords[dim]):
                    # NOTE(review): `labels[:i]` is empty for i == 0, so the
                    # first coordinate of the set never gets its labels
                    # concatenated - confirm whether `labels[:, i]` was meant.
                    if labels[:i].size != 0:
                        coord._labels = np.concatenate(list(labels[:, i]))

        coords[dim]._data = np.concatenate(
            tuple(ds.coordset[dim].data for ds in datasets))

    # the result is built on a copy of the last dataset
    out = datasets[-1].copy()
    out._data = data
    if coords is not None:
        out._coordset[dim] = coords[dim]
    out._mask = mask
    out._units = units

    out.description = f"Concatenation of {len(datasets)} datasets:\n"
    out.description += "( {}".format(datasets[0].name)
    out.title = datasets[0].title

    authors = (datasets[0].author, )
    for other in datasets[1:]:
        if out.title != other.title:
            warn(
                "Different data title => the title is that of the 1st dataset")
        if other.author not in authors:
            authors = authors + (other.author, )
            out.author = " & ".join([str(name) for name in authors])
        out.description += ", {}".format(other.name)
    out.description += " )"

    now = datetime.datetime.now(datetime.timezone.utc)
    out._date = now
    out._modified = now
    out._history = [str(out.date) + ": Created by concatenate"]

    return out
def align(dataset, *others, **kwargs):
    """
    Align individual |NDDataset| along given dimensions using various methods.

    Parameters
    -----------
    dataset : |NDDataset|
        Dataset on which we want to align other objects.
    *others : |NDDataset|
        Objects to align.
    dim : str. Optional, default='x'
        Along which axis to perform the alignment.
    dims : list of str, optional, default=None
        Align along all dims defined in dims (if dim is also
        defined, then dims have higher priority).
    method : enum ['outer', 'inner', 'first', 'last', 'interpolate'], optional, default='outer'
        Which method to use for the alignment.

        * 'outer' means that a union of the different coordinates is
          achieved (missing values are masked)
        * 'inner' means that the intersection of the coordinates is used
        * 'first' means that the first dataset is used as reference
        * 'last' means that the last dataset is used as reference
        * 'interpolate' means that interpolation is performed relative to
          the first dataset (not yet implemented: currently behaves as
          'outer' and a warning is issued).
    interpolate_method : enum ['linear','pchip']. Optional, default='linear'
        Method of interpolation to perform for the alignment.
    interpolate_sampling : 'auto', int or float. Optional, default='auto'
        * 'auto' : sampling is determined automatically from the existing data.
        * int : if an integer value is specified, then the
          sampling interval for the interpolated data will be split in
          this number of points.
        * float : If a float value is provided, it determines the interval
          between the interpolated data.
    coord : |Coord|, optional, default=None
        Coordinates to use for alignment. Ignore those corresponding to the
        dimensions to align.
    copy : bool, optional, default=True
        If False then the returned objects will share memory with the
        original objects, whenever it is possible :
        in principle only if reindexing is not necessary.

    Returns
    --------
    aligned_datasets : tuple of |NDDataset|
        Same objects as datasets with dimensions aligned. When a single
        dataset is given without external coordinate, the input is returned
        unchanged in a list. ``None`` is returned (after an error has been
        logged) if one of the objects is not a |NDDataset|.

    Raises
    ------
    UnitsCompatibilityError
        Raised when the coordinates along the dimension to align do not have
        compatible units.
    NotImplementedError
        Raised when `method` is not one of the known methods (only detected
        when there is at least one object to compare with the reference).
    """
    # DEVELOPER NOTE
    # There are probably better methods, but to simplify dealing with
    # LinearCoord, we transform them in Coord before treatment (going back
    # to linear if possible at the end of the process)

    # TODO: Perform an alignment along numeric labels
    # TODO: add example in docs

    # copy objects?
    copy = kwargs.pop('copy', True)

    # make a single list with dataset and the remaining objects
    # (never empty, as `dataset` is a required argument)
    objects = [dataset] + list(others)

    # should we align on given external coordinates
    extern_coord = kwargs.pop('coord', None)
    if extern_coord and extern_coord.implements('LinearCoord'):
        extern_coord = Coord(extern_coord, linear=False, copy=True)

    # what's the method to use (by default='outer')
    method = kwargs.pop('method', 'outer')

    # trivial case where alignment is unnecessary
    if len(objects) == 1 and objects[0].implements(
            'NDDataset') and extern_coord is None:
        # no necessary alignment
        return objects

    # evaluate on which axis we align
    axes, dims = dataset.get_axis(only_first=False, **kwargs)

    # check compatibility of the dims and prepare the dimension for alignment
    for _axis, dim in zip(axes, dims):

        # get all objects to align (working on copies of the inputs)
        entries = {}
        for idx, item in enumerate(objects):
            if not item.implements('NDDataset'):
                error_(
                    f'Bad object(s) found: {item}. Note that only NDDataset '
                    f'objects are accepted '
                    f'for alignment')
                return None
            entries[idx] = {
                'obj': item.copy(),
                'idx': idx,
            }

        # get the reference object (by default the first, except if method is
        # set to 'last')
        ref_obj_index = len(entries) - 1 if method == 'last' else 0
        ref_obj = entries[ref_obj_index]['obj']

        # as we will sort the coordinates at some point, we need to know
        # if the coordinates need to be reversed at the end of the alignment
        # process
        is_reversed = ref_obj.coordset[dim].reversed
        if is_reversed:
            ref_obj.sort(descend=False, dim=dim, inplace=True)

        # get the coordinate of the reference object for the current dimension
        ref_coord = ref_obj.coordset[dim]

        # the flag is read again from the (possibly sorted) reference coordinate
        is_reversed = ref_coord.reversed

        # prepare a new Coord object to store the final new dimension
        new_coord = ref_coord.copy()

        # number of decimals used to round coordinates before comparing them
        ndec = get_n_decimals(new_coord.data.max(), 1.e-5)

        if new_coord.implements('LinearCoord'):
            new_coord = Coord(new_coord, linear=False, copy=True)

        # first pass: build the new coordinate from all the objects
        for entry in entries.values():

            obj = entry['obj']

            if obj is ref_obj:
                # not necessary to compare with itself!
                continue

            if is_reversed:
                obj.sort(descend=False, dim=dim, inplace=True)

            # get the current object coordinates and check compatibility
            coord = obj.coordset[dim]
            if coord.implements('LinearCoord') or coord.linear:
                coord = Coord(coord, linear=False, copy=True)

            if not coord.is_units_compatible(ref_coord):
                # not compatible, stop everything
                raise UnitsCompatibilityError(
                    'NDataset to align must have compatible units!')

            # do units transform if necessary so coords can be compared
            if coord.units != ref_coord.units:
                coord.ito(ref_coord)

            # adjust the new_coord depending on the method of alignment
            new_coord_data = set(np.around(new_coord.data, ndec))
            coord_data = set(np.around(coord.data, ndec))

            if method in ['outer', 'interpolate']:
                # in this case we do a union of the coords (masking the
                # missing values)
                # For method=`interpolate`, the interpolation will be
                # performed in a second step
                new_coord._data = sorted(coord_data | new_coord_data)

            elif method == 'inner':
                # take only the intersection of the coordinates
                new_coord._data = sorted(coord_data & new_coord_data)

            elif method in ['first', 'last']:
                # we take the reference coordinates already determined as
                # basis (masking the missing values)
                continue

            else:
                raise NotImplementedError(f'The method {method} is unknown!')

        # second pass: perform alignment of all objects on the new coordinates
        for index, entry in entries.items():

            obj = entry['obj']

            # get the dim index for the given object
            dim_index = obj.dims.index(dim)

            # prepare slicing keys ; set slice(None) for the untouched
            # dimensions preceding the dimension of interest
            prepend_keys = [slice(None)] * dim_index

            # New objects for obj must be created with the new coordinates

            # change the data shape (np.nan: the np.NaN alias was removed in
            # NumPy 2.0)
            new_obj_shape = list(obj.shape)
            new_obj_shape[dim_index] = len(new_coord)
            new_obj_data = np.full(new_obj_shape, np.nan)

            # keep a handle on the original data *before* the data array is
            # replaced: with copy=False, new_obj and obj are the same object
            old_data = obj.data

            # create new dataset for obj and ref_objects
            new_obj = obj.copy() if copy else obj

            # update the data and mask
            coord = obj.coordset[dim]
            coord_data = set(np.around(coord.data, ndec))

            dim_loc = new_coord._loc2index(sorted(coord_data))
            loc = tuple(prepend_keys + [dim_loc])

            new_obj._data = new_obj_data

            # mask all the data then unmask later the relevant data in
            # the next step
            if not new_obj.is_masked:
                new_obj.mask = MASKED
                new_obj.mask[loc] = False
            else:
                mask = new_obj.mask.copy()
                new_obj.mask = MASKED
                new_obj.mask[loc] = mask

            # set the data for the loc
            new_obj._data[loc] = old_data

            # update the coordinates
            new_coordset = obj.coordset.copy()
            if coord.is_labeled:
                label_shape = list(coord.labels.shape)
                label_shape[0] = new_coord.size
                new_coord._labels = np.zeros(tuple(label_shape)).astype(
                    coord.labels.dtype)
                new_coord._labels[:] = '--'
                new_coord._labels[dim_loc] = coord.labels
            setattr(new_coordset, dim, new_coord)
            new_obj._coordset = new_coordset

            # reversed?
            if is_reversed:
                # we must reverse the given coordinates
                new_obj.sort(descend=is_reversed, dim=dim, inplace=True)

            # update the entries
            entries[index]['obj'] = new_obj

            if method == 'interpolate':
                warning_(
                    'Interpolation not yet implemented - for now equivalent '
                    'to `outer`')

        # the new transformed objects must be in the same order as the passed
        # objects, and the missing values must be masked (for the moment they
        # are defined to NaN)
        for entry in entries.values():
            obj = entry['obj']
            # obj[np.where(np.isnan(obj))] = MASKED  # mask NaN values
            # replace NaN values (to simplify comparisons)
            obj[np.where(np.isnan(obj))] = 99999999999999.
            idx = int(entry['idx'])
            objects[idx] = obj

            # TODO: we also transform into linear coord if possible ?

    # Now return
    return tuple(objects)
def concatenate(*datasets, **kwargs):
    """
    Concatenation of |NDDataset| objects along a given axis.

    Any number of |NDDataset| objects can be concatenated (by default
    the last on the last dimension). For this operation
    to be defined the following must be true :

    #. all inputs must be valid |NDDataset| objects;
    #. units of data and axis must be compatible
    #. concatenation is along the axis specified or the last one;
    #. along the non-concatenated dimensions, shape and units coordinates
       must match.

    Parameters
    ----------
    *datasets : positional |NDDataset| arguments
        The dataset(s) to be concatenated to the current dataset. The datasets
        must have the same shape, except in the dimension corresponding to axis
        (the last, by default). A single list or tuple of datasets is also
        accepted as first argument.
    **kwargs : dict
        See other parameters.

    Returns
    --------
    out
        A |NDDataset| created from the concatenation of the |NDDataset| input
        objects.

    Raises
    ------
    ValueError
        If no dataset is given, or if the units of the datasets are not
        compatible.
    TypeError
        If an input is not a |NDDataset| and cannot be cast to one.
    DimensionsCompatibilityError
        If the datasets do not have the same number of dimensions, or if
        their shapes differ along a non-concatenated dimension.

    Other Parameters
    ----------------
    dims : str, optional, default='x'
        The dimension along which the operation is applied.
    force_stack : bool, optional, default=False
        If True, the dataset are stacked instead of being concatenated. This
        means that a new dimension is prepended to each dataset before being
        stacked, except if one of the dimension is of size one. If this case
        the datasets are squeezed before stacking. The stacking is only
        possible if the shape of the various datasets are identical; otherwise
        a warning is issued and the datasets are concatenated along the first
        dimension. This process is equivalent of using the method `stack`.

    See Also
    ---------
    stack : Stack of |NDDataset| objects along the first dimension.

    Examples
    --------
    >>> import spectrochempy as scp
    >>> A = scp.read('irdata/nh4y-activation.spg', protocol='omnic')
    >>> B = scp.read('irdata/nh4y-activation.scp')
    >>> C = scp.concatenate(A[10:], B[3:5], A[:10], axis=0)
    >>> A[10:].shape, B[3:5].shape, A[:10].shape, C.shape
    ((45, 5549), (2, 5549), (10, 5549), (57, 5549))

    or

    >>> D = A.concatenate(B, B, axis=0)
    >>> A.shape, B.shape, D.shape
    ((55, 5549), (55, 5549), (165, 5549))

    >>> E = A.concatenate(B, axis=1)
    >>> A.shape, B.shape, E.shape
    ((55, 5549), (55, 5549), (55, 11098))

    Stacking of datasets:
    for nDimensional datasets (with the same shape), a new dimension is added

    >>> F = A.concatenate(B, force_stack=True)
    >>> A.shape, B.shape, F.shape
    ((55, 5549), (55, 5549), (2, 55, 5549))

    If one of the dimensions is of size one, then this dimension is removed
    before stacking

    >>> G = A[0].concatenate(B[0], force_stack=True)
    >>> A[0].shape, B[0].shape, G.shape
    ((1, 5549), (1, 5549), (2, 5549))
    """
    # NOTE(review): a function with the same name appears to be defined
    # earlier in this module; if so, this later definition is the one that
    # ends up bound to `concatenate` - confirm which one is intended.

    # --------------------------------------------------------------------
    # checks dataset validity
    # --------------------------------------------------------------------

    # accept a single list/tuple of datasets as well as positional datasets
    if datasets and isinstance(datasets[0], (list, tuple)):
        datasets = datasets[0]
    if not datasets:
        raise ValueError("At least one dataset is required for concatenation.")

    # make a copy of the objects (in order that input data are not modified)
    datasets = [ds.copy() for ds in datasets]

    # try to cast each item to NDDataset
    for i, item in enumerate(datasets):
        if not isinstance(item, NDDataset):
            try:
                datasets[i] = NDDataset(item)
            except Exception as err:
                raise TypeError(
                    f"Only instance of NDDataset can be concatenated, not {type(item).__name__}, "
                    f"but casting to this type failed. ") from err

    # get the shapes for comparison
    rshapes = [list(item.shape) for item in datasets]

    # The number of dimensions is expected to be the same for all datasets
    if len({len(shape) for shape in rshapes}) > 1:
        raise DimensionsCompatibilityError(
            "Only NDDataset with the same number of dimensions can be concatenated."
        )

    # for each dimension, the set of sizes found among the datasets
    sizes_per_dim = [set(sizes) for sizes in zip(*rshapes)]

    # a flag to force stacking of dataset instead of the default concatenation
    force_stack = kwargs.get('force_stack', False)
    if force_stack:
        # when stacking, we add a new first dimension except if one dimension
        # is of size one: in this case we use this dimension for stacking.
        # Stacking requires identical shapes, i.e. exactly one size for each
        # dimension.
        prepend = bool(sizes_per_dim) and all(
            len(sizes) == 1 for sizes in sizes_per_dim)

        # else we will try to concatenate them on the first dimension
        if not prepend:
            warn(
                'These datasets have not the same shape, so they cannot be stacked. By default they will be '
                'concatenated along the first dimension.',
                category=SpectroChemPyWarning)

        for i, dataset in enumerate(datasets):
            if not prepend or dataset.shape[0] == 1:
                continue
            # prepend a new dimension, with its own coordinate labelled by the
            # dataset name
            dataset._data = dataset.data[np.newaxis]
            dataset._mask = dataset.mask[np.newaxis]
            newcoord = Coord([i], labels=[dataset.name])
            newcoord.name = (OrderedSet(DEFAULT_DIM_NAME) -
                             dataset._dims).pop()
            dataset.add_coordset(newcoord)
            dataset.dims = [newcoord.name] + dataset.dims
            # TODO: make a function to simplify this process of adding new
            #  dimensions with coords
        axis, dim = datasets[0].get_axis(dim=0)

    else:
        # get axis from arguments (or set it to the default)
        axis, dim = datasets[0].get_axis(**kwargs)

    # check if data shapes are compatible (all dimension must have the same
    # size except the one to be concatenated)
    for i, sizes in enumerate(sizes_per_dim):
        if i != axis and len(sizes) > 1:
            raise DimensionsCompatibilityError(
                "Datasets must have the same shape for all dimensions except the one along which the"
                " concatenation is performed")

    # Check unit compatibility
    # --------------------------------------------------------------------
    units = datasets[0].units
    for dataset in datasets:
        if not dataset.is_units_compatible(datasets[0]):
            raise ValueError(
                'units of the datasets to concatenate are not compatible')
        # if needed transform to the same unit
        dataset.ito(units)
    # TODO: make concatenation of heterogeneous data possible by using labels

    # coordinate sets of all the datasets
    coordss = [dataset.coordset for dataset in datasets]

    # concatenate the data array + mask
    # --------------------------------------------------------------------
    sconcat = np.ma.concatenate([ds.masked_data for ds in datasets],
                                axis=axis)
    data = np.asarray(sconcat)
    mask = sconcat.mask

    # concatenate coords if they exist
    # --------------------------------------------------------------------
    if all(cs is None for cs in coordss):
        # no coords at all (tested without mutating `coordss`)
        coords = None
    else:
        # we take the coords of the first dataset, and extend the coord along
        # the concatenate axis
        coords = coordss[0].copy()

        try:
            # de-linearize the coordinates
            coords[dim] = Coord(coords[dim], linear=False)
            coords[dim]._data = np.concatenate(
                tuple(c[dim].data for c in coordss))
        except ValueError:
            # deliberate best-effort: the coordinate is left as it is
            pass

        # concatenation of the labels (first check the presence of at least
        # one labeled coordinate)
        is_labeled = False
        for c in coordss:
            if c[dim].implements() in ['Coord', 'LinearCoord']:
                # this is a coord
                if c[dim].is_labeled:
                    is_labeled = True
                    break
            if c[dim].implements('CoordSet'):
                # this is a coordset
                if any(coord.is_labeled for coord in c[dim]):
                    is_labeled = True

        if is_labeled:
            labels = []
            # be sure that now all the coordinates have a label, or create one
            # NOTE(review): in the CoordSet case `c[dim].labels` (the labels
            # of the whole set) is appended for each coordinate, and a bare
            # string is appended for unlabeled coordinates - both look
            # suspicious with np.concatenate below; confirm the intent.
            for i, c in enumerate(coordss):
                if c[dim].implements() in ['Coord', 'LinearCoord']:
                    # this is a coord
                    if c[dim].is_labeled:
                        labels.append(c[dim].labels)
                    else:
                        labels.append(str(i))
                if c[dim].implements('CoordSet'):
                    # this is a coordset
                    for coord in c[dim]:
                        if coord.is_labeled:
                            labels.append(c[dim].labels)
                        else:
                            labels.append(str(i))

            if isinstance(coords[dim], Coord):
                coords[dim]._labels = np.concatenate(labels)
            if coords[dim].implements('CoordSet'):
                n_coords = len(coords[dim])
                for i, coord in enumerate(coords[dim]):
                    coord._labels = np.concatenate(labels[i::n_coords])

            coords[dim]._labels = np.concatenate(labels)

    # out = NDDataset(data, coordset=coords, mask=mask, units=units) would not
    # keep the order of the coordinates, so the result is built on a copy of
    # the last dataset
    out = datasets[-1].copy()
    out._data = data
    if coords is not None:
        out._coordset[dim] = coords[dim]
    out._mask = mask
    out._units = units

    # NOTE(review): the operation is named from the axis, not from
    # `force_stack`, so a plain concatenation along axis 0 is reported as a
    # 'Stack' - confirm this is wanted.
    thist = 'Stack' if axis == 0 else 'Concatenation'

    out.description = '{} of {} datasets:\n'.format(thist, len(datasets))
    out.description += '( {}'.format(datasets[0].name)
    out.title = datasets[0].title
    authortuple = (datasets[0].author, )

    for dataset in datasets[1:]:
        if out.title != dataset.title:
            warn(
                'Different data title => the title is that of the 1st dataset')

        if dataset.author not in authortuple:
            authortuple = authortuple + (dataset.author, )
            # rebuild the author string from the ordered, de-duplicated tuple
            # (`out` starts as a copy of the last dataset, so appending to
            # its author would duplicate it and drop the first one)
            out.author = ' & '.join([str(author) for author in authortuple])

        out.description += ', {}'.format(dataset.name)
    out.description += ' )'

    out._date = out._modified = datetime.datetime.now(datetime.timezone.utc)
    out._history = [str(out.date) + ': Created by %s' % thist]

    return out