def concatenate(*datasets, **kwargs):
    """
    Concatenation of |NDDataset| objects along a given axis.

    Any number of |NDDataset| objects can be concatenated (by default
    along the last dimension). For this operation to be defined, the
    following must be true:

        #. all inputs must be valid |NDDataset| objects;
        #. units of data must be compatible;
        #. concatenation is along the axis specified or the last one;
        #. along the non-concatenated dimensions, shapes must match.

    Parameters
    ----------
    *datasets : positional |NDDataset| arguments
        The dataset(s) to be concatenated to the current dataset. The datasets
        must have the same shape, except in the dimension corresponding to axis
        (the last, by default).
    **kwargs
        Optional keyword parameters (see Other Parameters).

    Returns
    -------
    out
        A |NDDataset| created from the concatenation of the |NDDataset| input objects.

    Other Parameters
    ----------------
    dims : str, optional, default='x'
        The dimension along which the operation is applied.

    axis : int, optional
        The axis along which the operation is applied.

    See Also
    --------
    stack : Stack of |NDDataset| objects along a new dimension.

    Examples
    --------
    >>> A = scp.read('irdata/nh4y-activation.spg', protocol='omnic')
    >>> B = scp.read('irdata/nh4y-activation.scp')
    >>> C = scp.concatenate(A[10:], B[3:5], A[:10], axis=0)
    >>> A[10:].shape, B[3:5].shape, A[:10].shape, C.shape
    ((45, 5549), (2, 5549), (10, 5549), (57, 5549))

    or

    >>> D = A.concatenate(B, B, axis=0)
    >>> A.shape, B.shape, D.shape
    ((55, 5549), (55, 5549), (165, 5549))

    >>> E = A.concatenate(B, axis=1)
    >>> A.shape, B.shape, E.shape
    ((55, 5549), (55, 5549), (55, 11098))
    """

    # check use of the deprecated `force_stack` keyword
    if "force_stack" in kwargs:
        warn("force_stack not used anymore, use stack() instead",
             DeprecationWarning)
        return stack(datasets)

    # get a copy of the input datasets so that the input data are not modified
    datasets = _get_copy(datasets)

    # get axis from arguments
    axis, dim = datasets[0].get_axis(**kwargs)
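    # `axis` is the integer index of the concatenation dimension and `dim` its
    # name (by default the last dimension, 'x')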

    # check shapes, except for dim along which concatenation will be done
    shapes = {ds.shape[:axis] + ds.shape[axis + 1:] for ds in datasets}
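    # e.g. with shapes (45, 5549) and (2, 5549) and axis=0, both reduce to
    # (5549,), so the set contains a single element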
    if len(shapes) != 1:
        raise DimensionsCompatibilityError(
            "all input arrays must have the same shape")

    # check units
    units = tuple(set(ds.units for ds in datasets))
    if len(units) == 1:
        units = datasets[0].units
    else:
        # check compatibility
        for i, u1 in enumerate(units[:-1]):
            for u2 in units[i + 1:]:
                if u1.dimensionality != u2.dimensionality:
                    raise UnitsCompatibilityError(
                        f"Units of the data are {[str(u) for u in units]}. The datasets can't be concatenated"
                    )
        # should be compatible, so convert
        units = datasets[0].units
        for ds in datasets[1:]:
            if ds.units != units:
                ds.ito(units)

    # concatenate or stack the data array + mask
    # --------------------------------------------

    sss = []
    for i, dataset in enumerate(datasets):
        d = dataset.masked_data
        sss.append(d)

    sconcat = np.ma.concatenate(sss, axis=axis)

    data = np.asarray(sconcat)
    mask = sconcat.mask
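    # note: if none of the inputs carry a mask, `sconcat.mask` is `np.ma.nomask`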

    # now manage coordinates and labels
    coords = datasets[0].coordset

    if coords is not None:

        if not coords[dim].is_empty:

            labels = []
            if coords[dim].is_labeled:
                for ds in datasets:
                    labels.append(ds.coordset[dim].labels)

            if coords[dim].implements() in ["Coord", "LinearCoord"]:
                coords[dim] = Coord(coords[dim], linear=False)
                if labels != []:
                    coords[dim]._labels = np.concatenate(labels)
            elif coords[dim].implements("CoordSet"):
                if labels != []:
                    labels = np.array(labels)
                    for i, coord in enumerate(coords[dim]):
                        if labels[:i].size != 0:
                            coord._labels = np.concatenate(
                                [label for label in labels[:, i]])

            coords[dim]._data = np.concatenate(
                tuple((ds.coordset[dim].data for ds in datasets)))

    out = datasets[-1].copy()
    out._data = data
    if coords is not None:
        out._coordset[dim] = coords[dim]
    out._mask = mask
    out._units = units

    out.description = f"Concatenation of {len(datasets)}  datasets:\n"
    out.description += "( {}".format(datasets[0].name)
    out.title = datasets[0].title
    authortuple = (datasets[0].author, )

    for dataset in datasets[1:]:

        if out.title != dataset.title:
            warn(
                "Different data title => the title is that of the 1st dataset")

        if not (dataset.author in authortuple):
            authortuple = authortuple + (dataset.author, )

        out.author = " & ".join([str(author) for author in authortuple])

        out.description += ", {}".format(dataset.name)

    out.description += " )"
    out._date = out._modified = datetime.datetime.now(datetime.timezone.utc)
    out._history = [str(out.date) + ": Created by concatenate"]

    return out
def align(dataset, *others, **kwargs):
    """
    Align individual |NDDataset| along given dimensions using various methods.

    Parameters
    ----------
    dataset : |NDDataset|
        Dataset on which we want to align other objects.
    *others : |NDDataset|
        Objects to align.
    dim : str, optional, default='x'
        Along which axis to perform the alignment.
    dims : list of str, optional, default=None
        Align along all dims defined in dims (if dim is also
        defined, then dims have higher priority).
    method : enum ['outer', 'inner', 'first', 'last', 'interpolate'], optional, default='outer'
        Which method to use for the alignment.

        The available methods are:

        * 'outer' means that a union of the different coordinates is
          achieved (missing values are masked);
        * 'inner' means that the intersection of the coordinates is used;
        * 'first' means that the first dataset is used as reference;
        * 'last' means that the last dataset is used as reference;
        * 'interpolate' means that interpolation is performed relative to
          the first dataset.
    interpolate_method : enum ['linear','pchip'], optional, default='linear'
        Method of interpolation to perform for the alignment.
    interpolate_sampling : 'auto', int or float, optional, default='auto'

        * 'auto' : sampling is determined automatically from the existing data.
        * int : if an integer value is specified, the sampling interval for
          the interpolated data will be split into this number of points.
        * float : if a float value is provided, it determines the interval
          between the interpolated data.
    coord : |Coord|, optional, default=None
        Coordinates to use for alignment; those corresponding to the
        dimensions to align are ignored.
    copy : bool, optional, default=True
        If False, the returned objects will share memory with the original
        objects whenever possible: in principle, only if reindexing is not
        necessary.

    Returns
    -------
    aligned_datasets : tuple of |NDDataset|
        Same objects as datasets with dimensions aligned.

    Raises
    ------
    ValueError
        Issued when the dimensions given in the `dim` or `dims` argument are
        not compatible (units, titles, etc.).
    """
    # DEVELOPER NOTE
    # There are probably better methods, but to simplify dealing with
    # LinearCoord, we transform them into Coord before treatment (going back
    # to linear if possible at the end of the process).

    # TODO: Perform an alignment along numeric labels
    # TODO: add example in docs
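    # A minimal usage sketch (hypothetical datasets `ds1` and `ds2` sharing a
    # compatible 'x' coordinate; parameters as documented above):
    #
    #     aligned1, aligned2 = align(ds1, ds2, dim='x', method='outer')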

    # copy objects?
    copy = kwargs.pop('copy', True)

    # make a single list with dataset and the remaining object
    objects = [dataset] + list(others)

    # should we align on given external coordinates
    extern_coord = kwargs.pop('coord', None)
    if extern_coord and extern_coord.implements('LinearCoord'):
        extern_coord = Coord(extern_coord, linear=False, copy=True)

    # what's the method to use (by default='outer')
    method = kwargs.pop('method', 'outer')

    # trivial cases where alignment is not possible or unnecessary
    if not objects:
        warning_('No object provided for alignment!')
        return None

    if len(objects) == 1 and objects[0].implements(
            'NDDataset') and extern_coord is None:
        # no alignment necessary
        return objects

    # evaluate on which axis we align
    axis, dims = dataset.get_axis(only_first=False, **kwargs)
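    # with only_first=False, `axis` and `dims` are lists with one entry per
    # dimension to align, hence the zip below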

    # check compatibility of the dims and prepare the dimension for alignment
    for axis, dim in zip(axis, dims):

        # get all objects to align
        _objects = {}
        _nobj = 0

        for idx, object in enumerate(objects):

            if not object.implements('NDDataset'):
                error_(
                    f'Bad object(s) found: {object}. Note that only NDDataset '
                    f'objects are accepted '
                    f'for alignment')
                return None

            _objects[_nobj] = {
                'obj': object.copy(),
                'idx': idx,
            }
            _nobj += 1

        _last = _nobj - 1

        # get the reference object (by default the first, except if method is
        # set to 'last')
        ref_obj_index = 0
        if method == 'last':
            ref_obj_index = _last

        ref_obj = _objects[ref_obj_index]['obj']

        # as we will sort their coordinates at some point, we need to know
        # if the coordinates need to be reversed at
        # the end of the alignment process
        reversed = ref_obj.coordset[dim].reversed
        if reversed:
            ref_obj.sort(descend=False, dim=dim, inplace=True)

        # get the coordset corresponding to the reference object
        ref_obj_coordset = ref_obj.coordset

        # get the coordinate for the reference dimension
        ref_coord = ref_obj_coordset[dim]

        reversed = ref_coord.reversed

        # prepare a new Coord object to store the final new dimension
        new_coord = ref_coord.copy()

        ndec = get_n_decimals(new_coord.data.max(), 1.e-5)
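        # `ndec` is the number of decimals used when rounding coordinates for
        # comparison (presumably enough for ~1.e-5 relative precision on the
        # coordinate maximum)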

        if new_coord.implements('LinearCoord'):
            new_coord = Coord(new_coord, linear=False, copy=True)

        # loop over all objects
        for index, object in _objects.items():

            obj = object['obj']

            if obj is ref_obj:
                # not necessary to compare with itself!
                continue

            if reversed:
                obj.sort(descend=False, dim=dim, inplace=True)

            # get the current object coordinates and check compatibility
            coord = obj.coordset[dim]
            if coord.implements('LinearCoord') or coord.linear:
                coord = Coord(coord, linear=False, copy=True)

            if not coord.is_units_compatible(ref_coord):
                # not compatible, stop everything
                raise UnitsCompatibilityError(
                    'NDDataset to align must have compatible units!')

            # do units transform if necessary so coords can be compared
            if coord.units != ref_coord.units:
                coord.ito(ref_coord)

            # adjust the new_coord depending on the method of alignment

            new_coord_data = set(np.around(new_coord.data, ndec))
            coord_data = set(np.around(coord.data, ndec))
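            # rounding to `ndec` decimals makes the float coordinates
            # comparable as set elements despite floating-point noise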

            if method in ['outer', 'interpolate']:
                # in this case we do a union of the coords (masking the
                # missing values)
                # For method=`interpolate`, the interpolation will be
                # performed in a second step
                new_coord._data = sorted(coord_data | new_coord_data)

            elif method == 'inner':
                # take only the intersection of the coordinates
                # (this may result in an empty coordinate set)
                new_coord._data = sorted(coord_data & new_coord_data)

            elif method in ['first', 'last']:
                # we take the reference coordinates already determined as
                # basis (masking the missing values)
                continue

            else:
                raise NotImplementedError(f'The method {method} is unknown!')

        # Now perform alignment of all objects on the new coordinates
        for index, object in _objects.items():

            obj = object['obj']

            # get the dim index for the given object
            dim_index = obj.dims.index(dim)

            # prepare slicing keys ; set slice(None) for the untouched
            # dimensions preceding the dimension of interest
            prepend_keys = [slice(None)] * dim_index

            # New objects for obj must be created with the new coordinates

            # change the data shape
            new_obj_shape = list(obj.shape)
            new_obj_shape[dim_index] = len(new_coord)
            new_obj_data = np.full(new_obj_shape, np.NaN)
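            # NaN-filled placeholder: the actual values are copied in below,
            # and the remaining NaNs are masked/replaced at the end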

            # create new dataset for obj and ref_objects
            if copy:
                new_obj = obj.copy()
            else:
                new_obj = obj

            # update the data and mask
            coord = obj.coordset[dim]
            coord_data = set(np.around(coord.data, ndec))

            dim_loc = new_coord._loc2index(sorted(coord_data))
            loc = tuple(prepend_keys + [dim_loc])
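            # e.g. for a 2D object aligned along its second dimension,
            # loc == (slice(None), dim_loc)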

            new_obj._data = new_obj_data

            # mask all the data then unmask later the relevant data in
            # the next step

            if not new_obj.is_masked:
                new_obj.mask = MASKED
                new_obj.mask[loc] = False
            else:
                mask = new_obj.mask.copy()
                new_obj.mask = MASKED
                new_obj.mask[loc] = mask

            # set the data for the loc
            new_obj._data[loc] = obj.data

            # update the coordinates
            new_coordset = obj.coordset.copy()
            if coord.is_labeled:
                label_shape = list(coord.labels.shape)
                label_shape[0] = new_coord.size
                new_coord._labels = np.zeros(tuple(label_shape)).astype(
                    coord.labels.dtype)
                new_coord._labels[:] = '--'
                new_coord._labels[dim_loc] = coord.labels
            setattr(new_coordset, dim, new_coord)
            new_obj._coordset = new_coordset

            # reversed?
            if reversed:
                # we must reverse the given coordinates
                new_obj.sort(descend=reversed, dim=dim, inplace=True)

            # update the _objects
            _objects[index]['obj'] = new_obj

            if method == 'interpolate':
                warning_(
                    'Interpolation not yet implemented - for now equivalent '
                    'to `outer`')

        # the new transformed objects must be in the same order as the passed
        # objects, and the missing values must be masked (for the moment they
        # are set to NaN)

        for index, object in _objects.items():
            obj = object['obj']
            # obj[np.where(np.isnan(obj))] = MASKED  # mask NaN values
            obj[np.where(np.isnan(
                obj))] = 99999999999999.  # replace NaN values (to simplify
            # comparisons)
            idx = int(object['idx'])
            objects[idx] = obj

            # we also transform into linear coord if possible ?
            pass  # TODO:

    # Now return

    return tuple(objects)
def concatenate(*datasets, **kwargs):
    """
    Concatenation of |NDDataset| objects along a given axis.

    Any number of |NDDataset| objects can be concatenated (by default
    along the last dimension). For this operation to be defined, the
    following must be true:

        #. all inputs must be valid |NDDataset| objects;
        #. units of data and of the axis must be compatible;
        #. concatenation is along the axis specified or the last one;
        #. along the non-concatenated dimensions, shapes and coordinate
           units must match.

    Parameters
    ----------
    *datasets : positional |NDDataset| arguments
        The dataset(s) to be concatenated to the current dataset. The datasets
        must have the same shape, except in the dimension corresponding to axis
        (the last, by default).
    **kwargs : dict
        Optional keyword parameters (see Other Parameters).

    Returns
    -------
    out
        A |NDDataset| created from the concatenation of the |NDDataset| input objects.

    Other Parameters
    ----------------
    dims : str, optional, default='x'
        The dimension along which the operation is applied.
    axis : int, optional
        The axis along which the operation is applied.
    force_stack : bool, optional, default=False
        If True, the datasets are stacked instead of being concatenated. This means that a new dimension is prepended
        to each dataset before stacking, except if one of the dimensions is of size one: in this case the datasets
        are squeezed before stacking. Stacking is only possible if the shapes of the various datasets are identical.
        This process is equivalent to using the method `stack`.

    See Also
    ---------
    stack : Stack of |NDDataset| objects along the first dimension.

    Examples
    --------
    >>> import spectrochempy as scp
    >>> A = scp.read('irdata/nh4y-activation.spg', protocol='omnic')
    >>> B = scp.read('irdata/nh4y-activation.scp')
    >>> C = scp.concatenate(A[10:], B[3:5], A[:10], axis=0)
    >>> A[10:].shape, B[3:5].shape, A[:10].shape, C.shape
    ((45, 5549), (2, 5549), (10, 5549), (57, 5549))

    or

    >>> D = A.concatenate(B, B, axis=0)
    >>> A.shape, B.shape, D.shape
    ((55, 5549), (55, 5549), (165, 5549))

    >>> E = A.concatenate(B, axis=1)
    >>> A.shape, B.shape, E.shape
    ((55, 5549), (55, 5549), (55, 11098))

    Stacking of datasets:
    for n-dimensional datasets (with the same shape), a new dimension is added

    >>> F = A.concatenate(B, force_stack=True)
    >>> A.shape, B.shape, F.shape
    ((55, 5549), (55, 5549), (2, 55, 5549))

    If one of the dimensions is of size one, then this dimension is removed before stacking

    >>> G = A[0].concatenate(B[0], force_stack=True)
    >>> A[0].shape, B[0].shape, G.shape
    ((1, 5549), (1, 5549), (2, 5549))
    """

    # ------------------------------------------------------------------------------------------------------------------
    # checks dataset validity
    # ------------------------------------------------------------------------------------------------------------------

    # We must have a list of datasets
    if isinstance(datasets, tuple):
        if isinstance(datasets[0], (list, tuple)):
            datasets = datasets[0]

    # make a copy of the objects (so that the input data are not modified)
    datasets = [ds.copy() for ds in datasets]

    # try to cast each object to NDDataset
    for i, item in enumerate(datasets):
        if not isinstance(item, NDDataset):
            try:
                datasets[i] = NDDataset(item)
            except Exception:
                raise TypeError(
                    f"Only instances of NDDataset can be concatenated, not {type(item).__name__}, "
                    f"and casting to this type failed.")

    # get the shapes and ndims for comparison
    rshapes = []
    rndims = []
    for item in datasets:
        sh = list(item.shape)
        rshapes.append(sh)
        rndims.append(len(sh))

    # The number of dimensions is expected to be the same for all datasets
    if len(list(set(rndims))) > 1:
        raise DimensionsCompatibilityError(
            "Only NDDataset with the same number of dimensions can be concatenated."
        )

    rcompat = list(map(list, list(map(set, list(zip(*rshapes))))))
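    # rcompat[i] holds the distinct sizes found along dimension i across all
    # datasets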

    # a flag to force stacking of datasets instead of the default concatenation
    force_stack = kwargs.get('force_stack', False)
    if force_stack:
        # when stacking, we add a new first dimension except if one dimension is of size one: in this case we use this
        # dimension for stacking
        prepend = False
        if len(set(list(map(len, rcompat)))) == 1:
            # all datasets have the same shape
            # they can be stacked by prepending a new dimension
            prepend = True
            # else we will try to stack them on the first dimension

        if not prepend:
            warn(
                'These datasets do not have the same shape, so they cannot be stacked. By default they will be '
                'concatenated along the first dimension.',
                category=SpectroChemPyWarning)

        for i, dataset in enumerate(datasets):
            if not prepend or dataset.shape[0] == 1:
                continue
            dataset._data = dataset.data[np.newaxis]
            dataset._mask = dataset.mask[np.newaxis]
            newcoord = Coord([i], labels=[dataset.name])
            newcoord.name = (OrderedSet(DEFAULT_DIM_NAME) -
                             dataset._dims).pop()
            dataset.add_coordset(newcoord)
            dataset.dims = [newcoord.name] + dataset.dims
            # TODO: make a function to simplify this process of adding new dimensions with coords
        axis, dim = datasets[0].get_axis(dim=0)

    else:
        # get axis from arguments (or set it to the default)
        axis, dim = datasets[0].get_axis(**kwargs)

    # check if data shapes are compatible (all dimensions must have the same
    # size except the one to be concatenated)
    for i, item in enumerate(zip(*rshapes)):
        if i != axis and len(set(item)) > 1:
            raise DimensionsCompatibilityError(
                "Datasets must have the same shape for all dimensions except the one along which the"
                " concatenation is performed")

    # Check unit compatibility
    # ------------------------------------------------------------------------------------------------------------------

    units = datasets[0].units
    for dataset in datasets:
        if not dataset.is_units_compatible(datasets[0]):
            raise ValueError(
                'units of the datasets to concatenate are not compatible')
        # if needed transform to the same unit
        dataset.ito(units)
    # TODO: make concatenation of heterogeneous data possible by using labels

    # Check coordinates compatibility
    # ------------------------------------------------------------------------------------------------------------------

    # coordinate units of NDDatasets must be compatible in all dimensions
    # get the coordinate sets
    coordss = [dataset.coordset for dataset in datasets]

    # def check_coordinates(coordss, force_stack):
    #
    #     # We will call this only in case of problems because it takes a lot of time
    #
    #     # how many different coordss
    #     coordss = set(coordss)
    #     if len(coordss) == 1 and force_stack:
    #         # nothing to do (all datasets have the same coords and so are
    #         # perfectly compatibles for stacking)
    #         pass
    #
    #     else:
    #         for i, cs in enumerate(zip(*coordss)):
    #
    #             axs = set(cs)
    #             axref = axs.pop()
    #             for ax in axs:
    #                 # we expect compatible units
    #                 if not ax.is_units_compatible(axref):
    #                     raise ValueError(
    #                         "units of the dataset's axis are not compatible"
    #                     )
    #                 if i != axis and ax.size != axref.size:
    #                     # and same size for the non-concatenated axis
    #                     raise ValueError(
    #                         "size of the non-concatenated dimension must be "
    #                         "identical"
    #                     )

    # concatenate or stack the data array + mask
    # ------------------------------------------------------------------------------------------------------------------

    sss = []
    for i, dataset in enumerate(datasets):
        d = dataset.masked_data
        sss.append(d)

    sconcat = np.ma.concatenate(sss, axis=axis)
    data = np.asarray(sconcat)
    mask = sconcat.mask

    # concatenate coords if they exist
    # ------------------------------------------------------------------------------------------------------------------

    if all(coordset is None for coordset in coordss):
        # no coords
        coords = None
    else:
        # we take the coords of the first dataset, and extend the coord along the concatenation axis
        coords = coordss[0].copy()

        try:
            coords[dim] = Coord(coords[dim],
                                linear=False)  # de-linearize the coordinates
            coords[dim]._data = np.concatenate(
                tuple((c[dim].data for c in coordss)))
        except ValueError:
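            # the coordinate data could not be concatenated (e.g. incompatible
            # sizes); keep the coordinates of the first dataset unchanged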
            pass

        # concatenation of the labels (first check the presence of at least one labeled coordinate)
        is_labeled = False
        for i, c in enumerate(coordss):
            if c[dim].implements() in ['Coord', 'LinearCoord']:
                # this is a coord
                if c[dim].is_labeled:
                    # at least one of the coord is labeled
                    is_labeled = True
                    break
            if c[dim].implements('CoordSet'):
                # this is a coordset
                for coord in c[dim]:
                    if coord.is_labeled:
                        # at least one of the coord is labeled
                        is_labeled = True
                        break

        if is_labeled:
            labels = []
            # be sure that now all the coordinates have a label, or create one
            for i, c in enumerate(coordss):
                if c[dim].implements() in ['Coord', 'LinearCoord']:
                    # this is a coord
                    if c[dim].is_labeled:
                        labels.append(c[dim].labels)
                    else:
                        labels.append(str(i))
                if c[dim].implements('CoordSet'):
                    # this is a coordset
                    for coord in c[dim]:
                        if coord.is_labeled:
                            labels.append(coord.labels)
                        else:
                            labels.append(str(i))

            if isinstance(coords[dim], Coord):
                coords[dim]._labels = np.concatenate(labels)
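            # when coords[dim] is a CoordSet, `labels` holds one entry per
            # (dataset, sub-coordinate) pair in dataset-major order, so the
            # stride labels[i::len(coords[dim])] gathers the labels of
            # sub-coordinate i across all datasets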
            if coords[dim].implements('CoordSet'):
                for i, coord in enumerate(coords[dim]):
                    coord._labels = np.concatenate(labels[i::len(coords[dim])])

    # out = NDDataset(data, coordset=coords, mask=mask, units=units)    # This doesn't keep the order of the
    # coordinates
    out = datasets[-1].copy()
    out._data = data
    if coords is not None:
        out._coordset[dim] = coords[dim]
    out._mask = mask
    out._units = units

    thist = 'Stack' if axis == 0 else 'Concatenation'

    out.description = '{} of {} datasets:\n'.format(thist, len(datasets))
    out.description += '( {}'.format(datasets[0].name)
    out.title = datasets[0].title
    authortuple = (datasets[0].author, )

    for dataset in datasets[1:]:

        if out.title != dataset.title:
            warn(
                'Different data title => the title is that of the 1st dataset')

        if not (dataset.author in authortuple):
            authortuple = authortuple + (dataset.author, )
            out.author = out.author + ' & ' + dataset.author

        out.description += ', {}'.format(dataset.name)

    out.description += ' )'
    out._date = out._modified = datetime.datetime.now(datetime.timezone.utc)
    out._history = [str(out.date) + ': Created by %s' % thist]

    return out