Example #1
def _check_signal_specification_v1(obj):
    # [implicit] - it must be a dataframe
    if not isinstance(obj, pd.DataFrame):
        raise SignalSpecificationError('Signals must be a dataframe',
                                       SignalSpecificationErrorCode.BAD_TYPE)
    dataframe = obj

    # [1] - dimensions (probably not necessary with dataframes)
    if len(dataframe.shape) != 2:  # pragma: no cover
        raise SignalSpecificationError('Dataframe must have two dimensions',
                                       SignalSpecificationErrorCode.BAD_SHAPE)

    # [2] - datetime index
    index = dataframe.index
    if not pd.core.dtypes.common.is_datetime64_any_dtype(index):
        raise SignalSpecificationError('Dataframe must have a datetime index',
                                       SignalSpecificationErrorCode.BAD_TYPE)

    # [3] - uniform time support
    # Use dsu.pandas_helpers.estimate_rate, since that code already raises
    # a DSUException on the same cases that we want to test
    if dataframe.shape[0] >= 2:
        from dsu.pandas_helpers import estimate_rate
        from dsu.exceptions import DSUException
        try:
            estimate_rate(dataframe)
        except DSUException as ex:
            raise SignalSpecificationError(f'Invalid dataframe index: {ex}',
                                           SignalSpecificationErrorCode.BAD_SAMPLING) from ex

    # [4] - known columns
    # Use mne channel names for standard 10/05 to avoid writing them down
    from mne.channels import make_standard_montage
    known_columns = (
            make_standard_montage('standard_1005').ch_names +  # EEG
            ['I', 'II', 'III', 'aVR', 'aVL', 'aVF'] +  # ECG
            ['PPG', 'GSR', 'PZT']
    )
    for col in dataframe.columns:
        if col == 'sample_number':
            continue
        if col not in known_columns:
            raise SignalSpecificationError(f'Unexpected "{col}" column',
                                           SignalSpecificationErrorCode.UNKNOWN_COLUMN_NAME)

    # [5] - column types
    for col in known_columns:
        if col not in dataframe.columns:
            continue
        if not np.issubdtype(dataframe[col].dtype, np.number):
            raise SignalSpecificationError(f'Column "{col}" must be of numeric dtype',
                                           SignalSpecificationErrorCode.INCORRECT_COLUMN_TYPE)

    if 'sample_number' in dataframe.columns:
        if not np.issubdtype(dataframe['sample_number'].dtype, np.integer):
            raise SignalSpecificationError('Column "sample_number" must be of integer dtype',
                                           SignalSpecificationErrorCode.INCORRECT_COLUMN_TYPE)
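
A minimal usage sketch for the checker above, assuming the function and its SignalSpecificationError class are importable from wherever they are defined (the import path below is hypothetical); 'Cz' comes from the standard 10/05 montage and 'PPG' from the known-columns list in the code:

import numpy as np
import pandas as pd
# Hypothetical import path; adjust to the module that defines the checker
# from myproject.signal_spec import _check_signal_specification_v1, SignalSpecificationError

index = pd.date_range('2020-01-01', periods=512, freq='2ms')  # uniform 500 Hz
good = pd.DataFrame({'Cz': np.random.randn(512),
                     'PPG': np.random.randn(512),
                     'sample_number': np.arange(512)},
                    index=index)
_check_signal_specification_v1(good)  # passes silently

bad = good.rename(columns={'Cz': 'not_a_channel'})
try:
    _check_signal_specification_v1(bad)
except SignalSpecificationError as ex:
    print(ex)  # -> unexpected "not_a_channel" column error
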
Example #2
    def run(self, *, signals: pd.DataFrame) -> FileAdapter:

        if signals.empty:
            raise SoftPreconditionFailed('Input signals are empty')
        if 'PPG' not in signals.columns:
            raise SoftPreconditionFailed(
                'Input signals do not have a PPG column')

        output_file = self.default_outputs()
        fs = int(estimate_rate(signals))
        bands = (0.5, 11)
        self.logger.info(
            'Band-pass filtering signal between %.2f -- %.2f Hz '
            'with a FIR filter of order %d', *bands, fs)

        filtered = filtfilt_signal(
            signals,
            order=fs,
            frequencies=bands,
            filter_type='bandpass',
            filter_design='fir',
        )
        scaled = scale_signal(filtered, method='robust')

        self.logger.info('Cleaned PPG signal, input shape %s, output shape %s',
                         signals.shape, scaled.shape)

        with pd.HDFStore(output_file.file, 'w') as store:
            scaled.to_hdf(store, self.output_hdf5_key)

        return output_file
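
The core of this task is a zero-phase band-pass FIR filter followed by robust scaling. A standalone sketch of that processing with scipy on a synthetic PPG signal (an illustrative equivalent, not the project's filtfilt_signal / scale_signal helpers; the 512 Hz rate is assumed):

import numpy as np
from scipy import signal as sp

fs = 512  # assumed sampling rate in Hz
t = np.arange(30 * fs) / fs
ppg = np.sin(2 * np.pi * 1.2 * t) + 0.1 * np.random.randn(t.size)  # ~72 bpm pulse plus noise

# FIR band-pass between 0.5 and 11 Hz; order ~ fs gives a one-second impulse response
taps = sp.firwin(fs + 1, [0.5, 11], pass_zero=False, fs=fs)
filtered = sp.filtfilt(taps, [1.0], ppg)  # filtfilt applies the filter forwards and backwards (zero phase)

# Robust scaling: center on the median, divide by the inter-quartile range
q75, q25 = np.percentile(filtered, [75, 25])
scaled = (filtered - np.median(filtered)) / (q75 - q25)
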
Example #3
    def run(self, *, signals: pd.DataFrame) -> FileAdapter:
        if signals.empty:
            raise SoftPreconditionFailed('Input signals are empty')
        if self.column not in signals.columns:
            raise SoftPreconditionFailed(
                f'Input signals do not have a "{self.column}" column')

        output_file = self.default_outputs()

        # Step 1: calculate SSF
        fs = estimate_rate(signals)
        window_samples = int(self.window_fraction * fs)
        ppg = signals[self.column]
        ppg_ssf = ssf(ppg, win=window_samples)
        df_ssf = pd.DataFrame({'PPG_SSF': ppg_ssf}, index=signals.index)
        self.logger.info(
            'Calculated SSF signal, input shape %s, output shape %s',
            signals.shape, ppg_ssf.shape)

        # Step 2: detect peak with adaptive threshold
        peaks, thresh = detect_ssf_peaks(df_ssf.PPG_SSF,
                                         threshold_percentage=0.50)

        # Step 3: convert to PP intervals and post-process them
        df_interval = peak_to_nn(peaks).rename(columns={'interval': 'NN'})

        # Step 4: interpolate NN
        df_interpolated = nn_interpolation(df_interval, fs=fs, column='NN')

        with pd.HDFStore(output_file.file, 'w') as store:
            df_ssf.to_hdf(store, self.ssf_output_hdf5_key)
            df_interval.to_hdf(store, self.ssf_nn_output_hdf5_key)
            df_interpolated.to_hdf(store, self.ssf_nni_output_hdf5_key)

        return output_file
Example #4
    def run(self, *, signal: pd.DataFrame) -> FileAdapter:
        output_file = self.default_outputs()

        if self.column not in signal:
            raise SoftPreconditionFailed(f'Input dataframe does not have column "{self.column}"')
        x = signal[self.column]
        fs = int(estimate_rate(x))

        properties, _ = extract_all_peaks(x, window_size=fs)

        with pd.HDFStore(output_file.file, 'w') as store:
            properties.to_hdf(store, self.output_hdf5_key)

        return output_file
Example #5
    def run(self, signals: pd.DataFrame) -> FileAdapter:
        """Extract and pre-process signals"""
        logger.info('Extracting Nexus signal %s -> %s on file %s',
                    self.source_column, self.target_column,
                    prefect.context.run_kwargs['signals'])

        raw = (
            signals[[self.source_column]]
            .rename(columns={self.source_column: self.target_column})
        )

        # Estimate the sampling frequency: signals with heavy jitter will fail
        # early here, since estimate_rate raises a DSUException. See issue #44
        try:
            fs = estimate_rate(raw)
        except DSUException as ex:
            logger.warning('Failed to estimate rate: %s, raising a precondition fail', ex)
            raise SoftPreconditionFailed(str(ex)) from ex

        logger.debug('Uniform resampling from %.3f Hz to %d Hz', fs, self.sampling_rate)
        # Uniform sampling, with linear interpolation.
        # sample-and-hold is not a good strategy, see issue 48:
        # https://github.com/OpenMindInnovation/iguazu/issues/48
        raw_uniform = uniform_sampling(raw, self.sampling_rate,
                                       interpolation_kind='linear')

        # Create the annotations companion dataframe and mark any NaN as an
        # "unknown" problem, since it must come from the device / driver.
        # A sparse version (keeping only the rows with NaNs) was considered but
        # dropped: it complicates the code for very little space saving.
        raw_annotations = raw_uniform.isna().replace({True: 'unknown', False: ''})

        n_samples = raw_uniform.shape[0]
        n_nans = (raw_annotations != '').sum()
        logger.debug('Finished standardization of Nexus signal %s -> %s. '
                     'Result has %d samples (%.1f seconds, %.1f minutes) '
                     '%d samples are NaN (%.1f %%).',
                     self.source_column, self.target_column,
                     n_samples,
                     n_samples / self.sampling_rate,
                     n_samples / self.sampling_rate / 60,
                     n_nans,
                     100 * n_nans / n_samples)
        if n_samples > 0:
            logger.debug('Extract of result:\n%s',
                         raw_uniform.to_string(max_rows=5))

        return self.save(raw_uniform, raw_annotations)
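
Conceptually, uniform_sampling with linear interpolation reindexes the jittery signal onto a regular time grid. A rough pandas-only sketch of that idea (an illustration, not the dsu implementation; the 128 Hz source and 512 Hz target rates are made up):

import numpy as np
import pandas as pd

# Synthetic stand-in for `raw`: a signal at roughly 128 Hz with timestamp jitter
n, nominal_fs, target_fs = 1280, 128, 512
jitter = np.random.uniform(-1e-3, 1e-3, n)
index = pd.to_datetime('2020-01-01') + pd.to_timedelta(np.arange(n) / nominal_fs + jitter, unit='s')
raw = pd.DataFrame({'PPG': np.random.randn(n)}, index=index)

# Regular grid at the target rate, linear interpolation in time
grid = pd.date_range(raw.index[0], raw.index[-1], freq=pd.Timedelta(seconds=1) / target_fs)
raw_uniform = (raw.reindex(raw.index.union(grid))
                  .interpolate(method='time')
                  .reindex(grid))
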
Example #6
def respiration_clean(data, column='PZT'):
    '''Clean a respiration (PZT) signal with the BioSPPy method of neurokit2.

    TODO: does this function belong to dsu?

    Parameters
    ----------
    data: pd.DataFrame
        Dataframe with a datetime index containing the respiration signal.
    column: str
        Name of the column holding the respiration (PZT) signal.

    Returns
    -------
    pd.DataFrame
        The input dataframe with `column` replaced by its cleaned version
        (the column is modified in place).
    '''
    sampling_rate = estimate_rate(data)
    data.loc[:, column] = nk.rsp_clean(data[column],
                                       sampling_rate,
                                       method='BioSPPy')
    return data
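
A usage sketch, assuming estimate_rate comes from dsu.pandas_helpers (as in Example #1) and neurokit2 is installed; the input needs a uniform datetime index so the rate can be estimated (the 256 Hz rate is illustrative):

import numpy as np
import pandas as pd

fs = 256  # Hz
t = np.arange(60 * fs) / fs
index = pd.to_datetime('2020-01-01') + pd.to_timedelta(t, unit='s')
# Synthetic breathing signal at ~0.25 Hz (15 breaths per minute) plus noise
pzt = pd.DataFrame({'PZT': np.sin(2 * np.pi * 0.25 * t) + 0.05 * np.random.randn(t.size)},
                   index=index)

cleaned = respiration_clean(pzt, column='PZT')

Note that the function writes the cleaned values back into the input dataframe's column and returns that same dataframe.
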
Example #7
def bandpower(data, bands, epoch_size, epoch_overlap, fs=None, scaling='density', relative=False):
    # Note: epoch_size and epoch_overlap are expressed in seconds.
    # The sampling rate fs is in Hz; when not given and the input has a datetime
    # index, it is estimated with estimate_rate (whose output is also in Hz).

    if isinstance(data, (pd.DataFrame, pd.Series)) and isinstance(data.index, (pd.TimedeltaIndex, pd.DatetimeIndex)):
        datetime_index = True
        fs = fs or int(estimate_rate(data))
        if isinstance(data, pd.DataFrame):
            columns = data.columns
            x = np.asarray(data.values)
        else:
            columns = [data.name]
            x = np.asarray(data.values)[:, np.newaxis]
    else:
        datetime_index = False
        x = np.asarray(data)
        if x.ndim not in (1, 2):
            raise ValueError('bandpower only supports 1- or 2-dimensional data')
        elif x.ndim == 1:
            x = x[:, np.newaxis]
        columns = [f'x{i+1}' for i in range(x.shape[1])]
        fs = fs or 1

    logger.debug('Calculating %s spectra with fs=%dHz on data shaped as %s',
                 'relative' if relative else 'absolute', fs, data.shape)

    x = x.T
    nsamples = x.shape[1]
    nperseg = int(epoch_size * fs)
    noverlap = int(epoch_overlap * fs)
    if nperseg > nsamples:
        raise ValueError('Epoch size is larger than data')

    # Note: according to the _spectral_helper code, the difference between
    # scaling='density' and 'spectrum' is just how the window normalizes the
    # final value: one uses the sum of squared window values, the other the
    # square of the window sum. 'density' also divides by fs (which is why the
    # units change to V^2 / Hz). The choice is not critical as long as we are
    # consistent.
    freqs, t, psd = _spectral_helper(x, x, fs=fs, window='hann',
                                     nperseg=nperseg, noverlap=noverlap,
                                     detrend='constant', return_onesided=True,
                                     scaling=scaling, mode='psd')  # TODO: psd or stft ???

    # Manage index
    if datetime_index:
        index = data.index[0] + pd.to_timedelta(t, unit='s')
    else:
        # No datetime index: build an integer sample index for the epochs
        index = pd.Index((t * (nperseg - noverlap)).astype(int),
                         name='sample')
    n = index.shape[0]

    powers = []
    rel_suffix = '_rel' if relative else '_abs'
    for name, (f_start, f_stop) in bands.items():
        f_start = f_start or 0
        f_stop = f_stop or fs / 2
        idx = (freqs >= f_start) & (freqs < f_stop)
        if idx.sum() == 0:
            logger.warning('Band %s is empty for fs=%d and nperseg=%d',
                           name, fs, nperseg)
            result = pd.DataFrame(data=[[np.nan] * len(columns)] * n,
                                  columns=columns,
                                  index=index)
        else:
            logger.debug('Calculating band power for %s with %d bins',
                         name, idx.sum())
            if idx.sum() <= 1:
                logger.warning('Band power for %s will be zero because there '
                               'are not enough frequency points to calculate an '
                               'integral', name)
            # TODO: we should manage the 1-bin case, but how ?
            # pxx axes: (column, freq, time)
            # bp = psd[:, idx, :].sum(axis=1)
            bp = simps(psd[:, idx, :], freqs[idx], axis=1)

            if relative:
                bp /= simps(psd, freqs, axis=1)

            # power_sum axes: (column, time)
            result = pd.DataFrame(data=bp.T,  # (time, column)
                                  columns=columns,
                                  index=index)

        powers.append(result.add_suffix(f'_{name}{rel_suffix}'))

    return pd.concat(powers, axis='columns')
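
A usage sketch with a plain 2-D numpy array, so there is no datetime index and fs must be passed explicitly; the band names and edges are illustrative. An open band edge (None) falls back to 0 or the Nyquist frequency:

import numpy as np

fs = 256  # Hz
t = np.arange(30 * fs) / fs
# Two channels: a 10 Hz rhythm plus noise, and pure broadband noise
data = np.stack([np.sin(2 * np.pi * 10 * t) + 0.1 * np.random.randn(t.size),
                 np.random.randn(t.size)],
                axis=1)

bands = {'alpha': (8, 13), 'beta': (13, 30), 'total': (None, None)}
bp = bandpower(data, bands, epoch_size=2, epoch_overlap=1, fs=fs, relative=True)
# One row per 2-second epoch; columns x1_alpha_rel, x2_alpha_rel, x1_beta_rel, ...
print(bp.head())
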
Example #8
def galvanic_cvx(signals,
                 annotations,
                 column=None,
                 warmup_duration=15,
                 threshold_scr=4.0,
                 cvxeda_params=None,
                 epoch_size=None,
                 epoch_overlap=None):
    """ Separate galvanic components using a convex deconvolution.

    This function separates the phasic (SCR) and tonic (SCL) galvanic components
    using the cvxEDA algorithm.

    For large signals, the underlying library (cvx) is particularly heavy on
    memory usage. Also, since it uses C code, it holds and does not release the
    Python global interpreter lock (GIL). This makes everything more difficult,
    in particular for Iguazu. To manage this, the algorithm can be applied by
    epochs using both the `epoch_size` and `epoch_overlap` parameters.

    Parameters
    ----------
    signals: pd.DataFrame
        Dataframe containing the preprocessed GSR in the column given by `column`.
    annotations: pd.DataFrame
        String-valued annotations dataframe sharing the index of `signals`;
        warm-up and outlier marks are written into its 'GSR' column.
    column: str | None
        Name of the column where the data of interest are located.
        If None, the last column is used.
    warmup_duration: float
        Duration in seconds at the beginning and end of the data to annotate
        as warm-up.
    threshold_scr: float
        Maximum acceptable amplitude of the SCR component; samples at or above
        it are annotated as outliers.
    cvxeda_params: dict | None
        Keyword arguments passed to the cvxEDA algorithm.
    epoch_size: float
        Epoch size in seconds. When set to ``None``, cvxEDA will be applied
        only once on the whole signal.
    epoch_overlap: float
        Epoch overlap in seconds. When set to ``None``, cvxEDA will be applied
        only once on the whole signal.

    Returns
    -------
    signals: pd.DataFrame
        Dataframe with the separated components, with columns renamed to
        `GSR_SCR` and `GSR_SCL`.
    annotations: pd.DataFrame
        The updated annotations dataframe.

    Examples
    --------

      .. image:: ../source/_static/examples/galvanic_functions/io_deconvolution.png

    """
    cvxeda_params = cvxeda_params or {}

    # Extract the SCR and SCL components using the cvxEDA deconvolution toolbox

    # If no column is specified, use the last one
    column = column or signals.columns[-1]

    n = signals.shape[0]
    idx_epochs = np.arange(n)[np.newaxis, :]
    idx_warmup = slice(0, n)

    if epoch_size is not None and epoch_overlap is not None:
        fs = estimate_rate(signals)
        n_warmup = int(warmup_duration * fs)
        n_epoch = int(epoch_size * fs) + n_warmup
        n_overlap = int(epoch_overlap * fs)
        logger.debug(
            'Attempting to epoch signal of %d samples into epochs '
            'of %d samples and %d overlap', n, n_epoch, n_overlap)

        if n < n_epoch + n_overlap:
            # Data is not big enough for window
            logger.debug(
                'Cannot epoch into epochs of %d samples, signal is not '
                'large enough. Falling back to complete implementation.',
                n_epoch)
        else:
            idx_epochs = sliding_window(np.arange(n),
                                        size=n_epoch,
                                        stepsize=n_overlap)
            idx_warmup = slice(n_warmup, -n_warmup)
            logger.debug('cvxEDA epoched implementation with %d epochs',
                         len(idx_epochs))

    else:
        logger.debug('cvxEDA complete implementation')

    epochs = []
    for i, idx in enumerate(idx_epochs):
        logger.debug('Epoch %d / %d', i + 1, len(idx_epochs))
        chunk = signals.iloc[idx][[column]].dropna()
        if not chunk.empty:
            chunk = (apply_cvxEDA(
                chunk, **cvxeda_params).iloc[idx_warmup].rename_axis(
                    index='epoched_index').reset_index())
            epochs.append(chunk)

    if not epochs:
        return pd.DataFrame(), pd.DataFrame()  # or None?

    signals = (pd.concat(
        epochs,
        ignore_index=False).groupby('epoched_index').mean().rename_axis(
            index=signals.index.name))

    # Annotate an amplitude-based rejection criterion for SCR outliers
    # todo: helper with inputs (data, annotations, and an index or boolean condition) that sets values in data to NaN and records the reason in annotations
    annotations.loc[signals[signals[column + '_SCR'] >= threshold_scr].index,
                    'GSR'] = 'CVX SCR outlier'

    warm_up_timedelta = warmup_duration * np.timedelta64(1, 's')
    annotations.loc[:signals.index[0] + warm_up_timedelta,
                    'GSR'] = 'CVX warm up'
    annotations.loc[signals.index[-1] - warm_up_timedelta:,
                    'GSR'] = 'CVX warm up'
    # Replace the original column name by 'GSR' for readability
    signals.columns = signals.columns.str.replace(column, 'GSR')

    return signals, annotations
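
A usage sketch, assuming the preprocessed GSR lives in a 'GSR' column, the annotations dataframe is string-valued and shares the signal index, and apply_cvxEDA names its outputs with the '_SCR' / '_SCL' suffixes implied by the code above; the rate, durations and synthetic random-walk signal are illustrative:

import numpy as np
import pandas as pd

fs = 256  # Hz
n = 5 * 60 * fs  # five minutes of signal
index = pd.to_datetime('2020-01-01') + pd.to_timedelta(np.arange(n) / fs, unit='s')
signals = pd.DataFrame({'GSR': np.random.randn(n).cumsum() / 100}, index=index)
annotations = pd.DataFrame({'GSR': [''] * n}, index=index)

signals_cvx, annotations = galvanic_cvx(signals, annotations,
                                        column='GSR',
                                        warmup_duration=15,
                                        threshold_scr=4.0,
                                        epoch_size=60,
                                        epoch_overlap=20)
# signals_cvx has GSR_SCR and GSR_SCL columns; annotations mark the warm-up
# periods and any SCR samples above threshold_scr
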
Example #9
def respiration_sequence_features(data,
                                  events,
                                  column='PZT',
                                  known_sequences=None):
    # nk.rsp_peaks(pzt_signal['PZT'].values, sampling_rate=sampling_rate, method="BioSPPy")

    sampling_rate = estimate_rate(data)

    # Extract peak using neurokit BioSPPy method
    _index = data.index
    _, info = nk.rsp_peaks(data[column],
                           sampling_rate=sampling_rate,
                           method="BioSPPy")
    # Truncate so that first and last events are troughs
    RSP_Troughs = info['RSP_Troughs']
    RSP_Peaks = info['RSP_Peaks']
    # at this point, check that there are some peaks/trough
    if RSP_Troughs.size == 0 or RSP_Peaks.size == 0:
        raise NoRespirationPeaks(
            'No peaks/troughs could be detected in the PZT signal')

    RSP_Peaks = RSP_Peaks[(RSP_Peaks > RSP_Troughs[0])
                          & (RSP_Peaks <= RSP_Troughs[-1])]

    # Estimate Inspiration and Expiration durations, cycle (I+E) durations and amplitude
    I_duration = (RSP_Peaks - RSP_Troughs[:-1]) / sampling_rate
    E_duration = (RSP_Troughs[1:] - RSP_Peaks) / sampling_rate
    IE_duration = np.ediff1d(RSP_Troughs) / sampling_rate
    amplitude = data[column].iloc[RSP_Peaks].values - data[column].iloc[
        RSP_Troughs[:-1]].values
    IE_ratio = I_duration / E_duration

    # Back to Dataframe format to allow extracting feature between events
    I_duration_df = pd.DataFrame(index=_index[RSP_Peaks],
                                 columns=['I_duration'],
                                 data=I_duration)
    E_duration_df = pd.DataFrame(index=_index[RSP_Troughs[1:]],
                                 columns=['E_duration'],
                                 data=E_duration)
    IE_duration_df = pd.DataFrame(index=_index[RSP_Troughs[1:]],
                                  columns=['IE_duration'],
                                  data=IE_duration)
    amplitude_df = pd.DataFrame(index=_index[RSP_Peaks],
                                columns=['IE_amplitude'],
                                data=amplitude)
    IE_ratio_df = pd.DataFrame(index=_index[RSP_Peaks],
                               columns=['IE_ratio'],
                               data=IE_ratio)

    cycles_df = pd.concat([
        I_duration_df, E_duration_df, IE_duration_df, amplitude_df, IE_ratio_df
    ],
                          axis=1)

    known_sequences = known_sequences or VALID_SEQUENCE_KEYS

    features = []
    # for name, row in events.T.iterrows():  # transpose due to https://github.com/OpenMindInnovation/iguazu/issues/54
    for index, row in events.iterrows():
        logger.debug('Processing sequence %s at %s', row.id, index)
        if row.id not in known_sequences:
            continue

        begin = row.begin
        end = row.end
        cycles_sequence = cycles_df.loc[begin:end].copy()

        # extract features on sequence
        sequence_features = dataclass_to_dataframe(
            respiration_features(cycles_sequence)).rename_axis(
                index='id').reset_index()

        sequence_features.insert(0, 'reference', row.id)
        features.append(sequence_features)

    if len(features) > 0:
        features = pd.concat(features,
                             axis='index',
                             ignore_index=True,
                             sort=False)
        logger.info('Generated a feature dataframe of shape %s',
                    features.shape)
    else:
        logger.info('No features were generated')
        features = pd.DataFrame(columns=['id', 'reference', 'value'])

    return features
Example #10
def detect_ssf_peaks(signal,
                     *,
                     fs=None,
                     max_bpm=180,
                     baseline_length=3,
                     threshold_percentage=0.70,
                     peak_memory_size=5):
    """ Detect peaks on a SSF signal

    This function performs the second part of the SSF-based PPG algorithm
    presented in
    Jang et al. (2014).
    "A Real-Time Pulse Peak Detection Algorithm for the Photoplethysmogram."
    International Journal of Electronics and Electrical Engineering,
    https://doi.org/10.12720/ijeee.2.1.45-49
    with some particularities that have been left out from this paper but are
    available on its previous iteration in
    Jang et al. (2014).
    "A robust method for pulse peak determination in a digital volume pulse waveform with a wandering baseline."
    IEEE Transactions on Biomedical Circuits and Systems,
    https://doi.org/10.1109/TBCAS.2013.2295102

    Briefly, it detects peaks on a SSF signal and then calculates and updates
    an adaptive threshold in order to keep only the most prominent peaks which
    correspond to PPG systolic peaks. The threshold is adaptive to account
    for a wandering baseline present on long PPG recordings. Initially, the
    threshold is set to a fraction of the max peaks on a baseline period. After
    this period, the threshold is updated every time a new peak is detected over
    the threshold.

    The timing of the SSF signal is important for this algorithm. For this
    reason, the input cannot be a numpy array; it must be a
    :py:class:`pandas.Series` whose index holds timestamps.

    Parameters
    ----------
    signal: pd.Series
        Input SSF signal with timestamps as index
    fs: float, optional
        Sampling frequency. If not set, it will be deduced from the `signal` index.
    max_bpm: float
        Expected maximum number of beats per minute. This value is used to
        avoid peaks that are too close to each other.
    baseline_length: float
        Length of the baseline period to initialize the adaptive threshold.
    threshold_percentage: float
        Fraction of the maximum or median peak used to update the threshold.
    peak_memory_size: int
        Number of peaks to keep in memory to update the threshold.

    Returns
    -------
    peak_series: pd.Series
        A series with the SSF values at the detected peaks. The index of
        this series can be used to estimate beat-to-beat segments.
    threshold_series: pd.Series
        A series with the values of the dynamic threshold.

    """
    if fs is None:
        fs = estimate_rate(signal)
    if threshold_percentage < 0:
        raise ValueError('threshold_percentage must be positive')

    # Some synonyms for shorter code
    min_dist = int(fs * 60 / max_bpm)  # minimum peak distance in samples at max_bpm
    memsize = peak_memory_size
    memratio = threshold_percentage

    # Peak detection on complete signal, all at once
    i_peaks, props = scipy.signal.find_peaks(signal.values,
                                             distance=min_dist,
                                             prominence=1e-2,
                                             plateau_size=0)
    # Use the left edge of the peak, since the SSF signal often has a plateau
    idx_peaks = signal.index[props['left_edges']]

    # Calculate the initial threshold on a baseline period.
    # In the original paper, this is done on the first 3 seconds of the SSF signal,
    # using 70% of the highest peak in this baseline
    baseline_start = signal.index[0]
    baseline_end = signal.index[0] + np.timedelta64(baseline_length, 's')
    baseline = signal[baseline_start:baseline_end]
    # Peaks outside the baseline become NaN after reindex and are ignored by max()
    threshold = np.nan_to_num(baseline.reindex(idx_peaks).max() * memratio, nan=0)

    # Keep a peak and threshold series
    peaks_series = (baseline[baseline > threshold].reindex(idx_peaks)
                    .dropna().rename('peaks'))
    threshold_series = pd.Series([threshold],
                                 index=[baseline_start],
                                 name='threshold')

    for index, value in signal.loc[baseline_end:].reindex(idx_peaks).items():
        if value > threshold:
            peaks_series.at[index] = value
            # Update the threshold using the last N peaks. The original paper
            # omits the details, but a reference indicates that it should be
            # 70% of the median of the last 5 peaks
            threshold = np.nan_to_num(peaks_series.tail(n=memsize).median() *
                                      memratio,
                                      nan=0)
            threshold_series.at[index] = threshold

    return peaks_series, threshold_series
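
A usage sketch on a synthetic SSF-like series (one pulse per second, i.e. 60 bpm); fs is passed explicitly so estimate_rate is not needed, and the 128 Hz rate is illustrative:

import numpy as np
import pandas as pd

fs = 128  # Hz
n = 60 * fs  # one minute
values = np.zeros(n)
values[::fs] = 1.0  # one SSF pulse every second
index = pd.to_datetime('2020-01-01') + pd.to_timedelta(np.arange(n) / fs, unit='s')
ssf_series = pd.Series(values, index=index, name='PPG_SSF')

peaks, thresholds = detect_ssf_peaks(ssf_series, fs=fs, threshold_percentage=0.5)
# Beat-to-beat intervals from the detected peak timestamps: roughly 1.0 s each
print(peaks.index.to_series().diff().dropna().dt.total_seconds().describe())
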