# Example 1
def test_mstump_wrapper(T, m):
    """The mstump wrapper must agree with the naive reference implementation."""
    expected_P, expected_I = naive_mstump(T, m)
    actual_P, actual_I = mstump(T, m)

    npt.assert_almost_equal(expected_P, actual_P)
    npt.assert_almost_equal(expected_I, actual_I)
# Example 2
def RunMP(aligned_data_root_path, output_path):
    """Run Matrix-Profile (MP) analyses over aligned TILES Fitbit streams.

    Four analyses are gated by the boolean flags below:
      * greedy k-motif discovery + OMP signal reconstruction,
      * anchored/unanchored time-series chains (stumpy.allc),
      * semantic segmentation (stumpy.fluss),
      * multi-dimensional MP (stumpy.mstump).

    Parameters
    ----------
    aligned_data_root_path : path handed to LoadAlignedTILESData.
    output_path : currently unused; kept for interface compatibility.
    """
    do_compute_individual_k_motifs = True
    do_compute_anchored_chains = False
    do_compute_semantic_segmentation = False
    do_compute_multimodal_mp = False
    window_size = 1300  # subsequence length (samples) for every MP call

    data_dict = LoadAlignedTILESData(aligned_data_root_path)

    # NOTE(review): deliberately restricted to the first participant only.
    pids = list(data_dict.keys())[0:1]
    streams = ['HeartRatePPG', 'StepCount']

    # ------------------------------------------------------------------
    # Compute motifs from the individual MP using a greedy method, then
    # reconstruct the signal with Orthogonal Matching Pursuit (OMP).
    # ------------------------------------------------------------------
    if do_compute_individual_k_motifs:
        num_motifs = 2
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            fitbit_df = fitbit_df.iloc[0:10000, :]  # HACK: truncate for speed

            for stream in streams:
                # exclusion_signal keeps the raw NaNs (consulted when vetting
                # motifs); fitbit_df_smooth gets its edge NaNs filled for OMP.
                exclusion_signal = fitbit_df[stream].copy()
                fitbit_df_smooth = exclusion_signal.copy()

                # Fill NaNs at the beginning with the first valid sample.
                if np.isnan(fitbit_df_smooth[0]):
                    idx = 0
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx += 1
                    fitbit_df_smooth[0:idx] = fitbit_df_smooth[idx]
                # Fill NaNs at the end with the last valid sample.
                if np.isnan(fitbit_df_smooth[fitbit_df_smooth.shape[0] - 1]):
                    idx = fitbit_df_smooth.shape[0] - 1
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx -= 1
                    fitbit_df_smooth[idx:] = fitbit_df_smooth[idx]

                # Use Matrix Profile methods to learn a motif dictionary.
                motifs = []
                while len(motifs) < num_motifs:
                    fitbit_mp = stumpy.stump(
                        exclusion_signal,
                        m=window_size)  # TODO - use the exclusion_signal
                    # Candidate motifs ordered best-first (smallest distance).
                    fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                    for motif_idx in range(len(fitbit_mp_argsort)):
                        stream_motif_idx = fitbit_mp_argsort[motif_idx]
                        num_nan = np.sum(
                            np.isnan(exclusion_signal.
                                     values[stream_motif_idx:stream_motif_idx +
                                            window_size]))

                        # Avoid motifs dominated by missing data.
                        if num_nan >= 5.0 * window_size / 6.0:
                            continue
                        break
                    motif_left_idx = fitbit_mp_argsort[motif_idx]
                    motif = fitbit_df_smooth[motif_left_idx:motif_left_idx +
                                             window_size]
                    # OMP requires non-zeros in the support.
                    # NOTE(review): writes into a Series slice of
                    # fitbit_df_smooth — may trigger pandas SettingWithCopy
                    # semantics; confirm the mutation is intended.
                    motif[motif == 0] = 1e-12
                    motifs.append(motif)
                    plt.plot(range(motif_left_idx,
                                   motif_left_idx + window_size),
                             motifs[-1],
                             'g-',
                             linewidth=5)

                # Build a redundant dictionary: every motif at every shift.
                num_repetitions = len(fitbit_df_smooth) - window_size
                dictionary_mat = csr_matrix(
                    (len(motifs) * num_repetitions, len(fitbit_df_smooth)))
                for motif_idx in range(len(motifs)):
                    motif_values = motifs[motif_idx].values
                    for repeat_idx in range(num_repetitions):
                        # SLOW: TODO - find a better way to generate this
                        # matrix, or drop the matrix representation for a
                        # list of (motif, start-index) pairs.
                        dictionary_mat[motif_idx * num_repetitions +
                                       repeat_idx, repeat_idx:repeat_idx +
                                       window_size] = motif_values

                # Reconstruct the signal using the motif dictionary.
                # TODO : write an OMP variant that excludes each atom's
                # support (Gram matrix?), or use L1 optimization (Lasso).
                # NOTE: a large commented-out manual-OMP experiment was
                # removed here; see version control history if needed.
                omp = OrthogonalMatchingPursuitCV(fit_intercept=False)
                omp.fit(dictionary_mat.T, fitbit_df_smooth)
                intercept = omp.intercept_
                coef = omp.coef_
                num_nonzero = omp.n_nonzero_coefs_

                # Rebuild the fitted signal from the non-zero coefficients.
                coef_nonzero = (coef != 0).reshape(-1, )
                dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                coef_reduced = coef[coef_nonzero]
                fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(),
                                       coef_reduced) + intercept
                plt.plot(range(fitbit_df[stream].shape[0]), fitbit_df[stream],
                         'b-')
                plt.plot(range(fitbit_df[stream].shape[0]), fit_signal, 'r--')
                plt.title('OMP (%d coefs) + MP Motifs (%d motifs)' %
                          (num_nonzero, num_motifs))
                plt.xlabel('Time')
                plt.ylabel(stream)
                plt.show()
                # BUGFIX: removed an unreachable pdb.set_trace() that followed
                # this return. The early return (process only the first stream
                # of the first participant) is preserved to keep behavior
                # identical.
                return

    # ------------------------------------------------------------------
    # Individual matrix profiles (stump): chains and/or segmentation.
    # ------------------------------------------------------------------
    if do_compute_anchored_chains or do_compute_semantic_segmentation:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            for stream in streams:
                fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size)

                if do_compute_anchored_chains:
                    left_mp_idx = fitbit_mp[:, 2]
                    right_mp_idx = fitbit_mp[:, 3]
                    all_chain_set, unanchored_chain = stumpy.allc(
                        left_mp_idx, right_mp_idx)

                if do_compute_semantic_segmentation:
                    subseq_len = window_size
                    correct_arc_curve, regime_locations = stumpy.fluss(
                        fitbit_mp[:, 1],
                        L=subseq_len,
                        n_regimes=2,
                        excl_factor=5)

                # Find the first motif with nearly no NaN values in the stream.
                fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                for motif_idx in range(len(fitbit_mp_argsort)):
                    stream_motif_idx = fitbit_mp_argsort[motif_idx]
                    num_nan = np.sum(
                        np.isnan(fitbit_df[stream].
                                 values[stream_motif_idx:stream_motif_idx +
                                        window_size]))

                    # Avoid finding bad motifs.
                    if num_nan >= 5.0 * window_size / 6.0:
                        continue
                    # TODO: extra HeartRatePPG flat/linear checks; live
                    # versions of those checks exist in the multimodal
                    # branch below.
                    break

                num_subplots = 3 if do_compute_semantic_segmentation else 2
                fig, axs = plt.subplots(num_subplots,
                                        sharex=True,
                                        gridspec_kw={'hspace': 0})
                plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                             fontsize='30')
                axs[0].plot(fitbit_df[stream].values)
                # Shade the two best motif occurrences on the raw signal.
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                axs[0].set_ylabel(stream, fontsize='20')
                axs[1].plot(fitbit_mp[:, 0])
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx],
                               linestyle="dashed")
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                               linestyle="dashed")
                axs[1].set_ylabel('Matrix Profile', fontsize='20')

                if do_compute_anchored_chains:
                    # Overlay each link of the unanchored chain.
                    for i in range(unanchored_chain.shape[0]):
                        y = fitbit_df[stream].iloc[
                            unanchored_chain[i]:unanchored_chain[i] +
                            window_size]
                        x = y.index.values
                        axs[0].plot(x, y, linewidth=3)

                if do_compute_semantic_segmentation:
                    axs[2].plot(range(correct_arc_curve.shape[0]),
                                correct_arc_curve,
                                color='C1')
                    axs[0].axvline(x=regime_locations[0], linestyle="dashed")
                    axs[2].axvline(x=regime_locations[0], linestyle="dashed")

                plt.show()

    # ------------------------------------------------------------------
    # Multi-dimensional matrix profiles (mstump).
    # ------------------------------------------------------------------
    if do_compute_multimodal_mp:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            data = fitbit_df.loc[:, streams].values
            mp, mp_indices = stumpy.mstump(data.T, m=window_size)
            # BUGFIX: the code below referenced an undefined `fitbit_mp` and a
            # `stream` variable leaked from an earlier loop, so enabling this
            # flag raised NameError. Bind both explicitly. mstump returns one
            # profile per dimension (d x n); transpose so column 0 is the
            # first stream's profile, matching the single-stream code above.
            fitbit_mp = np.asarray(mp).T
            stream = streams[0]

            # TODO - This code is copied from above. Fix and finish it once
            # mstump supports NaN.
            # Find the first motif with nearly no NaN values in the stream.
            fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
            for motif_idx in range(len(fitbit_mp_argsort)):
                stream_motif_idx = fitbit_mp_argsort[motif_idx]
                num_nan = np.sum(
                    np.isnan(fitbit_df[stream].
                             values[stream_motif_idx:stream_motif_idx +
                                    window_size]))

                # Avoid finding bad motifs.
                if num_nan >= 2:
                    continue
                if stream == 'HeartRatePPG':
                    # Check for flat heart rate (constant NaN-like filler).
                    nan_like_value = 70
                    num_valid = np.count_nonzero(
                        (fitbit_df[stream] -
                         nan_like_value)[stream_motif_idx:stream_motif_idx +
                                         window_size])
                    if num_valid < window_size - 2:
                        continue

                    # Check for (reject) linear heart rate over time.
                    residual_threshold = window_size * (4.0**2)
                    p, res, rank, sing_vals, rcond = np.polyfit(
                        range(window_size),
                        fitbit_df[stream][stream_motif_idx:stream_motif_idx +
                                          window_size],
                        deg=1,
                        full=True)
                    if res < residual_threshold:
                        continue
                break

            fig, axs = plt.subplots(2, sharex=True, gridspec_kw={'hspace': 0})
            plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                         fontsize='30')
            axs[0].plot(fitbit_df[stream].values)
            # Shade the two best motif occurrences on the raw signal.
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            axs[0].set_ylabel(stream, fontsize='20')
            axs[1].plot(fitbit_mp[:, 0])
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx], linestyle="dashed")
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                           linestyle="dashed")
            axs[1].set_ylabel('Matrix Profile', fontsize='20')
            plt.show()

    # Final dummy plot keeps the (non-interactive) figure window open.
    plt.ioff()
    plt.figure()
    plt.plot()
    plt.title('Dummy plot')
    plt.show()
    return
def test_mstump(T, m):
    """Non-normalized mstump must match the maamp reference output."""
    reference = stumpy.maamp(T, m)
    computed = stumpy.mstump(T, m, normalize=False)
    npt.assert_almost_equal(reference, computed)
# Example 4
def test_mstump_int_input():
    """mstump rejects integer input with a TypeError."""
    int_data = np.arange(20).reshape(2, 10)
    with pytest.raises(TypeError):
        mstump(int_data, 5)