Code Example #1
File: test_chains.py  Project: xhochy/stumpy
import numpy as np
import numpy.testing as npt
from stumpy import allc


# Value, IR, and IL are supplied by the test's parametrization in
# test_chains.py (IL/IR are the left/right matrix profile indices).
def test_allc(Value, IR, IL):
    j = 2
    S_left = [
        np.array([1, 7, 11], dtype=np.int64),
        np.array([0], dtype=np.int64),
        np.array([3], dtype=np.int64),
        np.array([9], dtype=np.int64),
        np.array([2, 4, 6, 8, 10], dtype=np.int64),
        np.array([5], dtype=np.int64)
    ]
    C_left = np.array([2, 4, 6, 8, 10], dtype=np.int64)
    S_right, C_right = allc(IL, IR)
    npt.assert_equal(S_left, S_right)
    npt.assert_equal(C_left, C_right)
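In these tests, IL and IR play the role of the left and right matrix profile index columns returned by stumpy.stump, and stumpy.allc turns them into the all-chain set plus the longest unanchored chain. A minimal, self-contained sketch of that pipeline follows; the synthetic series, window size, and variable names are illustrative and not taken from the test fixtures.

import numpy as np
import stumpy

# Toy series with a repeating pattern (values and window size are illustrative)
rng = np.random.default_rng(0)
T = np.tile(np.sin(np.linspace(0, 2 * np.pi, 50)), 6) + 0.1 * rng.standard_normal(300)
m = 50

mp = stumpy.stump(T, m=m)   # columns: profile, index, left index, right index
IL = mp[:, 2]               # left matrix profile indices
IR = mp[:, 3]               # right matrix profile indices

all_chain_set, unanchored_chain = stumpy.allc(IL, IR)
print(len(all_chain_set), unanchored_chain)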
Code Example #2
import numpy as np
import numpy.testing as npt
from stumpy import allc


def test_allc(Value, IR, IL):
    S_ref = [
        np.array([1, 7, 11], dtype=np.int64),
        np.array([0], dtype=np.int64),
        np.array([3], dtype=np.int64),
        np.array([9], dtype=np.int64),
        np.array([2, 4, 6, 8, 10], dtype=np.int64),
        np.array([5], dtype=np.int64),
    ]
    C_ref = np.array([2, 4, 6, 8, 10], dtype=np.int64)
    S_comp, C_comp = allc(IL, IR)

    # Sort chains by (length, contents) so the comparison does not depend
    # on the order in which allc returns the chains in the all-chain set
    S_ref = sorted(S_ref, key=lambda x: (len(x), list(x)))
    S_comp = sorted(S_comp, key=lambda x: (len(x), list(x)))

    npt.assert_equal(S_ref, S_comp)
    npt.assert_equal(C_ref, C_comp)
Code Example #3
import os

import numpy as np
import matplotlib.pyplot as plt
import stumpy

# LoadAlignedTILESData, ReconstructSignalWithDictionary, and ExtractFeatures
# are project-specific helpers defined elsewhere in this repository.


def RunMP(aligned_data_root_path, output_path):
   do_show_plot = False  # assumed default; not defined in this snippet
   min_valid_frame_ratio = 0.5
   min_routine_chain_length = 2
   max_num_motifs = 999999
   window_size = 1300
   streams = ['HeartRatePPG', 'StepCount']

   if not os.path.isdir(output_path):
      os.makedirs(output_path)

   data_dict = LoadAlignedTILESData(aligned_data_root_path)
   pids = list(data_dict.keys())

   # Compute motifs from the individual MP using a greedy method
   for pid in pids:
      signal_df = data_dict[pid]['fitbit']
      #signal_df = signal_df.iloc[0:5000,:] # HACK for speed
      exclusion_mask = np.array(signal_df.shape[0]*[False])

      for stream in streams:
         signal = signal_df[stream]

         # Use Matrix Profile methods to learn a motif dictionary
         motifs = []
         motifs_data = []
         while len(motifs) < max_num_motifs:
            print("Number of motifs found: %d"%(len(motifs)))
            masked_signal = signal.copy()
            masked_signal[exclusion_mask] = np.nan
            fitbit_mp = stumpy.stump(masked_signal, m=window_size)

            left_mp_idx = fitbit_mp[:,2]
            right_mp_idx = fitbit_mp[:,3]
            all_chain_set, unanchored_chain = stumpy.allc(left_mp_idx, right_mp_idx)

            # Reject the chain if any link's window is mostly NaN
            # (num NaN frames >= min_valid_frame_ratio * window_size)
            is_valid_chain = True
            for i in range(unanchored_chain.shape[0]):
               num_nan_frames = np.sum(np.isnan(masked_signal[unanchored_chain[i]:unanchored_chain[i]+window_size]))
               if num_nan_frames >= min_valid_frame_ratio*window_size:
                  is_valid_chain = False
                  break

            if do_show_plot:
               nonroutine_signal = signal.copy()
               nonroutine_signal[exclusion_mask] = np.nan
               fig, ax = plt.subplots(2, sharex=True, gridspec_kw={'hspace':0})
               ax[0].plot(signal_df.index, nonroutine_signal, 'b-')
               for i in range(unanchored_chain.shape[0]):
                  y = signal.iloc[unanchored_chain[i]:unanchored_chain[i]+window_size]
                  x = y.index.values
                  ax[0].plot(x, y, linewidth=3, linestyle='--')
               ax[1].plot(signal_df.index[0:len(fitbit_mp)], fitbit_mp[:,0])
               ax[1].set_ylabel('Matrix Profile')
               plt.show()

            if is_valid_chain and (unanchored_chain.shape[0] >= min_routine_chain_length):
               # TODO - Which part of the chain should be the motif?
               motif = signal.iloc[unanchored_chain[0]:unanchored_chain[0]+window_size].values
               motifs.append(motif)
               motifs_data.append(unanchored_chain)
               for i in range(unanchored_chain.shape[0]):
                  exclusion_mask[unanchored_chain[i]:unanchored_chain[i]+window_size] = True
            else:
               break

         routine_signal, reconstruction_data = ReconstructSignalWithDictionary(signal, motifs)

         features_df = ExtractFeatures(routine_signal, signal, motifs, motifs_data, reconstruction_data)
         features_df.to_csv(os.path.join(output_path, '%s_routine_features.csv'%(pid)), index=False, header=True)

         if do_show_plot:
            fig, ax = plt.subplots(2, sharex=True, gridspec_kw={'hspace':0})
            plt.suptitle('Fitbit %s with routines removed'%(stream))
            ax[0].plot(signal_df.index, signal, 'b-')
            ax[1].plot(signal_df.index, signal, 'b--')
            ax[1].plot(signal_df.index, routine_signal, 'g-', linewidth=3)
            plt.show()

   return
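Stripped of the project-specific helpers and plotting, the loop above follows a simple greedy pattern: compute the matrix profile, extract the longest unanchored chain, NaN-out its support so the next pass cannot reuse it, and repeat. A compact sketch of that pattern on a plain NumPy array is shown below; the helper name, defaults, and stopping rule are illustrative only.

import numpy as np
import stumpy


def greedy_unanchored_chains(signal, m, max_chains=10, min_chain_len=2):
   # Illustrative helper, not part of stumpy or the original project.
   signal = np.asarray(signal, dtype=np.float64).copy()
   chains = []
   while len(chains) < max_chains:
      mp = stumpy.stump(signal, m=m)
      _, chain = stumpy.allc(mp[:, 2], mp[:, 3])  # left/right MP indices
      if chain.shape[0] < min_chain_len:
         break
      chains.append(chain)
      for start in chain:
         # stumpy treats NaN windows as missing, so this support is excluded
         signal[start:start + m] = np.nan
   return chains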
Code Example #4
import pdb

import numpy as np
import matplotlib.pyplot as plt
import stumpy
from scipy.sparse import csr_matrix
from sklearn.linear_model import OrthogonalMatchingPursuitCV

# LoadAlignedTILESData is a project-specific helper defined elsewhere
# in this repository.


def RunMP(aligned_data_root_path, output_path):
    do_compute_individual_k_motifs = True
    do_compute_anchored_chains = False
    do_compute_semantic_segmentation = False
    do_compute_multimodal_mp = False
    window_size = 1300
    #window_size = 1500
    data_dict = LoadAlignedTILESData(aligned_data_root_path)

    #plt.ion()

    pids = list(data_dict.keys())[0:1]
    streams = ['HeartRatePPG', 'StepCount']

    # Compute motifs from the individual MP using a greedy method
    if do_compute_individual_k_motifs:
        num_motifs = 2
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            fitbit_df = fitbit_df.iloc[0:10000, :]  # HACK

            for stream in streams:
                exclusion_signal = fitbit_df[stream].copy()
                # Keep a NaN'd version for MP and interpolated one for OMP
                #nan_replace_value = -1000000
                #fitbit_df[stream][np.isnan(fitbit_df[stream])] = nan_replace_value
                #fitbit_df_smooth = fitbit_df[stream].interpolate(method='linear', axis=0, inplace=False)
                #fitbit_df_smooth = fitbit_df[stream].copy()
                fitbit_df_smooth = exclusion_signal.copy()

                # Fill NaNs at the beginning and end
                if np.isnan(fitbit_df_smooth[0]):
                    idx = 0
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx += 1
                    fitbit_df_smooth[0:idx] = fitbit_df_smooth[idx]
                if np.isnan(fitbit_df_smooth[fitbit_df_smooth.shape[0] - 1]):
                    idx = fitbit_df_smooth.shape[0] - 1
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx -= 1
                    fitbit_df_smooth[idx:] = fitbit_df_smooth[idx]

                # Use Matrix Profile methods to learn a motif dictionary
                motifs = []
                while len(motifs) < num_motifs:
                    #fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size) # TODO - use the exclusion_signal
                    fitbit_mp = stumpy.stump(
                        exclusion_signal,
                        m=window_size)  # TODO - use the exclusion_signal
                    fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                    for motif_idx in range(len(fitbit_mp_argsort)):
                        stream_motif_idx = fitbit_mp_argsort[motif_idx]
                        num_nan = np.sum(
                            np.isnan(exclusion_signal.
                                     values[stream_motif_idx:stream_motif_idx +
                                            window_size]))

                        # Avoid finding bad motifs
                        if num_nan >= 5.0 * window_size / 6.0:
                            continue
                        if stream == 'HeartRatePPG':
                            pass
                        break
                    motif_left_idx = fitbit_mp_argsort[motif_idx]
                    motif = fitbit_df_smooth[motif_left_idx:motif_left_idx + window_size]
                    motif[motif == 0] = 1e-12  # OMP requires non-zeros in the support
                    motifs.append(motif)
                    plt.plot(range(motif_left_idx,
                                   motif_left_idx + window_size),
                             motifs[-1],
                             'g-',
                             linewidth=5)

                # Build a redundant dictionary from the motifs
                num_repetitions = len(fitbit_df_smooth) - window_size
                dictionary_mat = csr_matrix(
                    (len(motifs) * num_repetitions, len(fitbit_df_smooth)))
                for motif_idx in range(len(motifs)):
                    motif_values = motifs[motif_idx].values
                    for repeat_idx in range(num_repetitions):
                        # SLOW: TODO - find better way of generating this matrix.  Maybe I can change the sparse encoding directly and just push extra zeros in front of the motif sequence? Better yet, why not abandon the matrix representation and just use a list of motifs and their starting index in the signal
                        dictionary_mat[motif_idx * num_repetitions +
                                       repeat_idx, repeat_idx:repeat_idx +
                                       window_size] = motif_values

                # Reconstruct the signal using the motif dictionary
                # TODO : Write my own OMP with exclusion of each atom's support. Gram mat?
                # TODO : Use L1 optimization (Lasso)?
                #omp = OrthogonalMatchingPursuit(n_nonzero_coefs=2, fit_intercept=False)
                omp = OrthogonalMatchingPursuitCV(fit_intercept=False)
                omp.fit(dictionary_mat.T, fitbit_df_smooth)
                intercept = omp.intercept_
                coef = omp.coef_
                idx_r = coef.nonzero()
                num_nonzero = omp.n_nonzero_coefs_

                #max_nonzero = 20
                #skip_nan_percent = 0.1
                #coef = np.zeros((dictionary_mat.T.shape[1],1))
                #intercept = np.zeros((dictionary_mat.T.shape[0],1))
                #for num_nonzero in range(1,max_nonzero+1):
                #   # Reconstruct the signal using the motif dictionary
                #   best_dict_idx = -1
                #   best_error = np.inf
                #   best_dict_support = None
                #   for dict_idx in range(dictionary_mat.shape[0]):
                #      # SLOW
                #      dict_vec = dictionary_mat[dict_idx,:].toarray().reshape(-1,)

                #      # Find the support
                #      left_support_idx = 0
                #      right_support_idx = len(dict_vec)-1
                #      while dict_vec[left_support_idx] == 0 and left_support_idx < len(dict_vec):
                #         left_support_idx += 1
                #      while dict_vec[right_support_idx] == 0 and right_support_idx >= 0:
                #         right_support_idx -= 1

                #      # Skip mostly NaN regions
                #      if np.sum(np.isnan(exclusion_signal[left_support_idx:right_support_idx+1])) > skip_nan_percent*(right_support_idx-left_support_idx+1):
                #         continue

                #      # Find the best match
                #      residual = exclusion_signal[left_support_idx:right_support_idx+1] - dict_vec[left_support_idx:right_support_idx+1]
                #      np.nan_to_num(residual, copy=False) # Replace NaN with zero
                #      error = np.dot(residual, residual)
                #      if error < best_error:
                #         best_error = error
                #         coef_val = 1 # TODO - constrain between 0.5 and 2?
                #         best_dict_idx = dict_idx
                #         best_dict_support = (left_support_idx, right_support_idx)

                #   if best_dict_idx < 0:
                #      print("No best next dictionary element found")
                #      break

                #   # Update coef
                #   coef_nonzero = (coef != 0).reshape(-1,)
                #   if np.sum(coef_nonzero) > 0:
                #      dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                #      coef_reduced = coef[coef_nonzero]

                #      #prev_fit_signal = np.matmul(dictionary_mat.T, coef)
                #      prev_fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #      prev_residual = fitbit_df_smooth - prev_fit_signal.reshape(-1,)
                #      np.nan_to_num(prev_residual, copy=False) # Replace NaN with zero
                #      prev_error = np.dot(prev_residual, prev_residual)

                #      coef[best_dict_idx] = coef_val
                #      #fit_signal = np.matmul(dictionary_mat.T, coef)
                #      fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #      fit_residual = fitbit_df_smooth - fit_signal.reshape(-1,)
                #      np.nan_to_num(fit_residual, copy=False) # Replace NaN with zero
                #      fit_error = np.dot(fit_residual, fit_residual)
                #   else:
                #      prev_residual = fitbit_df_smooth- np.zeros(len(fitbit_df_smooth))
                #      np.nan_to_num(prev_residual, copy=False) # Replace NaN with zero
                #      prev_error = np.dot(prev_residual, prev_residual)

                #      coef[best_dict_idx] = coef_val
                #      coef_nonzero = (coef != 0).reshape(-1,)
                #      dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                #      coef_reduced = coef[coef_nonzero]

                #      fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #      fit_residual = fitbit_df_smooth - fit_signal.reshape(-1,)
                #      np.nan_to_num(fit_residual, copy=False) # Replace NaN with zero
                #      fit_error = np.dot(fit_residual, fit_residual)

                #   if best_dict_support is not None:
                #      exclusion_signal[best_dict_support[0]:best_dict_support[1]+1] = np.inf

                #   if prev_error < fit_error:
                #      print("Avoiding overfitting...")
                #      coef[best_dict_idx,0] = 0
                #      break

                coef_nonzero = (coef != 0).reshape(-1, )
                dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                coef_reduced = coef[coef_nonzero]
                fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(),
                                       coef_reduced) + intercept
                plt.plot(range(fitbit_df[stream].shape[0]), fitbit_df[stream],
                         'b-')
                #plt.plot(range(fitbit_df_smooth.shape[0]), fitbit_df_smooth, 'k-')
                plt.plot(range(fitbit_df[stream].shape[0]), fit_signal, 'r--')
                plt.title('OMP (%d coefs) + MP Motifs (%d motifs)' %
                          (num_nonzero, num_motifs))
                plt.xlabel('Time')
                plt.ylabel(stream)
                plt.show()
                return
                pdb.set_trace()

    # Compute individual matrix profiles (stump)
    if do_compute_anchored_chains or do_compute_semantic_segmentation:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            for stream in streams:
                fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size)

                if do_compute_anchored_chains:
                    left_mp_idx = fitbit_mp[:, 2]
                    right_mp_idx = fitbit_mp[:, 3]
                    #atsc_idx = 10
                    #anchored_chain = stumpy.atsc(left_mp_idx, right_mp_idx, atsc_idx)
                    all_chain_set, unanchored_chain = stumpy.allc(
                        left_mp_idx, right_mp_idx)

                if do_compute_semantic_segmentation:
                    subseq_len = window_size
                    correct_arc_curve, regime_locations = stumpy.fluss(
                        fitbit_mp[:, 1],
                        L=subseq_len,
                        n_regimes=2,
                        excl_factor=5)

                # Find the first motif with nearly no NaN values in the stream signal
                fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                for motif_idx in range(len(fitbit_mp_argsort)):
                    stream_motif_idx = fitbit_mp_argsort[motif_idx]
                    num_nan = np.sum(
                        np.isnan(fitbit_df[stream].
                                 values[stream_motif_idx:stream_motif_idx +
                                        window_size]))

                    # Avoid finding bad motifs
                    if num_nan >= 5.0 * window_size / 6.0:
                        continue
                    if stream == 'HeartRatePPG':
                        pass
                        # Check for flat heart rate
                        #nan_like_value = 70
                        #num_valid = np.count_nonzero((fitbit_df[stream] - nan_like_value)[stream_motif_idx:stream_motif_idx+window_size])
                        #if num_valid < window_size - 2:
                        #   continue

                        # Check for linear heart rate over time
                        #residual_threshold = window_size*(4.0**2)
                        #p, res, rank, sing_vals, rcond = np.polyfit(range(window_size), fitbit_df[stream][stream_motif_idx:stream_motif_idx+window_size], deg=1, full=True)
                        #if res < residual_threshold:
                        #   continue
                    break

                num_subplots = 3 if do_compute_semantic_segmentation else 2
                fig, axs = plt.subplots(num_subplots,
                                        sharex=True,
                                        gridspec_kw={'hspace': 0})
                plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                             fontsize='30')
                axs[0].plot(fitbit_df[stream].values)
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                axs[0].set_ylabel(stream, fontsize='20')
                axs[1].plot(fitbit_mp[:, 0])
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx],
                               linestyle="dashed")
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                               linestyle="dashed")
                axs[1].set_ylabel('Matrix Profile', fontsize='20')

                if do_compute_anchored_chains:
                    for i in range(unanchored_chain.shape[0]):
                        y = fitbit_df[stream].iloc[
                            unanchored_chain[i]:unanchored_chain[i] +
                            window_size]
                        x = y.index.values
                        axs[0].plot(x, y, linewidth=3)

                if do_compute_semantic_segmentation:
                    axs[2].plot(range(correct_arc_curve.shape[0]),
                                correct_arc_curve,
                                color='C1')
                    axs[0].axvline(x=regime_locations[0], linestyle="dashed")
                    axs[2].axvline(x=regime_locations[0], linestyle="dashed")

                plt.show()

    # Compute multi-dimensional matrix profiles (mstump)
    if do_compute_multimodal_mp:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            data = fitbit_df.loc[:, streams].values
            mp, mp_indices = stumpy.mstump(data.T, m=window_size)
            #print("Stumpy's mstump function does not handle NaN values. Skipping multi-dimensional MP")
            #break

            # TODO - This code is copied from above. Fix and finish it once mstump supports NaN
            # Find the first motif with nearly no NaN values in the stream signal
            fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
            for motif_idx in range(len(fitbit_mp_argsort)):
                stream_motif_idx = fitbit_mp_argsort[motif_idx]
                num_nan = np.sum(
                    np.isnan(fitbit_df[stream].
                             values[stream_motif_idx:stream_motif_idx +
                                    window_size]))

                # Avoid finding bad motifs
                if num_nan >= 2:
                    continue
                if stream == 'HeartRatePPG':
                    # Check for flat heart rate
                    nan_like_value = 70
                    num_valid = np.count_nonzero(
                        (fitbit_df[stream] -
                         nan_like_value)[stream_motif_idx:stream_motif_idx +
                                         window_size])
                    if num_valid < window_size - 2:
                        continue

                    # Check for linear heart rate over time
                    residual_threshold = window_size * (4.0**2)
                    p, res, rank, sing_vals, rcond = np.polyfit(
                        range(window_size),
                        fitbit_df[stream][stream_motif_idx:stream_motif_idx +
                                          window_size],
                        deg=1,
                        full=True)
                    if res < residual_threshold:
                        continue
                break

            fig, axs = plt.subplots(2, sharex=True, gridspec_kw={'hspace': 0})
            plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                         fontsize='30')
            axs[0].plot(fitbit_df[stream].values)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            axs[0].set_ylabel(stream, fontsize='20')
            axs[1].plot(fitbit_mp[:, 0])
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx], linestyle="dashed")
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                           linestyle="dashed")
            axs[1].set_ylabel('Matrix Profile', fontsize='20')
            plt.show()

    plt.ioff()
    plt.figure()
    plt.plot()
    plt.title('Dummy plot')
    plt.show()
    return
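The OMP step in this example boils down to: build a redundant dictionary whose atoms are time-shifted copies of each discovered motif, then let OrthogonalMatchingPursuitCV choose a sparse combination of atoms that reconstructs the signal. A condensed, dense-matrix sketch of that idea follows; the helper shift_dictionary, the synthetic signal, and the single motif are illustrative only (the original code fills a scipy csr_matrix over the Fitbit streams).

import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuitCV


def shift_dictionary(motifs, n):
    # Stack every time-shifted copy of every motif as a row (one atom per row).
    # Illustrative helper; the original code builds a sparse csr_matrix instead.
    rows = []
    for motif in motifs:
        m = len(motif)
        for start in range(n - m):
            row = np.zeros(n)
            row[start:start + m] = motif
            rows.append(row)
    return np.vstack(rows)


# Synthetic placeholder signal containing two shifted copies of one motif
n = 400
motif = np.sin(np.linspace(0, 2 * np.pi, 50))
signal = np.zeros(n)
signal[30:80] += motif
signal[200:250] += 0.8 * motif

D = shift_dictionary([motif], n)                # shape: (n_atoms, n)
omp = OrthogonalMatchingPursuitCV(fit_intercept=False)
omp.fit(D.T, signal)                            # X: (n_samples, n_atoms)
reconstruction = D.T @ omp.coef_
print("non-zero atoms:", omp.n_nonzero_coefs_)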