def test_allc(Value, IR, IL):
    """Check ``allc`` against the expected chains for the ``IL``/``IR`` inputs.

    ``allc(IL, IR)`` returns the collection of all chains and a single chain.
    The collection is compared irrespective of the order in which ``allc``
    returns its members, so the test does not rely on an ordering that the
    chain collection presumably does not guarantee.

    Args:
        Value: Not used by this test; kept so the signature is unchanged.
        IR: Passed to ``allc`` as its second argument.
        IL: Passed to ``allc`` as its first argument.
    """
    S_left = [
        np.array([1, 7, 11], dtype=np.int64),
        np.array([0], dtype=np.int64),
        np.array([3], dtype=np.int64),
        np.array([9], dtype=np.int64),
        np.array([2, 4, 6, 8, 10], dtype=np.int64),
        np.array([5], dtype=np.int64),
    ]
    C_left = np.array([2, 4, 6, 8, 10], dtype=np.int64)

    S_right, C_right = allc(IL, IR)

    def chain_key(chain):
        # Order by length first, then by content, to get a canonical ordering.
        return (len(chain), list(chain))

    npt.assert_equal(
        sorted(S_left, key=chain_key),
        sorted(S_right, key=chain_key),
    )
    npt.assert_equal(C_left, C_right)
def test_allc(Value, IR, IL):
    """Verify the chains returned by ``allc(IL, IR)`` against reference values.

    Both the reference list of chains and the computed one are put into the
    same canonical order (by length, then by content) before being compared;
    the single chain returned alongside is compared directly.

    Args:
        Value: Not used by this test.
        IR: Passed to ``allc`` as its second argument.
        IL: Passed to ``allc`` as its first argument.
    """

    def canonical(chains):
        # Same ordering rule for both sides of the comparison.
        return sorted(chains, key=lambda chain: (len(chain), list(chain)))

    reference_members = (
        [1, 7, 11],
        [0],
        [3],
        [9],
        [2, 4, 6, 8, 10],
        [5],
    )
    expected_chains = canonical(
        [np.array(member, dtype=np.int64) for member in reference_members]
    )
    expected_chain = np.array([2, 4, 6, 8, 10], dtype=np.int64)

    computed_chains, computed_chain = allc(IL, IR)

    npt.assert_equal(expected_chains, canonical(computed_chains))
    npt.assert_equal(expected_chain, computed_chain)
def RunMP(aligned_data_root_path, output_path):
    """Mine routine motifs from each participant's Fitbit streams and save features.

    For every participant in the data returned by ``LoadAlignedTILESData`` and
    for each stream in ``streams``, motifs are collected greedily:

    1. The stream is copied and every sample already claimed by an earlier
       chain is set to NaN.
    2. ``stumpy.stump`` is run on that masked copy and the chain returned as
       the second value of ``stumpy.allc`` is taken as the next candidate.
    3. The candidate is rejected, and the search for this stream ends, when
       any of its windows holds at least ``min_valid_frame_ratio`` NaN samples
       or when it has fewer than ``min_routine_chain_length`` members.
    4. Otherwise the first window of the chain becomes a motif and every
       window of the chain is added to the exclusion mask.

    The stream is then passed to ``ReconstructSignalWithDictionary`` and
    ``ExtractFeatures``, and the resulting frame is written to
    ``<output_path>/<pid>_routine_features.csv``.

    ``do_show_plot`` is not defined in this function; it is read from the
    enclosing scope and enables the diagnostic plots when truthy.

    Args:
        aligned_data_root_path: Forwarded to ``LoadAlignedTILESData``.
        output_path: Directory that receives the feature CSV files; it is
            created when it does not exist.
    """
    min_valid_frame_ratio = 0.5
    min_routine_chain_length = 2
    max_num_motifs = 999999
    window_size = 1300
    streams = ['HeartRatePPG', 'StepCount']

    # exist_ok avoids the race between a separate isdir() check and makedirs().
    os.makedirs(output_path, exist_ok=True)

    data_dict = LoadAlignedTILESData(aligned_data_root_path)
    pids = list(data_dict.keys())

    # A window is unusable once this many of its samples are NaN.
    max_nan_frames = min_valid_frame_ratio * window_size

    # Compute motifs from the individual MP using a greedy method
    for pid in pids:
        signal_df = data_dict[pid]['fitbit']

        # NOTE(review): the mask is created once per participant, so windows
        # claimed while processing one stream stay excluded for the next
        # stream as well - confirm this sharing is intended.
        exclusion_mask = np.zeros(signal_df.shape[0], dtype=bool)

        for stream in streams:
            signal = signal_df[stream]

            # Use Matrix Profile methods to learn a motif dictionary
            motifs = []
            motifs_data = []
            while len(motifs) < max_num_motifs:
                print("Number of motifs found: %d"%(len(motifs)))

                masked_signal = signal.copy()
                masked_signal[exclusion_mask] = np.nan
                fitbit_mp = stumpy.stump(masked_signal, m=window_size)
                left_mp_idx = fitbit_mp[:,2]
                right_mp_idx = fitbit_mp[:,3]
                _, unanchored_chain = stumpy.allc(left_mp_idx, right_mp_idx)

                # Every window of the chain must have fewer NaN samples than
                # the limit; an empty chain passes here and is rejected by
                # the length check below.
                is_valid_chain = all(
                    np.sum(np.isnan(masked_signal[start:start+window_size])) < max_nan_frames
                    for start in unanchored_chain
                )

                if do_show_plot:
                    nonroutine_signal = signal.copy()
                    nonroutine_signal[exclusion_mask] = np.nan
                    _, ax = plt.subplots(2, sharex=True, gridspec_kw={'hspace':0})
                    ax[0].plot(signal_df.index, nonroutine_signal, 'b-')
                    for start in unanchored_chain:
                        y = signal.iloc[start:start+window_size]
                        x = y.index.values
                        ax[0].plot(x, y, linewidth=3, linestyle='--')
                    ax[1].plot(signal_df.index[0:len(fitbit_mp)], fitbit_mp[:,0])
                    ax[1].set_ylabel('Matrix Profile')
                    plt.show()

                if not (is_valid_chain and unanchored_chain.shape[0] >= min_routine_chain_length):
                    break

                # TODO - Which part of the chain should be the motif?
                first_start = unanchored_chain[0]
                motifs.append(signal.iloc[first_start:first_start+window_size].values)
                motifs_data.append(unanchored_chain)
                for start in unanchored_chain:
                    exclusion_mask[start:start+window_size] = True

            routine_signal, reconstruction_data = ReconstructSignalWithDictionary(signal, motifs)
            features_df = ExtractFeatures(routine_signal, signal, motifs, motifs_data, reconstruction_data)

            # NOTE(review): the file name depends only on the participant id,
            # so each stream overwrites the file written for the previous
            # stream - confirm whether the stream name should be part of it.
            features_df.to_csv(os.path.join(output_path, '%s_routine_features.csv'%(pid)), index=False, header=True)

            if do_show_plot:
                _, ax = plt.subplots(2, sharex=True, gridspec_kw={'hspace':0})
                plt.suptitle('Fitbit %s with routines removed'%(stream))
                ax[0].plot(signal_df.index, signal, 'b-')
                ax[1].plot(signal_df.index, signal, 'b--')
                ax[1].plot(signal_df.index, routine_signal, 'g-', linewidth=3)
                plt.show()
def RunMP(aligned_data_root_path, output_path):
    """Run one of several matrix-profile experiments on the aligned Fitbit data.

    Four hard-coded ``do_compute_*`` switches select the experiment:

    * individual k-motifs (the only one enabled here): pick ``num_motifs``
      windows with the smallest matrix-profile value, build a dictionary of
      every shift of each motif, fit ``OrthogonalMatchingPursuitCV`` to the
      stream and plot the fit;
    * anchored chains / semantic segmentation: plot the matrix profile with
      the chain from ``stumpy.allc`` and/or the ``stumpy.fluss`` output;
    * multi-dimensional matrix profile via ``stumpy.mstump``.

    Only the first participant (``pids[0:1]``) is processed.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation had been lost. In particular, confirm the position of the
    ``return`` that ends the k-motif branch.

    Args:
        aligned_data_root_path: Forwarded to ``LoadAlignedTILESData``.
        output_path: Not referenced anywhere in this function.
    """
    # Experiment switches; edit these to choose what the function does.
    do_compute_individual_k_motifs = True
    do_compute_anchored_chains = False
    do_compute_semantic_segmentation = False
    do_compute_multimodal_mp = False
    window_size = 1300
    #window_size = 1500

    data_dict = LoadAlignedTILESData(aligned_data_root_path)

    #plt.ion()
    pids = list(data_dict.keys())[0:1]  # first participant only
    streams = ['HeartRatePPG', 'StepCount']

    # Compute motifs from the individual MP using a greedy method
    if do_compute_individual_k_motifs:
        num_motifs = 2
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            fitbit_df = fitbit_df.iloc[0:10000, :]  # HACK: keep only the first 10000 rows
            for stream in streams:
                exclusion_signal = fitbit_df[stream].copy()

                # Keep a NaN'd version for MP and interpolated one for OMP
                #nan_replace_value = -1000000
                #fitbit_df[stream][np.isnan(fitbit_df[stream])] = nan_replace_value
                #fitbit_df_smooth = fitbit_df[stream].interpolate(method='linear', axis=0, inplace=False)
                #fitbit_df_smooth = fitbit_df[stream].copy()
                fitbit_df_smooth = exclusion_signal.copy()
                # NOTE(review): the integer subscripts below assume that
                # ``series[i]`` addresses the i-th sample - confirm for the
                # index type of this frame.
                if np.isnan(fitbit_df_smooth[0]):  # Fill NaNs at the beginning and end
                    idx = 0
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx += 1
                    fitbit_df_smooth[0:idx] = fitbit_df_smooth[idx]
                if np.isnan(fitbit_df_smooth[fitbit_df_smooth.shape[0] - 1]):
                    idx = fitbit_df_smooth.shape[0] - 1
                    while np.isnan(fitbit_df_smooth[idx]):
                        idx -= 1
                    fitbit_df_smooth[idx:] = fitbit_df_smooth[idx]

                # Use Matrix Profile methods to learn a motif dictionary
                # NOTE(review): ``exclusion_signal`` is not modified inside
                # this loop, so every iteration computes the matrix profile
                # of the same input and presumably selects the same window.
                motifs = []
                while len(motifs) < num_motifs:
                    #fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size) # TODO - use the exclusion_signal
                    fitbit_mp = stumpy.stump(
                        exclusion_signal,
                        m=window_size)  # TODO - use the exclusion_signal
                    # Candidate window starts, smallest matrix-profile value first.
                    fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                    # Stop at the first candidate that is not mostly NaN. If
                    # every candidate is skipped, ``motif_idx`` is left at
                    # the last position.
                    for motif_idx in range(len(fitbit_mp_argsort)):
                        stream_motif_idx = fitbit_mp_argsort[motif_idx]
                        num_nan = np.sum(
                            np.isnan(exclusion_signal.
                                     values[stream_motif_idx:stream_motif_idx +
                                            window_size]))

                        # Avoid finding bad motifs
                        if num_nan >= 5.0 * window_size / 6.0:
                            continue

                        if stream == 'HeartRatePPG':
                            pass
                        break

                    motif_left_idx = fitbit_mp_argsort[motif_idx]
                    motif = fitbit_df_smooth[motif_left_idx:motif_left_idx +
                                             window_size]
                    # NOTE(review): ``motif`` is a slice of
                    # ``fitbit_df_smooth``; whether this assignment also
                    # changes the parent series depends on pandas - confirm.
                    motif[motif == 0] = 1e-12  # OMP requires non-zeros in the support
                    motifs.append(motif)
                    plt.plot(range(motif_left_idx,
                                   motif_left_idx + window_size),
                             motifs[-1],
                             'g-',
                             linewidth=5)

                # Build a redundant dictionary from the motifs
                # One row per (motif, shift) pair; row
                # ``motif_idx * num_repetitions + repeat_idx`` holds the motif
                # values in columns ``repeat_idx .. repeat_idx + window_size``.
                num_repetitions = len(fitbit_df_smooth) - window_size
                dictionary_mat = csr_matrix(
                    (len(motifs) * num_repetitions, len(fitbit_df_smooth)))
                for motif_idx in range(len(motifs)):
                    motif_values = motifs[motif_idx].values
                    for repeat_idx in range(num_repetitions):
                        # SLOW: TODO - find better way of generating this matrix. Maybe I can change the sparse encoding directly and just push extra zeros in front of the motif sequence? Better yet, why not abandon the matrix representation and just use a list of motifs and their starting index in the signal
                        dictionary_mat[motif_idx * num_repetitions +
                                       repeat_idx, repeat_idx:repeat_idx +
                                       window_size] = motif_values

                # Reconstruct the signal using the motif dictionary
                # TODO : Write my own OMP with exclusion of each atom's support. Gram mat?
                # TODO : Use L1 optimization (Lasso)?
                #omp = OrthogonalMatchingPursuit(n_nonzero_coefs=2, fit_intercept=False)
                omp = OrthogonalMatchingPursuitCV(fit_intercept=False)
                # The transposed dictionary puts one column per (motif, shift) atom.
                omp.fit(dictionary_mat.T, fitbit_df_smooth)
                intercept = omp.intercept_
                coef = omp.coef_
                idx_r = coef.nonzero()  # not used below
                num_nonzero = omp.n_nonzero_coefs_

                # Disabled alternative: a hand-written greedy pursuit.
                #max_nonzero = 20
                #skip_nan_percent = 0.1
                #coef = np.zeros((dictionary_mat.T.shape[1],1))
                #intercept = np.zeros((dictionary_mat.T.shape[0],1))
                #for num_nonzero in range(1,max_nonzero+1):
                #    # Reconstruct the signal using the motif dictionary
                #    best_dict_idx = -1
                #    best_error = np.inf
                #    best_dict_support = None
                #    for dict_idx in range(dictionary_mat.shape[0]):
                #        # SLOW
                #        dict_vec = dictionary_mat[dict_idx,:].toarray().reshape(-1,)
                #        # Find the support
                #        left_support_idx = 0
                #        right_support_idx = len(dict_vec)-1
                #        while dict_vec[left_support_idx] == 0 and left_support_idx < len(dict_vec):
                #            left_support_idx += 1
                #        while dict_vec[right_support_idx] == 0 and right_support_idx >= 0:
                #            right_support_idx -= 1
                #        # Skip mostly NaN regions
                #        if np.sum(np.isnan(exclusion_signal[left_support_idx:right_support_idx+1])) > skip_nan_percent*(right_support_idx-left_support_idx+1):
                #            continue
                #        # Find the best match
                #        residual = exclusion_signal[left_support_idx:right_support_idx+1] - dict_vec[left_support_idx:right_support_idx+1]
                #        np.nan_to_num(residual, copy=False) # Replace NaN with zero
                #        error = np.dot(residual, residual)
                #        if error < best_error:
                #            best_error = error
                #            coef_val = 1 # TODO - constrain between 0.5 and 2?
                #            best_dict_idx = dict_idx
                #            best_dict_support = (left_support_idx, right_support_idx)
                #    if best_dict_idx < 0:
                #        print("No best next dictionary element found")
                #        break
                #    # Update coef
                #    coef_nonzero = (coef != 0).reshape(-1,)
                #    if np.sum(coef_nonzero) > 0:
                #        dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                #        coef_reduced = coef[coef_nonzero]
                #        #prev_fit_signal = np.matmul(dictionary_mat.T, coef)
                #        prev_fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #        prev_residual = fitbit_df_smooth - prev_fit_signal.reshape(-1,)
                #        np.nan_to_num(prev_residual, copy=False) # Replace NaN with zero
                #        prev_error = np.dot(prev_residual, prev_residual)
                #        coef[best_dict_idx] = coef_val
                #        #fit_signal = np.matmul(dictionary_mat.T, coef)
                #        fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #        fit_residual = fitbit_df_smooth - fit_signal.reshape(-1,)
                #        np.nan_to_num(fit_residual, copy=False) # Replace NaN with zero
                #        fit_error = np.dot(fit_residual, fit_residual)
                #    else:
                #        prev_residual = fitbit_df_smooth- np.zeros(len(fitbit_df_smooth))
                #        np.nan_to_num(prev_residual, copy=False) # Replace NaN with zero
                #        prev_error = np.dot(prev_residual, prev_residual)
                #        coef[best_dict_idx] = coef_val
                #        coef_nonzero = (coef != 0).reshape(-1,)
                #        dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                #        coef_reduced = coef[coef_nonzero]
                #        fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(), coef_reduced)
                #        fit_residual = fitbit_df_smooth - fit_signal.reshape(-1,)
                #        np.nan_to_num(fit_residual, copy=False) # Replace NaN with zero
                #        fit_error = np.dot(fit_residual, fit_residual)
                #    if best_dict_support is not None:
                #        exclusion_signal[best_dict_support[0]:best_dict_support[1]+1] = np.inf
                #    if prev_error < fit_error:
                #        print("Avoiding overfitting...")
                #        coef[best_dict_idx,0] = 0
                #        break

                # Rebuild the fitted signal from the atoms with a non-zero
                # coefficient only, so that only those rows are densified.
                coef_nonzero = (coef != 0).reshape(-1, )
                dictionary_mat_reduced = dictionary_mat[coef_nonzero, :]
                coef_reduced = coef[coef_nonzero]
                fit_signal = np.matmul(dictionary_mat_reduced.T.toarray(),
                                       coef_reduced) + intercept

                plt.plot(range(fitbit_df[stream].shape[0]), fitbit_df[stream],
                         'b-')
                #plt.plot(range(fitbit_df_smooth.shape[0]), fitbit_df_smooth, 'k-')
                plt.plot(range(fitbit_df[stream].shape[0]), fit_signal, 'r--')
                plt.title('OMP (%d coefs) + MP Motifs (%d motifs)' %
                          (num_nonzero, num_motifs))
                plt.xlabel('Time')
                plt.ylabel(stream)
                plt.show()
        # NOTE(review): with the k-motif switch enabled the function ends
        # here, so the breakpoint on the next line and the branches further
        # down are never reached.
        return
        pdb.set_trace()

    # Compute individual matrix profiles (stump)
    if do_compute_anchored_chains or do_compute_semantic_segmentation:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            for stream in streams:
                fitbit_mp = stumpy.stump(fitbit_df[stream], m=window_size)
                if do_compute_anchored_chains:
                    left_mp_idx = fitbit_mp[:, 2]
                    right_mp_idx = fitbit_mp[:, 3]
                    #atsc_idx = 10
                    #anchored_chain = stumpy.atsc(left_mp_idx, right_mp_idx, atsc_idx)
                    all_chain_set, unanchored_chain = stumpy.allc(
                        left_mp_idx, right_mp_idx)

                if do_compute_semantic_segmentation:
                    subseq_len = window_size
                    correct_arc_curve, regime_locations = stumpy.fluss(
                        fitbit_mp[:, 1],
                        L=subseq_len,
                        n_regimes=2,
                        excl_factor=5)

                # Find the first motif with nearly no NaN values in the stream signal
                fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
                for motif_idx in range(len(fitbit_mp_argsort)):
                    stream_motif_idx = fitbit_mp_argsort[motif_idx]
                    num_nan = np.sum(
                        np.isnan(fitbit_df[stream].
                                 values[stream_motif_idx:stream_motif_idx +
                                        window_size]))

                    # Avoid finding bad motifs
                    if num_nan >= 5.0 * window_size / 6.0:
                        continue

                    if stream == 'HeartRatePPG':
                        pass
                        # Check for flat heart rate
                        #nan_like_value = 70
                        #num_valid = np.count_nonzero((fitbit_df[stream] - nan_like_value)[stream_motif_idx:stream_motif_idx+window_size])
                        #if num_valid < window_size - 2:
                        #    continue

                        # Check for linear heart rate over time
                        #residual_threshold = window_size*(4.0**2)
                        #p, res, rank, sing_vals, rcond = np.polyfit(range(window_size), fitbit_df[stream][stream_motif_idx:stream_motif_idx+window_size], deg=1, full=True)
                        #if res < residual_threshold:
                        #    continue
                    break

                num_subplots = 3 if do_compute_semantic_segmentation else 2
                fig, axs = plt.subplots(num_subplots,
                                        sharex=True,
                                        gridspec_kw={'hspace': 0})
                plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                             fontsize='30')
                axs[0].plot(fitbit_df[stream].values)
                # Shade the selected window and the next entry in the sorted
                # order.
                # NOTE(review): ``motif_idx + 1`` is not bounds-checked and
                # would overrun if the last candidate was selected - confirm.
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                     window_size,
                                     2000,
                                     facecolor='lightgrey')
                axs[0].add_patch(rect)
                axs[0].set_ylabel(stream, fontsize='20')
                axs[1].plot(fitbit_mp[:, 0])
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx],
                               linestyle="dashed")
                axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                               linestyle="dashed")
                axs[1].set_ylabel('Matrix Profile', fontsize='20')

                if do_compute_anchored_chains:
                    # Overlay every window of the chain on the raw stream.
                    for i in range(unanchored_chain.shape[0]):
                        y = fitbit_df[stream].iloc[
                            unanchored_chain[i]:unanchored_chain[i] +
                            window_size]
                        x = y.index.values
                        axs[0].plot(x, y, linewidth=3)

                if do_compute_semantic_segmentation:
                    axs[2].plot(range(correct_arc_curve.shape[0]),
                                correct_arc_curve,
                                color='C1')
                    axs[0].axvline(x=regime_locations[0], linestyle="dashed")
                    axs[2].axvline(x=regime_locations[0], linestyle="dashed")
                plt.show()

    # Compute multi-dimensional matrix profiles (mstump)
    if do_compute_multimodal_mp:
        for pid in pids:
            fitbit_df = data_dict[pid]['fitbit']
            data = fitbit_df.loc[:, streams].values
            mp, mp_indices = stumpy.mstump(data.T, m=window_size)
            #print("Stumpy's mstump function does not handle NaN values. Skipping multi-dimensional MP")
            #break

            # TODO - This code is copied from above. Fix and finish it once mstump supports NaN
            # NOTE(review): ``fitbit_mp`` and ``stream`` are not assigned in
            # this branch; they are only defined if an earlier branch ran,
            # and ``mp`` / ``mp_indices`` are never read.
            # Find the first motif with nearly no NaN values in the stream signal
            fitbit_mp_argsort = np.array(fitbit_mp[:, 0]).argsort()
            for motif_idx in range(len(fitbit_mp_argsort)):
                stream_motif_idx = fitbit_mp_argsort[motif_idx]
                num_nan = np.sum(
                    np.isnan(fitbit_df[stream].
                             values[stream_motif_idx:stream_motif_idx +
                                    window_size]))

                # Avoid finding bad motifs
                if num_nan >= 2:
                    continue

                if stream == 'HeartRatePPG':
                    # Check for flat heart rate
                    nan_like_value = 70
                    num_valid = np.count_nonzero(
                        (fitbit_df[stream] -
                         nan_like_value)[stream_motif_idx:stream_motif_idx +
                                         window_size])
                    if num_valid < window_size - 2:
                        continue

                    # Check for linear heart rate over time
                    residual_threshold = window_size * (4.0**2)
                    p, res, rank, sing_vals, rcond = np.polyfit(
                        range(window_size),
                        fitbit_df[stream][stream_motif_idx:stream_motif_idx +
                                          window_size],
                        deg=1,
                        full=True)
                    if res < residual_threshold:
                        continue
                break

            fig, axs = plt.subplots(2, sharex=True, gridspec_kw={'hspace': 0})
            plt.suptitle('Matrix Profile, %s, PID: %s' % (stream, pid),
                         fontsize='30')
            axs[0].plot(fitbit_df[stream].values)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            rect = plt.Rectangle((fitbit_mp_argsort[motif_idx + 1], 0),
                                 window_size,
                                 2000,
                                 facecolor='lightgrey')
            axs[0].add_patch(rect)
            axs[0].set_ylabel(stream, fontsize='20')
            axs[1].plot(fitbit_mp[:, 0])
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx], linestyle="dashed")
            axs[1].axvline(x=fitbit_mp_argsort[motif_idx + 1],
                           linestyle="dashed")
            axs[1].set_ylabel('Matrix Profile', fontsize='20')
            plt.show()

    # Turn interactive mode off and show one final, empty figure.
    plt.ioff()
    plt.figure()
    plt.plot()
    plt.title('Dummy plot')
    plt.show()
    return