def test_event_transfer():
    """Check that preset event patterns can be applied to new data.

    The test expects ``find_events`` to raise ``NotFittedError`` when called
    without a variance, and again when called with a variance but before
    any event patterns have been set.  Once patterns are set, a clean
    two-event dataset must be segmented at the obvious boundary.
    """
    es = EventSegment(2)
    sample_data = np.asarray([[1, 1, 1, 0, 0, 0, 0],
                              [0, 0, 0, 1, 1, 1, 1]])

    # pytest.raises() no longer accepts ``message=`` (removed in pytest 5).
    # Calling pytest.fail() inside the block produces the same custom
    # failure text when the expected exception is not raised.
    with pytest.raises(NotFittedError):
        es.find_events(sample_data.T)
        pytest.fail("Should need to set variance")

    with pytest.raises(NotFittedError):
        es.find_events(sample_data.T, np.asarray([1, 1]))
        pytest.fail("Should need to set patterns")

    es.set_event_patterns(np.asarray([[1, 0], [0, 1]]))
    seg = es.find_events(sample_data.T, np.asarray([1, 1]))[0]

    # Most probable event at each timepoint.
    events = np.argmax(seg, axis=1)
    assert np.array_equal(events, [0, 0, 0, 1, 1, 1, 1]),\
        "Failed to correctly transfer two events to new data"
def test_fit_shapes():
    """Check shapes and normalization of segmentations on random data.

    Fits a K-event model to V x T random data, then applies it to a second
    random dataset with a different number of timepoints.  Both
    segmentations must have shape (timepoints, K) with rows summing to one.
    Finally, ``model_prior`` and ``set_event_patterns`` are expected to
    raise ``ValueError`` when given fewer than K timepoints / events.
    """
    K = 5
    V = 3
    T = 10
    es = EventSegment(K, n_iter=2)
    sample_data = np.random.rand(V, T)
    es.fit(sample_data.T)

    assert es.segments_[0].shape == (T, K), \
        "Segmentation from fit has incorrect shape"
    assert np.isclose(np.sum(es.segments_[0], axis=1), np.ones(T)).all(), \
        "Segmentation from learn_events not correctly normalized"

    T2 = 15
    sample_data2 = np.random.rand(V, T2)
    # The second return value is not checked by this test.
    test_segments, _ = es.find_events(sample_data2.T)

    assert test_segments.shape == (T2, K), \
        "Segmentation from find_events has incorrect shape"
    assert np.isclose(np.sum(test_segments, axis=1), np.ones(T2)).all(), \
        "Segmentation from find_events not correctly normalized"

    es_invalid = EventSegment(K)

    # pytest.raises() no longer accepts ``message=`` (removed in pytest 5);
    # pytest.fail() inside the block supplies the custom failure text.
    with pytest.raises(ValueError):
        es_invalid.model_prior(K-1)
        pytest.fail("T < K should cause error")

    with pytest.raises(ValueError):
        es_invalid.set_event_patterns(np.zeros((V, K-1)))
        pytest.fail("#Events < K should cause error")
def test_fit_shapes():
    """Verify segmentation shapes, row normalization and size validation.

    A K-event model is fit to random V x T data and then applied to a
    second random dataset of T2 timepoints.  Each segmentation must be a
    (timepoints, K) array whose rows sum to one.  The test also expects
    ``ValueError`` from ``model_prior`` with fewer than K timepoints and
    from ``set_event_patterns`` with fewer than K event patterns.
    """
    K = 5
    V = 3
    T = 10
    es = EventSegment(K, n_iter=2)
    sample_data = np.random.rand(V, T)
    es.fit(sample_data.T)

    fit_segments = es.segments_[0]
    assert fit_segments.shape == (T, K), \
        "Segmentation from fit has incorrect shape"
    assert np.isclose(np.sum(fit_segments, axis=1), np.ones(T)).all(), \
        "Segmentation from learn_events not correctly normalized"

    T2 = 15
    sample_data2 = np.random.rand(V, T2)
    # Only the segmentation is inspected; the second return is ignored.
    test_segments, _ = es.find_events(sample_data2.T)

    assert test_segments.shape == (T2, K), \
        "Segmentation from find_events has incorrect shape"
    assert np.isclose(np.sum(test_segments, axis=1), np.ones(T2)).all(), \
        "Segmentation from find_events not correctly normalized"

    es_invalid = EventSegment(K)

    # The ``message=`` keyword of pytest.raises() was removed in pytest 5.
    # pytest.fail() after the call reports the same text if nothing raised.
    with pytest.raises(ValueError):
        es_invalid.model_prior(K - 1)
        pytest.fail("T < K should cause error")

    with pytest.raises(ValueError):
        es_invalid.set_event_patterns(np.zeros((V, K - 1)))
        pytest.fail("#Events < K should cause error")
def test_chains():
    """Segment data with a model whose events belong to two chains.

    Events 0-1 are labelled chain 'A' and events 2-4 chain 'B'.  The
    sample data matches the pattern assigned to the chain-'B' events, so
    the three timepoints must be confidently assigned to events 2, 3, 4.
    """
    chain_labels = np.array(['A', 'A', 'B', 'B', 'B'])
    es = EventSegment(5, event_chains=chain_labels)

    patterns = np.array([[1, 1, 0, 0, 0],
                         [0, 0, 1, 1, 1]])
    es.set_event_patterns(patterns)

    sample_data = np.array([[0, 0, 0],
                            [1, 1, 1]])
    seg = es.find_events(sample_data.T, 0.1)[0]

    # Column indices of the (timepoint, event) entries above 0.99.
    confident = seg > 0.99
    ev = np.nonzero(confident)[1]
    assert np.array_equal(ev, [2, 3, 4]),\
        "Failed to fit with multiple chains"
def test_sym_ll():
    """Log-likelihood must be identical for data run forward or backward.

    Noisy data is generated from three event patterns following a fixed
    event sequence.  One model scores the data as-is; a second model, with
    the event order reversed, scores the time-reversed data.  The two
    log-likelihoods are required to be exactly equal.
    """
    ev = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2])
    random_state = np.random.RandomState(0)
    ev_pat = random_state.rand(3, 10)

    # One row per timepoint: the pattern of that timepoint's event plus a
    # fresh draw of uniform noise (drawn timepoint by timepoint, in order).
    D_forward = np.array([
        ev_pat[label, :] + 0.1 * random_state.rand(10)
        for label in ev
    ])
    D_backward = D_forward[::-1]

    forward_patterns = ev_pat.T
    backward_patterns = forward_patterns[:, ::-1]

    hmm_forward = EventSegment(3)
    hmm_forward.set_event_patterns(forward_patterns)
    ll_forward = hmm_forward.find_events(D_forward, var=1)[1]

    hmm_backward = EventSegment(3)
    hmm_backward.set_event_patterns(backward_patterns)
    ll_backward = hmm_backward.find_events(D_backward, var=1)[1]

    assert (ll_forward == ll_backward),\
        "Log-likelihood not symmetric forward/backward"
def test_sym():
    """Constant data must produce a segmentation symmetric in time/events.

    All four events share the same pattern and every timepoint carries
    that same pattern, so events 1-4 and 2-3 must mirror each other when
    both the time axis and the event axis are reversed.
    """
    es = EventSegment(4)

    voxel_values = np.arange(10)
    # 10 x 4: every event column holds the same pattern.
    evpat = np.tile(voxel_values[:, np.newaxis], (1, 4))
    es.set_event_patterns(evpat)

    # 20 x 10: every timepoint row holds the same pattern.
    D = np.tile(voxel_values, (20, 1))
    ev = es.find_events(D, var=1)[0]

    # Check that events 1-4 and 2-3 are symmetric
    first_half = ev[:, :2]
    mirrored_second_half = ev[:, 2:][::-1, ::-1]
    assert np.allclose(first_half, mirrored_second_half),\
        "Fit with constant data is not symmetric"
def heldout_ll(data, n_events, split):
    """Compute log-likelihood on heldout subjects

    Fits an event segmentation model with n_events to half of the subjects,
    then measures the log-likelihood of this model on the other half. The
    returned log-likelihood averages across both choices of which half is
    used for training and which is used for testing. The boolean array split
    defines which subjects are in each half.

    Parameters
    ----------
    data : ndarray
        subj x TR x Voxels data array. The input is not modified.

    n_events : int
        Number of events for event segmentation model

    split : ndarray
        Boolean vector, subj in one group are True and in the other are False

    Returns
    -------
    float
        Average of log-likelihoods on testing groups
    """
    d = np.asarray(data)
    # Coerce so that ``~split`` is a logical NOT even when a plain Python
    # sequence of bools is passed in.
    split = np.asarray(split, dtype=bool)

    # Remove nan voxels: drop every voxel that is NaN for any subject/TR.
    # np.delete returns a new array, so no defensive copy of ``data`` is
    # needed beforehand.
    nan_voxels = np.where(np.isnan(d).any(axis=(0, 1)))[0]
    d = np.delete(d, nan_voxels, axis=2)

    # Train and test event segmentation across groups
    group1 = d[split].mean(0)
    group2 = d[~split].mean(0)

    # Train on group 1, test on group 2 ...
    es = EventSegment(n_events).fit(group1)
    _, ll12 = es.find_events(group2)

    # ... and the other way around.
    es = EventSegment(n_events).fit(group2)
    _, ll21 = es.find_events(group1)

    return (ll12 + ll21) / 2
def test_chains():
    """Exercise a model whose events are grouped into two chains.

    ``fit()`` is expected to raise ``RuntimeError`` on a model built with
    event chains.  With patterns supplied manually, data matching the
    chain-'B' pattern must be confidently assigned to events 2, 3 and 4.
    """
    sample_data = np.array([[0, 0, 0],
                            [1, 1, 1]])
    chain_labels = np.array(['A', 'A', 'B', 'B', 'B'])
    es = EventSegment(5, event_chains=chain_labels)

    with pytest.raises(RuntimeError):
        es.fit(sample_data.T)[0]
        # Only reached if fit() did not raise.
        pytest.fail("Can't use fit() with event chains")

    patterns = np.array([[1, 1, 0, 0, 0],
                         [0, 0, 1, 1, 1]])
    es.set_event_patterns(patterns)
    seg = es.find_events(sample_data.T, 0.1)[0]

    # Column indices of the (timepoint, event) entries above 0.99.
    confident = seg > 0.99
    ev = np.nonzero(confident)[1]
    assert np.array_equal(ev, [2, 3, 4]),\
        "Failed to fit with multiple chains"
def compute_fits_hmm(data: np.ndarray, k: int, mindist: int, type='HMM',
                     y=None, t1=None, ind1=None, zs=False):
    """Fit an HMM with ``k`` states to ``data`` and score its boundaries.

    Parameters
    ----------
    data : np.ndarray
        Data the HMM is fit to (assumed timepoints x features -- TODO
        confirm against callers).
    k : int
        Number of states passed to the ``HMM`` constructor.
    mindist : int
        Diagonal offset of the upper-triangular timepoint-pair mask; only
        used when ``t1`` and ``ind1`` are both None.
    type : str
        ``'HMM'`` for a plain model or ``'HMMsplit'`` for a model built
        with ``split_merge=True``.
    y : np.ndarray, optional
        Separate data on which the fit is evaluated.  When None the
        training ``data`` is used for evaluation.
    t1, ind1 : optional
        Precomputed pairwise values and pair mask.  If either is given,
        both are used as-is instead of being recomputed from the data.
    zs : bool
        When true, ``data`` (and ``y`` if given) are z-scored along axis 0
        with ``ddof=1`` before use.

    Returns
    -------
    tuple
        ``(LL_HMM, WAC_HMM, tdist_HMM, hmm_bounds, t, ind)``: the second
        return value of ``hmm.find_events`` on the evaluation data
        (presumably a log-likelihood); the mean of ``t`` over same-state
        pairs minus the mean over all different-state pairs; the Welch
        t-statistic between same-state pairs and pairs exactly one state
        apart (0 if fewer than two same-state pairs); the 0/1 boundary
        vector derived from the fitted segmentation; and the pairwise
        values and mask that were used.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'HMM'`` nor ``'HMMsplit'``.
    """
    if type == 'HMM':
        hmm = HMM(k)
    elif type == 'HMMsplit':
        hmm = HMM(k, split_merge=True)
    else:
        # Fail early and clearly instead of hitting an unbound ``hmm`` below.
        raise ValueError(
            "Unknown type %r; expected 'HMM' or 'HMMsplit'" % (type,))

    if zs:
        data = zscore(data, axis=0, ddof=1)
    hmm.fit(data)

    if y is None:
        tdata = data
    else:
        tdata = zscore(y, axis=0, ddof=1) if zs else y

    _, LL_HMM = hmm.find_events(tdata)

    # Boundaries come from the segmentation stored by fit(): a timepoint is
    # marked where the most probable state changes from the previous one.
    most_likely_state = np.argmax(hmm.segments_[0], axis=1)
    hmm_bounds = np.insert(np.diff(most_likely_state), 0, 0).astype(int)

    if t1 is None and ind1 is None:
        # Mask selecting timepoint pairs at least ``mindist`` apart.
        ind = np.triu(np.ones(tdata.shape[0], bool), mindist)
        z = GSBS._zscore(tdata)
        t = np.cov(z)[ind]
    else:
        ind = ind1
        t = t1

    # City-block distance between state labels of each selected pair.
    stateseq = deltas_states(deltas=hmm_bounds)[:, None]
    state_dist = cdist(stateseq, stateseq, "cityblock")[ind]
    diff = state_dist == 1      # pairs exactly one state apart
    same = state_dist == 0      # pairs within the same state
    alldiff = state_dist > 0    # pairs in any two different states

    WAC_HMM = np.mean(t[same]) - np.mean(t[alldiff])
    if np.count_nonzero(same) < 2:
        # Too few same-state pairs for a t-test.
        tdist_HMM = 0
    else:
        tdist_HMM = ttest_ind(t[same], t[diff], equal_var=False)[0]

    return LL_HMM, WAC_HMM, tdist_HMM, hmm_bounds, t, ind
subj_id_train_inner = subj_id_all_inner[subj_id_train_inner] subj_id_test_inner = subj_id_all_inner[subj_id_test_inner] print("-Train:", subj_id_train_inner,"Validate:", subj_id_test_inner, ', now try different k...') D_train = D[:,:,subj_id_train_inner] #14 subjects to be trained D_val = D[:,:,subj_id_test_inner] #5 subjects for validation D_test = D[:,:,subj_id_test_outer] #held-out subject for knum in range(len(ks)): #loop through all possible K values #ev = brainiak.eventseg.event.EventSegment(ks[knum]) #set up HMM if splitm: #if running split_merge ev = EventSegment(ks[knum],split_merge=True) else: #if no split_merge ev = EventSegment(ks[knum]) ev.fit(D_train.mean(2).T) #fit to average pattern of training set segments,ll=ev.find_events(D_val.mean(2).T)#grab segments + log lik on average of validation set #print('log likelihood for k=%s is %s' %(ks[knum],ll)) #uncomment to see ll values loglik[subj_id_test_outer,jj,knum] = ll #store loglik value for this K all_ll[jj,:]=loglik[subj_id_test_outer,jj,:] best_ll[subj_id_test_outer,jj] = np.max(loglik[subj_id_test_outer,jj,:]) #best loglik! jj=jj+1 mean_all_ll=np.mean(all_ll,axis=0) #grab mean log lik fill3=mean_all_ll.argsort() #sort by log likelihood value fill3=fill3[len(ks)-1] #find best K print('Best K = %s' %fill3) #print best K best_k_subj[subj_id_test_outer]=ks[fill3] #store best k to assign for this sub if splitm: #if running split_merge ev = brainiak.eventseg.event.EventSegment(ks[fill3],split_merge=True) #re-train model w/ this best K else: #if no split_merge ev = brainiak.eventseg.event.EventSegment(ks[fill3]) #re-train model w/ this best K #ev.fit(D[:,:,np.arange(nS) != subj_id_test_outer].mean(2).T) #fit
What if we don't want to prespecify the number of events, but instead want to determine the number of events from the data? One way to determine the best number of events is to fit the model on a training set and then test the model fit on independent subjects. k_array = np.arange(20, 61, 10) test_ll = np.zeros(len(k_array)) for i, k in enumerate(k_array): print('Trying %d events' % k) print(' Fitting model on training subjects...') movie_train = np.mean(movie[:8], axis = 0) movie_HMM = EventSegment(k) movie_HMM.fit(movie_train) print(' Testing model fit on held-out subjects...') movie_test = np.mean(movie[8:], axis = 0) _, test_ll[i] = movie_HMM.find_events(movie_test) plt.plot(k_array, test_ll) plt.xlabel('Number of events') plt.ylabel('Log-likelihood') movie_dur = nTRs * 1.5 # Data acquired every 1.5 seconds secax = plt.gca().secondary_xaxis('top', functions=(lambda x: movie_dur / x, lambda x: movie_dur / x)) secax.set_xlabel('Average event length (sec)') #### 1.3 Optimal segmentation with the HMM Since 40 events maximized the test log-likelihood, we'll generate two versions of HMM boundaries using 40 events. In addition to the "vanilla" HMM, we'll run an HMM with more flexibility during fitting (allowing for split-merge operations). This is slower (and so should usually only be used for generating a final segmentation), but can produce better fits if events are very uneven in duration. We will use these segmentations below for comparison with an alternative event segmentation method (GSBS) and with human-labeled event boundaries. print('Fitting HMM with 40 events...') HMM40 = EventSegment(n_events = 40) HMM40.fit(movie_group)