def branch_cluster(features, *, branch_depth=2, npca=10):
    """
    Cluster the columns of `features`, then recursively re-cluster every
    sufficiently large cluster to look for further splits.

    Parameters
    ----------
    features : ndarray
        2-D feature array; clusters are selected by column index, so each
        column is one point to be labeled.
    branch_depth : int
        Remaining recursion depth. With a value <= 1 the labels of the
        top-level clustering are returned unchanged.
    npca : int
        Forwarded unchanged to `cluster`.

    Returns
    -------
    ndarray
        int64 labels, one per point. An empty array (np.array([])) is
        returned when `features` has no elements.

    Raises
    ------
    Exception
        If `cluster` yields a negative label; the features are written to
        a debug file first.
    """
    if features.size == 0:
        return np.array([])

    # Clusters with this many points or fewer are not split any further.
    split_threshold = 20

    top_labels = cluster(features, npca=npca).ravel().astype('int64')
    if np.min(top_labels) < 0:
        debug_path = '/tmp/isosplit5-debug-features.mda'
        mdaio.writemda32(features, debug_path)
        raise Exception(
            'Unexpected error in isosplit5. Features written to {}'.format(
                debug_path))

    num_top = int(np.max(top_labels))
    if num_top <= 1 or branch_depth <= 1:
        return top_labels

    merged = np.zeros(top_labels.shape, dtype='int64')
    next_offset = 0  # highest label handed out so far
    for label in range(1, num_top + 1):
        members = np.where(top_labels == label)[0]
        if len(members) <= split_threshold:
            # Too small to split: the cluster keeps a single new label.
            merged[members] = next_offset + 1
            next_offset += 1
            continue
        sub_labels = branch_cluster(features[:, members],
                                    branch_depth=branch_depth - 1,
                                    npca=npca)
        merged[members] = next_offset + sub_labels
        next_offset += int(np.max(sub_labels))
    return merged
def bandpass_filter(timeseries,
                    timeseries_out,
                    samplerate,
                    freq_min,
                    freq_max,
                    freq_wid=1000,
                    padding=3000,
                    chunk_size=3000 * 10,
                    num_processes=os.cpu_count()):
    """
    Apply a bandpass filter to a multi-channel timeseries

    The input is processed in chunks of `chunk_size` timepoints by a pool
    of worker processes running `filter_chunk`; the options are handed to
    the workers through the module-level `g_opts` / `g_shared_data`.

    Parameters
    ----------
    timeseries : INPUT
        MxN raw timeseries array (M = #channels, N = #timepoints)

    timeseries_out : OUTPUT
        Filtered output (MxN array)

    samplerate : float
        The sampling rate in Hz
    freq_min : float
        The lower endpoint of the frequency band (Hz)
    freq_max : float
        The upper endpoint of the frequency band (Hz)
    freq_wid : float
        The optional width of the roll-off (Hz)
    padding : int
        Passed to `filter_chunk` via the options
        (presumably the number of extra timepoints read around each chunk
        -- confirm in `filter_chunk`)
    chunk_size : int
        Number of timepoints per chunk
    num_processes : int
        Number of worker processes in the pool

    Returns
    -------
    bool
        Always True once every chunk has been processed.
    """
    global g_shared_data, g_opts

    X = mdaio.DiskReadMda(timeseries)
    M = X.N1()  # Number of channels
    N = X.N2()  # Number of timepoints

    num_chunks = int(np.ceil(N / chunk_size))
    print('Chunk size: {}, Padding: {}, Num chunks: {}, Num processes: {}'.
          format(chunk_size, padding, num_chunks, num_processes))

    # The module-level state must be assigned before the pool is created so
    # that the worker processes can see it.
    g_shared_data = SharedChunkInfo(num_chunks)
    g_opts = {
        "timeseries": timeseries,
        "timeseries_out": timeseries_out,
        "samplerate": samplerate,
        "freq_min": freq_min,
        "freq_max": freq_max,
        "freq_wid": freq_wid,
        "chunk_size": chunk_size,
        "padding": padding,
        "num_processes": num_processes,
        "num_chunks": num_chunks
    }

    # Create the output file with an empty array before the workers run.
    mdaio.writemda32(np.zeros([M, 0]), timeseries_out)

    pool = multiprocessing.Pool(processes=num_processes)
    try:
        pool.map(filter_chunk, range(num_chunks), chunksize=1)
    finally:
        # Always release the worker processes, even if a chunk failed.
        pool.close()
        pool.join()
    return True
def whiten(*,
           timeseries,
           timeseries_out,
           chunk_size=30000 * 10,
           num_processes=os.cpu_count()):
    """
    Whiten a multi-channel timeseries

    A channel covariance matrix is estimated from (up to about) 10 evenly
    spaced chunks, a symmetric whitening matrix is derived from its SVD,
    and every chunk is then transformed by `whiten_chunk` in a pool of
    worker processes.

    Parameters
    ----------
    timeseries : INPUT
        MxN raw timeseries array (M = #channels, N = #timepoints)

    timeseries_out : OUTPUT
        Whitened output (MxN array)

    chunk_size : int
        Number of timepoints per chunk
    num_processes : int
        Number of worker processes in each pool

    Returns
    -------
    bool
        Always True once every chunk has been processed.
    """
    global g_opts, g_shared_data

    X = mdaio.DiskReadMda(timeseries)
    M = X.N1()  # Number of channels
    N = X.N2()  # Number of timepoints

    num_chunks_for_computing_cov_matrix = 10
    num_chunks = int(np.ceil(N / chunk_size))
    print ('Chunk size: {}, Num chunks: {}, Num processes: {}'.format(chunk_size,num_chunks,num_processes))

    # Must be assigned before the first pool is created so that the worker
    # processes can see it.
    g_opts = {
        "timeseries": timeseries,
        "timeseries_out": timeseries_out,
        "chunk_size": chunk_size,
        "num_processes": num_processes,
        "num_chunks": num_chunks
    }

    # --- Pass 1: accumulate A*A^T over a subsample of the chunks ---
    step = max(1, num_chunks // num_chunks_for_computing_cov_matrix)
    sampled_chunks = range(0, num_chunks, step)
    pool = multiprocessing.Pool(processes=num_processes)
    try:
        AAt_matrices = pool.map(compute_AAt_matrix_for_chunk,
                                sampled_chunks,
                                chunksize=1)
    finally:
        pool.close()
        pool.join()

    # Normalise by the number of timepoints actually covered by the sampled
    # chunks, so that a shorter final chunk no longer biases the estimate.
    # NOTE(review): assumes compute_AAt_matrix_for_chunk(i) covers timepoints
    # [i*chunk_size, min(N, (i+1)*chunk_size)) -- confirm against the helper.
    num_sampled_timepoints = sum(
        min(N, (i + 1) * chunk_size) - i * chunk_size for i in sampled_chunks)
    AAt = np.zeros((M, M), dtype='float64')
    for M0 in AAt_matrices:
        AAt += M0
    if num_sampled_timepoints > 0:
        AAt /= num_sampled_timepoints

    # Symmetric whitening matrix: W = U * diag(1/sqrt(S)) * Ut
    U, S, Ut = np.linalg.svd(AAt, full_matrices=True)
    W = (U @ np.diag(1 / np.sqrt(S))) @ Ut

    # --- Pass 2: apply W to every chunk ---
    # A fresh pool is created *after* g_shared_data is assigned so that the
    # new workers can see it; the first pool cannot be reused for this.
    g_shared_data = SharedChunkInfo(num_chunks)
    mdaio.writemda32(np.zeros([M, 0]), timeseries_out)
    pool = multiprocessing.Pool(processes=num_processes)
    try:
        pool.starmap(whiten_chunk, [(num, W) for num in range(num_chunks)],
                     chunksize=1)
    finally:
        pool.close()
        pool.join()
    return True
def test_mask_out_artifacts():
    """
    Run mask_out_artifacts on a synthetic single-channel recording.

    Gaussian noise is generated with three high-amplitude bursts added at
    10%, 20% and 70% of the recording. The signal is written to disk,
    masked, and read back; the test passes when every burst sample is zero
    in the output.

    Returns
    -------
    bool
        True if all artifact samples were masked, False otherwise.
    """
    # Create noisy array
    samplerate = int(48e3)
    duration = 30  # seconds
    n_samples = samplerate * duration
    noise_amplitude = 5
    noise = noise_amplitude * np.random.normal(0, 1, n_samples)

    # Add three artifacts
    n_artifacts = 3
    artifacts = np.zeros_like(noise)
    artifact_duration = int(0.2 * samplerate)  # samples
    artifact_signal = np.zeros((n_artifacts, artifact_duration))
    for i in np.arange(n_artifacts):
        artifact_signal[i, :] = noise_amplitude * np.random.normal(
            0, 6, artifact_duration)

    # One row of sample indices per artifact, shifted to its start position
    artifact_indices = np.tile(np.arange(artifact_duration), (n_artifacts, 1))
    artifact_shift = np.array([
        int(n_samples * 0.10),
        int(n_samples * 0.20),
        int(n_samples * 0.70)
    ])
    artifact_indices += artifact_shift.reshape((-1, 1))
    for i, indices in enumerate(artifact_indices):
        artifacts[indices] = artifact_signal[i, :]

    signal = noise + artifacts

    timeseries = 'test_mask.mda'
    timeseries_out = 'masked.mda'

    try:
        # Write as mda
        mdaio.writemda32(signal.reshape((1, -1)), timeseries)

        # Run the mask artifacts
        mask_out_artifacts(timeseries=timeseries,
                           timeseries_out=timeseries_out,
                           threshold=6,
                           chunk_size=2000,
                           num_write_chunks=150)

        # Check that they are gone
        masked_data = mdaio.readmda(timeseries_out).reshape((-1, 1))
        indices_masked = np.count_nonzero(
            masked_data[artifact_indices, 0].flatten() == 0)
    finally:
        # Remove the temporary files even when the processor raised.
        for path in (timeseries, timeseries_out):
            if os.path.exists(path):
                os.remove(path)

    total_indices_to_mask = artifact_indices.size
    if indices_masked == total_indices_to_mask:
        print('Artifacts 100% masked')
        return True
    print('Artifacts %.2f%% masked' %
          (100 * (indices_masked / total_indices_to_mask)))
    return False
def test_compute_templates():
    """
    Smoke test for compute_templates.

    A random MxN timeseries and L random events (timestamps in row 1,
    labels 1..K in row 2 of the firings array) are written to disk, the
    processor is run, and the output shape is checked to be (M, T, K).
    The temporary files are removed afterwards.

    Returns
    -------
    bool
        True when both assertions hold.
    """
    import os  # local import: used only for temp-file cleanup

    M, N, K, T, L = 5, 1000, 6, 50, 100
    timeseries_path = 'tmp.mda'
    firings_path = 'tmp2.mda'
    templates_path = 'tmp3.mda'

    try:
        X = np.random.rand(M, N)
        mdaio.writemda32(X, timeseries_path)

        F = np.zeros((3, L))
        F[1, :] = 1 + np.random.randint(N, size=(1, L))  # timestamps
        F[2, :] = 1 + np.random.randint(K, size=(1, L))  # labels in 1..K
        mdaio.writemda64(F, firings_path)

        ret = compute_templates(timeseries=timeseries_path,
                                firings=firings_path,
                                templates_out=templates_path,
                                clip_size=T)
        assert (ret)

        templates0 = mdaio.readmda(templates_path)
        assert (templates0.shape == (M, T, K))
    finally:
        # Do not leave the scratch files behind in the working directory.
        for path in (timeseries_path, firings_path, templates_path):
            if os.path.exists(path):
                os.remove(path)
    return True
def compute_templates(*, timeseries, firings, templates_out, clip_size=100):
    """
    Compute per-cluster templates (average waveforms) from the labeled
    events in a firings file and write them to disk.

    Parameters
    ----------
    timeseries : INPUT
        Path of the timeseries mda file (MxN) that the event clips
        (snippets) are taken from. M is the number of channels, N the
        number of timepoints.
    firings : INPUT
        Path of the firings mda file (RxL), R>=3, L = number of events.
        The second row holds the timestamps, the third row the integer
        labels.

    templates_out : OUTPUT
        Path of the output mda file (MxTxK), with T=clip_size and
        K=maximum cluster label. An empty cluster yields an all-zero
        template.

    clip_size : int
        (Optional) number of timepoints in a single template (clip size,
        aka snippet size)

    Returns
    -------
    Whatever mdaio.writemda32 returns for the written templates.
    """
    return mdaio.writemda32(
        compute_templates_helper(timeseries=timeseries,
                                 firings=firings,
                                 clip_size=clip_size),
        templates_out)
# NOTE(review): the `return True` below is the tail of a definition whose
# beginning is not visible in this section; it is kept unchanged.
    return True


# Attach name/version attributes to the bandpass_filter function object
# (presumably consumed by a processor registry -- confirm against the caller).
bandpass_filter.name = 'ephys.bandpass_filter'
bandpass_filter.version = '0.1'

if __name__ == "__main__":
    # Ad-hoc manual run against local .mat recordings; all paths are relative
    # to the current working directory.
    samplerate = int(3e4)  # Hz
    freq_min = 250  # Hz
    freq_max = 6000  # Hz
    data_dir = '../ephys_preprocessing/'

    # Raw channel: load the 'data' variable from the .mat file, convert it to
    # .mda, filter it, then read the filtered result back.
    raw_data_ch1 = np.asarray(
        sio.loadmat(os.path.join(data_dir, 'raw_data_ch1.mat'))['data'])
    mdaio.writemda32(raw_data_ch1, os.path.join(data_dir, 'raw_data_ch1.mda'))
    timeseries = os.path.join(data_dir, 'raw_data_ch1.mda')
    timeseries_out = os.path.join(data_dir, 'filtered_raw_data_ch1.mda')
    bandpass_filter(timeseries, timeseries_out, samplerate, freq_min, freq_max)
    filtered_data = mdaio.readmda(
        os.path.join(data_dir, 'filtered_raw_data_ch1.mda'))

    # Detrended channel: same steps, using the 'copy' variable of the
    # detrended .mat file.
    detrended_data_ch1 = np.asarray(
        sio.loadmat(os.path.join(data_dir, 'detrended_data_ch1.mat'))['copy'])
    mdaio.writemda32(detrended_data_ch1,
                     os.path.join(data_dir, 'detrended_data_ch1.mda'))
    timeseries = os.path.join(data_dir, 'detrended_data_ch1.mda')
    timeseries_out = os.path.join(data_dir, 'filtered_detrended_data_ch1.mda')
    bandpass_filter(timeseries, timeseries_out, samplerate, freq_min, freq_max)
    filtered_data_detrended = mdaio.readmda(
        os.path.join(data_dir, 'filtered_detrended_data_ch1.mda'))
def synthesize_timeseries(*,
                          firings='',
                          waveforms='',
                          timeseries_out,
                          noise_level=1,
                          samplerate=30000,
                          duration=60,
                          waveform_upsamplefac,
                          amplitudes_row=0):
    """
    Synthesize an electrophysiology timeseries from a set of ground-truth
    firing events and waveforms

    Parameters
    ----------
    firings : INPUT
        (Optional) The path of firing events file in .mda format, or an
        already loaded array. RxL where R>=3 and L is the number of events.
        Second row is the timestamps, third row is the integer labels.
        Labels are 1-based and are expected to lie in 1..K.
    waveforms : INPUT
        (Optional) The path of (possibly upsampled) waveforms file in .mda
        format, or an already loaded array.
        Mx(T*waveform_upsamplefac)xK, where M is the number of channels,
        T is the clip size, and K is the number of units.

    timeseries_out : OUTPUT
        The output path for the new timeseries. MxN. If empty, nothing is
        written and the array is returned instead.

    noise_level : double
        (Optional) Standard deviation of the simulated background noise
        added to the timeseries
    samplerate : double
        (Optional) Sample rate for the synthetic dataset in Hz
    duration : double
        (Optional) Duration of the synthetic dataset in seconds. The number
        of timepoints will be duration*samplerate. If that is zero, the
        length is derived from the last event time plus one clip.
    waveform_upsamplefac : int
        (Required) The upsampling factor corresponding to the input
        waveforms. (avoids digitization artifacts)
    amplitudes_row : int
        (Optional) If positive, this is the (1-based) row in the firings
        array where the amplitude scale factors are found.
        Otherwise, use all 1's

    Returns
    -------
    The return value of mdaio.writemda32 when `timeseries_out` is given,
    otherwise the synthesized MxN array.
    """
    num_timepoints = int(samplerate * duration)
    waveform_upsamplefac = int(waveform_upsamplefac)

    # Both inputs may be given as a path (str) or as an in-memory array.
    if isinstance(waveforms, str):
        if waveforms:
            W = mdaio.readmda(waveforms)
        else:
            W = np.zeros((4, 100 * waveform_upsamplefac, 0))
    else:
        W = waveforms

    if isinstance(firings, str):
        if firings:
            F = mdaio.readmda(firings)
        else:
            F = np.zeros((3, 0))
    else:
        F = firings

    times = F[1, :]
    labels = F[2, :].astype('int')

    M, TT, K = W.shape[0], W.shape[1], W.shape[2]
    T = int(TT / waveform_upsamplefac)  # clip size at the output rate
    Tmid = int(np.ceil((T + 1) / 2 - 1))  # index of the clip centre

    N = num_timepoints
    if N == 0:
        if times.size == 0:
            N = T
        else:
            # Event times may be fractional; the array length must be an int.
            N = int(np.floor(np.max(times))) + T

    X = np.random.randn(M, N) * noise_level

    for j in range(times.size):
        t0 = times[j]
        amp0 = F[amplitudes_row - 1, j] if amplitudes_row > 0 else 1
        # Labels are 1-based: label k selects unit k, i.e. W[:, :, k - 1].
        waveform0 = W[:, :, labels[j] - 1]
        # The fractional part of the event time picks the sub-sample phase
        # of the upsampled waveform.
        frac_offset = int(np.floor((t0 - np.floor(t0)) * waveform_upsamplefac))
        tstart = int(np.floor(t0)) - Tmid
        # Events whose clip would fall outside the timeseries are skipped.
        if 0 <= tstart and tstart + T <= N:
            X[:, tstart:tstart + T] += (
                waveform0[:, frac_offset::waveform_upsamplefac] * amp0)

    if timeseries_out:
        return mdaio.writemda32(X, timeseries_out)
    return X
def mask_out_artifacts(*,
                       timeseries,
                       timeseries_out,
                       threshold=6,
                       chunk_size=2000,
                       num_write_chunks=150,
                       num_processes=os.cpu_count()):
    """
    Masks out artifacts. Each chunk will be analyzed, and if the square root
    of the RSS of the chunk is above threshold, all the samples in this
    chunk (and neighboring chunks) will be set to zero.

    Parameters
    ----------
    timeseries : INPUT
        MxN raw timeseries array (M = #channels, N = #timepoints)

    timeseries_out : OUTPUT
        masked output (MxN array)

    threshold : int
        Number of standard deviations away from the mean to consider as
        artifacts (default of 6).
    chunk_size : int
        This chunk size will be the number of samples that will be set to
        zero if the square root RSS of this chunk is above threshold.
    num_write_chunks : int
        How many chunks will be simultaneously written to the
        timeseries_out path (default of 150).
    num_processes : int
        Number of worker processes used to write the masked output.

    Returns
    -------
    bool
        False if threshold, chunk_size or num_write_chunks is zero
        (nothing is read or written in that case), otherwise True.
    """
    global g_opts, g_shared_data

    if threshold == 0 or chunk_size == 0 or num_write_chunks == 0:
        print(
            "Problem with input parameters. Either threshold, num_write_chunks, or chunk_size is zero.\n"
        )
        return False

    write_chunk_size = chunk_size * num_write_chunks
    # Must be assigned before the pool is created so that the worker
    # processes can see it.
    g_opts = {
        "timeseries": timeseries,
        "timeseries_out": timeseries_out,
        "chunk_size": chunk_size,
        "num_processes": num_processes,
        "num_write_chunks": num_write_chunks,
        "write_chunk_size": write_chunk_size,
    }

    X = mdaio.DiskReadMda(timeseries)
    M = X.N1()  # Number of channels
    N = X.N2()  # Number of timepoints

    # Compute the per-channel norm of every chunk
    num_chunks = int(np.ceil(N / chunk_size))
    num_write = int(np.ceil(N / write_chunk_size))

    norms = np.zeros((M, num_chunks))  # num channels x num_chunks
    for i in range(num_chunks):
        t1 = int(i * chunk_size)  # first timepoint of the chunk
        t2 = int(np.minimum(N, t1 + chunk_size))  # last timepoint (+1)
        chunk = X.readChunk(i1=0, N1=M, i2=t1,
                            N2=t2 - t1).astype(np.float32)
        norms[:, i] = np.sqrt(np.sum(chunk**2, axis=1))

    # Determine which chunks to use: a chunk is dropped when it, or one of
    # its direct neighbors, is an outlier on any channel.
    use_it = np.ones(num_chunks)
    for m in range(M):
        vals = norms[m, :]
        sigma0 = np.std(vals)
        mean0 = np.mean(vals)

        artifact_indices = np.where(vals > mean0 + sigma0 * threshold)[0]
        use_it[artifact_indices] = 0
        # Left neighbors; index 0 is excluded so that -1 never wraps around.
        use_it[artifact_indices[artifact_indices > 0] - 1] = 0
        # Right neighbors; the last chunk is excluded to avoid an IndexError.
        use_it[artifact_indices[artifact_indices < num_chunks - 1] + 1] = 0

        print("For channel %d: mean=%.2f, stdev=%.2f, chunk size = %d\n" %
              (m, mean0, sigma0, chunk_size))

    g_shared_data = SharedChunkInfo(num_write)
    # Create initial file w/ empty array so we can append to it
    mdaio.writemda32(np.zeros([M, 0]), timeseries_out)

    # Each task gets the slice of use_it flags that belongs to one write
    # chunk (num_write_chunks analysis chunks).
    tasks = [(num,
              use_it[num * num_write_chunks:(num + 1) * num_write_chunks])
             for num in range(num_write)]
    pool = multiprocessing.Pool(processes=num_processes)
    try:
        pool.starmap(mask_chunk, tasks, chunksize=1)
    finally:
        # Always release the worker processes, even if a chunk failed.
        pool.close()
        pool.join()

    # The fraction is counted in chunks; an empty input reports 0%
    # instead of dividing by zero.
    num_used = np.count_nonzero(use_it)
    percent_used = num_used * 100.0 / num_chunks if num_chunks else 0.0
    print("Using %.2f%% of all timepoints.\n" % percent_used)
    return True
def synthesize_random_waveforms(*,
                                waveforms_out=None,
                                geometry_out=None,
                                M=5,
                                T=500,
                                K=20,
                                upsamplefac=13,
                                timeshift_factor=3,
                                average_peak_amplitude=10):
    """
    Generate random waveforms to be used when building a synthetic
    timeseries dataset

    Parameters
    ----------
    waveforms_out : OUTPUT
        Path to waveforms mda file. Mx(T*upsamplefac)xK
    geometry_out : OUTPUT
        (Optional) Path to geometry csv file. Only written when
        waveforms_out is also given.

    M : int
        (Optional) Number of channels
    T : int
        (Optional) Number of timepoints for a waveform, before upsampling
    K : int
        (Optional) Number of waveforms to synthesize
    timeshift_factor : int
        (Optional) Controls amount of timeshift between waveforms on
        different channels for each template
    upsamplefac : int
        (Optional) used for upsampling the waveforms to avoid
        discretization artifacts
    average_peak_amplitude : float
        (Optional) used to scale the peak spike amplitude

    Returns
    -------
    True when waveforms_out is given (files are written); otherwise the
    tuple (waveforms, geometry) of arrays.
    """
    # Mean and spread of the four duration/amplitude values handed to
    # synthesize_single_waveform.
    mean_durations = np.array([200, 10, 30, 200])
    mean_amps = np.array([0.5, 10, -1, 0])
    durations_sd = np.array([10, 4, 6, 20])
    amps_sd = np.array([0.2, 3, 0.5, 0])
    amp_factor_lo, amp_factor_hi = np.array([0.5, 1])
    # Amplitude falls off as 1 / (spread_base + dist * spread_slope).
    spread_base = 0.2
    spread_slope = 1

    # Default geometry: channels on a line at x = 1..M, y = 0.
    geometry = np.zeros((2, M))
    geometry[0, :] = np.arange(1, M + 1)

    neuron_locations = get_default_neuron_locations(M, K, geometry)

    num_samples = T * upsamplefac
    min_durations = np.ones(mean_durations.shape)
    WW = np.zeros((M, num_samples, K))
    for unit in range(K):
        for chan in range(M):
            offset = neuron_locations[:, unit] - geometry[:, chan]
            dist = np.sqrt(np.sum(offset**2))

            durations = np.maximum(
                min_durations,
                mean_durations + np.random.randn(1, 4) * durations_sd
            ) * upsamplefac
            amps = mean_amps + np.random.randn(1, 4) * amps_sd

            shaped = synthesize_single_waveform(N=num_samples,
                                                durations=durations,
                                                amps=amps)
            # Shift in time proportionally to the unit-channel distance.
            shaped = np.roll(shaped,
                             int(timeshift_factor * dist * upsamplefac))
            shaped = shaped * np.random.uniform(amp_factor_lo, amp_factor_hi)
            WW[chan, :, unit] = shaped / (spread_base + dist * spread_slope)

    # Rescale so that the mean per-unit peak equals average_peak_amplitude.
    peaks = np.max(np.abs(WW), axis=(0, 1))
    WW = WW / np.mean(peaks) * average_peak_amplitude

    if not waveforms_out:
        return (WW, geometry)

    mdaio.writemda32(WW, waveforms_out)
    if geometry_out:
        np.savetxt(geometry_out,
                   geometry.transpose(),
                   delimiter=",",
                   fmt="%g")
    return True