def test_comp_decomp(path):
    """Compress and decompress a random binary file with an integer data type,
    and check the files are byte-for-byte equal. This would not work for
    floating-point data types."""
    arr = np.array(np.random.randint(low=0, high=255, size=(1000, 1000)), dtype=np.int16).T
    _write_arr(path, arr)
    out = path.parent / 'data.cbin'
    outmeta = path.parent / 'data.ch'
    compress(
        path, out, outmeta,
        sample_rate=sample_rate, n_channels=arr.shape[1], dtype=arr.dtype,
    )
    decompressed_path = path.with_suffix('.decomp.bin')
    decompress(out, outmeta, out=decompressed_path)

    # Check the files are equal.
    with open(str(path), 'rb') as f:
        buf1 = f.read()
        sha1_original = sha1(buf1)
    with open(str(decompressed_path), 'rb') as f:
        buf2 = f.read()
        sha1_decompressed = sha1(buf2)
    assert buf1 == buf2

    # Check the SHA1s.
    with open(str(out), 'rb') as f:
        sha1_compressed = sha1(f.read())
    with open(str(outmeta), 'r') as f:
        meta = json.load(f)
    assert meta['sha1_compressed'] == sha1_compressed
    assert meta['sha1_uncompressed'] == sha1_decompressed == sha1_original
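
# The test above relies on two module-level helpers not shown in this excerpt.
# A minimal sketch of what they could look like (hypothetical implementations;
# the comparison with meta['sha1_compressed'] assumes hex-digest strings, which
# is how mtscomp's .ch file stores SHA1s):
import hashlib

def sha1(buf):
    """Return the SHA1 hex digest of a bytes buffer."""
    return hashlib.sha1(buf).hexdigest()

def _write_arr(path, arr):
    """Write a NumPy array to disk as a flat, C-ordered binary file."""
    with open(path, 'wb') as f:
        arr.tofile(f)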
def decompress_file(self, keep_original=True, **kwargs):
    """
    Decompresses an mtscomp file.

    :param keep_original: defaults to True. If False, the original compressed file
     is deleted and the current spikeglx.Reader object is modified in place.
    :return: pathlib.Path of the decompressed *.bin file
    """
    file_out = self.file_bin.with_suffix('.bin')
    assert self.is_mtscomp
    mtscomp.decompress(self.file_bin, self.file_bin.with_suffix('.ch'), out=file_out, **kwargs)
    if not keep_original:
        self.file_bin.unlink()
        self.file_bin.with_suffix('.ch').unlink()
        self.file_bin = file_out
    return file_out
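
# Hedged usage sketch for decompress_file, assuming the ibllib-style import
# path and a hypothetical file location:
#
#   from ibllib.io import spikeglx
#   sr = spikeglx.Reader('/path/to/data.ap.cbin')
#   bin_file = sr.decompress_file(keep_original=False)  # also removes the .ch file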
def load_raw_data(path=None, n_channels_dat=None, dtype=None, offset=None, order=None):
    """Load raw data at a given path."""
    if not path:
        return
    path = Path(path)
    if not path.exists():
        logger.warning("Path %s does not exist, trying ephys.raw filename.", path)
        path = path.parent / ('ephys.raw' + path.suffix)
        if not path.exists():
            logger.warning("Error while loading data: File `%s` not found.", path)
            return None
    assert path.exists()
    logger.debug("Loading traces at `%s`.", path)
    if str(path).endswith('.cbin'):  # pragma: no cover
        try:
            from mtscomp import decompress
            logger.debug("Decompressing %s on the fly with mtscomp.", path)
            return decompress(path)
        except ImportError:
            logger.warning(
                "The mtscomp package is not available, %s cannot be decompressed. "
                "In the meantime, the raw data will not be available.", path)
            return
    dtype = dtype if dtype is not None else np.int16
    return _dat_to_traces(path, n_channels=n_channels_dat, dtype=dtype, offset=offset, order=order)
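
# Illustrative calls to load_raw_data (paths and parameter values are
# placeholders, not from the original source):
#
#   traces = load_raw_data('/path/to/data.cbin')  # decompressed on the fly by mtscomp
#   traces = load_raw_data('/path/to/data.bin', n_channels_dat=385, dtype=np.int16)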
def decompress_file(self, keep_original=True, **kwargs):
    """
    Decompresses an mtscomp file.

    :param keep_original: defaults to True. If False, the original compressed file
     (input) is deleted and the current spikeglx.Reader object is modified in place.
     NB: This is not equivalent to overwrite (which replaces the output file).
    :return: pathlib.Path of the decompressed *.bin file
    """
    if 'out' not in kwargs:
        kwargs['out'] = self.file_bin.with_suffix('.bin')
    assert self.is_mtscomp
    mtscomp.decompress(self.file_bin, self.file_bin.with_suffix('.ch'), **kwargs)
    if not keep_original:
        self.file_bin.unlink()
        self.file_bin.with_suffix('.ch').unlink()
        self.file_bin = kwargs['out']
    return kwargs['out']
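
# Compared with the earlier variant, routing the output path through
# kwargs['out'] lets the caller decompress to an arbitrary location, e.g.
# a scratch drive (path below is hypothetical):
#
#   bin_file = sr.decompress_file(out=Path('/scratch/data.ap.bin'))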
def download_raw_partial(self, url_cbin, url_ch, first_chunk=0, last_chunk=0):
    assert url_cbin.endswith('.cbin')
    assert url_ch.endswith('.ch')
    relpath = Path(url_cbin.replace(self._par.HTTP_DATA_SERVER, '.')).parents[0]
    target_dir = Path(self._get_cache_dir(None), relpath)
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    # First, download the .ch file.
    ch_local_path = Path(wc.http_download_file(
        url_ch,
        username=self._par.HTTP_DATA_SERVER_LOGIN,
        password=self._par.HTTP_DATA_SERVER_PWD,
        cache_dir=target_dir, clobber=True, offline=False, return_md5=False))
    ch_local_path = remove_uuid_file(ch_local_path)
    ch_local_path = ch_local_path.rename(ch_local_path.with_suffix('.chopped.ch'))
    assert ch_local_path.exists()

    # Load the .ch file.
    with open(ch_local_path, 'r') as f:
        cmeta = json.load(f)

    # Get the first byte and number of bytes to download.
    i0 = cmeta['chunk_bounds'][first_chunk]
    cmeta['chunk_bounds'] = cmeta['chunk_bounds'][first_chunk:last_chunk + 2]
    cmeta['chunk_bounds'] = [_ - i0 for _ in cmeta['chunk_bounds']]
    assert len(cmeta['chunk_bounds']) >= 2
    assert cmeta['chunk_bounds'][0] == 0

    first_byte = cmeta['chunk_offsets'][first_chunk]
    cmeta['chunk_offsets'] = cmeta['chunk_offsets'][first_chunk:last_chunk + 2]
    cmeta['chunk_offsets'] = [_ - first_byte for _ in cmeta['chunk_offsets']]
    assert len(cmeta['chunk_offsets']) >= 2
    assert cmeta['chunk_offsets'][0] == 0
    n_bytes = cmeta['chunk_offsets'][-1]
    assert n_bytes > 0

    # Save the chopped chunk bounds and offsets.
    cmeta['sha1_compressed'] = None
    cmeta['sha1_uncompressed'] = None
    cmeta['chopped'] = True
    with open(ch_local_path, 'w') as f:
        json.dump(cmeta, f, indent=2, sort_keys=True)

    # Download the requested chunks.
    cbin_local_path = wc.http_download_file(
        url_cbin,
        username=self._par.HTTP_DATA_SERVER_LOGIN,
        password=self._par.HTTP_DATA_SERVER_PWD,
        cache_dir=target_dir, clobber=True, offline=False, return_md5=False,
        chunks=(first_byte, n_bytes))
    cbin_local_path = remove_uuid_file(cbin_local_path)
    cbin_local_path = cbin_local_path.rename(cbin_local_path.with_suffix('.chopped.cbin'))
    assert cbin_local_path.exists()

    import mtscomp
    reader = mtscomp.decompress(cbin_local_path, cmeta=ch_local_path)
    return reader[:]
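
# Worked example of the chunk-range arithmetic above, assuming three
# 100-sample chunks (illustrative values, not real .ch contents):
chunk_bounds = [0, 100, 200, 300]
first_chunk, last_chunk = 1, 2
i0 = chunk_bounds[first_chunk]
chopped = [b - i0 for b in chunk_bounds[first_chunk:last_chunk + 2]]
assert chopped == [0, 100, 200]  # rebased so the first retained chunk starts at 0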
def mtscomp_perf(**kwargs):
    ds = kwargs.pop('ds', None)
    assert ds
    name, n_channels, sample_rate, duration = ds

    # Compress the file.
    path = Path('data/' + name)
    out = path.parent / 'data.cbin'
    outmeta = path.parent / 'data.ch'
    t0 = time.perf_counter()
    compress(
        path, out, outmeta,
        sample_rate=sample_rate, n_channels=n_channels, dtype=dtype,
        check_after_compress=False, **kwargs)
    t1 = time.perf_counter()
    wt = t1 - t0

    # Decompress the file and write it to disk.
    out2 = path.with_suffix('.decomp.bin')
    t0 = time.perf_counter()
    decompress(out, outmeta, out2, check_after_decompress=False)
    t1 = time.perf_counter()
    rtc = t1 - t0

    # Read the uncompressed file.
    t0 = time.perf_counter()
    x = load_raw_data(path, n_channels=n_channels, dtype=dtype, mmap=False)
    assert x.size
    t1 = time.perf_counter()
    rtdec = t1 - t0

    orig_size = path.stat().st_size
    compressed_size = out.stat().st_size
    return {
        'read_time_compressed': rtc,
        'read_time_decompressed': rtdec,
        'write_time': wt,
        'ratio': 100 - 100 * compressed_size / orig_size,
    }
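
# mtscomp_perf assumes `dtype` (along with compress/decompress/load_raw_data)
# is defined at module level in the benchmark script. A hypothetical driver,
# with illustrative dataset tuples matching the
# (name, n_channels, sample_rate, duration) unpacking above:
datasets = [
    ('imec_385ch_10s.bin', 385, 30000, 10.),  # hypothetical dataset
]
for ds in datasets:
    print(mtscomp_perf(ds=ds))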
def _round_trip(path, arr, **ckwargs):
    _write_arr(path, arr)
    out = path.parent / 'data.cbin'
    outmeta = path.parent / 'data.ch'
    compress(
        path, out, outmeta,
        sample_rate=sample_rate, n_channels=arr.shape[1], dtype=arr.dtype,
        **ckwargs)
    unc = decompress(out, outmeta)
    assert np.allclose(unc[:], arr)
    return unc
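
# Illustrative round trip (assumes `sample_rate` is a module-level constant,
# as in the surrounding tests; extra compress() kwargs pass through ckwargs):
arr = np.random.randint(-100, 100, size=(1000, 19), dtype=np.int16)
unc = _round_trip(Path('data.bin'), arr)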
def traces(request, tempdir, arr, sample_rate):
    if request.param == 'numpy':
        return get_ephys_reader(arr, sample_rate=sample_rate)
    elif request.param == 'npy':
        path = tempdir / 'data.npy'
        np.save(path, arr)
        return get_ephys_reader(path, sample_rate=sample_rate)
    elif request.param == 'flat':
        path = tempdir / 'data.bin'
        with open(path, 'wb') as f:
            arr.tofile(f)
        return get_ephys_reader(
            path, sample_rate=sample_rate, dtype=arr.dtype, n_channels=arr.shape[1])
    elif request.param == 'flat_concat':
        path0 = tempdir / 'data0.bin'
        with open(path0, 'wb') as f:
            arr[:arr.shape[0] // 2, :].tofile(f)
        path1 = tempdir / 'data1.bin'
        with open(path1, 'wb') as f:
            arr[arr.shape[0] // 2:, :].tofile(f)
        return get_ephys_reader(
            [path0, path1], sample_rate=sample_rate, dtype=arr.dtype, n_channels=arr.shape[1])
    elif request.param in ('mtscomp', 'mtscomp_reader'):
        path = tempdir / 'data.bin'
        with open(path, 'wb') as f:
            arr.tofile(f)
        out = tempdir / 'data.cbin'
        outmeta = tempdir / 'data.ch'
        mtscomp.compress(
            path, out, outmeta,
            sample_rate=sample_rate, n_channels=arr.shape[1], dtype=arr.dtype,
            n_threads=1, check_after_compress=False, quiet=True)
        reader = mtscomp.decompress(out, outmeta, check_after_decompress=False, quiet=True)
        if request.param == 'mtscomp':
            return get_ephys_reader(reader)
        else:
            return get_ephys_reader(out)
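
# The fixture handles six request.param values, so it is presumably declared
# with a parametrized pytest fixture decorator along these lines (the actual
# decorator is not shown in this excerpt):
#
#   @pytest.fixture(params=[
#       'numpy', 'npy', 'flat', 'flat_concat', 'mtscomp', 'mtscomp_reader'])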
def test_3d(path):
    file_npy = path.parent.joinpath('titi.npy')
    file_cnpy = path.parent.joinpath('titi.cnpy')
    array = np.random.randint(-5000, high=5000, size=(100, 120, 130), dtype=np.int16)
    np.save(file_npy, array)
    # Two-way trip - makes sure that:
    # 1) the sample_rate fed as an int64 doesn't error
    # 2) the initial shape of the array is saved in the metadata
    mtscomp_mod.compress(
        file_npy, out=file_cnpy, outmeta=file_cnpy.with_suffix('.ch'),
        sample_rate=np.prod(array.shape[1:]),  # int64 here; needs to be cast as float
        dtype=array.dtype, do_time_diff=False)
    d = mtscomp_mod.decompress(file_cnpy, cmeta=file_cnpy.with_suffix('.ch'))
    assert np.all(np.isclose(d[:, :].reshape(d.cmeta.shape), array))
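
# Note on the final assertion above: mtscomp stores the data as a 2D
# (n_samples, n_channels) array, so the decompressed view d[:, :] has shape
# (100, 120 * 130); reshaping with the shape recorded in d.cmeta recovers the
# original (100, 120, 130) array.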
def test_ephys_traces_2(tempdir):
    data = (50 * np.random.randn(1000, 10)).astype(np.int16)
    sample_rate = 100

    path = tempdir / 'data.bin'
    with open(path, 'wb') as f:
        data.tofile(f)

    out = path.parent / 'data.cbin'
    outmeta = path.parent / 'data.ch'
    mtscomp.compress(
        path, out, outmeta,
        sample_rate=sample_rate, n_channels=data.shape[1], dtype=data.dtype,
        n_threads=1, check_after_compress=False, quiet=True)
    reader = mtscomp.decompress(out, outmeta, check_after_decompress=False, quiet=True)

    traces = get_ephys_traces(reader)
    assert isinstance(traces, EphysTraces)
    assert isinstance(traces, da.Array)
    assert traces.dtype == data.dtype
    assert traces.shape == data.shape
    assert traces.chunks == ((100,) * 10, (10,))
    assert bool(np.all(data == traces).compute()) is True
    assert traces.chunk_bounds == reader.chunk_bounds

    spike_times = [5, 50, 100, 901]
    spike_chunks = traces._get_time_chunks(spike_times)
    ae(spike_chunks, [0, 0, 1, 9])

    waveforms = traces.extract_waveforms(spike_times, [1, 4, 7], 10)
    assert waveforms.shape == (4, 10, 3)

    traces_sub = traces.subset_time_range(2.5, 7.5)
    assert traces_sub.shape == (500, 10)
    assert bool(np.all(traces[250:750, :] == traces_sub).compute()) is True
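
# Note on the chunks assertion above: with sample_rate=100 and mtscomp's
# default one-second chunk duration, the 1000-sample file splits into ten
# 100-sample chunks, hence traces.chunks == ((100,) * 10, (10,)).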
def test_decompress_pool(path, arr):
    _write_arr(path, arr)
    out = path.parent / 'data.cbin'
    outmeta = path.parent / 'data.ch'
    compress(
        path, out, outmeta,
        sample_rate=sample_rate, n_channels=arr.shape[1], dtype=arr.dtype,
        check_after_compress=False)
    reader = decompress(out, outmeta, cache_size=2)
    pool = reader.start_thread_pool()
    d1 = reader.decompress_chunks([0, 1, 2], pool=pool)
    d2 = reader.decompress_chunks([1, 2, 3], pool=pool)
    d3 = reader.decompress_chunks([0, 1, 3], pool=pool)
    reader.stop_thread_pool()
    assert sorted(d1.keys()) == [0, 1, 2]
    assert sorted(d2.keys()) == [1, 2, 3]
    assert sorted(d3.keys()) == [0, 1, 3]
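
# Note: cache_size=2 presumably limits how many decompressed chunks the reader
# keeps in memory, so the three overlapping requests above exercise both cache
# hits and evictions while chunks are decompressed through the shared pool.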
def _run(self, overwrite=False):
    efiles = spikeglx.glob_ephys_files(self.session_path)
    apfiles = [(ef.get('ap'), ef.get('label')) for ef in efiles if 'ap' in ef.keys()]
    for ap_file, label in apfiles:
        # Check for pre-existing spike sorting.
        # The spike sorting output can either be with the probe (<1.5.5) or in the
        # session_path/spike_sorters/ks2_matlab/probeXX folder.
        ks2_dir = self.session_path.joinpath('spike_sorters', 'ks2_matlab', label)
        if ap_file.parent.joinpath('spike_sorting_ks2.log').exists():
            _logger.info(f'Already ran: spike_sorting_ks2.log found for {ap_file}, skipping.')
            continue  # this will label the job with ok status in the database
        if ks2_dir.joinpath('spike_sorting_ks2.log').exists():
            _logger.info(f'Already ran: spike_sorting_ks2.log found in {ks2_dir}, skipping.')
            continue
        # Get the scratch drive from the shell script.
        SHELL_SCRIPT = Path.home().joinpath(
            "Documents/PYTHON/iblscripts/deploy/serverpc/kilosort2/task_ks2_matlab.sh")
        with open(SHELL_SCRIPT) as fid:
            lines = fid.readlines()
        line = [line for line in lines if line.startswith('SCRATCH_DRIVE=')][0]
        m = re.search(r"\=(.*?)(\#|\n)", line)[0]
        scratch_drive = Path(m[1:-1].strip())
        assert scratch_drive.exists()
        # Clean up and create the directory; this also checks write permissions.
        # The scratch dir has the following shape: ks2m/ZM_3003_2020-07-29_001_probe00.
        # First make sure the tmp dir is clean.
        shutil.rmtree(scratch_drive.joinpath('ks2m'), ignore_errors=True)
        scratch_dir = scratch_drive.joinpath(
            'ks2m', '_'.join(list(self.session_path.parts[-3:]) + [label]))
        if scratch_dir.exists():
            shutil.rmtree(scratch_dir, ignore_errors=True)
        scratch_dir.mkdir(parents=True, exist_ok=True)
        # Decompress using mtscomp.
        tmp_ap_file = scratch_dir.joinpath(ap_file.name).with_suffix('.bin')
        mtscomp.decompress(cdata=ap_file, out=tmp_ap_file)
        # Run matlab spike sorting: with R2019a, it would be much easier to run with the
        # -batch option, as matlab errors are redirected to stderr automatically.
        command2run = f"{SHELL_SCRIPT} {scratch_dir}"
        _logger.info(command2run)
        process = subprocess.Popen(
            command2run, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            executable="/bin/bash")
        info, error = process.communicate()
        info_str = info.decode('utf-8').strip()
        if process.returncode != 0:
            raise RuntimeError(error.decode('utf-8'))
        elif 'run_ks2_ibl.m failed' in info_str:
            raise RuntimeError('Matlab error ks2 log below:')
        _logger.info(info_str)
        # Clean up and copy: output to session/spike_sorters/ks2_matlab/probeXX (ks2_dir).
        tmp_ap_file.unlink()  # remove the uncompressed temp binary file
        scratch_dir.joinpath('temp_wh.dat').unlink()  # remove the memmapped pre-processed file
        shutil.move(scratch_dir, ks2_dir)
        self.version = self._fetch_ks2_commit_hash()
    return []  # the job will be labeled as complete with an empty string
def _run_ks2(self, ap_file):
    """
    Runs the ks2 matlab spike sorting for one probe dataset.
    The spike sorting output can either be with the probe (<1.5.5) or in the
    session_path/spike_sorters/ks2_matlab/probeXX folder.
    :return: path of the folder containing ks2 spike sorting output
    """
    label = ap_file.parts[-2]
    if ap_file.parent.joinpath("spike_sorting_ks2.log").exists():
        _logger.info(f"Already ran: spike_sorting_ks2.log found for {ap_file}, skipping.")
        return ap_file.parent
    ks2_dir = self.session_path.joinpath("spike_sorters", "ks2_matlab", label)
    if ks2_dir.joinpath("spike_sorting_ks2.log").exists():
        _logger.info(f"Already ran: spike_sorting_ks2.log found in {ks2_dir}, skipping.")
        return ks2_dir
    # Get the scratch drive from the shell script.
    SHELL_SCRIPT = Path.home().joinpath(
        "Documents/PYTHON/iblscripts/deploy/serverpc/kilosort2/task_ks2_matlab.sh")
    with open(SHELL_SCRIPT) as fid:
        lines = fid.readlines()
    line = [line for line in lines if line.startswith("SCRATCH_DRIVE=")][0]
    m = re.search(r"\=(.*?)(\#|\n)", line)[0]
    scratch_drive = Path(m[1:-1].strip())
    assert scratch_drive.exists()
    # Clean up and create the directory; this also checks write permissions.
    # The scratch dir has the following shape: ks2m/ZM_3003_2020-07-29_001_probe00.
    # First make sure the tmp dir is clean.
    shutil.rmtree(scratch_drive.joinpath("ks2m"), ignore_errors=True)
    scratch_dir = scratch_drive.joinpath(
        "ks2m", "_".join(list(self.session_path.parts[-3:]) + [label]))
    if scratch_dir.exists():
        shutil.rmtree(scratch_dir, ignore_errors=True)
    scratch_dir.mkdir(parents=True, exist_ok=True)
    # Decompress using mtscomp.
    tmp_ap_file = scratch_dir.joinpath(ap_file.name).with_suffix(".bin")
    mtscomp.decompress(cdata=ap_file, out=tmp_ap_file)
    # Run matlab spike sorting: with R2019a, it would be much easier to run with the
    # -batch option, as matlab errors are redirected to stderr automatically.
    command2run = f"{SHELL_SCRIPT} {scratch_dir}"
    _logger.info(command2run)
    process = subprocess.Popen(
        command2run, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        executable="/bin/bash")
    info, error = process.communicate()
    info_str = info.decode("utf-8").strip()
    if process.returncode != 0:
        raise RuntimeError(error.decode("utf-8"))
    elif "run_ks2_ibl.m failed" in info_str:
        raise RuntimeError("Matlab error ks2 log below:")
    _logger.info(info_str)
    # Clean up and copy: output to session/spike_sorters/ks2_matlab/probeXX (ks2_dir).
    tmp_ap_file.unlink()  # remove the uncompressed temp binary file
    scratch_dir.joinpath("temp_wh.dat").unlink()  # remove the memmapped pre-processed file
    shutil.move(scratch_dir, ks2_dir)
    self.version = self._fetch_ks2_commit_hash()
    return ks2_dir
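
# Worked example of the SCRATCH_DRIVE parsing shared by _run and _run_ks2
# above (the shell-script line below is illustrative):
import re
from pathlib import Path

line = 'SCRATCH_DRIVE=/mnt/h0  # fast local ssd\n'
m = re.search(r"\=(.*?)(\#|\n)", line)[0]  # full match: '=/mnt/h0  #'
scratch_drive = Path(m[1:-1].strip())      # drop the leading '=' and trailing '#'
assert scratch_drive == Path('/mnt/h0')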