def test_pca(self):
    """Time a 50-component PCA fit over the full tensor of the 'Bellbird_TMI' database.

    The tensor row is looked up by database plus two hashes built by joining
    every Feature id and every Aggregation id (ordered by id) with '-'.

    Raises:
        Exception: if no FullTensorData row matches the database and hashes.
    """
    # The koe imports are kept after django.setup(), as in the original
    # ordering - presumably the models need Django configured first.
    django.setup()
    from koe.models import Feature, Aggregation, FullTensorData, Database
    from koe.ts_utils import bytes_to_ndarray, get_rawdata_from_binary

    database = Database.objects.get(name='Bellbird_TMI')

    feature_qs = Feature.objects.all().order_by('id')
    aggregation_qs = Aggregation.objects.all().order_by('id')

    features_hash = '-'.join(
        str(pk) for pk in feature_qs.values_list('id', flat=True))
    aggregations_hash = '-'.join(
        str(pk) for pk in aggregation_qs.values_list('id', flat=True))

    tensor = FullTensorData.objects.filter(
        database=database,
        features_hash=features_hash,
        aggregations_hash=aggregations_hash,
    ).first()
    if tensor is None:
        raise Exception('Tensor not found')

    sids_path = tensor.get_sids_path()
    bytes_path = tensor.get_bytes_path()

    sids = bytes_to_ndarray(sids_path, np.int32)
    full_data = get_rawdata_from_binary(bytes_path, len(sids))

    with tictoc('PCA'):
        reducer = pca(n_components=50)
        reducer.fit_transform(full_data)
def _test_update(self, nupdate):
    """Store `nupdate` arrays over a mix of existing and brand-new ids, then verify retrieval.

    Half of the ids (rounded down) are taken from the front of self.ids so
    they get overwritten; the rest are fresh ids numbered just above the
    current maximum so they get appended. self.ids and self.arrs are
    replaced with the merged, reshuffled expectation before retrieval.
    """
    _, update_arrs = create_random_id_based_dataset(nupdate)

    expected = dict(zip(self.ids, self.arrs))

    # Mix old ids (to be updated) with new ids (to be appended).
    num_existing = nupdate // 2
    num_fresh = nupdate - num_existing

    highest_id = np.max(self.ids)
    fresh_ids = np.arange(highest_id + 1, highest_id + num_fresh + 1)
    existing_ids = self.ids[:num_existing]

    update_ids = np.concatenate((existing_ids, fresh_ids))
    expected.update(zip(update_ids, update_arrs))

    self.ids = np.array(list(expected))
    np.random.shuffle(self.ids)
    self.arrs = [expected[key] for key in self.ids]

    with tictoc('Test update {} items'.format(nupdate)):
        bs.store(update_ids, update_arrs, self.loc)

    retrieved_arrs = bs.retrieve(self.ids, self.loc)
    for key, retrieved in zip(self.ids, retrieved_arrs):
        self.assertTrue(np.allclose(expected[key], retrieved))
def _test_update(self, nupdate):
    """Store arrays under `nupdate` randomly drawn ids and verify every id round-trips.

    Ids are drawn without replacement from range(NUM_POINTS * 10); the draw
    is repeated until the number of ids not yet in self.ids is strictly
    between 0 and NUM_POINTS. self.ids and self.arrs are replaced with the
    merged, reshuffled expectation before retrieval.

    Args:
        nupdate: how many ids to draw for the update.
    """
    _, new_arrs = create_random_id_based_dataset()
    npoints = NUM_POINTS

    id2arr = dict(zip(self.ids, self.arrs))

    # We want to make sure there are new ids (to be appended) and old ids
    # (to be updated).
    # NOTE(review): the upper bound is NUM_POINTS rather than nupdate, so
    # the presence of old ids is only guaranteed when nupdate >= NUM_POINTS
    # - confirm this is intended.
    while True:
        new_ids = np.arange(npoints * 10)
        np.random.shuffle(new_ids)
        new_ids = new_ids[:nupdate]
        # Vectorised membership test instead of scanning self.ids once per
        # candidate id.
        num_unseen = np.count_nonzero(
            np.isin(new_ids, self.ids, invert=True))
        if 0 < num_unseen < npoints:
            break

    # NOTE(review): zip() silently truncates to the shorter input, while
    # bs.store below receives new_ids and new_arrs in full; presumably both
    # have the same length - verify against create_random_id_based_dataset.
    id2arr.update(zip(new_ids, new_arrs))

    self.ids = np.array(list(id2arr))
    np.random.shuffle(self.ids)
    self.arrs = [id2arr[key] for key in self.ids]

    with tictoc('Test update {} items'.format(nupdate)):
        bs.store(new_ids, new_arrs, self.index_filename, self.value_filename)

    retrieved_arrs = bs.retrieve(self.ids, self.index_filename,
                                 self.value_filename)
    for key, retrieved_arr in zip(self.ids, retrieved_arrs):
        self.assertTrue(np.allclose(id2arr[key], retrieved_arr))
def _test_retrieve_ids(self, limit=None):
    """Time bs.retrieve_ids and, when `limit` is given, bound-check the ids.

    Args:
        limit: optional (lower, upper) pair. When truthy, the smallest and
            largest retrieved ids must lie within the pair widened by
            bs.BATCH_SIZE on each side.
    """
    with tictoc('Test retrieving IDs limit={}'.format(limit)):
        ids = bs.retrieve_ids(self.loc, limit)

    if not limit:
        return

    lower, upper = limit
    self.assertGreaterEqual(ids.min(), lower - bs.BATCH_SIZE)
    self.assertLessEqual(ids.max(), upper + bs.BATCH_SIZE)
def test_spectral_derivatives(self):
    """Recompute spectral derivatives from two tapered STFTs and compare with class fixtures.

    Checks time_deriv, freq_deriv and the combined derivs against the
    class-level reference arrays, then thresholds derivs and compares the
    sorted zero-crossing coordinates from find_zc with peaks_x / peaks_y.
    """
    cls = self.__class__
    with tictoc('test_spectral_derivatives'):
        # Both STFTs share every argument except the window (taper).
        shared_args = dict(
            y=cls.sig,
            n_fft=cls.nfft,
            win_length=cls.window_length,
            hop_length=cls.window_length - cls.noverlap,
            center=False,
            dtype=np.complex128,
        )
        spec1 = stft(window=cls.tapers[:, 0], **shared_args)
        spec2 = stft(window=cls.tapers[:, 1], **shared_args)

        re1, re2 = np.real(spec1), np.real(spec2)
        im1, im2 = np.imag(spec1), np.imag(spec2)

        time_deriv = (-re1 * re2) - (im1 * im2)
        freq_deriv = (im1 * re2) - (re1 * im2)

        # The 0.1 offset is added to the denominator before dividing.
        pfm = np.max(time_deriv, axis=0) / (np.max(freq_deriv, axis=0) + 0.1)
        fm = np.arctan(pfm)

        derivs = time_deriv * np.sin(fm) + freq_deriv * np.cos(fm)
        # The first three rows are zeroed before comparing with the fixture.
        derivs[:3, :] = 0

        self.assertTrue(np.allclose(time_deriv, cls.time_deriv))
        self.assertTrue(np.allclose(freq_deriv, cls.freq_deriv))
        self.assertTrue(np.allclose(derivs, cls.derivs))

        magnitude = np.abs(derivs)
        # One threshold per column (mean over axis 0) and one per row
        # (median over axis 1); an element failing either one is replaced.
        row_thresh = 0.3 * np.mean(magnitude, axis=0)
        col_thresh = 100 * np.median(magnitude, axis=1)
        below_row = magnitude <= row_thresh[None, :]
        below_col = magnitude <= col_thresh[:, None]
        derivs[below_row | below_col] = -0.1

        zcy, zcx = find_zc(derivs)
        # Each coordinate array is sorted independently, in place.
        zcx.sort()
        zcy.sort()

        self.assertTrue(np.allclose(zcx, cls.peaks_x))
        self.assertTrue(np.allclose(zcy, cls.peaks_y))
def test_my_stft(self):
    """Time my_stft on the fixture signal and compare its output with self.s."""
    with tictoc('test_my_stft'):
        spectrum = my_stft(
            sig=self.sig,
            fs=self.fs,
            nfft=self.nfft,
            window=self.window,
            noverlap=self.noverlap,
        )
        matches = np.allclose(spectrum, self.s)
        self.assertTrue(matches)
def test_librosa_stft(self):
    """Time librosa.stft (uncentred) on the fixture signal and compare with self.s."""
    with tictoc('test_librosa_stft'):
        # Hop length is the window size minus the overlap.
        hop = self.window_size - self.noverlap
        spectrum = librosa.stft(
            y=self.sig,
            n_fft=self.nfft,
            win_length=self.window_size,
            hop_length=hop,
            window=self.window,
            center=False,
        )
        matches = np.allclose(spectrum, self.s)
        self.assertTrue(matches)
def _test_store(self):
    """Store self.ids / self.arrs under self.loc and verify the on-disk batch files.

    Index files (names starting with bs.INDEX_PREFIX, followed by
    '<begin>-<end>') are read in ascending order of <begin>, together with
    the value file carrying the same suffix. The concatenated index rows and
    values are then compared against self.sorted_ids / self.sorted_arrs.
    """
    with tictoc('Test storing'):
        bs.store(self.ids, self.arrs, self.loc)

    # Map each batch's begin id to its '<begin>-<end>' filename suffix.
    prefix_len = len(bs.INDEX_PREFIX)
    batches = {}
    for filename in os.listdir(self.loc):
        if not filename.startswith(bs.INDEX_PREFIX):
            continue
        batch_part = filename[prefix_len:]
        # Parsing both numbers also validates the filename format.
        batch_begin, _batch_end = map(int, batch_part.split('-'))
        batches[batch_begin] = batch_part

    # Fail with a readable message rather than letting np.concatenate raise
    # on an empty list.
    self.assertTrue(
        batches, 'No index files found in {}'.format(self.loc))

    index_chunks = []
    value_chunks = []
    for batch_begin in sorted(batches):
        batch_part = batches[batch_begin]
        index_path = os.path.join(self.loc, bs.INDEX_PREFIX + batch_part)
        value_path = os.path.join(self.loc, bs.VALUE_PREFIX + batch_part)

        index_chunk = np.fromfile(index_path, dtype=np.int32).reshape(
            (-1, bs.INDEX_FILE_NCOLS))
        # unittest assertion instead of a bare assert, so the check still
        # runs under `python -O`.
        self.assertLessEqual(len(index_chunk), bs.BATCH_SIZE)

        index_chunks.append(index_chunk)
        value_chunks.append(np.fromfile(value_path, dtype=np.float32))

    index_arr = np.concatenate(index_chunks)
    value_arr = np.concatenate(value_chunks)

    self.assertEqual(len(index_arr), len(self.ids))
    self.assertTrue(np.allclose(self.sorted_ids, index_arr[:, 0]))

    arrs_ravel = np.concatenate([x.ravel() for x in self.sorted_arrs])
    self.assertTrue(np.allclose(value_arr, arrs_ravel))

    for sid, arr, stored_index in zip(self.sorted_ids, self.sorted_arrs,
                                      index_arr):
        # Row layout: id, two unchecked columns, dim0, dim1.
        stored_id, _, _, stored_dim0, stored_dim1 = stored_index
        self.assertEqual(sid, stored_id)
        # A missing dimension is expected to be stored as 0.
        self.assertEqual(stored_dim0, arr.shape[0] if arr.ndim >= 1 else 0)
        self.assertEqual(stored_dim1, arr.shape[1] if arr.ndim == 2 else 0)
        self.assertEqual(
            max(1, stored_dim0) * max(stored_dim1, 1), np.size(arr))
def extract_rawdata(ids, features):
    """Collect, for every id, the values retrieved for each feature.

    Args:
        ids: ids to look up; also fixes the order of the returned rows.
        features: objects with a `name` attribute, used to fill in the
            storage location template.

    Returns:
        A list with one entry per element of `ids`; each entry is the list
        of values gathered for that id, in `features` order.
    """
    loc_template = get_storage_loc_template()
    values_by_id = {sid: [] for sid in ids}

    for feature in features:
        storage_loc = loc_template.format(feature.name)
        with tictoc('{}'.format(feature.name)):
            retrieved = bs.retrieve(ids, storage_loc)
            # Values are paired with ids positionally.
            for sid, value in zip(ids, retrieved):
                values_by_id[sid].append(value)

    return [values_by_id[sid] for sid in ids]
def extract_rawdata(f2bs, ids, features):
    """Collect, for every id, the values retrieved for each feature.

    Args:
        f2bs: mapping from a feature to its (index_filename, value_filename)
            pair.
        ids: ids to look up; also fixes the order of the returned rows.
        features: features to retrieve; each must be a key of `f2bs` and
            have a `name` attribute.

    Returns:
        A list with one entry per element of `ids`; each entry is the list
        of values gathered for that id, in `features` order.
    """
    values_by_id = {sid: [] for sid in ids}

    for feature in features:
        index_filename, value_filename = f2bs[feature]
        with tictoc('{}'.format(feature.name)):
            retrieved = binstorage.retrieve(ids, index_filename,
                                            value_filename)
            # Values are paired with ids positionally.
            for sid, value in zip(ids, retrieved):
                values_by_id[sid].append(value)

    return [values_by_id[sid] for sid in ids]
def test_scipy_stft(self):
    """Time scipy's STFT on the fixture signal and compare f, t and s with the fixtures."""
    with tictoc('test_scipy_stft'):
        f, t, s = signal.stft(
            self.sig,
            fs=self.fs,
            window=self.window,
            nperseg=self.window_size,
            noverlap=self.noverlap,
            nfft=self.nfft,
            return_onesided=True,
            boundary=None,
            padded=False,
        )

        # Scipy's STFT is unscaled, whereas Matlab's and librosa's are, so
        # multiply by the window sum before comparing.
        window_sum = self.window.sum()
        s *= window_sum

        for actual, expected in ((f, self.f), (t, self.t), (s, self.s)):
            self.assertTrue(np.allclose(actual, expected))
def _test_retrieve(self, nselected):
    """Retrieve a random subset of `nselected` ids and compare with the stored arrays.

    The ids are a shuffled copy of self.ids cut to `nselected`; the expected
    array for each id is found through its first position in self.ids.
    """
    shuffled_ids = copy.deepcopy(self.ids)
    np.random.shuffle(shuffled_ids)
    selected_ids = shuffled_ids[:nselected]

    expected_arrs = []
    for sid in selected_ids:
        position = np.where(self.ids == sid)[0][0]
        expected_arrs.append(self.arrs[position])

    with tictoc('Test retrieving {} items'.format(nselected)):
        retrieved_arrs = bs.retrieve(selected_ids, self.index_filename,
                                     self.value_filename)

    self.assertEqual(len(selected_ids), len(retrieved_arrs))

    # Lengths were just asserted equal, so pairwise iteration covers every
    # selected id.
    for expected, retrieved in zip(expected_arrs, retrieved_arrs):
        self.assertTrue(np.allclose(expected, retrieved))
def _test_store(self):
    """Store self.ids / self.arrs and verify the index and value files.

    Checks the two file sizes against the expected byte counts, then checks
    each index row (id, begin, end, dim0, dim1) and the flattened values
    against self.ids and self.arrs.
    """
    with tictoc('Test storing'):
        bs.store(self.ids, self.arrs, self.index_filename,
                 self.value_filename)

    # Index entries are read back as int32 and values as float32: 4 bytes each.
    item_nbytes = 4
    total_values = sum(np.size(arr) for arr in self.arrs)

    expected_index_bytes = len(self.ids) * bs.INDEX_FILE_NCOLS * item_nbytes
    expected_value_bytes = total_values * item_nbytes

    self.assertEqual(os.path.getsize(self.index_filename),
                     expected_index_bytes)
    self.assertEqual(os.path.getsize(self.value_filename),
                     expected_value_bytes)

    with open(self.index_filename, 'rb') as index_file:
        flat_index = np.fromfile(index_file, dtype=np.int32)

    nids = len(flat_index) // bs.INDEX_FILE_NCOLS
    self.assertEqual(nids, len(self.ids))

    index_rows = flat_index.reshape((nids, bs.INDEX_FILE_NCOLS))
    for position, row in enumerate(index_rows):
        expected_id = self.ids[position]
        arr = self.arrs[position]
        arr_size = np.size(arr)

        stored_id, beg, end, dim0, dim1 = row

        self.assertEqual(expected_id, stored_id)
        self.assertEqual(end - beg, arr_size)
        # A missing dimension is expected to be stored as 0.
        self.assertEqual(dim0, arr.shape[0] if arr.ndim >= 1 else 0)
        self.assertEqual(dim1, arr.shape[1] if arr.ndim == 2 else 0)
        self.assertEqual(max(1, dim0) * max(dim1, 1), arr_size)

    with open(self.value_filename, 'rb') as value_file:
        stored_values = np.fromfile(value_file, dtype=np.float32)

    self.assertEqual(len(stored_values), total_values)

    expected_values = np.concatenate([arr.ravel() for arr in self.arrs])
    self.assertTrue(np.allclose(stored_values, expected_values))
def _test_retrieve(self, nselected, shuffle=True):
    """Retrieve the first `nselected` ids and compare with the stored arrays.

    Args:
        nselected: number of ids to retrieve.
        shuffle: when true, a copy of self.ids is shuffled before the first
            `nselected` are taken; otherwise they are taken in the existing
            order.
    """
    selected_ids = copy.deepcopy(self.ids)
    if shuffle:
        np.random.shuffle(selected_ids)
    selected_ids = selected_ids[:nselected]

    # The expected array for each id is located by its first position in
    # self.ids.
    expected_arrs = [
        self.arrs[np.where(self.ids == sid)[0][0]] for sid in selected_ids
    ]

    with tictoc('Test retrieving {} items shuffle={}'.format(
            nselected, shuffle)):
        retrieved_arrs = bs.retrieve(selected_ids, self.loc)

    self.assertEqual(len(selected_ids), len(retrieved_arrs))

    # Lengths were just asserted equal, so pairwise iteration covers every
    # selected id.
    for expected, retrieved in zip(expected_arrs, retrieved_arrs):
        # Only the comparison is guarded, so the assertion itself can never
        # be silently swallowed.
        try:
            is_close = np.allclose(expected, retrieved)
        except TypeError:
            # NOTE(review): pairs that np.allclose cannot compare are
            # skipped, as before; presumably this covers non-numeric
            # arrays - confirm whether they deserve an exact-equality check.
            continue
        self.assertTrue(is_close)
def _testxcorr2(self, template, image, result):
    """Time normxcorr2(template, image) and check the output is close to `result`."""
    label = 'Template size = {}, image size = {}'.format(
        template.shape, image.shape)
    with tictoc(label):
        computed = normxcorr2(template, image)

    self.assertTrue(np.allclose(computed, result))