def _test_update(self, nupdate):
    """
    Store ``nupdate`` randomly drawn ids, then check every known array can be retrieved.

    The drawn ids are taken from ``range(NUM_POINTS * 10)``; drawing is repeated until at
    least one of them is not yet in ``self.ids``. After the update, ``self.ids`` and
    ``self.arrs`` are replaced by a shuffled view of the merged dataset and every entry is
    compared against what ``bs.retrieve`` returns.

    :param nupdate: number of ids passed to ``bs.store`` in the update
    """
    _, new_arrs = create_random_id_based_dataset()
    npoints = NUM_POINTS

    id2arr = dict(zip(self.ids, self.arrs))

    # We want to make sure there are new ids (to be appended) and old ids (to be updated)
    # NOTE(review): the condition below only guarantees that at least one id is new; it does
    # not by itself guarantee that an already-stored id is included - confirm this is intended.
    while True:
        new_ids = np.arange(npoints * 10)
        np.random.shuffle(new_ids)
        new_ids = new_ids[:nupdate]
        # Single vectorised membership test instead of a Python-level scan of self.ids
        # for every candidate id.
        n_unseen = np.count_nonzero(np.isin(new_ids, self.ids, invert=True))
        if 0 < n_unseen < npoints:
            break

    # NOTE(review): zip() stops at the shorter input, so this assumes new_arrs holds at
    # least nupdate arrays - confirm against create_random_id_based_dataset.
    for new_id, new_arr in zip(new_ids, new_arrs):
        id2arr[new_id] = new_arr

    self.ids = np.array(list(id2arr.keys()))
    np.random.shuffle(self.ids)

    self.arrs = [id2arr[i] for i in self.ids]

    with tictoc('Test update {} items'.format(nupdate)):
        bs.store(new_ids, new_arrs, self.index_filename, self.value_filename)

    retrieved_arrs = bs.retrieve(self.ids, self.index_filename, self.value_filename)
    for stored_id, retrieved_arr in zip(self.ids, retrieved_arrs):
        self.assertTrue(np.allclose(id2arr[stored_id], retrieved_arr))
def store_feature_values(ids, feature, values_arr):
    """
    Store the values of one feature in its index/value file pair.

    The two files are ``<feature.name>.idx`` and ``<feature.name>.val`` resolved through
    ``data_path`` under ``binary/features``. The parent folder of the index file is ensured
    to exist before ``binstorage.store`` is called.

    :param ids: ids passed to ``binstorage.store``
    :param feature: object whose ``name`` attribute determines the file names
    :param values_arr: values passed to ``binstorage.store`` alongside ``ids``
    """
    idx_path = data_path('binary/features', '{}.idx'.format(feature.name), for_url=False)
    val_path = data_path('binary/features', '{}.val'.format(feature.name), for_url=False)

    ensure_parent_folder_exists(idx_path)
    binstorage.store(ids, values_arr, idx_path, val_path)
def _test_store(self):
    """
    Store ``self.ids``/``self.arrs`` and verify the bytes written to both files.

    Checks, in order: the file sizes (4 bytes per index cell and per value), each index row
    (id, begin/end offsets, the two dimension columns) against the corresponding array, and
    finally the value file against the concatenation of all raveled arrays.
    """
    with tictoc('Test storing'):
        bs.store(self.ids, self.arrs, self.index_filename, self.value_filename)

    ncols = bs.INDEX_FILE_NCOLS

    actual_index_bytes = os.path.getsize(self.index_filename)
    expected_index_bytes = len(self.ids) * ncols * 4
    actual_value_bytes = os.path.getsize(self.value_filename)
    expected_value_bytes = sum([np.size(x) for x in self.arrs]) * 4

    self.assertEqual(actual_index_bytes, expected_index_bytes)
    self.assertEqual(actual_value_bytes, expected_value_bytes)

    with open(self.index_filename, 'rb') as f:
        index_arr = np.fromfile(f, dtype=np.int32)

    nids = len(index_arr) // ncols
    self.assertEqual(nids, len(self.ids))

    # One row per stored id: (id, begin, end, dim0, dim1)
    index_arr = index_arr.reshape((nids, ncols))
    for row_number, row in enumerate(index_arr):
        expected_id = self.ids[row_number]
        arr = self.arrs[row_number]
        arr_size = np.size(arr)

        stored_id, beg, end, dim0, dim1 = row

        # A dimension column is 0 when the array does not have that axis
        expected_dim0 = arr.shape[0] if arr.ndim >= 1 else 0
        expected_dim1 = arr.shape[1] if arr.ndim == 2 else 0

        self.assertEqual(expected_id, stored_id)
        self.assertEqual(end - beg, arr_size)
        self.assertEqual(dim0, expected_dim0)
        self.assertEqual(dim1, expected_dim1)
        self.assertEqual(max(1, dim0) * max(dim1, 1), arr_size)

    with open(self.value_filename, 'rb') as f:
        value_arr = np.fromfile(f, dtype=np.float32)

    self.assertEqual(len(value_arr), sum([np.size(arr) for arr in self.arrs]))

    flattened = np.concatenate([x.ravel() for x in self.arrs])
    self.assertTrue(np.allclose(value_arr, flattened))
def extract_database_measurements(arg=None, force=False):
    """
    Extract feature values for a set of segments, aggregate them, and - when the task is a
    ``Task`` - hand the assembled data, segment ids and column indices to the writers for the
    target ``DataMatrix``.

    :param arg: either an int, which is passed to ``get_or_wait`` to obtain the task, or the
                task object itself
    :param force: forwarded unchanged to ``extract_segment_features_for_segments``
    :return: None. Any exception raised inside is caught and handed to ``runner.error``
             instead of propagating.
    """
    if isinstance(arg, int):
        task = get_or_wait(arg)
    else:
        task = arg
    runner = TaskRunner(task)
    try:
        runner.preparing()

        if isinstance(task, Task):
            # task.target is expected to look like "<class name>:<id>"
            cls, dm_id = task.target.split(':')
            dm_id = int(dm_id)
            # NOTE(review): assert statements are skipped when Python runs with -O
            assert cls == DataMatrix.__name__
            dm = DataMatrix.objects.get(id=dm_id)

            if dm.database:
                segments = Segment.objects.filter(
                    audio_file__database=dm.database)
                sids = segments.values_list('id', flat=True)
            else:
                sids = dm.tmpdb.ids
            features_hash = dm.features_hash
            aggregations_hash = dm.aggregations_hash
        else:
            # Any other task object must carry the ids and hashes itself
            sids = task.sids
            features_hash = task.features_hash
            aggregations_hash = task.aggregations_hash

        # Both hashes are '-'-separated lists of ids
        features = Feature.objects.filter(id__in=features_hash.split('-'))
        aggregations = Aggregation.objects.filter(
            id__in=aggregations_hash.split('-'))
        aggregators = [aggregator_map[x.name] for x in aggregations]

        # feature to binstorage's files
        f2bs = {}
        # feature+aggregation to binstorage's files
        fa2bs = {}

        for feature in features:
            feature_name = feature.name
            index_filename = data_path('binary/features',
                                       '{}.idx'.format(feature_name),
                                       for_url=False)
            value_filename = data_path('binary/features',
                                       '{}.val'.format(feature_name),
                                       for_url=False)
            f2bs[feature] = (index_filename, value_filename)

            if feature not in fa2bs:
                fa2bs[feature] = {}

            for aggregator in aggregators:
                aggregator_name = aggregator.get_name()
                folder = os.path.join('binary', 'features', feature_name)
                # [1:] drops the first character of the joined path - presumably the leading
                # '/' of MEDIA_URL so the folder is created relative to the working
                # directory; TODO confirm
                mkdirp(os.path.join(settings.MEDIA_URL, folder)[1:])

                index_filename = data_path(folder,
                                           '{}.idx'.format(aggregator_name),
                                           for_url=False)
                value_filename = data_path(folder,
                                           '{}.val'.format(aggregator_name),
                                           for_url=False)
                fa2bs[feature][aggregator] = (index_filename, value_filename)

        tids, f2tid2fvals = extract_segment_features_for_segments(
            runner, sids, features, f2bs, force)

        # Store the values returned for each feature; features missing from f2tid2fvals
        # (or with an empty id collection) are skipped.
        for feature, (index_filename, value_filename) in f2bs.items():
            _tids, _fvals = f2tid2fvals.get(feature, (None, None))
            # NOTE(review): this truthiness test assumes _tids is a list (or None); a NumPy
            # array with more than one element would raise ValueError here - confirm
            if _tids:
                _tids = np.array(_tids, dtype=np.int32)
                ensure_parent_folder_exists(index_filename)
                binstorage.store(_tids, _fvals, index_filename, value_filename)

        runner.wrapping_up()

        # Aggregation is tracked by a child task of the same class as the parent task
        child_task = task.__class__(user=task.user, parent=task)
        child_task.save()
        child_runner = TaskRunner(child_task)
        child_runner.preparing()

        aggregate_feature_values(child_runner, sids, f2bs, fa2bs, features,
                                 aggregators)
        child_runner.complete()

        if isinstance(task, Task):
            # dm was bound in the matching isinstance(task, Task) branch above
            full_sids_path = dm.get_sids_path()
            full_bytes_path = dm.get_bytes_path()
            full_cols_path = dm.get_cols_path()

            data, col_inds = extract_rawdata(f2bs, fa2bs, tids, features,
                                             aggregators)

            ndarray_to_bytes(data, full_bytes_path)
            ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

            with open(full_cols_path, 'w', encoding='utf-8') as f:
                json.dump(col_inds, f)

            # Number of columns of the assembled data
            dm.ndims = data.shape[1]
            dm.save()
        runner.complete()

    except Exception as e:
        # Report the failure through the runner rather than letting it propagate
        runner.error(e)
def aggregate_feature_values(ptask, sids, f2bs, fa2bs, features, aggregators):
    """
    Compress all feature sequences into fixed-length vectors

    Segments are grouped by duration. For each group the ids that are not yet present in the
    aggregated index files are looked up, their raw feature values are retrieved, every
    aggregator is applied to features that are not fixed-length, and the results are stored
    in the per-(feature, aggregator) file pairs.

    :param ptask: progress reporter; ``start(limit=...)``, ``tick()`` and ``wrapping_up()``
                  are called on it (presumably a TaskRunner - confirm against callers)
    :param sids: segment ids, matched against ``Segment.id``
    :param f2bs: maps feature -> (index file, value file) holding the raw feature values
    :param fa2bs: maps feature -> aggregator -> (index file, value file) for aggregated values
    :param features: non-empty collection of features; each must expose ``is_fixed_length``
    :param aggregators: aggregators exposing ``is_chirpy()`` and ``process(...)``
    :return: None
    :raises Exception: if ``features`` is None or empty
    """
    if features is None or len(features) == 0:
        raise Exception('must provide non-empty list of features')

    segment_info = (
        Segment.objects
        .filter(id__in=sids)
        .annotate(duration=F('end_time_ms') - F('start_time_ms'))
        .order_by('duration')
    )
    attrs = segment_info.values_list('tid', 'duration', 'audio_file__fs')

    # duration -> ([tids], [sampling rates]) of all segments having that duration
    duration2segs = {}
    for tid, duration, fs in attrs:
        seg_tids, seg_fss = duration2segs.setdefault(duration, ([], []))
        seg_tids.append(tid)
        seg_fss.append(fs)

    # Keyword bundle handed to chirpy aggregators; 'fs' and 'nsamples' are filled in per value
    args = dict(nfft=nfft, noverlap=noverlap, wav_file_path=None, start=None, end=None,
                win_length=win_length, center=False)

    n_calculations = 0
    jobs = {}

    # The aggregated index files are not modified while jobs are being planned, so each one
    # is read at most once (on first use) rather than once per duration group.
    existing_tids_by_index_file = {}

    for duration, (tids, fss) in duration2segs.items():
        tids = np.array(tids, dtype=np.int32)
        fss = np.array(fss, dtype=np.int32)

        jobs[duration] = {}

        for feature in features:
            f_idf, f_vlf = f2bs[feature]

            for aggregator in aggregators:
                fa_idf, _ = fa2bs[feature][aggregator]

                if fa_idf not in existing_tids_by_index_file:
                    existing_tids_by_index_file[fa_idf] = binstorage.retrieve_ids(fa_idf)
                existing_tids = existing_tids_by_index_file[fa_idf]

                # Keep only the ids that have no aggregated value stored yet
                is_missing = np.logical_not(np.isin(tids, existing_tids))
                _tids = tids[is_missing]
                _fss = fss[is_missing]

                n_calculations += len(_tids)

                # NOTE(review): this entry is overwritten on every aggregator iteration, so
                # only the ids missing for the LAST aggregator are processed, while
                # n_calculations sums over all aggregators. Kept as-is to preserve behaviour.
                jobs[duration][feature] = (_tids, _fss, f_idf, f_vlf)

    if not n_calculations:
        ptask.wrapping_up()
        return

    ptask.start(limit=n_calculations)

    # feature -> tid -> aggregator -> aggregated value
    result_by_ft = {}
    for ftjobs in jobs.values():
        for feature, (_tids, _fss, f_idf, f_vlf) in ftjobs.items():
            result_by_tid = result_by_ft.setdefault(feature, {})

            values = binstorage.retrieve(_tids, f_idf, f_vlf)

            for tid, fs, value in zip(_tids, _fss, values):
                args['fs'] = fs

                result_by_agg = {}
                result_by_tid[tid] = result_by_agg

                if not feature.is_fixed_length:
                    # The frame axis is the second one for 2-D values, the only one otherwise
                    nframes = value.shape[1] if value.ndim == 2 else value.shape[0]

                    # Smallest signal length that yields nframes frames
                    args['nsamples'] = nfft + (nframes - 1) * stepsize

                    for aggregator in aggregators:
                        if aggregator.is_chirpy():
                            aggregated = aggregator.process(value, args=args, feature=feature)
                        else:
                            aggregated = aggregator.process(value)

                        result_by_agg[aggregator] = aggregated
                        ptask.tick()

    ptask.wrapping_up()

    for feature in features:
        # Fixed-length features were never aggregated above
        if feature.is_fixed_length:
            continue

        result_by_tid = result_by_ft[feature]
        agg2tids = {aggregator: ([], []) for aggregator in aggregators}

        for tid, result_by_agg in result_by_tid.items():
            for aggregator, val in result_by_agg.items():
                agg2tids[aggregator][0].append(tid)
                agg2tids[aggregator][1].append(val)

        for aggregator, (agg_tids, vals) in agg2tids.items():
            agg_tids = np.array(agg_tids)
            fa_idf, fa_vlf = fa2bs[feature][aggregator]
            binstorage.store(agg_tids, vals, fa_idf, fa_vlf)