def extract_rawdata(f2bs, fa2bs, ids, features, aggregators):
    """Retrieve stored feature values for ``ids`` and assemble them into one matrix.

    Fixed-length features are read directly from their binary storage files
    (looked up in ``f2bs``); variable-length features are read once per
    aggregator (looked up in ``fa2bs``). Each retrieved set of rows is stacked
    and concatenated column-wise, and the column span of every feature (or
    feature/aggregator pair) is recorded.

    :param f2bs: maps feature -> (index_filename, value_filename)
    :param fa2bs: maps feature -> aggregator -> (index_filename, value_filename)
    :param ids: sequence of ids to retrieve, defines the row order
    :param features: iterable of feature objects (with .is_fixed_length, .name)
    :param aggregators: iterable of aggregator objects (with .name)
    :return: (rawdata, col_inds) where rawdata is a 2-D array of shape
             (len(ids), total_cols) and col_inds maps a column-group name to
             its (start, end) column range in rawdata
    """
    rawdata = []
    col_inds = {}
    col_inds_start = 0

    def _retrieve_block(index_filename, value_filename, col_name):
        # Shared bookkeeping for both branches: retrieve, stack, append,
        # and record this block's column span. Mutates enclosing state.
        nonlocal col_inds_start
        rawdata_ = binstorage.retrieve(ids, index_filename, value_filename, flat=True)
        rawdata_stacked = np.stack(rawdata_)
        rawdata.append(rawdata_stacked)
        ncols = rawdata_stacked.shape[1]
        col_inds[col_name] = (col_inds_start, col_inds_start + ncols)
        col_inds_start += ncols

    for feature in features:
        if feature.is_fixed_length:
            index_filename, value_filename = f2bs[feature]
            _retrieve_block(index_filename, value_filename, feature.name)
        else:
            # Variable-length features are stored per-aggregator; one column
            # group per (feature, aggregator) pair.
            for aggregator in aggregators:
                index_filename, value_filename = fa2bs[feature][aggregator]
                _retrieve_block(index_filename, value_filename,
                                '{}_{}'.format(feature.name, aggregator.name))

    rawdata = np.concatenate(rawdata, axis=1)
    return rawdata, col_inds
def _test_update(self, nupdate):
    """Store a mix of new and existing ids, then verify everything reads back."""
    _, fresh_arrs = create_random_id_based_dataset()
    npoints = NUM_POINTS
    lookup = {key: arr for key, arr in zip(self.ids, self.arrs)}

    # We want to make sure there are new ids (to be appended) and old ids (to be updated)
    while True:
        candidate_ids = np.arange(npoints * 10)
        np.random.shuffle(candidate_ids)
        candidate_ids = candidate_ids[:nupdate]
        unseen = np.array([key for key in candidate_ids if key not in self.ids])
        if 0 < len(unseen) < npoints:
            break

    for key, arr in zip(candidate_ids, fresh_arrs):
        lookup[key] = arr

    # Rebuild the id list (shuffled) and realign the stored arrays with it.
    self.ids = np.array(list(lookup.keys()))
    np.random.shuffle(self.ids)
    self.arrs = [lookup[key] for key in self.ids]

    with tictoc('Test update {} items'.format(nupdate)):
        bs.store(candidate_ids, fresh_arrs, self.index_filename, self.value_filename)

    retrieved_arrs = bs.retrieve(self.ids, self.index_filename, self.value_filename)
    for key, retrieved_arr in zip(self.ids, retrieved_arrs):
        self.assertTrue(np.allclose(lookup[key], retrieved_arr))
def convert(olddir, newdir):
    """Migrate one old-format (v1) binary store into the new (v3) layout.

    A '.converted' marker file in ``newdir`` makes the operation idempotent:
    an already-converted directory is left untouched.
    """
    source_index = olddir + '.idx'
    source_values = olddir + '.val'

    ids = bs1.retrieve_ids(source_index)
    arrs = bs1.retrieve(ids, source_index, source_values)

    mkdirp(newdir)
    marker = os.path.join(newdir, '.converted')
    if not os.path.isfile(marker):
        try:
            bs3.store(ids, arrs, newdir)
            with open(marker, 'w') as f:
                f.write('done')
        except AssertionError:
            # Best-effort: report and move on, leaving no marker so a retry
            # will attempt this directory again.
            print('Error converting {}'.format(olddir))
def extract_rawdata(f2bs, ids, features):
    """Retrieve the stored values of each feature for every id.

    :param f2bs: maps feature -> (index_filename, value_filename)
    :param ids: sequence of ids to retrieve, defines the output row order
    :param features: iterable of feature objects (with .name)
    :return: list aligned with ``ids``; each element is the list of that id's
             feature values, in ``features`` order
    """
    # Removed leftover commented-out debug override of `ids`.
    # `id_` avoids shadowing the builtin `id`.
    data_by_id = {id_: [] for id_ in ids}

    for feature in features:
        index_filename, value_filename = f2bs[feature]
        with tictoc('{}'.format(feature.name)):
            feature_values = binstorage.retrieve(ids, index_filename, value_filename)
            for id_, feature_value in zip(ids, feature_values):
                data_by_id[id_].append(feature_value)

    # Reassemble in the caller-supplied id order.
    return [data_by_id[id_] for id_ in ids]
def _test_retrieve(self, nselected):
    """Retrieve a random subset of stored ids and verify the returned arrays."""
    chosen_ids = copy.deepcopy(self.ids)
    np.random.shuffle(chosen_ids)
    chosen_ids = chosen_ids[:nselected]

    # Map each chosen id back to its position in self.ids to find the
    # expected array.
    positions = [np.where(self.ids == key)[0][0] for key in chosen_ids]
    expected_arrs = [self.arrs[pos] for pos in positions]

    with tictoc('Test retrieving {} items'.format(nselected)):
        retrieved_arrs = bs.retrieve(chosen_ids, self.index_filename, self.value_filename)

    self.assertEqual(len(chosen_ids), len(retrieved_arrs))
    for expected_arr, retrieved_arr in zip(expected_arrs, retrieved_arrs):
        self.assertTrue(np.allclose(expected_arr, retrieved_arr))
def _test_retrieve_error(self):
    """Retrieving ids that were never stored must raise ValueError."""
    # Offset well past the stored id range so none of these can exist.
    bogus_ids = np.random.randint(100, size=5) + NUM_POINTS * 10
    with self.assertRaises(ValueError):
        bs.retrieve(bogus_ids, self.index_filename, self.value_filename)
def aggregate_feature_values(ptask, sids, f2bs, fa2bs, features, aggregators):
    """
    Compress all feature sequences into fixed-length vectors by running each
    aggregator over the stored variable-length feature values, then persist
    the aggregated results back to binary storage.

    :param ptask: progress-task object (start/tick/wrapping_up are called on it)
    :param sids: segment ids whose features are to be aggregated
    :param f2bs: maps feature -> (index_filename, value_filename) of raw values
    :param fa2bs: maps feature -> aggregator -> (index_filename, value_filename)
                  of aggregated values
    :param features: feature objects to process (non-empty)
    :param aggregators: aggregator objects to apply to non-fixed-length features
    :return: None
    """
    if features is None or len(features) == 0:
        raise Exception('must provide non-empty list of features')

    # Group segments by duration (annotated from start/end times) so that
    # same-length segments are processed together.
    segment_info = Segment.objects\
        .filter(id__in=sids)\
        .annotate(duration=F('end_time_ms') - F('start_time_ms')).order_by('duration')
    attrs = segment_info.values_list('tid', 'duration', 'audio_file__fs')

    # duration -> [list of tids, list of sampling rates], built in parallel.
    duration2segs = {}
    for tid, duration, fs in attrs:
        if duration not in duration2segs:
            segs = [[], []]
            duration2segs[duration] = segs
        else:
            segs = duration2segs[duration]
        segs[0].append(tid)
        segs[1].append(fs)

    # Shared kwargs passed to chirpy aggregators; 'fs' and 'nsamples' are
    # filled in per segment below. nfft/noverlap/win_length come from module
    # scope (defined outside this view).
    args = dict(nfft=nfft, noverlap=noverlap, wav_file_path=None, start=None,
                end=None, win_length=win_length, center=False)

    # Work out which (feature, tid) pairs still need aggregating: anything not
    # already present in the aggregated storage index is queued as a job.
    n_calculations = 0
    jobs = {}
    for duration, (tids, fss) in duration2segs.items():
        tids = np.array(tids, dtype=np.int32)
        fss = np.array(fss, dtype=np.int32)
        jobs[duration] = {}
        for feature in features:
            f_idf, f_vlf = f2bs[feature]
            for aggregator in aggregators:
                fa_idf, fa_vlf = fa2bs[feature][aggregator]
                existing_tids = binstorage.retrieve_ids(fa_idf)
                # sort_order is unused; np.unique is used here for its sorted,
                # deduplicated output to test membership against.
                sorted_ids, sort_order = np.unique(existing_tids, return_index=True)

                non_existing_idx = np.where(
                    np.logical_not(np.isin(tids, sorted_ids)))
                _tids = tids[non_existing_idx]
                _fss = fss[non_existing_idx]

                n_calculations += len(_tids)
                # NOTE(review): recorded once per feature (last aggregator's
                # missing set wins) — confirm all aggregators share the same
                # missing-tid set for a given feature.
                jobs[duration][feature] = (_tids, _fss, f_idf, f_vlf)

    if not n_calculations:
        ptask.wrapping_up()
        return

    ptask.start(limit=n_calculations)

    # feature -> tid -> {aggregator: aggregated value}
    result_by_ft = {}
    for duration, ftjobs in jobs.items():
        for feature, (_tids, _fss, f_idf, f_vlf) in ftjobs.items():
            if feature not in result_by_ft:
                result_by_tid = {}
                result_by_ft[feature] = result_by_tid
            else:
                result_by_tid = result_by_ft[feature]

            # Pull the raw (un-aggregated) feature values for the missing tids.
            values = binstorage.retrieve(_tids, f_idf, f_vlf)

            for tid, fs, value in zip(_tids, _fss, values):
                args['fs'] = fs
                result_by_agg = {}
                result_by_tid[tid] = result_by_agg
                # Only variable-length features need aggregating; fixed-length
                # ones are skipped here and again in the storing loop below.
                if not feature.is_fixed_length:
                    if value.ndim == 2:
                        nframes = value.shape[1]
                    else:
                        nframes = value.shape[0]

                    # Minimum number of audio samples that could have produced
                    # this many frames; stepsize comes from module scope.
                    min_nsamples = nfft + (nframes - 1) * stepsize
                    args['nsamples'] = min_nsamples

                    for aggregator in aggregators:
                        # Chirpy aggregators need the STFT context in `args`.
                        if aggregator.is_chirpy():
                            aggregated = aggregator.process(value, args=args, feature=feature)
                        else:
                            aggregated = aggregator.process(value)
                        result_by_agg[aggregator] = aggregated

                    ptask.tick()

    ptask.wrapping_up()

    # Persist aggregated values: regroup per (feature, aggregator) into
    # parallel tid/value lists and store each batch.
    for feature in features:
        if feature.is_fixed_length:
            continue
        result_by_tid = result_by_ft[feature]
        agg2tids = {aggregator: ([], []) for aggregator in aggregators}

        for tid, result_by_agg in result_by_tid.items():
            for aggregator, val in result_by_agg.items():
                agg2tids[aggregator][0].append(tid)
                agg2tids[aggregator][1].append(val)

        for aggregator, (tids, vals) in agg2tids.items():
            tids = np.array(tids)
            fa_idf, fa_vlf = fa2bs[feature][aggregator]
            binstorage.store(tids, vals, fa_idf, fa_vlf)