Example #1
    def _test_update(self, nupdate):
        _, new_arrs = create_random_id_based_dataset()
        npoints = NUM_POINTS

        id2arr = {x: y for x, y in zip(self.ids, self.arrs)}

        # We want to make sure there are new ids (to be appended) and old ids (to be updated)
        while True:
            new_ids = np.arange(npoints * 10)
            np.random.shuffle(new_ids)
            new_ids = new_ids[:nupdate]
            nnew = np.array([x for x in new_ids if x not in self.ids])
            if 0 < len(nnew) < nupdate:
                break

        for x, y in zip(new_ids, new_arrs):
            id2arr[x] = y

        self.ids = np.array(list(id2arr.keys()))
        np.random.shuffle(self.ids)

        self.arrs = [id2arr[i] for i in self.ids]

        with tictoc('Test update {} items'.format(nupdate)):
            bs.store(new_ids, new_arrs, self.index_filename,
                     self.value_filename)

        retrieved_arrs = bs.retrieve(self.ids, self.index_filename,
                                     self.value_filename)
        for id, retrieved_arr in zip(self.ids, retrieved_arrs):
            self.assertTrue(np.allclose(id2arr[id], retrieved_arr))
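
The update contract exercised by this test can be shown in isolation. A minimal sketch, assuming `binstorage` is importable as `bs` exactly as in the test, using throwaway file names: storing a mix of existing and unseen ids overwrites the former in place and appends the latter.

import numpy as np
import binstorage as bs

# Initial store: three ids, one float32 array each.
ids = np.array([1, 2, 3], dtype=np.int32)
arrs = [np.random.rand(2, 3).astype(np.float32) for _ in ids]
bs.store(ids, arrs, 'demo.idx', 'demo.val')

# Second store mixes an existing id (3) with a new one (4):
# id 3 is updated, id 4 is appended.
bs.store(np.array([3, 4], dtype=np.int32),
         [np.zeros((2, 3), dtype=np.float32),
          np.ones(5, dtype=np.float32)],
         'demo.idx', 'demo.val')

retrieved = bs.retrieve(np.array([3, 4], dtype=np.int32),
                        'demo.idx', 'demo.val')
assert np.allclose(retrieved[0], 0) and np.allclose(retrieved[1], 1)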
Example #2
def store_feature_values(ids, feature, values_arr):
    index_filename = data_path('binary/features',
                               '{}.idx'.format(feature.name),
                               for_url=False)
    value_filename = data_path('binary/features',
                               '{}.val'.format(feature.name),
                               for_url=False)

    ensure_parent_folder_exists(index_filename)
    binstorage.store(ids, values_arr, index_filename, value_filename)
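
A hypothetical usage sketch: only `feature.name` is used above, so a namedtuple stand-in is enough for illustration (the real argument would be a `Feature` model instance).

from collections import namedtuple

import numpy as np

# Stand-in for a Feature model instance; purely illustrative.
FakeFeature = namedtuple('FakeFeature', ['name'])
feature = FakeFeature(name='spectral_centroid')

ids = np.array([101, 102], dtype=np.int32)
values_arr = [np.random.rand(64, 10).astype(np.float32),
              np.random.rand(7).astype(np.float32)]

# Writes binary/features/spectral_centroid.idx and .val under the data root.
store_feature_values(ids, feature, values_arr)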
Example #3
    def _test_store(self):
        with tictoc('Test storing'):
            bs.store(self.ids, self.arrs, self.index_filename,
                     self.value_filename)
        index_filesize = os.path.getsize(self.index_filename)
        # each index record is INDEX_FILE_NCOLS int32 (4-byte) values
        index_memory_usage = len(self.ids) * bs.INDEX_FILE_NCOLS * 4

        value_filesize = os.path.getsize(self.value_filename)
        # values are stored as float32, also 4 bytes each
        value_memory_usage = sum([np.size(x) for x in self.arrs]) * 4

        self.assertEqual(index_filesize, index_memory_usage)
        self.assertEqual(value_filesize, value_memory_usage)

        with open(self.index_filename, 'rb') as f:
            index_arr = np.fromfile(f, dtype=np.int32)
            nids = len(index_arr) // bs.INDEX_FILE_NCOLS

            self.assertEqual(nids, len(self.ids))

            index_arr = index_arr.reshape((nids, bs.INDEX_FILE_NCOLS))
            for i in range(nids):
                id = self.ids[i]
                arr = self.arrs[i]

                arr_size = np.size(arr)
                id_, beg, end, dim0, dim1 = index_arr[i]

                self.assertEqual(id, id_)
                self.assertEqual(end - beg, arr_size)
                self.assertEqual(dim0, arr.shape[0] if arr.ndim >= 1 else 0)
                self.assertEqual(dim1, arr.shape[1] if arr.ndim == 2 else 0)
                self.assertEqual(max(1, dim0) * max(dim1, 1), arr_size)

        with open(self.value_filename, 'rb') as f:
            value_arr = np.fromfile(f, dtype=np.float32)
            self.assertEqual(len(value_arr),
                             sum([np.size(arr) for arr in self.arrs]))

            arrs_ravel = np.concatenate([x.ravel() for x in self.arrs])
            self.assertTrue(np.allclose(value_arr, arrs_ravel))
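
The assertions above pin down the on-disk layout: each index record holds `INDEX_FILE_NCOLS` int32 values `(id, beg, end, dim0, dim1)`, `value_arr[beg:end]` is the flattened data, and `dim0 == dim1 == 0` encodes a scalar. A minimal reader built on just that layout (a sketch, assuming the same `bs` module) could look like this:

import numpy as np
import binstorage as bs

def read_all(index_filename, value_filename):
    # One row per stored id: (id, beg, end, dim0, dim1), all int32.
    index_arr = np.fromfile(index_filename, dtype=np.int32)
    index_arr = index_arr.reshape((-1, bs.INDEX_FILE_NCOLS))
    value_arr = np.fromfile(value_filename, dtype=np.float32)

    id2arr = {}
    for id_, beg, end, dim0, dim1 in index_arr:
        flat = value_arr[beg:end]
        if dim1 > 0:        # two-dimensional array
            id2arr[id_] = flat.reshape((dim0, dim1))
        elif dim0 > 0:      # one-dimensional array
            id2arr[id_] = flat
        else:               # dim0 == dim1 == 0: a scalar
            id2arr[id_] = flat[0]
    return id2arr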
Example #4
def extract_database_measurements(arg=None, force=False):
    if isinstance(arg, int):
        task = get_or_wait(arg)
    else:
        task = arg
    runner = TaskRunner(task)
    try:
        runner.preparing()

        if isinstance(task, Task):
            cls, dm_id = task.target.split(':')
            dm_id = int(dm_id)
            assert cls == DataMatrix.__name__
            dm = DataMatrix.objects.get(id=dm_id)

            if dm.database:
                segments = Segment.objects.filter(
                    audio_file__database=dm.database)
                sids = segments.values_list('id', flat=True)
            else:
                sids = dm.tmpdb.ids
            features_hash = dm.features_hash
            aggregations_hash = dm.aggregations_hash
        else:
            sids = task.sids
            features_hash = task.features_hash
            aggregations_hash = task.aggregations_hash

        features = Feature.objects.filter(id__in=features_hash.split('-'))
        aggregations = Aggregation.objects.filter(
            id__in=aggregations_hash.split('-'))
        aggregators = [aggregator_map[x.name] for x in aggregations]

        # feature to binstorage's files
        f2bs = {}
        # feature+aggregation to binstorage's files
        fa2bs = {}

        for feature in features:
            feature_name = feature.name
            index_filename = data_path('binary/features',
                                       '{}.idx'.format(feature_name),
                                       for_url=False)
            value_filename = data_path('binary/features',
                                       '{}.val'.format(feature_name),
                                       for_url=False)
            f2bs[feature] = (index_filename, value_filename)

            if feature not in fa2bs:
                fa2bs[feature] = {}

            for aggregator in aggregators:
                aggregator_name = aggregator.get_name()
                folder = os.path.join('binary', 'features', feature_name)
                mkdirp(os.path.join(settings.MEDIA_URL, folder)[1:])

                index_filename = data_path(folder,
                                           '{}.idx'.format(aggregator_name),
                                           for_url=False)
                value_filename = data_path(folder,
                                           '{}.val'.format(aggregator_name),
                                           for_url=False)
                fa2bs[feature][aggregator] = (index_filename, value_filename)

        tids, f2tid2fvals = extract_segment_features_for_segments(
            runner, sids, features, f2bs, force)

        for feature, (index_filename, value_filename) in f2bs.items():
            _tids, _fvals = f2tid2fvals.get(feature, (None, None))
            if _tids:
                _tids = np.array(_tids, dtype=np.int32)
                ensure_parent_folder_exists(index_filename)
                binstorage.store(_tids, _fvals, index_filename, value_filename)

        runner.wrapping_up()
        child_task = task.__class__(user=task.user, parent=task)
        child_task.save()
        child_runner = TaskRunner(child_task)
        child_runner.preparing()

        aggregate_feature_values(child_runner, sids, f2bs, fa2bs, features,
                                 aggregators)
        child_runner.complete()

        if isinstance(task, Task):
            full_sids_path = dm.get_sids_path()
            full_bytes_path = dm.get_bytes_path()
            full_cols_path = dm.get_cols_path()

            data, col_inds = extract_rawdata(f2bs, fa2bs, tids, features,
                                             aggregators)

            ndarray_to_bytes(data, full_bytes_path)
            ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

            with open(full_cols_path, 'w', encoding='utf-8') as f:
                json.dump(col_inds, f)

            dm.ndims = data.shape[1]
            dm.save()
        runner.complete()

    except Exception as e:
        runner.error(e)
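
For orientation, the file-naming scheme captured by `f2bs` and `fa2bs` above is: raw feature values live at `binary/features/<feature>.idx/.val`, aggregated values at `binary/features/<feature>/<aggregator>.idx/.val`. A simplified stand-in for `data_path` (illustrative only, ignoring the media root it would normally prepend):

import os

def feature_paths(feature_name, aggregator_name=None, root='binary/features'):
    # Raw values:  binary/features/<feature>.idx / .val
    # Aggregated:  binary/features/<feature>/<aggregator>.idx / .val
    if aggregator_name is None:
        stem = os.path.join(root, feature_name)
    else:
        stem = os.path.join(root, feature_name, aggregator_name)
    return stem + '.idx', stem + '.val'

print(feature_paths('spectral_centroid'))
print(feature_paths('spectral_centroid', 'mean'))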
Example #5
def aggregate_feature_values(ptask, sids, f2bs, fa2bs, features, aggregators):
    """
    Compress all feature sequences into fixed-length vectors
    :param sid_to_label:
    :param h5file:
    :param features:
    :return:
    """
    if features is None or len(features) == 0:
        raise Exception('must provide non-empty list of features')

    segment_info = Segment.objects\
        .filter(id__in=sids)\
        .annotate(duration=F('end_time_ms') - F('start_time_ms')).order_by('duration')

    attrs = segment_info.values_list('tid', 'duration', 'audio_file__fs')

    duration2segs = {}
    for tid, duration, fs in attrs:
        if duration not in duration2segs:
            segs = [[], []]
            duration2segs[duration] = segs
        else:
            segs = duration2segs[duration]
        segs[0].append(tid)
        segs[1].append(fs)

    args = dict(nfft=nfft,
                noverlap=noverlap,
                wav_file_path=None,
                start=None,
                end=None,
                win_length=win_length,
                center=False)

    n_calculations = 0

    jobs = {}
    for duration, (tids, fss) in duration2segs.items():
        tids = np.array(tids, dtype=np.int32)
        fss = np.array(fss, dtype=np.int32)

        jobs[duration] = {}

        for feature in features:
            f_idf, f_vlf = f2bs[feature]

            for aggregator in aggregators:
                fa_idf, fa_vlf = fa2bs[feature][aggregator]

                existing_tids = binstorage.retrieve_ids(fa_idf)
                sorted_ids = np.unique(existing_tids)

                non_existing_idx = np.where(
                    np.logical_not(np.isin(tids, sorted_ids)))
                _tids = tids[non_existing_idx]
                _fss = fss[non_existing_idx]

                n_calculations += len(_tids)

                jobs[duration][feature] = (_tids, _fss, f_idf, f_vlf)

    if not n_calculations:
        ptask.wrapping_up()
        return

    ptask.start(limit=n_calculations)
    result_by_ft = {}
    for duration, ftjobs in jobs.items():
        for feature, (_tids, _fss, f_idf, f_vlf) in ftjobs.items():
            if feature not in result_by_ft:
                result_by_tid = {}
                result_by_ft[feature] = result_by_tid
            else:
                result_by_tid = result_by_ft[feature]

            values = binstorage.retrieve(_tids, f_idf, f_vlf)

            for tid, fs, value in zip(_tids, _fss, values):
                args['fs'] = fs
                result_by_agg = {}
                result_by_tid[tid] = result_by_agg

                if not feature.is_fixed_length:
                    if value.ndim == 2:
                        nframes = value.shape[1]
                    else:
                        nframes = value.shape[0]

                    min_nsamples = nfft + (nframes - 1) * stepsize
                    args['nsamples'] = min_nsamples

                    for aggregator in aggregators:
                        if aggregator.is_chirpy():
                            aggregated = aggregator.process(value,
                                                            args=args,
                                                            feature=feature)
                        else:
                            aggregated = aggregator.process(value)

                        result_by_agg[aggregator] = aggregated
                        ptask.tick()

    ptask.wrapping_up()
    for feature in features:
        if feature.is_fixed_length:
            continue
        result_by_tid = result_by_ft[feature]
        agg2tids = {aggregator: ([], []) for aggregator in aggregators}

        for tid, result_by_agg in result_by_tid.items():
            for aggregator, val in result_by_agg.items():
                agg2tids[aggregator][0].append(tid)
                agg2tids[aggregator][1].append(val)

        for aggregator, (tids, vals) in agg2tids.items():
            tids = np.array(tids)
            fa_idf, fa_vlf = fa2bs[feature][aggregator]
            binstorage.store(tids, vals, fa_idf, fa_vlf)
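
The filtering step near the top of this function, which decides which tids still need to be aggregated, works as follows on toy data (ids and sampling rates invented for illustration):

import numpy as np

tids = np.array([10, 11, 12, 13], dtype=np.int32)
fss = np.array([44100, 44100, 48000, 48000], dtype=np.int32)

# Suppose aggregated values already exist for ids 11 and 13.
existing_tids = np.array([13, 11], dtype=np.int32)
sorted_ids = np.unique(existing_tids)

# Keep only the tids (and their matching sampling rates) not yet stored.
non_existing_idx = np.where(np.logical_not(np.isin(tids, sorted_ids)))
print(tids[non_existing_idx])  # [10 12]
print(fss[non_existing_idx])   # [44100 48000]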