Example 1
    def _test_update(self, nupdate):
        _, arrs_for_update = create_random_id_based_dataset(nupdate)

        id2arr = {x: y for x, y in zip(self.ids, self.arrs)}

        # We want to make sure there are new ids (to be appended) and old ids (to be updated)
        nkeeps = nupdate // 2
        nnews = nupdate - nkeeps

        maxid = np.max(self.ids)
        new_ids = np.arange(maxid + 1, maxid + nnews + 1)
        keep_ids = self.ids[:nkeeps]

        ids_for_update = np.concatenate((keep_ids, new_ids))

        for x, y in zip(ids_for_update, arrs_for_update):
            id2arr[x] = y

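        # Rebuild self.ids / self.arrs to reflect the expected state after the update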
        self.ids = np.array(list(id2arr.keys()))
        np.random.shuffle(self.ids)

        self.arrs = [id2arr[i] for i in self.ids]

        with tictoc('Test update {} items'.format(nupdate)):
            bs.store(ids_for_update, arrs_for_update, self.loc)

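        # Every id (kept, updated or newly added) must come back with its latest array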
        retrieved_arrs = bs.retrieve(self.ids, self.loc)
        for id, retrieved_arr in zip(self.ids, retrieved_arrs):
            self.assertTrue(np.allclose(id2arr[id], retrieved_arr))
Example 2
def convert(olddir, newdir):
    ids, arrs = bs3.retrieve_raw(olddir)

    mkdirp(newdir)
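    # A '.converted' marker file prevents re-converting a directory that has already been done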
    if not os.path.isfile(os.path.join(newdir, '.converted')):
        try:
            bs3.store(ids, arrs, newdir)
            with open(os.path.join(newdir, '.converted'), 'w') as f:
                f.write('done')
        except AssertionError:
            print('Error converting {}'.format(olddir))
Example 3
    def _test_store(self):
        with tictoc('Test storing'):
            bs.store(self.ids, self.arrs, self.loc)

        index_arr = []
        value_arr = []

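        # Index files are named <INDEX_PREFIX><begin>-<end>; each has a matching value file with the same suffix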
        index_files = [
            x for x in os.listdir(self.loc) if x.startswith(bs.INDEX_PREFIX)
        ]
        batches = {}
        for index_file in index_files:
            batch_begin, batch_end = list(
                map(int, index_file[len(bs.INDEX_PREFIX):].split('-')))
            batches[batch_begin] = (batch_begin, batch_end, index_file)

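        # Read the batches in ascending order, accumulating int32 index rows and raw float32 values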
        batch_begins = sorted(list(batches.keys()))
        for batch_begin in batch_begins:
            batch_begin, batch_end, index_file = batches[batch_begin]

            batch_part = index_file[len(bs.INDEX_PREFIX):]
            index_file_path = os.path.join(self.loc, index_file)
            value_file_path = os.path.join(self.loc,
                                           bs.VALUE_PREFIX + batch_part)

            index_arr_ = np.fromfile(index_file_path, dtype=np.int32).reshape(
                (-1, bs.INDEX_FILE_NCOLS))
            assert len(index_arr_) <= bs.BATCH_SIZE
            value_arr_ = np.fromfile(value_file_path, dtype=np.float32)

            index_arr.append(index_arr_)
            value_arr.append(value_arr_)

        index_arr = np.concatenate(index_arr).reshape(
            (-1, bs.INDEX_FILE_NCOLS))
        value_arr = np.concatenate(value_arr)

        nids = len(index_arr)
        self.assertEqual(nids, len(self.ids))
        self.assertTrue(np.allclose(self.sorted_ids, index_arr[:, 0]))
        arrs_ravel = np.concatenate([x.ravel() for x in self.sorted_arrs])
        self.assertTrue(np.allclose(value_arr, arrs_ravel))

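        # For every stored index row: the id must match, the stored dimensions must match the
        # array's shape, and their product must equal the array's size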
        for id, arr, stored_index in zip(self.sorted_ids, self.sorted_arrs,
                                         index_arr):
            stored_id, _, _, stored_dim0, stored_dim1 = stored_index

            arr_size = np.size(arr)
            self.assertEqual(id, stored_id)
            self.assertEqual(stored_dim0, arr.shape[0] if arr.ndim >= 1 else 0)
            self.assertEqual(stored_dim1, arr.shape[1] if arr.ndim == 2 else 0)
            self.assertEqual(
                max(1, stored_dim0) * max(stored_dim1, 1), arr_size)
Example 4
def convert(olddir, newdir):
    old_index_file = olddir + '.idx'
    old_value_file = olddir + '.val'

    ids = bs1.retrieve_ids(old_index_file)
    arrs = bs1.retrieve(ids, old_index_file, old_value_file)

    mkdirp(newdir)
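    # A '.converted' marker file prevents re-converting a directory that has already been done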
    if not os.path.isfile(os.path.join(newdir, '.converted')):
        try:
            bs3.store(ids, arrs, newdir)
            with open(os.path.join(newdir, '.converted'), 'w') as f:
                f.write('done')
        except AssertionError:
            print('Error converting {}'.format(olddir))
Example 5
def extract_segment_features_for_segments(runner, sids, features, force=False):
    segments = Segment.objects.filter(id__in=sids)
    tids = np.array(segments.values_list('tid', flat=True), dtype=np.int32)

    if len(tids) == 0:
        return

    tid_min = tids.min()
    tid_max = tids.max()

    storage_loc_template = get_storage_loc_template()

    f2af2segments = {}
    n_calculations = 0

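    # For each feature, find the tids that are not yet in storage (unless force is set)
    # and group their segments by audio file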
    for feature in features:
        storage_loc = storage_loc_template.format(feature.name)
        mkdirp(storage_loc)

        if force:
            tids_target = tids
        else:
            existing_tids = bs.retrieve_ids(storage_loc, (tid_min, tid_max))
            sorted_ids, sort_order = np.unique(existing_tids,
                                               return_index=True)

            non_existing_idx = np.where(
                np.logical_not(np.isin(tids, sorted_ids)))
            missing_tids = tids[non_existing_idx]
            tids_target = missing_tids

        af_to_segments = {}

        vl = segments.filter(tid__in=tids_target).order_by('audio_file', 'start_time_ms')\
                     .values_list('tid', 'audio_file', 'start_time_ms', 'end_time_ms',
                                  'audio_file__database__nfft', 'audio_file__database__noverlap',
                                  'audio_file__database__lpf', 'audio_file__database__hpf')

        if len(vl):
            for tid, afid, start_time_ms, end_time_ms, nfft, noverlap, lpf, hpf in vl:
                if afid not in af_to_segments:
                    af_to_segments[afid] = []
                af_to_segments[afid].append((tid, start_time_ms, end_time_ms,
                                             nfft, noverlap, lpf, hpf))

            f2af2segments[feature] = af_to_segments
            n_calculations += len(tids_target)

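    # Extract features one audio file at a time, flushing results to storage in chunks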
    if n_calculations:
        runner.start(limit=n_calculations)
        for ind, (feature, af_to_segments) in enumerate(f2af2segments.items()):
            _tids = []
            _fvals = []
            storage_loc = storage_loc_template.format(feature.name)

            afids = list(af_to_segments.keys())
            af_lookup = {
                x.id: x
                for x in AudioFile.objects.filter(id__in=afids)
            }
            for afid, segs_info in af_to_segments.items():
                af = af_lookup[afid]
                wav_file_path = wav_path(af)
                try:
                    __tids, __fvals = extract_segment_feature_for_audio_file(
                        wav_file_path, segs_info, feature)
                except Exception as e:
                    raise Exception(
                        'Error extracting [{}] for file {}. Error message: {}'.
                        format(feature.name, af.name, str(e)))
                #
                _tids += __tids
                _fvals += __fvals

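                # Flush every 100 segments to keep memory usage bounded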
                if len(_tids) >= 100:
                    bs.store(_tids, _fvals, storage_loc)
                    runner.tick(len(_tids))
                    _tids = []
                    _fvals = []

            if len(_tids):
                bs.store(_tids, _fvals, storage_loc)
                runner.tick(len(_tids))
Example 6
def aggregate_feature_values(runner, tids, features, aggregators, force=False):
    """
    Compress all feature sequences into fixed-length vectors
    :param sid_to_label:
    :param h5file:
    :param features:
    :return:
    """
    if features is None or len(features) == 0:
        raise Exception('must provide non-empty list of features')

    storage_loc_template = get_storage_loc_template()

    if len(tids) == 0:
        runner.wrapping_up()
        return

    tid_min = tids.min()
    tid_max = tids.max()

    n_calculations = 0
    jobss = []

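    # Work out which (tid, aggregator) combinations still need computing; fixed-length features are skipped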
    for feature in features:
        if feature.is_fixed_length:
            continue

        jobs = []
        storage_loc = storage_loc_template.format(feature.name)
        fa_storage_loc_template = os.path.join(storage_loc, '{}')

        if force:
            combined_tids = [tids]
        else:
            combined_tids = []

        for aggregator in aggregators:
            fa_storage_loc = fa_storage_loc_template.format(aggregator.name)
            mkdirp(fa_storage_loc)

            if force:
                tids_target = tids
            else:
                existing_tids = bs.retrieve_ids(fa_storage_loc,
                                                (tid_min, tid_max))
                sorted_ids, sort_order = np.unique(existing_tids,
                                                   return_index=True)

                non_existing_idx = np.where(
                    np.logical_not(np.isin(tids, sorted_ids)))
                missing_tids = tids[non_existing_idx]
                tids_target = np.array(sorted(missing_tids))

            n_tids_target = len(tids_target)
            if not force and n_tids_target:
                combined_tids.append(tids_target)

            if n_tids_target:
                n_calculations += n_tids_target
                jobs.append((tids_target, aggregator, fa_storage_loc))

        if len(combined_tids):
            combined_tids = np.unique(
                np.concatenate(combined_tids).astype(np.int32))
            jobss.append((combined_tids, storage_loc, jobs))

    if not n_calculations:
        return

    runner.start(limit=n_calculations)

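    # Process tids in batches of 100: retrieve the raw feature arrays once per batch,
    # then run every pending aggregator on them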
    for combined_tids, storage_loc, jobs in jobss:
        batches = get_batches(combined_tids, batch_size=100)
        for batch_tids in batches:
            batch_size = len(batch_tids)
            batch_arrs = bs.retrieve(batch_tids, storage_loc)
            for tids_target, aggregator, fa_storage_loc in jobs:
                aggregateds = []
                aggregated_ids = []
                target_batch_ind = np.searchsorted(batch_tids, tids_target)
                batch_id_within_range = np.where(target_batch_ind < batch_size)
                target_batch_ind = target_batch_ind[batch_id_within_range]
                tids_within_range = tids_target[batch_id_within_range]

                for batch_ind, tid in zip(target_batch_ind, tids_within_range):
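                    # searchsorted returns 0 for tids that fall before this batch; skip those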
                    if batch_ind == 0 and batch_tids[0] != tid:
                        continue
                    aggregated_ids.append(tid)
                    arr = batch_arrs[batch_ind]
                    aggregated = aggregator.process(arr)
                    aggregateds.append(aggregated)

                if len(aggregated_ids):
                    aggregated_ids = np.array(aggregated_ids)
                    bs.store(aggregated_ids, aggregateds, fa_storage_loc)
                    runner.tick(len(aggregated_ids))