Example #1
0
def extract_rawdata(f2bs, fa2bs, ids, features, aggregators):
    """Retrieve stored feature values for *ids* and stack them into one matrix.

    Fixed-length features are read via their (index, value) file pair in
    ``f2bs``; variable-length features are read once per aggregator from
    ``fa2bs``.  Column ranges are recorded so callers can slice the matrix
    back apart by feature (or feature_aggregator) name.

    :return: tuple of (2-D array with one row per id, dict mapping column
        label -> (start, end) half-open column range)
    """
    chunks = []
    col_inds = {}
    offset = 0

    def _load(label, file_pair):
        # Retrieve one storage unit, stack it, and register its column span.
        nonlocal offset
        idx_file, val_file = file_pair
        values = binstorage.retrieve(ids,
                                     idx_file,
                                     val_file,
                                     flat=True)
        stacked = np.stack(values)
        chunks.append(stacked)
        width = stacked.shape[1]
        col_inds[label] = (offset, offset + width)
        offset += width

    for feature in features:
        if feature.is_fixed_length:
            _load(feature.name, f2bs[feature])
        else:
            for aggregator in aggregators:
                _load('{}_{}'.format(feature.name, aggregator.name),
                      fa2bs[feature][aggregator])

    return np.concatenate(chunks, axis=1), col_inds
Example #2
0
    def _test_update(self, nupdate):
        """Store *nupdate* (id, array) pairs — deliberately mixing ids that
        already exist (update path) with brand-new ones (append path) —
        then retrieve the full set and verify every array round-trips.
        """
        _, new_arrs = create_random_id_based_dataset()
        npoints = NUM_POINTS

        id2arr = dict(zip(self.ids, self.arrs))

        # Keep re-drawing random ids until the draw contains at least one
        # unseen id and at least one id that is already stored.
        while True:
            candidates = np.arange(npoints * 10)
            np.random.shuffle(candidates)
            new_ids = candidates[:nupdate]
            unseen = np.array([i for i in new_ids if i not in self.ids])
            if 0 < len(unseen) < npoints:
                break

        for key, arr in zip(new_ids, new_arrs):
            id2arr[key] = arr

        self.ids = np.array(list(id2arr.keys()))
        np.random.shuffle(self.ids)

        self.arrs = [id2arr[i] for i in self.ids]

        with tictoc('Test update {} items'.format(nupdate)):
            bs.store(new_ids, new_arrs, self.index_filename,
                     self.value_filename)

        retrieved_arrs = bs.retrieve(self.ids, self.index_filename,
                                     self.value_filename)
        for key, retrieved in zip(self.ids, retrieved_arrs):
            self.assertTrue(np.allclose(id2arr[key], retrieved))
Example #3
0
def convert(olddir, newdir):
    """Migrate a bs1-format store (``olddir + '.idx'`` / ``olddir + '.val'``)
    into a bs3 store under *newdir*.

    A ``.converted`` flag file marks completion so a finished directory is
    not converted twice; bs3 assertion failures are reported, not raised.
    """
    idx_path = olddir + '.idx'
    val_path = olddir + '.val'

    ids = bs1.retrieve_ids(idx_path)
    arrs = bs1.retrieve(ids, idx_path, val_path)

    mkdirp(newdir)
    flag_path = os.path.join(newdir, '.converted')
    if not os.path.isfile(flag_path):
        try:
            bs3.store(ids, arrs, newdir)
            with open(flag_path, 'w') as f:
                f.write('done')
        except AssertionError:
            print('Error converting {}'.format(olddir))
def extract_rawdata(f2bs, ids, features):
    """Collect, for each id, its stored value of every feature.

    Features are retrieved one at a time (timed via tictoc), then the
    per-id value lists are returned in the same order as *ids*.

    :return: list aligned with *ids*; each element is a list of feature
        values in the order of *features*
    """
    values_by_id = {key: [] for key in ids}

    for feature in features:
        index_filename, value_filename = f2bs[feature]
        with tictoc('{}'.format(feature.name)):
            retrieved = binstorage.retrieve(ids, index_filename, value_filename)
            for key, value in zip(ids, retrieved):
                values_by_id[key].append(value)

    return [values_by_id[key] for key in ids]
Example #5
0
    def _test_retrieve(self, nselected):
        """Retrieve a random subset of *nselected* stored arrays and check
        each one against the array originally stored for that id.
        """
        chosen = copy.deepcopy(self.ids)
        np.random.shuffle(chosen)
        chosen = chosen[:nselected]

        # Expected arrays, found via each chosen id's position in self.ids.
        positions = [
            np.where(self.ids == x)[0][0] for x in chosen
        ]
        expected = [self.arrs[i] for i in positions]

        with tictoc('Test retrieving {} items'.format(nselected)):
            actual = bs.retrieve(chosen, self.index_filename,
                                 self.value_filename)

        self.assertEqual(len(chosen), len(actual))
        for expected_arr, actual_arr in zip(expected, actual):
            self.assertTrue(np.allclose(expected_arr, actual_arr))
Example #6
0
    def _test_retrieve_error(self):
        """Requesting ids that were never stored must raise ValueError."""
        # Offset well past any stored id so all five lookups are misses.
        missing_ids = NUM_POINTS * 10 + np.random.randint(100, size=5)

        with self.assertRaises(ValueError):
            bs.retrieve(missing_ids, self.index_filename,
                        self.value_filename)
Example #7
0
def aggregate_feature_values(ptask, sids, f2bs, fa2bs, features, aggregators):
    """
    Compress all feature sequences into fixed-length vectors

    :param ptask: progress-task object; driven via start(limit=...), tick()
        and wrapping_up()
    :param sids: Segment ids whose features should be aggregated
    :param f2bs: map feature -> (index_filename, value_filename) of the raw
        feature storage
    :param fa2bs: map feature -> aggregator -> (index_filename,
        value_filename) of the aggregated-value storage
    :param features: non-empty list of feature objects
    :param aggregators: list of aggregator objects applied to each
        variable-length feature
    :return: None; aggregated values are persisted via binstorage.store
    """
    if features is None or len(features) == 0:
        raise Exception('must provide non-empty list of features')

    # Process segments grouped by duration (shortest first).
    segment_info = Segment.objects\
        .filter(id__in=sids)\
        .annotate(duration=F('end_time_ms') - F('start_time_ms')).order_by('duration')

    attrs = segment_info.values_list('tid', 'duration', 'audio_file__fs')

    # duration -> ([tids], [sampling rates]) for all segments of that duration
    duration2segs = {}
    for tid, duration, fs in attrs:
        if duration not in duration2segs:
            segs = [[], []]
            duration2segs[duration] = segs
        else:
            segs = duration2segs[duration]
        segs[0].append(tid)
        segs[1].append(fs)

    # Shared kwargs passed to chirpy aggregators; 'fs' and 'nsamples' are
    # filled in per segment below. nfft/noverlap/win_length come from module
    # scope (not visible here).
    args = dict(nfft=nfft,
                noverlap=noverlap,
                wav_file_path=None,
                start=None,
                end=None,
                win_length=win_length,
                center=False)

    n_calculations = 0

    # Build the work list: for each duration/feature, the tids whose
    # aggregated values are not yet present in storage.
    jobs = {}
    for duration, (tids, fss) in duration2segs.items():
        tids = np.array(tids, dtype=np.int32)
        fss = np.array(fss, dtype=np.int32)

        jobs[duration] = {}

        for feature in features:
            f_idf, f_vlf = f2bs[feature]

            for aggregator in aggregators:
                fa_idf, fa_vlf = fa2bs[feature][aggregator]

                # Ids already aggregated for this feature/aggregator pair.
                existing_tids = binstorage.retrieve_ids(fa_idf)
                # NOTE(review): sort_order is never used; only the sorted
                # unique ids matter for the isin() test below.
                sorted_ids, sort_order = np.unique(existing_tids,
                                                   return_index=True)

                # Keep only tids with no stored aggregation yet.
                non_existing_idx = np.where(
                    np.logical_not(np.isin(tids, sorted_ids)))
                _tids = tids[non_existing_idx]
                _fss = fss[non_existing_idx]

                n_calculations += len(_tids)

                # NOTE(review): keyed by feature only, so each aggregator
                # iteration overwrites the previous entry — only the LAST
                # aggregator's missing-tid set survives. Confirm this is
                # intentional (e.g. all aggregators share one index file).
                jobs[duration][feature] = (_tids, _fss, f_idf, f_vlf)

    # Nothing missing: report completion and stop.
    if not n_calculations:
        ptask.wrapping_up()
        return

    ptask.start(limit=n_calculations)
    # feature -> tid -> aggregator -> aggregated value
    result_by_ft = {}
    for duration, ftjobs in jobs.items():
        for feature, (_tids, _fss, f_idf, f_vlf) in ftjobs.items():
            if feature not in result_by_ft:
                result_by_tid = {}
                result_by_ft[feature] = result_by_tid
            else:
                result_by_tid = result_by_ft[feature]

            # Raw (un-aggregated) feature values for the missing tids.
            values = binstorage.retrieve(_tids, f_idf, f_vlf)

            for tid, fs, value in zip(_tids, _fss, values):
                args['fs'] = fs
                result_by_agg = {}
                result_by_tid[tid] = result_by_agg

                # Only variable-length features need aggregation; fixed-length
                # values are stored as-is elsewhere.
                if not feature.is_fixed_length:
                    # Frame count: 2-D values are (channels?, frames) — frames
                    # on axis 1; 1-D values are already per-frame.
                    # TODO confirm axis convention against the feature extractor.
                    if value.ndim == 2:
                        nframes = value.shape[1]
                    else:
                        nframes = value.shape[0]

                    # Minimum raw-sample count that yields nframes frames
                    # given the module-level nfft/stepsize.
                    min_nsamples = nfft + (nframes - 1) * stepsize
                    args['nsamples'] = min_nsamples

                    for aggregator in aggregators:
                        # Chirpy aggregators need the signal metadata in args.
                        if aggregator.is_chirpy():
                            aggregated = aggregator.process(value,
                                                            args=args,
                                                            feature=feature)
                        else:
                            aggregated = aggregator.process(value)

                        result_by_agg[aggregator] = aggregated
                        ptask.tick()

    ptask.wrapping_up()
    # Persist: regroup results as aggregator -> (tids, values) per feature
    # and write each group to its own binstorage file pair.
    for feature in features:
        if feature.is_fixed_length:
            continue
        result_by_tid = result_by_ft[feature]
        agg2tids = {aggregator: ([], []) for aggregator in aggregators}

        for tid, result_by_agg in result_by_tid.items():
            for aggregator, val in result_by_agg.items():
                agg2tids[aggregator][0].append(tid)
                agg2tids[aggregator][1].append(val)

        for aggregator, (tids, vals) in agg2tids.items():
            tids = np.array(tids)
            fa_idf, fa_vlf = fa2bs[feature][aggregator]
            binstorage.store(tids, vals, fa_idf, fa_vlf)