Example #1
import os

import numpy as np

# `bs` (the storage backend) and get_storage_loc_template() are
# project-level helpers imported elsewhere in the original module.


def extract_rawdata(ids, features, aggregators):
    storage_loc_template = get_storage_loc_template()

    rawdata = []
    col_inds = {}
    col_inds_start = 0

    for feature in features:
        storage_loc = storage_loc_template.format(feature.name)
        if feature.is_fixed_length:
            # Fixed-length features stack directly into one 2-D block.
            rawdata_ = bs.retrieve(ids, storage_loc, flat=True)
            rawdata_stacked = np.stack(rawdata_)
            rawdata.append(rawdata_stacked)
            ncols = rawdata_stacked.shape[1]
            col_inds[feature.name] = (col_inds_start, col_inds_start + ncols)
            col_inds_start += ncols
        else:
            # Variable-length features are retrieved per aggregator, each
            # of which reduces a sequence to a fixed-length vector.
            fa_storage_loc_template = os.path.join(storage_loc, '{}')
            for aggregator in aggregators:
                fa_storage_loc = fa_storage_loc_template.format(
                    aggregator.name)
                rawdata_ = bs.retrieve(ids, fa_storage_loc, flat=True)
                rawdata_stacked = np.stack(rawdata_)
                rawdata.append(rawdata_stacked)
                ncols = rawdata_stacked.shape[1]
                col_ind_key = '{}_{}'.format(feature.name, aggregator.name)
                col_inds[col_ind_key] = (col_inds_start,
                                         col_inds_start + ncols)
                col_inds_start += ncols

    # One row per id; col_inds records the column range of each block.
    rawdata = np.concatenate(rawdata, axis=1)

    return rawdata, col_inds
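
For orientation, a hedged usage sketch. The `Feature` namedtuple and the feature names below are illustrative stand-ins (the real project defines its own feature objects); only the call signature and the meaning of `col_inds` come from the function above.

# Hypothetical usage: `Feature` and the names below are stand-ins.
from collections import namedtuple

Feature = namedtuple('Feature', ['name', 'is_fixed_length'])
features = [Feature('duration', True), Feature('spectrum', False)]

rawdata, col_inds = extract_rawdata(ids, features, aggregators)

# col_inds maps each feature (or feature_aggregator) name to its column
# range, so one block can be sliced back out of the concatenated matrix:
begin, end = col_inds['duration']
duration_block = rawdata[:, begin:end]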
Example #2
    def _test_update(self, nupdate):
        # `np`, `bs`, `tictoc` and create_random_id_based_dataset are
        # module-level imports/helpers (not shown in this snippet).
        _, arrs_for_update = create_random_id_based_dataset(nupdate)

        id2arr = {x: y for x, y in zip(self.ids, self.arrs)}

        # Make sure there are both new ids (to be appended) and old ids
        # (to be updated).
        nkeeps = nupdate // 2
        nnews = nupdate - nkeeps

        maxid = np.max(self.ids)
        new_ids = np.arange(maxid + 1, maxid + nnews + 1)
        keep_ids = self.ids[:nkeeps]

        ids_for_update = np.concatenate((keep_ids, new_ids))

        # Fold the updates into the reference mapping, then rebuild
        # self.ids/self.arrs in a shuffled but consistent order.
        for x, y in zip(ids_for_update, arrs_for_update):
            id2arr[x] = y

        self.ids = np.array(list(id2arr.keys()))
        np.random.shuffle(self.ids)

        self.arrs = [id2arr[i] for i in self.ids]

        with tictoc('Test update {} items'.format(nupdate)):
            bs.store(ids_for_update, arrs_for_update, self.loc)

        # Every array, updated or untouched, must round-trip unchanged.
        retrieved_arrs = bs.retrieve(self.ids, self.loc)
        for id_, retrieved_arr in zip(self.ids, retrieved_arrs):
            self.assertTrue(np.allclose(id2arr[id_], retrieved_arr))
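
The helper `create_random_id_based_dataset` is not shown in the snippet. A minimal stand-in, assuming it returns an id array plus a matching list of random variable-length arrays, might look like this (shapes and dtypes are guesses):

import numpy as np

def create_random_id_based_dataset(npoints, max_len=100, ndims=10):
    # Hypothetical stand-in; the project's real helper may differ.
    ids = np.arange(npoints)
    arrs = [np.random.randn(np.random.randint(1, max_len), ndims)
            for _ in range(npoints)]
    return ids, arrs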
Example #3
# `bs`, `tictoc` and get_storage_loc_template() are project-level helpers
# imported elsewhere in the original module.
def extract_rawdata(ids, features):
    storage_loc_template = get_storage_loc_template()
    data_by_id = {id_: [] for id_ in ids}

    # Retrieve each feature for all ids at once, timing each feature.
    for feature in features:
        storage_loc = storage_loc_template.format(feature.name)
        with tictoc('{}'.format(feature.name)):
            feature_values = bs.retrieve(ids, storage_loc)
            for id_, feature_value in zip(ids, feature_values):
                data_by_id[id_].append(feature_value)

    # Preserve the order of `ids`: one list of feature values per id.
    data = [data_by_id[id_] for id_ in ids]

    return data
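
A quick usage sketch for this variant; nothing below comes from the original beyond the return shape, and `ids`/`features` are whatever the caller already has:

# Each entry of `data` is the list of feature values for one id,
# ordered like `ids` (outer) and `features` (inner).
data = extract_rawdata(ids, features)
assert len(data) == len(ids)
assert len(data[0]) == len(features)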
Example #4
    def _test_retrieve(self, nselected, shuffle=True):
        # `copy`, `np`, `bs` and `tictoc` are module-level imports/helpers
        # (not shown in this snippet).
        selected_ids = copy.deepcopy(self.ids)
        if shuffle:
            np.random.shuffle(selected_ids)
        selected_ids = selected_ids[:nselected]

        # Look up the reference array for each selected id.
        selected_ids_inds = [
            np.where(self.ids == x)[0][0] for x in selected_ids
        ]
        selected_arrs = [self.arrs[i] for i in selected_ids_inds]

        with tictoc('Test retrieving {} items shuffle={}'.format(
                nselected, shuffle)):
            retrieved_arrs = bs.retrieve(selected_ids, self.loc)

        self.assertEqual(len(selected_ids), len(retrieved_arrs))
        for i in range(len(selected_ids)):
            selected_arr = selected_arrs[i]
            retrieved_arr = retrieved_arrs[i]

            # np.allclose raises TypeError on non-numeric/object arrays;
            # such items are skipped rather than compared element-wise.
            try:
                self.assertTrue(np.allclose(selected_arr, retrieved_arr))
            except TypeError:
                pass
Example #5
import os

import numpy as np

# `bs`, `mkdirp`, `get_batches` and get_storage_loc_template() are
# project-level helpers imported elsewhere in the original module.


def aggregate_feature_values(runner, tids, features, aggregators, force=False):
    """
    Compress all variable-length feature sequences into fixed-length vectors.

    :param runner: task runner used for progress reporting
    :param tids: array of ids whose features should be aggregated
    :param features: feature objects; fixed-length ones are skipped
    :param aggregators: aggregators exposing `name` and `process(arr)`
    :param force: if True, recompute even where results already exist
    """
    if features is None or len(features) == 0:
        raise ValueError('must provide a non-empty list of features')

    storage_loc_template = get_storage_loc_template()

    if len(tids) == 0:
        runner.wrapping_up()
        return

    tid_min = tids.min()
    tid_max = tids.max()

    n_calculations = 0
    jobss = []  # one (combined_tids, storage_loc, jobs) group per feature

    for feature in features:
        if feature.is_fixed_length:
            continue

        jobs = []
        storage_loc = storage_loc_template.format(feature.name)
        fa_storage_loc_template = os.path.join(storage_loc, '{}')

        if force:
            combined_tids = [tids]
        else:
            combined_tids = []

        for aggregator in aggregators:
            fa_storage_loc = fa_storage_loc_template.format(aggregator.name)
            mkdirp(fa_storage_loc)

            if force:
                tids_target = tids
            else:
                # Only aggregate ids that don't already have a stored result
                # for this aggregator.
                existing_tids = bs.retrieve_ids(fa_storage_loc,
                                                (tid_min, tid_max))
                sorted_ids = np.unique(existing_tids)

                non_existing_idx = np.where(
                    np.logical_not(np.isin(tids, sorted_ids)))
                missing_tids = tids[non_existing_idx]
                tids_target = np.array(sorted(missing_tids))

            n_tids_target = len(tids_target)
            if not force and n_tids_target:
                combined_tids.append(tids_target)

            if n_tids_target:
                n_calculations += n_tids_target
                jobs.append((tids_target, aggregator, fa_storage_loc))

        if len(combined_tids):
            combined_tids = np.unique(
                np.concatenate(combined_tids).astype(np.int32))
            jobss.append((combined_tids, storage_loc, jobs))

    if not n_calculations:
        return

    runner.start(limit=n_calculations)

    for combined_tids, storage_loc, jobs in jobss:
        batches = get_batches(combined_tids, batch_size=100)
        for batch_tids in batches:
            batch_size = len(batch_tids)
            batch_arrs = bs.retrieve(batch_tids, storage_loc)
            for tids_target, aggregator, fa_storage_loc in jobs:
                aggregateds = []
                aggregated_ids = []
                # Locate each target id inside this (sorted) batch. Ids
                # belonging to later batches get index == batch_size and
                # are dropped here; ids from earlier batches map to index 0
                # and are filtered by the exact-match check below.
                target_batch_ind = np.searchsorted(batch_tids, tids_target)
                batch_id_within_range = np.where(target_batch_ind < batch_size)
                target_batch_ind = target_batch_ind[batch_id_within_range]
                tids_within_range = tids_target[batch_id_within_range]

                for batch_ind, tid in zip(target_batch_ind, tids_within_range):
                    # Skip ids that fall before this batch's range and only
                    # landed at index 0 as an insertion point.
                    if batch_ind == 0 and batch_tids[0] != tid:
                        continue
                    aggregated_ids.append(tid)
                    arr = batch_arrs[batch_ind]
                    aggregated = aggregator.process(arr)
                    aggregateds.append(aggregated)

                if len(aggregated_ids):
                    aggregated_ids = np.array(aggregated_ids)
                    bs.store(aggregated_ids, aggregateds, fa_storage_loc)
                    runner.tick(len(aggregated_ids))
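
The function above only assumes that each aggregator exposes a `name` attribute and a `process(arr)` method mapping a variable-length sequence to a fixed-length vector. A minimal sketch of such an aggregator (the class and its name are hypothetical, not from the original project):

import numpy as np

class MeanAggregator:
    # Hypothetical aggregator: collapses a (time, ndims) sequence into a
    # fixed-length vector by averaging over the time axis.
    name = 'mean'

    def process(self, arr):
        return np.mean(arr, axis=0)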
Example #6
    def _test_retrieve_error(self):
        # Ids far above anything that was stored, so retrieval must fail.
        non_existing_ids = NUM_POINTS * 100 + np.random.randint(
            100, size=NUM_POINTS // 2)

        with self.assertRaises((ValueError, FileNotFoundError)):
            bs.retrieve(non_existing_ids, self.loc)