def _create_sample_sets(raw_data, offset, references=None):
    """Wrap each raw datum into a single-Sample SampleSet.

    Parameters
    ----------
    raw_data : iterable
        Raw data items; one SampleSet is created per item.
    offset : int
        Added to each enumeration index to build ``key``, ``reference_id``
        and ``subject_id`` (all stringified).
    references : list, optional
        If given, forwarded to each SampleSet as its ``references``.

    Returns
    -------
    list
        One SampleSet per item of ``raw_data``.
    """
    # The original duplicated the whole comprehension just to add the
    # `references` kwarg in one branch; pass it conditionally instead.
    extra = {} if references is None else {"references": references}
    return [
        SampleSet(
            [
                Sample(
                    s,
                    reference_id=str(i + offset),
                    key=str(uuid.uuid4()),  # unique key per inner Sample
                )
            ],
            key=str(i + offset),
            reference_id=str(i + offset),
            subject_id=str(i + offset),
            **extra,
        )
        for i, s in enumerate(raw_data)
    ]
def test_extractor_fittable():
    """Fit a fittable extractor transformer, then verify transform and checkpointing."""
    with tempfile.TemporaryDirectory() as dir_name:
        model_file = os.path.join(dir_name, "Extractor.hdf5")
        extractor = FakeExtractorFittable()
        transformer = ExtractorTransformer(extractor, model_path=model_file)

        # Wrap so it consumes/produces Sample objects, then fit it.
        wrapped = SampleWrapper(transformer)
        training_data = np.arange(4).reshape(2, 2)
        wrapped = wrapped.fit([Sample(training_data, key="1")])

        test_data = [np.zeros((2, 2)), np.ones((2, 2))]
        expected = [np.zeros((2, 2)), np.ones((2, 2)) @ training_data]
        probes = [Sample(d, key=str(i)) for i, d in enumerate(test_data)]
        assert assert_sample(wrapped.transform(probes), expected)

        # Same transform, but writing/reading checkpoints on disk.
        checkpointed = CheckpointWrapper(
            wrapped,
            features_dir=dir_name,
            load_func=extractor.read_feature,
            save_func=extractor.write_feature,
        )
        outputs = checkpointed.transform(probes)
        assert assert_sample(outputs, expected)
        assert assert_checkpoints(outputs, dir_name)
def test_sampleset_collection():
    """SampleSet behaves like a mutable sequence and supports delayed loading."""
    n_samples = 10
    X = np.ones(shape=(n_samples, 2), dtype=int)
    sampleset = SampleSet(
        [Sample(data, key=str(i)) for i, data in enumerate(X)], key="1")
    assert len(sampleset) == n_samples

    # Testing insert
    sample = Sample(X, key=100)
    sampleset.insert(1, sample)
    assert len(sampleset) == n_samples + 1

    # Testing delete
    del sampleset[0]
    assert len(sampleset) == n_samples

    # Testing set
    sampleset[0] = copy.deepcopy(sample)

    # Testing iterator
    for i in sampleset:
        assert isinstance(i, Sample)

    def _load(path):
        # Close the file handle deterministically instead of leaking it
        # (the original relied on the GC to close an anonymous open()).
        with open(path, "rb") as f:
            return pickle.loads(f.read())

    # Testing delayed sampleset
    with tempfile.TemporaryDirectory() as dir_name:
        samples = [Sample(data, key=str(i)) for i, data in enumerate(X)]
        filename = os.path.join(dir_name, "samples.pkl")
        with open(filename, "wb") as f:
            f.write(pickle.dumps(samples))
        sampleset = DelayedSampleSet(functools.partial(_load, filename), key=1)
        assert len(sampleset) == n_samples
        assert sampleset.samples == samples

    # Testing delayed sampleset cached
    with tempfile.TemporaryDirectory() as dir_name:
        samples = [Sample(data, key=str(i)) for i, data in enumerate(X)]
        filename = os.path.join(dir_name, "samples.pkl")
        with open(filename, "wb") as f:
            f.write(pickle.dumps(samples))
        sampleset = DelayedSampleSetCached(functools.partial(_load, filename),
                                           key=1)
        assert len(sampleset) == n_samples
        assert sampleset.samples == samples
def fit(self, t_scores, y=None):
    """Compute per-reference T-norm statistics (mu, std) from T-scores.

    Parameters
    ----------
    t_scores : list
        SampleSets of scores, one per T-norm biometric reference.
    y : None
        Ignored; present for scikit-learn estimator API compatibility.

    Returns
    -------
    self
    """
    # TODO: THIS IS SUPER INNEFICIENT, BUT
    # IT'S THE MOST READABLE SOLUTION
    # Stacking scores by biometric reference
    self.t_stats = dict()
    for sset in t_scores:
        self.t_stats[sset.reference_id] = Sample(
            [s.data for s in sset], parent=sset
        )

    # Now computing the statistics in place
    for key in self.t_stats:
        data = self.t_stats[key].data

        # Selecting the top scores
        if self.top_norm:
            # Sort in DESCENDING order so the highest scores come first
            # (the original comment said "ascending", which was wrong).
            data = -np.sort(-data)
            proportion = int(
                np.floor(len(data) * self.top_norm_score_fraction))
            data = data[0:proportion]

        # BUGFIX: compute the statistics from `data`, which reflects the
        # top-norm selection above. Previously mu/std were taken from
        # self.t_stats[key].data, so the `top_norm` option had no effect.
        self.t_stats[key].mu = np.mean(data)
        self.t_stats[key].std = np.std(data)
        # self._z_stats[key].std = legacy_std(
        #     self._z_stats[key].mu, self._z_stats[key].data
        # )
        # Drop the raw scores: only mu/std are needed from here on.
        self.t_stats[key].data = []

    return self
def test_mod_4hz():
    """Loading and running the mod-4hz annotator."""
    reference = pkg_resources.resource_filename(
        "bob.bio.spear.test", "data/vad_mod_4hz.hdf5"
    )

    # The resource name should resolve to a Mod_4Hz annotator instance.
    annotator = bob.bio.base.load_resource("mod-4hz", "annotator")
    assert isinstance(annotator, bob.bio.spear.annotator.Mod_4Hz)

    rate, wav = _wav()

    # Direct VAD call on raw data.
    annotator = bob.bio.spear.annotator.Mod_4Hz()
    _compare(annotator.transform_one(wav, sample_rate=rate), reference)

    # Sample-wrapped call: the `rate` attribute is forwarded as the
    # `sample_rate` argument of transform (tags), and the annotations end
    # up on the `annotations` attribute of the result samples (tags).
    wrapped_annotator = wrap(["sample"], annotator)
    result = wrapped_annotator.transform([Sample(data=wav, rate=rate)])
    _compare(result[0].annotations, reference)
def test_preprocessor():
    """Preprocessor wrapped as a sample transformer, with and without checkpointing."""
    preprocessor = FakePreprocesor()
    transformer = PreprocessorTransformer(preprocessor)

    # Forward the `annotations` sample attribute as the `annotations`
    # keyword argument of the wrapped transform.
    wrapped = SampleWrapper(transformer, [("annotations", "annotations")])

    expected = [np.ones((2, 2))]
    samples = [Sample(np.zeros((2, 2)), key="1", annotations=1)]
    assert assert_sample(wrapped.transform(samples), expected)

    # The checkpointed variant must yield the same output and write files.
    with tempfile.TemporaryDirectory() as dir_name:
        checkpointed = CheckpointWrapper(
            wrapped,
            features_dir=dir_name,
            load_func=preprocessor.read_data,
            save_func=preprocessor.write_data,
        )
        outputs = checkpointed.transform(samples)
        assert assert_sample(outputs, expected)
        assert assert_checkpoints(outputs, dir_name)
def test_extractor():
    """Non-fittable extractor wrapped as a sample transformer, with checkpointing."""
    extractor = FakeExtractor()
    transformer = ExtractorTransformer(extractor)

    wrapped = SampleWrapper(transformer)
    expected = [np.zeros((1, 4))]
    samples = [Sample(np.zeros((2, 2)), key="1")]
    assert assert_sample(wrapped.transform(samples), expected)

    # Checkpointed variant: same output, plus files written to disk.
    with tempfile.TemporaryDirectory() as dir_name:
        checkpointed = CheckpointWrapper(
            wrapped,
            features_dir=dir_name,
            load_func=extractor.read_feature,
            save_func=extractor.write_feature,
        )
        outputs = checkpointed.transform(samples)
        assert assert_sample(outputs, expected)
        assert assert_checkpoints(outputs, dir_name)
def test_sample_hdf5():
    """Samples round-trip through HDF5 serialization, singly and as a list."""
    n_samples = 10
    X = np.ones(shape=(n_samples, 2), dtype=int)
    samples = [
        Sample(data, key=str(i), subject="Subject") for i, data in enumerate(X)
    ]
    with tempfile.TemporaryDirectory() as dir_name:
        # Single sample
        filename = os.path.join(dir_name, "sample.hdf5")
        with h5py.File(filename, "w", driver="core") as hdf5:
            sample_to_hdf5(samples[0], hdf5)
        with h5py.File(filename, "r") as hdf5:
            sample = hdf5_to_sample(hdf5)
        assert sample == samples[0]

        # List of samples
        filename = os.path.join(dir_name, "samples.hdf5")
        with h5py.File(filename, "w", driver="core") as hdf5:
            sample_to_hdf5(samples, hdf5)
        with h5py.File(filename, "r") as hdf5:
            samples_deserialized = hdf5_to_sample(hdf5)
        compare = [a == b for a, b in zip(samples_deserialized, samples)]
        # Compare against n_samples instead of a hard-coded 10 so the check
        # stays in sync if the fixture size ever changes.
        assert np.sum(compare) == n_samples
def _transform_samples(X, stats):
    """Normalize each score sample as (score - mu) / std using `stats`.

    Each returned Sample keeps the original score sample as its parent.
    """
    return [
        Sample((sample.data - stats.mu) / stats.std, parent=sample)
        for sample in X
    ]
def get_fake_samples_for_training():
    """Build 10 random (3, 400, 400) samples sharing fixed eye annotations."""
    annotations = {"reye": (131, 176), "leye": (222, 170)}
    samples = []
    for index, datum in enumerate(np.random.rand(10, 3, 400, 400)):
        samples.append(
            Sample(
                datum,
                key=str(index),
                reference_id=str(index),
                annotations=annotations,
            )
        )
    return samples
def _transform_samples(X):
    # Z-normalize each score — (score - mu) / std — using the statistics of
    # the score's own biometric reference (looked up by reference_id).
    # NOTE(review): this function reads `self` without declaring it as a
    # parameter — presumably a closure defined inside a method of the
    # Z-norm class; confirm the enclosing scope provides `self.z_stats`.
    scores = []
    for no_normed_score in X:
        score = (no_normed_score.data -
                 self.z_stats[no_normed_score.reference_id].mu
                 ) / self.z_stats[no_normed_score.reference_id].std
        # Keep the un-normalized score sample as parent for traceability.
        z_score = Sample(score, parent=no_normed_score)
        scores.append(z_score)
    return scores
def test_resample():
    """Resample using the transformer."""
    audio_path = resource_filename("bob.bio.spear.test", "data/sample.wav")
    n_samples = 77760
    sample_rate = 16000

    sample = Sample(data=audio_path, channel=None, rate=sample_rate)
    # Load the file, then downsample to half the original rate; the number
    # of samples should be halved accordingly.
    pipeline = make_pipeline(
        PathToAudio(),
        wrap(["sample"], Resample(sample_rate // 2)),
    )
    result = pipeline.transform([sample])[0]
    assert result.data.shape == (n_samples // 2,), result.data.shape
def _create_random_2dsamples(self, n_samples, offset, dim):
    """Create `n_samples` Samples of random (dim, dim) data.

    Ids run from `offset` to `offset + n_samples - 1`; each Sample gets a
    fresh UUID key and a constant annotation of 1.
    """
    samples = []
    for idx in range(offset, offset + n_samples):
        samples.append(
            Sample(
                np.random.rand(dim, dim),
                key=str(uuid.uuid4()),
                annotations=1,
                reference_id=str(idx),
                subject_id=str(idx),
            )
        )
    return samples
def create_templates_from_samplesets(self, list_of_samplesets, enroll):
    """Creates enroll or probe templates from multiple SampleSets.

    Parameters
    ----------
    list_of_samplesets : list
        A list (length N) of SampleSets.
    enroll : bool
        If True, the SampleSets are for enrollment. If False, the
        SampleSets are for probe.

    Returns
    -------
    templates : list
        A list of Samples which has the same length as
        ``list_of_samplesets``. Each Sample contains a template.
    """
    logger.debug(
        f"{_frmt(self)}.create_templates_from_samplesets(... enroll={enroll})"
    )

    # Collect the raw features (.data) of each SampleSet, dropping any
    # invalid (None) entries.
    list_of_feature_sets = []
    for sset in list_of_samplesets:
        raw = [s.data for s in sset.samples]
        kept = [d for d in raw if d is not None]
        if len(raw) != len(kept):
            logger.warning(
                f"Removed {len(raw)-len(kept)} invalid enrollment samples."
            )
        if enroll and not kept:
            # we do not support failure to enroll cases currently
            raise NotImplementedError(
                f"None of the enrollment samples were valid for {sset}."
            )
        list_of_feature_sets.append(kept)

    templates = self.create_templates(list_of_feature_sets, enroll)

    # The algorithm must return exactly one template per input SampleSet.
    expected_size = len(list_of_samplesets)
    assert len(templates) == expected_size, (
        "The number of (%s) templates (%d) created by the algorithm does not match "
        "the number of sample sets (%d)" % (
            "enroll" if enroll else "probe",
            len(templates),
            expected_size,
        ))

    # Wrap each template in a Sample parented to its originating SampleSet.
    return [
        Sample(template, parent=sset)
        for template, sset in zip(templates, list_of_samplesets)
    ]
def test_path_to_audio():
    """Tries to load the audio data from a file."""
    audio_path = resource_filename("bob.bio.spear.test", "data/sample.wav")
    audio_n_samples = 77760
    audio_sample_rate = 16000

    def check(transformer, expected_rate, expected_n):
        # Run one Sample through the transformer and verify rate and shape.
        result = transformer.transform([Sample(data=audio_path)])[0]
        assert result.rate == expected_rate, result.rate
        assert isinstance(result.data, np.ndarray)
        assert result.data.shape == (expected_n,), result.data.shape
        return result

    # Default: keep the file's own sample rate; data comes back as float32.
    result = check(PathToAudio(), audio_sample_rate, audio_n_samples)
    assert result.data.dtype == np.float32, result.data.dtype

    # Force a different sample rate (half the original).
    check(
        PathToAudio(forced_sr=audio_sample_rate // 2),
        audio_sample_rate // 2,
        audio_n_samples // 2,
    )
def test_delayed_samples():
    """DelayedSample resolves data/attributes lazily, inherits from its
    parent, and lets a child override or extend the delayed attributes."""

    def load_data():
        return 0

    def load_annot():
        return "annotation"

    def load_annot_variant():
        return "annotation_variant"

    # Flag flipped by load_check() — lets us observe *when* a delayed
    # attribute loader is actually invoked.
    delayed_attr_read = False

    def load_check():
        nonlocal delayed_attr_read
        delayed_attr_read = True
        return "delayed_attr_data"

    # data and annot are resolved through their loader functions.
    delayed_sample = DelayedSample(load_data,
                                   delayed_attributes=dict(annot=load_annot))
    assert delayed_sample.data == 0, delayed_sample.data
    assert delayed_sample.annot == "annotation", delayed_sample.annot

    # A plain child Sample materializes the parent's delayed attributes.
    child_sample = Sample(1, parent=delayed_sample)
    assert child_sample.data == 1, child_sample.data
    assert child_sample.annot == "annotation", child_sample.annot
    assert child_sample.__dict__ == {
        "data": 1,
        "annot": "annotation",
    }, child_sample.__dict__

    # Overwriting and adding delayed_attributes to the child
    new_delayed_attr = {
        "annot": load_annot_variant,  # Override parent's annot
        "new_annot": load_annot,  # Add the new_annot attribute
        "read_check": load_check,
    }
    child_sample = DelayedSample(load_data, parent=delayed_sample,
                                 delayed_attributes=new_delayed_attr)
    assert child_sample.data == 0, child_sample.data
    assert child_sample.annot == "annotation_variant", child_sample.annot
    assert child_sample.new_annot == "annotation", child_sample.new_annot
    # The loader must run only on first access, not at construction time.
    assert not delayed_attr_read, "delayed attribute has been read early"
    assert child_sample.read_check == "delayed_attr_data", child_sample.read_check
    assert delayed_attr_read, "delayed attribute should have been read by now"
    # Assigning over a delayed attribute replaces it with a plain value.
    delayed_sample.annot = "changed"
    assert delayed_sample.annot == "changed", delayed_sample.annot
def generate_samples(n_subjects, n_samples_per_subject, shape=(2, 2),
                     annotations=1):
    """
    Simple sample generator that generates a certain number of samples per
    subject, whose data is np.zeros + subject index.

    Parameters
    ----------
    n_subjects : int
        Number of distinct subjects.
    n_samples_per_subject : int
        Samples generated for each subject.
    shape : tuple
        Shape of each sample's data array.
    annotations
        Value stored on each sample's ``annotations`` attribute.
    """
    samples = []
    for i in range(n_subjects):
        data = np.zeros(shape) + i
        for j in range(n_samples_per_subject):
            samples += [
                Sample(
                    data,
                    subject=str(i),
                    # BUGFIX: stride by n_samples_per_subject so keys are
                    # globally unique; the previous `i * n_subjects + j`
                    # produced duplicate keys whenever
                    # n_samples_per_subject > n_subjects.
                    key=str(i * n_samples_per_subject + j),
                    annotations=annotations,
                )
            ]
    return samples
def _delayed_samples_to_samples(delayed_samples):
    """Materialize DelayedSamples into plain Samples.

    Each returned Sample eagerly holds the delayed sample's data and keeps
    the delayed sample as its parent.
    """
    materialized = []
    for delayed in delayed_samples:
        materialized.append(Sample(delayed.data, parent=delayed))
    return materialized
def score_sample_templates(self, probe_samples, enroll_samples,
                           score_all_vs_all):
    """Computes the similarity score between all probe and enroll templates.

    Parameters
    ----------
    probe_samples : list
        A list (length N) of Samples containing probe templates.
    enroll_samples : list
        A list (length M) of Samples containing enroll templates.
    score_all_vs_all : bool
        If True, the similarity scores between all probe and enroll
        templates are computed. If False, the similarity scores between
        the probes and their associated enroll templates are computed.

    Returns
    -------
    score_samplesets : list
        A list of N SampleSets each containing a list of M score Samples
        if score_all_vs_all is True. Otherwise, a list of N SampleSets
        each containing a list of <=M score Samples depending on the
        database.
    """
    logger.debug(
        f"{_frmt(self)}.score_sample_templates(... score_all_vs_all={score_all_vs_all})"
    )
    # Returns a list of SampleSets where a Sampleset for each probe
    # SampleSet where each Sample inside the SampleSets contains the score
    # for one enroll SampleSet
    score_samplesets = []
    if score_all_vs_all:
        # Score only valid probes; invalid ones get None rows injected
        # back afterwards so the output shape covers every probe.
        probe_data = [s.data for s in probe_samples]
        valid_probe_indices = [
            i for i, d in enumerate(probe_data) if _data_valid(d)
        ]
        valid_probe_data = [probe_data[i] for i in valid_probe_indices]
        # scores has shape (n_enroll, n_valid_probes) at this point.
        scores = self.compare(SampleBatch(enroll_samples), valid_probe_data)
        scores = np.asarray(scores, dtype=float)
        if len(valid_probe_indices) != len(probe_data):
            # inject None scores for invalid probe samples
            # (transpose so each list element is one probe's column,
            # insert placeholder rows, then transpose back; None becomes
            # NaN in the float array)
            scores: list = scores.T.tolist()
            for i in range(len(probe_data)):
                if i not in valid_probe_indices:
                    scores.insert(i, [None] * len(enroll_samples))
            # transpose back to original shape
            scores = np.array(scores, dtype=float).T
        expected_shape = (len(enroll_samples), len(probe_samples))
        assert scores.shape == expected_shape, (
            "The shape of the similarity scores (%s) does not match the expected shape (%s)"
            % (scores.shape, expected_shape))
        # One SampleSet per probe; each score Sample is parented to the
        # enroll template it was compared against.
        for j, probe in enumerate(probe_samples):
            samples = []
            for i, enroll in enumerate(enroll_samples):
                samples.append(Sample(scores[i, j], parent=enroll))
            score_samplesets.append(SampleSet(samples, parent=probe))
    else:
        # Score each probe only against the enroll templates listed in its
        # own `references` attribute.
        for probe in probe_samples:
            references = [str(ref) for ref in probe.references]
            # get the indices of references for enroll samplesets
            indices = [
                i for i, enroll in enumerate(enroll_samples)
                if str(enroll.reference_id) in references
            ]
            if not indices:
                raise ValueError(
                    f"No enroll sampleset found for probe {probe} and its required references {references}. "
                    "Did you mean to set score_all_vs_all=True?")
            if not _data_valid(probe.data):
                # Invalid probe: emit placeholder (NaN) scores.
                scores = [[None]] * len(indices)
            else:
                scores = self.compare(
                    SampleBatch([enroll_samples[i] for i in indices]),
                    SampleBatch([probe]),
                )
            # scores is a (len(indices), 1) column for this single probe.
            scores = np.asarray(scores, dtype=float)
            expected_shape = (len(indices), 1)
            assert scores.shape == expected_shape, (
                "The shape of the similarity scores (%s) does not match the expected shape (%s)"
                % (scores.shape, expected_shape))
            samples = []
            for i, j in enumerate(indices):
                samples.append(
                    Sample(scores[i, 0], parent=enroll_samples[j]))
            score_samplesets.append(SampleSet(samples, parent=probe))
    return score_samplesets