def apply(self, current_file):

    # Speech Activity Detection

    # get raw SAD scores
    soft_sad = self.sad_(current_file)

    # check once and for all whether SAD scores are log-scaled
    if not hasattr(self, 'sad_log_scale_'):
        if np.nanmean(soft_sad.data) < 0:
            self.sad_log_scale_ = True
        else:
            self.sad_log_scale_ = False

    # get SAD probability
    prob_sad = np.exp(soft_sad.data) if self.sad_log_scale_ \
               else soft_sad.data

    # support both non-speech/speech & non-speech/single/overlap
    prob_sad = 1. - prob_sad[:, 0]
    prob_sad = SlidingWindowFeature(prob_sad, soft_sad.sliding_window)

    # binarization
    hard_sad = self.sad_binarize_.apply(prob_sad)

    # Speaker Change Detection

    # get raw SCD scores
    soft_scd = self.scd_(current_file)

    # check once and for all whether SCD scores are log-scaled
    if not hasattr(self, 'scd_log_scale_'):
        if np.nanmean(soft_scd.data) < 0:
            self.scd_log_scale_ = True
        else:
            self.scd_log_scale_ = False

    # get SCD probability
    prob_scd = np.exp(soft_scd.data) if self.scd_log_scale_ \
               else soft_scd.data

    # take the final dimension
    # (in order to support both classification and regression scores)
    prob_scd = prob_scd[:, -1]
    prob_scd = SlidingWindowFeature(prob_scd, soft_scd.sliding_window)

    # peak detection
    hard_scd = self.scd_peak_.apply(prob_scd)

    # speech turns = change-point segments restricted to speech regions
    speech_turns = hard_scd.crop(hard_sad)

    # only process the annotated part
    speech_turns = speech_turns.crop(get_annotated(current_file))

    return speech_turns
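# A minimal standalone sketch of the log-scale check used above (not part of
# the original code): log-probabilities are non-positive, so their mean is
# negative, whereas raw probabilities lie in [0, 1]. Arrays are made up.
import numpy as np

probs = np.array([[0.9, 0.1], [0.2, 0.8]])
assert np.nanmean(probs) >= 0            # raw probabilities
assert np.nanmean(np.log(probs)) < 0     # log-scaled scores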
def sliding_samples(self):

    uris = list(self.data_)
    durations = np.array([self.data_[uri]['duration'] for uri in uris])
    probabilities = durations / np.sum(durations)

    sliding_segments = SlidingSegments(duration=self.duration,
                                       step=self.duration,
                                       source='annotated')

    while True:

        np.random.shuffle(uris)

        # loop on all files
        for uri in uris:

            datum = self.data_[uri]

            # make a copy of current file
            current_file = dict(datum['current_file'])

            # randomly shift 'annotated' segments start time so that
            # we avoid generating exactly the same subsequence twice
            annotated = Timeline(
                [Segment(s.start + np.random.random() * self.duration, s.end)
                 for s in get_annotated(current_file)])
            current_file['annotated'] = annotated

            if self.shuffle:
                samples = []

            for sequence in sliding_segments.from_file(current_file):

                X = self.precomputed.crop(current_file, sequence,
                                          mode='center', fixed=self.duration)
                y = datum['y'].crop(sequence, mode='center',
                                    fixed=self.duration)

                sample = {'X': X, 'y': np.squeeze(y)}

                if self.shuffle:
                    samples.append(sample)
                else:
                    yield sample

            if self.shuffle:
                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
def objective(self, protocol, subset='development', learning=False):
    """Compute the value of the objective function (the lower, the better)

    Parameters
    ----------
    protocol : pyannote.database.Protocol
        Protocol on which to compute the value of the objective function.
    subset : {'train', 'development', 'test'}, optional
        Subset on which to compute the value of the objective function.
        Defaults to 'development'.
    learning : bool, optional
        Set to True to indicate that the pipeline is being tuned and that
        the reference can be passed safely to the pipeline. Default
        behavior is to remove it from `current_file`. This is useful for
        pipelines that may take a very long time to run when the
        hypothesis is completely wrong (e.g. too many segments to
        cluster).

    Returns
    -------
    metric : float
        Value of the objective function (the lower, the better).
    """

    metric = self.get_tune_metric()
    value, duration = [], []

    # NOTE -- embarrassingly parallel
    # TODO -- parallelize this
    for current_file in getattr(protocol, subset)():

        uem = get_annotated(current_file)

        if learning:
            reference = current_file['annotation']
        else:
            reference = current_file.pop('annotation')

        hypothesis = self.apply(current_file)

        # a pipeline may return None to signal that the current set of
        # hyper-parameters leads to a degenerate hypothesis: penalize it
        # with the worst possible value
        if hypothesis is None:
            return 1.

        metric_value = metric(reference, hypothesis, uem=uem)
        value.append(metric_value)
        duration.append(uem.duration())

    # support for pyannote.metrics
    if hasattr(metric, '__abs__'):
        return abs(metric)

    # support for any other metric
    else:
        return np.average(value, weights=duration)
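# Minimal sketch (made-up numbers, not from the original code) of the
# duration-weighted aggregation performed above: each file contributes to
# the objective in proportion to its annotated duration.
import numpy as np

per_file_value = [0.10, 0.30]         # metric value for each file
annotated_duration = [600.0, 60.0]    # annotated duration (seconds)
print(np.average(per_file_value, weights=annotated_duration))  # ~0.118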
def apply(self, current_file):

    # initial segmentation
    speech_turns = super().apply(current_file)

    # initialize the hypothesized annotation
    hypothesis = Annotation(uri=current_file['uri'])
    if len(speech_turns) < 1:
        return hypothesis

    # this only happens during pipeline training
    if 'annotation' in current_file:

        # number of speech turns in reference
        reference = current_file['annotation']
        n_turns_true = len(list(reference.itertracks()))

        # number of speech turns in hypothesis
        uem = get_annotated(current_file)
        n_turns_pred = len(speech_turns.crop(uem))

        # don't even bother trying to cluster these speech turns
        # as there are way too many of them...
        if n_turns_pred > 20 * n_turns_true:
            return None

    # get raw (sliding window) embeddings
    emb = self.emb_(current_file)

    # get one embedding per speech turn
    # FIXME don't l2_normalize for any metric
    fX = l2_normalize(np.vstack(
        [np.sum(emb.crop(t, mode='loose'), axis=0)
         for t in speech_turns]))

    # apply clustering
    try:
        affinity = -squareform(pdist(fX, metric=self.metric))
        clusters = self.cls_.fit_predict(affinity)
    except MemoryError:
        # cannot compute affinity propagation
        return None

    for speech_turn, cluster in zip(speech_turns, clusters):
        # HACK find out why fit_predict sometimes returns NaN and fix it
        cluster = -1 if np.isnan(cluster) else cluster
        hypothesis[speech_turn] = cluster

    return hypothesis
def fun(threshold):

    binarizer = Binarize(onset=threshold, offset=threshold,
                         log_scale=False)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    metric = DetectionErrorRate()

    # NOTE -- embarrassingly parallel
    # TODO -- parallelize this
    file_generator = getattr(protocol, subset)()
    for current_file in file_generator:
        uri = get_unique_identifier(current_file)
        hypothesis = binarizer.apply(
            predictions[uri], dimension=0).to_annotation()
        reference = current_file['annotation']
        uem = get_annotated(current_file)
        _ = metric(reference, hypothesis, uem=uem)

    return abs(metric)
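# Hedged usage sketch: since `fun` maps a threshold to a detection error
# rate, it can be handed to a bounded scalar optimizer, mirroring the
# `scipy.optimize.minimize_scalar` pattern used elsewhere in this code base.
import scipy.optimize

res = scipy.optimize.minimize_scalar(
    fun, bounds=(0.0, 1.0), method='bounded', options={'maxiter': 10})
best_threshold = res.x   # onset/offset threshold minimizing the error rate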
def apply_pretrained(validate_dir: Path,
                     protocol_name: str,
                     subset: Optional[str] = "test",
                     duration: Optional[float] = None,
                     step: float = 0.25,
                     device: Optional[torch.device] = None,
                     batch_size: int = 32,
                     pretrained: Optional[str] = None,
                     Pipeline: type = None,
                     **kwargs):
    """Apply pre-trained model

    Parameters
    ----------
    validate_dir : Path
    protocol_name : `str`
    subset : 'train' | 'development' | 'test', optional
        Defaults to 'test'.
    duration : `float`, optional
    step : `float`, optional
    device : `torch.device`, optional
    batch_size : `int`, optional
    pretrained : `str`, optional
    Pipeline : `type`
    """

    if pretrained is None:
        pretrained = Pretrained(validate_dir=validate_dir,
                                duration=duration,
                                step=step,
                                batch_size=batch_size,
                                device=device)
        output_dir = validate_dir / 'apply' / f'{pretrained.epoch_:04d}'
    else:
        if pretrained in torch.hub.list('pyannote/pyannote-audio'):
            output_dir = validate_dir / pretrained
        else:
            output_dir = validate_dir
        pretrained = Wrapper(pretrained,
                             duration=duration,
                             step=step,
                             batch_size=batch_size,
                             device=device)

    params = {}
    try:
        params['classes'] = pretrained.classes
    except AttributeError:
        pass
    try:
        params['dimension'] = pretrained.dimension
    except AttributeError:
        pass

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=pretrained.sliding_window,
                              **params)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=pretrained.preprocessors_)

    for current_file in getattr(protocol, subset)():
        fX = pretrained(current_file)
        precomputed.dump(current_file, fX)

    # do not proceed with the full pipeline
    # when there is no such thing for the current task
    if Pipeline is None:
        return

    # instantiate pipeline
    pipeline = Pipeline(scores=output_dir)
    pipeline.instantiate(pretrained.pipeline_params_)

    # load pipeline metric (when available)
    try:
        metric = pipeline.get_metric()
    except NotImplementedError:
        metric = None

    # apply pipeline and dump output to RTTM files
    output_rttm = output_dir / f'{protocol_name}.{subset}.rttm'
    with open(output_rttm, 'w') as fp:
        for current_file in getattr(protocol, subset)():
            hypothesis = pipeline(current_file)
            pipeline.write_rttm(fp, hypothesis)

            # skip evaluation when reference is missing
            if 'annotation' not in current_file:
                metric = None

            # compute evaluation metric (when available)
            if metric is None:
                continue

            reference = current_file['annotation']
            uem = get_annotated(current_file)
            _ = metric(reference, hypothesis, uem=uem)

    # print pipeline metric (when available)
    if metric is None:
        return

    output_eval = output_dir / f'{protocol_name}.{subset}.eval'
    with open(output_eval, 'w') as fp:
        fp.write(str(metric))
def initialize(self, protocol, subset='train'):
    """Gather the following information about the training subset:

    data_ : dict
        {'segments': <list of annotated segments>,
         'duration': <total duration of annotated segments>,
         'current_file': <protocol dictionary>,
         'y': <labels as numpy array>}

    databases_ : list
        Sorted list of (unique) databases in protocol.

    labels_ : list
        Sorted list of (unique) labels in protocol.
    """

    self.data_ = {}
    labels, databases = set(), set()

    # loop once on all files
    for current_file in getattr(protocol, subset)():

        # keep track of database
        database = current_file['database']
        databases.add(database)

        # keep track of unique labels
        for label in current_file['annotation'].labels():
            label = get_label_identifier(label, current_file)
            labels.add(label)

        annotated = get_annotated(current_file)

        if not self.precomputed.use_memmap:
            msg = ('Loading all precomputed features in memory. '
                   'Set "use_memmap" to True if you run out of memory.')
            warnings.warn(msg)

        # remove segments shorter than one training sequence
        segments = [s for s in annotated
                    if s.duration > self.duration]

        # corner case where no segment is long enough
        # and we removed them all...
        if not segments:
            continue

        # total duration of annotated segments in current_file
        # (after removal of short segments)
        duration = sum(s.duration for s in segments)

        # store all these in data_ dictionary
        datum = {'segments': segments,
                 'duration': duration,
                 'current_file': current_file}
        uri = get_unique_identifier(current_file)
        self.data_[uri] = datum

    self.databases_ = sorted(databases)
    self.labels_ = sorted(labels)

    sliding_window = self.precomputed.sliding_window()

    for current_file in getattr(protocol, subset)():
        uri = get_unique_identifier(current_file)

        # skip files that were filtered out during the first loop
        if uri not in self.data_:
            continue

        y, _ = to_numpy(current_file, self.precomputed,
                        labels=self.labels_)
        self.data_[uri]['y'] = SlidingWindowFeature(
            self.postprocess_y(y), sliding_window)
def apply(self, protocol_name: str,
          output_dir: Path,
          subset: Optional[str] = "test"):
    """Apply current best pipeline

    Parameters
    ----------
    protocol_name : `str`
        Name of pyannote.database protocol to process.
    output_dir : `Path`
        Directory where to store pipeline output.
    subset : `str`, optional
        Subset to process. Defaults to 'test'.
    """

    # file generator
    protocol = get_protocol(protocol_name,
                            preprocessors=self.preprocessors_)

    # load pipeline metric (when available)
    try:
        metric = self.pipeline_.get_metric()
    except NotImplementedError:
        metric = None

    output_dir.mkdir(parents=True, exist_ok=True)
    output_ext = (output_dir /
                  f"{protocol_name}.{subset}.{self.pipeline_.write_format}")

    with open(output_ext, mode="w") as fp:

        files = list(getattr(protocol, subset)())

        desc = f"Processing {protocol_name} ({subset})"
        for current_file in tqdm(iterable=files, desc=desc, unit="file"):

            # apply pipeline and dump output to file
            output = self.pipeline_(current_file)
            self.pipeline_.write(fp, output)

            # skip evaluation when reference is missing
            reference = current_file.get("annotation", None)
            if reference is None:
                metric = None

            # compute evaluation metric (when available)
            if metric is None:
                continue

            uem = get_annotated(current_file)
            _ = metric(reference, output, uem=uem)

    # "latest" symbolic link
    latest = output_dir.parent / "latest"
    if latest.exists():
        latest.unlink()
    latest.symlink_to(output_dir)

    # print pipeline metric (when available)
    if metric is None:
        msg = (f"For some (possibly good) reason, the output of this "
               f"pipeline could not be evaluated on {protocol_name}.")
        print(msg)
        return

    output_eval = output_dir / f"{protocol_name}.{subset}.eval"
    with open(output_eval, "w") as fp:
        fp.write(str(metric))
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):

    target_purity = self.purity

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration
    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # extract predictions for all files
    predictions = {}
    for current_file in getattr(protocol, subset)():
        uri = get_unique_identifier(current_file)
        predictions[uri] = sequence_labeling.apply(current_file)

    # dichotomic search for the alpha that maximizes coverage
    # while guaranteeing at least `target_purity` purity
    lower_alpha = 0.
    upper_alpha = 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_coverage = 0.

    for _ in range(10):

        current_alpha = .5 * (lower_alpha + upper_alpha)
        peak = Peak(alpha=current_alpha, min_duration=0.0,
                    log_scale=model.logsoftmax)

        metric = DiarizationPurityCoverageFMeasure()

        # NOTE -- embarrassingly parallel
        # TODO -- parallelize this
        for current_file in getattr(protocol, subset)():
            reference = current_file['annotation']
            uri = get_unique_identifier(current_file)
            hypothesis = peak.apply(predictions[uri], dimension=1)
            hypothesis = hypothesis.to_annotation()
            uem = get_annotated(current_file)
            metric(reference, hypothesis, uem=uem)

        purity, coverage, _ = metric.compute_metrics()

        if purity < target_purity:
            # not pure enough: try lower alphas
            # (more change points, hence shorter and purer segments)
            upper_alpha = current_alpha
        else:
            lower_alpha = current_alpha
            if coverage > best_coverage:
                best_coverage = coverage
                best_alpha = current_alpha

    task = 'speaker_change_detection'
    metric_name = f'{task}/coverage@{target_purity:.2f}purity'
    return {
        metric_name: {'minimize': False, 'value': best_coverage},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
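# Standalone sketch of the dichotomic search pattern above, extracted for
# clarity (names are hypothetical, not from the original code). It assumes
# purity decreases and coverage increases monotonically with the threshold,
# so binary search homes in on the largest threshold meeting the target.
def tune_threshold(evaluate, target_purity, n_iterations=10):
    lower, upper = 0.0, 1.0
    best_threshold, best_coverage = 0.5 * (lower + upper), 0.0
    for _ in range(n_iterations):
        current = 0.5 * (lower + upper)
        purity, coverage = evaluate(current)   # hypothetical callback
        if purity < target_purity:
            upper = current   # not pure enough: try lower thresholds
        else:
            lower = current
            if coverage > best_coverage:
                best_threshold, best_coverage = current, coverage
    return best_threshold, best_coverage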
def apply(self, protocol_name: str,
          output_dir: Path,
          subset: Optional[str] = None):
    """Apply current best pipeline

    Parameters
    ----------
    protocol_name : `str`
        Name of pyannote.database protocol to process.
    output_dir : `Path`
        Directory where to store pipeline output.
    subset : `str`, optional
        Subset to process. Defaults to processing all subsets.
    """

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    output_dir.mkdir(parents=True, exist_ok=False)
    if subset is None:
        path = output_dir / f'{protocol_name}.all.txt'
    else:
        path = output_dir / f'{protocol_name}.{subset}.txt'

    # initialize evaluation metric
    try:
        metric = self.pipeline_.get_metric()
    except NotImplementedError:
        metric = None

    losses = []

    with open(path, mode='w') as fp:

        if subset is None:
            files = FileFinder.protocol_file_iter(protocol)
        else:
            files = getattr(protocol, subset)()

        for current_file in files:
            output = self.pipeline_(current_file)

            # evaluate output
            if metric is None:
                loss = self.pipeline_.loss(current_file, output)
                losses.append(loss)
            else:
                from pyannote.database import get_annotated
                _ = metric(current_file['annotation'], output,
                           uem=get_annotated(current_file))

            self.pipeline_.write(fp, output)

    # report evaluation metric
    if metric is None:
        loss = np.mean(losses)
        print(f'Loss = {loss:g}')
    else:
        _ = metric.report(display=True)
def sliding_samples(self):
    """Sliding window

    Returns
    -------
    samples : generator
        Generator that yields {'waveform': ..., 'y': ...} samples
        indefinitely.
    """

    uris = list(self.data_)
    durations = np.array([self.data_[uri]['duration'] for uri in uris])
    probabilities = durations / np.sum(durations)

    sliding_segments = SlidingWindow(duration=self.duration,
                                     step=self.duration)

    while True:

        # shuffle files
        np.random.shuffle(uris)

        # loop on shuffled files
        for uri in uris:

            datum = self.data_[uri]

            # make a copy of current file
            current_file = dict(datum['current_file'])

            # read waveform for the whole file
            waveform = self.raw_audio_(current_file)

            # randomly shift 'annotated' segments start time so that
            # we avoid generating exactly the same subsequence twice
            shifted_segments = [
                Segment(s.start + np.random.random() * self.duration, s.end)
                for s in get_annotated(current_file)]

            # deal with the corner case where a shifted segment is empty
            shifted_segments = [s for s in shifted_segments if s]
            annotated = Timeline(segments=shifted_segments)

            samples = []
            for sequence in sliding_segments(annotated):

                X = waveform.crop(sequence, mode='center',
                                  fixed=self.duration)

                y = datum['y'].crop(sequence, mode=self.alignment,
                                    fixed=self.duration)

                # FIXME -- this is ugly
                sample = {'waveform': normalize(X),
                          'y': y,
                          'database': current_file['database'],
                          'uri': current_file['uri'],
                          'audio': current_file['audio'],
                          'duration': current_file['duration']}

                samples.append(sample)

            np.random.shuffle(samples)
            for sample in samples:
                yield sample
def validate_helper_func(current_file, pipeline=None, metric=None):
    """Apply `pipeline` to `current_file` and evaluate it against the
    file's reference annotation."""
    reference = current_file['annotation']
    uem = get_annotated(current_file)
    hypothesis = pipeline(current_file)
    return metric(reference, hypothesis, uem=uem)
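# Hedged sketch of the parallelization that the NOTE/TODO comments call for:
# `validate_helper_func` takes `pipeline` and `metric` as keyword arguments
# precisely so it can be partially applied and mapped over files. It assumes
# `pipeline` and `metric` pickle cleanly; each worker accumulates its own
# metric copy, so only the returned per-file values should be used.
import functools
import multiprocessing

def evaluate_files(files, pipeline, metric, n_jobs=4):
    helper = functools.partial(validate_helper_func,
                               pipeline=pipeline, metric=metric)
    with multiprocessing.Pool(n_jobs) as pool:
        return pool.map(helper, list(files))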
def apply_iter(self, current_file, hypothesis,
               partial=True, device=None, log_dir=None):
    """Yield re-segmentation results for each epoch

    Parameters
    ----------
    current_file : pyannote.database dict
        Currently processed file.
    hypothesis : pyannote.core.Annotation
        Input segmentation.
    partial : bool, optional
        Set to False to only yield the final re-segmentation.
        Set to True to yield re-segmentation after each epoch.
    device : torch.device, optional
        Defaults to torch.device('cpu').
    log_dir : str, optional
        Path to log directory.

    Yields
    ------
    resegmented : pyannote.core.Annotation
        Re-segmentation results after each epoch.
    """

    device = torch.device('cpu') if device is None else device

    current_file = dict(current_file)
    current_file['annotation'] = hypothesis

    # set `per_epoch` attribute to current file annotated duration
    self.per_epoch = get_annotated(current_file).duration()

    # number of speakers + 1 for non-speech
    self.n_classes_ = len(hypothesis.labels()) + 1

    model = StackedRNN(self.precomputed.dimension(), self.n_classes_,
                       rnn=self.rnn, recurrent=self.recurrent,
                       linear=self.linear,
                       bidirectional=self.bidirectional,
                       logsoftmax=True)

    # initialize dummy protocol that has only one file
    protocol = self.get_dummy_protocol(current_file)

    if log_dir is None:
        log_dir = tempfile.mkdtemp()
    uri = get_unique_identifier(current_file)
    log_dir = f'{log_dir}/{uri}'

    self.scores_ = collections.deque([], maxlen=self.ensemble)

    iterations = self.fit_iter(model, self.precomputed,
                               protocol, subset='train',
                               restart=0, epochs=self.epochs,
                               learning_rate='auto',
                               get_optimizer=SGD,
                               get_scheduler=ConstantScheduler,
                               log_dir=log_dir, device=device)

    for i, iteration in enumerate(iterations):

        # if 'partial', compute scores for every iteration
        # if not, compute scores for the last 'ensemble' iterations only
        if partial or (i + 1 > self.epochs - self.ensemble):
            iteration_score = self._score(iteration['model'],
                                          current_file, device=device)
            self.scores_.append(iteration_score)

        # if 'partial', generate (and yield) hypothesis
        if partial:
            hypothesis = self._decode(self.scores_)
            yield hypothesis

    # generate (and yield) the final hypothesis in case it has not been yet
    if not partial:
        hypothesis = self._decode(self.scores_)
        yield hypothesis
def __call__(self, current_file: dict) -> Annotation:
    """Apply speaker diarization

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.

    Returns
    -------
    hypothesis : `pyannote.core.Annotation`
        Speaker diarization output.
    """

    # segmentation into speech turns
    speech_turns = self.speech_turn_segmentation(current_file)

    # some files are only partially annotated and therefore one cannot
    # evaluate speaker diarization results on the whole file.
    # this option simply avoids trying to cluster those (potentially
    # messy) un-annotated regions by focusing only on speech turns
    # contained in the annotated regions.
    if self.evaluation_only:
        annotated = get_annotated(current_file)
        speech_turns = speech_turns.crop(annotated, mode='intersection')

    # in case there is one speech turn or less, there is no need to apply
    # any kind of clustering approach.
    if len(speech_turns) < 2:
        return speech_turns

    # split short/long speech turns. the idea is to first cluster long
    # speech turns (i.e. those for which we can trust embeddings) and then
    # assign each short speech turn to the closest cluster.
    long_speech_turns = speech_turns.empty()
    shrt_speech_turns = speech_turns.empty()
    for segment, track, label in speech_turns.itertracks(yield_label=True):
        if segment.duration < self.min_duration:
            shrt_speech_turns[segment, track] = label
        else:
            long_speech_turns[segment, track] = label

    # in case there are no long speech turns to cluster, we return the
    # original speech turns (= shrt_speech_turns)
    if len(long_speech_turns) < 1:
        return speech_turns

    # first: cluster long speech turns
    long_speech_turns = self.speech_turn_clustering(current_file,
                                                    long_speech_turns)

    # then: assign short speech turns to the resulting clusters
    long_speech_turns.rename_labels(generator='string', copy=False)

    if len(shrt_speech_turns) > 0:
        shrt_speech_turns.rename_labels(generator='int', copy=False)
        shrt_speech_turns = self.speech_turn_assignment(current_file,
                                                        shrt_speech_turns,
                                                        long_speech_turns)

    # merge short/long speech turns
    return long_speech_turns.update(
        shrt_speech_turns, copy=False).support(collar=0.)
def objective(trial: Trial) -> float:
    """Compute objective value

    Parameters
    ----------
    trial : `Trial`
        Current trial.

    Returns
    -------
    loss : `float`
        Loss.
    """

    # use pyannote.metrics metric when available
    try:
        metric = self.pipeline.get_metric()
    except NotImplementedError:
        metric = None

    losses = []
    processing_time = []
    evaluation_time = []

    # instantiate pipeline with values suggested in current trial
    pipeline = self.pipeline.instantiate(
        self.pipeline.parameters(trial=trial))

    if show_progress is not False:
        progress_bar = tqdm(total=len(inputs), **show_progress)
        progress_bar.update(0)

    # accumulate loss for each input
    for i, input in enumerate(inputs):

        # process input with pipeline
        # (and keep track of processing time)
        before_processing = time.time()
        output = pipeline(input)
        after_processing = time.time()
        processing_time.append(after_processing - before_processing)

        # evaluate output (and keep track of evaluation time)
        before_evaluation = time.time()

        # when metric is not available, use the loss method instead
        if metric is None:
            loss = pipeline.loss(input, output)
            losses.append(loss)

        # when metric is available, `input` is expected to be provided
        # by a `pyannote.database` protocol
        else:
            from pyannote.database import get_annotated
            _ = metric(input["annotation"], output,
                       uem=get_annotated(input))

        after_evaluation = time.time()
        evaluation_time.append(after_evaluation - before_evaluation)

        if show_progress is not False:
            progress_bar.update(1)

        if self.pruner is None:
            continue

        trial.report(np.mean(losses) if metric is None else abs(metric), i)
        if trial.should_prune(i):
            raise optuna.structs.TrialPruned()

    if show_progress is not False:
        progress_bar.close()

    trial.set_user_attr("processing_time", sum(processing_time))
    trial.set_user_attr("evaluation_time", sum(evaluation_time))

    return np.mean(losses) if metric is None else abs(metric)
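# Hedged usage sketch: `objective` follows the standard optuna interface,
# so tuning would look roughly like this (study settings are assumptions,
# not taken from the original code).
import optuna

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)
print(study.best_params, study.best_value)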
def apply_pretrained(
    validate_dir: Path,
    protocol_name: Text,
    subset: Subset = "test",
    duration: Optional[float] = None,
    step: float = 0.25,
    device: Optional[torch.device] = None,
    batch_size: int = 32,
    pretrained: Optional[str] = None,
    Pipeline: type = None,
    **kwargs,
):
    """Apply pre-trained model

    Parameters
    ----------
    validate_dir : Path
    protocol_name : `str`
    subset : 'train' | 'development' | 'test', optional
        Defaults to 'test'.
    duration : `float`, optional
    step : `float`, optional
    device : `torch.device`, optional
    batch_size : `int`, optional
    pretrained : `str`, optional
    Pipeline : `type`
    """

    if pretrained is None:
        pretrained = Pretrained(
            validate_dir=validate_dir,
            duration=duration,
            step=step,
            batch_size=batch_size,
            device=device,
        )
        output_dir = validate_dir / "apply" / f"{pretrained.epoch_:04d}"
    else:
        if pretrained in torch.hub.list("pyannote/pyannote-audio"):
            output_dir = validate_dir / pretrained
        else:
            output_dir = validate_dir
        pretrained = Wrapper(
            pretrained,
            duration=duration,
            step=step,
            batch_size=batch_size,
            device=device,
        )

    params = {}
    try:
        params["classes"] = pretrained.classes
    except AttributeError:
        pass
    try:
        params["dimension"] = pretrained.dimension
    except AttributeError:
        pass

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=pretrained.sliding_window,
                              **params)

    # file generator
    preprocessors = getattr(pretrained, "preprocessors_", dict())
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    protocol = get_protocol(protocol_name, preprocessors=preprocessors)

    files = getattr(protocol, subset)()
    for current_file in tqdm(iterable=files,
                             desc=f"{subset.title()}",
                             unit="file"):
        fX = pretrained(current_file)
        precomputed.dump(current_file, fX)

    # do not proceed with the full pipeline
    # when there is no such thing for the current task
    if Pipeline is None:
        return

    # do not proceed with the full pipeline when its parameters cannot be
    # loaded. this might happen when applying a model that has not been
    # validated yet
    try:
        pipeline_params = pretrained.pipeline_params_
    except AttributeError:
        return

    # instantiate pipeline
    pipeline = Pipeline(scores=output_dir)
    pipeline.instantiate(pipeline_params)

    # load pipeline metric (when available)
    try:
        metric = pipeline.get_metric()
    except NotImplementedError:
        metric = None

    # apply pipeline and dump output to RTTM files
    output_rttm = output_dir / f"{protocol_name}.{subset}.rttm"
    with open(output_rttm, "w") as fp:
        files = getattr(protocol, subset)()
        for current_file in tqdm(iterable=files,
                                 desc=f"{subset.title()}",
                                 unit="file"):
            hypothesis = pipeline(current_file)
            pipeline.write_rttm(fp, hypothesis)

            # skip evaluation when reference is missing
            reference = current_file.get("annotation", None)
            if reference is None:
                metric = None

            # compute evaluation metric (when available)
            if metric is None:
                continue

            uem = get_annotated(current_file)
            _ = metric(reference, hypothesis, uem=uem)

    # print pipeline metric (when available)
    if metric is None:
        return

    output_eval = output_dir / f"{protocol_name}.{subset}.eval"
    with open(output_eval, "w") as fp:
        fp.write(str(metric))
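# Hedged usage sketch for `apply_pretrained`; the directory and protocol
# name below are made-up examples.
from pathlib import Path

apply_pretrained(Path("models/speech_activity_detection/validate"),
                 "AMI.SpeakerDiarization.MixHeadset",
                 subset="test", step=0.25, batch_size=32)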
purity_list = []
coverage_list = []

for alpha in alphas:

    peak = Peak(alpha=alpha, min_duration=min_duration, log_scale=True)

    # evaluation metric (reset for each alpha)
    metric = DiarizationPurityCoverageFMeasure()

    # loop on test files
    for test_file in protocol.test():

        # load reference annotation
        reference = test_file['annotation']
        uem = get_annotated(test_file)

        # load precomputed change scores as pyannote.core.SlidingWindowFeature
        scd_scores = precomputed(test_file)

        # detect peaks to obtain change points as pyannote.core.Timeline
        hypothesis = peak.apply(scd_scores, dimension=1)

        # evaluate speaker change detection
        metric(reference, hypothesis.to_annotation(), uem=uem)

    purity, coverage, fmeasure = metric.compute_metrics()

    purity_list.append(f'{100*purity:.1f}')
    coverage_list.append(f'{100*coverage:.1f}')
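# Minimal reporting sketch (not in the original): print the purity/coverage
# trade-off collected by the sweep above, one line per alpha.
for alpha, purity, coverage in zip(alphas, purity_list, coverage_list):
    print(f'alpha={alpha:.2f}  purity={purity}%  coverage={coverage}%')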
def _sliding_samples(self):

    uris = list(self.data_)
    durations = np.array([self.data_[uri]['duration'] for uri in uris])
    probabilities = durations / np.sum(durations)

    sliding_segments = SlidingWindow(duration=self.duration,
                                     step=self.step * self.duration)

    while True:

        np.random.shuffle(uris)

        # loop on all files
        for uri in uris:

            datum = self.data_[uri]

            # make a copy of current file
            current_file = dict(datum['current_file'])

            # compute features for the whole file
            features = self.feature_extraction(current_file)

            # randomly shift 'annotated' segments start time so that
            # we avoid generating exactly the same subsequence twice
            annotated = Timeline()
            for segment in get_annotated(current_file):
                shifted_segment = Segment(
                    segment.start + np.random.random() * self.duration,
                    segment.end)
                if shifted_segment:
                    annotated.add(shifted_segment)

            samples = []
            for sequence in sliding_segments(annotated):

                X = features.crop(sequence, mode='center',
                                  fixed=self.duration)
                y = self.crop_y(datum['y'], sequence)
                sample = {'X': X, 'y': y}

                if self.mask is not None:

                    # extract mask for current sub-segment
                    mask = current_file[self.mask].crop(sequence,
                                                        mode='center',
                                                        fixed=self.duration)

                    # it might happen that "mask" and "y" use different
                    # sliding windows. therefore, we simply resample "mask"
                    # to match "y"
                    if len(mask) != len(y):
                        mask = scipy.signal.resample(mask, len(y), axis=0)
                    sample['mask'] = mask

                for key, classes in self.file_labels_.items():
                    sample[key] = classes.index(current_file[key])

                samples.append(sample)

            np.random.shuffle(samples)
            for sample in samples:
                yield sample
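# Standalone sketch of the mask resampling trick above (shapes are made-up
# examples): when "mask" and "y" come from different sliding windows, the
# mask is resampled along the time axis to match the target length.
import numpy as np
import scipy.signal

mask = np.random.rand(60, 1)                          # 60 frames
resampled = scipy.signal.resample(mask, 100, axis=0)  # match y's 100 frames
assert resampled.shape == (100, 1)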
def _load_metadata(self, protocol, subset='train') -> float:
    """Load training set metadata

    This function is called once at instantiation time, returns the total
    training set duration, and populates the following attributes:

    Attributes
    ----------
    data_ : dict
        {'segments': <list of annotated segments>,
         'duration': <total duration of annotated segments>,
         'current_file': <protocol dictionary>,
         'y': <labels as numpy array>}

    segment_labels_ : list
        Sorted list of (unique) labels in protocol.

    file_labels_ : dict of list
        Sorted lists of (unique) file labels in protocol.

    Returns
    -------
    duration : float
        Total duration of annotated segments, in seconds.
    """

    self.data_ = {}
    segment_labels, file_labels = set(), dict()

    # loop once on all files
    for current_file in getattr(protocol, subset)():

        # ensure annotation/annotated are cropped to actual file duration
        support = Segment(start=0, end=current_file['duration'])
        current_file['annotated'] = get_annotated(current_file).crop(
            support, mode='intersection')
        current_file['annotation'] = current_file['annotation'].crop(
            support, mode='intersection')

        # keep track of unique segment labels
        segment_labels.update(current_file['annotation'].labels())

        # keep track of unique file labels
        for key, value in current_file.items():
            if isinstance(value, (Annotation, Timeline,
                                  SlidingWindowFeature)):
                continue
            if key not in file_labels:
                file_labels[key] = set()
            file_labels[key].add(value)

        # remove segments shorter than one training sequence
        segments = [s for s in current_file['annotated']
                    if s.duration > self.duration]

        # corner case where no segment is long enough
        # and we removed them all...
        if not segments:
            continue

        # total duration of annotated segments in current_file
        # (after removal of short segments)
        duration = sum(s.duration for s in segments)

        # store all these in data_ dictionary
        datum = {'segments': segments,
                 'duration': duration,
                 'current_file': current_file}
        uri = get_unique_identifier(current_file)
        self.data_[uri] = datum

    self.file_labels_ = {k: sorted(file_labels[k]) for k in file_labels}
    self.segment_labels_ = sorted(segment_labels)

    for uri in list(self.data_):
        current_file = self.data_[uri]['current_file']
        y = self.initialize_y(current_file)
        self.data_[uri]['y'] = y
        if self.mask is not None:
            mask = current_file[self.mask]
            current_file[self.mask] = mask.align(y)

    return sum(datum['duration'] for datum in self.data_.values())
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):

    target_precision = self.precision

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration
    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    predictions = {}
    references = {}

    file_generator = getattr(protocol, subset)()
    for current_file in file_generator:
        uri = get_unique_identifier(current_file)

        # build overlap reference from pairs of co-occurring speech turns
        reference = Timeline(uri=uri)
        annotation = current_file['annotation']
        for track1, track2 in annotation.co_iter(annotation):
            if track1 == track2:
                continue
            reference.add(track1[0] & track2[0])
        references[uri] = reference.to_annotation()

        # extract overlap scores
        scores = sequence_labeling.apply(current_file)

        if model.logsoftmax:
            scores = SlidingWindowFeature(
                np.exp(scores.data[:, 2]), scores.sliding_window)
        else:
            scores = SlidingWindowFeature(
                scores.data[:, 2], scores.sliding_window)

        predictions[uri] = scores

    # dichotomic search for the threshold that maximizes recall
    # while guaranteeing at least `target_precision` precision
    lower_alpha = 0.
    upper_alpha = 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_recall = 0.

    for _ in range(10):

        current_alpha = .5 * (lower_alpha + upper_alpha)
        binarizer = Binarize(onset=current_alpha,
                             offset=current_alpha,
                             log_scale=False)

        precision = DetectionPrecision()
        recall = DetectionRecall()

        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            reference = references[uri]
            hypothesis = binarizer.apply(predictions[uri], dimension=0)
            hypothesis = hypothesis.to_annotation()
            uem = get_annotated(current_file)
            _ = precision(reference, hypothesis, uem=uem)
            _ = recall(reference, hypothesis, uem=uem)

        if abs(precision) < target_precision:
            # precision is not high enough: try higher thresholds
            lower_alpha = current_alpha
        else:
            upper_alpha = current_alpha
            r = abs(recall)
            if r > best_recall:
                best_recall = r
                best_alpha = current_alpha

    task = 'overlap_speech_detection'
    metric_name = f'{task}/recall@{target_precision:.2f}precision'
    return {
        metric_name: {'minimize': False, 'value': best_recall},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
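# Minimal sketch of the overlap reference construction above: pyannote.core
# segments support intersection via the `&` operator, so the overlapped
# region between two speakers' turns is simply `turn1 & turn2`.
from pyannote.core import Segment

turn1, turn2 = Segment(0.0, 5.0), Segment(3.0, 8.0)
print(turn1 & turn2)   # Segment(3.0, 5.0): the overlapped region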
def _validate_epoch_diarization(
    self,
    epoch,
    validation_data,
    protocol=None,
    subset: Subset = "development",
    device: Optional[torch.device] = None,
    batch_size: int = 32,
    n_jobs: int = 1,
    duration: float = None,
    step: float = 0.25,
    metric: str = None,
    **kwargs,
):

    # initialize embedding extraction
    pretrained = Pretrained(
        validate_dir=self.validate_dir_,
        epoch=epoch,
        duration=duration,
        step=step,
        batch_size=batch_size,
        device=device,
    )

    preprocessors = self.preprocessors_
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    _protocol = get_protocol(protocol, preprocessors=preprocessors)

    Z, t = dict(), dict()
    min_d, max_d = np.inf, -np.inf

    for current_file in getattr(_protocol, subset)():

        uri = get_unique_identifier(current_file)
        uem = get_annotated(current_file)
        reference = current_file["annotation"]

        X_, t_ = [], []
        embedding = pretrained(current_file)

        for i, (turn, _) in enumerate(reference.itertracks()):

            # extract embedding for current speech turn
            x_ = embedding.crop(turn, mode="center")
            if len(x_) < 1:
                x_ = embedding.crop(turn, mode="loose")
            if len(x_) < 1:
                msg = f"No embedding for {turn} in {uri:s}."
                raise ValueError(msg)

            # each speech turn is represented by its average embedding
            X_.append(np.mean(x_, axis=0))
            t_.append(turn)

        X_ = np.array(X_)

        # apply hierarchical agglomerative clustering
        # all the way up to just one cluster (i.e. complete dendrogram)
        D = pdist(X_, metric=metric)
        min_d = min(np.min(D), min_d)
        max_d = max(np.max(D), max_d)

        Z[uri] = linkage(X_, method="pool", metric=metric)
        t[uri] = np.array(t_)

    def fun(threshold):

        _metric = DiarizationPurityCoverageFMeasure(weighted=False)

        for current_file in getattr(_protocol, subset)():

            uri = get_unique_identifier(current_file)
            uem = get_annotated(current_file)
            reference = current_file["annotation"]

            # cut the dendrogram at the current threshold
            clusters = fcluster(Z[uri], threshold, criterion="distance")

            hypothesis = Annotation(uri=uri)
            for (start_time, end_time), cluster in zip(t[uri], clusters):
                hypothesis[Segment(start_time, end_time)] = cluster

            _ = _metric(reference, hypothesis, uem=uem)

        return 1.0 - abs(_metric)

    res = scipy.optimize.minimize_scalar(
        fun, bounds=(0.0, 1.0), method="bounded", options={"maxiter": 10})

    threshold = res.x.item()

    return {
        "metric": "diarization_fscore",
        "minimize": False,
        "value": float(1.0 - res.fun),
    }