def _turn_level(self, current_file: dict, speech_turns: Annotation) -> Annotation:
    """Apply clustering at speech turn level

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.
    speech_turns : `Annotation`
        Speech turns. Should only contain `str` labels.

    Returns
    -------
    hypothesis : `pyannote.core.Annotation`
        Clustering result.
    """
    assert_string_labels(speech_turns, "speech_turns")

    embedding = self._embedding(current_file)

    labels = speech_turns.labels()
    X, clustered_labels, skipped_labels = [], [], []
    for label in labels:
        timeline = speech_turns.label_timeline(label, copy=False)

        # be more and more permissive until we have
        # at least one embedding for current speech turn
        for mode in ["strict", "center", "loose"]:
            x = embedding.crop(timeline, mode=mode)
            if len(x) > 0:
                break

        # skip labels so small we don't have any embedding for it
        if len(x) < 1:
            skipped_labels.append(label)
            continue

        clustered_labels.append(label)
        X.append(np.mean(x, axis=0))

    if clustered_labels:
        # apply clustering of label embeddings
        clusters = self.clustering(np.vstack(X))
        # map each clustered label to its cluster (between 1 and N_CLUSTERS)
        mapping = {label: k for label, k in zip(clustered_labels, clusters)}
    else:
        # every label was skipped: np.vstack([]) would raise, so fall
        # through with an empty mapping and rely on the skipped-label
        # renaming below.
        mapping = {}

    # map each skipped label to its own cluster
    # (between -1 and -N_SKIPPED_LABELS)
    for i, label in enumerate(skipped_labels):
        mapping[label] = -(i + 1)

    # do the actual mapping
    return speech_turns.rename_labels(mapping=mapping)
def _xxx_try_iter(self, subset):
    """Iterate over speaker spotting trials of `subset`.

    Parameters
    ----------
    subset : `str`
        Subset name, used to locate `{subset}.trial.txt`.

    Yields
    ------
    current_trial : `dict`
        Trial with keys 'database', 'uri', 'try_with', 'model_id' and
        'reference'. When `self.diarization` is True (the default), the
        trial additionally contains 'annotation' and 'annotated'.
    """
    # load "who speaks when" reference
    data = self._load_data(subset)

    diarization = getattr(self, 'diarization', True)
    if diarization:
        AnnotationGroups = data['annotation'].groupby(by='uri')
    else:
        AnnotationGroups = data['annotation'].groupby(by=['uri', 'speaker'])

    # load trials (keep path and DataFrame in separate variables --
    # the original reused `trials` for both)
    data_dir = Path(__file__).parent / 'data' / 'speaker_spotting'
    trial_path = data_dir / f'{subset}.trial.txt'
    names = ['model_id', 'uri', 'start', 'end', 'target', 'first', 'total']
    # sep=r'\s+' replaces the deprecated delim_whitespace=True (pandas >= 2.1)
    trials = read_table(trial_path, sep=r'\s+', names=names)

    for trial in trials.itertuples():

        model_id = trial.model_id

        # FIE038_m1 ==> FIE038
        # FIE038_m42 ==> FIE038
        # Bernard_Pivot_m1 ==> Bernard_Pivot
        speaker = '_'.join(model_id.split('_')[:-1])

        # append Mix-Headset to uri
        raw_uri = trial.uri
        uri = f'{raw_uri}.Mix-Headset'

        # trial session
        try_with = Segment(start=trial.start, end=trial.end)

        if diarization:
            # 'annotation' & 'annotated' are needed when diarization is set
            # therefore, this needs a bit more work than when set to False.

            annotation = Annotation(uri=uri)
            turns = AnnotationGroups.get_group(raw_uri)
            for t, turn in enumerate(turns.itertuples()):
                segment = Segment(start=turn.start,
                                  end=turn.start + turn.duration)
                # only keep turns that intersect the trial session
                if not (segment & try_with):
                    continue
                annotation[segment, t] = turn.speaker

            annotation = annotation.crop(try_with)
            reference = annotation.label_timeline(speaker)
            annotated = Timeline(uri=uri, segments=[try_with])

            # pack & yield trial
            current_trial = {
                'database': 'Test',
                'uri': uri,
                'try_with': try_with,
                'model_id': model_id,
                'reference': reference,
                'annotation': annotation,
                'annotated': annotated,
            }
        else:
            # 'annotation' & 'annotated' are not needed when diarization is
            # set to False -- leading to a faster implementation...
            segments = []
            if trial.target == 'target':
                turns = AnnotationGroups.get_group((raw_uri, speaker))
                for turn in turns.itertuples():
                    segments.append(Segment(start=turn.start,
                                            end=turn.start + turn.duration))
            # non-target trials get an empty reference timeline
            reference = Timeline(uri=uri, segments=segments).crop(try_with)

            # pack & yield trial
            current_trial = {
                'database': 'Test',
                'uri': uri,
                'try_with': try_with,
                'model_id': model_id,
                'reference': reference,
            }

        yield current_trial
def __call__(
    self, current_file: dict, speech_turns: Annotation, targets: Annotation
) -> Annotation:
    """Assign each speech turn to closest target (if close enough)

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.
    speech_turns : `Annotation`
        Speech turns. Should only contain `int` labels.
    targets : `Annotation`
        Targets. Should only contain `str` labels.

    Returns
    -------
    assigned : `Annotation`
        Assigned speech turns.
    """
    assert_string_labels(targets, "targets")
    assert_int_labels(speech_turns, "speech_turns")

    embedding = self._embedding(current_file)

    def _average_embeddings(annotation):
        # compute one average embedding per label; labels so small that
        # no embedding falls within their support are skipped
        kept_labels, X = [], []
        for label in annotation.labels():
            timeline = annotation.label_timeline(label, copy=False)

            # be more and more permissive until we have
            # at least one embedding for current label
            for mode in ["center", "loose"]:
                x = embedding.crop(timeline, mode=mode)
                if len(x) > 0:
                    break

            # skip labels so small we don't have any embedding for it
            if len(x) < 1:
                continue

            kept_labels.append(label)
            X.append(np.mean(x, axis=0))
        return kept_labels, X

    # gather targets embedding
    targets_labels, X_targets = _average_embeddings(targets)

    # gather speech turns embedding
    assigned_labels, X = _average_embeddings(speech_turns)

    # nothing to assign, or nothing to assign to: return speech turns
    # unchanged instead of crashing on np.vstack of an empty list
    if not X_targets or not X:
        return speech_turns.rename_labels(mapping={})

    # assign speech turns to closest class; closest_assignment returns a
    # negative index when the closest target is not close enough
    assignments = self.closest_assignment(np.vstack(X_targets), np.vstack(X))

    mapping = {
        label: targets_labels[k]
        for label, k in zip(assigned_labels, assignments)
        if not k < 0
    }

    return speech_turns.rename_labels(mapping=mapping)
def tst_try_iter(self):
    """Iterate over 'tst' speaker spotting trials.

    Yields
    ------
    current_trial : `dict`
        Trial with keys 'database', 'uri', 'try_with', 'model_id' and
        'reference'. When `self.diarization` is True (the default), the
        trial additionally contains 'annotation' and 'annotated'.
    """

    def get_turns(uri):
        # "who speaks when" reference for this uri
        ref_file_path = (Path(__file__).parent / 'data'
                         / 'speaker_diarization' / f'{uri}.txt')
        gt_names = ['start', 'end', 'speaker', 'speakerID']
        # fix: read the (already absolute) reference path directly.
        # The original wrapped it in os.path.join(data_dir, ...), which
        # silently discarded data_dir because the second argument was
        # absolute -- and data_dir pointed at 'speaker_spotting' anyway.
        # sep=r'\s+' replaces the deprecated delim_whitespace=True.
        return read_table(ref_file_path, sep=r'\s+', names=gt_names)

    diarization = getattr(self, 'diarization', True)

    # load trials (keep path and DataFrame in separate variables)
    data_dir = Path(__file__).parent / 'data' / 'speaker_spotting'
    trial_path = data_dir / 'tst.trial.txt'
    names = ['model_id', 'uri', 'start', 'end', 'target', 'first', 'total']
    trials = read_table(trial_path, sep=r'\s+', names=names)

    for trial in trials.itertuples():

        # model id is the speaker id in this protocol
        model_id = trial.model_id
        speaker = model_id
        uri = trial.uri

        # trial session
        try_with = Segment(start=trial.start, end=trial.end)

        if diarization:
            # 'annotation' & 'annotated' are needed when diarization is set
            # therefore, this needs a bit more work than when set to False.

            annotation = Annotation(uri=uri)
            turns = get_turns(uri)
            for t, turn in enumerate(turns.itertuples()):
                segment = Segment(start=turn.start, end=turn.end)
                # only keep turns that intersect the trial session
                if not (segment & try_with):
                    continue
                annotation[segment, t] = turn.speakerID

            annotation = annotation.crop(try_with)
            reference = annotation.label_timeline(speaker)
            annotated = Timeline(uri=uri, segments=[try_with])

            # pack & yield trial
            current_trial = {
                'database': 'Odessa',
                'uri': uri,
                'try_with': try_with,
                'model_id': model_id,
                'reference': reference,
                'annotation': annotation,
                'annotated': annotated,
            }
        else:
            # 'annotation' & 'annotated' are not needed when diarization is
            # set to False -- leading to a faster implementation...
            segments = []
            if trial.target == 'target':
                turns = get_turns(uri).groupby(by='speakerID')
                for turn in turns.get_group(speaker).itertuples():
                    segments.append(Segment(start=turn.start, end=turn.end))
            # non-target trials get an empty reference timeline
            reference = Timeline(uri=uri, segments=segments).crop(try_with)

            # pack & yield trial
            current_trial = {
                'database': 'Odessa',
                'uri': uri,
                'try_with': try_with,
                'model_id': model_id,
                'reference': reference,
            }

        yield current_trial