def _turn_level(self, current_file: dict, speech_turns: Annotation) -> Annotation:
    """Apply clustering at speech turn level

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.
    speech_turns : `Annotation`
        Speech turns. Should only contain `str` labels.

    Returns
    -------
    hypothesis : `pyannote.core.Annotation`
        Clustering result.
    """
    assert_string_labels(speech_turns, "speech_turns")

    embedding = self._embedding(current_file)

    labels = speech_turns.labels()
    X, clustered_labels, skipped_labels = [], [], []
    for label in labels:

        timeline = speech_turns.label_timeline(label, copy=False)

        # be more and more permissive until we have
        # at least one embedding for current speech turn
        for mode in ["strict", "center", "loose"]:
            x = embedding.crop(timeline, mode=mode)
            if len(x) > 0:
                break

        # skip labels so small we don't have any embedding for it
        if len(x) < 1:
            skipped_labels.append(label)
            continue

        clustered_labels.append(label)
        X.append(np.mean(x, axis=0))

    if X:
        # apply clustering of label embeddings
        clusters = self.clustering(np.vstack(X))

        # map each clustered label to its cluster (between 1 and N_CLUSTERS)
        mapping = {label: k for label, k in zip(clustered_labels, clusters)}
    else:
        # no label had any embedding: nothing to cluster.
        # (the original np.vstack([]) would have raised a ValueError here)
        mapping = {}

    # map each skipped label to its own cluster
    # (between -1 and -N_SKIPPED_LABELS)
    for i, label in enumerate(skipped_labels):
        mapping[label] = -(i + 1)

    # do the actual mapping
    return speech_turns.rename_labels(mapping=mapping)
def assert_int_labels(annotation: Annotation, name: str):
    """Check that annotation only contains integer labels

    Parameters
    ----------
    annotation : `pyannote.core.Annotation`
        Annotation.
    name : `str`
        Name of the annotation (used for user feedback in case of failure)

    Raises
    ------
    ValueError
        If `annotation` contains at least one non-integer label.
    """
    # Accept numpy integers as well: clustering (e.g. `_turn_level`) relabels
    # speech turns with values coming out of `self.clustering(...)`, which are
    # typically `np.integer`, not builtin `int`.
    if any(
        not isinstance(label, (int, np.integer)) for label in annotation.labels()
    ):
        msg = f'{name} must contain `int` labels only.'
        raise ValueError(msg)
def iter_triplets(self, from_annotation):
    """Yield (anchor, positive, negative) segment triplets

    Parameters
    ----------
    from_annotation : Annotation
        Annotation from which triplets are obtained.
    """
    sampler = RandomTrackTriplets(per_label=self.per_label,
                                  yield_label=self.yield_label)

    # keep only tracks whose segment is long enough
    filtered = Annotation(uri=from_annotation.uri,
                          modality=from_annotation.modality)
    for segment, track, label in from_annotation.itertracks(label=True):
        if segment.duration >= self.duration:
            filtered[segment, track] = label

    # triplets need at least two distinct labels
    if len(filtered.labels()) < 2:
        return

    for triplet in sampler.iter_triplets(filtered):
        anchor, positive, negative = (item[0] for item in triplet)

        # optionally crop each segment to a fixed duration
        if self.duration:
            anchor = self.pick(anchor)
            positive = self.pick(positive)
            negative = self.pick(negative)

        if self.yield_label:
            anchor_lbl, positive_lbl, negative_lbl = (item[2] for item in triplet)
            yield (anchor, anchor_lbl), (positive, positive_lbl), (negative, negative_lbl)
        else:
            yield anchor, positive, negative
path = OCR.format(repository=REPOSITORY, uri=uri) names = ['start', 'end', 'start_frame', 'end_frame', 'name', 'confidence'] pyannote_ocr = Annotation(uri=uri) try: ocr = pd.read_table(path, delim_whitespace=True, header=None, names=names) for _, (start, end, _, _, name, _) in ocr.iterrows(): pyannote_ocr[Segment(start, end)] = name except pandas.parser.CParserError as e: pass # name each speaker by most co-occurring OCR name if not pyannote_ocr: named_speakers = Annotation(uri=uri) else: named_speakers = argmax_tagger(pyannote_ocr, pyannote_speakers) named_speakers = named_speakers.subset(pyannote_ocr.labels()) path = FUSION.format(repository=REPOSITORY, uri=uri) directory = os.path.dirname(path) if not os.path.exists(directory): os.makedirs(directory) with open(path, 'w') as fp: duplicates = dict() for (speech_turn, track), (_, shot_id) in named_speakers.co_iter(pyannote_shots): original_person_name = named_speakers[speech_turn, track] person_name = mapping.setdefault(
def __call__(
    self, current_file: dict, speech_turns: Annotation, targets: Annotation
) -> Annotation:
    """Assign each speech turn to closest target (if close enough)

    Parameters
    ----------
    current_file : `dict`
        File as provided by a pyannote.database protocol.
    speech_turns : `Annotation`
        Speech turns. Should only contain `int` labels.
    targets : `Annotation`
        Targets. Should only contain `str` labels.

    Returns
    -------
    assigned : `Annotation`
        Assigned speech turns.
    """
    assert_string_labels(targets, "targets")
    assert_int_labels(speech_turns, "speech_turns")

    embedding = self._embedding(current_file)

    # gather targets embedding (targets without any embedding are skipped)
    X_targets, targets_labels, _ = self._mean_label_embeddings(targets, embedding)

    # gather speech turns embedding (speech turns without any embedding are
    # skipped and therefore keep their original label)
    X, assigned_labels, _ = self._mean_label_embeddings(speech_turns, embedding)

    # nothing to assign, or nothing to assign to: return a (relabel-free)
    # copy of speech turns. (the original np.vstack([]) would have raised)
    if not X or not X_targets:
        return speech_turns.rename_labels(mapping={})

    # assign speech turns to closest target
    assignments = self.closest_assignment(np.vstack(X_targets), np.vstack(X))

    # negative assignment means "too far from every target": the
    # corresponding speech turn keeps its original (int) label
    mapping = {
        label: targets_labels[k]
        for label, k in zip(assigned_labels, assignments)
        if not k < 0
    }

    return speech_turns.rename_labels(mapping=mapping)

def _mean_label_embeddings(self, annotation: Annotation, embedding):
    """Compute one mean embedding per label of `annotation`

    Parameters
    ----------
    annotation : `Annotation`
        Annotation whose labels are embedded.
    embedding :
        Precomputed embeddings; must support `crop(timeline, mode=...)`
        (presumably a pyannote `SlidingWindowFeature` — to confirm).

    Returns
    -------
    X : `list` of `np.ndarray`
        One mean embedding per kept label.
    kept_labels : `list`
        Labels for which at least one embedding was found.
    skipped_labels : `list`
        Labels so small that no embedding could be extracted.
    """
    X, kept_labels, skipped_labels = [], [], []
    for label in annotation.labels():

        timeline = annotation.label_timeline(label, copy=False)

        # be more and more permissive until we have
        # at least one embedding for current label
        for mode in ["center", "loose"]:
            x = embedding.crop(timeline, mode=mode)
            if len(x) > 0:
                break

        # skip labels so small we don't have any embedding for it
        if len(x) < 1:
            skipped_labels.append(label)
            continue

        kept_labels.append(label)
        X.append(np.mean(x, axis=0))

    return X, kept_labels, skipped_labels
path = OCR.format(repository=REPOSITORY, uri=uri) names = ['start', 'end', 'start_frame', 'end_frame', 'name', 'confidence'] pyannote_ocr = Annotation(uri=uri) try: ocr = pd.read_table(path, delim_whitespace=True, header=None, names=names) for _, (start, end, _, _, name, _) in ocr.iterrows(): pyannote_ocr[Segment(start, end)] = name except pandas.parser.CParserError as e: pass # name each person by most co-occurring OCR name if not pyannote_ocr: named_face = Annotation(uri=uri) else: named_face = argmax_tagger(pyannote_ocr, pyannote_face) named_face = named_face.subset(pyannote_ocr.labels()) path = FUSION.format(repository=REPOSITORY, uri=uri) directory = os.path.dirname(path) if not os.path.exists(directory): os.makedirs(directory) with open(path, 'w') as fp: duplicates = dict() for (segment, track_id), (_, shot_id) in named_face.co_iter(pyannote_shots): original_person_name = named_face[segment, track_id] person_name = mapping.setdefault(