Example #1
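This excerpt assumes NumPy, pyannote.core and the pipeline's label assertions are already imported. A minimal sketch of those imports (the module path of the assertion helpers is an assumption and may differ between pyannote.audio versions):

import numpy as np
from pyannote.core import Annotation
# assumed location of the helpers used below
from pyannote.audio.pipeline.utils import assert_int_labels, assert_string_labels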
    def _turn_level(self, current_file: dict,
                    speech_turns: Annotation) -> Annotation:
        """Apply clustering at speech turn level

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        speech_turns : `Annotation`
            Speech turns. Should only contain `str` labels.

        Returns
        -------
        hypothesis : `pyannote.core.Annotation`
            Clustering result.
        """

        assert_string_labels(speech_turns, "speech_turns")

        embedding = self._embedding(current_file)

        labels = speech_turns.labels()
        X, clustered_labels, skipped_labels = [], [], []
        for label in labels:

            timeline = speech_turns.label_timeline(label, copy=False)

            # be more and more permissive until we have
            # at least one embedding for current speech turn
            for mode in ["strict", "center", "loose"]:
                x = embedding.crop(timeline, mode=mode)
                if len(x) > 0:
                    break

            # skip labels so short that no embedding could be extracted for them
            if len(x) < 1:
                skipped_labels.append(label)
                continue

            clustered_labels.append(label)
            X.append(np.mean(x, axis=0))

        # apply clustering of label embeddings
        clusters = self.clustering(np.vstack(X))

        # map each clustered label to its cluster (between 1 and N_CLUSTERS)
        mapping = {label: k for label, k in zip(clustered_labels, clusters)}

        # map each skipped label to its own cluster
        # (between -1 and -N_SKIPPED_LABELS)
        for l, label in enumerate(skipped_labels):
            mapping[label] = -(l + 1)

        # do the actual mapping
        return speech_turns.rename_labels(mapping=mapping)
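The returned hypothesis simply renames the original str labels: labels that were clustered share their cluster index, while labels too short to be embedded each get their own negative index. A toy, hand-built illustration of that final renaming step (the labels, cluster indices and Annotation content below are made up; only rename_labels comes from pyannote.core):

from pyannote.core import Annotation, Segment

speech_turns = Annotation()
speech_turns[Segment(0.0, 1.0)] = "A"
speech_turns[Segment(2.0, 3.0)] = "B"
speech_turns[Segment(3.0, 3.1)] = "C"   # too short: no embedding, so skipped

# what _turn_level would build internally: A and B fall in cluster 1,
# skipped C gets its own negative cluster
mapping = {"A": 1, "B": 1, "C": -1}
hypothesis = speech_turns.rename_labels(mapping=mapping)
# hypothesis.labels() -> [-1, 1]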
    def __call__(
        self, current_file: dict, speech_turns: Annotation, targets: Annotation
    ) -> Annotation:
        """Assign each speech turn to closest target (if close enough)

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        speech_turns : `Annotation`
            Speech turns. Should only contain `int` labels.
        targets : `Annotation`
            Targets. Should only contain `str` labels.

        Returns
        -------
        assigned : `Annotation`
            Assigned speech turns.
        """

        assert_string_labels(targets, "targets")
        assert_int_labels(speech_turns, "speech_turns")

        embedding = self._embedding(current_file)

        # gather targets embedding
        labels = targets.labels()
        X_targets, targets_labels = [], []
        for label in labels:

            timeline = targets.label_timeline(label, copy=False)

            # be more and more permissive until we have
            # at least one embedding for current target
            for mode in ["center", "loose"]:
                x = embedding.crop(timeline, mode=mode)
                if len(x) > 0:
                    break

            # skip target labels so short that no embedding could be extracted for them
            if len(x) < 1:
                continue

            targets_labels.append(label)
            X_targets.append(np.mean(x, axis=0))

        # gather speech turns embedding
        labels = speech_turns.labels()
        X, assigned_labels, skipped_labels = [], [], []
        for label in labels:

            timeline = speech_turns.label_timeline(label, copy=False)

            # be more and more permissive until we have
            # at least one embedding for current speech turn
            for mode in ["center", "loose"]:
                x = embedding.crop(timeline, mode=mode)
                if len(x) > 0:
                    break

            # skip labels so short that no embedding could be extracted for them
            if len(x) < 1:
                skipped_labels.append(label)
                continue

            assigned_labels.append(label)
            X.append(np.mean(x, axis=0))

        # assign speech turns to closest class
        assignments = self.closest_assignment(np.vstack(X_targets), np.vstack(X))
        mapping = {
            label: targets_labels[k]
            for label, k in zip(assigned_labels, assignments)
            if k >= 0
        }
        return speech_turns.rename_labels(mapping=mapping)
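Here self.closest_assignment is expected to return, for each speech-turn embedding, the index of the closest target, using a negative value when no target is close enough; those speech turns are left out of mapping, so rename_labels keeps their int label and they remain unassigned. A minimal sketch of that idea using cosine distance (the metric, threshold and return convention of the actual block may differ):

import numpy as np

def closest_assignment_sketch(X_targets: np.ndarray, X: np.ndarray,
                              threshold: float = 0.5) -> np.ndarray:
    # unit-normalize so that the dot product gives cosine similarity
    X_targets = X_targets / np.linalg.norm(X_targets, axis=1, keepdims=True)
    X = X / np.linalg.norm(X, axis=1, keepdims=True)
    distance = 1.0 - X @ X_targets.T          # shape: (n_speech_turns, n_targets)
    assignments = np.argmin(distance, axis=1)
    # speech turns too far from every target get a "no match" marker
    too_far = distance[np.arange(len(X)), assignments] > threshold
    assignments[too_far] = -1
    return assignments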