Example #1
    def _turn_level(self, current_file: dict,
                    speech_turns: Annotation) -> Annotation:
        """Apply clustering at speech turn level

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        speech_turns : `Annotation`
            Speech turns. Should only contain `str` labels.

        Returns
        -------
        hypothesis : `pyannote.core.Annotation`
            Clustering result.
        """

        assert_string_labels(speech_turns, "speech_turns")

        embedding = self._embedding(current_file)

        labels = speech_turns.labels()
        X, clustered_labels, skipped_labels = [], [], []
        for label in labels:

            timeline = speech_turns.label_timeline(label, copy=False)

            # become increasingly permissive until we get
            # at least one embedding for the current speech turn
            for mode in ["strict", "center", "loose"]:
                x = embedding.crop(timeline, mode=mode)
                if len(x) > 0:
                    break

            # skip labels so short that no embedding could be extracted
            if len(x) < 1:
                skipped_labels.append(label)
                continue

            clustered_labels.append(label)
            X.append(np.mean(x, axis=0))

        # apply clustering of label embeddings
        clusters = self.clustering(np.vstack(X))

        # map each clustered label to its cluster (between 1 and N_CLUSTERS)
        mapping = {label: k for label, k in zip(clustered_labels, clusters)}

        # map each skipped label to its own cluster
        # (between -1 and -N_SKIPPED_LABELS)
        for l, label in enumerate(skipped_labels):
            mapping[label] = -(l + 1)

        # do the actual mapping
        return speech_turns.rename_labels(mapping=mapping)
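
For context, here is a minimal usage sketch of this method; the `pipeline` instance, the protocol file dict, and the initial segmentation below are illustrative assumptions, not part of the example above.

# Hypothetical usage sketch; `pipeline` is assumed to be an instance of the
# class that defines _turn_level (e.g. a speech turn clustering pipeline).
from pyannote.core import Annotation, Segment

speech_turns = Annotation(uri="file1")
speech_turns[Segment(0.0, 2.5)] = "turn_A"   # labels must be `str`
speech_turns[Segment(3.0, 5.0)] = "turn_B"

current_file = {"uri": "file1", "audio": "file1.wav"}

hypothesis = pipeline._turn_level(current_file, speech_turns)
print(hypothesis.labels())  # cluster identifiers produced by rename_labels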
Example #2
def assert_int_labels(annotation: Annotation, name: str):
    """Check that annotation only contains integer labels

    Parameters
    ----------
    annotation : `pyannote.core.Annotation`
        Annotation.
    name : `str`
        Name of the annotation (used for user feedback in case of failure).
    """
    if any(not isinstance(label, int) for label in annotation.labels()):
        msg = f'{name} must contain `int` labels only.'
        raise ValueError(msg)
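
A quick illustration of the failure mode this guard catches; the annotation below is made up for the example:

from pyannote.core import Annotation, Segment

speech_turns = Annotation(uri="file1")
speech_turns[Segment(0.0, 1.0)] = "A"  # `str` label, not `int`

assert_int_labels(speech_turns, "speech_turns")
# ValueError: speech_turns must contain `int` labels only.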
Example #3
    def iter_triplets(self, from_annotation):
        """Yield (anchor, positive, negative) segment triplets

        Parameters
        ----------
        from_annotation : Annotation
            Annotation from which triplets are obtained.
        """

        t = RandomTrackTriplets(per_label=self.per_label,
                                yield_label=self.yield_label)

        annotation = Annotation(uri=from_annotation.uri,
                                modality=from_annotation.modality)
        for segment, track, label in from_annotation.itertracks(label=True):
            if segment.duration < self.duration:
                continue
            annotation[segment, track] = label

        if len(annotation.labels()) < 2:
            return

        triplets = t.iter_triplets(annotation)

        for triplet in triplets:

            a, p, n = [item[0] for item in triplet]

            if self.duration:
                a, p, n = [self.pick(s) for s in (a, p, n)]

            if self.yield_label:
                a_, p_, n_ = [item[2] for item in triplet]
                yield (a, a_), (p, p_), (n, n_)
            else:
                yield a, p, n
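
A hedged sketch of how this generator might be consumed; `sampler` (an instance of the class above) and `reference` (a labeled Annotation) are illustrative placeholders, not defined in the example:

from itertools import islice

# `sampler` and `reference` are assumed, not part of the snippet above.
for anchor, positive, negative in islice(sampler.iter_triplets(reference), 5):
    # with yield_label=False: anchor and positive share a label,
    # negative carries a different one
    print(anchor, positive, negative)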
Example #4
        path = OCR.format(repository=REPOSITORY, uri=uri)
        names = ['start', 'end', 'start_frame', 'end_frame', 'name', 'confidence']
        pyannote_ocr = Annotation(uri=uri)
        try:
            ocr = pd.read_table(path, delim_whitespace=True, header=None, names=names)
            for _, (start, end, _, _, name, _) in ocr.iterrows():
                pyannote_ocr[Segment(start, end)] = name
        except pd.errors.ParserError:
            # malformed OCR output: fall back to an empty annotation
            pass

        # name each speaker by its most co-occurring OCR name
        if not pyannote_ocr:
            named_speakers = Annotation(uri=uri)
        else:
            named_speakers = argmax_tagger(pyannote_ocr, pyannote_speakers)
            named_speakers = named_speakers.subset(pyannote_ocr.labels())

        path = FUSION.format(repository=REPOSITORY, uri=uri)
        directory = os.path.dirname(path)
        if not os.path.exists(directory):
            os.makedirs(directory)

        with open(path, 'w') as fp:

            duplicates = dict()

            for (speech_turn, track), (_, shot_id) in named_speakers.co_iter(pyannote_shots):

                original_person_name = named_speakers[speech_turn, track]

                person_name = mapping.setdefault(
Example #5
    def __call__(
        self, current_file: dict, speech_turns: Annotation, targets: Annotation
    ) -> Annotation:
        """Assign each speech turn to closest target (if close enough)

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        speech_turns : `Annotation`
            Speech turns. Should only contain `int` labels.
        targets : `Annotation`
            Targets. Should only contain `str` labels.

        Returns
        -------
        assigned : `Annotation`
            Assigned speech turns.
        """

        assert_string_labels(targets, "targets")
        assert_int_labels(speech_turns, "speech_turns")

        embedding = self._embedding(current_file)

        # gather targets embedding
        labels = targets.labels()
        X_targets, targets_labels = [], []
        for label in labels:

            timeline = targets.label_timeline(label, copy=False)

            # become increasingly permissive until we get
            # at least one embedding for the current target
            for mode in ["center", "loose"]:
                x = embedding.crop(timeline, mode=mode)
                if len(x) > 0:
                    break

            # skip targets so short that no embedding could be extracted
            if len(x) < 1:
                continue

            targets_labels.append(label)
            X_targets.append(np.mean(x, axis=0))

        # gather speech turns embedding
        labels = speech_turns.labels()
        X, assigned_labels, skipped_labels = [], [], []
        for label in labels:

            timeline = speech_turns.label_timeline(label, copy=False)

            # become increasingly permissive until we get
            # at least one embedding for the current speech turn
            for mode in ["center", "loose"]:
                x = embedding.crop(timeline, mode=mode)
                if len(x) > 0:
                    break

            # skip labels so short that no embedding could be extracted
            if len(x) < 1:
                skipped_labels.append(label)
                continue

            assigned_labels.append(label)
            X.append(np.mean(x, axis=0))

        # assign each speech turn to the closest target; a negative assignment
        # means "not close enough", and those turns keep their original label
        assignments = self.closest_assignment(np.vstack(X_targets), np.vstack(X))
        mapping = {
            label: targets_labels[k]
            for label, k in zip(assigned_labels, assignments)
            if k >= 0
        }
        return speech_turns.rename_labels(mapping=mapping)
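
For context, a minimal sketch of how this assignment step might be called; `assigner`, the enrollment targets, and the int-labeled speech turns are illustrative assumptions, not part of the example above:

from pyannote.core import Annotation, Segment

targets = Annotation(uri="file1")
targets[Segment(0.0, 10.0)] = "alice"      # `str` labels (known speakers)

speech_turns = Annotation(uri="file1")
speech_turns[Segment(12.0, 15.0)] = 0      # `int` labels (anonymous clusters)

current_file = {"uri": "file1", "audio": "file1.wav"}

# `assigner` is assumed to be an instance of the class defining __call__.
assigned = assigner(current_file, speech_turns, targets)
# turns close enough to a target take that target's label;
# the rest keep their original `int` label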
Example #6
        path = OCR.format(repository=REPOSITORY, uri=uri)
        names = ['start', 'end', 'start_frame', 'end_frame', 'name', 'confidence']
        pyannote_ocr = Annotation(uri=uri)
        try:
            ocr = pd.read_table(path, delim_whitespace=True, header=None, names=names)
            for _, (start, end, _, _, name, _) in ocr.iterrows():
                pyannote_ocr[Segment(start, end)] = name
        except pd.errors.ParserError:
            # malformed OCR output: fall back to an empty annotation
            pass

        # name each face track by its most co-occurring OCR name
        if not pyannote_ocr:
            named_face = Annotation(uri=uri)
        else:
            named_face = argmax_tagger(pyannote_ocr, pyannote_face)
            named_face = named_face.subset(pyannote_ocr.labels())

        path = FUSION.format(repository=REPOSITORY, uri=uri)
        directory = os.path.dirname(path)
        if not os.path.exists(directory):
            os.makedirs(directory)

        with open(path, 'w') as fp:

            duplicates = dict()

            for (segment, track_id), (_, shot_id) in named_face.co_iter(pyannote_shots):

                original_person_name = named_face[segment, track_id]

                person_name = mapping.setdefault(