Code example #1
    def _decode(
        self,
        current_file: ProtocolFile,
        hypothesis: Annotation,
        scores: SlidingWindowFeature,
        labels: Iterable,
    ) -> Annotation:

        N, K = scores.data.shape

        if self.allow_overlap:
            # a speaker is active in every frame where its score exceeds 0.5,
            # so several speakers may be active in the same frame
            active_speakers = scores.data > 0.5

        else:
            if self.lock_speech:
                # single most likely class per frame, shifted by one so that
                # label 0 can be kept for non-speech
                active_speakers = np.argmax(scores.data, axis=1) + 1

            else:
                # single most likely class per frame (0 is non-speech)
                active_speakers = np.argmax(scores.data, axis=1)

        # reconstruct annotation
        new_hypothesis = one_hot_decoding(active_speakers,
                                          scores.sliding_window,
                                          labels=labels)

        new_hypothesis.uri = hypothesis.uri

        if self.lock_speech:
            speech = hypothesis.get_timeline().support()
            new_hypothesis = new_hypothesis.crop(speech)

        return new_hypothesis
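Stripped of the pipeline plumbing, the decoding above reduces to two NumPy idioms: thresholding at 0.5 when overlapping speakers are allowed, and a per-frame argmax otherwise. A minimal, self-contained sketch on a made-up (4 frames, 3 classes) score matrix, not part of pyannote.audio:

import numpy as np

# made-up frame-by-class scores (4 frames, 3 classes)
scores = np.array([[0.7, 0.6, 0.1],
                   [0.2, 0.8, 0.3],
                   [0.9, 0.1, 0.2],
                   [0.4, 0.3, 0.6]])

# allow_overlap: every class above the 0.5 threshold is active,
# so a single frame may have several active speakers
overlapping = scores > 0.5

# otherwise: keep only the single most likely class per frame
single = np.argmax(scores, axis=1)

print(overlapping)  # boolean (4, 3) matrix
print(single)       # [0 1 0 2]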
Code example #2
    def _window_level(self, current_file: dict,
                      speech_regions: Timeline) -> Annotation:
        """Apply clustering at window level

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        speech_regions : `Timeline`
            Speech regions.

        Returns
        -------
        hypothesis : `pyannote.core.Annotation`
            Clustering result.
        """

        # load embeddings
        embedding = self._embedding(current_file)
        window = embedding.sliding_window

        # extract and stack embeddings of speech regions
        X = np.vstack([
            embedding.crop(segment, mode="center", fixed=segment.duration)
            for segment in speech_regions
        ])

        # apply clustering
        y_pred = self.clustering(X)

        # reconstruct
        y = np.zeros(len(embedding), dtype=np.int8)

        # n = total number of "speech" embeddings
        # s_pred = current position in y_pred
        s_pred, n = 0, len(y_pred)

        for segment in speech_regions:

            # get indices of current speech segment
            ((s, e), ) = window.crop(segment,
                                     mode="center",
                                     fixed=segment.duration,
                                     return_ranges=True)

            # hack for the very last segment that might overflow by 1
            e_pred = min(s_pred + e - s, n - 1)
            e = s + (e_pred - s_pred)

            # assign y_pred to the corresponding speech regions
            y[s:e] = y_pred[s_pred:e_pred]

            # increment current position in y_pred
            s_pred += e - s

        # reconstruct hypothesis
        return one_hot_decoding(y, window)
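The reconstruction loop above is mostly index bookkeeping: clustering produces one label per speech frame, and those labels must be copied back into a full-length frame array (0 meaning non-speech) before one_hot_decoding. The same logic on made-up frame ranges, without the end-of-file overflow hack:

import numpy as np

n_frames = 10                             # made-up total number of frames
speech_ranges = [(1, 4), (6, 9)]          # [start, end) frame indices of speech
y_pred = np.array([1, 1, 2, 2, 2, 1])     # one cluster label per speech frame

y = np.zeros(n_frames, dtype=np.int8)     # 0 = non-speech
s_pred = 0
for s, e in speech_ranges:
    e_pred = s_pred + (e - s)             # slice of y_pred for this segment
    y[s:e] = y_pred[s_pred:e_pred]        # copy cluster labels into place
    s_pred = e_pred

print(y)  # [0 1 1 2 0 0 2 2 1 0]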
Code example #3
File: signal.py  Project: zhiqizhang/pyannote-audio
    def apply(self, annotation, features):
        """

        Parameters
        ----------
        annotation : `pyannote.core.Annotation`
            Original annotation to be resegmented.
        features : `SlidingWindowFeature`
            Features

        Returns
        -------
        hypothesis : `pyannote.core.Annotation`
            Resegmented annotation.

        """

        sliding_window = features.sliding_window
        window = np.ones((1, sliding_window.samples(self.window)))

        log_probs = []
        labels = annotation.labels()

        # FIXME: embarrassingly parallel
        for label in labels:

            # gather all features for current label
            span = annotation.label_timeline(label)
            data = features.crop(span, mode='center')

            # train a GMM
            gmm = GaussianMixture(n_components=self.n_components,
                                  covariance_type='diag',
                                  tol=0.001,
                                  reg_covar=1e-06,
                                  max_iter=self.n_iter,
                                  n_init=1,
                                  init_params='kmeans',
                                  weights_init=None,
                                  means_init=None,
                                  precisions_init=None,
                                  random_state=None,
                                  warm_start=False,
                                  verbose=0,
                                  verbose_interval=10).fit(data)

            # compute log-probability across the whole file
            log_prob = gmm.score_samples(features.data)
            log_probs.append(log_prob)

        # smooth log-probability using a sliding window
        log_probs = scipy.signal.convolve(np.vstack(log_probs),
                                          window,
                                          mode='same')

        # assign each frame to the most likely label
        y = np.argmax(log_probs, axis=0)

        # reconstruct the annotation
        hypothesis = one_hot_decoding(y, sliding_window, labels=labels)

        # remove original non-speech regions
        return hypothesis.crop(annotation.get_timeline().support())
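The resegmentation itself does not depend on pyannote.audio: fit one GMM per label on that label's frames, score every frame under every GMM, smooth the log-probabilities, and reassign each frame to the best-scoring label. A toy sketch on synthetic 1-D features (labels, window length and data are made up; scikit-learn and SciPy only):

import numpy as np
import scipy.signal
from sklearn.mixture import GaussianMixture

# synthetic features: 200 frames, two populations with different means
rng = np.random.default_rng(0)
features = np.concatenate([rng.normal(0.0, 1.0, (100, 1)),
                           rng.normal(5.0, 1.0, (100, 1))])

# deliberately imperfect initial labelling: split at frame 110
regions = {"A": features[:110], "B": features[110:]}

# one diagonal-covariance GMM per label, scored over the whole file
log_probs = np.vstack([
    GaussianMixture(n_components=1, covariance_type="diag",
                    random_state=0).fit(data).score_samples(features)
    for data in regions.values()
])

# smooth log-probabilities with an 11-frame sliding window
window = np.ones((1, 11))
log_probs = scipy.signal.convolve(log_probs, window, mode="same")

# reassign each frame to the most likely label
y = np.argmax(log_probs, axis=0)
print(np.bincount(y))  # roughly 100 frames per label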
Code example #4
    def _decode(
        self,
        current_file: ProtocolFile,
        hypothesis: Annotation,
        scores: SlidingWindowFeature,
        labels: Iterable,
    ) -> Annotation:

        # obtain overlapped speech regions
        overlap = self.binarizer_.apply(current_file["overlap"], dimension=1)

        frames = scores.sliding_window
        N, K = scores.data.shape

        if self.lock_speech:

            # K = 1 <~~> only non-speech
            # K = 2 <~~> just one speaker
            if K < 3:
                return hypothesis

            # sequence of two most likely speaker indices
            # (even when non-speech is in fact the most likely class)
            best_speakers_indices = np.argsort(-scores.data[:, 1:],
                                               axis=1)[:, :2]

            active_speakers = np.zeros((N, K - 1), dtype=np.int64)

            # start by assigning most likely speaker...
            for t, k in enumerate(best_speakers_indices[:, 0]):
                active_speakers[t, k] = 1

            # ... then add second most likely speaker in overlap regions
            T = frames.crop(overlap, mode="strict")

            # because overlap may use a different feature extraction step,
            # T may contain indices slightly larger than the actual number
            # of frames. the line below removes any such indices.
            T = T[T < N]

            # mark second most likely speaker as active
            active_speakers[T, best_speakers_indices[T, 1]] = 1

            # reconstruct annotation
            new_hypothesis = one_hot_decoding(active_speakers,
                                              frames,
                                              labels=labels)

            # revert non-speech regions back to original
            speech = hypothesis.get_timeline().support()
            new_hypothesis = new_hypothesis.crop(speech)

        else:

            # K = 1 <~~> only non-speech
            if K < 2:
                return hypothesis

            # sequence of two most likely class indices
            # (including 0=non-speech)
            best_speakers_indices = np.argsort(-scores.data, axis=1)[:, :2]

            active_speakers = np.zeros((N, K - 1), dtype=np.int64)

            # start by assigning the most likely speaker...
            for t, k in enumerate(best_speakers_indices[:, 0]):
                # k = 0 is for non-speech
                if k > 0:
                    active_speakers[t, k - 1] = 1

            # ... then add second most likely speaker in overlap regions
            T = frames.crop(overlap, mode="strict")

            # because overlap may use a different feature extraction step,
            # T may contain indices slightly larger than the actual number
            # of frames. the line below removes any such indices.
            T = T[T < N]

            # remove timesteps where second most likely class is non-speech
            T = T[best_speakers_indices[T, 1] > 0]

            # mark second most likely speaker as active
            active_speakers[T, best_speakers_indices[T, 1] - 1] = 1

            # reconstruct annotation
            new_hypothesis = one_hot_decoding(active_speakers,
                                              frames,
                                              labels=labels)

        new_hypothesis.uri = hypothesis.uri
        return new_hypothesis
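Without the frame cropping and non-speech handling, the overlap-aware part of the decoding boils down to a top-2 argsort: the most likely speaker is always active, and the runner-up is added only on frames flagged as overlapped. A self-contained sketch with made-up scores and overlap frame indices (not pyannote.audio code):

import numpy as np

# made-up frame-by-speaker scores (5 frames, 3 speakers)
scores = np.array([[0.9, 0.6, 0.1],
                   [0.2, 0.8, 0.7],
                   [0.4, 0.3, 0.9],
                   [0.8, 0.7, 0.2],
                   [0.1, 0.5, 0.4]])
overlap_frames = np.array([1, 3])   # frames detected as overlapped speech

N, K = scores.shape

# two most likely speakers per frame, highest score first
best = np.argsort(-scores, axis=1)[:, :2]

active = np.zeros((N, K), dtype=np.int64)

# the most likely speaker is always active...
active[np.arange(N), best[:, 0]] = 1

# ...and the second most likely one is added only on overlapped frames
active[overlap_frames, best[overlap_frames, 1]] = 1

print(active)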