Example #1
def predict(audio, algorithm='SpectralClustering'):
    # Speech Activity Detection

    sad_scores = sad(audio)
    binarize_sad = Binarize(offset=0.52,
                            onset=0.52,
                            log_scale=True,
                            min_duration_off=0.1,
                            min_duration_on=0.1)
    speech = binarize_sad.apply(sad_scores, dimension=1)

    # Speaker Change Detection

    scd_scores = scd(audio)
    peak = Peak(alpha=0.10, min_duration=0.10, log_scale=True)
    partition = peak.apply(scd_scores, dimension=1)

    # Overlapped Speech Detection

    # ovl_scores = ovl(audio)
    # binarize_ovl = Binarize(offset=0.55, onset=0.55, log_scale=True,
    #                         min_duration_off=0.1, min_duration_on=0.1)
    # overlap = binarize_ovl.apply(ovl_scores, dimension=1)

    # Speaker Embedding

    speech_turns = partition.crop(speech)
    embeddings = emb(audio)

    long_turns = Timeline(
        segments=[s for s in speech_turns if s.duration > .5])

    return long_turns, sad_scores, scd_scores, embeddings
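
The snippet above relies on module-level sad, scd and emb models and takes a pyannote file dict as its audio argument. A minimal sketch of how those models might be loaded from torch.hub before calling predict (the model names 'sad_ami', 'scd_ami' and 'emb_voxceleb' are assumptions; any pretrained pyannote-audio 1.x SAD/SCD/embedding models would do):

# Sketch only: load pretrained models and run the snippet above.
import torch
from pyannote.core import Timeline
from pyannote.audio.utils.signal import Binarize, Peak

sad = torch.hub.load('pyannote/pyannote-audio', 'sad_ami')       # assumed model name
scd = torch.hub.load('pyannote/pyannote-audio', 'scd_ami')       # assumed model name
emb = torch.hub.load('pyannote/pyannote-audio', 'emb_voxceleb')  # assumed model name

test_file = {'uri': 'sample', 'audio': 'sample.wav'}
long_turns, sad_scores, scd_scores, embeddings = predict(test_file)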
Example #2
def interruption_detection(mono_file, vad_dictionary, ovl=ovl,
                           sucessfull_identification=1, delay=0.2):
    """
    takes as input:
    mono_file - the wav recording before diarization,
    vad_dictionary - the VAD output,
    sucessfull_identification - whether the operator channel was identified,
    delay - acceptable telephone-line delay, in seconds

    example call:
    interruption_detection('clean_1.wav', vad_activity)

    returns an integer - the number of interruptions
    """
    test_file = {'uri': '1', 'audio': mono_file}
    ovl_scores = ovl(test_file)
    binarize = Binarize(offset=0.55, onset=0.55, log_scale=True,
                        min_duration_off=0.1, min_duration_on=0.1)
    overlap = binarize.apply(ovl_scores, dimension=1)
    overlap = dict(overlap.for_json())

    interruption_count = 0

    if sucessfull_identification == 1:
        for one_overlap in overlap['content']:
            start_interrupt = one_overlap['start']
            for client_speech in vad_dictionary['client_timeline']:
                if (start_interrupt > client_speech['start']) and (start_interrupt < client_speech['end']):
                    interruption_count += 1

        client_activity, operator_activity = change_dict_format(vad_dictionary)
        for end in client_activity[1]:
            difference = operator_activity[0] - end
            interruption_count += difference[np.abs(difference) < delay].shape[0]

    return interruption_count
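
change_dict_format is not shown in this example; a hypothetical implementation, consistent with how it is used above (numpy arrays of start/end times per channel), could look like this:

import numpy as np

def change_dict_format(vad_dictionary):
    # Hypothetical helper (assumption): convert the VAD timeline dictionary
    # into (starts, ends) numpy arrays for the client and the operator.
    client = vad_dictionary['client_timeline']
    operator = vad_dictionary['operator_timeline']
    client_activity = (np.array([s['start'] for s in client]),
                       np.array([s['end'] for s in client]))
    operator_activity = (np.array([s['start'] for s in operator]),
                         np.array([s['end'] for s in operator]))
    return client_activity, operator_activity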
Example #3
    def run(self, file_path):
        diarization = self.pipeline({'audio': file_path})
        binarize = Binarize(offset=self.threshold, onset=self.threshold,
                            pad_onset=0.3, pad_offset=0.3, log_scale=True,
                            min_duration_off=0.5, min_duration_on=2)
        result_list = binarize.apply(diarization, dimension=1).for_json()['content']

        self.raw_result = cleanup_cuts(result_list)
        return self
def remove_silent_parts(filepath, sr, model):
    audio, sr = librosa.load(filepath, sr=sr)
    test_file = {'uri': filepath.split('/')[-1], 'audio': filepath}

    # obtain raw SAD scores (as `pyannote.core.SlidingWindowFeature` instance)
    sad_scores = model(test_file)

    # binarize raw SAD scores
    # NOTE: both onset/offset values were tuned on AMI dataset.
    # you might need to use different values for better results.
    binarize = Binarize(offset=0.52,
                        onset=0.52,
                        log_scale=True,
                        min_duration_off=0.1,
                        min_duration_on=0.1)

    # speech regions (as `pyannote.core.Timeline` instance)
    speech = binarize.apply(sad_scores, dimension=1)

    audio_pieces = []
    for segment in speech:
        start, end = segment
        audio_pieces.extend(audio[int(start * sr):int(end * sr)])
    return np.array(audio_pieces)
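
A minimal usage sketch, assuming the SAD model comes from torch.hub ('sad_ami' is an assumed model name) and that soundfile is used to write the result:

# Sketch only: strip non-speech regions from a file and save the remainder.
import torch
import soundfile as sf

sad_model = torch.hub.load('pyannote/pyannote-audio', 'sad_ami')  # assumed model name
speech_only = remove_silent_parts('sample.wav', sr=16000, model=sad_model)
sf.write('sample_speech_only.wav', speech_only, 16000)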
class ResegmentationWithOverlap(Resegmentation):
    """Re-segmentation with overlap

    Parameters
    ----------
    feature_extraction : FeatureExtraction
        Feature extraction.
    Architecture : Model subclass
        Model architecture.
    architecture_params : dict
        Architecture parameters.
    overlap_threshold : `float`, optional
        Threshold applied to overlapped speech detection scores.
        Defaults to 0.5.
    lock_speech : `bool`, optional
        Keep speech/non-speech state unchanged. Defaults to False.
    epochs : `int`, optional
        (Self-)train for that many epochs. Defaults to 5.
    ensemble : `int`, optional
        Average output of last `ensemble` epochs. Defaults to no ensembling.
    duration : float, optional
        Duration of audio chunks. Defaults to 2s.
    step : `float`, optional
        Ratio of audio chunk duration used as step between two consecutive
        audio chunks. Defaults to 0.1.
    batch_size : int, optional
        Batch size. Defaults to 32.
    device : `torch.device`, optional
    mask : str, optional
        When provided, current_file[mask] is used by the loss function to weigh
        samples.
    """
    def __init__(
        self,
        feature_extraction: FeatureExtraction,
        Architecture: Type[Model],
        architecture_params: dict,
        lock_speech: bool = False,
        overlap_threshold: float = 0.5,
        epochs: int = 5,
        learning_rate: float = 0.1,
        ensemble: int = 1,
        duration: float = 2.0,
        step: float = 0.1,
        n_jobs: int = 1,
        device: torch.device = None,
        batch_size: int = 32,
        mask: Text = None,
    ):

        super().__init__(
            feature_extraction,
            Architecture,
            architecture_params,
            lock_speech=lock_speech,
            epochs=epochs,
            learning_rate=learning_rate,
            ensemble=ensemble,
            duration=duration,
            step=step,
            n_jobs=n_jobs,
            device=device,
            batch_size=batch_size,
            mask=mask,
        )

        self.overlap_threshold = overlap_threshold
        self.binarizer_ = Binarize(
            onset=self.overlap_threshold,
            offset=self.overlap_threshold,
            scale="absolute",
            log_scale=True,
        )

    def _decode(
        self,
        current_file: ProtocolFile,
        hypothesis: Annotation,
        scores: SlidingWindowFeature,
        labels: Iterable,
    ) -> Annotation:

        # obtain overlapped speech regions
        overlap = self.binarizer_.apply(current_file["overlap"], dimension=1)

        frames = scores.sliding_window
        N, K = scores.data.shape

        if self.lock_speech:

            # K = 1 <~~> only non-speech
            # K = 2 <~~> just one speaker
            if K < 3:
                return hypothesis

            # sequence of two most likely speaker indices
            # (even when non-speech is in fact the most likely class)
            best_speakers_indices = np.argsort(-scores.data[:, 1:],
                                               axis=1)[:, :2]

            active_speakers = np.zeros((N, K - 1), dtype=np.int64)

            # start by assigning most likely speaker...
            for t, k in enumerate(best_speakers_indices[:, 0]):
                active_speakers[t, k] = 1

            # ... then add second most likely speaker in overlap regions
            T = frames.crop(overlap, mode="strict")

            # because overlap may use a different feature extraction step,
            # it might happen that T contains indices slightly larger than
            # the actual number of frames. the line below removes any such
            # indices.
            T = T[T < N]

            # mark second most likely speaker as active
            active_speakers[T, best_speakers_indices[T, 1]] = 1

            # reconstruct annotation
            new_hypothesis = one_hot_decoding(active_speakers,
                                              frames,
                                              labels=labels)

            # revert non-speech regions back to original
            speech = hypothesis.get_timeline().support()
            new_hypothesis = new_hypothesis.crop(speech)

        else:

            # K = 1 <~~> only non-speech
            if K < 2:
                return hypothesis

            # sequence of two most likely class indices
            # (including 0=non-speech)
            best_speakers_indices = np.argsort(-scores.data, axis=1)[:, :2]

            active_speakers = np.zeros((N, K - 1), dtype=np.int64)

            # start by assigning the most likely speaker...
            for t, k in enumerate(best_speakers_indices[:, 0]):
                # k = 0 is for non-speech
                if k > 0:
                    active_speakers[t, k - 1] = 1

            # ... then add second most likely speaker in overlap regions
            T = frames.crop(overlap, mode="strict")

            # because overlap may use a different feature extraction step,
            # it might happen that T contains indices slightly larger than
            # the actual number of frames. the line below removes any such
            # indices.
            T = T[T < N]

            # remove timesteps where second most likely class is non-speech
            T = T[best_speakers_indices[T, 1] > 0]

            # mark second most likely speaker as active
            active_speakers[T, best_speakers_indices[T, 1] - 1] = 1

            # reconstruct annotation
            new_hypothesis = one_hot_decoding(active_speakers,
                                              frames,
                                              labels=labels)

        new_hypothesis.uri = hypothesis.uri
        return new_hypothesis
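
The core of _decode is the "second speaker in overlap" step: the most likely speaker stays active on every frame, and the second most likely speaker is additionally activated on frames flagged as overlapped. A self-contained numpy sketch of that step on toy scores (no pyannote objects involved):

import numpy as np

# toy per-frame speaker scores: 6 frames, 3 speakers (no non-speech class here)
scores = np.array([[0.9, 0.1, 0.0],
                   [0.8, 0.7, 0.1],
                   [0.2, 0.9, 0.1],
                   [0.1, 0.8, 0.6],
                   [0.1, 0.9, 0.2],
                   [0.7, 0.2, 0.1]])
overlapped_frames = np.array([1, 3])  # frames detected as overlapped speech

N, K = scores.shape
best_two = np.argsort(-scores, axis=1)[:, :2]

active = np.zeros((N, K), dtype=np.int64)
# the most likely speaker is always active...
active[np.arange(N), best_two[:, 0]] = 1
# ...and the second most likely speaker is added on overlapped frames only
active[overlapped_frames, best_two[overlapped_frames, 1]] = 1
print(active)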
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    scores : Wrappable, optional
        Describes how raw speech activity detection scores should be obtained.
        See pyannote.audio.features.wrapper.Wrapper documentation for details.
        Defaults to "@sad_scores" that indicates that protocol files provide
        the scores in the "sad_scores" key.
    fscore : bool, optional
        Optimize (precision/recall) fscore. Defaults to optimizing detection
        error rate.

    Hyper-parameters
    ----------------
    onset, offset : `float`
        Onset/offset detection thresholds
    min_duration_on, min_duration_off : `float`
        Minimum duration in either state (speech or not)
    pad_onset, pad_offset : `float`
        Padding duration.
    """
    def __init__(self, scores: Wrappable = None, fscore: bool = False):
        super().__init__()

        if scores is None:
            scores = "@sad_scores"
        self.scores = scores
        self._scores = Wrapper(self.scores)

        self.fscore = fscore

        # hyper-parameters
        self.onset = Uniform(0., 1.)
        self.offset = Uniform(0., 1.)
        self.min_duration_on = Uniform(0., 2.)
        self.min_duration_off = Uniform(0., 2.)
        self.pad_onset = Uniform(-1., 1.)
        self.pad_offset = Uniform(-1., 1.)

    def initialize(self):
        """Initialize pipeline with current set of parameters"""

        self._binarize = Binarize(onset=self.onset,
                                  offset=self.offset,
                                  min_duration_on=self.min_duration_on,
                                  min_duration_off=self.min_duration_off,
                                  pad_onset=self.pad_onset,
                                  pad_offset=self.pad_offset)

    def __call__(self, current_file: dict) -> Annotation:
        """Apply speech activity detection

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol. May contain a
            'sad_scores' key providing precomputed scores.

        Returns
        -------
        speech : `pyannote.core.Annotation`
            Speech regions.
        """

        sad_scores = self._scores(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(sad_scores.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(sad_scores.data) if self.log_scale_ \
               else sad_scores.data

        # speech vs. non-speech
        if data.shape[1] > 1:
            speech_prob = SlidingWindowFeature(1. - data[:, 0],
                                               sad_scores.sliding_window)
        else:
            speech_prob = SlidingWindowFeature(data, sad_scores.sliding_window)

        speech = self._binarize.apply(speech_prob)

        speech.uri = current_file.get('uri', None)
        return speech.to_annotation(generator='string', modality='speech')

    def get_metric(
        self,
        parallel=False
    ) -> Union[DetectionErrorRate, DetectionPrecisionRecallFMeasure]:
        """Return new instance of detection metric"""

        if self.fscore:
            return DetectionPrecisionRecallFMeasure(collar=0.0,
                                                    skip_overlap=False,
                                                    parallel=parallel)
        else:
            return DetectionErrorRate(collar=0.0,
                                      skip_overlap=False,
                                      parallel=parallel)
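
A hedged usage sketch: like any pyannote.pipeline.Pipeline, hyper-parameters are frozen with instantiate() before the pipeline is applied (the values below are placeholders, not tuned ones):

# Sketch only: apply the pipeline to a file carrying precomputed SAD scores.
pipeline = SpeechActivityDetection(scores="@sad_scores")
pipeline.instantiate({
    'onset': 0.5, 'offset': 0.5,
    'min_duration_on': 0.1, 'min_duration_off': 0.1,
    'pad_onset': 0.0, 'pad_offset': 0.0,
})

# sad_scores is assumed to be a precomputed SlidingWindowFeature
current_file = {'uri': 'sample', 'audio': 'sample.wav', 'sad_scores': sad_scores}
speech = pipeline(current_file)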
Example #7
class OverlapDetection(Pipeline):
    """Overlap detection pipeline

    Parameters
    ----------
    scores : Wrappable, optional
        Describes how raw overlapped speech detection scores should be obtained.
        See pyannote.audio.features.wrapper.Wrapper documentation for details.
        Defaults to "@ovl_scores" that indicates that protocol files provide
        the scores in the "ovl_scores" key.
    precision : `float`, optional
        Target detection precision. Defaults to 0.9.
    fscore : bool, optional
        Optimize (precision/recall) fscore. Defaults to optimizing recall at
        target precision.


    Hyper-parameters
    ----------------
    onset, offset : `float`
        Onset/offset detection thresholds
    min_duration_on, min_duration_off : `float`
        Minimum duration in either state (overlap or not)
    pad_onset, pad_offset : `float`
        Padding duration.
    """
    def __init__(self, scores: Wrappable = None, precision: float = 0.9,
                 fscore: bool = False):
        super().__init__()

        if scores is None:
            scores = "@ovl_scores"
        self.scores = scores
        self._scores = Wrapper(self.scores)

        self.precision = precision
        self.fscore = fscore

        # hyper-parameters
        self.onset = Uniform(0., 1.)
        self.offset = Uniform(0., 1.)
        self.min_duration_on = Uniform(0., 2.)
        self.min_duration_off = Uniform(0., 2.)
        self.pad_onset = Uniform(-1., 1.)
        self.pad_offset = Uniform(-1., 1.)

    def initialize(self):
        """Initialize pipeline with current set of parameters"""

        self._binarize = Binarize(onset=self.onset,
                                  offset=self.offset,
                                  min_duration_on=self.min_duration_on,
                                  min_duration_off=self.min_duration_off,
                                  pad_onset=self.pad_onset,
                                  pad_offset=self.pad_offset)

    def __call__(self, current_file: dict) -> Annotation:
        """Apply overlap detection

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol. May contain a
            'ovl_scores' key providing precomputed scores.

        Returns
        -------
        overlap : `pyannote.core.Annotation`
            Overlap regions.
        """

        ovl_scores = self._scores(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(ovl_scores.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(ovl_scores.data) if self.log_scale_ \
               else ovl_scores.data

        # overlap vs. non-overlap
        if data.shape[1] > 1:
            overlap_prob = SlidingWindowFeature(1. - data[:, 0],
                                                ovl_scores.sliding_window)
        else:
            overlap_prob = SlidingWindowFeature(data,
                                                ovl_scores.sliding_window)

        overlap = self._binarize.apply(overlap_prob)

        overlap.uri = current_file['uri']
        return overlap.to_annotation(generator='string', modality='overlap')

    @staticmethod
    def to_overlap(reference: Annotation) -> Annotation:
        """Get overlapped speech reference annotation

        Parameters
        ----------
        reference : Annotation
            File yielded by pyannote.database protocols.

        Returns
        -------
        overlap : `pyannote.core.Annotation`
            Overlapped speech reference.
        """

        overlap = Timeline(uri=reference.uri)
        for (s1, t1), (s2, t2) in reference.co_iter(reference):
            l1 = reference[s1, t1]
            l2 = reference[s2, t2]
            if l1 == l2:
                continue
            overlap.add(s1 & s2)
        return overlap.support().to_annotation()

    def get_metric(self, **kwargs) -> DetectionPrecisionRecallFMeasure:
        """Get overlapped speech detection metric

        Returns
        -------
        metric : DetectionPrecisionRecallFMeasure
            Detection metric.
        """

        if not self.fscore:
            raise NotImplementedError()

        class _Metric(DetectionPrecisionRecallFMeasure):
            def compute_components(_self,
                                   reference: Annotation,
                                   hypothesis: Annotation,
                                   uem: Timeline = None,
                                   **kwargs) -> dict:
                return super().compute_components(self.to_overlap(reference),
                                                  hypothesis,
                                                  uem=uem,
                                                  **kwargs)

        return _Metric()

    def loss(self, current_file: dict, hypothesis: Annotation) -> float:
        """Compute (1 - recall) at target precision

        If precision < target, return 1 + (1 - precision)

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        hypothesis : `pyannote.core.Annotation`
            Overlap regions.

        Returns
        -------
        error : `float`
            1. - recall when the target precision is reached,
            1. + (1. - precision) otherwise.
        """

        precision = DetectionPrecision()
        recall = DetectionRecall()

        if 'overlap_reference' in current_file:
            overlap_reference = current_file['overlap_reference']

        else:
            reference = current_file['annotation']
            overlap_reference = self.to_overlap(reference)
            current_file['overlap_reference'] = overlap_reference

        uem = get_annotated(current_file)
        p = precision(overlap_reference, hypothesis, uem=uem)
        r = recall(overlap_reference, hypothesis, uem=uem)

        if p > self.precision:
            return 1. - r
        return 1. + (1. - p)
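
to_overlap only needs pyannote.core objects, so it can be checked in isolation; a small sketch building a two-speaker reference with one overlapping region:

from pyannote.core import Annotation, Segment

reference = Annotation(uri='toy')
reference[Segment(0.0, 10.0)] = 'spk1'
reference[Segment(8.0, 15.0)] = 'spk2'

# the overlapped speech reference contains a single [8.0, 10.0] region
overlap = OverlapDetection.to_overlap(reference)
print(overlap)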
Example #8
    wav = signal.astype('int16')
    scipy.io.wavfile.write('temp/temp.wav', rate=16000, data=wav)

    test_file = {'uri': 'temp', 'audio': 'temp/temp.wav'}

    # Detect Sound
    sad_scores = sad(test_file)
    from pyannote.audio.utils.signal import Binarize
    binarize = Binarize(offset=0.9,
                        onset=0.9,
                        log_scale=True,
                        min_duration_off=0.1,
                        min_duration_on=0.1)

    # speech regions (as `pyannote.core.Timeline` instance)
    speech = binarize.apply(sad_scores, dimension=1)
    if len(speech) > 0:
        pred_list.append(1)
    else:
        pred_list.append(0)

    label_list.append(label)

# Performance
from sklearn.metrics import accuracy_score, precision_score, recall_score
accuracy = accuracy_score(label_list, pred_list)
precision = precision_score(label_list, pred_list)
recall = recall_score(label_list, pred_list)
print('Accuracy %.4f' % accuracy)
print('Precision %.4f' % precision)
print('Recall %.4f' % recall)
Example #9
class InteractiveDiarization(Pipeline):
    """Interactive diarization pipeline

    Parameters
    ----------
    sad : str or Path, optional
        Pretrained speech activity detection model. Defaults to "sad".
    emb : str or Path, optional
        Pretrained speaker embedding model. Defaults to "emb".
    batch_size : int, optional
        Batch size.
    only_sad : bool, optional
        Set to True if you only care about speech activity detection.

    Hyper-parameters
    ----------------
    sad_threshold_on, sad_threshold_off : float
        Onset/offset speech activity detection thresholds.
    sad_min_duration_on, sad_min_duration_off : float
        Minimum duration of speech/non-speech regions.
    emb_duration, emb_step_ratio : float
        Sliding window used for embedding extraction.
    emb_threshold : float
        Distance threshold used as stopping criterion for hierarchical
        agglomerative clustering.
    """
    def __init__(
        self,
        sad: Union[Text, Path] = {"sad": {
            "duration": 2.0,
            "step": 0.1
        }},
        emb: Union[Text, Path] = "emb",
        batch_size: int = None,
        only_sad: bool = False,
    ):

        super().__init__()

        self.sad = Wrapper(sad)
        if batch_size is not None:
            self.sad.batch_size = batch_size
        self.sad_speech_index_ = self.sad.classes.index("speech")

        self.sad_threshold_on = Uniform(0.0, 1.0)
        self.sad_threshold_off = Uniform(0.0, 1.0)
        self.sad_min_duration_on = Uniform(0.0, 0.5)
        self.sad_min_duration_off = Uniform(0.0, 0.5)

        self.only_sad = only_sad
        if self.only_sad:
            return

        self.emb = Wrapper(emb)
        if batch_size is not None:
            self.emb.batch_size = batch_size

        max_duration = self.emb.duration
        min_duration = getattr(self.emb, "min_duration", 0.25 * max_duration)
        self.emb_duration = Uniform(min_duration, max_duration)
        self.emb_step_ratio = Uniform(0.1, 1.0)
        self.emb_threshold = Uniform(0.0, 2.0)

    def initialize(self):
        """Initialize pipeline internals with current hyper-parameter values"""

        self.sad_binarize_ = Binarize(
            onset=self.sad_threshold_on,
            offset=self.sad_threshold_off,
            min_duration_on=self.sad_min_duration_on,
            min_duration_off=self.sad_min_duration_off,
        )

        if not self.only_sad:
            # embeddings will be extracted with a sliding window
            # of "emb_duration" duration and "emb_step_ratio x emb_duration" step.
            self.emb.duration = self.emb_duration
            self.emb.step = self.emb_step_ratio

    def compute_speech(self, current_file: ProtocolFile) -> Timeline:
        """Apply speech activity detection

        Parameters
        ----------
        current_file : ProtocolFile
            Protocol file.

        Returns
        -------
        speech : Timeline
            Speech activity detection result.
        """

        # speech activity detection
        if "sad_scores" in current_file:
            sad_scores: SlidingWindowFeature = current_file["sad_scores"]
        else:
            sad_scores = self.sad(current_file)
            if np.nanmean(sad_scores) < 0:
                sad_scores = np.exp(sad_scores)
            current_file["sad_scores"] = sad_scores

        speech: Timeline = self.sad_binarize_.apply(
            sad_scores, dimension=self.sad_speech_index_)

        return speech

    def compute_embedding(self,
                          current_file: ProtocolFile) -> SlidingWindowFeature:
        """Extract speaker embedding

        Parameters
        ----------
        current_file : ProtocolFile
            Protocol file

        Returns
        -------
        embedding : SlidingWindowFeature
            Speaker embedding.
        """

        return self.emb(current_file)

    def get_segment_assignment(self, embedding: SlidingWindowFeature,
                               speech: Timeline) -> np.ndarray:
        """Get segment assignment

        Parameters
        ----------
        embedding : SlidingWindowFeature
            Embeddings.
        speech : Timeline
            Speech regions.

        Returns
        -------
        assignment : (num_embedding, ) np.ndarray
            * assignment[i] = s with s > 0 means that ith embedding is strictly
            contained in (1-based) sth segment.
            * assignment[i] = s with s < 0 means that more than half of ith
            embedding is part of (1-based) sth segment.
            * assignment[i] = 0 means that none of the above is true.
        """

        assignment: np.ndarray = np.zeros((len(embedding), ), dtype=np.int32)

        for s, segment in enumerate(speech):
            indices = embedding.sliding_window.crop(segment, mode="strict")
            if len(indices) > 0:
                strict = 1
            else:
                strict = -1
                indices = embedding.sliding_window.crop(segment, mode="center")
            for i in indices:
                if i < 0 or i >= len(embedding):
                    continue
                assignment[i] = strict * (s + 1)

        return assignment

    def __call__(
        self,
        current_file: ProtocolFile,
        cannot_link: List[Tuple[float, float]] = None,
        must_link: List[Tuple[float, float]] = None,
    ) -> Annotation:
        """Apply speaker diarization

        Parameters
        ----------
        current_file : ProtocolFile
            Protocol file.
        cannot_link :
            List of time-based "cannot link" constraints.
        must_link :
            List of time-based "must link" constraints.

        Returns
        -------
        diarization : Annotation
            Speaker diarization result.
        """

        if cannot_link is None:
            cannot_link = []
        if must_link is None:
            must_link = []

        if "duration" not in current_file:
            current_file["duration"] = get_audio_duration(current_file)

        # in "interactive annotation" mode, there is no need to recompute speech
        # regions every time a file is processed: they can be passed with the
        # file directly
        if "speech" in current_file:
            speech: Timeline = current_file["speech"]

        # in "pipeline optimization" mode, pipeline hyper-parameters are different
        # every time a file is processed: speech regions must be recomputed
        else:
            speech = self.compute_speech(current_file)

        if self.only_sad:
            return speech.to_annotation(generator=iter(lambda: "SPEECH", None))

        # in "interactive annotation" mode, pipeline hyper-parameters are fixed.
        # therefore, there is no need to recompute embeddings every time a file
        # is processed: they can be passed with the file directly.
        if "embedding" in current_file:
            embedding: SlidingWindowFeature = current_file["embedding"]

        # in "pipeline optimization" mode, pipeline hyper-parameters are different
        # every time a file is processed: embeddings must be recomputed
        else:
            embedding = self.compute_embedding(current_file)

        window: SlidingWindow = embedding.sliding_window

        # segment_assignment[i] = s with s > 0 means that ith embedding is
        #       strictly contained in (1-based) sth segment.
        # segment_assignment[i] = s with s < 0 means that more than half of ith
        #       embedding is part of (1-based) sth segment.
        # segment_assignment[i] = 0 means that none of the above is true.
        segment_assignment: np.ndarray = self.get_segment_assignment(
            embedding, speech)

        # cluster_assignment[i] = k (k > 0) means that the ith embedding belongs
        #                           to kth cluster
        # cluster_assignment[i] = 0 when segment_assignment[i] = 0
        cluster_assignment: np.ndarray = np.zeros((len(embedding), ),
                                                  dtype=np.int32)

        clean = segment_assignment > 0
        noisy = segment_assignment < 0
        clean_indices = np.where(clean)[0]
        if len(clean_indices) < 2:
            cluster_assignment[clean_indices] = 1

        else:

            # convert time-based constraints to index-based constraints
            cannot_link = index2index(time2index(cannot_link, window), clean)
            must_link = index2index(time2index(must_link, window), clean)

            dendrogram = pool(
                embedding[clean_indices],
                metric="cosine",
                cannot_link=cannot_link,
                must_link=must_link,
                must_link_method="propagate",
            )
            clusters = fcluster(dendrogram,
                                self.emb_threshold,
                                criterion="distance")
            for i, k in zip(clean_indices, clusters):
                cluster_assignment[i] = k

        loose_indices = np.where(noisy)[0]
        if len(clean_indices) == 0:
            if len(loose_indices) < 2:
                clusters = [1] * len(loose_indices)
            else:
                dendrogram = pool(embedding[loose_indices], metric="cosine")
                clusters = fcluster(dendrogram,
                                    self.emb_threshold,
                                    criterion="distance")
            for i, k in zip(loose_indices, clusters):
                cluster_assignment[i] = k

        else:
            # NEAREST NEIGHBOR
            distance = cdist(embedding[clean_indices],
                             embedding[loose_indices],
                             metric="cosine")
            nearest_neighbor = np.argmin(distance, axis=0)
            for loose_index, nn in zip(loose_indices, nearest_neighbor):
                strict_index = clean_indices[nn]
                cluster_assignment[loose_index] = cluster_assignment[
                    strict_index]

            # # NEAREST CLUSTER
            # centroid = np.vstack(
            #     [
            #         np.mean(embedding[cluster_assignment == k], axis=0)
            #         for k in np.unique(clusters)
            #     ]
            # )
            # distance = cdist(centroid, embedding[loose_indices], metric="cosine")
            # cluster_assignment[loose_indices] = np.argmin(distance, axis=0) + 1

        # convert cluster assignment to pyannote.core.Annotation
        # (make sure to keep speech regions unchanged)
        hypothesis = Annotation(uri=current_file.get("uri", None))
        for s, segment in enumerate(speech):

            indices = np.where(segment_assignment == s + 1)[0]
            if len(indices) == 0:
                indices = np.where(segment_assignment == -(s + 1))[0]
                if len(indices) == 0:
                    continue

            clusters = cluster_assignment[indices]

            start, k = segment.start, clusters[0]
            change_point = np.diff(clusters) != 0
            for i, new_k in zip(indices[1:][change_point],
                                clusters[1:][change_point]):
                end = window[i].middle + 0.5 * window.step
                hypothesis[Segment(start, end)] = k
                start = end
                k = new_k
            hypothesis[Segment(start, segment.end)] = k

        return hypothesis.support()

    def get_metric(self) -> Union[DetectionErrorRate, DiarizationErrorRate]:
        if self.only_sad:
            return DetectionErrorRate(collar=0.0)
        else:
            return DiarizationErrorRate(collar=0.0, skip_overlap=False)
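
A hedged sketch of the interactive loop: hyper-parameters are instantiated once, then the pipeline is re-applied with growing lists of time-based constraints (the values below are placeholders, not tuned ones):

# Sketch only: default "sad" and "emb" pretrained models, placeholder values.
pipeline = InteractiveDiarization()
pipeline.instantiate({
    'sad_threshold_on': 0.6, 'sad_threshold_off': 0.4,
    'sad_min_duration_on': 0.1, 'sad_min_duration_off': 0.1,
    'emb_duration': 1.5, 'emb_step_ratio': 0.25, 'emb_threshold': 0.6,
})

current_file = {'uri': 'sample', 'audio': 'sample.wav'}
first_pass = pipeline(current_file)

# a user then asserts that speech at t=12.3s and t=45.6s comes from the same
# speaker, while speech at t=12.3s and t=80.0s does not
refined = pipeline(current_file,
                   must_link=[(12.3, 45.6)],
                   cannot_link=[(12.3, 80.0)])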
Example #10
class VoiceActivityDetection:
    def __init__(self, binarize_params=None):
        self.sad = torch.hub.load('pyannote/pyannote-audio', model='sad_ami')
        # see "VAD Smoothing" in the paper https://www.isca-speech.org/archive/
        # interspeech_2015/papers/i15_2650.pdf
        binarize_params_default = {
            # an onset and offset thresholds for the detection of
            # the beginning and end of a speech segment
            'offset': 0.5,
            'onset': 0.5,
            # a threshold for small silence deletion
            'min_duration_off': 0.1,
            # a threshold for short speech segment deletion;
            'min_duration_on': 0.1,
            'log_scale': True,
        }
        binarize_params = binarize_params or binarize_params_default
        self.binarize = Binarize(**binarize_params)

    @staticmethod
    def _validate_wav_file(file_path):
        try:
            with wave.open(file_path, 'rb') as f:
                if f.getnchannels() != 2:
                    raise VADException(
                        'Invalid number of channels for wav file. Must be 2.')
        except wave.Error as e:
            raise VADException(f'Invalid format of wav file: {e}')

    @staticmethod
    def _prepare_wav_by_channels(source_wav, operator_channel, client_channel,
                                 tmpdir):
        rate, data = wavfile.read(source_wav)
        operator_data = data[:, operator_channel]
        client_data = data[:, client_channel]

        operator_file_path = os.path.join(tmpdir, 'operator.wav')
        client_file_path = os.path.join(tmpdir, 'client.wav')

        wavfile.write(operator_file_path, rate, operator_data)
        wavfile.write(client_file_path, rate, client_data)

        return operator_file_path, client_file_path

    def _get_timeline(self, file_path):
        sad_scores = self.sad({'uri': 'filename', 'audio': file_path})
        speech = self.binarize.apply(sad_scores, dimension=0)
        return speech.for_json()['content']

    def get_timelines(self, file_path, operator_channel):
        """
        For a two-channel wav file, returns the timeline of the conversation
        between the operator and the client.

        :note:
            The operator and the client are assumed to be recorded on two
            separate channels of the wav file.

        :param file_path:
            `str`, path to the source wav file.
        :param operator_channel:
            `int`, index of the channel that belongs to the operator.

        :return:
            `dict`, timeline dictionary of the form:
            {
                'operator_timeline': [
                    {'start': 10.5, 'end': 12.1},
                    ...
                ],
                'client_timeline': [
                    {'start': 13, 'end': 20},
                    ...
                ]
            }
            where `start` and `end` are given in seconds.
        """
        if operator_channel not in (0, 1):
            raise VADException('Invalid number of operator channel')

        client_channel = 0 if operator_channel else 1

        self._validate_wav_file(file_path)

        with tempfile.TemporaryDirectory() as tmpdir:
            operator_wav, client_wav = self._prepare_wav_by_channels(
                file_path, operator_channel, client_channel, tmpdir)
            return {
                'operator_timeline': self._get_timeline(operator_wav),
                'client_timeline': self._get_timeline(client_wav),
            }
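
A hedged end-to-end sketch tying this class to Example #2: the dictionary returned by get_timelines is the vad_dictionary that interruption_detection expects (file names are placeholders, and the ovl overlap model from Example #2 must already be loaded):

# Sketch only: count interruptions on a stereo call recording.
vad = VoiceActivityDetection()
vad_activity = vad.get_timelines('call_stereo.wav', operator_channel=0)

# 'call_mono.wav' is an assumed mono mixdown of the same call, needed because
# interruption_detection() runs overlapped speech detection on a single channel
n_interruptions = interruption_detection('call_mono.wav', vad_activity)
print(n_interruptions)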