def interruption_detection(mono_file, vad_dictionary, ovl=ovl,
                           successful_identification=1, delay=0.2):
    """
    Arguments:
        mono_file - wav recording before diarization,
        vad_dictionary - output of the VAD step,
        successful_identification - whether the operator channel was identified,
        delay - acceptable telephone-line delay (in seconds).

    Example call: interruption_detection('clean_1.wav', vad_activity)

    Returns an integer - the number of interruptions.
    """
    test_file = {'uri': '1', 'audio': mono_file}
    ovl_scores = ovl(test_file)
    binarize = Binarize(offset=0.55, onset=0.55, log_scale=True,
                        min_duration_off=0.1, min_duration_on=0.1)
    overlap = binarize.apply(ovl_scores, dimension=1)
    overlap = dict(overlap.for_json())

    interruption_count = 0
    if successful_identification == 1:
        # count overlapped-speech regions that start while the client is speaking
        for one_overlap in overlap['content']:
            start_interrupt = one_overlap['start']
            for client_speech in vad_dictionary['client_timeline']:
                if client_speech['start'] < start_interrupt < client_speech['end']:
                    interruption_count += 1

    # also count cases where operator and client activity boundaries fall
    # within the allowed delay of each other
    client_activity, operator_activity = change_dict_format(vad_dictionary)
    for end in client_activity[1]:
        difference = operator_activity[0] - end
        interruption_count += difference[np.abs(difference) < delay].shape[0]

    return interruption_count
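# --- usage sketch (not from the original source) ----------------------------------
# How interruption_detection() might be called. The 'ovl_ami' torch.hub checkpoint
# name and the file names are assumptions; `vad_activity` is expected to have the
# structure returned by VoiceActivityDetection.get_timelines() defined further below.
import torch

ovl = torch.hub.load('pyannote/pyannote-audio', model='ovl_ami')
vad_activity = {
    'operator_timeline': [{'start': 3.2, 'end': 5.0}],
    'client_timeline': [{'start': 0.0, 'end': 3.1}],
}
n_interruptions = interruption_detection('clean_1.wav', vad_activity, ovl=ovl)
print(f'{n_interruptions} interruption(s) detected')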
def predict(audio, algorithm='SpectralClustering'):
    # Speech Activity Detection
    sad_scores = sad(audio)
    binarize_sad = Binarize(offset=0.52, onset=0.52, log_scale=True,
                            min_duration_off=0.1, min_duration_on=0.1)
    speech = binarize_sad.apply(sad_scores, dimension=1)

    # Speaker Change Detection
    scd_scores = scd(audio)
    peak = Peak(alpha=0.10, min_duration=0.10, log_scale=True)
    partition = peak.apply(scd_scores, dimension=1)

    # Overlapped Speech Detection
    # ovl_scores = ovl(audio)
    # binarize_ovl = Binarize(offset=0.55, onset=0.55, log_scale=True,
    #                         min_duration_off=0.1, min_duration_on=0.1)
    # overlap = binarize_ovl.apply(ovl_scores, dimension=1)

    # Speaker Embedding
    speech_turns = partition.crop(speech)
    embeddings = emb(audio)

    long_turns = Timeline(
        segments=[s for s in speech_turns if s.duration > .5])

    return long_turns, sad_scores, scd_scores, embeddings
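# --- usage sketch (not from the original source) ----------------------------------
# One way the outputs of predict() are typically used: pool a single embedding per
# long speech turn by averaging the embedding frames that fall inside each turn.
# 'example.wav' is a placeholder file name.
import numpy as np

test_file = {'uri': 'example', 'audio': 'example.wav'}
long_turns, sad_scores, scd_scores, embeddings = predict(test_file)

turn_embeddings = []
for segment in long_turns:
    x = embeddings.crop(segment, mode='loose')   # embedding frames inside the turn
    if len(x) > 0:
        turn_embeddings.append(np.mean(x, axis=0))
turn_embeddings = np.array(turn_embeddings)      # one vector per long speech turn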
def run(self, file_path):
    diarization = self.pipeline({'audio': file_path})
    binarize = Binarize(offset=self.threshold, onset=self.threshold,
                        pad_onset=0.3, pad_offset=0.3, log_scale=True,
                        min_duration_off=0.5, min_duration_on=2)
    result_list = binarize.apply(diarization, dimension=1).for_json()['content']
    self.raw_result = cleanup_cuts(result_list)
    return self
def initialize(self):
    """Initialize pipeline with current set of parameters"""
    self._binarize = Binarize(onset=self.onset,
                              offset=self.offset,
                              min_duration_on=self.min_duration_on,
                              min_duration_off=self.min_duration_off,
                              pad_onset=self.pad_onset,
                              pad_offset=self.pad_offset)
def initialize(self):
    """Initialize pipeline internals with current hyper-parameter values"""
    self.sad_binarize_ = Binarize(
        onset=self.sad_threshold_on,
        offset=self.sad_threshold_off,
        min_duration_on=self.sad_min_duration_on,
        min_duration_off=self.sad_min_duration_off,
    )

    if not self.only_sad:
        # embeddings will be extracted with a sliding window
        # of "emb_duration" duration and "emb_step_ratio x emb_duration" step.
        self.emb.duration = self.emb_duration
        self.emb.step = self.emb_step_ratio
def __init__(self, binarize_params=None):
    self.sad = torch.hub.load('pyannote/pyannote-audio', model='sad_ami')
    # see "VAD Smoothing" in https://www.isca-speech.org/archive/
    # interspeech_2015/papers/i15_2650.pdf
    binarize_params_default = {
        # onset and offset thresholds for the detection of
        # the beginning and end of a speech segment
        'offset': 0.5,
        'onset': 0.5,
        # a threshold for small silence deletion
        'min_duration_off': 0.1,
        # a threshold for short speech segment deletion
        'min_duration_on': 0.1,
        'log_scale': True,
    }
    binarize_params = binarize_params or binarize_params_default
    self.binarize = Binarize(**binarize_params)
def __init__(
    self,
    feature_extraction: FeatureExtraction,
    Architecture: Type[Model],
    architecture_params: dict,
    lock_speech: bool = False,
    overlap_threshold: float = 0.5,
    epochs: int = 5,
    learning_rate: float = 0.1,
    ensemble: int = 1,
    duration: float = 2.0,
    step: float = 0.1,
    n_jobs: int = 1,
    device: torch.device = None,
    batch_size: int = 32,
    mask: Text = None,
):
    super().__init__(
        feature_extraction,
        Architecture,
        architecture_params,
        lock_speech=lock_speech,
        epochs=epochs,
        learning_rate=learning_rate,
        ensemble=ensemble,
        duration=duration,
        step=step,
        n_jobs=n_jobs,
        device=device,
        batch_size=batch_size,
        mask=mask,
    )

    self.overlap_threshold = overlap_threshold
    self.binarizer_ = Binarize(
        onset=self.overlap_threshold,
        offset=self.overlap_threshold,
        scale="absolute",
        log_scale=True,
    )
def remove_silent_parts(filepath, sr, model):
    audio, sr = librosa.load(filepath, sr=sr)
    test_file = {'uri': filepath.split('/')[-1], 'audio': filepath}

    # obtain raw SAD scores (as `pyannote.core.SlidingWindowFeature` instance)
    sad_scores = model(test_file)

    # binarize raw SAD scores
    # NOTE: both onset/offset values were tuned on AMI dataset.
    # you might need to use different values for better results.
    binarize = Binarize(offset=0.52, onset=0.52, log_scale=True,
                        min_duration_off=0.1, min_duration_on=0.1)

    # speech regions (as `pyannote.core.Timeline` instance)
    speech = binarize.apply(sad_scores, dimension=1)

    audio_pieces = []
    for segment in speech:
        segment = list(segment)
        audio_pieces.extend(audio[int(segment[0] * sr):int(segment[1] * sr)])

    return np.array(audio_pieces)
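# --- usage sketch (not from the original source) ----------------------------------
# Possible way to call remove_silent_parts(), assuming the same torch.hub SAD model
# used elsewhere in this file; soundfile is only used to write the trimmed audio.
import torch
import soundfile as sf

sad_model = torch.hub.load('pyannote/pyannote-audio', model='sad_ami')
speech_only = remove_silent_parts('example.wav', sr=16000, model=sad_model)
sf.write('example_speech_only.wav', speech_only, 16000)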
class ResegmentationWithOverlap(Resegmentation):
    """Re-segmentation with overlap

    Parameters
    ----------
    feature_extraction : FeatureExtraction
        Feature extraction.
    Architecture : Model subclass
    architecture_params : dict
    overlap_threshold : `float`, optional
        Defaults to 0.5.
    lock_speech: `boolean`, optional
        Keep speech/non-speech state unchanged. Defaults to False.
    epochs : `int`, optional
        (Self-)train for that many epochs. Defaults to 5.
    ensemble : `int`, optional
        Average output of last `ensemble` epochs. Defaults to no ensembling.
    duration : float, optional
        Duration of audio chunks. Defaults to 2s.
    step : `float`, optional
        Ratio of audio chunk duration used as step between two consecutive
        audio chunks. Defaults to 0.1.
    batch_size : int, optional
        Batch size. Defaults to 32.
    device : `torch.device`, optional
    mask : str, optional
        When provided, current_file[mask] is used by the loss function to
        weigh samples.
    """

    def __init__(
        self,
        feature_extraction: FeatureExtraction,
        Architecture: Type[Model],
        architecture_params: dict,
        lock_speech: bool = False,
        overlap_threshold: float = 0.5,
        epochs: int = 5,
        learning_rate: float = 0.1,
        ensemble: int = 1,
        duration: float = 2.0,
        step: float = 0.1,
        n_jobs: int = 1,
        device: torch.device = None,
        batch_size: int = 32,
        mask: Text = None,
    ):
        super().__init__(
            feature_extraction,
            Architecture,
            architecture_params,
            lock_speech=lock_speech,
            epochs=epochs,
            learning_rate=learning_rate,
            ensemble=ensemble,
            duration=duration,
            step=step,
            n_jobs=n_jobs,
            device=device,
            batch_size=batch_size,
            mask=mask,
        )

        self.overlap_threshold = overlap_threshold
        self.binarizer_ = Binarize(
            onset=self.overlap_threshold,
            offset=self.overlap_threshold,
            scale="absolute",
            log_scale=True,
        )

    def _decode(
        self,
        current_file: ProtocolFile,
        hypothesis: Annotation,
        scores: SlidingWindowFeature,
        labels: Iterable,
    ) -> Annotation:

        # obtain overlapped speech regions
        overlap = self.binarizer_.apply(current_file["overlap"], dimension=1)

        frames = scores.sliding_window
        N, K = scores.data.shape

        if self.lock_speech:

            # K = 1 <~~> only non-speech
            # K = 2 <~~> just one speaker
            if K < 3:
                return hypothesis

            # sequence of two most likely speaker indices
            # (even when non-speech is in fact the most likely class)
            best_speakers_indices = np.argsort(-scores.data[:, 1:], axis=1)[:, :2]

            active_speakers = np.zeros((N, K - 1), dtype=np.int64)

            # start by assigning most likely speaker...
            for t, k in enumerate(best_speakers_indices[:, 0]):
                active_speakers[t, k] = 1

            # ... then add second most likely speaker in overlap regions
            T = frames.crop(overlap, mode="strict")

            # because overlap may use a different feature extraction step,
            # it might happen that T contains indices slightly larger than
            # the actual number of frames. the line below removes any such
            # indices.
            T = T[T < N]

            # mark second most likely speaker as active
            active_speakers[T, best_speakers_indices[T, 1]] = 1

            # reconstruct annotation
            new_hypothesis = one_hot_decoding(active_speakers, frames, labels=labels)

            # revert non-speech regions back to original
            speech = hypothesis.get_timeline().support()
            new_hypothesis = new_hypothesis.crop(speech)

        else:

            # K = 1 <~~> only non-speech
            if K < 2:
                return hypothesis

            # sequence of two most likely class indices
            # (including 0=non-speech)
            best_speakers_indices = np.argsort(-scores.data, axis=1)[:, :2]

            active_speakers = np.zeros((N, K - 1), dtype=np.int64)

            # start by assigning the most likely speaker...
            for t, k in enumerate(best_speakers_indices[:, 0]):
                # k = 0 is for non-speech
                if k > 0:
                    active_speakers[t, k - 1] = 1

            # ... then add second most likely speaker in overlap regions
            T = frames.crop(overlap, mode="strict")

            # because overlap may use a different feature extraction step,
            # it might happen that T contains indices slightly larger than
            # the actual number of frames. the line below removes any such
            # indices.
            T = T[T < N]

            # remove timesteps where second most likely class is non-speech
            T = T[best_speakers_indices[T, 1] > 0]

            # mark second most likely speaker as active
            active_speakers[T, best_speakers_indices[T, 1] - 1] = 1

            # reconstruct annotation
            new_hypothesis = one_hot_decoding(active_speakers, frames, labels=labels)

        new_hypothesis.uri = hypothesis.uri
        return new_hypothesis
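# --- illustration (not from the original source) ----------------------------------
# A minimal numpy-only sketch of the decoding rule above: pick the most likely
# speaker per frame, then activate the runner-up in frames flagged as overlapped
# speech, skipping frames whose runner-up is the non-speech class (index 0).
import numpy as np

scores = np.array([[0.7, 0.2, 0.1],    # frame 0: non-speech most likely
                   [0.1, 0.6, 0.3],    # frame 1: speaker 1
                   [0.1, 0.5, 0.4]])   # frame 2: speaker 1, speaker 2 close behind
best_two = np.argsort(-scores, axis=1)[:, :2]        # two most likely classes
active = np.zeros((len(scores), scores.shape[1] - 1), dtype=np.int64)

for t, k in enumerate(best_two[:, 0]):
    if k > 0:                                        # k = 0 is non-speech
        active[t, k - 1] = 1

overlap_frames = np.array([2])                       # frames detected as overlap
runner_up = best_two[overlap_frames, 1]
keep = runner_up > 0                                 # drop non-speech runner-ups
active[overlap_frames[keep], runner_up[keep] - 1] = 1

print(active)   # [[0 0] [1 0] [1 1]] -> both speakers active in frame 2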
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    scores : Wrappable, optional
        Describes how raw speech activity detection scores should be obtained.
        See pyannote.audio.features.wrapper.Wrapper documentation for details.
        Defaults to "@sad_scores" that indicates that protocol files provide
        the scores in the "sad_scores" key.
    fscore : bool, optional
        Optimize (precision/recall) fscore. Defaults to optimizing detection
        error rate.

    Hyper-parameters
    ----------------
    onset, offset : `float`
        Onset/offset detection thresholds
    min_duration_on, min_duration_off : `float`
        Minimum duration in either state (speech or not)
    pad_onset, pad_offset : `float`
        Padding duration.
    """

    def __init__(self, scores: Wrappable = None, fscore: bool = False):
        super().__init__()

        if scores is None:
            scores = "@sad_scores"
        self.scores = scores
        self._scores = Wrapper(self.scores)

        self.fscore = fscore

        # hyper-parameters
        self.onset = Uniform(0., 1.)
        self.offset = Uniform(0., 1.)
        self.min_duration_on = Uniform(0., 2.)
        self.min_duration_off = Uniform(0., 2.)
        self.pad_onset = Uniform(-1., 1.)
        self.pad_offset = Uniform(-1., 1.)

    def initialize(self):
        """Initialize pipeline with current set of parameters"""
        self._binarize = Binarize(onset=self.onset,
                                  offset=self.offset,
                                  min_duration_on=self.min_duration_on,
                                  min_duration_off=self.min_duration_off,
                                  pad_onset=self.pad_onset,
                                  pad_offset=self.pad_offset)

    def __call__(self, current_file: dict) -> Annotation:
        """Apply speech activity detection

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol. May contain a
            'sad_scores' key providing precomputed scores.

        Returns
        -------
        speech : `pyannote.core.Annotation`
            Speech regions.
        """

        sad_scores = self._scores(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(sad_scores.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(sad_scores.data) if self.log_scale_ \
            else sad_scores.data

        # speech vs. non-speech
        if data.shape[1] > 1:
            speech_prob = SlidingWindowFeature(1. - data[:, 0],
                                               sad_scores.sliding_window)
        else:
            speech_prob = SlidingWindowFeature(data,
                                               sad_scores.sliding_window)

        speech = self._binarize.apply(speech_prob)
        speech.uri = current_file.get('uri', None)

        return speech.to_annotation(generator='string', modality='speech')

    def get_metric(
        self, parallel=False
    ) -> Union[DetectionErrorRate, DetectionPrecisionRecallFMeasure]:
        """Return new instance of detection metric"""

        if self.fscore:
            return DetectionPrecisionRecallFMeasure(collar=0.0,
                                                    skip_overlap=False,
                                                    parallel=parallel)
        else:
            return DetectionErrorRate(collar=0.0,
                                      skip_overlap=False,
                                      parallel=parallel)
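# --- usage sketch (not from the original source) ----------------------------------
# Freezing the hyper-parameters with pyannote.pipeline's instantiate() and applying
# the pipeline to a file that already carries precomputed scores under "sad_scores".
# The random scores and the parameter values below are placeholders, used only to
# make the sketch executable.
import numpy as np
from pyannote.core import SlidingWindow, SlidingWindowFeature

frames = SlidingWindow(start=0.0, duration=0.1, step=0.1)
fake_scores = SlidingWindowFeature(np.random.rand(600, 2), frames)  # [non-speech, speech]

pipeline = SpeechActivityDetection(scores="@sad_scores")
pipeline.instantiate({
    'onset': 0.6, 'offset': 0.5,
    'min_duration_on': 0.1, 'min_duration_off': 0.1,
    'pad_onset': 0.0, 'pad_offset': 0.0,
})
speech = pipeline({'uri': 'example', 'sad_scores': fake_scores})
for segment, _, label in speech.itertracks(yield_label=True):
    print(f'{label}: {segment.start:.2f}s -> {segment.end:.2f}s')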
class OverlapDetection(Pipeline):
    """Overlap detection pipeline

    Parameters
    ----------
    scores : Wrappable, optional
        Describes how raw overlapped speech detection scores should be
        obtained. See pyannote.audio.features.wrapper.Wrapper documentation
        for details. Defaults to "@ovl_scores" that indicates that protocol
        files provide the scores in the "ovl_scores" key.
    precision : `float`, optional
        Target detection precision. Defaults to 0.9.
    fscore : bool, optional
        Optimize (precision/recall) fscore. Defaults to optimizing recall at
        target precision.

    Hyper-parameters
    ----------------
    onset, offset : `float`
        Onset/offset detection thresholds
    min_duration_on, min_duration_off : `float`
        Minimum duration in either state (overlap or not)
    pad_onset, pad_offset : `float`
        Padding duration.
    """

    def __init__(self,
                 scores: Wrappable = None,
                 precision: float = 0.9,
                 fscore: bool = False):
        super().__init__()

        if scores is None:
            scores = "@ovl_scores"
        self.scores = scores
        self._scores = Wrapper(self.scores)

        self.precision = precision
        self.fscore = fscore

        # hyper-parameters
        self.onset = Uniform(0., 1.)
        self.offset = Uniform(0., 1.)
        self.min_duration_on = Uniform(0., 2.)
        self.min_duration_off = Uniform(0., 2.)
        self.pad_onset = Uniform(-1., 1.)
        self.pad_offset = Uniform(-1., 1.)

    def initialize(self):
        """Initialize pipeline with current set of parameters"""
        self._binarize = Binarize(onset=self.onset,
                                  offset=self.offset,
                                  min_duration_on=self.min_duration_on,
                                  min_duration_off=self.min_duration_off,
                                  pad_onset=self.pad_onset,
                                  pad_offset=self.pad_offset)

    def __call__(self, current_file: dict) -> Annotation:
        """Apply overlap detection

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol. May contain a
            'ovl_scores' key providing precomputed scores.

        Returns
        -------
        overlap : `pyannote.core.Annotation`
            Overlap regions.
        """

        ovl_scores = self._scores(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(ovl_scores.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(ovl_scores.data) if self.log_scale_ \
            else ovl_scores.data

        # overlap vs. non-overlap
        if data.shape[1] > 1:
            overlap_prob = SlidingWindowFeature(1. - data[:, 0],
                                                ovl_scores.sliding_window)
        else:
            overlap_prob = SlidingWindowFeature(data,
                                                ovl_scores.sliding_window)

        overlap = self._binarize.apply(overlap_prob)
        overlap.uri = current_file['uri']

        return overlap.to_annotation(generator='string', modality='overlap')

    @staticmethod
    def to_overlap(reference: Annotation) -> Annotation:
        """Get overlapped speech reference annotation

        Parameters
        ----------
        reference : Annotation
            File yielded by pyannote.database protocols.

        Returns
        -------
        overlap : `pyannote.core.Annotation`
            Overlapped speech reference.
        """
        overlap = Timeline(uri=reference.uri)
        for (s1, t1), (s2, t2) in reference.co_iter(reference):
            l1 = reference[s1, t1]
            l2 = reference[s2, t2]
            if l1 == l2:
                continue
            overlap.add(s1 & s2)
        return overlap.support().to_annotation()

    def get_metric(self, **kwargs) -> DetectionPrecisionRecallFMeasure:
        """Get overlapped speech detection metric

        Returns
        -------
        metric : DetectionPrecisionRecallFMeasure
            Detection metric.
        """

        if not self.fscore:
            raise NotImplementedError()

        class _Metric(DetectionPrecisionRecallFMeasure):
            def compute_components(_self,
                                   reference: Annotation,
                                   hypothesis: Annotation,
                                   uem: Timeline = None,
                                   **kwargs) -> dict:
                return super().compute_components(self.to_overlap(reference),
                                                  hypothesis,
                                                  uem=uem,
                                                  **kwargs)

        return _Metric()

    def loss(self, current_file: dict, hypothesis: Annotation) -> float:
        """Compute (1 - recall) at target precision

        If precision < target, return 1 + (1 - precision)

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.
        hypothesis : `pyannote.core.Annotation`
            Overlap regions.

        Returns
        -------
        error : `float`
            1. - segment coverage.
        """

        precision = DetectionPrecision()
        recall = DetectionRecall()

        if 'overlap_reference' in current_file:
            overlap_reference = current_file['overlap_reference']
        else:
            reference = current_file['annotation']
            overlap_reference = self.to_overlap(reference)
            current_file['overlap_reference'] = overlap_reference

        uem = get_annotated(current_file)
        p = precision(overlap_reference, hypothesis, uem=uem)
        r = recall(overlap_reference, hypothesis, uem=uem)

        if p > self.precision:
            return 1. - r
        return 1. + (1. - p)
name, ix = file_index[index]
file = dict(np.load(os.path.join(data_dir, 'val_0.5', data_type, name)))
signal, label = file['audio'][int(ix)], int(file['label'][int(ix)])
wav = signal.astype('int16')
scipy.io.wavfile.write('temp/temp.wav', rate=16000, data=wav)
test_file = {'uri': 'temp', 'audio': 'temp/temp.wav'}

# Detect Sound
sad_scores = sad(test_file)
from pyannote.audio.utils.signal import Binarize
binarize = Binarize(offset=0.9, onset=0.9, log_scale=True,
                    min_duration_off=0.1, min_duration_on=0.1)

# speech regions (as `pyannote.core.Timeline` instance)
speech = binarize.apply(sad_scores, dimension=1)
if len(speech) > 0:
    pred_list.append(1)
else:
    pred_list.append(0)
label_list.append(label)

# Performance
from sklearn.metrics import accuracy_score, precision_score, recall_score
accuracy = accuracy_score(label_list, pred_list)
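# --- sketch (not from the original source) -----------------------------------------
# precision_score and recall_score are imported above but unused; assuming binary
# labels, the remaining metrics could be computed the same way as accuracy.
precision = precision_score(label_list, pred_list)
recall = recall_score(label_list, pred_list)
print(f'accuracy={accuracy:.3f} precision={precision:.3f} recall={recall:.3f}')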
class InteractiveDiarization(Pipeline):
    """Interactive diarization pipeline

    Parameters
    ----------
    sad : str or Path, optional
        Pretrained speech activity detection model. Defaults to "sad".
    emb : str or Path, optional
        Pretrained speaker embedding model. Defaults to "emb".
    batch_size : int, optional
        Batch size.
    only_sad : bool, optional
        Set to True if you only care about speech activity detection.

    Hyper-parameters
    ----------------
    sad_threshold_on, sad_threshold_off : float
        Onset/offset speech activity detection thresholds.
    sad_min_duration_on, sad_min_duration_off : float
        Minimum duration of speech/non-speech regions.
    emb_duration, emb_step_ratio : float
        Sliding window used for embedding extraction.
    emb_threshold : float
        Distance threshold used as stopping criterion for hierarchical
        agglomerative clustering.
    """

    def __init__(
        self,
        sad: Union[Text, Path] = {"sad": {"duration": 2.0, "step": 0.1}},
        emb: Union[Text, Path] = "emb",
        batch_size: int = None,
        only_sad: bool = False,
    ):
        super().__init__()

        self.sad = Wrapper(sad)
        if batch_size is not None:
            self.sad.batch_size = batch_size
        self.sad_speech_index_ = self.sad.classes.index("speech")

        self.sad_threshold_on = Uniform(0.0, 1.0)
        self.sad_threshold_off = Uniform(0.0, 1.0)
        self.sad_min_duration_on = Uniform(0.0, 0.5)
        self.sad_min_duration_off = Uniform(0.0, 0.5)

        self.only_sad = only_sad
        if self.only_sad:
            return

        self.emb = Wrapper(emb)
        if batch_size is not None:
            self.emb.batch_size = batch_size

        max_duration = self.emb.duration
        min_duration = getattr(self.emb, "min_duration", 0.25 * max_duration)
        self.emb_duration = Uniform(min_duration, max_duration)
        self.emb_step_ratio = Uniform(0.1, 1.0)
        self.emb_threshold = Uniform(0.0, 2.0)

    def initialize(self):
        """Initialize pipeline internals with current hyper-parameter values"""
        self.sad_binarize_ = Binarize(
            onset=self.sad_threshold_on,
            offset=self.sad_threshold_off,
            min_duration_on=self.sad_min_duration_on,
            min_duration_off=self.sad_min_duration_off,
        )

        if not self.only_sad:
            # embeddings will be extracted with a sliding window
            # of "emb_duration" duration and "emb_step_ratio x emb_duration" step.
            self.emb.duration = self.emb_duration
            self.emb.step = self.emb_step_ratio

    def compute_speech(self, current_file: ProtocolFile) -> Timeline:
        """Apply speech activity detection

        Parameters
        ----------
        current_file : ProtocolFile
            Protocol file.

        Returns
        -------
        speech : Timeline
            Speech activity detection result.
        """

        # speech activity detection
        if "sad_scores" in current_file:
            sad_scores: SlidingWindowFeature = current_file["sad_scores"]
        else:
            sad_scores = self.sad(current_file)
            if np.nanmean(sad_scores) < 0:
                sad_scores = np.exp(sad_scores)
            current_file["sad_scores"] = sad_scores

        speech: Timeline = self.sad_binarize_.apply(
            sad_scores, dimension=self.sad_speech_index_)

        return speech

    def compute_embedding(self, current_file: ProtocolFile) -> SlidingWindowFeature:
        """Extract speaker embedding

        Parameters
        ----------
        current_file : ProtocolFile
            Protocol file

        Returns
        -------
        embedding : SlidingWindowFeature
            Speaker embedding.
        """
        return self.emb(current_file)

    def get_segment_assignment(self, embedding: SlidingWindowFeature,
                               speech: Timeline) -> np.ndarray:
        """Get segment assignment

        Parameters
        ----------
        embedding : SlidingWindowFeature
            Embeddings.
        speech : Timeline
            Speech regions.

        Returns
        -------
        assignment : (num_embedding, ) np.ndarray
            * assignment[i] = s with s > 0 means that ith embedding is
              strictly contained in (1-based) sth segment.
            * assignment[i] = s with s < 0 means that more than half of ith
              embedding is part of (1-based) sth segment.
            * assignment[i] = 0 means that none of the above is true.
        """

        assignment: np.ndarray = np.zeros((len(embedding), ), dtype=np.int32)

        for s, segment in enumerate(speech):
            indices = embedding.sliding_window.crop(segment, mode="strict")
            if len(indices) > 0:
                strict = 1
            else:
                strict = -1
                indices = embedding.sliding_window.crop(segment, mode="center")
            for i in indices:
                if i < 0 or i >= len(embedding):
                    continue
                assignment[i] = strict * (s + 1)

        return assignment

    def __call__(
        self,
        current_file: ProtocolFile,
        cannot_link: List[Tuple[float, float]] = None,
        must_link: List[Tuple[float, float]] = None,
    ) -> Annotation:
        """Apply speaker diarization

        Parameters
        ----------
        current_file : ProtocolFile
            Protocol file.
        cannot_link :
            List of time-based "cannot link" constraints.
        must_link :
            List of time-based "must link" constraints.

        Returns
        -------
        diarization : Annotation
            Speaker diarization result.
        """

        if cannot_link is None:
            cannot_link = []
        if must_link is None:
            must_link = []

        if "duration" not in current_file:
            current_file["duration"] = get_audio_duration(current_file)

        # in "interactive annotation" mode, there is no need to recompute speech
        # regions every time a file is processed: they can be passed with the
        # file directly
        if "speech" in current_file:
            speech: Timeline = current_file["speech"]

        # in "pipeline optimization" mode, pipeline hyper-parameters are different
        # every time a file is processed: speech regions must be recomputed
        else:
            speech = self.compute_speech(current_file)

        if self.only_sad:
            return speech.to_annotation(generator=iter(lambda: "SPEECH", None))

        # in "interactive annotation" mode, pipeline hyper-parameters are fixed.
        # therefore, there is no need to recompute embeddings every time a file
        # is processed: they can be passed with the file directly.
        if "embedding" in current_file:
            embedding: SlidingWindowFeature = current_file["embedding"]

        # in "pipeline optimization" mode, pipeline hyper-parameters are different
        # every time a file is processed: embeddings must be recomputed
        else:
            embedding = self.compute_embedding(current_file)

        window: SlidingWindow = embedding.sliding_window

        # segment_assignment[i] = s with s > 0 means that ith embedding is
        # strictly contained in (1-based) sth segment.
        # segment_assignment[i] = s with s < 0 means that more than half of ith
        # embedding is part of (1-based) sth segment.
        # segment_assignment[i] = 0 means that none of the above is true.
        segment_assignment: np.ndarray = self.get_segment_assignment(
            embedding, speech)

        # cluster_assignment[i] = k (k > 0) means that the ith embedding belongs
        # to kth cluster
        # cluster_assignment[i] = 0 when segment_assignment[i] = 0
        cluster_assignment: np.ndarray = np.zeros((len(embedding), ),
                                                  dtype=np.int32)

        clean = segment_assignment > 0
        noisy = segment_assignment < 0
        clean_indices = np.where(clean)[0]

        if len(clean_indices) < 2:
            cluster_assignment[clean_indices] = 1

        else:
            # convert time-based constraints to index-based constraints
            cannot_link = index2index(time2index(cannot_link, window), clean)
            must_link = index2index(time2index(must_link, window), clean)

            dendrogram = pool(
                embedding[clean_indices],
                metric="cosine",
                cannot_link=cannot_link,
                must_link=must_link,
                must_link_method="propagate",
            )
            clusters = fcluster(dendrogram, self.emb_threshold,
                                criterion="distance")
            for i, k in zip(clean_indices, clusters):
                cluster_assignment[i] = k

        loose_indices = np.where(noisy)[0]
        if len(clean_indices) == 0:
            if len(loose_indices) < 2:
                clusters = [1] * len(loose_indices)
            else:
                dendrogram = pool(embedding[loose_indices], metric="cosine")
                clusters = fcluster(dendrogram, self.emb_threshold,
                                    criterion="distance")
            for i, k in zip(loose_indices, clusters):
                cluster_assignment[i] = k

        else:
            # NEAREST NEIGHBOR
            distance = cdist(embedding[clean_indices],
                             embedding[loose_indices],
                             metric="cosine")
            nearest_neighbor = np.argmin(distance, axis=0)
            for loose_index, nn in zip(loose_indices, nearest_neighbor):
                strict_index = clean_indices[nn]
                cluster_assignment[loose_index] = cluster_assignment[strict_index]

            # # NEAREST CLUSTER
            # centroid = np.vstack(
            #     [
            #         np.mean(embedding[cluster_assignment == k], axis=0)
            #         for k in np.unique(clusters)
            #     ]
            # )
            # distance = cdist(centroid, embedding[loose_indices], metric="cosine")
            # cluster_assignment[loose_indices] = np.argmin(distance, axis=0) + 1

        # convert cluster assignment to pyannote.core.Annotation
        # (make sure to keep speech regions unchanged)
        hypothesis = Annotation(uri=current_file.get("uri", None))
        for s, segment in enumerate(speech):
            indices = np.where(segment_assignment == s + 1)[0]
            if len(indices) == 0:
                indices = np.where(segment_assignment == -(s + 1))[0]
                if len(indices) == 0:
                    continue

            clusters = cluster_assignment[indices]

            start, k = segment.start, clusters[0]
            change_point = np.diff(clusters) != 0
            for i, new_k in zip(indices[1:][change_point],
                                clusters[1:][change_point]):
                end = window[i].middle + 0.5 * window.step
                hypothesis[Segment(start, end)] = k
                start = end
                k = new_k
            hypothesis[Segment(start, segment.end)] = k

        return hypothesis.support()

    def get_metric(self) -> Union[DetectionErrorRate, DiarizationErrorRate]:
        if self.only_sad:
            return DetectionErrorRate(collar=0.0)
        else:
            return DiarizationErrorRate(collar=0.0, skip_overlap=False)
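# --- usage sketch (not from the original source) ----------------------------------
# Possible call with a couple of time-based constraints. The model identifiers,
# file name and hyper-parameter values are illustrative assumptions; instantiate()
# comes from pyannote.pipeline.
pipeline = InteractiveDiarization(sad="sad_dihard", emb="emb_voxceleb")
pipeline.instantiate({
    'sad_threshold_on': 0.6, 'sad_threshold_off': 0.5,
    'sad_min_duration_on': 0.1, 'sad_min_duration_off': 0.1,
    'emb_duration': 1.5, 'emb_step_ratio': 0.5, 'emb_threshold': 0.6,
})
diarization = pipeline(
    {'uri': 'example', 'audio': 'example.wav'},
    must_link=[(2.0, 10.0)],     # t=2s and t=10s belong to the same speaker
    cannot_link=[(2.0, 30.0)],   # t=2s and t=30s belong to different speakers
)
print(diarization)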
class VoiceActivityDetection:

    def __init__(self, binarize_params=None):
        self.sad = torch.hub.load('pyannote/pyannote-audio', model='sad_ami')
        # see "VAD Smoothing" in https://www.isca-speech.org/archive/
        # interspeech_2015/papers/i15_2650.pdf
        binarize_params_default = {
            # onset and offset thresholds for the detection of
            # the beginning and end of a speech segment
            'offset': 0.5,
            'onset': 0.5,
            # a threshold for small silence deletion
            'min_duration_off': 0.1,
            # a threshold for short speech segment deletion
            'min_duration_on': 0.1,
            'log_scale': True,
        }
        binarize_params = binarize_params or binarize_params_default
        self.binarize = Binarize(**binarize_params)

    @staticmethod
    def _validate_wav_file(file_path):
        try:
            with wave.open(file_path, 'rb') as f:
                if f.getnchannels() != 2:
                    raise VADException(
                        'Invalid number of channels for wav file. Must be 2.')
        except wave.Error as e:
            raise VADException(f'Invalid format of wav file: {e}')

    @staticmethod
    def _prepare_wav_by_channels(source_wav, operator_channel,
                                 client_channel, tmpdir):
        rate, data = wavfile.read(source_wav)
        operator_data = data[:, operator_channel]
        client_data = data[:, client_channel]

        operator_file_path = os.path.join(tmpdir, 'operator.wav')
        client_file_path = os.path.join(tmpdir, 'client.wav')
        wavfile.write(operator_file_path, rate, operator_data)
        wavfile.write(client_file_path, rate, client_data)

        return operator_file_path, client_file_path

    def _get_timeline(self, file_path):
        sad_scores = self.sad({'uri': 'filename', 'audio': file_path})
        speech = self.binarize.apply(sad_scores, dimension=0)
        return speech.for_json()['content']

    def get_timelines(self, file_path, operator_channel):
        """
        For a two-channel wav file, returns the timeline of the conversation
        between the operator and the client.

        :note: The operator and the client are assumed to be recorded on two
            separate channels of the wav file.
        :param file_path: `str`, path to the source wav file.
        :param operator_channel: `int`, index of the channel that belongs to
            the operator.
        :return: `dict`, timeline dictionary of the form:
            {
                'operator_timeline': [{'start': 10.5, 'end': 12.1}, ...],
                'client_timeline': [{'start': 13, 'end': 20}, ...]
            }
            where `start` and `end` are given in seconds.
        """
        if operator_channel not in (0, 1):
            raise VADException('Invalid operator channel number')
        client_channel = 0 if operator_channel else 1

        self._validate_wav_file(file_path)

        with tempfile.TemporaryDirectory() as tmpdir:
            operator_wav, client_wav = self._prepare_wav_by_channels(
                file_path, operator_channel, client_channel, tmpdir)
            return {
                'operator_timeline': self._get_timeline(operator_wav),
                'client_timeline': self._get_timeline(client_wav),
            }
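# --- usage sketch (not from the original source) ----------------------------------
# 'stereo_call.wav' is a placeholder for a real two-channel recording with the
# operator on channel 0.
vad = VoiceActivityDetection()
timelines = vad.get_timelines('stereo_call.wav', operator_channel=0)
for item in timelines['operator_timeline']:
    print(f"operator speaks from {item['start']}s to {item['end']}s")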