def from_file(self, current_file):

    from_annotation = current_file['annotation']

    if self.source == 'annotated':
        support = get_annotated(current_file)

    elif self.source == 'support':
        support = current_file['annotation'].get_timeline().support()

    elif self.source == 'annotation':
        support = current_file['annotation']

    elif self.source == 'audio':
        from pyannote.audio.features.utils import get_audio_duration
        support = get_audio_duration(current_file)

    else:
        raise ValueError(
            'source must be one of "annotated", "annotation", "support" '
            'or "audio"')

    if self.heterogeneous:
        generator = self.iter_heterogeneous_segments(from_annotation,
                                                     support)
    else:
        generator = self.iter_segments(from_annotation)

    for segment, label in generator:

        if label is None and self.skip_unlabeled:
            continue

        yield segment, label
def from_file(self, current_file):

    from pyannote.audio.features.utils import get_audio_duration
    duration = get_audio_duration(current_file)

    for left in self.iter_segments(duration):

        right = Segment(left.end + self.gap,
                        left.end + self.duration + self.gap)

        if right.end < duration:
            t = .5 * (left.end + right.start)
            yield t, left, right
def dia_manual_stream(self) -> Iterable[Dict]:

    for audio_source in Audio(self.source):

        path = audio_source["path"]
        text = audio_source["text"]

        # load speech/non-speech annotations (from pyannote.sad.manual recipe)
        file = load_sad_manual(self.dataset, path)
        manual_speech = file["speech"]
        annotated = file["annotated"]

        # use manual speech/non-speech annotation where available,
        # and automatic speech/non-speech everywhere else
        duration = get_audio_duration(file)
        file_extent = Segment(0, duration)
        non_annotated = annotated.gaps(file_extent)
        if non_annotated:
            automatic_speech = self.pipeline.compute_speech(file)
            file["speech"] = automatic_speech.crop(non_annotated).update(
                manual_speech)

        # load existing same/different annotations (from pyannote.dia.binary recipe)
        self.load_dia_binary(path)

        # apply speaker diarization pipeline using same/different speaker
        # binary annotations as must link/cannot link constraints
        hypothesis = self.pipeline(file,
                                   cannot_link=self.cannot_link_time,
                                   must_link=self.must_link_time)

        # rename the 9 most talkative speakers to {SPEAKER_1, ..., SPEAKER_9}
        # and the remaining speakers to OTHER
        mapping = {
            label: f"SPEAKER_{s+1}" if s < 9 else "OTHER"
            for s, (label, duration) in enumerate(hypothesis.chart())
        }
        hypothesis = hypothesis.rename_labels(mapping=mapping)

        audio_spans = to_audio_spans(hypothesis)
        audio_source["audio_spans"] = audio_spans
        audio_source["audio_spans_original"] = deepcopy(audio_spans)
        audio_source["recipe"] = "pyannote.dia.manual"

        yield audio_source
def get_annotated(current_file):
    """Get part of the file that is annotated.

    Parameters
    ----------
    current_file : `dict`
        File generated by a `pyannote.database` protocol.

    Returns
    -------
    annotated : `pyannote.core.Timeline`
        Part of the file that is annotated. Defaults to
        `current_file["annotated"]`. When it does not exist, try to use the
        full audio extent. When that fails, use "annotation" extent.
    """

    # if protocol provides 'annotated' key, use it
    if 'annotated' in current_file:
        annotated = current_file['annotated']
        return annotated

    # if it does not, but does provide 'audio' key,
    # try and use wav duration
    if 'audio' in current_file:
        try:
            from pyannote.audio.features.utils import get_audio_duration
            duration = get_audio_duration(current_file)
        except ImportError as e:
            pass
        else:
            warnings.warn('"annotated" was approximated by "audio" duration.')
            annotated = Timeline([Segment(0, duration)])
            return annotated

    warnings.warn('"annotated" was approximated by "annotation" extent.')
    extent = current_file['annotation'].get_timeline().extent()
    annotated = Timeline([extent])
    return annotated
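# Minimal usage sketch for get_annotated (illustrative only: the annotation
# content below is made up, and since neither 'annotated' nor 'audio' is
# provided, the function falls back to the "annotation" extent with a warning).
from pyannote.core import Annotation, Segment

annotation = Annotation(uri='example')
annotation[Segment(2.0, 5.0)] = 'speaker_A'
annotation[Segment(7.5, 9.0)] = 'speaker_B'

current_file = {'uri': 'example', 'annotation': annotation}
annotated = get_annotated(current_file)  # Timeline([Segment(2.0, 9.0)])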
def check(protocol_name, file_finder, experiment_dir):

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        try:
            file_generator = getattr(protocol, subset)()
            first_item = next(file_generator)
        except NotImplementedError as e:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                audio = file_finder(current_file)
                current_file['audio'] = audio
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
def from_file(self, current_file):

    if isinstance(self.source, (Segment, Timeline)):
        source = self.source

    elif self.source == 'annotated':
        source = get_annotated(current_file)

    elif self.source == 'annotated_extent':
        source = get_annotated(current_file).extent()

    elif self.source == 'annotation':
        source = current_file['annotation']

    elif self.source == 'support':
        source = current_file['annotation'].get_timeline().support()

    elif self.source == 'audio':
        from pyannote.audio.features.utils import get_audio_duration
        source = get_audio_duration(current_file)

    for segment in self.iter_segments(source):
        yield segment
#!/usr/bin/env python
# encoding: utf-8

import glob

from pyannote.audio.features.utils import get_audio_duration

subpath = '/vol/work3/maurice/AlbayzinEvaluationIberSPEECH-RTVE2018/data/RTVE2018DB/dev2/'

path = subpath + 'audio/'
files = glob.glob(path + '*-16000.wav')
for f in files:
    print(f.split('/')[-1].split('-mono')[0],
          get_audio_duration({'audio': f}))

path = subpath + 'enrollment/'
files = glob.glob(path + '*/*-16000.wav')
for f in files:
    print(f.split('/')[-1].split('-16000')[0],
          get_audio_duration({'audio': f}))
def sad_manual_stream(
    pipeline: InteractiveDiarization, source: Path, chunk: float = 10.0
) -> Iterable[Dict]:
    """Stream for pyannote.sad.manual recipe

    Applies (pretrained) speech activity detection and sends the results for
    manual correction chunk by chunk.

    Parameters
    ----------
    pipeline : InteractiveDiarization
        Pretrained speaker diarization interactive pipeline.
        Note that only the speech activity detection part is used.
    source : Path
        Directory containing audio files to process.
    chunk : float, optional
        Duration of chunks, in seconds. Defaults to 10s.

    Yields
    ------
    task : dict
        Prodigy task with the following keys:
        "path" : path to audio file
        "text" : name of audio file
        "chunk" : chunk start and end times
        "audio" : base64 encoding of audio chunk
        "audio_spans" : speech spans detected by pretrained SAD model
        "audio_spans_original" : copy of "audio_spans"
        "meta" : additional meta-data displayed in Prodigy UI
        "recipe" : "pyannote.sad.manual"
    """

    raw_audio = RawAudio(sample_rate=SAMPLE_RATE, mono=True)

    for audio_source in Audio(source):

        path = audio_source["path"]
        text = audio_source["text"]
        file = {"uri": text, "database": source, "audio": path}

        duration = get_audio_duration(file)
        file["duration"] = duration

        prodigy.log(f"RECIPE: detecting speech regions in '{path}'")

        speech: Annotation = pipeline.compute_speech(file).to_annotation(
            generator=iter(lambda: "SPEECH", None)
        )

        if duration <= chunk:

            waveform = raw_audio.crop(file, Segment(0, duration))
            task_audio = to_base64(normalize(waveform), sample_rate=SAMPLE_RATE)
            task_audio_spans = to_audio_spans(speech)

            yield {
                "path": path,
                "text": text,
                "audio": task_audio,
                "audio_spans": task_audio_spans,
                "audio_spans_original": deepcopy(task_audio_spans),
                "chunk": {"start": 0, "end": duration},
                "meta": {"file": text},
                # this is needed by other recipes
                "recipe": "pyannote.sad.manual",
            }

        else:

            for focus in chunks(duration, chunk=chunk, shuffle=True):

                task_text = f"{text} [{focus.start:.1f}, {focus.end:.1f}]"
                waveform = raw_audio.crop(file, focus)
                task_audio = to_base64(normalize(waveform), sample_rate=SAMPLE_RATE)
                task_audio_spans = to_audio_spans(
                    speech.crop(focus, mode="intersection"), focus=focus
                )

                yield {
                    "path": path,
                    "text": task_text,
                    "audio": task_audio,
                    "audio_spans": task_audio_spans,
                    "audio_spans_original": deepcopy(task_audio_spans),
                    "chunk": {"start": focus.start, "end": focus.end},
                    "meta": {
                        "file": text,
                        "start": f"{focus.start:.1f}",
                        "end": f"{focus.end:.1f}",
                    },
                    # this is needed by other recipes
                    "recipe": "pyannote.sad.manual",
                }
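# Hedged sketch of consuming the stream above outside of Prodigy (illustrative
# only: `pipeline` is assumed to be an already instantiated InteractiveDiarization
# and the directory path is a placeholder).
from pathlib import Path

for task in sad_manual_stream(pipeline, Path("/path/to/audio/dir"), chunk=10.0):
    # each task is a Prodigy-ready dict; here we only inspect the detected speech spans
    print(task["text"], task["chunk"], len(task["audio_spans"]))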
def __call__(
    self,
    current_file: ProtocolFile,
    cannot_link: List[Tuple[float, float]] = None,
    must_link: List[Tuple[float, float]] = None,
) -> Annotation:
    """Apply speaker diarization

    Parameters
    ----------
    current_file : ProtocolFile
        Protocol file.
    cannot_link : list of (float, float), optional
        List of time-based "cannot link" constraints.
    must_link : list of (float, float), optional
        List of time-based "must link" constraints.

    Returns
    -------
    diarization : Annotation
        Speaker diarization result.
    """

    if cannot_link is None:
        cannot_link = []
    if must_link is None:
        must_link = []

    if "duration" not in current_file:
        current_file["duration"] = get_audio_duration(current_file)

    # in "interactive annotation" mode, there is no need to recompute speech
    # regions every time a file is processed: they can be passed with the
    # file directly
    if "speech" in current_file:
        speech: Timeline = current_file["speech"]

    # in "pipeline optimization" mode, pipeline hyper-parameters are different
    # every time a file is processed: speech regions must be recomputed
    else:
        speech = self.compute_speech(current_file)

    if self.only_sad:
        return speech.to_annotation(generator=iter(lambda: "SPEECH", None))

    # in "interactive annotation" mode, pipeline hyper-parameters are fixed.
    # therefore, there is no need to recompute embeddings every time a file
    # is processed: they can be passed with the file directly.
    if "embedding" in current_file:
        embedding: SlidingWindowFeature = current_file["embedding"]

    # in "pipeline optimization" mode, pipeline hyper-parameters are different
    # every time a file is processed: embeddings must be recomputed
    else:
        embedding = self.compute_embedding(current_file)

    window: SlidingWindow = embedding.sliding_window

    # segment_assignment[i] = s with s > 0 means that the ith embedding is
    # strictly contained in the (1-based) sth segment.
    # segment_assignment[i] = s with s < 0 means that more than half of the ith
    # embedding is part of the (1-based) sth segment.
    # segment_assignment[i] = 0 means that none of the above is true.
    segment_assignment: np.ndarray = self.get_segment_assignment(embedding,
                                                                  speech)

    # cluster_assignment[i] = k (k > 0) means that the ith embedding belongs
    # to the kth cluster
    # cluster_assignment[i] = 0 when segment_assignment[i] = 0
    cluster_assignment: np.ndarray = np.zeros((len(embedding),),
                                              dtype=np.int32)

    clean = segment_assignment > 0
    noisy = segment_assignment < 0
    clean_indices = np.where(clean)[0]

    if len(clean_indices) < 2:
        cluster_assignment[clean_indices] = 1

    else:
        # convert time-based constraints to index-based constraints
        cannot_link = index2index(time2index(cannot_link, window), clean)
        must_link = index2index(time2index(must_link, window), clean)

        dendrogram = pool(
            embedding[clean_indices],
            metric="cosine",
            cannot_link=cannot_link,
            must_link=must_link,
            must_link_method="propagate",
        )
        clusters = fcluster(dendrogram, self.emb_threshold,
                            criterion="distance")
        for i, k in zip(clean_indices, clusters):
            cluster_assignment[i] = k

    loose_indices = np.where(noisy)[0]
    if len(clean_indices) == 0:

        if len(loose_indices) < 2:
            clusters = [1] * len(loose_indices)
        else:
            dendrogram = pool(embedding[loose_indices], metric="cosine")
            clusters = fcluster(dendrogram, self.emb_threshold,
                                criterion="distance")
        for i, k in zip(loose_indices, clusters):
            cluster_assignment[i] = k

    else:
        # NEAREST NEIGHBOR
        distance = cdist(embedding[clean_indices],
                         embedding[loose_indices],
                         metric="cosine")
        nearest_neighbor = np.argmin(distance, axis=0)
        for loose_index, nn in zip(loose_indices, nearest_neighbor):
            strict_index = clean_indices[nn]
            cluster_assignment[loose_index] = cluster_assignment[strict_index]

        # # NEAREST CLUSTER
        # centroid = np.vstack(
        #     [
        #         np.mean(embedding[cluster_assignment == k], axis=0)
        #         for k in np.unique(clusters)
        #     ]
        # )
        # distance = cdist(centroid, embedding[loose_indices], metric="cosine")
        # cluster_assignment[loose_indices] = np.argmin(distance, axis=0) + 1

    # convert cluster assignment to pyannote.core.Annotation
    # (make sure to keep speech regions unchanged)
    hypothesis = Annotation(uri=current_file.get("uri", None))
    for s, segment in enumerate(speech):

        indices = np.where(segment_assignment == s + 1)[0]
        if len(indices) == 0:
            indices = np.where(segment_assignment == -(s + 1))[0]
            if len(indices) == 0:
                continue

        clusters = cluster_assignment[indices]

        start, k = segment.start, clusters[0]
        change_point = np.diff(clusters) != 0
        for i, new_k in zip(indices[1:][change_point],
                            clusters[1:][change_point]):
            end = window[i].middle + 0.5 * window.step
            hypothesis[Segment(start, end)] = k
            start = end
            k = new_k
        hypothesis[Segment(start, segment.end)] = k

    return hypothesis.support()
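# Hedged usage sketch for the __call__ method above (assumes `pipeline` is an
# already instantiated and optimized InteractiveDiarization pipeline; the file
# path and constraint times below are made-up placeholders).
current_file = {"uri": "example", "audio": "/path/to/example.wav"}

# each constraint is a pair of times in seconds: "cannot link" means the two
# instants belong to different speakers, "must link" that they belong to the same one
diarization = pipeline(
    current_file,
    cannot_link=[(12.3, 45.6)],
    must_link=[(12.3, 78.9)],
)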
def init_database(db_dir, protocols, annotation_dir, path_to_wav):
    """Create annotation files for datasets

    Parameters
    ----------
    db_dir : string
        Path where SPEAKERS.TXT exists (path to the LibriSpeech directory)
    protocols : list of strings
        List of protocol names like ['dev-clean', 'dev-other', ...]
    annotation_dir : string
        Path to annotation files
    path_to_wav : string
        Path where wav files are created. This string should be put in
        ~/.pyannote/db.yaml
    """

    # wav_path_template = '{db_dir}/wav/{subset}/{uri}'
    wav_path_template = '{path_to_wav}/{uri}'

    # read file descriptor
    desc = {}
    with open(os.path.join(db_dir, 'SPEAKERS.TXT'), 'r') as file:
        content = file.readlines()

    for line in content:
        fields = line.translate(str.maketrans(
            dict.fromkeys('\' -()\n'))).split('|')
        # fields = c.translate('\' -()\n').split('|')
        if fields[0][0] == ';':
            continue
        desc[fields[0]] = {
            'gender': 'male' if fields[1] == 'M' else 'female',
            'subset': fields[2],
            'duration': float(fields[3]),
            'client_id': fields[4]
        }

    for protocol in protocols:

        filedir = os.path.join(db_dir, protocol)
        subset = 'librispeech-{}.{}'.format(
            protocol.split('-')[1], protocol.split('-')[0])

        try:
            # os.makedirs(wav_path_template.format(db_dir=db_dir, subset=subset, uri=''))
            os.makedirs(
                wav_path_template.format(path_to_wav=path_to_wav, uri=''))
        except FileExistsError:
            print('Directory exists')

        clients = listdir_nohidden(filedir)
        clients.sort(key=lambda a: a.lower())
        n_clients = len(clients)
        counter = 0

        for c in clients:

            d = desc[c]
            progress(counter, n_clients, d['client_id'])
            counter += 1

            group_sample_path = os.path.join(filedir, c)
            books = listdir_nohidden(group_sample_path)

            for b in books:

                books_sample_path = os.path.join(group_sample_path, b)
                files = listdir_nohidden(books_sample_path)

                if not CONCATENATE:

                    for f in files:
                        flac_sample_path = os.path.join(books_sample_path, f)
                        if not flac_sample_path.endswith(".flac"):
                            continue

                        # sample_path = wav_path_template.format(
                        #     uri=os.path.splitext(flac_sample_path)[0].split('/')[-1],
                        #     subset=subset,
                        #     db_dir=db_dir)
                        sample_path = wav_path_template.format(
                            uri=os.path.splitext(
                                flac_sample_path)[0].split('/')[-1],
                            path_to_wav=path_to_wav)
                        if not os.path.exists(sample_path + '.wav'):
                            file2wav(flac_sample_path, sample_path)

                        with open(
                                os.path.join(annotation_dir, 'data',
                                             subset + '.mdtm'),
                                'a') as datafile:
                            datafile.write(
                                '{uri} {channel} {start} {duration} {modality} {confidence} {gender} {label}\n'
                                .format(
                                    uri=os.path.splitext(
                                        flac_sample_path)[0].split('/')[-1],
                                    channel=1,
                                    start=0,
                                    duration=get_audio_duration(
                                        {'audio': sample_path + '.wav'}),
                                    modality='speaker',
                                    confidence='NA',
                                    gender=d['gender'],
                                    label=d['client_id']))

                else:

                    fname = "list.txt"
                    with open(fname, 'a') as file:
                        for f in files:
                            flac_sample_path = os.path.join(
                                books_sample_path, f)
                            if flac_sample_path.endswith(".flac"):
                                # file2wav(flac_sample_path, os.path.splitext(flac_sample_path)[0])
                                file.write(
                                    "file \'{}\'\n".format(flac_sample_path))

                    # sample_path = wav_path_template.format(
                    #     uri='{}-{}-{}'.format(c, d['client_id'], b),
                    #     subset=subset,
                    #     db_dir=db_dir)
                    sample_path = wav_path_template.format(
                        uri='{}-{}-{}'.format(c, d['client_id'], b),
                        path_to_wav=path_to_wav)
                    if not os.path.exists(sample_path + '.wav'):
                        list2wav(fname, sample_path)
                    os.remove(fname)

                    with open(
                            os.path.join(annotation_dir, 'data',
                                         subset + '.mdtm'),
                            'a') as datafile:
                        datafile.write(
                            '{uri} {channel} {start} {duration} {modality} {confidence} {gender} {label}\n'
                            .format(
                                uri=sample_path.split('/')[-1],
                                channel=1,
                                start=0,
                                duration=get_audio_duration(
                                    {'audio': sample_path + '.wav'}),
                                modality='speaker',
                                confidence='NA',
                                gender=d['gender'],
                                label=d['client_id']))