Example #1
    def from_file(self, current_file):

        from_annotation = current_file['annotation']

        if self.source == 'annotated':
            support = get_annotated(current_file)

        elif self.source == 'support':
            support = current_file['annotation'].get_timeline().support()

        elif self.source == 'annotation':
            support = current_file['annotation']

        elif self.source == 'audio':
            from pyannote.audio.features.utils import get_audio_duration
            support = get_audio_duration(current_file)

        else:
            raise ValueError(
                'source must be one of "annotated", "annotation", "support" '
                'or "audio"')

        if self.heterogeneous:
            generator = self.iter_heterogeneous_segments(from_annotation,
                                                         support)
        else:
            generator = self.iter_segments(from_annotation)

        for segment, label in generator:
            if label is None and self.skip_unlabeled:
                continue
            yield segment, label
Example #2
    def from_file(self, current_file):
        from pyannote.audio.features.utils import get_audio_duration

        duration = get_audio_duration(current_file)

        for left in self.iter_segments(duration):
            right = Segment(left.end + self.gap,
                            left.end + self.duration + self.gap)
            if right.end < duration:
                t = .5 * (left.end + right.start)
                yield t, left, right
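The pairing logic above yields, for every left segment, a right segment of the same duration separated by a fixed gap, together with the midpoint t between them. The following self-contained sketch reproduces that behaviour with hypothetical values standing in for the class attributes self.duration and self.gap, and a naive non-overlapping loop standing in for self.iter_segments:

from pyannote.core import Segment

seg_duration, gap, file_duration = 2.0, 0.1, 10.0  # hypothetical settings

start = 0.0
while start + seg_duration <= file_duration:
    left = Segment(start, start + seg_duration)
    right = Segment(left.end + gap, left.end + seg_duration + gap)
    if right.end < file_duration:
        t = .5 * (left.end + right.start)
        print(t, left, right)
    start += seg_duration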
Example #3
    def dia_manual_stream(self) -> Iterable[Dict]:

        for audio_source in Audio(self.source):

            path = audio_source["path"]
            text = audio_source["text"]

            # load speech/non-speech annotations (from pyannote.sad.manual recipe)
            file = load_sad_manual(self.dataset, path)
            manual_speech = file["speech"]
            annotated = file["annotated"]

            # use manual speech/non-speech annotation where available,
            # and automatic speech/non-speech elsewhere
            duration = get_audio_duration(file)
            file_extent = Segment(0, duration)
            non_annotated = annotated.gaps(file_extent)
            if non_annotated:
                automatic_speech = self.pipeline.compute_speech(file)
                file["speech"] = automatic_speech.crop(non_annotated).update(
                    manual_speech)

            # load existing same/different annotations (from pyannote.dia.binary recipe)
            self.load_dia_binary(path)

            # apply speaker diarization pipeline using same/different speaker
            # binary annotation as must link/cannot link constraints
            hypothesis = self.pipeline(file,
                                       cannot_link=self.cannot_link_time,
                                       must_link=self.must_link_time)

            # rename 9 most talkative speakers to {SPEAKER_1, ..., SPEAKER_9}
            # and remaining speakers as OTHER
            mapping = {
                label: f"SPEAKER_{s+1}" if s < 9 else "OTHER"
                for s, (label, duration) in enumerate(hypothesis.chart())
            }
            hypothesis = hypothesis.rename_labels(mapping=mapping)

            audio_spans = to_audio_spans(hypothesis)
            audio_source["audio_spans"] = audio_spans
            audio_source["audio_spans_original"] = deepcopy(audio_spans)
            audio_source["recipe"] = "pyannote.dia.manual"

            yield audio_source
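The speaker renaming step above relies only on pyannote.core. A minimal sketch, assuming nothing but a toy Annotation, shows how chart() and rename_labels() map the most talkative speakers to SPEAKER_1, ..., SPEAKER_9 and the rest to OTHER:

from pyannote.core import Annotation, Segment

hypothesis = Annotation(uri='demo')
hypothesis[Segment(0, 10)] = 'alice'   # 10 s of speech
hypothesis[Segment(10, 12)] = 'bob'    # 2 s
hypothesis[Segment(12, 13)] = 'carol'  # 1 s

# chart() returns (label, duration) pairs sorted by decreasing duration,
# so the most talkative speakers come first
mapping = {
    label: f"SPEAKER_{s + 1}" if s < 9 else "OTHER"
    for s, (label, duration) in enumerate(hypothesis.chart())
}
print(hypothesis.rename_labels(mapping=mapping))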
Example #4
def get_annotated(current_file):
    """Get part of the file that is annotated.

    Parameters
    ----------
    current_file : `dict`
        File generated by a `pyannote.database` protocol.

    Returns
    -------
    annotated : `pyannote.core.Timeline`
        Part of the file that is annotated. Defaults to
        `current_file["annotated"]`. When it does not exist, try to use the
        full audio extent. When that fails, use "annotation" extent.
    """

    # if protocol provides 'annotated' key, use it
    if 'annotated' in current_file:
        annotated = current_file['annotated']
        return annotated

    # if it does not, but does provide 'audio' key
    # try and use wav duration

    if 'audio' in current_file:
        try:
            from pyannote.audio.features.utils import get_audio_duration
            duration = get_audio_duration(current_file)
        except ImportError as e:
            pass
        else:
            warnings.warn('"annotated" was approximated by "audio" duration.')
            annotated = Timeline([Segment(0, duration)])
            return annotated

    warnings.warn('"annotated" was approximated by "annotation" extent.')
    extent = current_file['annotation'].get_timeline().extent()
    annotated = Timeline([extent])
    return annotated
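A quick usage sketch of get_annotated, assuming only pyannote.core and the excerpt above (the "audio" fallback is omitted since it needs an actual file on disk):

import warnings  # used inside get_annotated
from pyannote.core import Annotation, Segment, Timeline

annotation = Annotation(uri='demo')
annotation[Segment(1.0, 4.0)] = 'speaker_A'

# an explicit "annotated" key is returned as-is
file = {'annotation': annotation,
        'annotated': Timeline([Segment(0.0, 10.0)])}
print(get_annotated(file))

# without "annotated" (and without "audio"), the "annotation" extent
# is used instead, with a warning
print(get_annotated({'annotation': annotation}))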
Example #5
def check(protocol_name, file_finder, experiment_dir):

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        try:
            file_generator = getattr(protocol, subset)()
            first_item = next(file_generator)
        except NotImplementedError as e:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                audio = file_finder(current_file)
                current_file['audio'] = audio
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
Example #7
    def from_file(self, current_file):

        if isinstance(self.source, (Segment, Timeline)):
            source = self.source

        elif self.source == 'annotated':
            source = get_annotated(current_file)

        elif self.source == 'annotated_extent':
            source = get_annotated(current_file).extent()

        elif self.source == 'annotation':
            source = current_file['annotation']

        elif self.source == 'support':
            source = current_file['annotation'].get_timeline().support()

        elif self.source == 'audio':
            from pyannote.audio.features.utils import get_audio_duration
            source = get_audio_duration(current_file)

        else:
            raise ValueError(
                'source must be a Segment, a Timeline, or one of "annotated", '
                '"annotated_extent", "annotation", "support" or "audio"')

        for segment in self.iter_segments(source):
            yield segment
Example #8
#!/usr/bin/env python
# encoding: utf-8

from pyannote.audio.features.utils import get_audio_duration
import glob

subpath = '/vol/work3/maurice/AlbayzinEvaluationIberSPEECH-RTVE2018/data/RTVE2018DB/dev2/'
path = subpath + 'audio/'
files = glob.glob(path + '*-16000.wav')
for f in files:
    print(f.split('/')[-1].split('-mono')[0], get_audio_duration({'audio': f}))

path = subpath + 'enrollment/'
files = glob.glob(path + '*/*-16000.wav')
for f in files:
    print(
        f.split('/')[-1].split('-16000')[0], get_audio_duration({'audio': f}))
Example #9
def sad_manual_stream(
    pipeline: InteractiveDiarization, source: Path, chunk: float = 10.0
) -> Iterable[Dict]:
    """Stream for pyannote.sad.manual recipe

    Applies (pretrained) speech activity detection and sends the results for
    manual correction chunk by chunk.

    Parameters
    ----------
    pipeline : InteractiveDiarization
        Pretrained speaker diarization interactive pipeline.
        Note that only the speech activity detection part is used.
    source : Path
        Directory containing audio files to process.
    chunk : float, optional
        Duration of chunks, in seconds. Defaults to 10s.

    Yields
    ------
    task : dict
        Prodigy task with the following keys:
        "path" : path to audio file
        "text" : name of audio file
        "chunk" : chunk start and end times
        "audio" : base64 encoding of audio chunk
        "audio_spans" : speech spans detected by pretrained SAD model
        "audio_spans_original" : copy of "audio_spans"
        "meta" : additional meta-data displayed in Prodigy UI
        "recipe" : "pyannote.sad.manual"

    """

    raw_audio = RawAudio(sample_rate=SAMPLE_RATE, mono=True)

    for audio_source in Audio(source):

        path = audio_source["path"]
        text = audio_source["text"]
        file = {"uri": text, "database": source, "audio": path}

        duration = get_audio_duration(file)
        file["duration"] = duration

        prodigy.log(f"RECIPE: detecting speech regions in '{path}'")

        speech: Annotation = pipeline.compute_speech(file).to_annotation(
            generator=iter(lambda: "SPEECH", None)
        )

        if duration <= chunk:
            waveform = raw_audio.crop(file, Segment(0, duration))
            task_audio = to_base64(normalize(waveform), sample_rate=SAMPLE_RATE)
            task_audio_spans = to_audio_spans(speech)

            yield {
                "path": path,
                "text": text,
                "audio": task_audio,
                "audio_spans": task_audio_spans,
                "audio_spans_original": deepcopy(task_audio_spans),
                "chunk": {"start": 0, "end": duration},
                "meta": {"file": text},
                # this is needed by other recipes
                "recipe": "pyannote.sad.manual",
            }

        else:
            for focus in chunks(duration, chunk=chunk, shuffle=True):
                task_text = f"{text} [{focus.start:.1f}, {focus.end:.1f}]"
                waveform = raw_audio.crop(file, focus)
                task_audio = to_base64(normalize(waveform), sample_rate=SAMPLE_RATE)
                task_audio_spans = to_audio_spans(
                    speech.crop(focus, mode="intersection"), focus=focus
                )

                yield {
                    "path": path,
                    "text": task_text,
                    "audio": task_audio,
                    "audio_spans": task_audio_spans,
                    "audio_spans_original": deepcopy(task_audio_spans),
                    "chunk": {"start": focus.start, "end": focus.end},
                    "meta": {
                        "file": text,
                        "start": f"{focus.start:.1f}",
                        "end": f"{focus.end:.1f}",
                    },
                    # this is needed by other recipes
                    "recipe": "pyannote.sad.manual",
                }
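The chunks helper used above is not shown in this excerpt. A naive stand-in (no shuffling, and any tail shorter than one chunk is dropped) can be sketched with pyannote.core alone:

from pyannote.core import Segment

def naive_chunks(duration: float, chunk: float = 10.0):
    # yield consecutive fixed-size windows covering [0, duration)
    start = 0.0
    while start + chunk < duration:
        yield Segment(start, start + chunk)
        start += chunk

for focus in naive_chunks(33.0, chunk=10.0):
    print(f"[{focus.start:.1f}, {focus.end:.1f}]")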
    def __call__(
        self,
        current_file: ProtocolFile,
        cannot_link: List[Tuple[float, float]] = None,
        must_link: List[Tuple[float, float]] = None,
    ) -> Annotation:
        """Apply speaker diarization

        Parameters
        ----------
        current_file : ProtocolFile
            Protocol file.
        cannot_link : list of (float, float) pairs, optional
            List of time-based "cannot link" constraints.
        must_link : list of (float, float) pairs, optional
            List of time-based "must link" constraints.

        Returns
        -------
        diarization : Annotation
            Speaker diarization result.
        """

        if cannot_link is None:
            cannot_link = []
        if must_link is None:
            must_link = []

        if "duration" not in current_file:
            current_file["duration"] = get_audio_duration(current_file)

        # in "interactive annotation" mode, there is no need to recompute speech
        # regions every time a file is processed: they can be passed with the
        # file directly
        if "speech" in current_file:
            speech: Timeline = current_file["speech"]

        # in "pipeline optimization" mode, pipeline hyper-parameters are different
        # every time a file is processed: speech regions must be recomputed
        else:
            speech = self.compute_speech(current_file)

        if self.only_sad:
            return speech.to_annotation(generator=iter(lambda: "SPEECH", None))

        # in "interactive annotation" mode, pipeline hyper-parameters are fixed.
        # therefore, there is no need to recompute embeddings every time a file
        # is processed: they can be passed with the file directly.
        if "embedding" in current_file:
            embedding: SlidingWindowFeature = current_file["embedding"]

        # in "pipeline optimization" mode, pipeline hyper-parameters are different
        # every time a file is processed: embeddings must be recomputed
        else:
            embedding = self.compute_embedding(current_file)

        window: SlidingWindow = embedding.sliding_window

        # segment_assignment[i] = s with s > 0 means that ith embedding is
        #       strictly contained in (1-based) sth segment.
        # segment_assignment[i] = s with s < 0 means that more than half of ith
        #       embedding is part of (1-based) sth segment.
        # segment_assignment[i] = 0 means that none of the above is true.
        segment_assignment: np.ndarray = self.get_segment_assignment(
            embedding, speech)

        # cluster_assignment[i] = k (k > 0) means that the ith embedding belongs
        #                           to kth cluster
        # cluster_assignment[i] = 0 when segment_assignment[i] = 0
        cluster_assignment: np.ndarray = np.zeros((len(embedding), ),
                                                  dtype=np.int32)

        clean = segment_assignment > 0
        noisy = segment_assignment < 0
        clean_indices = np.where(clean)[0]
        if len(clean_indices) < 2:
            cluster_assignment[clean_indices] = 1

        else:

            # convert time-based constraints to index-based constraints
            cannot_link = index2index(time2index(cannot_link, window), clean)
            must_link = index2index(time2index(must_link, window), clean)

            dendrogram = pool(
                embedding[clean_indices],
                metric="cosine",
                cannot_link=cannot_link,
                must_link=must_link,
                must_link_method="propagate",
            )
            clusters = fcluster(dendrogram,
                                self.emb_threshold,
                                criterion="distance")
            for i, k in zip(clean_indices, clusters):
                cluster_assignment[i] = k

        loose_indices = np.where(noisy)[0]
        if len(clean_indices) == 0:
            if len(loose_indices) < 2:
                clusters = [1] * len(loose_indices)
            else:
                dendrogram = pool(embedding[loose_indices], metric="cosine")
                clusters = fcluster(dendrogram,
                                    self.emb_threshold,
                                    criterion="distance")
            for i, k in zip(loose_indices, clusters):
                cluster_assignment[i] = k

        else:
            # NEAREST NEIGHBOR
            distance = cdist(embedding[clean_indices],
                             embedding[loose_indices],
                             metric="cosine")
            nearest_neighbor = np.argmin(distance, axis=0)
            for loose_index, nn in zip(loose_indices, nearest_neighbor):
                strict_index = clean_indices[nn]
                cluster_assignment[loose_index] = cluster_assignment[
                    strict_index]

            # # NEAREST CLUSTER
            # centroid = np.vstack(
            #     [
            #         np.mean(embedding[cluster_assignment == k], axis=0)
            #         for k in np.unique(clusters)
            #     ]
            # )
            # distance = cdist(centroid, embedding[loose_indices], metric="cosine")
            # cluster_assignment[loose_indices] = np.argmin(distance, axis=0) + 1

        # convert cluster assignment to pyannote.core.Annotation
        # (make sure to keep speech regions unchanged)
        hypothesis = Annotation(uri=current_file.get("uri", None))
        for s, segment in enumerate(speech):

            indices = np.where(segment_assignment == s + 1)[0]
            if len(indices) == 0:
                indices = np.where(segment_assignment == -(s + 1))[0]
                if len(indices) == 0:
                    continue

            clusters = cluster_assignment[indices]

            start, k = segment.start, clusters[0]
            change_point = np.diff(clusters) != 0
            for i, new_k in zip(indices[1:][change_point],
                                clusters[1:][change_point]):
                end = window[i].middle + 0.5 * window.step
                hypothesis[Segment(start, end)] = k
                start = end
                k = new_k
            hypothesis[Segment(start, segment.end)] = k

        return hypothesis.support()
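The constrained pool/fcluster step above depends on the pipeline's embeddings and constraint helpers. As an unconstrained stand-in, the same "cut the dendrogram at a distance threshold" idea can be sketched with plain scipy (fake embeddings, arbitrary threshold in place of self.emb_threshold):

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

# fake speaker embeddings: two groups pointing in orthogonal directions
rng = np.random.default_rng(0)
centers = np.zeros((2, 32))
centers[0, 0] = 1.0
centers[1, 1] = 1.0
embedding = np.vstack([c + rng.normal(0, 0.05, (10, 32)) for c in centers])

# average-linkage clustering with a cosine metric, cut at an arbitrary
# distance threshold (the pipeline tunes self.emb_threshold instead)
dendrogram = linkage(embedding, method="average", metric="cosine")
clusters = fcluster(dendrogram, 0.5, criterion="distance")
print(clusters)  # two clusters of 10 embeddings each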
def init_database(db_dir, protocols, annotation_dir, path_to_wav):
    """Create annotation files for datasets

    Parameters
    ----------
    db_dir : string
        Path where SPEAKERS.TXT exists (path to the LibriSpeech root directory)
    protocols : list of strings
        List of strings with protocols like ['dev-clean', 'dev-other', ...]
    annotation_dir: string
        Path to annotation files
    path_to_wav : string
        Path where wav files are created. This path should be added to
        ~/.pyannote/db.yaml.

    Usage
    -----

    """

    # wav_path_template = '{db_dir}/wav/{subset}/{uri}'
    wav_path_template = '{path_to_wav}/{uri}'

    # read file descriptor
    desc = {}
    with open(os.path.join(db_dir, 'SPEAKERS.TXT'), 'r') as file:
        content = file.readlines()
        for line in content:
            fields = line.translate(str.maketrans(
                dict.fromkeys('\' -()\n'))).split('|')
            # fields = c.translate('\' -()\n').split('|')
            if fields[0][0] == ';':
                continue
            desc[fields[0]] = {
                'gender': 'male' if fields[1] == 'M' else 'female',
                'subset': fields[2],
                'duration': float(fields[3]),
                'client_id': fields[4]
            }

    for protocol in protocols:

        filedir = os.path.join(db_dir, protocol)

        subset = 'librispeech-{}.{}'.format(
            protocol.split('-')[1],
            protocol.split('-')[0])
        try:
            # os.makedirs(wav_path_template.format(db_dir = db_dir, subset = subset, uri=''))
            os.makedirs(
                wav_path_template.format(path_to_wav=path_to_wav, uri=''))
        except FileExistsError:
            print('Directory exists')

        clients = listdir_nohidden(filedir)
        clients.sort(key=lambda a: a.lower())

        n_clients = len(clients)
        counter = 0

        for c in clients:
            d = desc[c]

            progress(counter, n_clients, d['client_id'])
            counter += 1

            group_sample_path = os.path.join(filedir, c)
            books = listdir_nohidden(group_sample_path)
            for b in books:
                books_sample_path = os.path.join(group_sample_path, b)
                files = listdir_nohidden(books_sample_path)

                if not CONCATENATE:
                    for f in files:
                        flac_sample_path = os.path.join(books_sample_path, f)
                        if flac_sample_path.endswith(".flac"):
                            # sample_path = wav_path_template.format(
                            #     uri = os.path.splitext(flac_sample_path)[0].split('/')[-1],
                            #     subset = subset,
                            #     db_dir = db_dir
                            # )
                            sample_path = wav_path_template.format(
                                uri=os.path.splitext(
                                    flac_sample_path)[0].split('/')[-1],
                                path_to_wav=path_to_wav)

                            if not os.path.exists(sample_path + '.wav'):
                                file2wav(flac_sample_path, sample_path)

                            with open(
                                    os.path.join(annotation_dir, 'data',
                                                 subset + '.mdtm'),
                                    'a') as datafile:
                                datafile.write(
                                    '{uri} {channel} {start} {duration} {modality} {confidence} {gender} {label}\n'
                                    .format(uri=os.path.splitext(
                                        flac_sample_path)[0].split('/')[-1],
                                            channel=1,
                                            start=0,
                                            duration=get_audio_duration(
                                                {'audio': sample_path + '.wav'}),
                                            modality='speaker',
                                            confidence='NA',
                                            gender=d['gender'],
                                            label=d['client_id']))

                else:

                    fname = "list.txt"
                    with open(fname, 'a') as file:
                        for f in files:
                            flac_sample_path = os.path.join(
                                books_sample_path, f)
                            if flac_sample_path.endswith(".flac"):
                                #file2wav(flac_sample_path, os.path.splitext(flac_sample_path)[0])
                                file.write(
                                    "file \'{}\'\n".format(flac_sample_path))

                    # sample_path = wav_path_template.format(
                    #     uri = '{}-{}-{}'.format(c, d['client_id'], b),
                    #     subset = subset,
                    #     db_dir = db_dir
                    # )
                    sample_path = wav_path_template.format(
                        uri='{}-{}-{}'.format(c, d['client_id'], b),
                        path_to_wav=path_to_wav)

                    if not os.path.exists(sample_path + '.wav'):
                        list2wav(fname, sample_path)
                    os.remove(fname)

                    with open(
                            os.path.join(annotation_dir, 'data',
                                         subset + '.mdtm'), 'a') as datafile:
                        datafile.write(
                            '{uri} {channel} {start} {duration} {modality} {confidence} {gender} {label}\n'
                            .format(uri=sample_path.split('/')[-1],
                                    channel=1,
                                    start=0,
                                    duration=get_audio_duration(
                                        {'audio': sample_path + '.wav'}),
                                    modality='speaker',
                                    confidence='NA',
                                    gender=d['gender'],
                                    label=d['client_id']))
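For reference, the SPEAKERS.TXT parsing above can be exercised on a couple of hypothetical lines (LibriSpeech's ID|SEX|SUBSET|MINUTES|NAME layout) without the full dataset:

sample = [
    ";ID  |SEX| SUBSET           |MINUTES| NAME\n",
    "84   | F | dev-clean        | 8.02  | Jane Doe\n",   # hypothetical entry
]

desc = {}
for line in sample:
    fields = line.translate(str.maketrans(dict.fromkeys('\' -()\n'))).split('|')
    if fields[0][0] == ';':
        continue
    desc[fields[0]] = {
        'gender': 'male' if fields[1] == 'M' else 'female',
        'subset': fields[2],
        'duration': float(fields[3]),
        'client_id': fields[4],
    }

print(desc)
# {'84': {'gender': 'female', 'subset': 'devclean',
#         'duration': 8.02, 'client_id': 'JaneDoe'}}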