Beispiel #1
0
def process_current_file(current_file, file_finder=None, precomputed=None,
                         feature_extraction=None, robust=False):
    """Extract features for one file and dump them, unless already on disk

    Parameters
    ----------
    current_file : dict
        File to process. Its "audio" key is set in place from `file_finder`.
    file_finder : callable
        Called with `current_file`; expected to raise `ValueError` when the
        audio file cannot be located.
    precomputed : object
        Provides `get_path(current_file)` and `dump(current_file, features)`.
    feature_extraction : callable
        Called with `current_file` to compute the features.
    robust : bool, optional
        When True, a failure of `file_finder` is returned instead of raised.

    Returns
    -------
    outcome : None, ValueError or str
        None when features were dumped or already existed on disk, the
        caught `ValueError` when `file_finder` failed in robust mode, or an
        error message when feature extraction failed, returned None, or
        returned NaNs.

    Raises
    ------
    PyannoteFeatureExtractionError
        When `file_finder` fails and `robust` is False.
    """

    # locate the audio file
    try:
        current_file['audio'] = file_finder(current_file)
    except ValueError as error:
        if robust:
            return error
        raise PyannoteFeatureExtractionError(*error.args)

    uri = get_unique_identifier(current_file)

    # nothing to do when features were already dumped
    if os.path.exists(precomputed.get_path(current_file)):
        return None

    try:
        features = feature_extraction(current_file)
    except PyannoteFeatureExtractionError:
        return 'Feature extraction failed for file "{uri}".'.format(uri=uri)

    if features is None:
        return 'Feature extraction returned None for file "{uri}".'.format(
            uri=uri)

    if np.any(np.isnan(features.data)):
        return 'Feature extraction returned NaNs for file "{uri}".'.format(
            uri=uri)

    precomputed.dump(current_file, features)
    return None
Beispiel #2
0
    def __call__(self, current_file) -> SlidingWindowFeature:
        """Extract features from file

        Parameters
        ----------
        current_file : dict
            `pyannote.database` files.

        Returns
        -------
        features : `pyannote.core.SlidingWindowFeature`
            Extracted features

        Notes
        -----
        A warning is issued (and features are still returned) when the
        extracted features contain NaNs.
        """

        # load waveform, re-sample, convert to mono, augment, normalize
        y, sample_rate = self.raw_audio_(current_file, return_sr=True)

        # compute features
        features = self.get_features(y.data, sample_rate)

        # basic quality check
        if np.any(np.isnan(features)):
            uri = get_unique_identifier(current_file)
            # the f-string is already fully interpolated: calling `.format`
            # on it again would fail (or alter the message) whenever `uri`
            # itself contains curly braces.
            warnings.warn(f'Features extracted from "{uri}" contain NaNs.')

        # wrap features in a `SlidingWindowFeature` instance
        return SlidingWindowFeature(features, self.sliding_window)
Beispiel #3
0
 def get_hash(trial_file):
     """Return a hash of the file identifier and its 'try_with' segments

     A `Timeline` is expanded into a tuple of its items; any other value
     is wrapped into a one-element tuple.
     """
     uri = get_unique_identifier(trial_file)
     try_with = trial_file['try_with']
     segments = (tuple(try_with) if isinstance(try_with, Timeline)
                 else (try_with, ))
     return hash((uri, segments))
Beispiel #4
0
    def apply(self, current_file):
        """Compute predictions on a sliding window

        Predictions obtained on overlapping sub-sequences are averaged
        frame by frame.

        Parameter
        ---------
        current_file : dict

        Returns
        -------
        predictions : SlidingWindowFeature
            Empty (zero rows) when `from_file` yields no batch.
        """

        # frame and sub-sequence sliding windows
        frames = self.feature_extraction.sliding_window()

        batches = list(self.from_file(current_file, incomplete=True))
        if not batches:
            data = np.zeros((0, self.dimension), dtype=np.float32)
            return SlidingWindowFeature(data, frames)

        fX = np.vstack(batches)

        subsequences = SlidingWindow(duration=self.duration, step=self.step)

        # get total number of frames
        if isinstance(self.feature_extraction, Precomputed):
            n_frames, _ = self.feature_extraction.shape(current_file)
        else:
            uri = get_unique_identifier(current_file)
            # unpack the *shape* of the cached features, not the array itself
            n_frames, _ = self.preprocessed_[uri].data.shape

        # data[i] is the sum of all predictions for frame #i
        data = np.zeros((n_frames, self.dimension), dtype=np.float32)

        # k[i] is the number of sequences that overlap with frame #i
        # (int16 rather than int8 so that the counter does not wrap around
        # with more than 127 overlapping sequences; dividing float32 by
        # int16 still yields float32)
        k = np.zeros((n_frames, 1), dtype=np.int16)

        for subsequence, fX_ in zip(subsequences, fX):

            # indices of frames overlapped by subsequence
            indices = frames.crop(subsequence,
                                  mode='center',
                                  fixed=self.duration)

            # accumulate the outputs
            data[indices] += fX_

            # keep track of the number of overlapping sequence
            # TODO - use smarter weights (e.g. Hamming window)
            k[indices] += 1

        # compute average embedding of each frame
        # (frames that were never covered keep their zero value)
        data = data / np.maximum(k, 1)

        return SlidingWindowFeature(data, frames)
    def apply(self, current_file):
        """Compute predictions on a sliding window

        Predictions obtained on overlapping sub-sequences are averaged
        frame by frame.

        Parameter
        ---------
        current_file : dict

        Returns
        -------
        predictions : SlidingWindowFeature
            Empty (zero rows) when `from_file` yields no batch.
        """

        # frame and sub-sequence sliding windows
        frames = self.feature_extraction.sliding_window()

        batches = list(self.from_file(current_file, incomplete=True))
        if not batches:
            data = np.zeros((0, self.dimension), dtype=np.float32)
            return SlidingWindowFeature(data, frames)

        fX = np.vstack(batches)

        subsequences = SlidingWindow(duration=self.duration, step=self.step)

        # get total number of frames
        if isinstance(self.feature_extraction, Precomputed):
            n_frames, _ = self.feature_extraction.shape(current_file)
        else:
            uri = get_unique_identifier(current_file)
            # unpack the *shape* of the cached features, not the array itself
            n_frames, _ = self.preprocessed_[uri].data.shape

        # data[i] is the sum of all predictions for frame #i
        data = np.zeros((n_frames, self.dimension), dtype=np.float32)

        # k[i] is the number of sequences that overlap with frame #i
        # (int16 rather than int8 so that the counter does not wrap around
        # with more than 127 overlapping sequences; dividing float32 by
        # int16 still yields float32)
        k = np.zeros((n_frames, 1), dtype=np.int16)

        for subsequence, fX_ in zip(subsequences, fX):

            # indices of frames overlapped by subsequence
            indices = frames.crop(subsequence,
                                  mode='center',
                                  fixed=self.duration)

            # accumulate the outputs
            data[indices] += fX_

            # keep track of the number of overlapping sequence
            # TODO - use smarter weights (e.g. Hamming window)
            k[indices] += 1

        # compute average embedding of each frame
        # (frames that were never covered keep their zero value)
        data = data / np.maximum(k, 1)

        return SlidingWindowFeature(data, frames)
    def preprocess(self, current_file):
        """Attach on-demand features to a protocol file

        Parameters
        ----------
        current_file : dict
            Generated by a pyannote.database.Protocol

        Returns
        -------
        current_file : dict
            `current_file` itself when feature extraction is `Precomputed`
            or when it already has a "features" entry; otherwise a shallow
            copy with an additional "features" entry.

        Notes
        -----
        Extracted features are kept in the `preprocessed_` LRU cache
        (created on first use), keyed by the file unique identifier.
        """

        # precomputed features are loaded from disk on demand elsewhere,
        # and already attached features need no further work
        if isinstance(self.feature_extraction, Precomputed) or \
           'features' in current_file:
            return current_file

        # lazily create the cache that stores extracted features
        if not hasattr(self, 'preprocessed_'):
            self.preprocessed_ = LRUCache(maxsize=CACHE_MAXSIZE)
        cache = self.preprocessed_

        # extract features only when they are not cached yet
        key = get_unique_identifier(current_file)
        if key not in cache:
            cache[key] = self.feature_extraction(current_file)

        # work on a copy so that the original file does not hold on to
        # (potentially large) features
        enriched = dict(current_file)
        enriched['features'] = cache[key]
        return enriched
Beispiel #7
0
    def preprocess(self, current_file):
        """Attach on-demand features to a protocol file

        Parameters
        ----------
        current_file : dict
            Generated by a pyannote.database.Protocol

        Returns
        -------
        current_file : dict
            `current_file` itself when feature extraction is `Precomputed`
            or when it already has a "features" entry; otherwise a shallow
            copy with an additional "features" entry.

        Notes
        -----
        Extracted features are kept in the `preprocessed_` LRU cache
        (created on first use), keyed by the file unique identifier.
        """

        # features precomputed on disk: nothing to attach here
        precomputed = isinstance(self.feature_extraction, Precomputed)
        if precomputed or 'features' in current_file:
            return current_file

        # first call that actually needs extraction creates the cache
        if not hasattr(self, 'preprocessed_'):
            self.preprocessed_ = LRUCache(maxsize=CACHE_MAXSIZE)
        cache = self.preprocessed_

        # cache key of the current file
        key = get_unique_identifier(current_file)
        if key not in cache:
            extracted = self.feature_extraction(current_file)
            cache[key] = extracted

        # return a copy so that "features" are not stored in the
        # original file
        result = dict(current_file)
        result['features'] = cache[key]
        return result
def check(protocol_name, file_finder, experiment_dir):
    """Print a report about precomputed features of a protocol

    For every file of the 'development', 'test' and 'train' subsets, a
    message is printed when the audio file cannot be found, when loading
    precomputed features fails, when the features extent differs from the
    audio duration by more than one second, or when features contain NaNs.

    Parameters
    ----------
    protocol_name : str
        Passed to `get_protocol`.
    file_finder : callable
        Called with each file; expected to raise `ValueError` when the
        audio file cannot be located.
    experiment_dir : str
        Passed to `Precomputed`.
    """

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        # probe the subset by requesting its first file: skip it when it
        # is not implemented, or when it is empty (`next` would otherwise
        # let StopIteration escape and abort the whole check)
        try:
            next(getattr(protocol, subset)())
        except (NotImplementedError, StopIteration):
            continue

        for current_file in getattr(protocol, subset)():

            try:
                current_file['audio'] = file_finder(current_file)
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            # tolerate up to one second of difference
            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
Beispiel #9
0
def check(protocol_name, file_finder, experiment_dir):
    """Print a report about precomputed features of a protocol

    For every file of the 'development', 'test' and 'train' subsets, a
    message is printed when the audio file cannot be found, when loading
    precomputed features fails, when the features extent differs from the
    audio duration by more than one second, or when features contain NaNs.

    Parameters
    ----------
    protocol_name : str
        Passed to `get_protocol`.
    file_finder : callable
        Called with each file; expected to raise `ValueError` when the
        audio file cannot be located.
    experiment_dir : str
        Passed to `Precomputed`.
    """

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        # probe the subset by requesting its first file: skip it when it
        # is not implemented, or when it is empty (`next` would otherwise
        # let StopIteration escape and abort the whole check)
        try:
            next(getattr(protocol, subset)())
        except (NotImplementedError, StopIteration):
            continue

        for current_file in getattr(protocol, subset)():

            try:
                current_file['audio'] = file_finder(current_file)
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            # tolerate up to one second of difference
            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
        def fun(threshold):
            """Return one minus the accumulated metric value for `threshold`

            For each file of the subset, flat clusters are obtained from
            `Z[uri]` with `fcluster` (distance criterion) and turned into
            an `Annotation` using the segments stored in `t[uri]`.
            """

            fmeasure = DiarizationPurityCoverageFMeasure(weighted=False)

            for current_file in getattr(_protocol, subset)():

                uri = get_unique_identifier(current_file)
                uem = get_annotated(current_file)
                reference = current_file["annotation"]

                labels = fcluster(Z[uri], threshold, criterion="distance")

                # one labelled segment per (segment, cluster) pair
                hypothesis = Annotation(uri=uri)
                for (onset, offset), label in zip(t[uri], labels):
                    hypothesis[Segment(onset, offset)] = label

                # accumulate; the per-file value itself is not needed
                fmeasure(reference, hypothesis, uem=uem)

            return 1.0 - abs(fmeasure)
def process_current_file(current_file, file_finder=None, precomputed=None,
                         feature_extraction=None, normalization=None,
                         robust=False):
    """Extract, optionally normalize, and dump features for one file

    Parameters
    ----------
    current_file : dict
        File to process. Its "audio" key is set in place from `file_finder`.
    file_finder : callable
        Called with `current_file`; expected to raise `ValueError` when the
        audio file cannot be located.
    precomputed : object
        Provides `get_path(current_file)` and `dump(current_file, features)`.
    feature_extraction : callable
        Called with `current_file` to compute the features.
    normalization : callable, optional
        When provided, applied to the features right before dumping them.
    robust : bool, optional
        When True, a failure of `file_finder` is returned instead of raised.

    Returns
    -------
    outcome : None, ValueError or str
        None when features were dumped or already existed on disk, the
        caught `ValueError` when `file_finder` failed in robust mode, or an
        error message when feature extraction failed, returned None, or
        returned NaNs.

    Raises
    ------
    PyannoteFeatureExtractionError
        When `file_finder` fails and `robust` is False.
    """

    # locate the audio file
    try:
        current_file['audio'] = file_finder(current_file)
    except ValueError as error:
        if robust:
            return error
        raise PyannoteFeatureExtractionError(*error.args)

    uri = get_unique_identifier(current_file)

    # nothing to do when features were already dumped
    if os.path.exists(precomputed.get_path(current_file)):
        return None

    try:
        features = feature_extraction(current_file)
    except PyannoteFeatureExtractionError:
        return 'Feature extraction failed for file "{uri}".'.format(uri=uri)

    if features is None:
        return 'Feature extraction returned None for file "{uri}".'.format(
            uri=uri)

    # NaNs are checked before normalization is applied
    if np.any(np.isnan(features.data)):
        return 'Feature extraction returned NaNs for file "{uri}".'.format(
            uri=uri)

    if normalization is not None:
        features = normalization(features)

    precomputed.dump(current_file, features)
    return None
Beispiel #12
0
    def apply(self, protocol_name, output_dir):
        """Apply the pipeline to a protocol and write its output to disk

        Results are written to "{output_dir}/{protocol_name}.txt", one line
        per segment (for `Timeline` outputs) or per track (otherwise).

        Parameters
        ----------
        protocol_name : str
            Passed to `get_protocol`; also used as output file stem.
        output_dir : str
            Output directory (created with `mkdir_p`).
        """

        # file generator
        protocol = get_protocol(protocol_name, progress=True,
                                preprocessors=self.preprocessors_)

        mkdir_p(output_dir)
        output_path = Path(output_dir) / f'{protocol_name}.txt'

        with open(output_path, mode='w') as fp:

            for current_file in FileFinder.protocol_file_iter(
                    protocol, extra_keys=['audio']):

                uri = get_unique_identifier(current_file)
                hypothesis = self.pipeline_.apply(current_file)

                if isinstance(hypothesis, Timeline):
                    # timelines only carry segments
                    lines = (
                        f'{uri} {segment.start:.3f} {segment.end:.3f}\n'
                        for segment in hypothesis)
                else:
                    # other outputs also carry a track name and a label
                    lines = (
                        f'{uri} {segment.start:.3f} {segment.end:.3f} '
                        f'{track} {label}\n'
                        for segment, track, label
                        in hypothesis.itertracks(yield_label=True))

                fp.writelines(lines)
        def fun(threshold):
            """Return the accumulated detection error rate for `threshold`

            `threshold` is used as both onset and offset when binarizing
            the scores stored in `predictions`.
            """

            binarize = Binarize(onset=threshold,
                                offset=threshold,
                                log_scale=False)

            files = get_protocol(protocol_name, progress=False,
                                 preprocessors=self.preprocessors_)

            error_rate = DetectionErrorRate()

            # NOTE -- embarrasingly parallel
            # TODO -- parallelize this
            for current_file in getattr(files, subset)():

                uri = get_unique_identifier(current_file)
                binarized = binarize.apply(predictions[uri], dimension=0)
                hypothesis = binarized.to_annotation()
                reference = current_file['annotation']
                uem = get_annotated(current_file)

                # accumulate; the per-file value itself is not needed
                error_rate(reference, hypothesis, uem=uem)

            return abs(error_rate)
Beispiel #14
0
    def initialize(self, protocol, subset='train'):
        """Gather the following information about the training subset:

        data_ : dict
            Maps the unique identifier of each file to

            {'segments': <list of annotated segments>,
             'duration': <total duration of annotated segments>,
             'current_file': <protocol dictionary>,
             'y': <labels as numpy array>}

            Only annotated segments strictly longer than `self.duration`
            are kept. Files without any such segment are left out.

        databases_ : list
            Sorted list of (unique) databases in protocol.

        labels_ : list
            Sorted list of (unique) labels in protocol.
        """

        self.data_ = {}
        labels, databases = set(), set()

        # the "features loaded in memory" warning is only worth issuing once
        warned = False

        # loop once on all files
        for current_file in getattr(protocol, subset)():

            # keep track of database
            database = current_file['database']
            databases.add(database)

            # keep track of unique labels
            for label in current_file['annotation'].labels():
                label = get_label_identifier(label, current_file)
                labels.add(label)

            annotated = get_annotated(current_file)

            if not warned and not self.precomputed.use_memmap:
                msg = ('Loading all precomputed features in memory. '
                       'Set "use_memmap" to True if you run out of memory.')
                warnings.warn(msg)
                warned = True

            segments = [s for s in annotated if s.duration > self.duration]

            # corner case where no segment is long enough
            # and we removed them all...
            if not segments:
                continue

            # total duration of label in current_file (after removal of
            # short segments).
            duration = sum(s.duration for s in segments)

            # store all these in data_ dictionary
            datum = {'segments': segments,
                     'duration': duration,
                     'current_file': current_file}
            uri = get_unique_identifier(current_file)
            self.data_[uri] = datum

        self.databases_ = sorted(databases)
        self.labels_ = sorted(labels)

        # second pass: labels can only be encoded once `labels_` is known
        sliding_window = self.precomputed.sliding_window()
        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)

            # files dropped during the first pass (no segment long enough)
            # have no entry in data_: skip them instead of raising KeyError
            if uri not in self.data_:
                continue

            y, _ = to_numpy(current_file, self.precomputed,
                            labels=self.labels_)
            self.data_[uri]['y'] = SlidingWindowFeature(
                self.postprocess_y(y), sliding_window)
    def _validate_epoch_diarization(
        self,
        epoch,
        validation_data,
        protocol=None,
        subset: Subset = "development",
        device: Optional[torch.device] = None,
        batch_size: int = 32,
        n_jobs: int = 1,
        duration: float = None,
        step: float = 0.25,
        metric: str = None,
        **kwargs,
    ):
        """Validate the model at `epoch` on a diarization task

        For each file of the subset, embeddings are extracted with
        `Pretrained`, averaged over each reference speech turn, and
        clustered with `linkage`. The threshold passed to `fcluster` is
        then tuned with `scipy.optimize.minimize_scalar` (bounded to
        [0, 1], at most 10 iterations) to minimize one minus the value of
        `DiarizationPurityCoverageFMeasure`.

        Parameters
        ----------
        epoch :
            Forwarded to `Pretrained`.
        validation_data :
            Not used by this method.
        protocol :
            Passed to `get_protocol`.
        subset : Subset, optional
            Name of the protocol method providing the files.
        device : torch.device, optional
            Forwarded to `Pretrained`.
        batch_size : int, optional
            Forwarded to `Pretrained`.
        n_jobs : int, optional
            Not used by this method.
        duration : float, optional
            Forwarded to `Pretrained`.
        step : float, optional
            Forwarded to `Pretrained`.
        metric : str, optional
            Passed to `pdist` and `linkage`.
        **kwargs
            Not used by this method.

        Returns
        -------
        result : dict
            {"metric": "diarization_fscore", "minimize": False,
             "value": <one minus the minimum found by the optimizer>}

        Raises
        ------
        ValueError
            When no embedding can be cropped for a reference speech turn.
        """

        # initialize embedding extraction
        pretrained = Pretrained(
            validate_dir=self.validate_dir_,
            epoch=epoch,
            duration=duration,
            step=step,
            batch_size=batch_size,
            device=device,
        )

        # NOTE(review): `preprocessors` is the same object as
        # `self.preprocessors_`, so the defaults added below are also
        # stored on the instance -- confirm this is intended.
        preprocessors = self.preprocessors_
        if "audio" not in preprocessors:
            preprocessors["audio"] = FileFinder()
        if "duration" not in preprocessors:
            preprocessors["duration"] = get_audio_duration
        _protocol = get_protocol(protocol, preprocessors=preprocessors)

        # Z[uri] is the linkage result and t[uri] the array of speech
        # turns of each file
        Z, t = dict(), dict()
        # NOTE(review): min_d / max_d are updated in the loop below but
        # never read afterwards in this method.
        min_d, max_d = np.inf, -np.inf

        for current_file in getattr(_protocol, subset)():

            uri = get_unique_identifier(current_file)
            # (`uem` is not used in this first loop)
            uem = get_annotated(current_file)
            reference = current_file["annotation"]

            X_, t_ = [], []
            embedding = pretrained(current_file)
            for i, (turn, _) in enumerate(reference.itertracks()):

                # extract embedding for current speech turn
                # ("center" mode first, "loose" mode as a fallback)
                x_ = embedding.crop(turn, mode="center")
                if len(x_) < 1:
                    x_ = embedding.crop(turn, mode="loose")
                if len(x_) < 1:
                    msg = f"No embedding for {turn} in {uri:s}."
                    raise ValueError(msg)

                # each speech turn is represented by its average embedding
                X_.append(np.mean(x_, axis=0))
                t_.append(turn)

            X_ = np.array(X_)
            # apply hierarchical agglomerative clustering
            # all the way up to just one cluster (ie complete dendrogram)
            D = pdist(X_, metric=metric)
            min_d = min(np.min(D), min_d)
            max_d = max(np.max(D), max_d)

            Z[uri] = linkage(X_, method="pool", metric=metric)
            t[uri] = np.array(t_)

        def fun(threshold):
            # objective function: one minus the metric accumulated over
            # all files when clustering with `threshold`

            _metric = DiarizationPurityCoverageFMeasure(weighted=False)

            for current_file in getattr(_protocol, subset)():

                uri = get_unique_identifier(current_file)
                uem = get_annotated(current_file)
                reference = current_file["annotation"]

                clusters = fcluster(Z[uri], threshold, criterion="distance")

                # one labelled segment per (speech turn, cluster) pair
                hypothesis = Annotation(uri=uri)
                for (start_time, end_time), cluster in zip(t[uri], clusters):
                    hypothesis[Segment(start_time, end_time)] = cluster

                _ = _metric(reference, hypothesis, uem=uem)

            return 1.0 - abs(_metric)

        res = scipy.optimize.minimize_scalar(fun,
                                             bounds=(0.0, 1.0),
                                             method="bounded",
                                             options={"maxiter": 10})

        # NOTE(review): the best threshold is computed here but is not
        # part of the returned dictionary.
        threshold = res.x.item()

        return {
            "metric": "diarization_fscore",
            "minimize": False,
            "value": float(1.0 - res.fun),
        }
Beispiel #16
0
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):
        """Validate overlapped speech detection using model at `epoch`

        Scores are extracted for every file of the subset, then a
        dichotomic search (10 steps over [0, 1]) looks for the
        binarization threshold that maximizes recall while reaching at
        least `self.precision` precision.

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Passed to `get_protocol`.
        subset : str, optional
            Name of the protocol method providing the files.
        validation_data : optional
            Not used by this method.

        Returns
        -------
        metrics : dict
            Best recall and corresponding threshold.
        """

        target_precision = self.precision

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # sliding window: hop is a quarter of the window duration
        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=step, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        predictions, references = {}, {}

        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)

            # build overlap reference: intersection of every pair of
            # distinct tracks
            overlap = Timeline(uri=uri)
            annotation = current_file['annotation']
            for track, other_track in annotation.co_iter(annotation):
                if track != other_track:
                    overlap.add(track[0] & other_track[0])
            references[uri] = overlap.to_annotation()

            # extract overlap scores (column #2 of the model output),
            # exponentiated when the model outputs log-probabilities
            scores = sequence_labeling.apply(current_file)
            values = (np.exp(scores.data[:, 2]) if model.logsoftmax
                      else scores.data[:, 2])
            predictions[uri] = SlidingWindowFeature(
                values, scores.sliding_window)

        # dichotomic search to find threshold that maximizes recall
        # while having at least `target_precision`
        lower_alpha, upper_alpha = 0., 1.
        best_alpha = .5 * (lower_alpha + upper_alpha)
        best_recall = 0.

        for _ in range(10):
            current_alpha = .5 * (lower_alpha + upper_alpha)
            binarizer = Binarize(onset=current_alpha,
                                 offset=current_alpha,
                                 log_scale=False)

            precision = DetectionPrecision()
            recall = DetectionRecall()

            # accumulate both metrics over the whole subset
            for current_file in getattr(protocol, subset)():
                uri = get_unique_identifier(current_file)
                reference = references[uri]
                binarized = binarizer.apply(predictions[uri], dimension=0)
                hypothesis = binarized.to_annotation()
                uem = get_annotated(current_file)
                precision(reference, hypothesis, uem=uem)
                recall(reference, hypothesis, uem=uem)

            if abs(precision) < target_precision:
                # precision is not high enough: try higher thresholds
                lower_alpha = current_alpha
            else:
                # precision is fine: try lower thresholds, and remember
                # this one if it improves recall
                upper_alpha = current_alpha
                current_recall = abs(recall)
                if current_recall > best_recall:
                    best_recall = current_recall
                    best_alpha = current_alpha

        task = 'overlap_speech_detection'
        metric_name = f'{task}/recall@{target_precision:.2f}precision'
        return {
            metric_name: {'minimize': False, 'value': best_recall},
            f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
    def apply_iter(self, current_file, hypothesis,
                   partial=True, device=None,
                   log_dir=None):
        """Yield re-segmentation results for each epoch

        Parameters
        ----------
        current_file : pyannote.database dict
            Currently processed file
        hypothesis : pyannote.core.Annotation
            Input segmentation
        partial : bool, optional
            Set to False to only yield final re-segmentation.
            Set to True to yield re-segmentation after each epoch.
        device : torch.device, optional
            Defaults to torch.device('cpu')
        log_dir : str, optional
            Path to log directory. Defaults to a new temporary directory.
            `fit_iter` is given the "{log_dir}/{uri}" sub-directory.

        Yields
        ------
        resegmented : pyannote.core.Annotation
            Resegmentation results after each epoch.
        """

        device = torch.device('cpu') if device is None else device

        # work on a copy whose annotation is the input segmentation
        current_file = dict(current_file)
        current_file['annotation'] = hypothesis

        # set `per_epoch` attribute to current file annotated duration
        self.per_epoch = get_annotated(current_file).duration()

        # number of speakers + 1 for non-speech
        self.n_classes_ = len(hypothesis.labels()) + 1

        # NOTE(review): `self.n_classes` is read here while `n_classes_`
        # was set just above -- presumably an alias defined on the class;
        # confirm.
        model = StackedRNN(self.precomputed.dimension(), self.n_classes,
                           rnn=self.rnn, recurrent=self.recurrent,
                           linear=self.linear,
                           bidirectional=self.bidirectional,
                           logsoftmax=True)

        # initialize dummy protocol that has only one file
        protocol = self.get_dummy_protocol(current_file)

        if log_dir is None:
            log_dir = tempfile.mkdtemp()
        uri = get_unique_identifier(current_file)
        # one sub-directory per file (the "f" prefix used to sit inside the
        # quotes, which turned the path into the literal 'f{log_dir}/{uri}')
        log_dir = f'{log_dir}/{uri}'

        # only the scores of the last `ensemble` iterations are kept
        self.scores_ = collections.deque([], maxlen=self.ensemble)

        iterations = self.fit_iter(
            model, self.precomputed, protocol, subset='train',
            restart=0, epochs=self.epochs, learning_rate='auto',
            get_optimizer=SGD, get_scheduler=ConstantScheduler,
            log_dir=log_dir, device=device)

        for i, iteration in enumerate(iterations):

            # if 'partial', compute scores for every iteration
            # if not, compute scores for last 'ensemble' iterations only
            if partial or (i + 1 > self.epochs - self.ensemble):
                iteration_score = self._score(iteration['model'],
                                              current_file, device=device)
                self.scores_.append(iteration_score)

            # if 'partial', generate (and yield) hypothesis
            if partial:
                hypothesis = self._decode(self.scores_)
                yield hypothesis

        # generate (and yield) final hypothesis in case it's not already
        if not partial:
            hypothesis = self._decode(self.scores_)
            yield hypothesis
    def _validate_epoch_verification(self,
                                     epoch,
                                     protocol_name,
                                     subset='development',
                                     validation_data=None):
        """Run a speaker verification experiment using model at `epoch`

        One embedding is computed per enrolment and per trial (average over
        sliding windows); trials are scored with the distance between
        trial and enrolment embeddings, and the equal error rate of these
        scores is reported.

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of speaker verification protocol
        subset : {'train', 'development', 'test'}, optional
            Name of subset.
        validation_data : provided by `validate_init`
            Not used by this method.

        Returns
        -------
        metrics : dict
            {'EER': {'minimize': True, 'value': <equal error rate>}}
        """

        # load current model
        model = self.load_model(epoch).to(self.device)
        model.eval()

        # use user-provided --duration when available
        # otherwise use 'duration' used for training
        duration = (self.task_.duration if self.duration is None
                    else self.duration)
        min_duration = None

        # if 'duration' is still None, it means that
        # network was trained with variable lengths
        if duration is None:
            duration = self.task_.max_duration
            min_duration = self.task_.min_duration

        # windows overlap by half their duration
        step = .5 * duration

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(model,
                                               self.feature_extraction_,
                                               duration=duration,
                                               step=step,
                                               min_duration=min_duration,
                                               batch_size=self.batch_size,
                                               device=self.device)

        protocol = get_protocol(protocol_name,
                                progress=False,
                                preprocessors=self.preprocessors_)

        # enrolment_models maps model_id to its (1, dimension) embedding,
        # enrolment_khashes maps enrolment data hash to model_id
        enrolment_models, enrolment_khashes = {}, {}
        enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
        for enrolment in enrolments:
            windows = sequence_embedding.apply(enrolment,
                                               crop=enrolment['enrol_with'])
            model_id = enrolment['model_id']
            enrolment_models[model_id] = np.mean(np.stack(windows),
                                                 axis=0, keepdims=True)

            # in some specific speaker verification protocols,
            # enrolment data may be  used later as trial data.
            # therefore, we cache information about enrolment data
            # to speed things up by reusing the enrolment as trial
            key = hash((get_unique_identifier(enrolment),
                        tuple(enrolment['enrol_with'])))
            enrolment_khashes[key] = model_id

        trial_models = {}
        trials = getattr(protocol, '{0}_trial'.format(subset))()
        y_true, y_pred = [], []
        for trial in trials:
            model_id = trial['model_id']

            key = hash((get_unique_identifier(trial),
                        tuple(trial['try_with'])))

            if key in enrolment_khashes:
                # re-use enrolment model whenever possible
                trial_embedding = enrolment_models[enrolment_khashes[key]]

            elif key in trial_models:
                # re-use trial model whenever possible
                trial_embedding = trial_models[key]

            else:
                windows = sequence_embedding.apply(trial,
                                                   crop=trial['try_with'])
                trial_embedding = np.mean(windows, axis=0, keepdims=True)
                # cache trial model for later re-use
                trial_models[key] = trial_embedding

            # both operands hold a single row: keep the only distance
            distance = cdist(enrolment_models[model_id],
                             trial_embedding,
                             metric=self.metric)[0, 0]
            y_pred.append(distance)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true),
                                 np.array(y_pred),
                                 distances=True)

        return {'EER': {'minimize': True, 'value': eer}}
    def _validate_epoch_verification(self, epoch, protocol_name,
                                     subset='development',
                                     validation_data=None):
        """Perform a speaker verification experiment using model at `epoch`

        Each enrolment and each trial is summarized by the mean of the
        embeddings returned by `SequenceEmbedding.apply` on the relevant
        part of the file. Every trial is then scored by its `self.metric`
        distance to the enrolment model it claims, and the equal error rate
        is derived from those distances.

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of speaker verification protocol
        subset : {'train', 'development', 'test'}, optional
            Name of subset. Defaults to 'development'.
        validation_data : provided by `validate_init`
            Not used by this method.

        Returns
        -------
        metrics : dict
            {'EER': {'minimize': True, 'value': <equal error rate>}}
        """

        # load current model
        model = self.load_model(epoch).to(self.device)
        model.eval()

        # use user-provided --duration when available
        # otherwise use 'duration' used for training
        if self.duration is None:
            duration = self.task_.duration
        else:
            duration = self.duration
        min_duration = None

        # if 'duration' is still None, it means that
        # network was trained with variable lengths
        if duration is None:
            duration = self.task_.max_duration
            min_duration = self.task_.min_duration

        # consecutive windows overlap by half of their duration
        step = .5 * duration

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        # initialize embedding extraction
        sequence_embedding = SequenceEmbedding(
            model, self.feature_extraction_, duration=duration,
            step=step, min_duration=min_duration,
            batch_size=self.batch_size, device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        # model_id --> enrolment embedding, shape (1, dimension)
        enrolment_models = {}
        # (uri, enrolment segments) --> model_id
        enrolment_keys = {}
        for enrolment in getattr(protocol, f'{subset}_enrolment')():
            data = sequence_embedding.apply(enrolment,
                                            crop=enrolment['enrol_with'])
            model_id = enrolment['model_id']
            enrolment_models[model_id] = np.mean(np.stack(data), axis=0,
                                                 keepdims=True)

            # in some specific speaker verification protocols,
            # enrolment data may be used later as trial data.
            # therefore, we cache information about enrolment data
            # to speed things up by reusing the enrolment as trial.
            # the tuple itself is the key (rather than its `hash`) so that
            # two different excerpts can never be mistaken for one another
            # because of a hash collision.
            key = (get_unique_identifier(enrolment),
                   tuple(enrolment['enrol_with']))
            enrolment_keys[key] = model_id

        # (uri, trial segments) --> trial embedding, shape (1, dimension)
        trial_models = {}
        y_true, y_pred = [], []
        for trial in getattr(protocol, f'{subset}_trial')():
            model_id = trial['model_id']

            key = (get_unique_identifier(trial), tuple(trial['try_with']))

            # re-use enrolment model whenever possible
            if key in enrolment_keys:
                embedding = enrolment_models[enrolment_keys[key]]

            # re-use trial model whenever possible
            elif key in trial_models:
                embedding = trial_models[key]

            else:
                data = sequence_embedding.apply(trial, crop=trial['try_with'])
                embedding = np.mean(data, axis=0, keepdims=True)
                # cache trial model for later re-use
                trial_models[key] = embedding

            # both operands hold a single row, hence the [0, 0]
            distance = cdist(enrolment_models[model_id], embedding,
                             metric=self.metric)[0, 0]
            y_pred.append(distance)
            y_true.append(trial['reference'])

        _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                                 distances=True)

        return {'EER': {'minimize': True, 'value': eer}}
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):
        """Measure the coverage reachable at `self.purity` with model at `epoch`

        Scores are computed once for every file of the subset, then ten
        bisection steps over the `alpha` parameter of `Peak` (searched in
        [0, 1]) look for the value giving the highest coverage among those
        whose purity is at least `self.purity`.

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of the protocol passed to `get_protocol`.
        subset : str, optional
            Name of the protocol subset to iterate on.
            Defaults to 'development'.
        validation_data : optional
            Not used by this method.

        Returns
        -------
        metrics : dict
            Two entries prefixed with 'speaker_change_detection/': the best
            coverage found (to be maximized) and the `alpha` threshold that
            reached it.
        """

        target_purity = self.purity

        # model for the requested epoch, switched to evaluation mode
        network = self.load_model(epoch).to(self.device)
        network.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        window = self.task_.duration
        hop = .25 * window
        labeler = SequenceLabeling(
            network, self.feature_extraction_, duration=window,
            step=hop, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        # scores do not depend on alpha: compute them only once per file
        scores_by_uri = {}
        for current_file in getattr(protocol, subset)():
            key = get_unique_identifier(current_file)
            scores_by_uri[key] = labeler.apply(current_file)

        # bisection on alpha: maximize coverage while keeping purity at
        # or above `target_purity`
        low, high = 0., 1.
        best_alpha = .5 * (low + high)
        best_coverage = 0.

        for _ in range(10):
            alpha = .5 * (low + high)
            peak = Peak(alpha=alpha, min_duration=0.0,
                        log_scale=network.logsoftmax)
            metric = DiarizationPurityCoverageFMeasure()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            for current_file in getattr(protocol, subset)():
                reference = current_file['annotation']
                key = get_unique_identifier(current_file)
                detected = peak.apply(scores_by_uri[key], dimension=1)
                hypothesis = detected.to_annotation()
                metric(reference, hypothesis,
                       uem=get_annotated(current_file))

            purity, coverage, _ = metric.compute_metrics()

            # purity too low: continue the search in the lower half
            if purity < target_purity:
                high = alpha
                continue

            # purity is good enough: continue the search in the upper half
            low = alpha
            if coverage > best_coverage:
                best_coverage, best_alpha = coverage, alpha

        task = 'speaker_change_detection'
        coverage_key = f'{task}/coverage@{target_purity:.2f}purity'
        threshold_key = f'{task}/threshold'
        return {
            coverage_key: {'minimize': False, 'value': best_coverage},
            threshold_key: {'minimize': 'NA', 'value': best_alpha}}
    from pyannote.metrics.diarization import GreedyDiarizationErrorRate
    metric1 = GreedyDiarizationErrorRate(parallel=False)
    metric2 = GreedyDiarizationErrorRate(parallel=False,
                                         collar=0.500,
                                         skip_overlap=True)
    metric3 = GreedyDiarizationErrorRate(parallel=False,
                                         collar=0.500,
                                         skip_overlap=False)

    from optimize_cluster import speaker_diarization
    from pyannote.audio.features import Precomputed

    file_list = []
    for current_file in getattr(protocol, subset)():
        uri = get_unique_identifier(current_file).split('/')[1]
        hypothesis = diarization_res[uri]
        reference = current_file['annotation']
        current_file['prediction'] = hypothesis
        file_list.append(current_file)
        uem = get_annotated(current_file)
        metric1(reference, hypothesis, uem=uem)
        metric2(reference, hypothesis, uem=uem)
        metric3(reference, hypothesis, uem=uem)

    print(abs(metric1))
    print(abs(metric2))
    print(abs(metric3))

    config_yml = arguments['<config_yml>']
    models_dir = arguments['<models_dir>']
Beispiel #22
0
    def _load_metadata(self, protocol, subset: Subset = "train") -> float:
        """Gather per-file metadata and label inventories for a protocol subset

        Iterates once over every file of `subset`, crops its annotation and
        annotated regions to the file duration, and fills in the attributes
        listed below. Files that have no annotated segment strictly longer
        than `self.duration` are left out of `data_`.

        Parameters
        ----------
        protocol
            Protocol object exposing `subset` as a method that returns an
            iterable of files.
        subset : {'train', 'development', 'test'}, optional
            Name of the subset to load. Defaults to 'train'.

        Attributes
        ----------
        data_ : dict
            Maps each file unique identifier to

            {'segments': <list of annotated segments>,
             'duration': <total duration of annotated segments>,
             'current_file': <protocol dictionary>,
             'y': <value returned by `initialize_y`>}

        segment_labels_ : list
            Sorted list of (unique) labels in protocol.

        file_labels_ : dict of list
            Sorted lists of (unique) file labels in protocol

        Returns
        -------
        duration : float
            Total duration of the kept annotated segments.
        """

        self.data_ = {}
        segment_labels, file_labels = set(), dict()

        # values of these types are not file-level labels
        not_a_label = (Annotation, Timeline, SlidingWindowFeature)

        # single pass over all files
        files = getattr(protocol, subset)()
        for current_file in tqdm(files, desc="Loading labels", unit="file"):

            # restrict annotated/annotation to the actual extent of the file
            support = Segment(start=0, end=current_file["duration"])
            annotated = get_annotated(current_file)
            current_file["annotated"] = annotated.crop(
                support, mode="intersection")
            annotation = current_file["annotation"]
            current_file["annotation"] = annotation.crop(
                support, mode="intersection")

            # accumulate segment labels
            segment_labels.update(current_file["annotation"].labels())

            # accumulate file labels, one set per key
            for key, value in current_file.items():
                if not isinstance(value, not_a_label):
                    file_labels.setdefault(key, set()).add(value)

            # only keep annotated segments strictly longer than
            # `self.duration`
            segments = [
                segment
                for segment in current_file["annotated"]
                if segment.duration > self.duration
            ]

            # when at least one segment survives, register the file along
            # with the total duration of its surviving segments
            if segments:
                self.data_[get_unique_identifier(current_file)] = {
                    "segments": segments,
                    "duration": sum(s.duration for s in segments),
                    "current_file": current_file,
                }

        self.file_labels_ = {
            key: sorted(values) for key, values in file_labels.items()
        }
        self.segment_labels_ = sorted(segment_labels)

        # second pass, done after the label inventories above are complete
        for uri in list(self.data_):
            current_file = self.data_[uri]["current_file"]
            targets = self.initialize_y(current_file)
            self.data_[uri]["y"] = targets
            if self.mask is not None:
                current_file[self.mask] = current_file[self.mask].align(
                    targets)

        return sum(datum["duration"] for datum in self.data_.values())
Beispiel #23
0
    def apply_iter(self,
                   current_file,
                   hypothesis,
                   partial=True,
                   device=None,
                   log_dir=None):
        """Yield re-segmentation results for each epoch

        A fresh `StackedRNN` is trained on a one-file protocol that uses
        `hypothesis` as annotation. Scores of (at most) the last
        `self.ensemble` scored iterations are kept and decoded into a new
        hypothesis.

        Parameters
        ----------
        current_file : pyannote.database dict
            Currently processed file. It is shallow-copied, so the caller's
            dictionary is not modified.
        hypothesis : pyannote.core.Annotation
            Input segmentation
        partial : bool, optional
            Set to False to only yield final re-segmentation.
            Set to True to yield re-segmentation after each epoch.
        device : torch.device, optional
            Defaults to torch.device('cpu')
        log_dir : str, optional
            Path to log directory. Defaults to a new temporary directory.
            Logs are written to its `{uri}` sub-directory, where `uri` is
            the unique identifier of `current_file`.

        Yields
        ------
        resegmented : pyannote.core.Annotation
            Resegmentation results after each epoch.
        """

        device = torch.device('cpu') if device is None else device

        # work on a copy whose annotation is the input hypothesis
        current_file = dict(current_file)
        current_file['annotation'] = hypothesis

        # set `per_epoch` attribute to current file annotated duration
        self.per_epoch = get_annotated(current_file).duration()

        # number of speakers + 1 for non-speech
        self.n_classes_ = len(hypothesis.labels()) + 1

        # NOTE(review): `n_classes_` is set above but `n_classes` is read
        # here -- presumably a property defined elsewhere in the class
        # exposes it; confirm.
        model = StackedRNN(self.precomputed.dimension(),
                           self.n_classes,
                           rnn=self.rnn,
                           recurrent=self.recurrent,
                           linear=self.linear,
                           bidirectional=self.bidirectional,
                           logsoftmax=True)

        # initialize dummy protocol that has only one file
        protocol = self.get_dummy_protocol(current_file)

        if log_dir is None:
            log_dir = tempfile.mkdtemp()
        uri = get_unique_identifier(current_file)
        # one sub-directory per file. The `f` prefix used to sit inside the
        # quotes, which sent every run to the literal 'f{log_dir}/{uri}'
        # path and left the requested/temporary directory unused.
        log_dir = f'{log_dir}/{uri}'

        # only the scores of the `ensemble` most recent scored iterations
        # are kept
        self.scores_ = collections.deque([], maxlen=self.ensemble)

        iterations = self.fit_iter(model,
                                   self.precomputed,
                                   protocol,
                                   subset='train',
                                   restart=0,
                                   epochs=self.epochs,
                                   learning_rate='auto',
                                   get_optimizer=SGD,
                                   get_scheduler=ConstantScheduler,
                                   log_dir=log_dir,
                                   device=device)

        for i, iteration in enumerate(iterations):

            # if 'partial', compute scores for every iteration
            # if not, compute scores for last 'ensemble' iterations only
            if partial or (i + 1 > self.epochs - self.ensemble):
                iteration_score = self._score(iteration['model'],
                                              current_file,
                                              device=device)
                self.scores_.append(iteration_score)

            # if 'partial', generate (and yield) hypothesis
            if partial:
                hypothesis = self._decode(self.scores_)
                yield hypothesis

        # generate (and yield) final hypothesis in case it's not already
        if not partial:
            hypothesis = self._decode(self.scores_)
            yield hypothesis
Beispiel #24
0
    def train(self, current_file, batch_size=32):
        """Train the configured architecture on the sequences of one file

        Sequences are extracted from `current_file` with a
        `RealignmentBatchGenerator`, held in memory, and fed to
        `SequenceLabeling.train` through an endless batch generator.

        Parameters
        ----------
        current_file : dict
            File to train on. Its 'features' entry is overwritten with the
            output of `self.feature_precomputed`, and its 'annotated' entry
            is read to derive the number of steps per epoch.
        batch_size : int, optional
            Number of sequences per training batch. Defaults to 32.

        Returns
        -------
        Whatever `SequenceLabeling.train` returns.

        Raises
        ------
        ValueError
            If `batch_size` is not positive, or if no training sequence
            could be extracted from `current_file` (in both cases the batch
            generator would otherwise loop forever without yielding).
        """

        def generator(xs, ys, batch_size, shuffle=True):
            """Endlessly yield (x, y) batches of exactly `batch_size` items

            The visiting order is drawn once (shuffled when `shuffle` is
            True) and then cycled through indefinitely.
            """
            order = list(range(len(xs)))
            if shuffle:
                random.shuffle(order)

            # `pending` deliberately survives from one pass over `order` to
            # the next. Resetting it at every pass would (a) never yield
            # anything when there are fewer samples than `batch_size`, and
            # (b) drop the very same trailing samples at every pass.
            pending = []
            while True:
                for index in order:
                    pending.append(index)
                    if len(pending) == batch_size:
                        xbatch = np.vstack([xs[i] for i in pending])
                        ybatch = np.vstack([ys[i] for i in pending])
                        pending = []
                        yield xbatch, ybatch

        if batch_size < 1:
            raise ValueError(
                f'batch_size must be positive (got {batch_size}).')

        duration = self.config_['sequences']['duration']
        step = self.config_['sequences']['step']

        current_file['features'] = self.feature_precomputed(current_file)
        realignment_generator = RealignmentBatchGenerator(duration=duration,
                                                          step=step,
                                                          batch_size=1,
                                                          source=self.source)

        # the realignment generator is asked for batches of one sequence:
        # collect them all so that they can be re-batched and cycled through
        xys = list(realignment_generator.from_file(current_file))
        xs = [x for x, _ in xys]
        ys = [y for _, y in xys]

        if not xs:
            uri = get_unique_identifier(current_file)
            raise ValueError(
                f'No training sequence could be extracted from "{uri}".')

        input_shape = realignment_generator.input_shape
        n_classes = realignment_generator.n_classes

        # architecture
        architecture_name = self.config_['architecture']['name']
        models = __import__('pyannote.audio.labeling.models',
                            fromlist=[architecture_name])
        Architecture = getattr(models, architecture_name)
        params = self.config_['architecture'].get('params', {})
        params['n_classes'] = n_classes
        self.architecture_ = Architecture(**params)

        # total annotated duration, divided by `step` and by the batch
        # size, rounded up
        train_total = sum(
            end - start for start, end in current_file['annotated'])
        steps_per_epoch = int(np.ceil((train_total / step) / batch_size))

        # only log when a models directory is configured
        if self.models_dir is None:
            log_dir = None
        else:
            log_dir = self.models_dir + get_unique_identifier(current_file)

        return SequenceLabeling.train(input_shape,
                                      self.architecture_,
                                      generator(xs,
                                                ys,
                                                batch_size,
                                                shuffle=True),
                                      steps_per_epoch,
                                      self.num_epoch,
                                      optimizer=SSMORMS3(),
                                      log_dir=log_dir)
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):
        """Tune the detection threshold of the model at `epoch`

        Scores are computed once for every file of the subset. A bounded
        scalar minimization (at most 10 iterations, threshold searched in
        [0, 1]) then looks for the binarization threshold that gives the
        lowest detection error rate.

        Parameters
        ----------
        epoch : int
            Epoch to validate.
        protocol_name : str
            Name of the protocol passed to `get_protocol`.
        subset : str, optional
            Name of the protocol subset to iterate on.
            Defaults to 'development'.
        validation_data : optional
            Not used by this method.

        Returns
        -------
        metrics : dict
            Two entries prefixed with 'speech_activity_detection/': the
            lowest detection error rate found (to be minimized) and the
            threshold that reached it.
        """

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=step, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        # scores do not depend on the threshold: compute them only once
        predictions = {}
        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            scores = sequence_labeling.apply(current_file)

            # detection score is one minus the score of dimension 0
            # (presumably the non-speech class -- TODO confirm), after
            # undoing the log when the model outputs log-probabilities
            first_dimension = scores.data[:, 0]
            if model.logsoftmax:
                first_dimension = np.exp(first_dimension)

            predictions[uri] = SlidingWindowFeature(
                1. - first_dimension, scores.sliding_window)

        def fun(threshold):
            """Return the detection error rate obtained with `threshold`"""

            binarizer = Binarize(onset=threshold,
                                 offset=threshold,
                                 log_scale=False)

            # fresh accumulator for each candidate threshold
            metric = DetectionErrorRate()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            for current_file in getattr(protocol, subset)():
                uri = get_unique_identifier(current_file)
                hypothesis = binarizer.apply(
                    predictions[uri], dimension=0).to_annotation()
                reference = current_file['annotation']
                uem = get_annotated(current_file)
                metric(reference, hypothesis, uem=uem)

            return abs(metric)

        res = scipy.optimize.minimize_scalar(
            fun, bounds=(0., 1.), method='bounded', options={'maxiter': 10})

        return {
            'speech_activity_detection/error': {'minimize': True,
                                                'value': res.fun},
            'speech_activity_detection/threshold': {'minimize': 'NA',
                                                    'value': res.x}}