Code Example #1
    def apply(self, current_file):

        # Speech Activity Detection

        # get raw SAD scores
        soft_sad = self.sad_(current_file)

        # check once and for all whether SAD scores are log-scaled
        if not hasattr(self, 'sad_log_scale_'):
            if np.nanmean(soft_sad.data) < 0:
                self.sad_log_scale_ = True
            else:
                self.sad_log_scale_ = False

        # get SAD probability
        prob_sad = np.exp(soft_sad.data) if self.sad_log_scale_ \
                   else soft_sad.data

        # support both non-speech/speech & non-speech/single/overlap
        prob_sad = 1. - prob_sad[:, 0]
        prob_sad = SlidingWindowFeature(prob_sad, soft_sad.sliding_window)

        # binarization
        hard_sad = self.sad_binarize_.apply(prob_sad)

        # Speaker Change Detection

        # get raw SCD scores
        soft_scd = self.scd_(current_file)

        # check once and for all whether SCD scores are log-scaled
        if not hasattr(self, 'scd_log_scale_'):
            if np.nanmean(soft_scd.data) < 0:
                self.scd_log_scale_ = True
            else:
                self.scd_log_scale_ = False

        # get SCD probability
        prob_scd = np.exp(soft_scd.data) if self.scd_log_scale_ \
                   else soft_scd.data

        # take the final dimension
        # (in order to support both classification and regression scores)
        prob_scd = prob_scd[:, -1]
        prob_scd = SlidingWindowFeature(prob_scd, soft_scd.sliding_window)

        # peak detection
        hard_scd = self.scd_peak_.apply(prob_scd)

        speech_turns = hard_scd.crop(hard_sad)

        # only process the annotated part
        speech_turns = speech_turns.crop(get_annotated(current_file))

        return speech_turns
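
A note on the log-scale heuristic above: log-probabilities are never positive, so a negative mean score signals log-scaled output. A minimal numpy sketch of the score-to-probability conversion, with made-up scores and the SlidingWindowFeature wrapper left out:

import numpy as np

# made-up frame-wise scores for two classes (non-speech, speech),
# stored as log-probabilities, so every value is non-positive
log_scores = np.log([[0.9, 0.1],
                     [0.2, 0.8],
                     [0.4, 0.6]])

# same heuristic as above: log-scaled scores average below zero
log_scale = np.nanmean(log_scores) < 0  # True here

prob = np.exp(log_scores) if log_scale else log_scores

# P(speech) = 1 - P(non-speech) works for both non-speech/speech and
# non-speech/single-speaker/overlap label sets
prob_speech = 1. - prob[:, 0]
print(prob_speech)  # [0.1 0.8 0.6]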
Code Example #2
File: base.py Project: instinct2k18/pyannote-audio
    def sliding_samples(self):

        uris = list(self.data_)
        durations = np.array([self.data_[uri]['duration'] for uri in uris])
        probabilities = durations / np.sum(durations)

        sliding_segments = SlidingSegments(duration=self.duration,
                                           step=self.duration,
                                           source='annotated')

        while True:

            np.random.shuffle(uris)

            # loop on all files
            for uri in uris:
                datum = self.data_[uri]

                # make a copy of current file
                current_file = dict(datum['current_file'])

                # randomly shift 'annotated' segments start time so that
                # we avoid generating exactly the same subsequence twice
                annotated = Timeline(
                    [Segment(s.start + np.random.random() * self.duration,
                             s.end) for s in get_annotated(current_file)])
                current_file['annotated'] = annotated

                if self.shuffle:
                    samples = []

                for sequence in sliding_segments.from_file(current_file):

                    X = self.precomputed.crop(current_file,
                                              sequence, mode='center',
                                              fixed=self.duration)

                    y = datum['y'].crop(sequence, mode='center',
                                        fixed=self.duration)

                    sample = {'X': X, 'y': np.squeeze(y)}

                    if self.shuffle:
                        samples.append(sample)
                    else:
                        yield sample

                if self.shuffle:
                    np.random.shuffle(samples)
                    for sample in samples:
                        yield sample
Code Example #3
File: base.py Project: instinct2k18/pyannote-audio
    def objective(self, protocol, subset='development', learning=False):
        """Compute the value of the objective function (the lower, the better)

        Parameters
        ----------
        protocol : pyannote.database.Protocol
            Protocol on which to compute the value of the objective function.
        subset : {'train', 'development', 'test'}, optional
            Subset on which to compute the value of the objective function.
            Defaults to 'development'.
        learning : bool, optional
            Set to True to indicate that the pipeline is being tuned and that
            the reference can be passed safely to the pipeline. Default
            behavior is to remove it from `current_file`. This is useful for
            pipelines that may take a very long time to complete when the
            hypothesis is completely wrong (e.g. too many segments to cluster).

        Returns
        -------
        metric : float
            Value of the objective function (the lower, the better).
        """
        metric = self.get_tune_metric()
        value, duration = [], []
        # NOTE -- embarrassingly parallel
        # TODO -- parallelize this
        for current_file in getattr(protocol, subset)():

            uem = get_annotated(current_file)

            if learning:
                reference = current_file['annotation']
            else:
                reference = current_file.pop('annotation')

            hypothesis = self.apply(current_file)

            if hypothesis is None:
                return 1.

            metric_value = metric(reference, hypothesis, uem=uem)
            value.append(metric_value)
            duration.append(uem.duration())

        # support for pyannote.metrics
        if hasattr(metric, '__abs__'):
            return abs(metric)
        # support for any other metric
        else:
            return np.average(value, weights=duration)
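
The final duration-weighted average makes longer files count proportionally more. A toy illustration with hypothetical per-file values:

import numpy as np

# hypothetical per-file metric values and annotated durations (seconds)
value = [0.25, 0.10, 0.40]
duration = [600., 1800., 300.]

# the 30-minute file weighs three times as much as the 10-minute one
print(np.average(value, weights=duration))  # 0.1666...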
Code Example #4
    def apply(self, current_file):

        # initial segmentation
        speech_turns = super().apply(current_file)

        # initialize the hypothesized annotation
        hypothesis = Annotation(uri=current_file['uri'])
        if len(speech_turns) < 1:
            return hypothesis

        # this only happens during pipeline training
        if 'annotation' in current_file:
            # number of speech turns in reference
            reference = current_file['annotation']
            n_turns_true = len(list(reference.itertracks()))

            # number of speech turns in hypothesis
            uem = get_annotated(current_file)
            n_turns_pred = len(speech_turns.crop(uem))

            # don't even bother trying to cluster those speech turns
            # as there are too many of those...
            if n_turns_pred > 20 * n_turns_true:
                return None

        # get raw (sliding window) embeddings
        emb = self.emb_(current_file)

        # get one embedding per speech turn
        # FIXME don't l2_normalize for any metric
        fX = l2_normalize(np.vstack(
            [np.sum(emb.crop(t, mode='loose'), axis=0) for t in speech_turns]))

        # apply clustering
        try:
            affinity = -squareform(pdist(fX, metric=self.metric))
            clusters = self.cls_.fit_predict(affinity)
        except MemoryError as e:
            # cannot compute affinity propagation
            return None

        for speech_turn, cluster in zip(speech_turns, clusters):
            # HACK find why fit_predict returns NaN sometimes and fix it.
            cluster = -1 if np.isnan(cluster) else cluster
            hypothesis[speech_turn] = cluster

        return hypothesis
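
For context, this is what the affinity computation boils down to: each turn is an L2-normalized sum of frame embeddings, and affinity propagation receives negated pairwise distances as similarities. A self-contained sketch with random stand-in embeddings; l2_normalize is re-implemented here, assuming behavior equivalent to pyannote.audio's helper:

import numpy as np
from scipy.spatial.distance import pdist, squareform

def l2_normalize(X):
    # scale each row to unit length (stand-in for pyannote.audio's helper)
    norm = np.linalg.norm(X, axis=1, keepdims=True)
    return X / np.maximum(norm, 1e-12)

# hypothetical per-turn embeddings: 4 speech turns, 8 dimensions
fX = l2_normalize(np.random.randn(4, 8))

# affinity propagation expects similarities, hence the negation
affinity = -squareform(pdist(fX, metric='cosine'))
print(affinity.shape)  # (4, 4)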
Code Example #5
        def fun(threshold):

            binarizer = Binarize(onset=threshold,
                                 offset=threshold,
                                 log_scale=False)

            protocol = get_protocol(protocol_name, progress=False,
                                    preprocessors=self.preprocessors_)

            metric = DetectionErrorRate()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            file_generator = getattr(protocol, subset)()
            for current_file in file_generator:

                uri = get_unique_identifier(current_file)
                hypothesis = binarizer.apply(
                    predictions[uri], dimension=0).to_annotation()
                reference = current_file['annotation']
                uem = get_annotated(current_file)
                _ = metric(reference, hypothesis, uem=uem)

            return abs(metric)
Code Example #6
File: base.py Project: herbert-wu/pyannote-audio
def apply_pretrained(validate_dir: Path,
                     protocol_name: str,
                     subset: Optional[str] = "test",
                     duration: Optional[float] = None,
                     step: float = 0.25,
                     device: Optional[torch.device] = None,
                     batch_size: int = 32,
                     pretrained: Optional[str] = None,
                     Pipeline: type = None,
                     **kwargs):
    """Apply pre-trained model

    Parameters
    ----------
    validate_dir : Path
    protocol_name : `str`
    subset : 'train' | 'development' | 'test', optional
        Defaults to 'test'.
    duration : `float`, optional
    step : `float`, optional
    device : `torch.device`, optional
    batch_size : `int`, optional
    pretrained : `str`, optional
    Pipeline : `type`
    """

    if pretrained is None:
        pretrained = Pretrained(validate_dir=validate_dir,
                                duration=duration,
                                step=step,
                                batch_size=batch_size,
                                device=device)
        output_dir = validate_dir / 'apply' / f'{pretrained.epoch_:04d}'
    else:

        if pretrained in torch.hub.list('pyannote/pyannote-audio'):
            output_dir = validate_dir / pretrained
        else:
            output_dir = validate_dir

        pretrained = Wrapper(pretrained,
                             duration=duration,
                             step=step,
                             batch_size=batch_size,
                             device=device)

    params = {}
    try:
        params['classes'] = pretrained.classes
    except AttributeError:
        pass
    try:
        params['dimension'] = pretrained.dimension
    except AttributeError:
        pass

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=pretrained.sliding_window,
                              **params)

    # file generator
    protocol = get_protocol(protocol_name,
                            progress=True,
                            preprocessors=pretrained.preprocessors_)

    for current_file in getattr(protocol, subset)():
        fX = pretrained(current_file)
        precomputed.dump(current_file, fX)

    # do not proceed with the full pipeline
    # when there is no such thing for current task
    if Pipeline is None:
        return

    # instantiate pipeline
    pipeline = Pipeline(scores=output_dir)
    pipeline.instantiate(pretrained.pipeline_params_)

    # load pipeline metric (when available)
    try:
        metric = pipeline.get_metric()
    except NotImplementedError:
        metric = None

    # apply pipeline and dump output to RTTM files
    output_rttm = output_dir / f'{protocol_name}.{subset}.rttm'
    with open(output_rttm, 'w') as fp:
        for current_file in getattr(protocol, subset)():
            hypothesis = pipeline(current_file)
            pipeline.write_rttm(fp, hypothesis)

            # compute evaluation metric (when possible)
            if 'annotation' not in current_file:
                metric = None

            # compute evaluation metric (when available)
            if metric is None:
                continue

            reference = current_file['annotation']
            uem = get_annotated(current_file)
            _ = metric(reference, hypothesis, uem=uem)

    # print pipeline metric (when available)
    if metric is None:
        return

    output_eval = output_dir / f'{protocol_name}.{subset}.eval'
    with open(output_eval, 'w') as fp:
        fp.write(str(metric))
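
A hypothetical invocation of apply_pretrained; the validation directory and protocol name below are placeholders, to be replaced by whatever a real training run produced:

from pathlib import Path

# placeholder directory and protocol name, not taken from the snippet above
apply_pretrained(Path('validate/AMI.SpeakerDiarization.MixHeadset.development'),
                 'AMI.SpeakerDiarization.MixHeadset',
                 subset='test',
                 batch_size=32)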
Code Example #7
File: base.py Project: yining4869/pyannote-audio
    def initialize(self, protocol, subset='train'):
        """Gather the following information about the training subset:

        data_ : dict

            {'segments': <list of annotated segments>,
             'duration': <total duration of annotated segments>,
             'current_file': <protocol dictionary>,
             'y': <labels as numpy array>}

        databases_ : list
            Sorted list of (unique) databases in protocol.

        labels_ : list
            Sorted list of (unique) labels in protocol.
        """

        self.data_ = {}
        labels, databases = set(), set()

        # loop once on all files
        for current_file in getattr(protocol, subset)():

            # keep track of database
            database = current_file['database']
            databases.add(database)

            # keep track of unique labels
            for label in current_file['annotation'].labels():
                label = get_label_identifier(label, current_file)
                labels.add(label)

            annotated = get_annotated(current_file)

            if not self.precomputed.use_memmap:
                msg = ('Loading all precomputed features in memory. '
                       'Set "use_memmap" to True if you run out of memory.')
                warnings.warn(msg)

            segments = [s for s in annotated if s.duration > self.duration]

            # corner case where no segment is long enough
            # and we removed them all...
            if not segments:
                continue

            # total duration of label in current_file (after removal of
            # short segments).
            duration = sum(s.duration for s in segments)

            # store all these in data_ dictionary
            datum = {'segments': segments,
                     'duration': duration,
                     'current_file': current_file}
            uri = get_unique_identifier(current_file)
            self.data_[uri] = datum

        self.databases_ = sorted(databases)
        self.labels_ = sorted(labels)

        sliding_window = self.precomputed.sliding_window()
        for current_file in getattr(protocol, subset)():
            y, _ = to_numpy(current_file, self.precomputed,
                            labels=self.labels_)
            uri = get_unique_identifier(current_file)
            self.data_[uri]['y'] = SlidingWindowFeature(
                self.postprocess_y(y), sliding_window)
Code Example #8
    def apply(self,
              protocol_name: str,
              output_dir: Path,
              subset: Optional[str] = "test"):
        """Apply current best pipeline

        Parameters
        ----------
        protocol_name : `str`
            Name of pyannote.database protocol to process.
        output_dir : `Path`
            Directory where pipeline output is saved.
        subset : `str`, optional
            Subset to process. Defaults to 'test'.
        """

        # file generator
        protocol = get_protocol(protocol_name,
                                preprocessors=self.preprocessors_)

        # load pipeline metric (when available)
        try:
            metric = self.pipeline_.get_metric()
        except NotImplementedError:
            metric = None

        output_dir.mkdir(parents=True, exist_ok=True)
        output_ext = (
            output_dir /
            f"{protocol_name}.{subset}.{self.pipeline_.write_format}")
        with open(output_ext, mode="w") as fp:

            files = list(getattr(protocol, subset)())

            desc = f"Processing {protocol_name} ({subset})"
            for current_file in tqdm(iterable=files, desc=desc, unit="file"):

                # apply pipeline and dump output to file
                output = self.pipeline_(current_file)
                self.pipeline_.write(fp, output)

                # compute evaluation metric (when possible)
                reference = current_file.get("annotation", None)
                if reference is None:
                    metric = None

                # compute evaluation metric (when available)
                if metric is None:
                    continue

                uem = get_annotated(current_file)
                _ = metric(reference, output, uem=uem)

        # "latest" symbolic link
        latest = output_dir.parent / "latest"
        if latest.exists():
            latest.unlink()
        latest.symlink_to(output_dir)

        # print pipeline metric (when available)
        if metric is None:
            msg = (f"For some (possibly good) reason, the output of this "
                   f"pipeline could not be evaluated on {protocol_name}.")
            print(msg)
            return

        output_eval = output_dir / f"{protocol_name}.{subset}.eval"
        with open(output_eval, "w") as fp:
            fp.write(str(metric))
Code Example #9
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):

        target_purity = self.purity

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=step, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        # extract predictions for all files.
        predictions = {}
        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            predictions[uri] = sequence_labeling.apply(current_file)

        # dichotomic search to find alpha that maximizes coverage
        # while having at least `target_purity`

        lower_alpha = 0.
        upper_alpha = 1.
        best_alpha = .5 * (lower_alpha + upper_alpha)
        best_coverage = 0.

        for _ in range(10):
            current_alpha = .5 * (lower_alpha + upper_alpha)
            peak = Peak(alpha=current_alpha, min_duration=0.0,
                        log_scale=model.logsoftmax)
            metric = DiarizationPurityCoverageFMeasure()

            # NOTE -- embarrassingly parallel
            # TODO -- parallelize this
            for current_file in getattr(protocol, subset)():
                reference = current_file['annotation']
                uri = get_unique_identifier(current_file)
                hypothesis = peak.apply(predictions[uri], dimension=1)
                hypothesis = hypothesis.to_annotation()
                uem = get_annotated(current_file)
                metric(reference, hypothesis, uem=uem)

            purity, coverage, _ = metric.compute_metrics()

            if purity < target_purity:
                upper_alpha = current_alpha
            else:
                lower_alpha = current_alpha
                if coverage > best_coverage:
                    best_coverage = coverage
                    best_alpha = current_alpha

        task = 'speaker_change_detection'
        metric_name = f'{task}/coverage@{target_purity:.2f}purity'
        return {
            metric_name: {'minimize': False, 'value': best_coverage},
            f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
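
The ten-iteration loop is a generic dichotomic search over the threshold: assuming the constraint (here, purity) decreases as the threshold grows, it halves the feasible interval at each step and remembers the best objective (coverage) reached at a feasible threshold. A standalone sketch of the same pattern (names here are illustrative, not from pyannote):

def dichotomic_search(evaluate, target, n_iter=10):
    # `evaluate(threshold)` is assumed to return (constraint, objective),
    # with `constraint` decreasing as `threshold` increases
    lower, upper = 0., 1.
    best_threshold = .5 * (lower + upper)
    best_objective = 0.
    for _ in range(n_iter):
        current = .5 * (lower + upper)
        constraint, objective = evaluate(current)
        if constraint < target:
            # constraint not met: move towards smaller thresholds
            upper = current
        else:
            lower = current
            if objective > best_objective:
                best_objective, best_threshold = objective, current
    return best_threshold, best_objective

# toy monotone trade-off: constraint = 1 - t, objective = t
print(dichotomic_search(lambda t: (1. - t, t), target=0.3))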
Code Example #10
    def apply(self,
              protocol_name: str,
              output_dir: Path,
              subset: Optional[str] = None):
        """Apply current best pipeline

        Parameters
        ----------
        protocol_name : `str`
            Name of pyannote.database protocol to process.
        output_dir : `Path`
            Directory where pipeline output is saved.
        subset : `str`, optional
            Subset to process. Defaults to processing all subsets.
        """

        # file generator
        protocol = get_protocol(protocol_name,
                                progress=True,
                                preprocessors=self.preprocessors_)

        output_dir.mkdir(parents=True, exist_ok=False)
        if subset is None:
            path = output_dir / f'{protocol_name}.all.txt'
        else:
            path = output_dir / f'{protocol_name}.{subset}.txt'

        # initialize evaluation metric
        try:
            metric = self.pipeline_.get_metric()
        except NotImplementedError:
            metric = None
            losses = []

        with open(path, mode='w') as fp:

            if subset is None:
                files = FileFinder.protocol_file_iter(protocol)
            else:
                files = getattr(protocol, subset)()

            for current_file in files:
                output = self.pipeline_(current_file)

                # evaluate output
                if metric is None:
                    loss = self.pipeline_.loss(current_file, output)
                    losses.append(loss)

                else:
                    from pyannote.database import get_annotated
                    _ = metric(current_file['annotation'],
                               output,
                               uem=get_annotated(current_file))

                self.pipeline_.write(fp, output)

        # report evaluation metric
        if metric is None:
            loss = np.mean(losses)
            print(f'Loss = {loss:g}')
        else:
            _ = metric.report(display=True)
Code Example #11
    def sliding_samples(self):
        """Sliding window

        Returns
        -------
        samples : generator
            Generator that yields {'waveform': ..., 'y': ...} samples
            indefinitely.
        """

        uris = list(self.data_)
        durations = np.array([self.data_[uri]['duration'] for uri in uris])
        probabilities = durations / np.sum(durations)

        sliding_segments = SlidingWindow(duration=self.duration,
                                         step=self.duration)

        while True:

            # shuffle files
            np.random.shuffle(uris)

            # loop on shuffled files
            for uri in uris:

                datum = self.data_[uri]

                # make a copy of current file
                current_file = dict(datum['current_file'])

                # read waveform for the whole file
                waveform = self.raw_audio_(current_file)

                # randomly shift 'annotated' segments start time so that
                # we avoid generating exactly the same subsequence twice
                shifted_segments = [
                    Segment(s.start + np.random.random() * self.duration, s.end)
                    for s in get_annotated(current_file)]
                # deal with corner case where a shifted segment would be empty
                shifted_segments = [s for s in shifted_segments if s]
                annotated = Timeline(segments=shifted_segments)

                samples = []
                for sequence in sliding_segments(annotated):

                    X = waveform.crop(sequence, mode='center',
                                      fixed=self.duration)

                    y = datum['y'].crop(sequence, mode=self.alignment,
                                        fixed=self.duration)

                    # FIXME -- this is ugly
                    sample = {'waveform': normalize(X),
                              'y': y,
                              'database': current_file['database'],
                              'uri': current_file['uri'],
                              'audio': current_file['audio'],
                              'duration': current_file['duration'],
                    }

                    samples.append(sample)

                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
Code Example #12
def validate_helper_func(current_file, pipeline=None, metric=None):
    reference = current_file['annotation']
    uem = get_annotated(current_file)
    hypothesis = pipeline(current_file)
    return metric(reference, hypothesis, uem=uem)
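
Because this helper is a module-level function that receives everything through its arguments, it is picklable and maps cleanly over files with multiprocessing. A sketch, assuming `pipeline`, `metric` and `validation_files` are supplied by the caller (and are themselves picklable):

from functools import partial
from multiprocessing import Pool

import numpy as np

# `pipeline`, `metric`, `validation_files`: assumed to come from the caller
validate = partial(validate_helper_func, pipeline=pipeline, metric=metric)

with Pool(processes=4) as pool:
    # one per-file value per call; note that the `metric` accumulator is
    # copied into each worker, so only the returned values matter here
    values = pool.map(validate, validation_files)

# unweighted average over files (the in-place accumulator trick used in
# other examples only works in a single process)
print(np.mean(values))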
Code Example #13
    def apply_iter(self,
                   current_file,
                   hypothesis,
                   partial=True,
                   device=None,
                   log_dir=None):
        """Yield re-segmentation results for each epoch

        Parameters
        ----------
        current_file : pyannote.database dict
            Currently processed file
        hypothesis : pyannote.core.Annotation
            Input segmentation
        partial : bool, optional
            Set to False to only yield final re-segmentation.
            Set to True to yield re-segmentation after each epoch.
        device : torch.device, optional
            Defaults to torch.device('cpu')
        log_dir : str, optional
            Path to log directory.

        Yields
        ------
        resegmented : pyannote.core.Annotation
            Resegmentation results after each epoch.
        """

        device = torch.device('cpu') if device is None else device

        current_file = dict(current_file)
        current_file['annotation'] = hypothesis

        # set `per_epoch` attribute to current file annotated duration
        self.per_epoch = get_annotated(current_file).duration()

        # number of speakers + 1 for non-speech
        self.n_classes_ = len(hypothesis.labels()) + 1

        model = StackedRNN(self.precomputed.dimension(),
                           self.n_classes,
                           rnn=self.rnn,
                           recurrent=self.recurrent,
                           linear=self.linear,
                           bidirectional=self.bidirectional,
                           logsoftmax=True)

        # initialize dummy protocol that has only one file
        protocol = self.get_dummy_protocol(current_file)

        if log_dir is None:
            log_dir = tempfile.mkdtemp()
        uri = get_unique_identifier(current_file)
        log_dir = f'{log_dir}/{uri}'

        self.scores_ = collections.deque([], maxlen=self.ensemble)

        iterations = self.fit_iter(model,
                                   self.precomputed,
                                   protocol,
                                   subset='train',
                                   restart=0,
                                   epochs=self.epochs,
                                   learning_rate='auto',
                                   get_optimizer=SGD,
                                   get_scheduler=ConstantScheduler,
                                   log_dir=log_dir,
                                   device=device)

        for i, iteration in enumerate(iterations):

            # if 'partial', compute scores for every iteration
            # if not, compute scores for last 'ensemble' iterations only
            if partial or (i + 1 > self.epochs - self.ensemble):
                iteration_score = self._score(iteration['model'],
                                              current_file,
                                              device=device)
                self.scores_.append(iteration_score)

            # if 'partial', generate (and yield) hypothesis
            if partial:
                hypothesis = self._decode(self.scores_)
                yield hypothesis

        # generate (and yield) final hypothesis in case it's not already
        if not partial:
            hypothesis = self._decode(self.scores_)
            yield hypothesis
Code Example #14
    def apply_iter(self, current_file, hypothesis,
                   partial=True, device=None,
                   log_dir=None):
        """Yield re-segmentation results for each epoch

        Parameters
        ----------
        current_file : pyannote.database dict
            Currently processed file
        hypothesis : pyannote.core.Annotation
            Input segmentation
        partial : bool, optional
            Set to False to only yield final re-segmentation.
            Set to True to yield re-segmentation after each epoch.
        device : torch.device, optional
            Defaults to torch.device('cpu')
        log_dir : str, optional
            Path to log directory.

        Yields
        ------
        resegmented : pyannote.core.Annotation
            Resegmentation results after each epoch.
        """

        device = torch.device('cpu') if device is None else device

        current_file = dict(current_file)
        current_file['annotation'] = hypothesis

        # set `per_epoch` attribute to current file annotated duration
        self.per_epoch = get_annotated(current_file).duration()

        # number of speakers + 1 for non-speech
        self.n_classes_ = len(hypothesis.labels()) + 1

        model = StackedRNN(self.precomputed.dimension(), self.n_classes,
                           rnn=self.rnn, recurrent=self.recurrent,
                           linear=self.linear,
                           bidirectional=self.bidirectional,
                           logsoftmax=True)

        # initialize dummy protocol that has only one file
        protocol = self.get_dummy_protocol(current_file)

        if log_dir is None:
            log_dir = tempfile.mkdtemp()
        uri = get_unique_identifier(current_file)
        log_dir = f'{log_dir}/{uri}'

        self.scores_ = collections.deque([], maxlen=self.ensemble)

        iterations = self.fit_iter(
            model, self.precomputed, protocol, subset='train',
            restart=0, epochs=self.epochs, learning_rate='auto',
            get_optimizer=SGD, get_scheduler=ConstantScheduler,
            log_dir=log_dir, device=device)

        for i, iteration in enumerate(iterations):

            # if 'partial', compute scores for every iteration
            # if not, compute scores for last 'ensemble' iterations only
            if partial or (i + 1 > self.epochs - self.ensemble):
                iteration_score = self._score(iteration['model'],
                                              current_file, device=device)
                self.scores_.append(iteration_score)

            # if 'partial', generate (and yield) hypothesis
            if partial:
                hypothesis = self._decode(self.scores_)
                yield hypothesis

        # generate (and yield) final hypothesis in case it's not already
        if not partial:
            hypothesis = self._decode(self.scores_)
            yield hypothesis
Code Example #15
    def __call__(self, current_file: dict) -> Annotation:
        """Apply speaker diarization

        Parameters
        ----------
        current_file : `dict`
            File as provided by a pyannote.database protocol.

        Returns
        -------
        hypothesis : `pyannote.core.Annotation`
            Speaker diarization output.
        """

        # segmentation into speech turns
        speech_turns = self.speech_turn_segmentation(current_file)

        # some files are only partially annotated and therefore one cannot
        # evaluate speaker diarization results on the whole file.
        # this option simply avoids trying to cluster those
        # (potentially messy) un-annotated regions by focusing only on
        # speech turns contained in the annotated regions.
        if self.evaluation_only:
            annotated = get_annotated(current_file)
            speech_turns = speech_turns.crop(annotated, mode='intersection')

        # in case there is one speech turn or less, there is no need to apply
        # any kind of clustering approach.
        if len(speech_turns) < 2:
            return speech_turns

        # split short/long speech turns. the idea is to first cluster long
        # speech turns (i.e. those for which we can trust embeddings) and then
        # assign each speech turn to the closest cluster.
        long_speech_turns = speech_turns.empty()
        shrt_speech_turns = speech_turns.empty()
        for segment, track, label in speech_turns.itertracks(yield_label=True):
            if segment.duration < self.min_duration:
                shrt_speech_turns[segment, track] = label
            else:
                long_speech_turns[segment, track] = label

        # in case there are no long speech turn to cluster, we return the
        # original speech turns (= shrt_speech_turns)
        if len(long_speech_turns) < 1:
            return speech_turns

        # first: cluster long speech turns
        long_speech_turns = self.speech_turn_clustering(current_file,
                                                        long_speech_turns)

        # then: assign short speech turns to clusters
        long_speech_turns.rename_labels(generator='string', copy=False)

        if len(shrt_speech_turns) > 0:
            shrt_speech_turns.rename_labels(generator='int', copy=False)
            shrt_speech_turns = self.speech_turn_assignment(current_file,
                                                            shrt_speech_turns,
                                                            long_speech_turns)
        # merge short/long speech turns
        return long_speech_turns.update(
            shrt_speech_turns, copy=False).support(collar=0.)
Code Example #16
        def objective(trial: Trial) -> float:
            """Compute objective value

            Parameters
            ----------
            trial : `Trial`
                Current trial

            Returns
            -------
            loss : `float`
                Loss
            """

            # use pyannote.metrics metric when available
            try:
                metric = self.pipeline.get_metric()
            except NotImplementedError:
                metric = None
                losses = []

            processing_time = []
            evaluation_time = []

            # instantiate pipeline with value suggested in current trial
            pipeline = self.pipeline.instantiate(
                self.pipeline.parameters(trial=trial))

            if show_progress is not False:
                progress_bar = tqdm(total=len(inputs), **show_progress)
                progress_bar.update(0)

            # accumulate loss for each input
            for i, input in enumerate(inputs):

                # process input with pipeline
                # (and keep track of processing time)
                before_processing = time.time()
                output = pipeline(input)
                after_processing = time.time()
                processing_time.append(after_processing - before_processing)

                # evaluate output (and keep track of evaluation time)
                before_evaluation = time.time()

                # when metric is not available, use loss method instead
                if metric is None:
                    loss = pipeline.loss(input, output)
                    losses.append(loss)

                # when metric is available, `input` is expected to be provided
                # by a `pyannote.database` protocol
                else:
                    from pyannote.database import get_annotated

                    _ = metric(input["annotation"],
                               output,
                               uem=get_annotated(input))

                after_evaluation = time.time()
                evaluation_time.append(after_evaluation - before_evaluation)

                if show_progress is not False:
                    progress_bar.update(1)

                if self.pruner is None:
                    continue

                trial.report(
                    np.mean(losses) if metric is None else abs(metric), i)
                if trial.should_prune(i):
                    raise optuna.structs.TrialPruned()

            if show_progress is not False:
                progress_bar.close()

            trial.set_user_attr("processing_time", sum(processing_time))
            trial.set_user_attr("evaluation_time", sum(evaluation_time))

            return np.mean(losses) if metric is None else abs(metric)
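
The closure matches optuna's expected objective signature (Trial -> float), so tuning reduces to handing it to a study. A minimal sketch, assuming the enclosing optimizer object is already set up:

import optuna

# `objective` is the closure defined above; lower is better, matching
# the loss it returns
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

print(study.best_params, study.best_value)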
Code Example #17
def apply_pretrained(
    validate_dir: Path,
    protocol_name: Text,
    subset: Subset = "test",
    duration: Optional[float] = None,
    step: float = 0.25,
    device: Optional[torch.device] = None,
    batch_size: int = 32,
    pretrained: Optional[str] = None,
    Pipeline: type = None,
    **kwargs,
):
    """Apply pre-trained model

    Parameters
    ----------
    validate_dir : Path
    protocol_name : `str`
    subset : 'train' | 'development' | 'test', optional
        Defaults to 'test'.
    duration : `float`, optional
    step : `float`, optional
    device : `torch.device`, optional
    batch_size : `int`, optional
    pretrained : `str`, optional
    Pipeline : `type`
    """

    if pretrained is None:
        pretrained = Pretrained(
            validate_dir=validate_dir,
            duration=duration,
            step=step,
            batch_size=batch_size,
            device=device,
        )
        output_dir = validate_dir / "apply" / f"{pretrained.epoch_:04d}"
    else:

        if pretrained in torch.hub.list("pyannote/pyannote-audio"):
            output_dir = validate_dir / pretrained
        else:
            output_dir = validate_dir

        pretrained = Wrapper(
            pretrained,
            duration=duration,
            step=step,
            batch_size=batch_size,
            device=device,
        )

    params = {}
    try:
        params["classes"] = pretrained.classes
    except AttributeError:
        pass
    try:
        params["dimension"] = pretrained.dimension
    except AttributeError:
        pass

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=pretrained.sliding_window,
                              **params)

    # file generator
    preprocessors = getattr(pretrained, "preprocessors_", dict())
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    protocol = get_protocol(protocol_name, preprocessors=preprocessors)

    files = getattr(protocol, subset)()
    for current_file in tqdm(iterable=files,
                             desc=f"{subset.title()}",
                             unit="file"):
        fX = pretrained(current_file)
        precomputed.dump(current_file, fX)

    # do not proceed with the full pipeline
    # when there is no such thing for current task
    if Pipeline is None:
        return

    # do not proceed with the full pipeline when its parameters cannot be loaded.
    # this might happen when applying a model that has not been validated yet
    try:
        pipeline_params = pretrained.pipeline_params_
    except AttributeError:
        return

    # instantiate pipeline
    pipeline = Pipeline(scores=output_dir)
    pipeline.instantiate(pipeline_params)

    # load pipeline metric (when available)
    try:
        metric = pipeline.get_metric()
    except NotImplementedError:
        metric = None

    # apply pipeline and dump output to RTTM files
    output_rttm = output_dir / f"{protocol_name}.{subset}.rttm"
    with open(output_rttm, "w") as fp:
        files = getattr(protocol, subset)()
        for current_file in tqdm(iterable=files,
                                 desc=f"{subset.title()}",
                                 unit="file"):
            hypothesis = pipeline(current_file)
            pipeline.write_rttm(fp, hypothesis)

            # compute evaluation metric (when possible)
            reference = current_file.get("annotation", None)
            if reference is None:
                metric = None

            # compute evaluation metric (when available)
            if metric is None:
                continue

            uem = get_annotated(current_file)
            _ = metric(reference, hypothesis, uem=uem)

    # print pipeline metric (when available)
    if metric is None:
        return

    output_eval = output_dir / f"{protocol_name}.{subset}.eval"
    with open(output_eval, "w") as fp:
        fp.write(str(metric))
Code Example #18
purity_list = []
coverage_list = []

for alpha in alphas:

    peak = Peak(alpha=alpha, min_duration=min_duration, log_scale=True)

    # evaluation metric (assumed: the same purity/coverage f-measure
    # used by the other examples on this page)
    metric = DiarizationPurityCoverageFMeasure()

    # loop on test files
    for test_file in protocol.test():

        # load reference annotation
        reference = test_file['annotation']
        uem = get_annotated(test_file)

        # load precomputed change scores as pyannote.core.SlidingWindowFeature
        scd_scores = precomputed(test_file)

        # detect peaks to obtain speaker changes as pyannote.core.Timeline
        hypothesis = peak.apply(scd_scores, dimension=1)

        # evaluate speaker change detection
        metric(reference, hypothesis.to_annotation(), uem=uem)

    purity, coverage, fmeasure = metric.compute_metrics()
    purity_list.append(f'{100*purity:.1f}')
    coverage_list.append(f'{100*coverage:.1f}')
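
The two lists trace a purity/coverage trade-off across the swept thresholds. A sketch of plotting that curve (the lists hold formatted strings, so they are converted back to floats first):

import matplotlib.pyplot as plt

plt.plot([float(c) for c in coverage_list],
         [float(p) for p in purity_list],
         marker='o')
plt.xlabel('coverage (%)')
plt.ylabel('purity (%)')
plt.show()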
Code Example #19
File: base.py Project: zhiqizhang/pyannote-audio
    def _sliding_samples(self):

        uris = list(self.data_)
        durations = np.array([self.data_[uri]['duration'] for uri in uris])
        probabilities = durations / np.sum(durations)
        sliding_segments = SlidingWindow(duration=self.duration,
                                         step=self.step * self.duration)

        while True:

            np.random.shuffle(uris)

            # loop on all files
            for uri in uris:

                datum = self.data_[uri]

                # make a copy of current file
                current_file = dict(datum['current_file'])

                # compute features for the whole file
                features = self.feature_extraction(current_file)

                # randomly shift 'annotated' segments start time so that
                # we avoid generating exactly the same subsequence twice
                annotated = Timeline()
                for segment in get_annotated(current_file):
                    shifted_segment = Segment(
                        segment.start + np.random.random() * self.duration,
                        segment.end)
                    if shifted_segment:
                        annotated.add(shifted_segment)

                samples = []
                for sequence in sliding_segments(annotated):

                    X = features.crop(sequence, mode='center',
                                      fixed=self.duration)
                    y = self.crop_y(datum['y'], sequence)
                    sample = {'X': X, 'y': y}

                    if self.mask is not None:

                        # extract mask for current sub-segment
                        mask = current_file[self.mask].crop(sequence,
                                                            mode='center',
                                                            fixed=self.duration)

                        # it might happen that "mask" and "y" use different
                        # sliding windows. therefore, we simply resample "mask"
                        # to match "y"
                        if len(mask) != len(y):
                            mask = scipy.signal.resample(mask, len(y), axis=0)
                        sample['mask'] = mask

                    for key, classes in self.file_labels_.items():
                        sample[key] = classes.index(current_file[key])

                    samples.append(sample)

                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
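
The mask fallback is plain signal resampling along the time axis. A self-contained sketch with hypothetical frame counts:

import numpy as np
import scipy.signal

# hypothetical: mask sampled at 50 frames, labels at 200 frames
mask = np.random.rand(50, 1)
y = np.zeros((200, 3))

# resample the mask along the time axis so both sequences align
if len(mask) != len(y):
    mask = scipy.signal.resample(mask, len(y), axis=0)

print(mask.shape)  # (200, 1)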
Code Example #20
File: base.py Project: zhiqizhang/pyannote-audio
    def _load_metadata(self, protocol, subset='train') -> float:
        """Load training set metadata

        This function is called once at instantiation time, returns the total
        training set duration, and populates the following attributes:

        Attributes
        ----------
        data_ : dict

            {'segments': <list of annotated segments>,
             'duration': <total duration of annotated segments>,
             'current_file': <protocol dictionary>,
             'y': <labels as numpy array>}

        segment_labels_ : list
            Sorted list of (unique) labels in protocol.

        file_labels_ : dict of list
            Sorted lists of (unique) file labels in protocol

        Returns
        -------
        duration : float
            Total duration of annotated segments, in seconds.
        """

        self.data_ = {}
        segment_labels, file_labels = set(), dict()

        # loop once on all files
        for current_file in getattr(protocol, subset)():

            # ensure annotation/annotated are cropped to actual file duration
            support = Segment(start=0, end=current_file['duration'])
            current_file['annotated'] = get_annotated(current_file).crop(
                support, mode='intersection')
            current_file['annotation'] = current_file['annotation'].crop(
                support, mode='intersection')

            # keep track of unique segment labels
            segment_labels.update(current_file['annotation'].labels())

            # keep track of unique file labels
            for key, value in current_file.items():
                if isinstance(value, (Annotation, Timeline, SlidingWindowFeature)):
                    continue
                if key not in file_labels:
                    file_labels[key] = set()
                file_labels[key].add(value)

            segments = [s for s in current_file['annotated']
                          if s.duration > self.duration]

            # corner case where no segment is long enough
            # and we removed them all...
            if not segments:
                continue

            # total duration of label in current_file (after removal of
            # short segments).
            duration = sum(s.duration for s in segments)

            # store all these in data_ dictionary
            datum = {'segments': segments,
                     'duration': duration,
                     'current_file': current_file}
            uri = get_unique_identifier(current_file)
            self.data_[uri] = datum

        self.file_labels_ = {k: sorted(file_labels[k]) for k in file_labels}
        self.segment_labels_ = sorted(segment_labels)

        for uri in list(self.data_):
            current_file = self.data_[uri]['current_file']
            y = self.initialize_y(current_file)
            self.data_[uri]['y'] = y
            if self.mask is not None:
                mask = current_file[self.mask]
                current_file[self.mask] = mask.align(y)

        return sum(datum['duration'] for datum in self.data_.values())
Code Example #21
    def validate_epoch(self, epoch, protocol_name, subset='development',
                       validation_data=None):

        target_precision = self.precision

        # load model for current epoch
        model = self.load_model(epoch).to(self.device)
        model.eval()

        if isinstance(self.feature_extraction_, Precomputed):
            self.feature_extraction_.use_memmap = False

        duration = self.task_.duration
        step = .25 * duration
        sequence_labeling = SequenceLabeling(
            model, self.feature_extraction_, duration=duration,
            step=step, batch_size=self.batch_size,
            source='audio', device=self.device)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        predictions = {}
        references = {}

        file_generator = getattr(protocol, subset)()
        for current_file in file_generator:
            uri = get_unique_identifier(current_file)

            # build overlap reference
            reference = Timeline(uri=uri)
            annotation = current_file['annotation']
            for track1, track2 in annotation.co_iter(annotation):
                if track1 == track2:
                    continue
                reference.add(track1[0] & track2[0])
            references[uri] = reference.to_annotation()

            # extract overlap scores
            scores = sequence_labeling.apply(current_file)

            if model.logsoftmax:
                scores = SlidingWindowFeature(
                    np.exp(scores.data[:, 2]), scores.sliding_window)
            else:
                scores = SlidingWindowFeature(
                    scores.data[:, 2], scores.sliding_window)

            predictions[uri] = scores

        # dichotomic search to find threshold that maximizes recall
        # while having at least `target_precision`

        lower_alpha = 0.
        upper_alpha = 1.
        best_alpha = .5 * (lower_alpha + upper_alpha)
        best_recall = 0.

        for _ in range(10):
            current_alpha = .5 * (lower_alpha + upper_alpha)
            binarizer = Binarize(onset=current_alpha,
                                 offset=current_alpha,
                                 log_scale=False)

            precision = DetectionPrecision()
            recall = DetectionRecall()

            for current_file in getattr(protocol, subset)():
                uri = get_unique_identifier(current_file)
                reference = references[uri]
                hypothesis = binarizer.apply(predictions[uri], dimension=0)
                hypothesis = hypothesis.to_annotation()
                uem = get_annotated(current_file)
                _ = precision(reference, hypothesis, uem=uem)
                _ = recall(reference, hypothesis, uem=uem)

            if abs(precision) < target_precision:
                # precision is not high enough: try higher thresholds
                lower_alpha = current_alpha
            else:
                upper_alpha = current_alpha
                r = abs(recall)
                if r > best_recall:
                    best_recall = r
                    best_alpha = current_alpha

        task = 'overlap_speech_detection'
        metric_name = f'{task}/recall@{target_precision:.2f}precision'
        return {
            metric_name: {'minimize': False, 'value': best_recall},
            f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
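
The overlap reference used above is built from pairwise intersections of co-occurring tracks. A toy reproduction of that construction on a two-speaker annotation, assuming pyannote.core is installed:

from pyannote.core import Annotation, Segment, Timeline

annotation = Annotation(uri='toy')
annotation[Segment(0, 10)] = 'A'
annotation[Segment(8, 15)] = 'B'

# pairwise intersections of distinct co-occurring tracks
reference = Timeline(uri='toy')
for track1, track2 in annotation.co_iter(annotation):
    if track1 == track2:
        continue
    reference.add(track1[0] & track2[0])

# only [8, 10] is overlapped speech
print(reference)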
Code Example #22
    def _validate_epoch_diarization(
        self,
        epoch,
        validation_data,
        protocol=None,
        subset: Subset = "development",
        device: Optional[torch.device] = None,
        batch_size: int = 32,
        n_jobs: int = 1,
        duration: float = None,
        step: float = 0.25,
        metric: str = None,
        **kwargs,
    ):

        # initialize embedding extraction
        pretrained = Pretrained(
            validate_dir=self.validate_dir_,
            epoch=epoch,
            duration=duration,
            step=step,
            batch_size=batch_size,
            device=device,
        )

        preprocessors = self.preprocessors_
        if "audio" not in preprocessors:
            preprocessors["audio"] = FileFinder()
        if "duration" not in preprocessors:
            preprocessors["duration"] = get_audio_duration
        _protocol = get_protocol(protocol, preprocessors=preprocessors)

        Z, t = dict(), dict()
        min_d, max_d = np.inf, -np.inf

        for current_file in getattr(_protocol, subset)():

            uri = get_unique_identifier(current_file)
            uem = get_annotated(current_file)
            reference = current_file["annotation"]

            X_, t_ = [], []
            embedding = pretrained(current_file)
            for i, (turn, _) in enumerate(reference.itertracks()):

                # extract embedding for current speech turn
                x_ = embedding.crop(turn, mode="center")
                if len(x_) < 1:
                    x_ = embedding.crop(turn, mode="loose")
                if len(x_) < 1:
                    msg = f"No embedding for {turn} in {uri:s}."
                    raise ValueError(msg)

                # each speech turn is represented by its average embedding
                X_.append(np.mean(x_, axis=0))
                t_.append(turn)

            X_ = np.array(X_)
            # apply hierarchical agglomerative clustering
            # all the way up to just one cluster (ie complete dendrogram)
            D = pdist(X_, metric=metric)
            min_d = min(np.min(D), min_d)
            max_d = max(np.max(D), max_d)

            Z[uri] = linkage(X_, method="pool", metric=metric)
            t[uri] = np.array(t_)

        def fun(threshold):

            _metric = DiarizationPurityCoverageFMeasure(weighted=False)

            for current_file in getattr(_protocol, subset)():

                uri = get_unique_identifier(current_file)
                uem = get_annotated(current_file)
                reference = current_file["annotation"]

                clusters = fcluster(Z[uri], threshold, criterion="distance")

                hypothesis = Annotation(uri=uri)
                for (start_time, end_time), cluster in zip(t[uri], clusters):
                    hypothesis[Segment(start_time, end_time)] = cluster

                _ = _metric(reference, hypothesis, uem=uem)

            return 1.0 - abs(_metric)

        res = scipy.optimize.minimize_scalar(fun,
                                             bounds=(0.0, 1.0),
                                             method="bounded",
                                             options={"maxiter": 10})

        threshold = res.x.item()

        return {
            "metric": "diarization_fscore",
            "minimize": False,
            "value": float(1.0 - res.fun),
        }
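
Finally, a sketch of the dendrogram-then-threshold pattern from this last example, with random stand-in embeddings; scipy's 'average' linkage is used in place of the 'pool' method, which comes from pyannote.core's hierarchy utilities rather than scipy:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

# hypothetical speech-turn embeddings: 10 turns, 16 dimensions
X = np.random.randn(10, 16)

# complete dendrogram, then cut at a cosine-distance threshold
Z = linkage(X, method='average', metric='cosine')
clusters = fcluster(Z, 0.5, criterion='distance')

print(clusters)  # one integer cluster label per speech turn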