Example #1
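The snippets below are method bodies and functions lifted out of their modules, so their imports are not shown. A plausible reconstruction of the common ones (an assumption, not part of the original listing) is:

    import contextlib
    import os.path as op
    import wave
    from typing import Iterable

    import numpy as np
    import pandas as pd

    import pyannote.core.json
    from pyannote.core import Annotation, Segment, SlidingWindowFeature
    from pyannote.database import ProtocolFile

    # `one_hot_decoding`, `Recording`, and the task input/output helpers used
    # further down come from the snippets' host projects and are not listed here.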
    def _xxx_iter(self, subset):

        if not isinstance(subset, list):
            subsets = [subset]
        else:
            subsets = subset

        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
        data_csv = op.join(data_dir, 'voxceleb1.csv')
        data = pd.read_csv(data_csv, index_col=['segment'])
        data = data.groupby('verification')

        for subset in subsets:

            subset_data = data.get_group(subset)

            for uri, rows in subset_data.groupby('uri'):
                annotation = Annotation(uri=uri)
                for row in rows.itertuples():
                    segment = Segment(row.start, row.end)
                    annotation[segment] = row.speaker
                annotated = annotation.get_timeline()

                current_file = {
                    'uri': uri,
                    'database': 'VoxCeleb',
                    'annotation': annotation,
                    'annotated': annotated,
                }

                yield current_file
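A minimal sketch of how the file dictionaries yielded by an iterator like the one above might be consumed; the `protocol` instance and the 'dev' subset name are assumptions for illustration:

    # `protocol` is assumed to be an instance of the class defining _xxx_iter
    for current_file in protocol._xxx_iter('dev'):
        annotation = current_file['annotation']        # pyannote.core.Annotation
        print(current_file['uri'],
              annotation.labels(),                     # speaker labels in this file
              current_file['annotated'].duration())    # total annotated duration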
    def _decode(
        self,
        current_file: ProtocolFile,
        hypothesis: Annotation,
        scores: SlidingWindowFeature,
        labels: Iterable,
    ) -> Annotation:

        N, K = scores.data.shape

        if self.allow_overlap:
            active_speakers = scores.data > 0.5

        else:
            if self.lock_speech:
                active_speakers = np.argmax(scores.data, axis=1) + 1

            else:
                active_speakers = np.argmax(scores.data, axis=1)

        # reconstruct annotation
        new_hypothesis = one_hot_decoding(active_speakers,
                                          scores.sliding_window,
                                          labels=labels)

        new_hypothesis.uri = hypothesis.uri

        if self.lock_speech:
            speech = hypothesis.get_timeline().support()
            new_hypothesis = new_hypothesis.crop(speech)

        return new_hypothesis
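To make the three branches concrete, here is what they compute on made-up scores (toy values, not from the original):

    import numpy as np

    scores = np.array([[0.1, 0.7, 0.6],    # 4 frames, 3 classes
                       [0.9, 0.2, 0.1],
                       [0.2, 0.6, 0.3],
                       [0.4, 0.3, 0.8]])

    # allow_overlap: every class above the 0.5 threshold is active,
    # so several speakers can be active in the same frame
    print(scores > 0.5)

    # otherwise: keep only the single most likely class per frame
    print(np.argmax(scores, axis=1))        # [1 0 1 2]
    print(np.argmax(scores, axis=1) + 1)    # lock_speech variant, indices shifted by one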
Example #3
    def _xxx_iter(self, subset):

        if not isinstance(subset, list):
            subsets = [subset]
        else:
            subsets = subset

        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
        data_csv = op.join(data_dir, 'voxceleb1.csv')
        data = pd.read_csv(data_csv, index_col=['segment'])
        data = data.groupby('verification')

        # segment                          uri                      start end  speaker      verification identification
        # A.J._Buckley/1zcIwhmdeo4_0000001 A.J._Buckley/1zcIwhmdeo4 14.7  22.8 A.J._Buckley dev          trn

        for subset in subsets:

            subset_data = data.get_group(subset)

            for uri, datum in subset_data.iterrows():

                annotation = Annotation(uri=uri)
                segment = Segment(0., datum.end - datum.start)
                annotation[segment] = datum.speaker

                annotated = annotation.get_timeline()

                current_file = {
                    'uri': uri,
                    'database': 'VoxCeleb',
                    'annotation': annotation,
                    'annotated': annotated,
                }

                yield current_file
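To make the pandas pattern concrete, here is the same grouping applied to a one-row frame built from the row shown in the comment above:

    import pandas as pd

    data = pd.DataFrame(
        {'segment': ['A.J._Buckley/1zcIwhmdeo4_0000001'],
         'uri': ['A.J._Buckley/1zcIwhmdeo4'],
         'start': [14.7], 'end': [22.8],
         'speaker': ['A.J._Buckley'],
         'verification': ['dev'], 'identification': ['trn']}
    ).set_index('segment')

    dev = data.groupby('verification').get_group('dev')
    for uri, datum in dev.iterrows():
        print(uri, datum.speaker, datum.end - datum.start)   # segment duration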
Example #6
    def trn_iter(self):

        data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
        data_csv = op.join(data_dir, 'voxceleb1.csv')
        data = pd.read_csv(data_csv, index_col=['segment'])
        data = data.groupby('identification').get_group('trn')

        for uri, rows in data.groupby('uri'):
            annotation = Annotation(uri=uri)
            for row in rows.itertuples():
                segment = Segment(row.start, row.end)
                annotation[segment] = row.speaker
            annotated = annotation.get_timeline()

            current_file = {
                'uri': uri,
                'database': 'VoxCeleb',
                'annotation': annotation,
                'annotated': annotated,
            }

            yield current_file
Example #8
    def run(self):

        # wav file duration
        wav = self.in_wav().path
        with contextlib.closing(wave.open(wav, 'r')) as f:
            frames = f.getnframes()
            rate = f.getframerate()
        duration = frames / rate
        extent = Segment(0., duration)

        with self.in_speaker().open('r') as fp:
            speaker = pyannote.core.json.load(fp)

        segmentation = Annotation()
        for segment, _ in speaker.itertracks():
            segmentation[segment] = 'speech'
        segmentation = segmentation.smooth()

        for gap in segmentation.get_timeline().gaps(extent):
            segmentation[gap] = 'non_speech'
        segmentation = segmentation.smooth()

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(segmentation, fp)
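The gap-filling step in isolation, with made-up segment boundaries:

    from pyannote.core import Annotation, Segment

    extent = Segment(0., 30.)

    segmentation = Annotation()
    segmentation[Segment(2., 10.)] = 'speech'
    segmentation[Segment(14., 25.)] = 'speech'

    # every region of `extent` not covered by speech becomes non-speech
    for gap in segmentation.get_timeline().gaps(extent):
        segmentation[gap] = 'non_speech'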
Example #9
    def run(self):

        # wav file duration
        wav = self.in_wav().path
        with contextlib.closing(wave.open(wav, 'r')) as f:
            frames = f.getnframes()
            rate = f.getframerate()
        duration = frames / rate
        extent = Segment(0., duration)

        with self.in_subtitles().open('r') as fp:
            transcription = pyannote.core.json.load(fp)
        annotation = Annotation()
        for start, end, edge in transcription.ordered_edges_iter(data=True):
            if 'subtitle' not in edge:
                continue
            segment = Segment(start, end)
            annotation[segment] = 'speech'

        for gap in annotation.get_timeline().gaps(extent):
            annotation[gap] = 'non_speech'

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)
Example #10
    def annotations_to_recordings(ref_annotations, sys_annotations,
                                  annotated=None, uris=None):
        """Extract ``Recording`` instances from paired annotations.

        Parameters
        ----------
        ref_annotations : dict
            ``ref_annotations[uri]`` is the reference speech annotation for
            recording ``uri``.

        sys_annotations : dict
            ``sys_annotations[uri]`` is the system speech annotation for
            recording ``uri``.

        annotated : dict, optional
            ``annotated[uri]`` is the timeline of scoring regions for
            recording ``uri``; if ``annotated`` is ``None``, then the scoring
            regions will be approximated as the smallest extent containing all
            reference/system segments.

        uris : iterable of str, optional
            URIs of recordings to score. If ``None``, determined
            automatically from ``ref_annotations``.

        Returns
        -------
        list of Recording
            Recordings.
        """
        annotated = {} if annotated is None else annotated

        # Determine recordings to score.
        if uris is None:
            uris = ref_annotations.keys()
        uris = set(uris)

        # Check for missing recordings.
        for uri in uris:
            # Only check for presence in the reference, since speech is always
            # present in those segmentations, whereas the system could
            # conceivably output no speech for some recordings, resulting in
            # no lines in the segments file.
            if uri not in ref_annotations:
                raise ValueError(
                    f'"ref_annotations" missing Recording "{uri}".')

        # Group.
        recordings = []
        for uri in sorted(uris):
            ref_ann = ref_annotations[uri]
            sys_ann = Annotation(uri=uri)
            if uri in sys_annotations:
                sys_ann = sys_annotations[uri]
            annotated_t = annotated.get(uri, None)
            if annotated_t is None:
                # Approximate scoring regions from smallest extent containing
                # all reference/system segments.
                ref_extent = ref_ann.get_timeline(copy=False).extent()
                sys_extent = sys_ann.get_timeline(copy=False).extent()
                annotated_t = ref_extent | sys_extent
            recordings.append(Recording(
                uri, ref_ann, sys_ann, annotated_t))

        return recordings
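A hedged usage sketch for the function above, with toy annotations; it assumes `annotations_to_recordings` is importable from its host module, which also defines the `Recording` container used internally:

    from pyannote.core import Annotation, Segment, Timeline

    ref_ann = Annotation(uri='file1')
    ref_ann[Segment(0., 5.)] = 'spk_A'
    ref_ann[Segment(6., 9.)] = 'spk_B'

    sys_ann = Annotation(uri='file1')
    sys_ann[Segment(0., 4.5)] = 'spk_1'

    recordings = annotations_to_recordings(
        {'file1': ref_ann},
        {'file1': sys_ann},
        annotated={'file1': Timeline([Segment(0., 10.)])})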
    def _decode(
        self,
        current_file: ProtocolFile,
        hypothesis: Annotation,
        scores: SlidingWindowFeature,
        labels: Iterable,
    ) -> Annotation:

        # obtain overlapped speech regions
        overlap = self.binarizer_.apply(current_file["overlap"], dimension=1)

        frames = scores.sliding_window
        N, K = scores.data.shape

        if self.lock_speech:

            # K = 1 <~~> only non-speech
            # K = 2 <~~> just one speaker
            if K < 3:
                return hypothesis

            # sequence of two most likely speaker indices
            # (even when non-speech is in fact the most likely class)
            best_speakers_indices = np.argsort(-scores.data[:, 1:],
                                               axis=1)[:, :2]

            active_speakers = np.zeros((N, K - 1), dtype=np.int64)

            # start by assigning most likely speaker...
            for t, k in enumerate(best_speakers_indices[:, 0]):
                active_speakers[t, k] = 1

            # ... then add second most likely speaker in overlap regions
            T = frames.crop(overlap, mode="strict")

            # because overlap may use a different feature extraction step,
            # T may contain indices slightly larger than the actual number of
            # frames. the line below removes any such indices.
            T = T[T < N]

            # mark second most likely speaker as active
            active_speakers[T, best_speakers_indices[T, 1]] = 1

            # reconstruct annotation
            new_hypothesis = one_hot_decoding(active_speakers,
                                              frames,
                                              labels=labels)

            # revert non-speech regions back to original
            speech = hypothesis.get_timeline().support()
            new_hypothesis = new_hypothesis.crop(speech)

        else:

            # K = 1 <~~> only non-speech
            if K < 2:
                return hypothesis

            # sequence of two most likely class indices
            # (including 0=non-speech)
            best_speakers_indices = np.argsort(-scores.data, axis=1)[:, :2]

            active_speakers = np.zeros((N, K - 1), dtype=np.int64)

            # start by assigning the most likely speaker...
            for t, k in enumerate(best_speakers_indices[:, 0]):
                # k = 0 is for non-speech
                if k > 0:
                    active_speakers[t, k - 1] = 1

            # ... then add second most likely speaker in overlap regions
            T = frames.crop(overlap, mode="strict")

            # because overlap may use a different feature extraction step,
            # T may contain indices slightly larger than the actual number of
            # frames. the line below removes any such indices.
            T = T[T < N]

            # remove timesteps where second most likely class is non-speech
            T = T[best_speakers_indices[T, 1] > 0]

            # mark second most likely speaker as active
            active_speakers[T, best_speakers_indices[T, 1] - 1] = 1

            # reconstruct annotation
            new_hypothesis = one_hot_decoding(active_speakers,
                                              frames,
                                              labels=labels)

        new_hypothesis.uri = hypothesis.uri
        return new_hypothesis
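The top-two selection at the heart of both branches, shown on made-up scores:

    import numpy as np

    scores = np.array([[0.1, 0.8, 0.6, 0.2],   # columns: non-speech, spk1, spk2, spk3
                       [0.7, 0.2, 0.1, 0.1],
                       [0.2, 0.4, 0.3, 0.9]])

    # two most likely class indices per frame, most likely first
    best = np.argsort(-scores, axis=1)[:, :2]
    print(best)    # rows: [1 2], [0 1], [3 1]

    # only frames falling inside the detected overlap regions then get their
    # second most likely speaker marked as active as well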