def _xxx_iter(self, subset):

    if not isinstance(subset, list):
        subsets = [subset]
    else:
        subsets = subset

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('verification')

    for subset in subsets:

        subset_data = data.get_group(subset)

        for uri, rows in subset_data.groupby('uri'):

            annotation = Annotation(uri=uri)
            for row in rows.itertuples():
                segment = Segment(row.start, row.end)
                annotation[segment] = row.speaker

            annotated = annotation.get_timeline()

            current_file = {
                'uri': uri,
                'database': 'VoxCeleb',
                'annotation': annotation,
                'annotated': annotated,
            }

            yield current_file
def _decode(
    self,
    current_file: ProtocolFile,
    hypothesis: Annotation,
    scores: SlidingWindowFeature,
    labels: Iterable,
) -> Annotation:

    N, K = scores.data.shape

    if self.allow_overlap:
        active_speakers = scores.data > 0.5

    else:
        if self.lock_speech:
            active_speakers = np.argmax(scores.data, axis=1) + 1
        else:
            active_speakers = np.argmax(scores.data, axis=1)

    # reconstruct annotation
    new_hypothesis = one_hot_decoding(active_speakers, scores.sliding_window,
                                      labels=labels)
    new_hypothesis.uri = hypothesis.uri

    if self.lock_speech:
        speech = hypothesis.get_timeline().support()
        new_hypothesis = new_hypothesis.crop(speech)

    return new_hypothesis
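# `one_hot_decoding` is imported from elsewhere in the pipeline and is not
# shown in this section. The helper below is a minimal, illustrative sketch of
# what such a decoder might do for the binary (num_frames, num_speakers) case,
# assuming frame timing comes from a pyannote.core SlidingWindow. It is not
# the library's actual implementation, and the name `decode_one_hot_sketch`
# is hypothetical.
import numpy as np
from pyannote.core import Annotation, Segment, SlidingWindow


def decode_one_hot_sketch(active_speakers: np.ndarray,
                          frames: SlidingWindow,
                          labels=None) -> Annotation:
    """Turn a (num_frames, num_speakers) 0/1 matrix into an Annotation."""
    num_frames, num_speakers = active_speakers.shape
    if labels is None:
        labels = [f'speaker_{k}' for k in range(num_speakers)]

    annotation = Annotation()
    for k, label in enumerate(labels):
        onset = None
        for t in range(num_frames):
            if active_speakers[t, k] and onset is None:
                # speaker k becomes active: remember where this frame starts
                onset = frames.start + t * frames.step
            elif not active_speakers[t, k] and onset is not None:
                # speaker k becomes inactive: close the segment at the end
                # of the previous frame
                offset = frames.start + (t - 1) * frames.step + frames.duration
                annotation[Segment(onset, offset), k] = label
                onset = None
        if onset is not None:
            # speaker k is still active at the last frame
            offset = frames.start + (num_frames - 1) * frames.step + frames.duration
            annotation[Segment(onset, offset), k] = label

    return annotation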
def _xxx_iter(self, subset):

    if not isinstance(subset, list):
        subsets = [subset]
    else:
        subsets = subset

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('verification')

    # segment                           uri                       start  end   speaker       verification  identification
    # A.J._Buckley/1zcIwhmdeo4_0000001  A.J._Buckley/1zcIwhmdeo4  14.7   22.8  A.J._Buckley  dev           trn

    for subset in subsets:

        subset_data = data.get_group(subset)

        for uri, datum in subset_data.iterrows():

            annotation = Annotation(uri=uri)
            segment = Segment(0., datum.end - datum.start)
            annotation[segment] = datum.speaker

            annotated = annotation.get_timeline()

            current_file = {
                'uri': uri,
                'database': 'VoxCeleb',
                'annotation': annotation,
                'annotated': annotated,
            }

            yield current_file
def trn_iter(self):

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('identification').get_group('trn')

    for uri, rows in data.groupby('uri'):

        annotation = Annotation(uri=uri)
        for row in rows.itertuples():
            segment = Segment(row.start, row.end)
            annotation[segment] = row.speaker

        annotated = annotation.get_timeline()

        current_file = {
            'uri': uri,
            'database': 'VoxCeleb',
            'annotation': annotation,
            'annotated': annotated,
        }

        yield current_file
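# The iterators above all yield dictionaries with the same four keys. The
# snippet below builds one such dictionary by hand from the single CSV row
# shown in the comment above, mirroring the per-uri bodies of the iterators;
# it is only an illustration of the yielded structure, not part of the
# protocol code.
from pyannote.core import Annotation, Segment

uri = 'A.J._Buckley/1zcIwhmdeo4'
annotation = Annotation(uri=uri)
annotation[Segment(14.7, 22.8)] = 'A.J._Buckley'

current_file = {
    'uri': uri,
    'database': 'VoxCeleb',
    'annotation': annotation,                 # who speaks when
    'annotated': annotation.get_timeline(),   # regions covered by annotation
}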
def run(self):

    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / rate
    extent = Segment(0., duration)

    with self.in_speaker().open('r') as fp:
        speaker = pyannote.core.json.load(fp)

    segmentation = Annotation()
    for segment, _ in speaker.itertracks():
        segmentation[segment] = 'speech'
    segmentation = segmentation.smooth()

    for gap in segmentation.get_timeline().gaps(extent):
        segmentation[gap] = 'non_speech'
    segmentation = segmentation.smooth()

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(segmentation, fp)
def run(self):

    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / rate
    extent = Segment(0., duration)

    with self.in_subtitles().open('r') as fp:
        transcription = pyannote.core.json.load(fp)

    annotation = Annotation()
    for start, end, edge in transcription.ordered_edges_iter(data=True):
        if 'subtitle' not in edge:
            continue
        segment = Segment(start, end)
        annotation[segment] = 'speech'

    for gap in annotation.get_timeline().gaps(extent):
        annotation[gap] = 'non_speech'

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(annotation, fp)
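# Both `run` methods above turn speech regions into complementary non-speech
# regions via `Timeline.gaps(extent)`. The toy snippet below isolates that
# step on made-up segments, independent of any wav file or task output:
from pyannote.core import Annotation, Segment

speech = Annotation(uri='toy')
speech[Segment(1.0, 3.0)] = 'speech'
speech[Segment(5.0, 8.0)] = 'speech'

extent = Segment(0.0, 10.0)
for gap in speech.get_timeline().gaps(extent):
    speech[gap] = 'non_speech'

# speech now additionally labels [0, 1], [3, 5] and [8, 10] as 'non_speech'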
def annotations_to_recordings(ref_annotations, sys_annotations, annotated=None,
                              uris=None):
    """Extract ``Recording`` instances from paired annotations.

    Parameters
    ----------
    ref_annotations : dict
        ``ref_annotations[uri]`` is the reference speech annotation for
        recording ``uri``.

    sys_annotations : dict
        ``sys_annotations[uri]`` is the system speech annotation for
        recording ``uri``.

    annotated : dict, optional
        ``annotated[uri]`` is the timeline of scoring regions for recording
        ``uri``; if ``annotated`` is ``None``, then the scoring regions will
        be approximated as the smallest extent containing all
        reference/system segments.

    uris : iterable of str, optional
        URIs of recordings to score. If ``None``, determined automatically
        from ``ref_annotations``.

    Returns
    -------
    list of Recording
        Recordings.
    """
    annotated = {} if annotated is None else annotated

    # Determine recordings to score.
    if uris is None:
        uris = ref_annotations.keys()
    uris = set(uris)

    # Check for missing recordings.
    for uri in uris:
        # Only check for presence in reference as we know speech is always
        # present in those segmentations, whereas system output could
        # conceivably not output speech for some recordings, resulting in
        # no lines in the segments file.
        if uri not in ref_annotations:
            raise ValueError(
                f'"ref_annotations" missing Recording "{uri}".')

    # Group.
    recordings = []
    for uri in sorted(uris):
        ref_ann = ref_annotations[uri]
        sys_ann = Annotation(uri=uri)
        if uri in sys_annotations:
            sys_ann = sys_annotations[uri]
        annotated_t = annotated.get(uri, None)
        if annotated_t is None:
            # Approximate scoring regions from smallest extent containing
            # all reference/system segments.
            ref_extent = ref_ann.get_timeline(copy=False).extent()
            sys_extent = sys_ann.get_timeline(copy=False).extent()
            annotated_t = ref_extent | sys_extent
        recordings.append(Recording(
            uri, ref_ann, sys_ann, annotated_t))

    return recordings
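# A hypothetical call to `annotations_to_recordings`, assuming `Recording` is
# the container class defined alongside it in the scoring module (not shown
# in this section). The toy reference/system annotations are made up for
# illustration only:
from pyannote.core import Annotation, Segment

ref = Annotation(uri='rec1')
ref[Segment(0.0, 4.0)] = 'spk_A'
ref[Segment(6.0, 9.0)] = 'spk_B'

hyp = Annotation(uri='rec1')
hyp[Segment(0.5, 4.5)] = 'spk_1'

recordings = annotations_to_recordings(
    {'rec1': ref},      # ref_annotations
    {'rec1': hyp},      # sys_annotations
    annotated=None,     # scoring region approximated from segment extents
)
# with annotated=None, the scoring region passed to Recording is the smallest
# extent covering both annotations, i.e. Segment(0.0, 9.0) here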
def _decode(
    self,
    current_file: ProtocolFile,
    hypothesis: Annotation,
    scores: SlidingWindowFeature,
    labels: Iterable,
) -> Annotation:

    # obtain overlapped speech regions
    overlap = self.binarizer_.apply(current_file["overlap"], dimension=1)

    frames = scores.sliding_window
    N, K = scores.data.shape

    if self.lock_speech:

        # K = 1 <~~> only non-speech
        # K = 2 <~~> just one speaker
        if K < 3:
            return hypothesis

        # sequence of two most likely speaker indices
        # (even when non-speech is in fact the most likely class)
        best_speakers_indices = np.argsort(-scores.data[:, 1:], axis=1)[:, :2]

        active_speakers = np.zeros((N, K - 1), dtype=np.int64)

        # start by assigning the most likely speaker...
        for t, k in enumerate(best_speakers_indices[:, 0]):
            active_speakers[t, k] = 1

        # ... then add the second most likely speaker in overlap regions
        T = frames.crop(overlap, mode="strict")

        # because overlap may use a different feature extraction step,
        # T may contain indices slightly larger than the actual number
        # of frames. the line below removes any such indices.
        T = T[T < N]

        # mark second most likely speaker as active
        active_speakers[T, best_speakers_indices[T, 1]] = 1

        # reconstruct annotation
        new_hypothesis = one_hot_decoding(active_speakers, frames, labels=labels)

        # revert non-speech regions back to original
        speech = hypothesis.get_timeline().support()
        new_hypothesis = new_hypothesis.crop(speech)

    else:

        # K = 1 <~~> only non-speech
        if K < 2:
            return hypothesis

        # sequence of two most likely class indices
        # (including 0=non-speech)
        best_speakers_indices = np.argsort(-scores.data, axis=1)[:, :2]

        active_speakers = np.zeros((N, K - 1), dtype=np.int64)

        # start by assigning the most likely speaker...
        for t, k in enumerate(best_speakers_indices[:, 0]):
            # k = 0 is for non-speech
            if k > 0:
                active_speakers[t, k - 1] = 1

        # ... then add the second most likely speaker in overlap regions
        T = frames.crop(overlap, mode="strict")

        # because overlap may use a different feature extraction step,
        # T may contain indices slightly larger than the actual number
        # of frames. the line below removes any such indices.
        T = T[T < N]

        # remove timesteps where the second most likely class is non-speech
        T = T[best_speakers_indices[T, 1] > 0]

        # mark second most likely speaker as active
        active_speakers[T, best_speakers_indices[T, 1] - 1] = 1

        # reconstruct annotation
        new_hypothesis = one_hot_decoding(active_speakers, frames, labels=labels)

    new_hypothesis.uri = hypothesis.uri
    return new_hypothesis
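# A tiny numpy-only walkthrough of the "two most likely speakers" bookkeeping
# used in the lock_speech branch above, on made-up scores for 4 frames and
# 3 classes (column 0 = non-speech); the overlap frame indices are also made up:
import numpy as np

scores = np.array([
    [0.7, 0.2, 0.1],   # frame 0: mostly non-speech
    [0.1, 0.6, 0.3],   # frame 1: speaker 1
    [0.1, 0.4, 0.5],   # frame 2: speakers 1 and 2 compete
    [0.2, 0.3, 0.5],   # frame 3: speaker 2
])
N, K = scores.shape

# two most likely *speaker* columns per frame (non-speech column excluded)
best_speakers_indices = np.argsort(-scores[:, 1:], axis=1)[:, :2]

active_speakers = np.zeros((N, K - 1), dtype=np.int64)

# most likely speaker is active everywhere
for t, k in enumerate(best_speakers_indices[:, 0]):
    active_speakers[t, k] = 1

# pretend frames 2 and 3 fall inside detected overlapped speech:
# second most likely speaker also becomes active there
T = np.array([2, 3])
active_speakers[T, best_speakers_indices[T, 1]] = 1

print(active_speakers)
# [[1 0]
#  [1 0]
#  [1 1]
#  [1 1]]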