def write_test_file(data_dir, output_file, trial_length):
    annotations, max_length, speakers = read_annotations(data_dir)

    # create artificial non-overlapping segments, each of trial_length size
    trial_segments = Timeline()
    for i in range(0, int(max_length) // trial_length):
        trial_segments.add(Segment(start=i * trial_length,
                                   end=(i + 1) * trial_length))

    with open(output_file, 'w') as f:
        for label in speakers.keys():
            for annotation in annotations:
                # make sure our trial segments do not extend beyond
                # the total length of the speech data
                support = annotation.get_timeline().extent()

                # consider a slightly smaller extent here so that an
                # embedding of 3 seconds can always be computed
                adjusted_trial_segments = trial_segments.crop(
                    Segment(start=support.start, end=support.end - 3.),
                    mode='loose')

                uri = annotation.uri
                cur_timeline = annotation.label_timeline(label, copy=False)
                for trial_segment in adjusted_trial_segments:
                    cropped_speaker = cur_timeline.crop(trial_segment,
                                                        mode='intersection')
                    if not cropped_speaker:
                        f.write('{0} {1} {2:0>7.2f} {3:0>7.2f} nontarget - -\n'.format(
                            label, uri, trial_segment.start, trial_segment.end))
                    else:
                        f.write('{0} {1} {2:0>7.2f} {3:0>7.2f} target {4:0>7.2f} {5:0>7.2f}\n'.format(
                            label, uri, trial_segment.start, trial_segment.end,
                            cropped_speaker[0].start, cropped_speaker[0].duration))
def apply(self, features, segmentation=None):
    """
    Parameters
    ----------
    features : Features
    segmentation : Timeline, optional
    """
    if segmentation is None:
        segmentation = Timeline(segments=[features.getExtent()])

    sliding_window = features.sliding_window
    min_samples = sliding_window.durationToSamples(self.min_duration)
    precision = sliding_window.durationToSamples(self.precision)

    segmenter = SKLearnBICSegmentation(
        penalty_coef=self.penalty_coef,
        covariance_type=self.covariance_type,
        min_samples=min_samples,
        precision=precision)

    result = Timeline()

    for long_segment in segmentation:
        X = features.crop(long_segment)
        boundaries = segmenter.apply(X)
        for t, T in pairwise(boundaries):
            segment = sliding_window.rangeToSegment(t, T - t)
            shifted_segment = Segment(long_segment.start + segment.start,
                                      long_segment.start + segment.end)
            result.add(shifted_segment)

    return result
def _preprocess(self, reference, hypothesis):

    if not isinstance(reference, Annotation):
        raise TypeError('reference must be an instance of `Annotation`')

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # reference where short intra-label gaps are removed
    filled = Timeline()
    for label in reference.labels():
        label_timeline = reference.label_timeline(label)
        for gap in label_timeline.gaps():
            if gap.duration < self.tolerance:
                label_timeline.add(gap)
        for segment in label_timeline.coverage():
            filled.add(segment)

    # reference coverage after filling gaps
    coverage = filled.coverage()

    reference_partition = self._partition(filled, coverage)
    hypothesis_partition = self._partition(hypothesis, coverage)

    return reference_partition, hypothesis_partition
def test_get_overlap():
    annotation = Annotation()
    annotation[Segment(0, 5)] = "A"
    annotation[Segment(10, 15)] = "A"
    annotation[Segment(20, 25)] = "A"
    annotation[Segment(0, 10)] = "B"
    annotation[Segment(15, 25)] = "B"
    annotation[Segment(5, 10)] = "C"
    annotation[Segment(20, 30)] = "C"

    assert (annotation.get_overlap()
            == Timeline([Segment(0, 10), Segment(20, 25)]))
    assert (annotation.get_overlap(["A", "B"])
            == Timeline([Segment(0, 5), Segment(20, 25)]))
    assert (annotation.get_overlap(["A", "C"])
            == Timeline([Segment(20, 25)]))
    assert (annotation.get_overlap(["B", "C"])
            == Timeline([Segment(5, 10), Segment(20, 25)]))
def get_annotated(current_file):

    # if protocol provides 'annotated' key, use it
    if 'annotated' in current_file:
        annotated = current_file['annotated']
        return annotated

    # if it does not, but does provide a 'wav' key,
    # try to use the wav duration instead
    if 'wav' in current_file:
        wav = current_file['wav']
        try:
            from pyannote.audio.features.utils import get_wav_duration
            duration = get_wav_duration(wav)
        except ImportError:
            pass
        else:
            warnings.warn('"annotated" was approximated by "wav" duration.')
            annotated = Timeline([Segment(0, duration)])
            return annotated

    # last resort: use the extent of the annotation itself
    warnings.warn('"annotated" was approximated by "annotation" extent.')
    extent = current_file['annotation'].get_timeline().extent()
    annotated = Timeline([extent])
    return annotated
def _preprocess(self, reference, hypothesis):

    if not isinstance(reference, Annotation):
        raise TypeError('reference must be an instance of `Annotation`')

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # reference where short intra-label gaps are removed
    filled = Timeline()
    for label in reference.labels():
        label_timeline = reference.label_timeline(label)
        for gap in label_timeline.gaps():
            if gap.duration < self.tolerance:
                label_timeline.add(gap)
        for segment in label_timeline.support():
            filled.add(segment)

    # reference coverage after filling gaps
    coverage = filled.support()

    reference_partition = self._partition(filled, coverage)
    hypothesis_partition = self._partition(hypothesis, coverage)

    return reference_partition, hypothesis_partition
def test_initialized_with_empty_segments():
    # The first timeline includes empty segments.
    first_timeline = Timeline([Segment(1, 5), Segment(6, 6),
                               Segment(7, 7), Segment(8, 10)])
    # The second has no empty segments.
    second_timeline = Timeline([Segment(1, 5), Segment(8, 10)])

    assert first_timeline == second_timeline
def test_union_extent():
    first_timeline = Timeline([Segment(0, 1), Segment(2, 3), Segment(4, 5)])
    second_timeline = Timeline([Segment(1.5, 6)])

    union_timeline = first_timeline.union(second_timeline)
    assert union_timeline.extent() == Segment(0, 6)
def apply(self, predictions, dimension=0):
    """Peak detection

    Parameters
    ----------
    predictions : SlidingWindowFeature
        Predictions returned by segmentation approaches.

    Returns
    -------
    segmentation : Timeline
        Partition.
    """
    if len(predictions.data.shape) == 1:
        y = predictions.data
    elif predictions.data.shape[1] == 1:
        y = predictions.data[:, 0]
    else:
        y = predictions.data[:, dimension]

    if self.log_scale:
        y = np.exp(y)

    sw = predictions.sliding_window

    precision = sw.step
    order = max(1, int(np.rint(self.min_duration / precision)))
    indices = scipy.signal.argrelmax(y, order=order)[0]

    if self.scale == 'absolute':
        mini = 0
        maxi = 1
    elif self.scale == 'relative':
        mini = np.nanmin(y)
        maxi = np.nanmax(y)
    elif self.scale == 'percentile':
        mini = np.nanpercentile(y, 1)
        maxi = np.nanpercentile(y, 99)

    threshold = mini + self.alpha * (maxi - mini)

    peak_time = np.array([sw[i].middle for i in indices if y[i] > threshold])

    n_windows = len(y)
    start_time = sw[0].start
    end_time = sw[n_windows].end

    boundaries = np.hstack([[start_time], peak_time, [end_time]])
    segmentation = Timeline()
    for start, end in pairwise(boundaries):
        segmentation.add(Segment(start, end))

    return segmentation
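# A minimal, hypothetical usage sketch for the Peak.apply method above.
# The constructor arguments mirror the Peak(alpha=..., min_duration=...,
# log_scale=...) call appearing later in this collection; synthetic scores
# stand in for the output of a real speaker change detection model.
import numpy as np
from pyannote.core import SlidingWindow, SlidingWindowFeature

scores = np.random.rand(200)  # fake change-detection scores in [0, 1]
predictions = SlidingWindowFeature(
    scores, SlidingWindow(start=0., duration=0.02, step=0.01))

peak = Peak(alpha=0.5, min_duration=0.1, log_scale=False)
partition = peak.apply(predictions)  # -> Timeline partitioning the extent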
def test_crop_mapping():
    timeline = Timeline([Segment(0, 2), Segment(1, 2), Segment(3, 4)])
    cropped, mapping = timeline.crop(Segment(1, 2), returns_mapping=True)

    expected_cropped = Timeline([Segment(1, 2)])
    assert cropped == expected_cropped

    expected_mapping = {Segment(1, 2): [Segment(0, 2), Segment(1, 2)]}
    assert mapping == expected_mapping
def apply(self, predictions, dimension=0):
    """Peak detection

    Parameters
    ----------
    predictions : SlidingWindowFeature
        Predictions returned by segmentation approaches.

    Returns
    -------
    segmentation : Timeline
        Partition.
    """
    if len(predictions.data.shape) == 1:
        y = predictions.data
    elif predictions.data.shape[1] == 1:
        y = predictions.data[:, 0]
    else:
        y = predictions.data[:, dimension]

    if self.log_scale:
        y = np.exp(y)

    sw = predictions.sliding_window

    precision = sw.step
    order = max(1, int(np.rint(self.min_duration / precision)))
    indices = scipy.signal.argrelmax(y, order=order)[0]

    if self.scale == 'absolute':
        mini = 0
        maxi = 1
    elif self.scale == 'relative':
        mini = np.nanmin(y)
        maxi = np.nanmax(y)
    elif self.scale == 'percentile':
        mini = np.nanpercentile(y, 1)
        maxi = np.nanpercentile(y, 99)

    threshold = mini + self.alpha * (maxi - mini)

    peak_time = np.array([sw[i].middle for i in indices if y[i] > threshold])

    n_windows = len(y)
    start_time = sw[0].start
    end_time = sw[n_windows].end

    boundaries = np.hstack([[start_time], peak_time, [end_time]])
    segmentation = Timeline()
    for start, end in pairwise(boundaries):
        segmentation.add(Segment(start, end))

    return segmentation
def test_remove_and_extent():
    t = Timeline(uri='MyAudioFile')
    t.add(Segment(6, 8))
    t.add(Segment(7, 9))
    t.add(Segment(6, 9))

    t.remove(Segment(6, 9))
    assert t.extent() == Segment(6, 9)
def run(self):

    with self.in_subtitles().open('r') as fp:
        transcription = pyannote.core.json.load(fp)

    timeline = Timeline()
    for start, end, edge in transcription.ordered_edges_iter(data=True):
        if 'subtitle' not in edge:
            continue
        segment = Segment(start, end)
        timeline.add(segment)

    with self.out_put().open('w') as fp:
        pyannote.core.json.dump(timeline, fp)
def __call__(self, sequence=Stream.NoNewData):

    if isinstance(sequence, More):
        sequence = sequence.output

    if sequence in [Stream.EndOfStream, Stream.NoNewData]:
        return sequence

    data = sequence.data
    active = data[0]

    sw = sequence.sliding_window
    start = sw[0].middle

    timeline = Timeline()
    timeline.start = start

    for i, y in enumerate(data):

        if active and not y:
            segment = Segment(start, sw[i].middle)
            timeline.add(segment)
            active = False

        elif not active and y:
            active = True
            start = sw[i].middle

    if active:
        segment = Segment(start, sw[i].middle)
        timeline.add(segment)

    timeline.end = sw[i].middle

    return timeline
def overlap_timeline(uri, annotation):
    timeline = annotation.get_timeline()
    segmentation = timeline.segmentation()
    l_segments = [{'seg': segment, 'count': 0} for segment in segmentation]

    # count how many original segments cover each elementary segment
    for seg in timeline:
        for curr in l_segments:
            if curr['seg'] in seg:
                curr['count'] += 1

    # keep elementary segments covered more than once
    overlap_timeline = Timeline(uri=uri)
    for curr in l_segments:
        if curr['count'] > 1:
            overlap_timeline.add(curr['seg'])

    return overlap_timeline
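# A small, self-contained check of overlap_timeline above: two made-up
# speaker turns overlap on [5, 10], so that is the only segment returned.
from pyannote.core import Annotation, Segment, Timeline

annotation = Annotation(uri='demo')
annotation[Segment(0, 10)] = 'spk1'
annotation[Segment(5, 15)] = 'spk2'

assert overlap_timeline('demo', annotation) == Timeline([Segment(5, 10)])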
def preprocess(self, current_file, identifier=None):
    """Pre-compute file-wise X and y"""

    # extract features for the whole file
    # (if it has not been done already)
    current_file = self.periodic_preprocess(current_file,
                                            identifier=identifier)

    # if labels have already been extracted, do nothing
    if identifier in self.preprocessed_.setdefault('y', {}):
        return current_file

    # get features as pyannote.core.SlidingWindowFeature instance
    X = self.preprocessed_['X'][identifier]
    sw = X.sliding_window
    n_samples = X.getNumber()

    # [-1] ==> unknown / [0] ==> not change part / [1] ==> change part
    y = np.zeros((n_samples + 4, 1), dtype=np.int8) - 1

    annotated = get_annotated(current_file)
    annotation = current_file['annotation']

    segments = []
    for segment, _ in annotation.itertracks():
        segments.append(Segment(segment.start - self.balance,
                                segment.start + self.balance))
        segments.append(Segment(segment.end - self.balance,
                                segment.end + self.balance))
    change_part = Timeline(segments).support().crop(annotated,
                                                    mode='intersection')

    # iterate over non-change regions
    for non_changes in change_part.gaps(annotated):
        indices = sw.crop(non_changes, mode='loose')
        y[indices, 0] = 0

    # iterate over change regions
    for changes in change_part:
        indices = sw.crop(changes, mode='loose')
        y[indices, 0] = 1

    y = SlidingWindowFeature(y[:-1], sw)
    self.preprocessed_['y'][identifier] = y

    return current_file
def test_consistent_timelines_with_empty_segments():
    # The first timeline is initialized with Segments, some empty.
    first_timeline = Timeline([Segment(1, 5), Segment(6, 6),
                               Segment(7, 7), Segment(8, 10)])

    # The second timeline adds one Segment at a time, including empty ones.
    second_timeline = Timeline()
    second_timeline.add(Segment(1, 5))
    second_timeline.add(Segment(6, 6))
    second_timeline.add(Segment(7, 7))
    second_timeline.add(Segment(8, 10))

    assert first_timeline == second_timeline
def tst_enrol_iter(self):

    # load enrolments
    data_dir = Path(__file__).parent / 'data' / 'speaker_spotting'
    enrolments = data_dir / 'tst.enrol.txt'
    names = ['uri', 'NA0', 'start', 'duration',
             'NA1', 'NA2', 'NA3', 'model_id']
    enrolments = read_table(enrolments, delim_whitespace=True, names=names)

    for model_id, turns in enrolments.groupby(by=['uri', 'model_id']):

        # gather enrolment data
        segments = []
        uri = ''
        for t, turn in enumerate(turns.itertuples()):
            if t == 0:
                uri = turn.uri
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            if segment:
                segments.append(segment)
        enrol_with = Timeline(segments=segments, uri=uri)

        current_enrolment = {
            'database': 'Odessa',
            'uri': uri,
            'model_id': model_id[1],  # model_id
            'enrol_with': enrol_with,
        }

        yield current_enrolment
def _subset(self, protocol, subset):

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')

    # load annotations
    path = op.join(
        data_dir,
        'librispeech-{protocol}.{subset}.mdtm'.format(subset=subset,
                                                      protocol=protocol))
    mdtms = self.mdtm_parser_.read(path)

    for uri in sorted(mdtms.uris):
        annotation = mdtms(uri)
        current_file = {
            'database': 'LibriSpeech',
            'uri': uri,
            'annotation': annotation,
            # annotated part as pyannote.core.Timeline instance
            'annotated': Timeline(
                uri=uri,
                segments=[annotation.get_timeline().extent()]),
        }
        yield current_file
def _subset_enrollment(self, protocol, subset):

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    enrolments = op.join(
        data_dir, '{protocol}.{subset}.txt'.format(subset=subset,
                                                   protocol=protocol))
    names = ['uri', 'NA0', 'start', 'duration',
             'NA1', 'NA2', 'NA3', 'model_id']
    enrolments = read_table(enrolments, delim_whitespace=True, names=names)

    for model_id, turns in enrolments.groupby(by='model_id'):

        # gather enrolment data
        segments = []
        for t, turn in enumerate(turns.itertuples()):
            if t == 0:
                uri = turn.uri
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            if segment:
                segments.append(segment)
        enrol_with = Timeline(segments=segments, uri=uri)

        current_enrolment = {
            'database': 'RTVE2018',
            'uri': uri,
            'model_id': model_id,
            'enrol_with': enrol_with,
        }

        yield current_enrolment
def _xxx_enrol_iter(self, subset):

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])

    trial_csv = op.join(
        data_dir, 'voxceleb1.verification.{subset}.csv'.format(subset=subset))
    trials = pd.read_csv(trial_csv)

    for model_id in trials['enrolment'].unique():

        try:
            row = data.loc[model_id]
        except KeyError:
            # file marked as duplicate in VoxCeleb 1.1
            continue

        uri = model_id
        segment = Segment(0., row.end - row.start)

        current_enrolment = {
            'database': 'VoxCeleb',
            'uri': uri,
            'model_id': model_id,
            'enrol_with': Timeline(uri=uri, segments=[segment]),
        }

        yield current_enrolment
def test_union():
    first_timeline = Timeline([Segment(0, 1), Segment(2, 3), Segment(4, 5)])
    second_timeline = Timeline([Segment(1.5, 4.5)])

    assert first_timeline.union(second_timeline) == Timeline(
        [Segment(0, 1), Segment(1.5, 4.5), Segment(2, 3), Segment(4, 5)])

    assert second_timeline.crop(first_timeline) == Timeline(
        [Segment(2, 3), Segment(4, 4.5)])

    assert list(first_timeline.co_iter(second_timeline)) == [
        (Segment(2, 3), Segment(1.5, 4.5)),
        (Segment(4, 5), Segment(1.5, 4.5))]
def load_uem(file_uem):
    """Load UEM file

    Parameters
    ----------
    file_uem : `str`
        Path to UEM file.

    Returns
    -------
    timelines : `dict`
        Evaluation map as a {uri: pyannote.core.Timeline} dictionary.
    """

    names = ['uri', 'NA1', 'start', 'end']
    dtype = {'uri': str, 'start': float, 'end': float}
    data = pd.read_csv(file_uem, names=names, dtype=dtype,
                       delim_whitespace=True)

    timelines = dict()
    for uri, parts in data.groupby('uri'):
        segments = [Segment(part.start, part.end)
                    for i, part in parts.iterrows()]
        timelines[uri] = Timeline(segments=segments, uri=uri)

    return timelines
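# Usage sketch for load_uem above; the file name and its content are made
# up for illustration. Each UEM line follows "<uri> <channel> <start> <end>".
with open('dev.uem', 'w') as f:
    f.write('fileA 1 0.00 120.00\n')
    f.write('fileB 1 0.00 300.00\n')

timelines = load_uem('dev.uem')
assert timelines['fileA'] == Timeline([Segment(0., 120.)], uri='fileA')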
def common_enrol_iter(self):

    data_dir = op.join(op.dirname(op.realpath(__file__)), 'data')
    data_csv = op.join(data_dir, 'voxceleb1.csv')
    data = pd.read_csv(data_csv, index_col=['segment'])
    data = data.groupby('identification').get_group('trn')

    for model_id, model_rows in data.groupby('speaker'):

        uris = []
        enrol_with = []
        for uri, rows in model_rows.groupby('uri'):
            uris.append(uri)
            segments = [Segment(row.start, row.end)
                        for row in rows.itertuples()]
            enrol_with.append(Timeline(uri=uri, segments=segments))

        current_enrolment = {
            'database': 'VoxCeleb',
            'model_id': model_id,
            'uri': uris,
            'enrol_with': enrol_with,
        }

        yield current_enrolment
def test_extrude():
    annotation = Annotation()
    annotation[Segment(0, 10)] = "A"
    annotation[Segment(15, 20)] = "A"
    annotation[Segment(20, 35)] = "B"
    annotation[Segment(15, 25)] = "C"
    annotation[Segment(30, 35)] = "C"

    extrusion_tl = Timeline([Segment(5, 12), Segment(14, 25)])

    intersection_expected = Annotation()
    intersection_expected[Segment(0, 5)] = "A"
    intersection_expected[Segment(25, 35)] = "B"
    intersection_expected[Segment(30, 35)] = "C"
    assert (annotation.extrude(extrusion_tl, mode="intersection")
            == intersection_expected)

    loose_expected = Annotation()
    loose_expected[Segment(30, 35)] = "C"
    assert (annotation.extrude(extrusion_tl, mode="loose")
            == loose_expected)

    strict_expected = Annotation()
    strict_expected[Segment(0, 10)] = "A"
    strict_expected[Segment(20, 35)] = "B"
    strict_expected[Segment(30, 35)] = "C"
    assert (annotation.extrude(extrusion_tl, mode="strict")
            == strict_expected)
def predict(audio, algorithm='SpectralClustering'):

    # Speech Activity Detection
    sad_scores = sad(audio)
    binarize_sad = Binarize(offset=0.52, onset=0.52, log_scale=True,
                            min_duration_off=0.1, min_duration_on=0.1)
    speech = binarize_sad.apply(sad_scores, dimension=1)

    # Speaker Change Detection
    scd_scores = scd(audio)
    peak = Peak(alpha=0.10, min_duration=0.10, log_scale=True)
    partition = peak.apply(scd_scores, dimension=1)

    # Overlapped Speech Detection
    # ovl_scores = ovl(audio)
    # binarize_ovl = Binarize(offset=0.55, onset=0.55, log_scale=True,
    #                         min_duration_off=0.1, min_duration_on=0.1)
    # overlap = binarize_ovl.apply(ovl_scores, dimension=1)

    # Speaker Embedding
    speech_turns = partition.crop(speech)
    embeddings = emb(audio)

    long_turns = Timeline(
        segments=[s for s in speech_turns if s.duration > .5])

    return long_turns, sad_scores, scd_scores, embeddings
def tst_iter(self):

    # absolute path to 'data' directory where annotations are stored
    data_dir = Path(__file__).parent / 'data' / 'speaker_diarization'

    annotated = data_dir / 'fullset.uem'
    names = ['uri', 'NA0', 'start', 'end']
    annotated = read_table(annotated, delim_whitespace=True, names=names)

    annotated_segments = {}
    for segment in annotated.itertuples():
        annotated_segments[segment.uri] = Segment(start=segment.start,
                                                  end=segment.end)

    # iterate through the text annotation files
    for filename in os.listdir(data_dir):
        if filename.endswith(".txt"):
            uri, _ = os.path.splitext(os.path.basename(filename))
            annotation = Annotation(uri=uri)

            names = ['start', 'end', 'speaker', 'speakerID']
            parsed_file = read_table(os.path.join(data_dir, filename),
                                     delim_whitespace=True, names=names)

            for t, turn in enumerate(parsed_file.itertuples()):
                segment = Segment(start=turn.start, end=turn.end)
                annotation[segment, t] = turn.speakerID

            current_file = {
                'database': 'Odessa',
                'uri': uri,
                'annotated': Timeline(uri=uri,
                                      segments=[annotated_segments[uri]]),
                'annotation': annotation}

            yield current_file
def _xxx_iter(self, subset):

    data = self._load_data(subset)

    AnnotatedGroups = data['annotated'].groupby(by='uri')
    AnnotationGroups = data['annotation'].groupby(by='uri')

    for raw_uri, annotated in AnnotatedGroups:

        uri = f'{raw_uri}.Mix-Headset'

        segments = []
        for segment in annotated.itertuples():
            segments.append(Segment(start=segment.start, end=segment.end))

        annotation = Annotation(uri=uri)
        for t, turn in enumerate(
                AnnotationGroups.get_group(raw_uri).itertuples()):
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            annotation[segment, t] = turn.speaker

        current_file = {
            'database': 'Test',
            'uri': uri,
            'annotated': Timeline(uri=uri, segments=segments),
            'annotation': annotation,
        }

        yield current_file
def uem_timeline_from_file(uem_file, uniq_name=''):
    """
    Outputs pyannote timeline segments for uem file

    <UEM> file format
    UNIQ_SPEAKER_ID CHANNEL START_TIME END_TIME
    """
    timeline = Timeline(uri=uniq_name)
    with open(uem_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            speaker_id, channel, start_time, end_time = line.split()
            timeline.add(Segment(float(start_time), float(end_time)))

    return timeline
def _xxx_enrol_iter(self, subset):

    # load enrolments
    data_dir = Path(__file__).parent / 'data' / 'speaker_spotting'
    enrolments = data_dir / f'{subset}.enrol.txt'
    names = ['uri', 'NA0', 'start', 'duration',
             'NA1', 'NA2', 'NA3', 'model_id']
    enrolments = read_table(enrolments, delim_whitespace=True, names=names)

    for model_id, turns in enrolments.groupby(by='model_id'):

        # gather enrolment data
        segments = []
        for t, turn in enumerate(turns.itertuples()):
            if t == 0:
                raw_uri = turn.uri
                uri = f'{raw_uri}.Mix-Headset'
            segment = Segment(start=turn.start,
                              end=turn.start + turn.duration)
            if segment:
                segments.append(segment)
        enrol_with = Timeline(segments=segments, uri=uri)

        current_enrolment = {
            'database': 'Test',
            'uri': uri,
            'model_id': model_id,
            'enrol_with': enrol_with,
        }

        yield current_enrolment
def DER(outfile, AudioDataSet, annotationlist, audioLength):

    reference = Annotation()

    if AudioDataSet != 'DiaExample':
        # one annotation file per speaker: A, B, C, D
        for annotation_file, label in zip(annotationlist, 'ABCD'):
            tree = ET.parse(annotation_file)
            root = tree.getroot()
            for child in root.findall('segment'):
                start = float(child.get('transcriber_start'))
                end = float(child.get('transcriber_end'))
                reference[Segment(start, end)] = label
    else:
        reference[Segment(0.15, 3.41)] = 'A'
        reference[Segment(3.83, 5.82)] = 'A'
        reference[Segment(6.75, 11.10)] = 'B'
        reference[Segment(11.32, 15.8)] = 'C'
        reference[Segment(15.9, 18.8)] = 'B'
        reference[Segment(18.8, 27.8)] = 'C'
        reference[Segment(27.8, 34.4)] = 'B'
        reference[Segment(34.4, 42)] = 'D'

    hypothesis = Annotation()
    with open(outfile, 'r') as f:
        for line in f.readlines():
            start = float(line.split(' ')[3])
            end = start + float(line.split(' ')[4])
            label = line.split(' ')[5][0:-1]
            hypothesis[Segment(start, end)] = label

    metric = DiarizationErrorRate()
    metricPurity = DiarizationPurity()
    uem = Timeline([Segment(0, audioLength)])

    print('DER: %.2f %%' % (metric(reference, hypothesis, uem=uem) * 100))
    print('Cluster Purity: %.2f %%'
          % (metricPurity(reference, hypothesis, uem=uem) * 100))

    return metric, reference, hypothesis
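# A minimal sketch of computing DER with pyannote.metrics directly,
# independent of the file parsing in DER above; reference and hypothesis
# labels are illustrative only.
from pyannote.core import Annotation, Segment
from pyannote.metrics.diarization import DiarizationErrorRate

reference = Annotation()
reference[Segment(0, 10)] = 'A'
reference[Segment(10, 20)] = 'B'

hypothesis = Annotation()
hypothesis[Segment(0, 12)] = 'a'
hypothesis[Segment(12, 20)] = 'b'

der = DiarizationErrorRate()(reference, hypothesis)
print(f'DER = {der:.3f}')  # 2 s of confusion over 20 s -> 0.100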
def load_sad_manual(dataset: Text, path: Text) -> Dict:
    """Load accepted pyannote.sad.manual examples

    Parameters
    ----------
    dataset : str
        Dataset containing annotations.
    path : str
        Path to annotated file.

    Returns
    -------
    file : dict
        Dictionary containing the following keys:
        "audio" (Path) : path to audio file
        "annotated" (Timeline) : part of the audio annotated and accepted
        "speech" (Timeline) : part of the audio accepted as speech
    """

    db = connect()

    examples = [
        eg
        for eg in db.get_dataset(dataset)
        if eg["recipe"] == "pyannote.sad.manual"
        and eg["path"] == path
        and eg["answer"] == "accept"
    ]

    speech = Timeline(
        segments=[
            Segment(span["start"], span["end"])
            for eg in examples
            for span in eg["audio_spans"]
        ],
    ).support()

    annotated = Timeline(
        segments=[Segment(**eg["chunk"]) for eg in examples]
    ).support()

    prodigy.log(f"RECIPE: {path}: loaded speech regions")

    return {
        "audio": Path(path),
        "speech": speech,
        "annotated": annotated,
    }
def get_annotated(current_file):
    """Get part of the file that is annotated.

    Parameters
    ----------
    current_file : `dict`
        File generated by a `pyannote.database` protocol.

    Returns
    -------
    annotated : `pyannote.core.Timeline`
        Part of the file that is annotated. Defaults to
        `current_file["annotated"]`. When it does not exist, fall back to
        the audio duration, then to the "annotation" extent.
    """

    # if protocol provides 'annotated' key, use it
    if "annotated" in current_file:
        annotated = current_file["annotated"]
        return annotated

    # if it does not, but does provide a 'duration' key,
    # use the whole [0, duration] extent
    if "duration" in current_file:
        duration = current_file["duration"]
        annotated = Timeline([Segment(0, duration)])
        msg = '"annotated" was approximated by [0, audio duration].'
        warnings.warn(msg)
        return annotated

    # last resort: use the extent of the annotation itself
    extent = current_file["annotation"].get_timeline().extent()
    annotated = Timeline([extent])
    msg = ('"annotated" was approximated by "annotation" extent. '
           'Please provide "annotated" directly, or at the very '
           'least, use a "duration" preprocessor.')
    warnings.warn(msg)
    return annotated
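# A tiny usage sketch for get_annotated above, using a hypothetical file
# dictionary that only provides "duration": the function warns and falls
# back to the [0, duration] extent.
current_file = {'uri': 'fileA', 'duration': 60.}
annotated = get_annotated(current_file)
assert annotated == Timeline([Segment(0, 60.)])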
def apply(self, current_file):

    # extract precomputed scores
    precomputed = self.precomputed_(current_file)

    # if this check has not been done yet, do it once and for all
    if not hasattr(self, "log_scale_"):
        # heuristic to determine whether scores are log-scaled
        if np.nanmean(precomputed.data) < 0:
            self.log_scale_ = True
        else:
            self.log_scale_ = False

    data = (np.exp(precomputed.data) if self.log_scale_
            else precomputed.data)

    # speech vs. non-speech
    speech_prob = SlidingWindowFeature(
        1. - data[:, 0], precomputed.sliding_window)
    speech = self.speech_binarize_.apply(speech_prob)

    if self.has_overlap_:

        # overlap vs. non-overlap
        overlap_prob = SlidingWindowFeature(
            data[:, 2], precomputed.sliding_window)
        overlap = self.overlap_binarize_.apply(overlap_prob)

        # overlapping speech can only happen in speech regions
        overlap = overlap.crop(speech)

    else:
        # empty timeline
        overlap = Timeline()

    speech = speech.to_annotation(generator='string')
    overlap = overlap.to_annotation(generator='int')
    hypothesis = speech.update(overlap)

    return hypothesis
def apply(self, feature, segmentation=None):

    if segmentation is None:
        focus = feature.getExtent()
        segmentation = Timeline(segments=[focus], uri=None)

    result = Timeline()

    for focus in segmentation:

        x, y = list(zip(*[(m, d)
                          for m, d in self.iterdiff(feature, focus)]))
        x = np.array(x)
        y = np.array(y)

        # find local maxima
        order = 1
        if self.min_duration > 0:
            order = int(self.min_duration / self.step)
        maxima = scipy.signal.argrelmax(y, order=order)
        x = x[maxima]
        y = y[maxima]

        # only keep high enough local maxima
        high_maxima = np.where(y > self.threshold)

        # create list of segment boundaries,
        # not forgetting the very first and last boundaries
        boundaries = itertools.chain(
            [focus.start], x[high_maxima], [focus.end])

        # create list of segments from boundaries
        segments = [Segment(*p) for p in pairwise(boundaries)]

        result.update(Timeline(segments=segments))

    return result
def read(self, path, uri=None, **kwargs):

    # load whole file
    df = pandas.read_table(path,
                           delim_whitespace=True,
                           header=None,
                           names=self.fields(),
                           comment=self.comment(),
                           converters=self.converters())

    # remove comment lines
    # (i.e. lines for which all fields are either None or NaN)
    keep = [not all([pandas.isnull(r[n]) for n in self.fields()])
            for _, r in df.iterrows()]
    df = df[keep]

    # add 'segment' column built from start time & duration
    df[PYANNOTE_SEGMENT] = [self.get_segment(row)
                            for r, row in df.iterrows()]

    # add uri column in case it does not exist
    if PYANNOTE_URI not in df:
        if uri is None:
            raise ValueError('missing uri -- use uri=')
        df[PYANNOTE_URI] = uri

    # obtain list of resources
    uris = list(df[PYANNOTE_URI].unique())

    self._loaded = {}

    # loop over resources
    for uri in uris:

        # filter rows based on resource
        df_ = df[df[PYANNOTE_URI] == uri]
        t = Timeline.from_df(df_, uri=uri)
        self._loaded[uri, None] = t

    return self
def run(self):

    # wav file duration
    wav = self.in_wav().path
    with contextlib.closing(wave.open(wav, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
    duration = frames / rate
    extent = Segment(0., duration)

    with self.in_speaker().open('r') as fp:
        speaker = pyannote.core.json.load(fp)

    timeline = Timeline()
    for segment, _ in speaker.itertracks():
        timeline.add(segment)

    # fill short gaps
    for gap in timeline.gaps(extent):
        if gap.duration < self.fill_gaps:
            timeline.add(gap)
    timeline = timeline.coverage()

    # dump as annotation...
    if self.to_annotation:

        annotation = Annotation()
        for s, segment in enumerate(timeline):
            annotation[segment] = s
        annotation = annotation.anonymize_labels(generator='string')

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(annotation, fp)

    # ... or as timeline
    else:

        with self.out_put().open('w') as fp:
            pyannote.core.json.dump(timeline, fp)
def _get_collar(self, reference, duration):

    # initialize empty timeline
    collar = Timeline(uri=reference.uri)

    if duration == 0.:
        return collar

    # iterate over all segments in reference
    for segment in reference.itersegments():

        # add collar centered on start time
        t = segment.start
        collar.add(Segment(t - .5 * duration, t + .5 * duration))

        # add collar centered on end time
        t = segment.end
        collar.add(Segment(t - .5 * duration, t + .5 * duration))

    # merge overlapping collars and return
    return collar.coverage()
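# A short illustration of _get_collar above: a 0.5-second collar centered
# on each boundary of a single reference segment. The metric instance name
# is hypothetical.
from pyannote.core import Annotation, Segment

reference = Annotation(uri='demo')
reference[Segment(1., 2.)] = 'spk1'

# metric._get_collar(reference, 0.5) would return
# Timeline([Segment(0.75, 1.25), Segment(1.75, 2.25)])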
def __call__(self, reference, hypothesis):

    if isinstance(reference, Annotation):
        reference = reference.get_timeline()

    if isinstance(hypothesis, Annotation):
        hypothesis = hypothesis.get_timeline()

    # over-segmentation
    over = Timeline(uri=reference.uri)
    prev_r = reference[0]
    intersection = []
    for r, h in reference.co_iter(hypothesis):

        if r != prev_r:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                over.add(segment)
            intersection = []
            prev_r = r

        segment = r & h
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        over.add(segment)

    # under-segmentation
    under = Timeline(uri=reference.uri)
    prev_h = hypothesis[0]
    intersection = []
    for h, r in hypothesis.co_iter(reference):

        if h != prev_h:
            intersection = sorted(intersection)
            for _, segment in intersection[:-1]:
                under.add(segment)
            intersection = []
            prev_h = h

        segment = h & r
        intersection.append((segment.duration, segment))

    intersection = sorted(intersection)
    for _, segment in intersection[:-1]:
        under.add(segment)

    # extent
    extent = reference.extent()

    # correct (neither under- nor over-segmented)
    correct = under.union(over).gaps(focus=extent)

    # frontier error (both under- and over-segmented)
    frontier = under.crop(over)

    # under-segmented
    not_over = over.gaps(focus=extent)
    only_under = under.crop(not_over)

    # over-segmented
    not_under = under.gaps(focus=extent)
    only_over = over.crop(not_under)

    status = Annotation(uri=reference.uri)
    for segment in correct:
        status[segment, '_'] = 'correct'
    for segment in frontier:
        status[segment, '_'] = 'frontier'
    for segment in only_over:
        status[segment, '_'] = 'over'
    for segment in only_under:
        status[segment, '_'] = 'under'

    return status.smooth()
def test_crop(timeline):
    selection = Segment(3, 7)

    expected_answer = Timeline(uri='MyAudioFile')
    expected_answer.add(Segment(3, 4))
    expected_answer.add(Segment(5, 7))
    expected_answer.add(Segment(6, 7))
    assert timeline.crop(selection, mode='intersection') == expected_answer

    expected_answer = Timeline(uri='MyAudioFile')
    expected_answer.add(Segment(5, 7))
    assert timeline.crop(selection, mode='strict') == expected_answer

    expected_answer = Timeline(uri='MyAudioFile')
    expected_answer.add(Segment(1, 4))
    expected_answer.add(Segment(5, 7))
    expected_answer.add(Segment(6, 8))
    assert timeline.crop(selection, mode='loose') == expected_answer
def timeline():
    t = Timeline(uri='MyAudioFile')
    t.add(Segment(6, 8))
    t.add(Segment(0.5, 3))
    t.add(Segment(8.5, 10))
    t.add(Segment(1, 4))
    t.add(Segment(5, 7))
    t.add(Segment(7, 8))
    return t
def apply(self, predictions, dimension=0):
    """
    Parameters
    ----------
    predictions : SlidingWindowFeature
        Must be mono-dimensional
    dimension : int, optional
        Which dimension to process
    """

    if len(predictions.data.shape) == 1:
        data = predictions.data
    elif predictions.data.shape[1] == 1:
        data = predictions.data[:, 0]
    else:
        data = predictions.data[:, dimension]

    if self.log_scale:
        data = np.exp(data)

    n_samples = predictions.getNumber()
    window = predictions.sliding_window
    timestamps = [window[i].middle for i in range(n_samples)]

    # initial state
    start = timestamps[0]
    label = data[0] > self.onset

    if self.scale == 'absolute':
        mini = 0
        maxi = 1
    elif self.scale == 'relative':
        mini = np.nanmin(data)
        maxi = np.nanmax(data)
    elif self.scale == 'percentile':
        mini = np.nanpercentile(data, 1)
        maxi = np.nanpercentile(data, 99)

    onset = mini + self.onset * (maxi - mini)
    offset = mini + self.offset * (maxi - mini)

    # timeline meant to store 'active' segments
    active = Timeline()

    for t, y in zip(timestamps[1:], data[1:]):

        # currently active
        if label:
            # switching from active to inactive
            if y < offset:
                segment = Segment(start - self.pad_onset,
                                  t + self.pad_offset)
                active.add(segment)
                start = t
                label = False

        # currently inactive
        else:
            # switching from inactive to active
            if y > onset:
                start = t
                label = True

    # if active at the end, add final segment
    if label:
        segment = Segment(start - self.pad_onset, t + self.pad_offset)
        active.add(segment)

    # because of padding, some 'active' segments might be overlapping,
    # therefore we merge those overlapping segments
    active = active.support()

    # remove short 'active' segments
    active = Timeline(
        [s for s in active if s.duration > self.min_duration_on])

    # fill short 'inactive' segments
    inactive = active.gaps()
    for s in inactive:
        if s.duration < self.min_duration_off:
            active.add(s)
    active = active.support()

    return active
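# A minimal, hypothetical usage sketch for the Binarize.apply method above,
# assuming the constructor shown earlier in this collection (onset/offset
# thresholds, log_scale, min_duration_on/off). Synthetic scores replace the
# output of a real speech activity detector.
import numpy as np
from pyannote.core import SlidingWindow, SlidingWindowFeature

scores = np.zeros((100, 1))
scores[20:60, 0] = 1.  # one 'active' region
predictions = SlidingWindowFeature(
    scores, SlidingWindow(start=0., duration=0.02, step=0.01))

binarize = Binarize(onset=0.52, offset=0.52, log_scale=False,
                    min_duration_on=0.1, min_duration_off=0.1)
speech = binarize.apply(predictions, dimension=0)  # -> Timeline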
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):

    target_precision = self.precision

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration
    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    predictions = {}
    references = {}

    file_generator = getattr(protocol, subset)()
    for current_file in file_generator:
        uri = get_unique_identifier(current_file)

        # build overlap reference
        reference = Timeline(uri=uri)
        annotation = current_file['annotation']
        for track1, track2 in annotation.co_iter(annotation):
            if track1 == track2:
                continue
            reference.add(track1[0] & track2[0])
        references[uri] = reference.to_annotation()

        # extract overlap scores
        scores = sequence_labeling.apply(current_file)

        if model.logsoftmax:
            scores = SlidingWindowFeature(
                np.exp(scores.data[:, 2]), scores.sliding_window)
        else:
            scores = SlidingWindowFeature(
                scores.data[:, 2], scores.sliding_window)

        predictions[uri] = scores

    # dichotomic search for the threshold that maximizes recall
    # while keeping precision at (or above) `target_precision`
    lower_alpha = 0.
    upper_alpha = 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_recall = 0.

    for _ in range(10):

        current_alpha = .5 * (lower_alpha + upper_alpha)
        binarizer = Binarize(onset=current_alpha,
                             offset=current_alpha,
                             log_scale=False)

        precision = DetectionPrecision()
        recall = DetectionRecall()

        for current_file in getattr(protocol, subset)():
            uri = get_unique_identifier(current_file)
            reference = references[uri]
            hypothesis = binarizer.apply(predictions[uri], dimension=0)
            hypothesis = hypothesis.to_annotation()
            uem = get_annotated(current_file)
            _ = precision(reference, hypothesis, uem=uem)
            _ = recall(reference, hypothesis, uem=uem)

        if abs(precision) < target_precision:
            # precision is not high enough: try higher thresholds
            lower_alpha = current_alpha

        else:
            upper_alpha = current_alpha
            r = abs(recall)
            if r > best_recall:
                best_recall = r
                best_alpha = current_alpha

    task = 'overlap_speech_detection'
    metric_name = f'{task}/recall@{target_precision:.2f}precision'
    return {
        metric_name: {'minimize': False, 'value': best_recall},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}